[PATCH v3 0/3] Add zone append write for zoned device

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v3 0/3] Add zone append write for zoned device
@ 2022-10-10  2:33 Sam Li
  2022-10-10  2:33 ` [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers Sam Li
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Sam Li @ 2022-10-10  2:33 UTC (permalink / raw)
  To: qemu-devel
  Cc: stefanha, Hanna Reitz, Kevin Wolf, dmitry.fomichev, hare,
	Fam Zheng, damien.lemoal, qemu-block, Sam Li

v3:
- only read wps when it is locked [Damien]
- allow last smaller zone case [Damien]
- add zone type and state checks in zone_mgmt command [Damien]
- fix RESET_ALL related problems

v2:
- split patch to two patches for better reviewing
- change BlockZoneWps's structure to an array of integers
- use only mutex lock on locking conditions of zone wps
- coding styles and clean-ups

v1:
- introduce zone append write

Sam Li (3):
  file-posix:add the tracking of the zones write pointers
  block: introduce zone append write for zoned devices
  qemu-iotests: test zone append operation

 block/block-backend.c              |  64 +++++++++
 block/file-posix.c                 | 216 ++++++++++++++++++++++++++++-
 block/io.c                         |  21 +++
 block/raw-format.c                 |   7 +
 include/block/block-common.h       |  14 ++
 include/block/block-io.h           |   3 +
 include/block/block_int-common.h   |   8 ++
 include/block/raw-aio.h            |   4 +-
 include/sysemu/block-backend-io.h  |   9 ++
 qemu-io-cmds.c                     |  62 +++++++++
 tests/qemu-iotests/tests/zoned.out |   7 +
 tests/qemu-iotests/tests/zoned.sh  |   9 ++
 12 files changed, 420 insertions(+), 4 deletions(-)

-- 
2.37.3



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers
  2022-10-10  2:33 [PATCH v3 0/3] Add zone append write for zoned device Sam Li
@ 2022-10-10  2:33 ` Sam Li
  2022-10-13  5:13   ` Damien Le Moal
  2022-10-10  2:33 ` [PATCH v3 2/3] block: introduce zone append write for zoned devices Sam Li
  2022-10-10  2:33 ` [PATCH v3 3/3] qemu-iotests: test zone append operation Sam Li
  2 siblings, 1 reply; 11+ messages in thread
From: Sam Li @ 2022-10-10  2:33 UTC (permalink / raw)
  To: qemu-devel
  Cc: stefanha, Hanna Reitz, Kevin Wolf, dmitry.fomichev, hare,
	Fam Zheng, damien.lemoal, qemu-block, Sam Li

Since Linux doesn't have a user API to issue zone append operations to
zoned devices from user space, the file-posix driver is modified to add
zone append emulation using regular writes. To do this, the file-posix
driver tracks the write pointer (wp) location of every zone of the device
in an array of uint64_t. The most significant bit of each entry indicates
whether the zone is a conventional zone.

The wp of a zone changes as a result of the following operations:
- zone reset: change the wp to the start offset of that zone
- zone finish: change to the end location of that zone
- write to a zone
- zone append

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/file-posix.c               | 158 +++++++++++++++++++++++++++++++
 include/block/block-common.h     |  14 +++
 include/block/block_int-common.h |   5 +
 3 files changed, 177 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index a9d347292e..17c0b58158 100755
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -206,6 +206,7 @@ typedef struct RawPosixAIOData {
         struct {
             struct iovec *iov;
             int niov;
+            int64_t *append_sector;
         } io;
         struct {
             uint64_t cmd;
@@ -226,6 +227,7 @@ typedef struct RawPosixAIOData {
         struct {
             unsigned long zone_op;
             const char *zone_op_name;
+            bool all;
         } zone_mgmt;
     };
 } RawPosixAIOData;
@@ -1331,6 +1333,67 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
 #endif
 }
 
+#if defined(CONFIG_BLKZONED)
+static int get_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
+                        unsigned int nrz) {
+    struct blk_zone *blkz;
+    int64_t rep_size;
+    int64_t sector = offset >> BDRV_SECTOR_BITS;
+    int ret, n = 0, i = 0;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+
+    rep = g_malloc(rep_size);
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                    fd, offset, errno);
+            return -errno;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            /*
+             * The wp tracking cares only about sequential writes required and
+             * sequential write preferred zones so that the wp can advance to
+             * the right location.
+             * Use the most significant bit of the wp location to indicate the
+             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
+             */
+            if (!(blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL)) {
+                wps->wp[i] += 1ULL << 63;
+            } else {
+                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
+            }
+        }
+        sector = blkz[i-1].start + blkz[i-1].len;
+    }
+
+    return 0;
+}
+
+static void update_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
+                            unsigned int nrz) {
+    qemu_mutex_lock(&wps->lock);
+    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
+        error_report("report zone wp failed");
+        return;
+    }
+    qemu_mutex_unlock(&wps->lock);
+}
+#endif
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
@@ -1414,6 +1477,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
             error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
             return;
         }
+
+        ret = get_sysfs_long_val(&st, "physical_block_size");
+        if (ret >= 0) {
+            bs->bl.write_granularity = ret;
+        }
+
+        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
+        if (get_zones_wp(0, s->fd, bs->bl.wps, ret) < 0){
+            error_report("report wps failed");
+            g_free(bs->bl.wps);
+            return;
+        }
+        qemu_mutex_init(&bs->bl.wps->lock);
     }
 }
 
@@ -1651,6 +1727,20 @@ static int handle_aiocb_rw(void *opaque)
     ssize_t nbytes;
     char *buf;
 
+    /*
+     * The offset of regular writes, append writes is the wp location
+     * of that zone.
+     */
+    if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        if (aiocb->bs->bl.zone_size > 0) {
+            BlockZoneWps *wps = aiocb->bs->bl.wps;
+            qemu_mutex_lock(&wps->lock);
+            aiocb->aio_offset = wps->wp[aiocb->aio_offset /
+                                        aiocb->bs->bl.zone_size];
+            qemu_mutex_unlock(&wps->lock);
+        }
+    }
+
     if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
         /*
          * If there is just a single buffer, and it is properly aligned
@@ -1725,6 +1815,24 @@ static int handle_aiocb_rw(void *opaque)
 
 out:
     if (nbytes == aiocb->aio_nbytes) {
+#if defined(CONFIG_BLKZONED)
+        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+            BlockZoneWps *wps = aiocb->bs->bl.wps;
+            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+            if (wps) {
+                qemu_mutex_lock(&wps->lock);
+                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
+                    uint64_t wend_offset =
+                            aiocb->aio_offset + aiocb->aio_nbytes;
+                    /* Advance the wp if needed */
+                    if (wend_offset > wps->wp[index]){
+                        wps->wp[index] = wend_offset;
+                    }
+                }
+                qemu_mutex_unlock(&wps->lock);
+            }
+        }
+#endif
         return 0;
     } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
         if (aiocb->aio_type & QEMU_AIO_WRITE) {
@@ -1736,6 +1844,12 @@ out:
         }
     } else {
         assert(nbytes < 0);
+#if defined(CONFIG_BLKZONED)
+        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+            update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
+                            aiocb->bs->bl.nr_zones);
+        }
+#endif
         return nbytes;
     }
 }
@@ -2022,12 +2136,17 @@ static int handle_aiocb_zone_report(void *opaque) {
 #if defined(CONFIG_BLKZONED)
 static int handle_aiocb_zone_mgmt(void *opaque) {
     RawPosixAIOData *aiocb = opaque;
+    BlockDriverState *bs = aiocb->bs;
     int fd = aiocb->aio_fildes;
     int64_t sector = aiocb->aio_offset / 512;
     int64_t nr_sectors = aiocb->aio_nbytes / 512;
+    uint64_t wend_offset;
     struct blk_zone_range range;
     int ret;
 
+    BlockZoneWps *wps = bs->bl.wps;
+    int index = aiocb->aio_offset / bs->bl.zone_size;
+
     /* Execute the operation */
     range.sector = sector;
     range.nr_sectors = nr_sectors;
@@ -2035,11 +2154,41 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
         ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
     } while (ret != 0 && errno == EINTR);
     if (ret != 0) {
+        update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
+                        aiocb->bs->bl.nr_zones);
         ret = -errno;
         error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
                      ret);
         return ret;
     }
+
+    qemu_mutex_lock(&wps->lock);
+    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
+         /*
+         * The zoned device allows the last zone smaller that the zone size.
+         */
+        if (aiocb->aio_nbytes < bs->bl.zone_size) {
+            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
+        } else {
+            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
+        }
+
+        if (aiocb->aio_offset != wps->wp[index] &&
+            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
+            if (aiocb->zone_mgmt.all) {
+                for (int i = 0; i < bs->bl.nr_zones; ++i) {
+                    wps->wp[i] = i * bs->bl.zone_size;
+                }
+            } else {
+                wps->wp[index] = aiocb->aio_offset;
+            }
+        } else if (aiocb->aio_offset != wps->wp[index] &&
+            aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
+            wps->wp[index] = wend_offset;
+        }
+    }
+    qemu_mutex_unlock(&wps->lock);
+
     return ret;
 }
 #endif
@@ -2480,6 +2629,12 @@ static void raw_close(BlockDriverState *bs)
     BDRVRawState *s = bs->opaque;
 
     if (s->fd >= 0) {
+#if defined(CONFIG_BLKZONED)
+        if (bs->bl.wps) {
+            qemu_mutex_destroy(&bs->bl.wps->lock);
+            g_free(bs->bl.wps);
+        }
+#endif
         qemu_close(s->fd);
         s->fd = -1;
     }
@@ -3278,6 +3433,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
     int64_t zone_size, zone_size_mask;
     const char *zone_op_name;
     unsigned long zone_op;
+    bool is_all = false;
 
     zone_size = bs->bl.zone_size;
     zone_size_mask = zone_size - 1;
@@ -3314,6 +3470,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
     case BLK_ZO_RESET_ALL:
         zone_op_name = "BLKRESETZONE";
         zone_op = BLKRESETZONE;
+        is_all = true;
         break;
     default:
         g_assert_not_reached();
@@ -3328,6 +3485,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
         .zone_mgmt  = {
             .zone_op = zone_op,
             .zone_op_name = zone_op_name,
+            .all = is_all,
         },
     };
 
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 882de6825e..b8b2dba64a 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -93,6 +93,14 @@ typedef struct BlockZoneDescriptor {
     BlockZoneCondition cond;
 } BlockZoneDescriptor;
 
+/*
+ * Track write pointers of a zone in bytes.
+ */
+typedef struct BlockZoneWps {
+    QemuMutex lock;
+    uint64_t wp[];
+} BlockZoneWps;
+
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
@@ -206,6 +214,12 @@ typedef enum {
 #define BDRV_SECTOR_BITS   9
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
 
+/*
+ * Get the first most significant bit of wp. If it is zero, then
+ * the zone type is SWR.
+ */
+#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
+
 #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                            INT_MAX >> BDRV_SECTOR_BITS)
 #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 37dddc603c..59c2d1316d 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -857,6 +857,11 @@ typedef struct BlockLimits {
 
     /* device capacity expressed in bytes */
     int64_t capacity;
+
+    /* array of write pointers' location of each zone in the zoned device. */
+    BlockZoneWps *wps;
+
+    int64_t write_granularity;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v3 2/3] block: introduce zone append write for zoned devices
  2022-10-10  2:33 [PATCH v3 0/3] Add zone append write for zoned device Sam Li
  2022-10-10  2:33 ` [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers Sam Li
@ 2022-10-10  2:33 ` Sam Li
  2022-10-13  5:54   ` Damien Le Moal
  2022-10-10  2:33 ` [PATCH v3 3/3] qemu-iotests: test zone append operation Sam Li
  2 siblings, 1 reply; 11+ messages in thread
From: Sam Li @ 2022-10-10  2:33 UTC (permalink / raw)
  To: qemu-devel
  Cc: stefanha, Hanna Reitz, Kevin Wolf, dmitry.fomichev, hare,
	Fam Zheng, damien.lemoal, qemu-block, Sam Li

A zone append command is a write operation that specifies the first
logical block of a zone as the write position. When writing to a zoned
block device using zone append, the byte offset of the write is the
write pointer of that zone. Upon completion, the device responds with
the position at which the data was written in the zone.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/block-backend.c             | 64 +++++++++++++++++++++++++++++++
 block/file-posix.c                | 64 ++++++++++++++++++++++++++++---
 block/io.c                        | 21 ++++++++++
 block/raw-format.c                |  7 ++++
 include/block/block-io.h          |  3 ++
 include/block/block_int-common.h  |  3 ++
 include/block/raw-aio.h           |  4 +-
 include/sysemu/block-backend-io.h |  9 +++++
 8 files changed, 168 insertions(+), 7 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index ddc569e3ac..bfdb719bc8 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
         struct {
             BlockZoneOp op;
         } zone_mgmt;
+        struct {
+            int64_t *append_sector;
+        } zone_append;
     };
 } BlkRwCo;
 
@@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
     return &acb->common;
 }
 
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
+                                   rwco->iobuf, rwco->flags);
+    blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+                                QEMUIOVector *qiov, BdrvRequestFlags flags,
+                                BlockCompletionFunc *cb, void *opaque) {
+    BlkAioEmAIOCB *acb;
+    Coroutine *co;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+            .blk    = blk,
+            .ret    = NOT_DONE,
+            .flags  = flags,
+            .iobuf  = qiov,
+            .zone_append = {
+                    .append_sector = offset,
+            },
+    };
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+    bdrv_coroutine_enter(blk_bs(blk), co);
+    acb->has_returned = true;
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
 /*
  * Send a zone_report command.
  * offset is a byte offset from the start of the device. No alignment
@@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
     return ret;
 }
 
+/*
+ * Send a zone_append command.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+        QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    if (!blk_is_available(blk)) {
+        blk_dec_in_flight(blk);
+        return -ENOMEDIUM;
+    }
+
+    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 17c0b58158..08ab164df4 100755
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
     ssize_t len;
 
     do {
-        if (aiocb->aio_type & QEMU_AIO_WRITE)
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
             len = qemu_pwritev(aiocb->aio_fildes,
                                aiocb->io.iov,
                                aiocb->io.niov,
@@ -1687,7 +1687,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
     ssize_t len;
 
     while (offset < aiocb->aio_nbytes) {
-        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
             len = pwrite(aiocb->aio_fildes,
                          (const char *)buf + offset,
                          aiocb->aio_nbytes - offset,
@@ -1731,7 +1731,7 @@ static int handle_aiocb_rw(void *opaque)
      * The offset of regular writes, append writes is the wp location
      * of that zone.
      */
-    if (aiocb->aio_type & QEMU_AIO_WRITE) {
+    if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
         if (aiocb->bs->bl.zone_size > 0) {
             BlockZoneWps *wps = aiocb->bs->bl.wps;
             qemu_mutex_lock(&wps->lock);
@@ -1794,7 +1794,7 @@ static int handle_aiocb_rw(void *opaque)
     }
 
     nbytes = handle_aiocb_rw_linear(aiocb, buf);
-    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
         char *p = buf;
         size_t count = aiocb->aio_nbytes, copy;
         int i;
@@ -1816,7 +1816,7 @@ static int handle_aiocb_rw(void *opaque)
 out:
     if (nbytes == aiocb->aio_nbytes) {
 #if defined(CONFIG_BLKZONED)
-        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
             BlockZoneWps *wps = aiocb->bs->bl.wps;
             int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
             if (wps) {
@@ -1828,6 +1828,11 @@ out:
                     if (wend_offset > wps->wp[index]){
                         wps->wp[index] = wend_offset;
                     }
+
+                    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
+                        *aiocb->io.append_sector =
+                                wps->wp[index] >> BDRV_SECTOR_BITS;
+                    }
                 }
                 qemu_mutex_unlock(&wps->lock);
             }
@@ -1845,7 +1850,7 @@ out:
     } else {
         assert(nbytes < 0);
 #if defined(CONFIG_BLKZONED)
-        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
             update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
                             aiocb->bs->bl.nr_zones);
         }
@@ -3493,6 +3498,52 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
 }
 #endif
 
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
+                                           int64_t *offset,
+                                           QEMUIOVector *qiov,
+                                           BdrvRequestFlags flags) {
+    BDRVRawState *s = bs->opaque;
+    int64_t zone_size_mask = bs->bl.zone_size - 1;
+    int64_t iov_len = 0;
+    int64_t len = 0;
+    RawPosixAIOData acb;
+
+    if (*offset & zone_size_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
+        return -EINVAL;
+    }
+
+    int64_t wg = bs->bl.write_granularity;
+    int64_t wg_mask = wg - 1;
+    for (int i = 0; i < qiov->niov; i++) {
+       iov_len = qiov->iov[i].iov_len;
+       if (iov_len & wg_mask) {
+           error_report("len of IOVector[%d] %" PRId64 " is not aligned to block "
+                        "size %" PRId64 "", i, iov_len, wg);
+           return -EINVAL;
+       }
+       len += iov_len;
+    }
+
+    acb = (RawPosixAIOData) {
+        .bs = bs,
+        .aio_fildes = s->fd,
+        .aio_type = QEMU_AIO_ZONE_APPEND,
+        .aio_offset = *offset,
+        .aio_nbytes = len,
+        .io = {
+                .iov = qiov->iov,
+                .niov = qiov->niov,
+                .append_sector = offset,
+        },
+    };
+
+    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
+}
+#endif
+
 static coroutine_fn int
 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
                 bool blkdev)
@@ -4268,6 +4319,7 @@ static BlockDriver bdrv_zoned_host_device = {
     /* zone management operations */
     .bdrv_co_zone_report = raw_co_zone_report,
     .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+    .bdrv_co_zone_append = raw_co_zone_append,
 };
 #endif
 
diff --git a/block/io.c b/block/io.c
index e5aaa64e17..935abf2ed4 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3230,6 +3230,27 @@ out:
     return co.ret;
 }
 
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                        QEMUIOVector *qiov,
+                        BdrvRequestFlags flags)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+            .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_append) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
     IO_CODE();
diff --git a/block/raw-format.c b/block/raw-format.c
index b885688434..f132880c85 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -325,6 +325,12 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
     return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
 }
 
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                                           QEMUIOVector *qiov,
+                                           BdrvRequestFlags flags) {
+    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
+}
+
 static int64_t raw_getlength(BlockDriverState *bs)
 {
     int64_t len;
@@ -628,6 +634,7 @@ BlockDriver bdrv_raw = {
     .bdrv_co_pdiscard     = &raw_co_pdiscard,
     .bdrv_co_zone_report  = &raw_co_zone_report,
     .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
+    .bdrv_co_zone_append = &raw_co_zone_append,
     .bdrv_co_block_status = &raw_co_block_status,
     .bdrv_co_copy_range_from = &raw_co_copy_range_from,
     .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
diff --git a/include/block/block-io.h b/include/block/block-io.h
index f0cdf67d33..6a54453578 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
                                      BlockZoneDescriptor *zones);
 int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
                                    int64_t offset, int64_t len);
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                                     QEMUIOVector *qiov,
+                                     BdrvRequestFlags flags);
 
 int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 59c2d1316d..a7e7db5646 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -701,6 +701,9 @@ struct BlockDriver {
             BlockZoneDescriptor *zones);
     int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
             int64_t offset, int64_t len);
+    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
+            int64_t *offset, QEMUIOVector *qiov,
+            BdrvRequestFlags flags);
 
     /* removable device specific */
     bool (*bdrv_is_inserted)(BlockDriverState *bs);
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 3d26929cdd..f13cc1887b 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -31,6 +31,7 @@
 #define QEMU_AIO_TRUNCATE     0x0080
 #define QEMU_AIO_ZONE_REPORT  0x0100
 #define QEMU_AIO_ZONE_MGMT    0x0200
+#define QEMU_AIO_ZONE_APPEND  0x0400
 #define QEMU_AIO_TYPE_MASK \
         (QEMU_AIO_READ | \
          QEMU_AIO_WRITE | \
@@ -41,7 +42,8 @@
          QEMU_AIO_COPY_RANGE | \
          QEMU_AIO_TRUNCATE  | \
          QEMU_AIO_ZONE_REPORT | \
-         QEMU_AIO_ZONE_MGMT)
+         QEMU_AIO_ZONE_MGMT | \
+         QEMU_AIO_ZONE_APPEND)
 
 /* AIO flags */
 #define QEMU_AIO_MISALIGNED   0x1000
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index 6835525582..33e35ae5d7 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -51,6 +51,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
 BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
                               int64_t offset, int64_t len,
                               BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+                                QEMUIOVector *qiov, BdrvRequestFlags flags,
+                                BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
                              BlockCompletionFunc *cb, void *opaque);
 void blk_aio_cancel_async(BlockAIOCB *acb);
@@ -172,6 +175,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
                                   int64_t offset, int64_t len);
 int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
                                        int64_t offset, int64_t len);
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+                                    QEMUIOVector *qiov,
+                                    BdrvRequestFlags flags);
+int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
+                                         QEMUIOVector *qiov,
+                                         BdrvRequestFlags flags);
 
 int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
                                       int64_t bytes);
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v3 3/3] qemu-iotests: test zone append operation
  2022-10-10  2:33 [PATCH v3 0/3] Add zone append write for zoned device Sam Li
  2022-10-10  2:33 ` [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers Sam Li
  2022-10-10  2:33 ` [PATCH v3 2/3] block: introduce zone append write for zoned devices Sam Li
@ 2022-10-10  2:33 ` Sam Li
  2 siblings, 0 replies; 11+ messages in thread
From: Sam Li @ 2022-10-10  2:33 UTC (permalink / raw)
  To: qemu-devel
  Cc: stefanha, Hanna Reitz, Kevin Wolf, dmitry.fomichev, hare,
	Fam Zheng, damien.lemoal, qemu-block, Sam Li

This test is mainly a helper to check that append writes in the block
layer behave as expected.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 qemu-io-cmds.c                     | 62 ++++++++++++++++++++++++++++++
 tests/qemu-iotests/tests/zoned.out |  7 ++++
 tests/qemu-iotests/tests/zoned.sh  |  9 +++++
 3 files changed, 78 insertions(+)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index e56c8d1c30..6cb86de35b 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1855,6 +1855,67 @@ static const cmdinfo_t zone_reset_cmd = {
     .oneline = "reset a zone write pointer in zone block device",
 };
 
+static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov,
+                              int64_t *offset, int flags, int *total)
+{
+    int async_ret = NOT_DONE;
+
+    blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, &async_ret);
+    while (async_ret == NOT_DONE) {
+        main_loop_wait(false);
+    }
+
+    *total = qiov->size;
+    return async_ret < 0 ? async_ret : 1;
+}
+
+static int zone_append_f(BlockBackend *blk, int argc, char **argv) {
+    int ret;
+    int flags = 0;
+    int total = 0;
+    int64_t offset;
+    char *buf;
+    int nr_iov;
+    int pattern = 0xcd;
+    QEMUIOVector qiov;
+
+    if (optind > argc - 2) {
+        return -EINVAL;
+    }
+    optind++;
+    offset = cvtnum(argv[optind]);
+    if (offset < 0) {
+        print_cvtnum_err(offset, argv[optind]);
+        return offset;
+    }
+    optind++;
+    nr_iov = argc - optind;
+    buf = create_iovec(blk, &qiov, &argv[optind], nr_iov, pattern);
+    if (buf == NULL) {
+        return -EINVAL;
+    }
+    ret = do_aio_zone_append(blk, &qiov, &offset, flags, &total);
+    if (ret < 0) {
+        printf("zone append failed: %s\n", strerror(-ret));
+        goto out;
+    }
+
+    out:
+    qemu_iovec_destroy(&qiov);
+    qemu_io_free(buf);
+    return ret;
+}
+
+static const cmdinfo_t zone_append_cmd = {
+    .name = "zone_append",
+    .altname = "zap",
+    .cfunc = zone_append_f,
+    .argmin = 3,
+    .argmax = 3,
+    .args = "offset len [len..]",
+    .oneline = "append write a number of bytes at a specified offset",
+};
+
 static int truncate_f(BlockBackend *blk, int argc, char **argv);
 static const cmdinfo_t truncate_cmd = {
     .name       = "truncate",
@@ -2652,6 +2713,7 @@ static void __attribute((constructor)) init_qemuio_commands(void)
     qemuio_add_command(&zone_close_cmd);
     qemuio_add_command(&zone_finish_cmd);
     qemuio_add_command(&zone_reset_cmd);
+    qemuio_add_command(&zone_append_cmd);
     qemuio_add_command(&truncate_cmd);
     qemuio_add_command(&length_cmd);
     qemuio_add_command(&info_cmd);
diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-iotests/tests/zoned.out
index 0c8f96deb9..b3b139b4ec 100644
--- a/tests/qemu-iotests/tests/zoned.out
+++ b/tests/qemu-iotests/tests/zoned.out
@@ -50,4 +50,11 @@ start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:14, [type: 2]
 (5) resetting the second zone
 After resetting a zone:
 start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
+
+
+(6) append write
+After appending the first zone:
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x18, zcond:2, [type: 2]
+After appending the second zone:
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80018, zcond:2, [type: 2]
 *** done
diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-iotests/tests/zoned.sh
index fced0194c5..888711eef2 100755
--- a/tests/qemu-iotests/tests/zoned.sh
+++ b/tests/qemu-iotests/tests/zoned.sh
@@ -79,6 +79,15 @@ echo "(5) resetting the second zone"
 sudo $QEMU_IO $IMG -c "zrs 268435456 268435456"
 echo "After resetting a zone:"
 sudo $QEMU_IO $IMG -c "zrp 268435456 1"
+echo
+echo
+echo "(6) append write" # physical block size of the device is 4096
+sudo $QEMU_IO $IMG -c "zap 0 0x1000 0x2000"
+echo "After appending the first zone:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+sudo $QEMU_IO $IMG -c "zap 268435456 0x1000 0x2000"
+echo "After appending the second zone:"
+sudo $QEMU_IO $IMG -c "zrp 268435456 1"
 
 # success, all done
 echo "*** done"
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers
  2022-10-10  2:33 ` [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers Sam Li
@ 2022-10-13  5:13   ` Damien Le Moal
  2022-10-13  7:08     ` Sam Li
  0 siblings, 1 reply; 11+ messages in thread
From: Damien Le Moal @ 2022-10-13  5:13 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: stefanha, Hanna Reitz, Kevin Wolf, dmitry.fomichev, hare,
	Fam Zheng, qemu-block

On 10/10/22 11:33, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is conventional zones.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/file-posix.c               | 158 +++++++++++++++++++++++++++++++
>  include/block/block-common.h     |  14 +++
>  include/block/block_int-common.h |   5 +
>  3 files changed, 177 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index a9d347292e..17c0b58158 100755
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -206,6 +206,7 @@ typedef struct RawPosixAIOData {
>          struct {
>              struct iovec *iov;
>              int niov;
> +            int64_t *append_sector;

This should be added as part of patch 2. You do not need this to track
the wp of zones in this patch.

>          } io;
>          struct {
>              uint64_t cmd;
> @@ -226,6 +227,7 @@ typedef struct RawPosixAIOData {
>          struct {
>              unsigned long zone_op;
>              const char *zone_op_name;
> +            bool all;
>          } zone_mgmt;
>      };
>  } RawPosixAIOData;
> @@ -1331,6 +1333,67 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int get_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,

Nit: It would seem more natural to have the fd argument first...

> +                        unsigned int nrz) {
> +    struct blk_zone *blkz;
> +    int64_t rep_size;
> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
> +    int ret, n = 0, i = 0;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +
> +    rep = g_malloc(rep_size);
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = sector;
> +        rep->nr_zones = nrz - n;
> +
> +        do {
> +            ret = ioctl(fd, BLKREPORTZONE, rep);
> +        } while (ret != 0 && errno == EINTR);
> +        if (ret != 0) {
> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +                    fd, offset, errno);
> +            return -errno;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> +            /*
> +             * The wp tracking cares only about sequential writes required and
> +             * sequential write preferred zones so that the wp can advance to
> +             * the right location.
> +             * Use the most significant bit of the wp location to indicate the
> +             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> +             */
> +            if (!(blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL)) {

Double negation... This can simply be:

if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {

> +                wps->wp[i] += 1ULL << 63;

No need for the += here. This can be "=".

> +            } else {
> +                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> +            }
> +        }
> +        sector = blkz[i-1].start + blkz[i-1].len;

spaces missing around the "-" in the "i-1" expressions.

> +    }
> +
> +    return 0;
> +}
> +
> +static void update_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,

Same nit as above: fd being the first argument would be a little more
natural in my opinion.

> +                            unsigned int nrz) {
> +    qemu_mutex_lock(&wps->lock);
> +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> +        error_report("report zone wp failed");
> +        return;

You are leaking the lock here. Remove the return. Also, given that
get_zones_wp() already prints a message if report fails, I do not think
the message here is useful.

Also, why is this function void typed ? How can the caller know if the
update succeeded or not ?

> +    }
> +    qemu_mutex_unlock(&wps->lock);
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -1414,6 +1477,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
>              return;
>          }
> +
> +        ret = get_sysfs_long_val(&st, "physical_block_size");
> +        if (ret >= 0) {
> +            bs->bl.write_granularity = ret;
> +        }

Why is this change here ? Shouldn't this be part of the previous series
"Add support for zoned device" ?

> +
> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> +        if (get_zones_wp(0, s->fd, bs->bl.wps, ret) < 0){
> +            error_report("report wps failed");
> +            g_free(bs->bl.wps);
> +            return;
> +        }
> +        qemu_mutex_init(&bs->bl.wps->lock);
>      }
>  }
>  
> @@ -1651,6 +1727,20 @@ static int handle_aiocb_rw(void *opaque)
>      ssize_t nbytes;
>      char *buf;
>  
> +    /*
> +     * The offset of regular writes, append writes is the wp location
> +     * of that zone.
> +     */
> +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->bs->bl.zone_size > 0) {
> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> +            qemu_mutex_lock(&wps->lock);
> +            aiocb->aio_offset = wps->wp[aiocb->aio_offset /
> +                                        aiocb->bs->bl.zone_size];
> +            qemu_mutex_unlock(&wps->lock);
> +        }

I do not understand this hunk at all. What is this trying to do ? zone
append support goes into patch 2. You are overwriting the user
specified aio offset using the tracked wp value. That could result in a
successful write even if the user sent an unaligned write command. That
is bad.

Here you should only be tracking the write pointer, so increment
wps->wp[index], which you do below.

> +    }
> +
>      if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
>          /*
>           * If there is just a single buffer, and it is properly aligned
> @@ -1725,6 +1815,24 @@ static int handle_aiocb_rw(void *opaque)
>  
>  out:
>      if (nbytes == aiocb->aio_nbytes) {
> +#if defined(CONFIG_BLKZONED)
> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> +            if (wps) {
> +                qemu_mutex_lock(&wps->lock);
> +                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +                    uint64_t wend_offset =
> +                            aiocb->aio_offset + aiocb->aio_nbytes;
> +                    /* Advance the wp if needed */
> +                    if (wend_offset > wps->wp[index]){
> +                        wps->wp[index] = wend_offset;
> +                    }
> +                }
> +                qemu_mutex_unlock(&wps->lock);
> +            }
> +        }
> +#endif
>          return 0;
>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
> @@ -1736,6 +1844,12 @@ out:
>          }
>      } else {
>          assert(nbytes < 0);
> +#if defined(CONFIG_BLKZONED)
> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +            update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> +                            aiocb->bs->bl.nr_zones);

You only need to update the target zone of the aio, not all zones.
Updating all zones is actually a bug as wp[] entries for other zones may
be larger than the device reported wp if there are other write aios in
flight. So the last argument must be "1" here.

> +        }
> +#endif
>          return nbytes;
>      }
>  }
> @@ -2022,12 +2136,17 @@ static int handle_aiocb_zone_report(void *opaque) {
>  #if defined(CONFIG_BLKZONED)
>  static int handle_aiocb_zone_mgmt(void *opaque) {
>      RawPosixAIOData *aiocb = opaque;
> +    BlockDriverState *bs = aiocb->bs;
>      int fd = aiocb->aio_fildes;
>      int64_t sector = aiocb->aio_offset / 512;
>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
> +    uint64_t wend_offset;
>      struct blk_zone_range range;
>      int ret;
>  

Why the blank line here ?

> +    BlockZoneWps *wps = bs->bl.wps;
> +    int index = aiocb->aio_offset / bs->bl.zone_size;
> +
>      /* Execute the operation */
>      range.sector = sector;
>      range.nr_sectors = nr_sectors;
> @@ -2035,11 +2154,41 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
>          ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
>      } while (ret != 0 && errno == EINTR);
>      if (ret != 0) {
> +        update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> +                        aiocb->bs->bl.nr_zones);

You need only to update the range of zones that was specified for the
management operation, not all zones. So you must specify the zone
management aio offset and size/zone_size here.

>          ret = -errno;
>          error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
>                       ret);
>          return ret;
>      }
> +
> +    qemu_mutex_lock(&wps->lock);
> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +         /*
> +         * The zoned device allows the last zone smaller that the zone size.
> +         */

comment indentation is off.

> +        if (aiocb->aio_nbytes < bs->bl.zone_size) {
> +            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
> +        } else {
> +            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
> +        }
> +
> +        if (aiocb->aio_offset != wps->wp[index] &&
> +            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {

I do not understand the condition here. Why do you have
"aiocb->aio_offset != wps->wp[index]" ?

> +            if (aiocb->zone_mgmt.all) {

This is the only place where you use this all boolean field. For
simplicity, I would drop this field completely and test that
aiocb->aio_offset == 0 && aiocb->aio_nbytes == bs->bl.capacity to detect
a reset all zones operation.

> +                for (int i = 0; i < bs->bl.nr_zones; ++i) {
> +                    wps->wp[i] = i * bs->bl.zone_size;

You are not handling conventional zones here. For conventional zones,
you should not change the value. Otherwise, BDRV_ZT_IS_CONV() will
always return false after this.

> +                }
> +            } else {
> +                wps->wp[index] = aiocb->aio_offset;
> +            }
> +        } else if (aiocb->aio_offset != wps->wp[index] &&
> +            aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {

Same comment here. Why do you have "aiocb->aio_offset != wps->wp[index]" ?

> +            wps->wp[index] = wend_offset;
> +        }
> +    }
> +    qemu_mutex_unlock(&wps->lock);
> +
>      return ret;
>  }
>  #endif
> @@ -2480,6 +2629,12 @@ static void raw_close(BlockDriverState *bs)
>      BDRVRawState *s = bs->opaque;
>  
>      if (s->fd >= 0) {
> +#if defined(CONFIG_BLKZONED)
> +        if (bs->bl.wps) {
> +            qemu_mutex_destroy(&bs->bl.wps->lock);
> +            g_free(bs->bl.wps);
> +        }
> +#endif
>          qemu_close(s->fd);
>          s->fd = -1;
>      }
> @@ -3278,6 +3433,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>      int64_t zone_size, zone_size_mask;
>      const char *zone_op_name;
>      unsigned long zone_op;
> +    bool is_all = false;
>  
>      zone_size = bs->bl.zone_size;
>      zone_size_mask = zone_size - 1;
> @@ -3314,6 +3470,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>      case BLK_ZO_RESET_ALL:
>          zone_op_name = "BLKRESETZONE";
>          zone_op = BLKRESETZONE;
> +        is_all = true;
>          break;
>      default:
>          g_assert_not_reached();
> @@ -3328,6 +3485,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>          .zone_mgmt  = {
>              .zone_op = zone_op,
>              .zone_op_name = zone_op_name,
> +            .all = is_all,
>          },
>      };
>  
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index 882de6825e..b8b2dba64a 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -93,6 +93,14 @@ typedef struct BlockZoneDescriptor {
>      BlockZoneCondition cond;
>  } BlockZoneDescriptor;
>  
> +/*
> + * Track write pointers of a zone in bytes.
> + */
> +typedef struct BlockZoneWps {
> +    QemuMutex lock;
> +    uint64_t wp[];
> +} BlockZoneWps;
> +
>  typedef struct BlockDriverInfo {
>      /* in bytes, 0 if irrelevant */
>      int cluster_size;
> @@ -206,6 +214,12 @@ typedef enum {
>  #define BDRV_SECTOR_BITS   9
>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
>  
> +/*
> + * Get the first most significant bit of wp. If it is zero, then
> + * the zone type is SWR.
> + */
> +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
> +
>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
>                                             INT_MAX >> BDRV_SECTOR_BITS)
>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 37dddc603c..59c2d1316d 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
>  
>      /* device capacity expressed in bytes */
>      int64_t capacity;
> +
> +    /* array of write pointers' location of each zone in the zoned device. */
> +    BlockZoneWps *wps;
> +
> +    int64_t write_granularity;
>  } BlockLimits;
>  
>  typedef struct BdrvOpBlocker BdrvOpBlocker;

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices
  2022-10-10  2:33 ` [PATCH v3 2/3] block: introduce zone append write for zoned devices Sam Li
@ 2022-10-13  5:54   ` Damien Le Moal
  2022-10-13  7:27     ` Sam Li
  0 siblings, 1 reply; 11+ messages in thread
From: Damien Le Moal @ 2022-10-13  5:54 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: stefanha, Hanna Reitz, Kevin Wolf, dmitry.fomichev, hare,
	Fam Zheng, qemu-block

On 10/10/22 11:33, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/block-backend.c             | 64 +++++++++++++++++++++++++++++++
>  block/file-posix.c                | 64 ++++++++++++++++++++++++++++---
>  block/io.c                        | 21 ++++++++++
>  block/raw-format.c                |  7 ++++
>  include/block/block-io.h          |  3 ++
>  include/block/block_int-common.h  |  3 ++
>  include/block/raw-aio.h           |  4 +-
>  include/sysemu/block-backend-io.h |  9 +++++
>  8 files changed, 168 insertions(+), 7 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index ddc569e3ac..bfdb719bc8 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>          struct {
>              BlockZoneOp op;
>          } zone_mgmt;
> +        struct {
> +            int64_t *append_sector;

I would call this "sector", since it will always be referenced as
"->zone_append.sector", you get the "append" for free :)

That said, shouldn't this be a byte value, so called "offset" ? Not
entirely sure...

> +        } zone_append;
>      };
>  } BlkRwCo;
>  
> @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>      return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
> +                                   rwco->iobuf, rwco->flags);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +            .blk    = blk,
> +            .ret    = NOT_DONE,
> +            .flags  = flags,
> +            .iobuf  = qiov,
> +            .zone_append = {
> +                    .append_sector = offset,

See above comment. So since this is a byte value, this needs to be
called "offset", no ?

> +            },
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> +                                         blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>      return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +        QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +        blk_dec_in_flight(blk);
> +        return -ENOMEDIUM;
> +    }
> +
> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 17c0b58158..08ab164df4 100755
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>      ssize_t len;
>  
>      do {
> -        if (aiocb->aio_type & QEMU_AIO_WRITE)
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>              len = qemu_pwritev(aiocb->aio_fildes,
>                                 aiocb->io.iov,
>                                 aiocb->io.niov,

Hu... You are issuing the io for a zone append without first changing
the aiocb offset to be equal to the zone write pointer ? And you are
calling this without the wps->lock held... Changing the aio offset to be
equal to the wp && issuing the io must be atomic.

> @@ -1687,7 +1687,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
>      ssize_t len;
>  
>      while (offset < aiocb->aio_nbytes) {
> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              len = pwrite(aiocb->aio_fildes,
>                           (const char *)buf + offset,
>                           aiocb->aio_nbytes - offset,

Same comment here.

> @@ -1731,7 +1731,7 @@ static int handle_aiocb_rw(void *opaque)
>       * The offset of regular writes, append writes is the wp location
>       * of that zone.
>       */
> -    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +    if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>          if (aiocb->bs->bl.zone_size > 0) {
>              BlockZoneWps *wps = aiocb->bs->bl.wps;
>              qemu_mutex_lock(&wps->lock);
> @@ -1794,7 +1794,7 @@ static int handle_aiocb_rw(void *opaque)
>      }
>  
>      nbytes = handle_aiocb_rw_linear(aiocb, buf);
> -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
> +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
>          char *p = buf;
>          size_t count = aiocb->aio_nbytes, copy;
>          int i;
> @@ -1816,7 +1816,7 @@ static int handle_aiocb_rw(void *opaque)
>  out:
>      if (nbytes == aiocb->aio_nbytes) {
>  #if defined(CONFIG_BLKZONED)
> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              BlockZoneWps *wps = aiocb->bs->bl.wps;
>              int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
>              if (wps) {
> @@ -1828,6 +1828,11 @@ out:
>                      if (wend_offset > wps->wp[index]){
>                          wps->wp[index] = wend_offset;
>                      }
> +
> +                    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> +                        *aiocb->io.append_sector =
> +                                wps->wp[index] >> BDRV_SECTOR_BITS;
> +                    }

Same comment as last time. You must do this BEFORE the previous hunk
that updates the wp. Otherwise, you are NOT returning the position of
the written data, but the position that follows the written data...

If you do a zone append to an empty zone, the append sector you return
must be the zone start sector. You can see here that this will never be
the case unless you reverse the 2 hunks above.

>                  }
>                  qemu_mutex_unlock(&wps->lock);
>              }
> @@ -1845,7 +1850,7 @@ out:
>      } else {
>          assert(nbytes < 0);
>  #if defined(CONFIG_BLKZONED)
> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
>                              aiocb->bs->bl.nr_zones);
>          }
> @@ -3493,6 +3498,52 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>  }
>  #endif
>  
> +#if defined(CONFIG_BLKZONED)
> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> +                                           int64_t *offset,
> +                                           QEMUIOVector *qiov,
> +                                           BdrvRequestFlags flags) {
> +    BDRVRawState *s = bs->opaque;
> +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> +    int64_t iov_len = 0;
> +    int64_t len = 0;
> +    RawPosixAIOData acb;
> +
> +    if (*offset & zone_size_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    int64_t wg = bs->bl.write_granularity;
> +    int64_t wg_mask = wg - 1;
> +    for (int i = 0; i < qiov->niov; i++) {
> +       iov_len = qiov->iov[i].iov_len;
> +       if (iov_len & wg_mask) {
> +           error_report("len of IOVector[%d] %" PRId64 " is not aligned to block "
> +                        "size %" PRId64 "", i, iov_len, wg);
> +           return -EINVAL;
> +       }
> +       len += iov_len;
> +    }
> +
> +    acb = (RawPosixAIOData) {
> +        .bs = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type = QEMU_AIO_ZONE_APPEND,
> +        .aio_offset = *offset,
> +        .aio_nbytes = len,
> +        .io = {
> +                .iov = qiov->iov,
> +                .niov = qiov->niov,
> +                .append_sector = offset,
> +        },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> +}
> +#endif
> +
>  static coroutine_fn int
>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                  bool blkdev)
> @@ -4268,6 +4319,7 @@ static BlockDriver bdrv_zoned_host_device = {
>      /* zone management operations */
>      .bdrv_co_zone_report = raw_co_zone_report,
>      .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = raw_co_zone_append,
>  };
>  #endif
>  
> diff --git a/block/io.c b/block/io.c
> index e5aaa64e17..935abf2ed4 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3230,6 +3230,27 @@ out:
>      return co.ret;
>  }
>  
> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                        QEMUIOVector *qiov,
> +                        BdrvRequestFlags flags)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +            .coroutine = qemu_coroutine_self(),
> +    };
> +    IO_CODE();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_zone_append) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
>  void *qemu_blockalign(BlockDriverState *bs, size_t size)
>  {
>      IO_CODE();
> diff --git a/block/raw-format.c b/block/raw-format.c
> index b885688434..f132880c85 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -325,6 +325,12 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>      return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
>  }
>  
> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                                           QEMUIOVector *qiov,
> +                                           BdrvRequestFlags flags) {
> +    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
> +}
> +
>  static int64_t raw_getlength(BlockDriverState *bs)
>  {
>      int64_t len;
> @@ -628,6 +634,7 @@ BlockDriver bdrv_raw = {
>      .bdrv_co_pdiscard     = &raw_co_pdiscard,
>      .bdrv_co_zone_report  = &raw_co_zone_report,
>      .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = &raw_co_zone_append,
>      .bdrv_co_block_status = &raw_co_block_status,
>      .bdrv_co_copy_range_from = &raw_co_copy_range_from,
>      .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index f0cdf67d33..6a54453578 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>                                       BlockZoneDescriptor *zones);
>  int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>                                     int64_t offset, int64_t len);
> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                                     QEMUIOVector *qiov,
> +                                     BdrvRequestFlags flags);
>  
>  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 59c2d1316d..a7e7db5646 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -701,6 +701,9 @@ struct BlockDriver {
>              BlockZoneDescriptor *zones);
>      int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
>              int64_t offset, int64_t len);
> +    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
> +            int64_t *offset, QEMUIOVector *qiov,
> +            BdrvRequestFlags flags);
>  
>      /* removable device specific */
>      bool (*bdrv_is_inserted)(BlockDriverState *bs);
> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> index 3d26929cdd..f13cc1887b 100644
> --- a/include/block/raw-aio.h
> +++ b/include/block/raw-aio.h
> @@ -31,6 +31,7 @@
>  #define QEMU_AIO_TRUNCATE     0x0080
>  #define QEMU_AIO_ZONE_REPORT  0x0100
>  #define QEMU_AIO_ZONE_MGMT    0x0200
> +#define QEMU_AIO_ZONE_APPEND  0x0400
>  #define QEMU_AIO_TYPE_MASK \
>          (QEMU_AIO_READ | \
>           QEMU_AIO_WRITE | \
> @@ -41,7 +42,8 @@
>           QEMU_AIO_COPY_RANGE | \
>           QEMU_AIO_TRUNCATE  | \
>           QEMU_AIO_ZONE_REPORT | \
> -         QEMU_AIO_ZONE_MGMT)
> +         QEMU_AIO_ZONE_MGMT | \
> +         QEMU_AIO_ZONE_APPEND)
>  
>  /* AIO flags */
>  #define QEMU_AIO_MISALIGNED   0x1000
> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
> index 6835525582..33e35ae5d7 100644
> --- a/include/sysemu/block-backend-io.h
> +++ b/include/sysemu/block-backend-io.h
> @@ -51,6 +51,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
>  BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                int64_t offset, int64_t len,
>                                BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque);
>  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
>                               BlockCompletionFunc *cb, void *opaque);
>  void blk_aio_cancel_async(BlockAIOCB *acb);
> @@ -172,6 +175,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                    int64_t offset, int64_t len);
>  int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                         int64_t offset, int64_t len);
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +                                    QEMUIOVector *qiov,
> +                                    BdrvRequestFlags flags);
> +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
> +                                         QEMUIOVector *qiov,
> +                                         BdrvRequestFlags flags);
>  
>  int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
>                                        int64_t bytes);

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers
  2022-10-13  5:13   ` Damien Le Moal
@ 2022-10-13  7:08     ` Sam Li
  2022-10-13  7:30       ` Damien Le Moal
  0 siblings, 1 reply; 11+ messages in thread
From: Sam Li @ 2022-10-13  7:08 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: qemu-devel, stefanha, Hanna Reitz, Kevin Wolf, dmitry.fomichev,
	hare, Fam Zheng, qemu-block

Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月13日周四 13:13写道：
>
> On 10/10/22 11:33, Sam Li wrote:
> > Since Linux doesn't have a user API to issue zone append operations to
> > zoned devices from user space, the file-posix driver is modified to add
> > zone append emulation using regular writes. To do this, the file-posix
> > driver tracks the wp location of all zones of the device. It uses an
> > array of uint64_t. The most significant bit of each wp location indicates
> > if the zone type is conventional zones.
> >
> > The zones wp can be changed due to the following operations issued:
> > - zone reset: change the wp to the start offset of that zone
> > - zone finish: change to the end location of that zone
> > - write to a zone
> > - zone append
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  block/file-posix.c               | 158 +++++++++++++++++++++++++++++++
> >  include/block/block-common.h     |  14 +++
> >  include/block/block_int-common.h |   5 +
> >  3 files changed, 177 insertions(+)
> >
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index a9d347292e..17c0b58158 100755
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -206,6 +206,7 @@ typedef struct RawPosixAIOData {
> >          struct {
> >              struct iovec *iov;
> >              int niov;
> > +            int64_t *append_sector;
>
> This should be added as part of patch 2. You do not need this to track
> the wp of zones in this patch.
>
> >          } io;
> >          struct {
> >              uint64_t cmd;
> > @@ -226,6 +227,7 @@ typedef struct RawPosixAIOData {
> >          struct {
> >              unsigned long zone_op;
> >              const char *zone_op_name;
> > +            bool all;
> >          } zone_mgmt;
> >      };
> >  } RawPosixAIOData;
> > @@ -1331,6 +1333,67 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
> >  #endif
> >  }
> >
> > +#if defined(CONFIG_BLKZONED)
> > +static int get_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
>
> Nit: It would seem more natural to have the fd argument first...
>
> > +                        unsigned int nrz) {
> > +    struct blk_zone *blkz;
> > +    int64_t rep_size;
> > +    int64_t sector = offset >> BDRV_SECTOR_BITS;
> > +    int ret, n = 0, i = 0;
> > +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> > +    g_autofree struct blk_zone_report *rep = NULL;
> > +
> > +    rep = g_malloc(rep_size);
> > +    blkz = (struct blk_zone *)(rep + 1);
> > +    while (n < nrz) {
> > +        memset(rep, 0, rep_size);
> > +        rep->sector = sector;
> > +        rep->nr_zones = nrz - n;
> > +
> > +        do {
> > +            ret = ioctl(fd, BLKREPORTZONE, rep);
> > +        } while (ret != 0 && errno == EINTR);
> > +        if (ret != 0) {
> > +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> > +                    fd, offset, errno);
> > +            return -errno;
> > +        }
> > +
> > +        if (!rep->nr_zones) {
> > +            break;
> > +        }
> > +
> > +        for (i = 0; i < rep->nr_zones; i++, n++) {
> > +            /*
> > +             * The wp tracking cares only about sequential writes required and
> > +             * sequential write preferred zones so that the wp can advance to
> > +             * the right location.
> > +             * Use the most significant bit of the wp location to indicate the
> > +             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> > +             */
> > +            if (!(blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL)) {
>
> Double negation... This can simply be:
>
> if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
>
> > +                wps->wp[i] += 1ULL << 63;
>
> No need for the += here. This can be "=".
>
> > +            } else {
> > +                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> > +            }
> > +        }
> > +        sector = blkz[i-1].start + blkz[i-1].len;
>
> spaces missing around the "-" in the "i-1" expressions.
>
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static void update_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
>
> Same nit as above: fd being the first argument would be a little more
> natural in my opinion.
>
> > +                            unsigned int nrz) {
> > +    qemu_mutex_lock(&wps->lock);
> > +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> > +        error_report("report zone wp failed");
> > +        return;
>
> You are leaking the lock here. Remove the return. Also, given that
> get_zones_wp() already prints a message if report fails, I do not think
> the message here is useful.
>
> Also, why is this function void typed ? How can the caller know if the
> update succeeded or not ?

An update failure means get_zones_wp() failed, and that is already reported
by error_report(). The message here should indicate that the update failed,
not that the report failed. Maybe changing the message is sufficient for
error checking?

+    qemu_mutex_lock(&wps->lock);
+    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
+        error_report("update zone wp failed");
+    }
+    qemu_mutex_unlock(&wps->lock);


>
> > +    }
> > +    qemu_mutex_unlock(&wps->lock);
> > +}
> > +#endif
> > +
> >  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >  {
> >      BDRVRawState *s = bs->opaque;
> > @@ -1414,6 +1477,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
> >              return;
> >          }
> > +
> > +        ret = get_sysfs_long_val(&st, "physical_block_size");
> > +        if (ret >= 0) {
> > +            bs->bl.write_granularity = ret;
> > +        }
>
> Why is this change here ? Shouldn't this be part of the previous series
> "Add support for zoned device" ?

Because only zone append uses write_granularity to check the iovector
size alignment. The previous series doesn't use this field.

>
> > +
> > +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> > +        if (get_zones_wp(0, s->fd, bs->bl.wps, ret) < 0){
> > +            error_report("report wps failed");
> > +            g_free(bs->bl.wps);
> > +            return;
> > +        }
> > +        qemu_mutex_init(&bs->bl.wps->lock);
> >      }
> >  }
> >
> > @@ -1651,6 +1727,20 @@ static int handle_aiocb_rw(void *opaque)
> >      ssize_t nbytes;
> >      char *buf;
> >
> > +    /*
> > +     * The offset of regular writes, append writes is the wp location
> > +     * of that zone.
> > +     */
> > +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +        if (aiocb->bs->bl.zone_size > 0) {
> > +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> > +            qemu_mutex_lock(&wps->lock);
> > +            aiocb->aio_offset = wps->wp[aiocb->aio_offset /
> > +                                        aiocb->bs->bl.zone_size];
> > +            qemu_mutex_unlock(&wps->lock);
> > +        }
>
> I do not understand this hunk at all. What is this trying to do ? zone
> append support goes into patch 2. You are overwriting the user
> specified aio offset using the tracked wp value. That could result in a
> successful write even if the user sent an unaligned write command. That
> is bad.

Ok, regular writes and append writes got mixed up when I changed the
offset to the wp of that zone.

>
> Here you should only be tracking the write pointer, so increment
> wps->wp[index], which you do below.

Understood. Will move it to the next patch.

>
> > +    }
> > +
> >      if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
> >          /*
> >           * If there is just a single buffer, and it is properly aligned
> > @@ -1725,6 +1815,24 @@ static int handle_aiocb_rw(void *opaque)
> >
> >  out:
> >      if (nbytes == aiocb->aio_nbytes) {
> > +#if defined(CONFIG_BLKZONED)
> > +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> > +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> > +            if (wps) {
> > +                qemu_mutex_lock(&wps->lock);
> > +                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> > +                    uint64_t wend_offset =
> > +                            aiocb->aio_offset + aiocb->aio_nbytes;
> > +                    /* Advance the wp if needed */
> > +                    if (wend_offset > wps->wp[index]){
> > +                        wps->wp[index] = wend_offset;
> > +                    }
> > +                }
> > +                qemu_mutex_unlock(&wps->lock);
> > +            }
> > +        }
> > +#endif
> >          return 0;
> >      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
> >          if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > @@ -1736,6 +1844,12 @@ out:
> >          }
> >      } else {
> >          assert(nbytes < 0);
> > +#if defined(CONFIG_BLKZONED)
> > +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +            update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> > +                            aiocb->bs->bl.nr_zones);
>
> You only need to update the target zone of the aio, not all zones.
> Updating all zones is actually a bug as wp[] entries for other zones may
> be larger than the device reported wp if there are other write aios in
> flight. So the last argument must be "1" here.

Ok, I understood now.

>
> > +        }
> > +#endif
> >          return nbytes;
> >      }
> >  }
> > @@ -2022,12 +2136,17 @@ static int handle_aiocb_zone_report(void *opaque) {
> >  #if defined(CONFIG_BLKZONED)
> >  static int handle_aiocb_zone_mgmt(void *opaque) {
> >      RawPosixAIOData *aiocb = opaque;
> > +    BlockDriverState *bs = aiocb->bs;
> >      int fd = aiocb->aio_fildes;
> >      int64_t sector = aiocb->aio_offset / 512;
> >      int64_t nr_sectors = aiocb->aio_nbytes / 512;
> > +    uint64_t wend_offset;
> >      struct blk_zone_range range;
> >      int ret;
> >
>
> Why the blank line here ?

For readability, separate it from the execution part.

>
> > +    BlockZoneWps *wps = bs->bl.wps;
> > +    int index = aiocb->aio_offset / bs->bl.zone_size;
> > +
> >      /* Execute the operation */
> >      range.sector = sector;
> >      range.nr_sectors = nr_sectors;
> > @@ -2035,11 +2154,41 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
> >          ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
> >      } while (ret != 0 && errno == EINTR);
> >      if (ret != 0) {
> > +        update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> > +                        aiocb->bs->bl.nr_zones);
>
> You need only to update the range of zones that was specified for the
> management option, not all zones. So you must specify the zone
> management aio offset and size/zone_size here.
>
> >          ret = -errno;
> >          error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
> >                       ret);
> >          return ret;
> >      }
> > +
> > +    qemu_mutex_lock(&wps->lock);
> > +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> > +         /*
> > +         * The zoned device allows the last zone smaller that the zone size.
> > +         */
>
> comment indentation is off.
>
> > +        if (aiocb->aio_nbytes < bs->bl.zone_size) {
> > +            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
> > +        } else {
> > +            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
> > +        }
> > +
> > +        if (aiocb->aio_offset != wps->wp[index] &&
> > +            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
>
> I do not understand the condition here. Why do you have
> "aiocb->aio_offset != wps->wp[index]" ?

It is intended for zone state checks. aio_offset (= start byte of that
zone) = wp means this zone is empty. Only non-empty zones will be
reset.

>
> > +            if (aiocb->zone_mgmt.all) {
>
> This is the only place where you use this all boolean field. For
> simplicity, I would drop this field completely and test that
> aiocb->aio_offset == 0 && aiocb->aio_nbytes == bs->bl.capacity to detect
> a reset all zones operation.

Right, the capacity field makes it possible. I'll drop it.

>
> > +                for (int i = 0; i < bs->bl.nr_zones; ++i) {
> > +                    wps->wp[i] = i * bs->bl.zone_size;
>
> You are not handling conventional zones here. For conventional zones,
> you should not change the value. Otherwise, BDRV_ZT_IS_CONV() will
> always return false after this.

Right, will add a condition line here:
+ if (! BDRV_ZT_IS_CONV(wps->wp[i]))

>
> > +                }
> > +            } else {
> > +                wps->wp[index] = aiocb->aio_offset;
> > +            }
> > +        } else if (aiocb->aio_offset != wps->wp[index] &&
> > +            aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
>
> Same comment here. Why do you have "aiocb->aio_offset != wps->wp[index]" ?

This should be wend_offset != wps->wp[index]. It means if this zone is
full, no need to finish it.

>
> > +            wps->wp[index] = wend_offset;
> > +        }
> > +    }
> > +    qemu_mutex_unlock(&wps->lock);
> > +
> >      return ret;
> >  }
> >  #endif
> > @@ -2480,6 +2629,12 @@ static void raw_close(BlockDriverState *bs)
> >      BDRVRawState *s = bs->opaque;
> >
> >      if (s->fd >= 0) {
> > +#if defined(CONFIG_BLKZONED)
> > +        if (bs->bl.wps) {
> > +            qemu_mutex_destroy(&bs->bl.wps->lock);
> > +            g_free(bs->bl.wps);
> > +        }
> > +#endif
> >          qemu_close(s->fd);
> >          s->fd = -1;
> >      }
> > @@ -3278,6 +3433,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >      int64_t zone_size, zone_size_mask;
> >      const char *zone_op_name;
> >      unsigned long zone_op;
> > +    bool is_all = false;
> >
> >      zone_size = bs->bl.zone_size;
> >      zone_size_mask = zone_size - 1;
> > @@ -3314,6 +3470,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >      case BLK_ZO_RESET_ALL:
> >          zone_op_name = "BLKRESETZONE";
> >          zone_op = BLKRESETZONE;
> > +        is_all = true;
> >          break;
> >      default:
> >          g_assert_not_reached();
> > @@ -3328,6 +3485,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >          .zone_mgmt  = {
> >              .zone_op = zone_op,
> >              .zone_op_name = zone_op_name,
> > +            .all = is_all,
> >          },
> >      };
> >
> > diff --git a/include/block/block-common.h b/include/block/block-common.h
> > index 882de6825e..b8b2dba64a 100644
> > --- a/include/block/block-common.h
> > +++ b/include/block/block-common.h
> > @@ -93,6 +93,14 @@ typedef struct BlockZoneDescriptor {
> >      BlockZoneCondition cond;
> >  } BlockZoneDescriptor;
> >
> > +/*
> > + * Track write pointers of a zone in bytes.
> > + */
> > +typedef struct BlockZoneWps {
> > +    QemuMutex lock;
> > +    uint64_t wp[];
> > +} BlockZoneWps;
> > +
> >  typedef struct BlockDriverInfo {
> >      /* in bytes, 0 if irrelevant */
> >      int cluster_size;
> > @@ -206,6 +214,12 @@ typedef enum {
> >  #define BDRV_SECTOR_BITS   9
> >  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
> >
> > +/*
> > + * Get the first most significant bit of wp. If it is zero, then
> > + * the zone type is SWR.
> > + */
> > +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
> > +
> >  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
> >                                             INT_MAX >> BDRV_SECTOR_BITS)
> >  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
> > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> > index 37dddc603c..59c2d1316d 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -857,6 +857,11 @@ typedef struct BlockLimits {
> >
> >      /* device capacity expressed in bytes */
> >      int64_t capacity;
> > +
> > +    /* array of write pointers' location of each zone in the zoned device. */
> > +    BlockZoneWps *wps;
> > +
> > +    int64_t write_granularity;
> >  } BlockLimits;
> >
> >  typedef struct BdrvOpBlocker BdrvOpBlocker;
>
> --
> Damien Le Moal
> Western Digital Research
>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices
  2022-10-13  5:54   ` Damien Le Moal
@ 2022-10-13  7:27     ` Sam Li
  2022-10-13  7:45       ` Damien Le Moal
  0 siblings, 1 reply; 11+ messages in thread
From: Sam Li @ 2022-10-13  7:27 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: qemu-devel, stefanha, Hanna Reitz, Kevin Wolf, dmitry.fomichev,
	hare, Fam Zheng, qemu-block

Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月13日周四 13:55写道：
>
> On 10/10/22 11:33, Sam Li wrote:
> > A zone append command is a write operation that specifies the first
> > logical block of a zone as the write position. When writing to a zoned
> > block device using zone append, the byte offset of writes is pointing
> > to the write pointer of that zone. Upon completion the device will
> > respond with the position the data has been written in the zone.
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  block/block-backend.c             | 64 +++++++++++++++++++++++++++++++
> >  block/file-posix.c                | 64 ++++++++++++++++++++++++++++---
> >  block/io.c                        | 21 ++++++++++
> >  block/raw-format.c                |  7 ++++
> >  include/block/block-io.h          |  3 ++
> >  include/block/block_int-common.h  |  3 ++
> >  include/block/raw-aio.h           |  4 +-
> >  include/sysemu/block-backend-io.h |  9 +++++
> >  8 files changed, 168 insertions(+), 7 deletions(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index ddc569e3ac..bfdb719bc8 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
> >          struct {
> >              BlockZoneOp op;
> >          } zone_mgmt;
> > +        struct {
> > +            int64_t *append_sector;
>
> I would call this "sector", since it will always be referenced as
> "->zone_append.sector", you get the "append" for free :)
>
> That said, shouldn't this be a byte value, so called "offset" ? Not
> entirely sure...

Yes, it can be changed to "offset"(byte) following QEMU's convention.
Just need to add conversions to virtio_blk_zone_append/*_complete,
which is easily done.

>
> > +        } zone_append;
> >      };
> >  } BlkRwCo;
> >
> > @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >      return &acb->common;
> >  }
> >
> > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
> > +    BlkAioEmAIOCB *acb = opaque;
> > +    BlkRwCo *rwco = &acb->rwco;
> > +
> > +    rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
> > +                                   rwco->iobuf, rwco->flags);
> > +    blk_aio_complete(acb);
> > +}
> > +
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> > +                                BlockCompletionFunc *cb, void *opaque) {
> > +    BlkAioEmAIOCB *acb;
> > +    Coroutine *co;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> > +    acb->rwco = (BlkRwCo) {
> > +            .blk    = blk,
> > +            .ret    = NOT_DONE,
> > +            .flags  = flags,
> > +            .iobuf  = qiov,
> > +            .zone_append = {
> > +                    .append_sector = offset,
>
> See above comment. So since this is a byte value, this needs to be
> called "offset", no ?

Yes, same answers above.

>
> > +            },
> > +    };
> > +    acb->has_returned = false;
> > +
> > +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> > +    bdrv_coroutine_enter(blk_bs(blk), co);
> > +    acb->has_returned = true;
> > +    if (acb->rwco.ret != NOT_DONE) {
> > +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> > +                                         blk_aio_complete_bh, acb);
> > +    }
> > +
> > +    return &acb->common;
> > +}
> > +
> >  /*
> >   * Send a zone_report command.
> >   * offset is a byte offset from the start of the device. No alignment
> > @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >      return ret;
> >  }
> >
> > +/*
> > + * Send a zone_append command.
> > + */
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > +        QEMUIOVector *qiov, BdrvRequestFlags flags)
> > +{
> > +    int ret;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    blk_wait_while_drained(blk);
> > +    if (!blk_is_available(blk)) {
> > +        blk_dec_in_flight(blk);
> > +        return -ENOMEDIUM;
> > +    }
> > +
> > +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> >  void blk_drain(BlockBackend *blk)
> >  {
> >      BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 17c0b58158..08ab164df4 100755
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
> >      ssize_t len;
> >
> >      do {
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE)
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
> >              len = qemu_pwritev(aiocb->aio_fildes,
> >                                 aiocb->io.iov,
> >                                 aiocb->io.niov,
>
> Hu... You are issuing the io for a zone append without first changing
> the aiocb offset to be equal to the zone write pointer ? And you are

It changed in the last patch. But it should be in this patch and make
it specific to zone_append case, like:
if (type & QEMU_AIO_ZONE_APPEND) {
    /* change offset here */
}

> calling this without the wps->lock held... Changing the aio offset to be
> equal to the wp && issuing the io must be atomic.

Does this mean putting locks around pwritev()? Like:
lock(wp);
len = pwritev();
unlock(wp);

Because it is accessing wps[] by offset, which is a wp location. And
when pwritev()  executes, the offset should not be changed by other
ios in flight.

>
> > @@ -1687,7 +1687,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
> >      ssize_t len;
> >
> >      while (offset < aiocb->aio_nbytes) {
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >              len = pwrite(aiocb->aio_fildes,
> >                           (const char *)buf + offset,
> >                           aiocb->aio_nbytes - offset,
>
> Same comment here.
>
> > @@ -1731,7 +1731,7 @@ static int handle_aiocb_rw(void *opaque)
> >       * The offset of regular writes, append writes is the wp location
> >       * of that zone.
> >       */
> > -    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +    if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >          if (aiocb->bs->bl.zone_size > 0) {
> >              BlockZoneWps *wps = aiocb->bs->bl.wps;
> >              qemu_mutex_lock(&wps->lock);
> > @@ -1794,7 +1794,7 @@ static int handle_aiocb_rw(void *opaque)
> >      }
> >
> >      nbytes = handle_aiocb_rw_linear(aiocb, buf);
> > -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
> > +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
> >          char *p = buf;
> >          size_t count = aiocb->aio_nbytes, copy;
> >          int i;
> > @@ -1816,7 +1816,7 @@ static int handle_aiocb_rw(void *opaque)
> >  out:
> >      if (nbytes == aiocb->aio_nbytes) {
> >  #if defined(CONFIG_BLKZONED)
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >              BlockZoneWps *wps = aiocb->bs->bl.wps;
> >              int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> >              if (wps) {
> > @@ -1828,6 +1828,11 @@ out:
> >                      if (wend_offset > wps->wp[index]){
> >                          wps->wp[index] = wend_offset;
> >                      }
> > +
> > +                    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> > +                        *aiocb->io.append_sector =
> > +                                wps->wp[index] >> BDRV_SECTOR_BITS;
> > +                    }
>
> Same comment as last time. You must do this BEFORE the previous hunk
> that updates the wp. Otherwise, you are NOT returning the position of
> the written data, but the position that follows the written data...
>
> If you do a zone append to an empty zone, the append sector you return
> must be the zone start sector. You can see here that this will never be
> the case unless you reverse the 2 hunks above.

You are right. I mistakenly thought the append sector should be the end
sector location.

+Upon a successful completion of a VIRTIO_BLK_T_ZONE_APPEND request, the driver
+MAY read the starting sector location of the written data from the request
+field \field{append_sector}.

>
> >                  }
> >                  qemu_mutex_unlock(&wps->lock);
> >              }
> > @@ -1845,7 +1850,7 @@ out:
> >      } else {
> >          assert(nbytes < 0);
> >  #if defined(CONFIG_BLKZONED)
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >              update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> >                              aiocb->bs->bl.nr_zones);
> >          }
> > @@ -3493,6 +3498,52 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >  }
> >  #endif
> >
> > +#if defined(CONFIG_BLKZONED)
> > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> > +                                           int64_t *offset,
> > +                                           QEMUIOVector *qiov,
> > +                                           BdrvRequestFlags flags) {
> > +    BDRVRawState *s = bs->opaque;
> > +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> > +    int64_t iov_len = 0;
> > +    int64_t len = 0;
> > +    RawPosixAIOData acb;
> > +
> > +    if (*offset & zone_size_mask) {
> > +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> > +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> > +        return -EINVAL;
> > +    }
> > +
> > +    int64_t wg = bs->bl.write_granularity;
> > +    int64_t wg_mask = wg - 1;
> > +    for (int i = 0; i < qiov->niov; i++) {
> > +       iov_len = qiov->iov[i].iov_len;
> > +       if (iov_len & wg_mask) {
> > +           error_report("len of IOVector[%d] %" PRId64 " is not aligned to block "
> > +                        "size %" PRId64 "", i, iov_len, wg);
> > +           return -EINVAL;
> > +       }
> > +       len += iov_len;
> > +    }
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs = bs,
> > +        .aio_fildes = s->fd,
> > +        .aio_type = QEMU_AIO_ZONE_APPEND,
> > +        .aio_offset = *offset,
> > +        .aio_nbytes = len,
> > +        .io = {
> > +                .iov = qiov->iov,
> > +                .niov = qiov->niov,
> > +                .append_sector = offset,
> > +        },
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> > +}
> > +#endif
> > +
> >  static coroutine_fn int
> >  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >                  bool blkdev)
> > @@ -4268,6 +4319,7 @@ static BlockDriver bdrv_zoned_host_device = {
> >      /* zone management operations */
> >      .bdrv_co_zone_report = raw_co_zone_report,
> >      .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > +    .bdrv_co_zone_append = raw_co_zone_append,
> >  };
> >  #endif
> >
> > diff --git a/block/io.c b/block/io.c
> > index e5aaa64e17..935abf2ed4 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -3230,6 +3230,27 @@ out:
> >      return co.ret;
> >  }
> >
> > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> > +                        QEMUIOVector *qiov,
> > +                        BdrvRequestFlags flags)
> > +{
> > +    BlockDriver *drv = bs->drv;
> > +    CoroutineIOCompletion co = {
> > +            .coroutine = qemu_coroutine_self(),
> > +    };
> > +    IO_CODE();
> > +
> > +    bdrv_inc_in_flight(bs);
> > +    if (!drv || !drv->bdrv_co_zone_append) {
> > +        co.ret = -ENOTSUP;
> > +        goto out;
> > +    }
> > +    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
> > +out:
> > +    bdrv_dec_in_flight(bs);
> > +    return co.ret;
> > +}
> > +
> >  void *qemu_blockalign(BlockDriverState *bs, size_t size)
> >  {
> >      IO_CODE();
> > diff --git a/block/raw-format.c b/block/raw-format.c
> > index b885688434..f132880c85 100644
> > --- a/block/raw-format.c
> > +++ b/block/raw-format.c
> > @@ -325,6 +325,12 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >      return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
> >  }
> >
> > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, int64_t *offset,
> > +                                           QEMUIOVector *qiov,
> > +                                           BdrvRequestFlags flags) {
> > +    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
> > +}
> > +
> >  static int64_t raw_getlength(BlockDriverState *bs)
> >  {
> >      int64_t len;
> > @@ -628,6 +634,7 @@ BlockDriver bdrv_raw = {
> >      .bdrv_co_pdiscard     = &raw_co_pdiscard,
> >      .bdrv_co_zone_report  = &raw_co_zone_report,
> >      .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
> > +    .bdrv_co_zone_append = &raw_co_zone_append,
> >      .bdrv_co_block_status = &raw_co_block_status,
> >      .bdrv_co_copy_range_from = &raw_co_copy_range_from,
> >      .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
> > diff --git a/include/block/block-io.h b/include/block/block-io.h
> > index f0cdf67d33..6a54453578 100644
> > --- a/include/block/block-io.h
> > +++ b/include/block/block-io.h
> > @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
> >                                       BlockZoneDescriptor *zones);
> >  int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >                                     int64_t offset, int64_t len);
> > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> > +                                     QEMUIOVector *qiov,
> > +                                     BdrvRequestFlags flags);
> >
> >  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
> >  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> > index 59c2d1316d..a7e7db5646 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -701,6 +701,9 @@ struct BlockDriver {
> >              BlockZoneDescriptor *zones);
> >      int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
> >              int64_t offset, int64_t len);
> > +    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
> > +            int64_t *offset, QEMUIOVector *qiov,
> > +            BdrvRequestFlags flags);
> >
> >      /* removable device specific */
> >      bool (*bdrv_is_inserted)(BlockDriverState *bs);
> > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> > index 3d26929cdd..f13cc1887b 100644
> > --- a/include/block/raw-aio.h
> > +++ b/include/block/raw-aio.h
> > @@ -31,6 +31,7 @@
> >  #define QEMU_AIO_TRUNCATE     0x0080
> >  #define QEMU_AIO_ZONE_REPORT  0x0100
> >  #define QEMU_AIO_ZONE_MGMT    0x0200
> > +#define QEMU_AIO_ZONE_APPEND  0x0400
> >  #define QEMU_AIO_TYPE_MASK \
> >          (QEMU_AIO_READ | \
> >           QEMU_AIO_WRITE | \
> > @@ -41,7 +42,8 @@
> >           QEMU_AIO_COPY_RANGE | \
> >           QEMU_AIO_TRUNCATE  | \
> >           QEMU_AIO_ZONE_REPORT | \
> > -         QEMU_AIO_ZONE_MGMT)
> > +         QEMU_AIO_ZONE_MGMT | \
> > +         QEMU_AIO_ZONE_APPEND)
> >
> >  /* AIO flags */
> >  #define QEMU_AIO_MISALIGNED   0x1000
> > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
> > index 6835525582..33e35ae5d7 100644
> > --- a/include/sysemu/block-backend-io.h
> > +++ b/include/sysemu/block-backend-io.h
> > @@ -51,6 +51,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> >  BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >                                int64_t offset, int64_t len,
> >                                BlockCompletionFunc *cb, void *opaque);
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> > +                                BlockCompletionFunc *cb, void *opaque);
> >  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
> >                               BlockCompletionFunc *cb, void *opaque);
> >  void blk_aio_cancel_async(BlockAIOCB *acb);
> > @@ -172,6 +175,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >                                    int64_t offset, int64_t len);
> >  int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >                                         int64_t offset, int64_t len);
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > +                                    QEMUIOVector *qiov,
> > +                                    BdrvRequestFlags flags);
> > +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
> > +                                         QEMUIOVector *qiov,
> > +                                         BdrvRequestFlags flags);
> >
> >  int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
> >                                        int64_t bytes);
>
> --
> Damien Le Moal
> Western Digital Research
>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers
  2022-10-13  7:08     ` Sam Li
@ 2022-10-13  7:30       ` Damien Le Moal
  2022-10-13  7:46         ` Sam Li
  0 siblings, 1 reply; 11+ messages in thread
From: Damien Le Moal @ 2022-10-13  7:30 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, stefanha, Hanna Reitz, Kevin Wolf, dmitry.fomichev,
	hare, Fam Zheng, qemu-block

On 2022/10/13 16:08, Sam Li wrote:
> Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月13日周四 13:13写道：
>>
>> On 10/10/22 11:33, Sam Li wrote:
>>> Since Linux doesn't have a user API to issue zone append operations to
>>> zoned devices from user space, the file-posix driver is modified to add
>>> zone append emulation using regular writes. To do this, the file-posix
>>> driver tracks the wp location of all zones of the device. It uses an
>>> array of uint64_t. The most significant bit of each wp location indicates
>>> if the zone is a conventional zone.
>>>
>>> The zones wp can be changed due to the following operations issued:
>>> - zone reset: change the wp to the start offset of that zone
>>> - zone finish: change to the end location of that zone
>>> - write to a zone
>>> - zone append
>>>
>>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>>> ---
>>>  block/file-posix.c               | 158 +++++++++++++++++++++++++++++++
>>>  include/block/block-common.h     |  14 +++
>>>  include/block/block_int-common.h |   5 +
>>>  3 files changed, 177 insertions(+)
>>>
>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>> index a9d347292e..17c0b58158 100755
>>> --- a/block/file-posix.c
>>> +++ b/block/file-posix.c
>>> @@ -206,6 +206,7 @@ typedef struct RawPosixAIOData {
>>>          struct {
>>>              struct iovec *iov;
>>>              int niov;
>>> +            int64_t *append_sector;
>>
>> This should be added as part of patch 2. You do not need this to track
>> the wp of zones in this patch.
>>
>>>          } io;
>>>          struct {
>>>              uint64_t cmd;
>>> @@ -226,6 +227,7 @@ typedef struct RawPosixAIOData {
>>>          struct {
>>>              unsigned long zone_op;
>>>              const char *zone_op_name;
>>> +            bool all;
>>>          } zone_mgmt;
>>>      };
>>>  } RawPosixAIOData;
>>> @@ -1331,6 +1333,67 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
>>>  #endif
>>>  }
>>>
>>> +#if defined(CONFIG_BLKZONED)
>>> +static int get_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
>>
>> Nit: It would seem more natural to have the fd argument first...
>>
>>> +                        unsigned int nrz) {
>>> +    struct blk_zone *blkz;
>>> +    int64_t rep_size;
>>> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
>>> +    int ret, n = 0, i = 0;
>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>>> +    g_autofree struct blk_zone_report *rep = NULL;
>>> +
>>> +    rep = g_malloc(rep_size);
>>> +    blkz = (struct blk_zone *)(rep + 1);
>>> +    while (n < nrz) {
>>> +        memset(rep, 0, rep_size);
>>> +        rep->sector = sector;
>>> +        rep->nr_zones = nrz - n;
>>> +
>>> +        do {
>>> +            ret = ioctl(fd, BLKREPORTZONE, rep);
>>> +        } while (ret != 0 && errno == EINTR);
>>> +        if (ret != 0) {
>>> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
>>> +                    fd, offset, errno);
>>> +            return -errno;
>>> +        }
>>> +
>>> +        if (!rep->nr_zones) {
>>> +            break;
>>> +        }
>>> +
>>> +        for (i = 0; i < rep->nr_zones; i++, n++) {
>>> +            /*
>>> +             * The wp tracking cares only about sequential writes required and
>>> +             * sequential write preferred zones so that the wp can advance to
>>> +             * the right location.
>>> +             * Use the most significant bit of the wp location to indicate the
>>> +             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
>>> +             */
>>> +            if (!(blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL)) {
>>
>> Double negation... This can simply be:
>>
>> if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
>>
>>> +                wps->wp[i] += 1ULL << 63;
>>
>> No need for the += here. This can be "=".
>>
>>> +            } else {
>>> +                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
>>> +            }
>>> +        }
>>> +        sector = blkz[i-1].start + blkz[i-1].len;
>>
>> spaces missing around the "-" in the "i-1" expressions.
>>
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void update_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
>>
>> Same nit as above: fd being the first argument would be a little more
>> natural in my opinion.
>>
>>> +                            unsigned int nrz) {
>>> +    qemu_mutex_lock(&wps->lock);
>>> +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
>>> +        error_report("report zone wp failed");
>>> +        return;
>>
>> You are leaking the lock here. Remove the return. Also, given that
>> get_zones_wp() already prints a message if report fails, I do not think
>> the message here is useful.
>>
>> Also, why is this function void typed ? How can the caller know if the
>> update succeeded or not ?
> 
> Update failures mean get_zones_wp() fails and that will be reported by
> error_report. The error message indicates updates fail not reports
> fail. Maybe modifying the message suffices for error checking?
> 
> +    qemu_mutex_lock(&wps->lock);
> +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> +        error_report("update zone wp failed");
> +    }
> +    qemu_mutex_unlock(&wps->lock);
> 
> 
>>
>>> +    }
>>> +    qemu_mutex_unlock(&wps->lock);
>>> +}
>>> +#endif
>>> +
>>>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>>  {
>>>      BDRVRawState *s = bs->opaque;
>>> @@ -1414,6 +1477,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>>              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
>>>              return;
>>>          }
>>> +
>>> +        ret = get_sysfs_long_val(&st, "physical_block_size");
>>> +        if (ret >= 0) {
>>> +            bs->bl.write_granularity = ret;
>>> +        }
>>
>> Why is this change here ? Shouldn't this be part of the previous series
>> "Add support for zoned device" ?
> 
> Because only zone append uses write_granularity to check the iovector
> size alignment. The previous series doesn't use this field.

Then move this to patch 2. This should not be in this patch since you are not
dealing with zone append yet.

> 
>>
>>> +
>>> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>>> +        if (get_zones_wp(0, s->fd, bs->bl.wps, ret) < 0){
>>> +            error_report("report wps failed");
>>> +            g_free(bs->bl.wps);
>>> +            return;
>>> +        }
>>> +        qemu_mutex_init(&bs->bl.wps->lock);
>>>      }
>>>  }
>>>
>>> @@ -1651,6 +1727,20 @@ static int handle_aiocb_rw(void *opaque)
>>>      ssize_t nbytes;
>>>      char *buf;
>>>
>>> +    /*
>>> +     * The offset of regular writes, append writes is the wp location
>>> +     * of that zone.
>>> +     */
>>> +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> +        if (aiocb->bs->bl.zone_size > 0) {
>>> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
>>> +            qemu_mutex_lock(&wps->lock);
>>> +            aiocb->aio_offset = wps->wp[aiocb->aio_offset /
>>> +                                        aiocb->bs->bl.zone_size];
>>> +            qemu_mutex_unlock(&wps->lock);
>>> +        }
>>
>> I do not understand this hunk at all. What is this trying to do ? zone
>> append support goes into patch 2. You are overwriting the user
>> specified aio offset using the tracked wp value. That could result in a
>> successful write even if the user sent an unaligned write command. That
>> is bad.
> 
> Ok, regular writes and append writes got mixed up when I changed the
> offset to the wp of that zone.
> 
>>
>> Here you should only be tracking the write pointer, so increment
>> wps->wp[index], which you do below.
> 
> Understood. Will move it to the next patch.

No ! You should not change the aio offset for regular writes. Otherwise you may
hide errors for bad commands from the guest by having them succeed :)
aio offset change should be done ONLY for zone append, not for regular writes.

> 
>>
>>> +    }
>>> +
>>>      if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
>>>          /*
>>>           * If there is just a single buffer, and it is properly aligned
>>> @@ -1725,6 +1815,24 @@ static int handle_aiocb_rw(void *opaque)
>>>
>>>  out:
>>>      if (nbytes == aiocb->aio_nbytes) {
>>> +#if defined(CONFIG_BLKZONED)
>>> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
>>> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
>>> +            if (wps) {
>>> +                qemu_mutex_lock(&wps->lock);
>>> +                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
>>> +                    uint64_t wend_offset =
>>> +                            aiocb->aio_offset + aiocb->aio_nbytes;
>>> +                    /* Advance the wp if needed */
>>> +                    if (wend_offset > wps->wp[index]){
>>> +                        wps->wp[index] = wend_offset;
>>> +                    }
>>> +                }
>>> +                qemu_mutex_unlock(&wps->lock);
>>> +            }
>>> +        }
>>> +#endif
>>>          return 0;
>>>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
>>>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> @@ -1736,6 +1844,12 @@ out:
>>>          }
>>>      } else {
>>>          assert(nbytes < 0);
>>> +#if defined(CONFIG_BLKZONED)
>>> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> +            update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
>>> +                            aiocb->bs->bl.nr_zones);
>>
>> You only need to update the target zone of the aio, not all zones.
>> Updating all zones is actually a bug as wp[] entries for other zones may
>> be larger than the device reported wp if there are other write aios in
>> flight. So the last argument must be "1" here.
> 
> Ok, I understood now.
> 
>>
>>> +        }
>>> +#endif
>>>          return nbytes;
>>>      }
>>>  }
>>> @@ -2022,12 +2136,17 @@ static int handle_aiocb_zone_report(void *opaque) {
>>>  #if defined(CONFIG_BLKZONED)
>>>  static int handle_aiocb_zone_mgmt(void *opaque) {
>>>      RawPosixAIOData *aiocb = opaque;
>>> +    BlockDriverState *bs = aiocb->bs;
>>>      int fd = aiocb->aio_fildes;
>>>      int64_t sector = aiocb->aio_offset / 512;
>>>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
>>> +    uint64_t wend_offset;
>>>      struct blk_zone_range range;
>>>      int ret;
>>>
>>
>> Why the blank line here ?
> 
> For readability, separate it from the execution part.

But the following lines are variable declarations. I personally prefer
declarations to stay together before the code :)

> 
>>
>>> +    BlockZoneWps *wps = bs->bl.wps;
>>> +    int index = aiocb->aio_offset / bs->bl.zone_size;
>>> +
>>>      /* Execute the operation */
>>>      range.sector = sector;
>>>      range.nr_sectors = nr_sectors;
>>> @@ -2035,11 +2154,41 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
>>>          ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
>>>      } while (ret != 0 && errno == EINTR);
>>>      if (ret != 0) {
>>> +        update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
>>> +                        aiocb->bs->bl.nr_zones);
>>
>> You need only to update the range of zones that was specified for the
>> management option, not all zones. So you must specify the zone
>> management aio offset and size/zone_size here.
>>
>>>          ret = -errno;
>>>          error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
>>>                       ret);
>>>          return ret;
>>>      }
>>> +
>>> +    qemu_mutex_lock(&wps->lock);
>>> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
>>> +         /*
>>> +         * The zoned device allows the last zone smaller that the zone size.
>>> +         */
>>
>> comment indentation is off.
>>
>>> +        if (aiocb->aio_nbytes < bs->bl.zone_size) {
>>> +            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
>>> +        } else {
>>> +            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
>>> +        }
>>> +
>>> +        if (aiocb->aio_offset != wps->wp[index] &&
>>> +            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
>>
>> I do not understand the condition here. Why do you have
>> "aiocb->aio_offset != wps->wp[index]" ?
> 
> It is intended for zone state checks. aio_offset (= start byte of that
> zone) = wp means this zone is empty. Only non-empty zones will be
> reset.

It is not very natural to use an input from the user (the guest) to check the
state of something that you (qemu) maintain internally and hide from the user.
You should do such a test with a small helper like this:

static bool zone_is_empty(bl, index)
{
	return bl->wps.wp[index] == index * bl->zone_size;
}

And note that this will ALWAYS return false for conventional zones.
You are not checking for conventional zones either. Any zone management function
should be immediately failed if addressed to a conventional zone. That is
missing. You need a:

if (BDRV_ZT_IS_CONV(wps->wp[index]) && "this is not a zone reset all op")
	return -EIO; /* or similar... */

at the beginning of handle_aiocb_zone_mgmt().

> 
>>
>>> +            if (aiocb->zone_mgmt.all) {
>>
>> This is the only place where you use this all boolean field. For
>> simplicity, I would drop this field completely and test that
>> aiocb->aio_offset == 0 && aiocb->aio_nbytes == bs->bl.capacity to detect
>> a reset all zones operation.
> 
> Right, the capacity field makes it possible. I'll drop it.
> 
>>
>>> +                for (int i = 0; i < bs->bl.nr_zones; ++i) {
>>> +                    wps->wp[i] = i * bs->bl.zone_size;
>>
>> You are not handling conventional zones here. For conventional zones,
>> you should not change the value. Otherwise, BDRV_ZT_IS_CONV() will
>> always return false after this.
> 
> Right, will add a condition line here:
> + if (! BDRV_ZT_IS_CONV(wps->wp[i]))

You need:

if (BDRV_ZT_IS_CONV(wps->wp[i]))
    continue;

as the first lines inside the for loop.


> 
>>
>>> +                }
>>> +            } else {
>>> +                wps->wp[index] = aiocb->aio_offset;
>>> +            }
>>> +        } else if (aiocb->aio_offset != wps->wp[index] &&
>>> +            aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
>>
>> Same comment here. Why do you have "aiocb->aio_offset != wps->wp[index]" ?
> 
> This should be wend_offset != wps->wp[index]. It means if this zone is
> full, no need to finish it.

Nope, this does not mean the zone is full. Full condition would be:

wps->wp[index] >= index * bl->zone_size + zone_cap

But you do not have zone cap per zone (remember that zone capacity is per zone
and may differ between zones)... You could add it to the wp array, but that will
make it larger for not much benefit. Since finishing a zone that is already
full is a very rare case, optimizing for it is not valuable. So simply issue the
zone finish operation. It will be a no-op on the host device if the zone is
already full. No big deal !

> 
>>
>>> +            wps->wp[index] = wend_offset;
>>> +        }
>>> +    }
>>> +    qemu_mutex_unlock(&wps->lock);
>>> +
>>>      return ret;
>>>  }
>>>  #endif
>>> @@ -2480,6 +2629,12 @@ static void raw_close(BlockDriverState *bs)
>>>      BDRVRawState *s = bs->opaque;
>>>
>>>      if (s->fd >= 0) {
>>> +#if defined(CONFIG_BLKZONED)
>>> +        if (bs->bl.wps) {
>>> +            qemu_mutex_destroy(&bs->bl.wps->lock);
>>> +            g_free(bs->bl.wps);
>>> +        }
>>> +#endif
>>>          qemu_close(s->fd);
>>>          s->fd = -1;
>>>      }
>>> @@ -3278,6 +3433,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>>      int64_t zone_size, zone_size_mask;
>>>      const char *zone_op_name;
>>>      unsigned long zone_op;
>>> +    bool is_all = false;
>>>
>>>      zone_size = bs->bl.zone_size;
>>>      zone_size_mask = zone_size - 1;
>>> @@ -3314,6 +3470,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>>      case BLK_ZO_RESET_ALL:
>>>          zone_op_name = "BLKRESETZONE";
>>>          zone_op = BLKRESETZONE;
>>> +        is_all = true;
>>>          break;
>>>      default:
>>>          g_assert_not_reached();
>>> @@ -3328,6 +3485,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>>          .zone_mgmt  = {
>>>              .zone_op = zone_op,
>>>              .zone_op_name = zone_op_name,
>>> +            .all = is_all,
>>>          },
>>>      };
>>>
>>> diff --git a/include/block/block-common.h b/include/block/block-common.h
>>> index 882de6825e..b8b2dba64a 100644
>>> --- a/include/block/block-common.h
>>> +++ b/include/block/block-common.h
>>> @@ -93,6 +93,14 @@ typedef struct BlockZoneDescriptor {
>>>      BlockZoneCondition cond;
>>>  } BlockZoneDescriptor;
>>>
>>> +/*
>>> + * Track write pointers of a zone in bytes.
>>> + */
>>> +typedef struct BlockZoneWps {
>>> +    QemuMutex lock;
>>> +    uint64_t wp[];
>>> +} BlockZoneWps;
>>> +
>>>  typedef struct BlockDriverInfo {
>>>      /* in bytes, 0 if irrelevant */
>>>      int cluster_size;
>>> @@ -206,6 +214,12 @@ typedef enum {
>>>  #define BDRV_SECTOR_BITS   9
>>>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
>>>
>>> +/*
>>> + * Get the first most significant bit of wp. If it is zero, then
>>> + * the zone type is SWR.
>>> + */
>>> +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
>>> +
>>>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
>>>                                             INT_MAX >> BDRV_SECTOR_BITS)
>>>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
>>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
>>> index 37dddc603c..59c2d1316d 100644
>>> --- a/include/block/block_int-common.h
>>> +++ b/include/block/block_int-common.h
>>> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
>>>
>>>      /* device capacity expressed in bytes */
>>>      int64_t capacity;
>>> +
>>> +    /* array of write pointers' location of each zone in the zoned device. */
>>> +    BlockZoneWps *wps;
>>> +
>>> +    int64_t write_granularity;
>>>  } BlockLimits;
>>>
>>>  typedef struct BdrvOpBlocker BdrvOpBlocker;
>>
>> --
>> Damien Le Moal
>> Western Digital Research
>>

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices
  2022-10-13  7:27     ` Sam Li
@ 2022-10-13  7:45       ` Damien Le Moal
  0 siblings, 0 replies; 11+ messages in thread
From: Damien Le Moal @ 2022-10-13  7:45 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, stefanha, Hanna Reitz, Kevin Wolf, dmitry.fomichev,
	hare, Fam Zheng, qemu-block

On 2022/10/13 16:27, Sam Li wrote:
> Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月13日周四 13:55写道：
>>
>> On 10/10/22 11:33, Sam Li wrote:
>>> A zone append command is a write operation that specifies the first
>>> logical block of a zone as the write position. When writing to a zoned
>>> block device using zone append, the byte offset of writes is pointing
>>> to the write pointer of that zone. Upon completion the device will
>>> respond with the position the data has been written in the zone.
>>>
>>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>>> ---
>>>  block/block-backend.c             | 64 +++++++++++++++++++++++++++++++
>>>  block/file-posix.c                | 64 ++++++++++++++++++++++++++++---
>>>  block/io.c                        | 21 ++++++++++
>>>  block/raw-format.c                |  7 ++++
>>>  include/block/block-io.h          |  3 ++
>>>  include/block/block_int-common.h  |  3 ++
>>>  include/block/raw-aio.h           |  4 +-
>>>  include/sysemu/block-backend-io.h |  9 +++++
>>>  8 files changed, 168 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>> index ddc569e3ac..bfdb719bc8 100644
>>> --- a/block/block-backend.c
>>> +++ b/block/block-backend.c
>>> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>>>          struct {
>>>              BlockZoneOp op;
>>>          } zone_mgmt;
>>> +        struct {
>>> +            int64_t *append_sector;
>>
>> I would call this "sector", since it will always be referenced as
>> "->zone_append.sector", you get the "append" for free :)
>>
>> That said, shouldn't this be a byte value, so called "offset" ? Not
>> entirely sure...
> 
> Yes, it can be changed to "offset"(byte) following QEMU's convention.
> Just need to add conversions to virtio_blk_zone_append/*_complete,
> which is easily done.
> 
>>
>>> +        } zone_append;
>>>      };
>>>  } BlkRwCo;
>>>
>>> @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>>      return &acb->common;
>>>  }
>>>
>>> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
>>> +    BlkAioEmAIOCB *acb = opaque;
>>> +    BlkRwCo *rwco = &acb->rwco;
>>> +
>>> +    rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
>>> +                                   rwco->iobuf, rwco->flags);
>>> +    blk_aio_complete(acb);
>>> +}
>>> +
>>> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
>>> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
>>> +                                BlockCompletionFunc *cb, void *opaque) {
>>> +    BlkAioEmAIOCB *acb;
>>> +    Coroutine *co;
>>> +    IO_CODE();
>>> +
>>> +    blk_inc_in_flight(blk);
>>> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
>>> +    acb->rwco = (BlkRwCo) {
>>> +            .blk    = blk,
>>> +            .ret    = NOT_DONE,
>>> +            .flags  = flags,
>>> +            .iobuf  = qiov,
>>> +            .zone_append = {
>>> +                    .append_sector = offset,
>>
>> See above comment. So since this is a byte value, this needs to be
>> called "offset", no ?
> 
> Yes, same answers above.
> 
>>
>>> +            },
>>> +    };
>>> +    acb->has_returned = false;
>>> +
>>> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
>>> +    bdrv_coroutine_enter(blk_bs(blk), co);
>>> +    acb->has_returned = true;
>>> +    if (acb->rwco.ret != NOT_DONE) {
>>> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
>>> +                                         blk_aio_complete_bh, acb);
>>> +    }
>>> +
>>> +    return &acb->common;
>>> +}
>>> +
>>>  /*
>>>   * Send a zone_report command.
>>>   * offset is a byte offset from the start of the device. No alignment
>>> @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>>      return ret;
>>>  }
>>>
>>> +/*
>>> + * Send a zone_append command.
>>> + */
>>> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
>>> +        QEMUIOVector *qiov, BdrvRequestFlags flags)
>>> +{
>>> +    int ret;
>>> +    IO_CODE();
>>> +
>>> +    blk_inc_in_flight(blk);
>>> +    blk_wait_while_drained(blk);
>>> +    if (!blk_is_available(blk)) {
>>> +        blk_dec_in_flight(blk);
>>> +        return -ENOMEDIUM;
>>> +    }
>>> +
>>> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
>>> +    blk_dec_in_flight(blk);
>>> +    return ret;
>>> +}
>>> +
>>>  void blk_drain(BlockBackend *blk)
>>>  {
>>>      BlockDriverState *bs = blk_bs(blk);
>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>> index 17c0b58158..08ab164df4 100755
>>> --- a/block/file-posix.c
>>> +++ b/block/file-posix.c
>>> @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>>>      ssize_t len;
>>>
>>>      do {
>>> -        if (aiocb->aio_type & QEMU_AIO_WRITE)
>>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>>>              len = qemu_pwritev(aiocb->aio_fildes,
>>>                                 aiocb->io.iov,
>>>                                 aiocb->io.niov,
>>
>> Hu... You are issuing the io for a zone append without first changing
>> the aiocb offset to be equal to the zone write pointer ? And you are
> 
> It changed in the last patch. But it should be in this patch and make
> it specific to zone_append case, like:
> if (type & QEMU_AIO_ZONE_APPEND) {
>     /* change offset here */
> }

yes.

> 
>> calling this without the wps->lock held... Changing the aio offset to be
>> equal to the wp && issuing the io must be atomic.
> 
> Does this mean putting locks around pwritev()? Like:
> lock(wp);
> len = pwritev();
> unlock(wp);

You need the aio offset change, aio issuing (or IO execution depending on the
host system call used) and wp update all atomic:

BlockZoneWps *wps = aiocb->bs->bl.wps;
int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;

lock(wp);

if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
    aiocb->aio_offset = wps->wp[index];
}

len = pwritev();
if (len > 0) {
    wps->wp[index] = wend_offset;
    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
        *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS;
    }
}

unlock(wp);

Note that you must also take the lock for regular writes to avoid reordering of
pwritev() calls if multiple qemu worker threads are simultaneously handling
write requests for the same zone... I am not sure if this can happen though.
-> Stefan ?

If the write request is processed using host AIOs (e.g. linux native aio with
io_submit() & io_getevents()), you do not need to hold the lock until the linux
AIO completes. You only need the lock until io_submit() returns. But I am not
sure how linux native aios are handled in qemu code...

> Because it is accessing wps[] by offset, which is a wp location. And
> when pwritev()  executes, the offset should not be changed by other
> ios in flight.

yes, otherwise you may get reversed pwritev() execution order if multiple worker
threads are handling write commands for the same zone. You must serialize them
using the wp lock.

> 
>>
>>> @@ -1687,7 +1687,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
>>>      ssize_t len;
>>>
>>>      while (offset < aiocb->aio_nbytes) {
>>> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>>>              len = pwrite(aiocb->aio_fildes,
>>>                           (const char *)buf + offset,
>>>                           aiocb->aio_nbytes - offset,
>>
>> Same comment here.
>>
>>> @@ -1731,7 +1731,7 @@ static int handle_aiocb_rw(void *opaque)
>>>       * The offset of regular writes, append writes is the wp location
>>>       * of that zone.
>>>       */
>>> -    if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> +    if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>>>          if (aiocb->bs->bl.zone_size > 0) {
>>>              BlockZoneWps *wps = aiocb->bs->bl.wps;
>>>              qemu_mutex_lock(&wps->lock);
>>> @@ -1794,7 +1794,7 @@ static int handle_aiocb_rw(void *opaque)
>>>      }
>>>
>>>      nbytes = handle_aiocb_rw_linear(aiocb, buf);
>>> -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
>>> +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
>>>          char *p = buf;
>>>          size_t count = aiocb->aio_nbytes, copy;
>>>          int i;
>>> @@ -1816,7 +1816,7 @@ static int handle_aiocb_rw(void *opaque)
>>>  out:
>>>      if (nbytes == aiocb->aio_nbytes) {
>>>  #if defined(CONFIG_BLKZONED)
>>> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>>>              BlockZoneWps *wps = aiocb->bs->bl.wps;
>>>              int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
>>>              if (wps) {
>>> @@ -1828,6 +1828,11 @@ out:
>>>                      if (wend_offset > wps->wp[index]){
>>>                          wps->wp[index] = wend_offset;
>>>                      }
>>> +
>>> +                    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
>>> +                        *aiocb->io.append_sector =
>>> +                                wps->wp[index] >> BDRV_SECTOR_BITS;
>>> +                    }
>>
>> Same comment as last time. You must do this BEFORE the previous hunk
>> that updates the wp. Otherwise, you are NOT returning the position of
>> the written data, but the position that follows the written data...
>>
>> If you do a zone append to an empty zone, the append sector you return
>> must be the zone start sector. You can see here that this will never be
>> the case unless you reverse the 2 hunks above.
> 
> You are right. I mistakenly thought the append sector should be the end sector location.
> 
> +Upon a successful completion of a VIRTIO_BLK_T_ZONE_APPEND request, the driver
> +MAY read the starting sector location of the written data from the request
> +field \field{append_sector}.
> 
>>
>>>                  }
>>>                  qemu_mutex_unlock(&wps->lock);
>>>              }
>>> @@ -1845,7 +1850,7 @@ out:
>>>      } else {
>>>          assert(nbytes < 0);
>>>  #if defined(CONFIG_BLKZONED)
>>> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>>>              update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
>>>                              aiocb->bs->bl.nr_zones);
>>>          }
>>> @@ -3493,6 +3498,52 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>>  }
>>>  #endif
>>>
>>> +#if defined(CONFIG_BLKZONED)
>>> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
>>> +                                           int64_t *offset,
>>> +                                           QEMUIOVector *qiov,
>>> +                                           BdrvRequestFlags flags) {
>>> +    BDRVRawState *s = bs->opaque;
>>> +    int64_t zone_size_mask = bs->bl.zone_size - 1;
>>> +    int64_t iov_len = 0;
>>> +    int64_t len = 0;
>>> +    RawPosixAIOData acb;
>>> +
>>> +    if (*offset & zone_size_mask) {
>>> +        error_report("sector offset %" PRId64 " is not aligned to zone size "
>>> +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    int64_t wg = bs->bl.write_granularity;
>>> +    int64_t wg_mask = wg - 1;
>>> +    for (int i = 0; i < qiov->niov; i++) {
>>> +       iov_len = qiov->iov[i].iov_len;
>>> +       if (iov_len & wg_mask) {
>>> +           error_report("len of IOVector[%d] %" PRId64 " is not aligned to block "
>>> +                        "size %" PRId64 "", i, iov_len, wg);
>>> +           return -EINVAL;
>>> +       }
>>> +       len += iov_len;
>>> +    }
>>> +
>>> +    acb = (RawPosixAIOData) {
>>> +        .bs = bs,
>>> +        .aio_fildes = s->fd,
>>> +        .aio_type = QEMU_AIO_ZONE_APPEND,
>>> +        .aio_offset = *offset,
>>> +        .aio_nbytes = len,
>>> +        .io = {
>>> +                .iov = qiov->iov,
>>> +                .niov = qiov->niov,
>>> +                .append_sector = offset,
>>> +        },
>>> +    };
>>> +
>>> +    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
>>> +}
>>> +#endif
>>> +
>>>  static coroutine_fn int
>>>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>>>                  bool blkdev)
>>> @@ -4268,6 +4319,7 @@ static BlockDriver bdrv_zoned_host_device = {
>>>      /* zone management operations */
>>>      .bdrv_co_zone_report = raw_co_zone_report,
>>>      .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>>> +    .bdrv_co_zone_append = raw_co_zone_append,
>>>  };
>>>  #endif
>>>
>>> diff --git a/block/io.c b/block/io.c
>>> index e5aaa64e17..935abf2ed4 100644
>>> --- a/block/io.c
>>> +++ b/block/io.c
>>> @@ -3230,6 +3230,27 @@ out:
>>>      return co.ret;
>>>  }
>>>
>>> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
>>> +                        QEMUIOVector *qiov,
>>> +                        BdrvRequestFlags flags)
>>> +{
>>> +    BlockDriver *drv = bs->drv;
>>> +    CoroutineIOCompletion co = {
>>> +            .coroutine = qemu_coroutine_self(),
>>> +    };
>>> +    IO_CODE();
>>> +
>>> +    bdrv_inc_in_flight(bs);
>>> +    if (!drv || !drv->bdrv_co_zone_append) {
>>> +        co.ret = -ENOTSUP;
>>> +        goto out;
>>> +    }
>>> +    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
>>> +out:
>>> +    bdrv_dec_in_flight(bs);
>>> +    return co.ret;
>>> +}
>>> +
>>>  void *qemu_blockalign(BlockDriverState *bs, size_t size)
>>>  {
>>>      IO_CODE();
>>> diff --git a/block/raw-format.c b/block/raw-format.c
>>> index b885688434..f132880c85 100644
>>> --- a/block/raw-format.c
>>> +++ b/block/raw-format.c
>>> @@ -325,6 +325,12 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>>      return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
>>>  }
>>>
>>> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, int64_t *offset,
>>> +                                           QEMUIOVector *qiov,
>>> +                                           BdrvRequestFlags flags) {
>>> +    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
>>> +}
>>> +
>>>  static int64_t raw_getlength(BlockDriverState *bs)
>>>  {
>>>      int64_t len;
>>> @@ -628,6 +634,7 @@ BlockDriver bdrv_raw = {
>>>      .bdrv_co_pdiscard     = &raw_co_pdiscard,
>>>      .bdrv_co_zone_report  = &raw_co_zone_report,
>>>      .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
>>> +    .bdrv_co_zone_append = &raw_co_zone_append,
>>>      .bdrv_co_block_status = &raw_co_block_status,
>>>      .bdrv_co_copy_range_from = &raw_co_copy_range_from,
>>>      .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
>>> diff --git a/include/block/block-io.h b/include/block/block-io.h
>>> index f0cdf67d33..6a54453578 100644
>>> --- a/include/block/block-io.h
>>> +++ b/include/block/block-io.h
>>> @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>>>                                       BlockZoneDescriptor *zones);
>>>  int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>>                                     int64_t offset, int64_t len);
>>> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
>>> +                                     QEMUIOVector *qiov,
>>> +                                     BdrvRequestFlags flags);
>>>
>>>  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>>>  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
>>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
>>> index 59c2d1316d..a7e7db5646 100644
>>> --- a/include/block/block_int-common.h
>>> +++ b/include/block/block_int-common.h
>>> @@ -701,6 +701,9 @@ struct BlockDriver {
>>>              BlockZoneDescriptor *zones);
>>>      int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
>>>              int64_t offset, int64_t len);
>>> +    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
>>> +            int64_t *offset, QEMUIOVector *qiov,
>>> +            BdrvRequestFlags flags);
>>>
>>>      /* removable device specific */
>>>      bool (*bdrv_is_inserted)(BlockDriverState *bs);
>>> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
>>> index 3d26929cdd..f13cc1887b 100644
>>> --- a/include/block/raw-aio.h
>>> +++ b/include/block/raw-aio.h
>>> @@ -31,6 +31,7 @@
>>>  #define QEMU_AIO_TRUNCATE     0x0080
>>>  #define QEMU_AIO_ZONE_REPORT  0x0100
>>>  #define QEMU_AIO_ZONE_MGMT    0x0200
>>> +#define QEMU_AIO_ZONE_APPEND  0x0400
>>>  #define QEMU_AIO_TYPE_MASK \
>>>          (QEMU_AIO_READ | \
>>>           QEMU_AIO_WRITE | \
>>> @@ -41,7 +42,8 @@
>>>           QEMU_AIO_COPY_RANGE | \
>>>           QEMU_AIO_TRUNCATE  | \
>>>           QEMU_AIO_ZONE_REPORT | \
>>> -         QEMU_AIO_ZONE_MGMT)
>>> +         QEMU_AIO_ZONE_MGMT | \
>>> +         QEMU_AIO_ZONE_APPEND)
>>>
>>>  /* AIO flags */
>>>  #define QEMU_AIO_MISALIGNED   0x1000
>>> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
>>> index 6835525582..33e35ae5d7 100644
>>> --- a/include/sysemu/block-backend-io.h
>>> +++ b/include/sysemu/block-backend-io.h
>>> @@ -51,6 +51,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
>>>  BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>>                                int64_t offset, int64_t len,
>>>                                BlockCompletionFunc *cb, void *opaque);
>>> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
>>> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
>>> +                                BlockCompletionFunc *cb, void *opaque);
>>>  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
>>>                               BlockCompletionFunc *cb, void *opaque);
>>>  void blk_aio_cancel_async(BlockAIOCB *acb);
>>> @@ -172,6 +175,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>>                                    int64_t offset, int64_t len);
>>>  int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>>                                         int64_t offset, int64_t len);
>>> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
>>> +                                    QEMUIOVector *qiov,
>>> +                                    BdrvRequestFlags flags);
>>> +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
>>> +                                         QEMUIOVector *qiov,
>>> +                                         BdrvRequestFlags flags);
>>>
>>>  int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
>>>                                        int64_t bytes);
>>
>> --
>> Damien Le Moal
>> Western Digital Research
>>

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers
  2022-10-13  7:30       ` Damien Le Moal
@ 2022-10-13  7:46         ` Sam Li
  0 siblings, 0 replies; 11+ messages in thread
From: Sam Li @ 2022-10-13  7:46 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: qemu-devel, stefanha, Hanna Reitz, Kevin Wolf, dmitry.fomichev,
	hare, Fam Zheng, qemu-block

Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月13日周四 15:30写道：
>
> On 2022/10/13 16:08, Sam Li wrote:
> > Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月13日周四 13:13写道：
> >>
> >> On 10/10/22 11:33, Sam Li wrote:
> >>> Since Linux doesn't have a user API to issue zone append operations to
> >>> zoned devices from user space, the file-posix driver is modified to add
> >>> zone append emulation using regular writes. To do this, the file-posix
> >>> driver tracks the wp location of all zones of the device. It uses an
> >>> array of uint64_t. The most significant bit of each wp location indicates
> >>> if the zone type is conventional zones.
> >>>
> >>> The zones wp can be changed due to the following operations issued:
> >>> - zone reset: change the wp to the start offset of that zone
> >>> - zone finish: change to the end location of that zone
> >>> - write to a zone
> >>> - zone append
> >>>
> >>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> >>> ---
> >>>  block/file-posix.c               | 158 +++++++++++++++++++++++++++++++
> >>>  include/block/block-common.h     |  14 +++
> >>>  include/block/block_int-common.h |   5 +
> >>>  3 files changed, 177 insertions(+)
> >>>
> >>> diff --git a/block/file-posix.c b/block/file-posix.c
> >>> index a9d347292e..17c0b58158 100755
> >>> --- a/block/file-posix.c
> >>> +++ b/block/file-posix.c
> >>> @@ -206,6 +206,7 @@ typedef struct RawPosixAIOData {
> >>>          struct {
> >>>              struct iovec *iov;
> >>>              int niov;
> >>> +            int64_t *append_sector;
> >>
> >> This should be added as part of patch 2. You do not need this to track
> >> the wp of zones in this patch.
> >>
> >>>          } io;
> >>>          struct {
> >>>              uint64_t cmd;
> >>> @@ -226,6 +227,7 @@ typedef struct RawPosixAIOData {
> >>>          struct {
> >>>              unsigned long zone_op;
> >>>              const char *zone_op_name;
> >>> +            bool all;
> >>>          } zone_mgmt;
> >>>      };
> >>>  } RawPosixAIOData;
> >>> @@ -1331,6 +1333,67 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
> >>>  #endif
> >>>  }
> >>>
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +static int get_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
> >>
> >> Nit: It would seem more natural to have the fd argument first...
> >>
> >>> +                        unsigned int nrz) {
> >>> +    struct blk_zone *blkz;
> >>> +    int64_t rep_size;
> >>> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
> >>> +    int ret, n = 0, i = 0;
> >>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> >>> +    g_autofree struct blk_zone_report *rep = NULL;
> >>> +
> >>> +    rep = g_malloc(rep_size);
> >>> +    blkz = (struct blk_zone *)(rep + 1);
> >>> +    while (n < nrz) {
> >>> +        memset(rep, 0, rep_size);
> >>> +        rep->sector = sector;
> >>> +        rep->nr_zones = nrz - n;
> >>> +
> >>> +        do {
> >>> +            ret = ioctl(fd, BLKREPORTZONE, rep);
> >>> +        } while (ret != 0 && errno == EINTR);
> >>> +        if (ret != 0) {
> >>> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> >>> +                    fd, offset, errno);
> >>> +            return -errno;
> >>> +        }
> >>> +
> >>> +        if (!rep->nr_zones) {
> >>> +            break;
> >>> +        }
> >>> +
> >>> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> >>> +            /*
> >>> +             * The wp tracking cares only about sequential writes required and
> >>> +             * sequential write preferred zones so that the wp can advance to
> >>> +             * the right location.
> >>> +             * Use the most significant bit of the wp location to indicate the
> >>> +             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> >>> +             */
> >>> +            if (!(blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL)) {
> >>
> >> Double negation... This can simply be:
> >>
> >> if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> >>
> >>> +                wps->wp[i] += 1ULL << 63;
> >>
> >> No need for the += here. This can be "=".
> >>
> >>> +            } else {
> >>> +                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> >>> +            }
> >>> +        }
> >>> +        sector = blkz[i-1].start + blkz[i-1].len;
> >>
> >> spaces missing around the "-" in the "i-1" expressions.
> >>
> >>> +    }
> >>> +
> >>> +    return 0;
> >>> +}
> >>> +
> >>> +static void update_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
> >>
> >> Same nit as above: fd being the first argument would be a little more
> >> natural in my opinion.
> >>
> >>> +                            unsigned int nrz) {
> >>> +    qemu_mutex_lock(&wps->lock);
> >>> +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> >>> +        error_report("report zone wp failed");
> >>> +        return;
> >>
> >> You are leaking the lock here. Remove the return. Also, given that
> >> get_zones_wp() already prints a message if report fails, I do not think
> >> the message here is useful.
> >>
> >> Also, why is this function void typed ? How can the caller know if the
> >> update succeeded or not ?
> >
> > Update failures mean get_zones_wp() fails and that will be reported by
> > error_report. The error message indicates updates fail not reports
> > fail. Maybe modifying the message suffices error checking?
> >
> > +    qemu_mutex_lock(&wps->lock);
> > +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> > +        error_report("update zone wp failed");
> > +    }
> > +    qemu_mutex_unlock(&wps->lock);
> >
> >
> >>
> >>> +    }
> >>> +    qemu_mutex_unlock(&wps->lock);
> >>> +}
> >>> +#endif
> >>> +
> >>>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >>>  {
> >>>      BDRVRawState *s = bs->opaque;
> >>> @@ -1414,6 +1477,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >>>              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
> >>>              return;
> >>>          }
> >>> +
> >>> +        ret = get_sysfs_long_val(&st, "physical_block_size");
> >>> +        if (ret >= 0) {
> >>> +            bs->bl.write_granularity = ret;
> >>> +        }
> >>
> >> Why is this change here ? Shouldn't this be part of the previous series
> >> "Add support for zoned device" ?
> >
> > Because only zone append uses write_granularity to check the iovector
> > size alignment. The previous series doesn't use this field.
>
> Then move this to patch 2. This should not be in this patch since you are not
> dealing with zone append yet.
>
> >
> >>
> >>> +
> >>> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> >>> +        if (get_zones_wp(0, s->fd, bs->bl.wps, ret) < 0){
> >>> +            error_report("report wps failed");
> >>> +            g_free(bs->bl.wps);
> >>> +            return;
> >>> +        }
> >>> +        qemu_mutex_init(&bs->bl.wps->lock);
> >>>      }
> >>>  }
> >>>
> >>> @@ -1651,6 +1727,20 @@ static int handle_aiocb_rw(void *opaque)
> >>>      ssize_t nbytes;
> >>>      char *buf;
> >>>
> >>> +    /*
> >>> +     * The offset of regular writes, append writes is the wp location
> >>> +     * of that zone.
> >>> +     */
> >>> +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> +        if (aiocb->bs->bl.zone_size > 0) {
> >>> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> >>> +            qemu_mutex_lock(&wps->lock);
> >>> +            aiocb->aio_offset = wps->wp[aiocb->aio_offset /
> >>> +                                        aiocb->bs->bl.zone_size];
> >>> +            qemu_mutex_unlock(&wps->lock);
> >>> +        }
> >>
> >> I do not understand this hunk at all. What is this trying to do ? zone
> >> append support goes into patch 2. You are overwriting the user
> >> specified aio offset using the tracked wp value. That could result in a
> >> successful write even if the user sent an unaligned write command. That
> >> is bad.
> >
> > Ok, regular writes and append writes got mixed up when I changed the
> > offset to the wp of that zone.
> >
> >>
> >> Here you should only be tracking the write pointer, so increment
> >> wps->wp[index], which you do below.
> >
> > Understood. Will move it to the next patch.
>
> No ! You should not change the aio offset for regular writes. Otherwise you may
> hide errors for bad commands from the guest by having them succeed :)
> aio offset change should be done ONLY for zone append, not for regular writes.
>
> >
> >>
> >>> +    }
> >>> +
> >>>      if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
> >>>          /*
> >>>           * If there is just a single buffer, and it is properly aligned
> >>> @@ -1725,6 +1815,24 @@ static int handle_aiocb_rw(void *opaque)
> >>>
> >>>  out:
> >>>      if (nbytes == aiocb->aio_nbytes) {
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> >>> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> >>> +            if (wps) {
> >>> +                qemu_mutex_lock(&wps->lock);
> >>> +                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> >>> +                    uint64_t wend_offset =
> >>> +                            aiocb->aio_offset + aiocb->aio_nbytes;
> >>> +                    /* Advance the wp if needed */
> >>> +                    if (wend_offset > wps->wp[index]){
> >>> +                        wps->wp[index] = wend_offset;
> >>> +                    }
> >>> +                }
> >>> +                qemu_mutex_unlock(&wps->lock);
> >>> +            }
> >>> +        }
> >>> +#endif
> >>>          return 0;
> >>>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
> >>>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> @@ -1736,6 +1844,12 @@ out:
> >>>          }
> >>>      } else {
> >>>          assert(nbytes < 0);
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> +            update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> >>> +                            aiocb->bs->bl.nr_zones);
> >>
> >> You only need to update the target zone of the aio, not all zones.
> >> Updating all zones is actually a bug as wp[] entries for other zones may
> >> be larger than the device reported wp if there are other write aios in
> >> flight. So the last argument must be "1" here.
> >
> > Ok, I understood now.
> >
> >>
> >>> +        }
> >>> +#endif
> >>>          return nbytes;
> >>>      }
> >>>  }
> >>> @@ -2022,12 +2136,17 @@ static int handle_aiocb_zone_report(void *opaque) {
> >>>  #if defined(CONFIG_BLKZONED)
> >>>  static int handle_aiocb_zone_mgmt(void *opaque) {
> >>>      RawPosixAIOData *aiocb = opaque;
> >>> +    BlockDriverState *bs = aiocb->bs;
> >>>      int fd = aiocb->aio_fildes;
> >>>      int64_t sector = aiocb->aio_offset / 512;
> >>>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
> >>> +    uint64_t wend_offset;
> >>>      struct blk_zone_range range;
> >>>      int ret;
> >>>
> >>
> >> Why the blank line here ?
> >
> > For readability, separate it from the execution part.
>
> But the following lines are variable declarations. I personally prefer
> declarations to stay together before the code :)

Ok, will change it.

>
> >
> >>
> >>> +    BlockZoneWps *wps = bs->bl.wps;
> >>> +    int index = aiocb->aio_offset / bs->bl.zone_size;
> >>> +
> >>>      /* Execute the operation */
> >>>      range.sector = sector;
> >>>      range.nr_sectors = nr_sectors;
> >>> @@ -2035,11 +2154,41 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
> >>>          ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
> >>>      } while (ret != 0 && errno == EINTR);
> >>>      if (ret != 0) {
> >>> +        update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> >>> +                        aiocb->bs->bl.nr_zones);
> >>
> >> You need only to update the range of zones that was specified for the
> >> management option, not all zones. So you must specify the zone
> >> management aio offset and size/zone_size here.
> >>
> >>>          ret = -errno;
> >>>          error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
> >>>                       ret);
> >>>          return ret;
> >>>      }
> >>> +
> >>> +    qemu_mutex_lock(&wps->lock);
> >>> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> >>> +         /*
> >>> +         * The zoned device allows the last zone smaller that the zone size.
> >>> +         */
> >>
> >> comment indentation is off.
> >>
> >>> +        if (aiocb->aio_nbytes < bs->bl.zone_size) {
> >>> +            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
> >>> +        } else {
> >>> +            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
> >>> +        }
> >>> +
> >>> +        if (aiocb->aio_offset != wps->wp[index] &&
> >>> +            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
> >>
> >> I do not understand the condition here. Why do you have
> >> "aiocb->aio_offset != wps->wp[index]" ?
> >
> > It is intended for zone state checks. aio_offset (= start byte of that
> > zone) = wp means this zone is empty. Only non-empty zones will be
> > reset.
>
> It is not very natural to use an input from the user (the guest) to check the
> state of something that you (qemu) maintain internally and hide from the user.
> You should do such a test with a small helper like this:
>
> static bool zone_is_empty(bl, index)
> {
>         return bl->wps.wp[index] == index * bl->zone_size;
> }

Ok.

>
> And note that this will ALWAYS return false for conventional zones.
> You are not checking for conventional zones either. Any zone management function
> should be immediately failed if addressed to a conventional zone. That is
> missing. You need a:
>
> if (BDRV_ZT_IS_CONV(wps->wp[index]) && "this is not a zone reset all op")
>         return -EIO; /* or similar... */
>
> at the beginning of handle_aiocb_zone_mgmt().

(Just an additional note:)
Though this check should be in the block layer API patches, it only becomes
possible once wps[] is introduced.

>
> >
> >>
> >>> +            if (aiocb->zone_mgmt.all) {
> >>
> >> This is the only place where you use this all boolean field. For
> >> simplicity, I would drop this field completely and test that
> >> aiocb->aio_offset == 0 && aiocb->aio_nbytes == bs->bl.capacity to detect
> >> a reset all zones operation.
> >
> > Right, the capacity field makes it possible. I'll drop it.
> >
> >>
> >>> +                for (int i = 0; i < bs->bl.nr_zones; ++i) {
> >>> +                    wps->wp[i] = i * bs->bl.zone_size;
> >>
> >> You are not handling conventional zones here. For conventional zones,
> >> you should not change the value. Otherwise, BDRV_ZT_IS_CONV() will
> >> always return false after this.
> >
> > Right, will add a condition line here:
> > + if (! BDRV_ZT_IS_CONV(wps->wp[i]))
>
> You need:
>
> if (BDRV_ZT_IS_CONV(wps->wp[i]))
>     continue;
>
> as the first lines inside the for loop.

Trivial: looks like the same behavior:
 if (! BDRV_ZT_IS_CONV(wps->wp[i])) {
    /* change pointers */
}

if (BDRV_ZT_IS_CONV(wps->wp[i]))
    continue;
/* change pointers */

>
>
> >
> >>
> >>> +                }
> >>> +            } else {
> >>> +                wps->wp[index] = aiocb->aio_offset;
> >>> +            }
> >>> +        } else if (aiocb->aio_offset != wps->wp[index] &&
> >>> +            aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
> >>
> >> Same comment here. Why do you have "aiocb->aio_offset != wps->wp[index]" ?
> >
> > This should be wend_offset != wps->wp[index]. It means if this zone is
> > full, no need to finish it.
>
> Nope, this does not mean the zone is full. Full condition would be:
>
> wps->wp[index] >= index * bl->zone_size + zone_cap
>
> But you do not have zone cap per zone (remember that zone capacity is per zone
> and may differ between zones)... You could add it to the wp array, but that will
> make it larger for not much benefits. Since finishing a zone that is already
> full is a very rare case, optimizing for it is not valuable. So simply issue the
> zone finish operation. It will be a no-op on the host device if the zone is
> already full. No big deal !

I see.

Thanks!

>
> >
> >>
> >>> +            wps->wp[index] = wend_offset;
> >>> +        }
> >>> +    }
> >>> +    qemu_mutex_unlock(&wps->lock);
> >>> +
> >>>      return ret;
> >>>  }
> >>>  #endif
> >>> @@ -2480,6 +2629,12 @@ static void raw_close(BlockDriverState *bs)
> >>>      BDRVRawState *s = bs->opaque;
> >>>
> >>>      if (s->fd >= 0) {
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +        if (bs->bl.wps) {
> >>> +            qemu_mutex_destroy(&bs->bl.wps->lock);
> >>> +            g_free(bs->bl.wps);
> >>> +        }
> >>> +#endif
> >>>          qemu_close(s->fd);
> >>>          s->fd = -1;
> >>>      }
> >>> @@ -3278,6 +3433,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >>>      int64_t zone_size, zone_size_mask;
> >>>      const char *zone_op_name;
> >>>      unsigned long zone_op;
> >>> +    bool is_all = false;
> >>>
> >>>      zone_size = bs->bl.zone_size;
> >>>      zone_size_mask = zone_size - 1;
> >>> @@ -3314,6 +3470,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >>>      case BLK_ZO_RESET_ALL:
> >>>          zone_op_name = "BLKRESETZONE";
> >>>          zone_op = BLKRESETZONE;
> >>> +        is_all = true;
> >>>          break;
> >>>      default:
> >>>          g_assert_not_reached();
> >>> @@ -3328,6 +3485,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >>>          .zone_mgmt  = {
> >>>              .zone_op = zone_op,
> >>>              .zone_op_name = zone_op_name,
> >>> +            .all = is_all,
> >>>          },
> >>>      };
> >>>
> >>> diff --git a/include/block/block-common.h b/include/block/block-common.h
> >>> index 882de6825e..b8b2dba64a 100644
> >>> --- a/include/block/block-common.h
> >>> +++ b/include/block/block-common.h
> >>> @@ -93,6 +93,14 @@ typedef struct BlockZoneDescriptor {
> >>>      BlockZoneCondition cond;
> >>>  } BlockZoneDescriptor;
> >>>
> >>> +/*
> >>> + * Track write pointers of a zone in bytes.
> >>> + */
> >>> +typedef struct BlockZoneWps {
> >>> +    QemuMutex lock;
> >>> +    uint64_t wp[];
> >>> +} BlockZoneWps;
> >>> +
> >>>  typedef struct BlockDriverInfo {
> >>>      /* in bytes, 0 if irrelevant */
> >>>      int cluster_size;
> >>> @@ -206,6 +214,12 @@ typedef enum {
> >>>  #define BDRV_SECTOR_BITS   9
> >>>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
> >>>
> >>> +/*
> >>> + * Get the first most significant bit of wp. If it is zero, then
> >>> + * the zone type is SWR.
> >>> + */
> >>> +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
> >>> +
> >>>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
> >>>                                             INT_MAX >> BDRV_SECTOR_BITS)
> >>>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
> >>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> >>> index 37dddc603c..59c2d1316d 100644
> >>> --- a/include/block/block_int-common.h
> >>> +++ b/include/block/block_int-common.h
> >>> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
> >>>
> >>>      /* device capacity expressed in bytes */
> >>>      int64_t capacity;
> >>> +
> >>> +    /* array of write pointers' location of each zone in the zoned device. */
> >>> +    BlockZoneWps *wps;
> >>> +
> >>> +    int64_t write_granularity;
> >>>  } BlockLimits;
> >>>
> >>>  typedef struct BdrvOpBlocker BdrvOpBlocker;
> >>
> >> --
> >> Damien Le Moal
> >> Western Digital Research
> >>
>
> --
> Damien Le Moal
> Western Digital Research
>


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2022-10-13  8:56 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-10  2:33 [PATCH v3 0/3] Add zone append write for zoned device Sam Li
2022-10-10  2:33 ` [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers Sam Li
2022-10-13  5:13   ` Damien Le Moal
2022-10-13  7:08     ` Sam Li
2022-10-13  7:30       ` Damien Le Moal
2022-10-13  7:46         ` Sam Li
2022-10-10  2:33 ` [PATCH v3 2/3] block: introduce zone append write for zoned devices Sam Li
2022-10-13  5:54   ` Damien Le Moal
2022-10-13  7:27     ` Sam Li
2022-10-13  7:45       ` Damien Le Moal
2022-10-10  2:33 ` [PATCH v3 3/3] qemu-iotests: test zone append operation Sam Li

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.