[PATCH v2 0/2] add zone append write for zoned device

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 0/2] add zone append write for zoned device
@ 2022-09-29  9:31 Sam Li
  2022-09-29  9:31 ` [PATCH v2 1/2] file-posix: add the tracking of the zones wp Sam Li
  2022-09-29  9:31 ` [PATCH v2 2/2] block: introduce zone append write for zoned devices Sam Li
  0 siblings, 2 replies; 9+ messages in thread
From: Sam Li @ 2022-09-29  9:31 UTC (permalink / raw)
  To: qemu-devel
  Cc: Hanna Reitz, damien.lemoal, Fam Zheng, stefanha, dmitry.fomichev,
	hare, Kevin Wolf, qemu-block, Sam Li

v2:
- split patch to two patches for better reviewing
- change BlockZoneWps's structure to an array of integers
- use only mutex lock on locking conditions of zone wps
- coding styles and clean-ups

v1:
- introduce zone append write

Sam Li (2):
  file-posix: add the tracking of the zones wp
  block: introduce zone append write for zoned devices

 block/block-backend.c              |  65 ++++++++++
 block/file-posix.c                 | 189 ++++++++++++++++++++++++++++-
 block/io.c                         |  21 ++++
 block/raw-format.c                 |   7 ++
 include/block/block-common.h       |  16 +++
 include/block/block-io.h           |   3 +
 include/block/block_int-common.h   |   8 ++
 include/block/raw-aio.h            |   4 +-
 include/sysemu/block-backend-io.h  |   9 ++
 qemu-io-cmds.c                     |  62 ++++++++++
 tests/qemu-iotests/tests/zoned.out |   7 ++
 tests/qemu-iotests/tests/zoned.sh  |   9 ++
 12 files changed, 396 insertions(+), 4 deletions(-)

-- 
2.37.3



^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v2 1/2] file-posix: add the tracking of the zones wp
  2022-09-29  9:31 [PATCH v2 0/2] add zone append write for zoned device Sam Li
@ 2022-09-29  9:31 ` Sam Li
  2022-10-05  1:44   ` Damien Le Moal
  2022-09-29  9:31 ` [PATCH v2 2/2] block: introduce zone append write for zoned devices Sam Li
  1 sibling, 1 reply; 9+ messages in thread
From: Sam Li @ 2022-09-29  9:31 UTC (permalink / raw)
  To: qemu-devel
  Cc: Hanna Reitz, damien.lemoal, Fam Zheng, stefanha, dmitry.fomichev,
	hare, Kevin Wolf, qemu-block, Sam Li

Since Linux doesn't have a user API to issue zone append operations to
zoned devices from user space, the file-posix driver is modified to add
zone append emulation using regular writes. To do this, the file-posix
driver tracks the wp location of all zones of the device. It uses an
array of uint64_t. The most significant bit of each wp location indicates
if the zone type is sequential write required.

The zones wp can be changed due to the following operations issued:
- zone reset: change the wp to the start offset of that zone
- zone finish: change to the end location of that zone
- write to a zone
- zone append

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/file-posix.c               | 138 ++++++++++++++++++++++++++++++-
 include/block/block-common.h     |  16 ++++
 include/block/block_int-common.h |   5 ++
 include/block/raw-aio.h          |   4 +-
 4 files changed, 159 insertions(+), 4 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 73656d87f2..33e81ac112 100755
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -206,6 +206,8 @@ typedef struct RawPosixAIOData {
         struct {
             struct iovec *iov;
             int niov;
+            int64_t *append_sector;
+            BlockZoneWps *wps;
         } io;
         struct {
             uint64_t cmd;
@@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
 #endif
 }
 
+#if defined(CONFIG_BLKZONED)
+static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps,
+                          unsigned int nrz) {
+    struct blk_zone *blkz;
+    int64_t rep_size;
+    int64_t sector = offset >> BDRV_SECTOR_BITS;
+    int ret, n = 0, i = 0;
+
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+    rep = g_malloc(rep_size);
+
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                    fd, offset, errno);
+            return -errno;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
+            sector = blkz[i].start + blkz[i].len;
+
+            /*
+             * In the wp tracking, it only cares if the zone type is sequential
+             * writes required so that the wp can advance to the right location.
+             * Instead of the type of zone_type which is an 8-bit unsigned
+             * integer, use the first most significant bits of the wp location
+             * to indicate the zone type: 0 for SWR zones and 1 for the
+             * others.
+             */
+            if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) {
+                wps->wp[i] += (uint64_t)1 << 63;
+            }
+        }
+    }
+
+    return 0;
+}
+#endif
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
@@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
             error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
             return;
         }
+
+        ret = get_sysfs_long_val(&st, "physical_block_size");
+        if (ret >= 0) {
+            bs->bl.write_granularity = ret;
+        }
+
+        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
+        qemu_mutex_init(&bs->bl.wps->lock);
+        if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) {
+            error_report("report wps failed");
+            qemu_mutex_destroy(&bs->bl.wps->lock);
+            g_free(bs->bl.wps);
+            return;
+        }
     }
 }
 
@@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
     ssize_t len;
 
     do {
-        if (aiocb->aio_type & QEMU_AIO_WRITE)
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
             len = qemu_pwritev(aiocb->aio_fildes,
                                aiocb->io.iov,
                                aiocb->io.niov,
@@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
     ssize_t len;
 
     while (offset < aiocb->aio_nbytes) {
-        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
             len = pwrite(aiocb->aio_fildes,
                          (const char *)buf + offset,
                          aiocb->aio_nbytes - offset,
@@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque)
     }
 
     nbytes = handle_aiocb_rw_linear(aiocb, buf);
-    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
         char *p = buf;
         size_t count = aiocb->aio_nbytes, copy;
         int i;
@@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque)
 
 out:
     if (nbytes == aiocb->aio_nbytes) {
+#if defined(CONFIG_BLKZONED)
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
+            BlockZoneWps *wps = aiocb->io.wps;
+            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+            if (wps) {
+               if (BDRV_ZT_IS_SWR(wps->wp[index])) {
+                    qemu_mutex_lock(&wps->lock);
+                    wps->wp[index] += aiocb->aio_nbytes;
+                    qemu_mutex_unlock(&wps->lock);
+                }
+
+                if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
+                    *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS;
+                }
+            }
+        }
+#endif
         return 0;
     } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
         if (aiocb->aio_type & QEMU_AIO_WRITE) {
@@ -1737,6 +1823,19 @@ out:
         }
     } else {
         assert(nbytes < 0);
+#if defined(CONFIG_BLKZONED)
+        if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) {
+            qemu_mutex_lock(&aiocb->bs->bl.wps->lock);
+            if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
+                           aiocb->bs->bl.nr_zones) < 0) {
+                error_report("report zone wp failed");
+                qemu_mutex_destroy(&aiocb->bs->bl.wps->lock);
+                g_free(aiocb->bs->bl.wps);
+                return -EINVAL;
+            }
+            qemu_mutex_unlock(&aiocb->bs->bl.wps->lock);
+        }
+#endif
         return nbytes;
     }
 }
@@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) {
 static int handle_aiocb_zone_mgmt(void *opaque) {
 #if defined(CONFIG_BLKZONED)
     RawPosixAIOData *aiocb = opaque;
+    BlockDriverState *bs = aiocb->bs;
     int fd = aiocb->aio_fildes;
     int64_t sector = aiocb->aio_offset / 512;
     int64_t nr_sectors = aiocb->aio_nbytes / 512;
     struct blk_zone_range range;
     int ret;
 
+    BlockZoneWps *wps = bs->bl.wps;
+    int index = aiocb->aio_offset / bs->bl.zone_size;
+
     /* Execute the operation */
     range.sector = sector;
     range.nr_sectors = nr_sectors;
@@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
                      errno);
         return -errno;
     }
+    
+    if (aiocb->zone_mgmt.all) {
+        for (int i = 0; i < bs->bl.nr_zones; ++i) {
+            qemu_mutex_lock(&wps->lock);
+            wps->wp[i] = i * bs->bl.zone_size;
+            qemu_mutex_unlock(&wps->lock);
+        }
+    } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
+        qemu_mutex_lock(&wps->lock);
+        wps->wp[index] = aiocb->aio_offset;
+        qemu_mutex_unlock(&wps->lock);
+    } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
+        qemu_mutex_lock(&wps->lock);
+        wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size;
+        qemu_mutex_unlock(&wps->lock);
+    }
     return ret;
 #else
     return -ENOTSUP;
@@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
         },
     };
 
+    BlockZoneWps *wps = bs->bl.wps;
+    acb.io.wps = wps;
     assert(qiov->size == bytes);
     return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
 }
@@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs)
     BDRVRawState *s = bs->opaque;
 
     if (s->fd >= 0) {
+#if defined(CONFIG_BLKZONED)
+        if (bs->bl.wps) {
+            qemu_mutex_destroy(&bs->bl.wps->lock);
+            g_free(bs->bl.wps);
+        }
+#endif
         qemu_close(s->fd);
         s->fd = -1;
     }
@@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
         zone_op_name = "BLKRESETZONE";
         zone_op = BLKRESETZONE;
         break;
+    case BLK_ZO_RESET_ALL:
+        zone_op_name = "BLKRESETZONE";
+        zone_op = BLKRESETZONE;
+        is_all = true;
+        break;
     default:
         g_assert_not_reached();
     }
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 8efb6b0c43..43bfc484eb 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -58,6 +58,7 @@ typedef enum BlockZoneOp {
     BLK_ZO_CLOSE,
     BLK_ZO_FINISH,
     BLK_ZO_RESET,
+    BLK_ZO_RESET_ALL,
 } BlockZoneOp;
 
 typedef enum BlockZoneModel {
@@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor {
     BlockZoneCondition cond;
 } BlockZoneDescriptor;
 
+/*
+ * Track write pointers of a zone in bytes.
+ */
+typedef struct BlockZoneWps {
+    QemuMutex lock;
+    uint64_t wp[];
+} BlockZoneWps;
+
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
@@ -209,6 +218,13 @@ typedef enum {
 #define BDRV_SECTOR_BITS   9
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
 
+/*
+ * Get the first most significant bit of WP. If it is zero, then
+ * the zone type is SWR.
+ */
+#define BDRV_ZT_IS_SWR(WP)    ((WP & 0x8000000000000000) == 0) ? (true) : \
+                              (false)
+
 #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                            INT_MAX >> BDRV_SECTOR_BITS)
 #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 37dddc603c..59c2d1316d 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -857,6 +857,11 @@ typedef struct BlockLimits {
 
     /* device capacity expressed in bytes */
     int64_t capacity;
+
+    /* array of write pointers' location of each zone in the zoned device. */
+    BlockZoneWps *wps;
+
+    int64_t write_granularity;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 3d26929cdd..f13cc1887b 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -31,6 +31,7 @@
 #define QEMU_AIO_TRUNCATE     0x0080
 #define QEMU_AIO_ZONE_REPORT  0x0100
 #define QEMU_AIO_ZONE_MGMT    0x0200
+#define QEMU_AIO_ZONE_APPEND  0x0400
 #define QEMU_AIO_TYPE_MASK \
         (QEMU_AIO_READ | \
          QEMU_AIO_WRITE | \
@@ -41,7 +42,8 @@
          QEMU_AIO_COPY_RANGE | \
          QEMU_AIO_TRUNCATE  | \
          QEMU_AIO_ZONE_REPORT | \
-         QEMU_AIO_ZONE_MGMT)
+         QEMU_AIO_ZONE_MGMT | \
+         QEMU_AIO_ZONE_APPEND)
 
 /* AIO flags */
 #define QEMU_AIO_MISALIGNED   0x1000
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v2 2/2] block: introduce zone append write for zoned devices
  2022-09-29  9:31 [PATCH v2 0/2] add zone append write for zoned device Sam Li
  2022-09-29  9:31 ` [PATCH v2 1/2] file-posix: add the tracking of the zones wp Sam Li
@ 2022-09-29  9:31 ` Sam Li
  2022-10-05  1:54   ` Damien Le Moal
  1 sibling, 1 reply; 9+ messages in thread
From: Sam Li @ 2022-09-29  9:31 UTC (permalink / raw)
  To: qemu-devel
  Cc: Hanna Reitz, damien.lemoal, Fam Zheng, stefanha, dmitry.fomichev,
	hare, Kevin Wolf, qemu-block, Sam Li

A zone append command is a write operation that specifies the first
logical block of a zone as the write position. When writing to a zoned
block device using zone append, the byte offset of the write is pointing
to the write pointer of that zone. Upon completion the device will
respond with the position the data has been written in the zone.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/block-backend.c              | 65 ++++++++++++++++++++++++++++++
 block/file-posix.c                 | 51 +++++++++++++++++++++++
 block/io.c                         | 21 ++++++++++
 block/raw-format.c                 |  7 ++++
 include/block/block-io.h           |  3 ++
 include/block/block_int-common.h   |  3 ++
 include/sysemu/block-backend-io.h  |  9 +++++
 qemu-io-cmds.c                     | 62 ++++++++++++++++++++++++++++
 tests/qemu-iotests/tests/zoned.out |  7 ++++
 tests/qemu-iotests/tests/zoned.sh  |  9 +++++
 10 files changed, 237 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index f7f7acd6f4..07a8632af1 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
         struct {
             BlockZoneOp op;
         } zone_mgmt;
+        struct {
+            int64_t *append_sector;
+        } zone_append;
     };
 } BlkRwCo;
 
@@ -1869,6 +1872,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
     return &acb->common;
 }
 
+static void blk_aio_zone_append_entry(void *opaque) {
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
+                                   rwco->iobuf, rwco->flags);
+    blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+                                QEMUIOVector *qiov, BdrvRequestFlags flags,
+                                BlockCompletionFunc *cb, void *opaque) {
+    BlkAioEmAIOCB *acb;
+    Coroutine *co;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+        .blk    = blk,
+        .ret    = NOT_DONE,
+        .flags  = flags,
+        .iobuf  = qiov,
+        .zone_append = {
+                .append_sector = offset,
+        },
+    };
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+    bdrv_coroutine_enter(blk_bs(blk), co);
+
+    acb->has_returned = true;
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
 /*
  * Send a zone_report command.
  * offset is a byte offset from the start of the device. No alignment
@@ -1921,6 +1965,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
     return ret;
 }
 
+/*
+ * Send a zone_append command.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+        QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    if (!blk_is_available(blk)) {
+        blk_dec_in_flight(blk);
+        return -ENOMEDIUM;
+    }
+
+    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 33e81ac112..24b70f1afe 100755
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -3454,6 +3454,56 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
 #endif
 }
 
+
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
+                                           int64_t *offset,
+                                           QEMUIOVector *qiov,
+                                           BdrvRequestFlags flags) {
+#if defined(CONFIG_BLKZONED)
+    BDRVRawState *s = bs->opaque;
+    int64_t zone_size_mask = bs->bl.zone_size - 1;
+    int64_t iov_len = 0;
+    int64_t len = 0;
+    RawPosixAIOData acb;
+
+    if (*offset & zone_size_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
+        return -EINVAL;
+    }
+
+    int64_t wg = bs->bl.write_granularity;
+    int64_t wg_mask = wg - 1;
+    for (int i = 0; i < qiov->niov; i++) {
+       iov_len = qiov->iov[i].iov_len;
+       if (iov_len & wg_mask) {
+           error_report("len of IOVector[%d] %" PRId64 " is not aligned to block "
+                        "size %" PRId64 "", i, iov_len, wg);
+           return -EINVAL;
+       }
+       len += iov_len;
+    }
+
+    acb = (RawPosixAIOData) {
+        .bs = bs,
+        .aio_fildes = s->fd,
+        .aio_type = QEMU_AIO_ZONE_APPEND,
+        .aio_offset = bs->bl.wps->wp[*offset / bs->bl.zone_size],
+        .aio_nbytes = len,
+        .io = {
+                .iov = qiov->iov,
+                .niov = qiov->niov,
+                .wps = bs->bl.wps,
+                .append_sector = offset,
+        },
+    };
+
+    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
+#else
+    return -ENOTSUP;
+#endif
+}
+
 static coroutine_fn int
 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
                 bool blkdev)
@@ -4229,6 +4279,7 @@ static BlockDriver bdrv_zoned_host_device = {
     /* zone management operations */
     .bdrv_co_zone_report = raw_co_zone_report,
     .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+    .bdrv_co_zone_append = raw_co_zone_append,
 };
 #endif
 
diff --git a/block/io.c b/block/io.c
index 5ab2d169c8..b9dfdf0709 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3239,6 +3239,27 @@ out:
     return co.ret;
 }
 
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                        QEMUIOVector *qiov,
+                        BdrvRequestFlags flags)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+            .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_append) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
     IO_CODE();
diff --git a/block/raw-format.c b/block/raw-format.c
index 9441536819..df8cc33467 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -325,6 +325,12 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
     return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
 }
 
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                                           QEMUIOVector *qiov,
+                                           BdrvRequestFlags flags) {
+    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
+}
+
 static int64_t raw_getlength(BlockDriverState *bs)
 {
     int64_t len;
@@ -628,6 +634,7 @@ BlockDriver bdrv_raw = {
     .bdrv_co_pdiscard     = &raw_co_pdiscard,
     .bdrv_co_zone_report  = &raw_co_zone_report,
     .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
+    .bdrv_co_zone_append = &raw_co_zone_append,
     .bdrv_co_block_status = &raw_co_block_status,
     .bdrv_co_copy_range_from = &raw_co_copy_range_from,
     .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
diff --git a/include/block/block-io.h b/include/block/block-io.h
index 65463b88d9..a792164018 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
                                      BlockZoneDescriptor *zones);
 int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
                                    int64_t offset, int64_t len);
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                                     QEMUIOVector *qiov,
+                                     BdrvRequestFlags flags);
 
 int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 59c2d1316d..a7e7db5646 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -701,6 +701,9 @@ struct BlockDriver {
             BlockZoneDescriptor *zones);
     int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
             int64_t offset, int64_t len);
+    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
+            int64_t *offset, QEMUIOVector *qiov,
+            BdrvRequestFlags flags);
 
     /* removable device specific */
     bool (*bdrv_is_inserted)(BlockDriverState *bs);
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index 6835525582..33e35ae5d7 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -51,6 +51,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
 BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
                               int64_t offset, int64_t len,
                               BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+                                QEMUIOVector *qiov, BdrvRequestFlags flags,
+                                BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
                              BlockCompletionFunc *cb, void *opaque);
 void blk_aio_cancel_async(BlockAIOCB *acb);
@@ -172,6 +175,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
                                   int64_t offset, int64_t len);
 int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
                                        int64_t offset, int64_t len);
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+                                    QEMUIOVector *qiov,
+                                    BdrvRequestFlags flags);
+int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
+                                         QEMUIOVector *qiov,
+                                         BdrvRequestFlags flags);
 
 int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
                                       int64_t bytes);
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index e56c8d1c30..6cb86de35b 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1855,6 +1855,67 @@ static const cmdinfo_t zone_reset_cmd = {
     .oneline = "reset a zone write pointer in zone block device",
 };
 
+static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov,
+                              int64_t *offset, int flags, int *total)
+{
+    int async_ret = NOT_DONE;
+
+    blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, &async_ret);
+    while (async_ret == NOT_DONE) {
+        main_loop_wait(false);
+    }
+
+    *total = qiov->size;
+    return async_ret < 0 ? async_ret : 1;
+}
+
+static int zone_append_f(BlockBackend *blk, int argc, char **argv) {
+    int ret;
+    int flags = 0;
+    int total = 0;
+    int64_t offset;
+    char *buf;
+    int nr_iov;
+    int pattern = 0xcd;
+    QEMUIOVector qiov;
+
+    if (optind > argc - 2) {
+        return -EINVAL;
+    }
+    optind++;
+    offset = cvtnum(argv[optind]);
+    if (offset < 0) {
+        print_cvtnum_err(offset, argv[optind]);
+        return offset;
+    }
+    optind++;
+    nr_iov = argc - optind;
+    buf = create_iovec(blk, &qiov, &argv[optind], nr_iov, pattern);
+    if (buf == NULL) {
+        return -EINVAL;
+    }
+    ret = do_aio_zone_append(blk, &qiov, &offset, flags, &total);
+    if (ret < 0) {
+        printf("zone append failed: %s\n", strerror(-ret));
+        goto out;
+    }
+
+    out:
+    qemu_iovec_destroy(&qiov);
+    qemu_io_free(buf);
+    return ret;
+}
+
+static const cmdinfo_t zone_append_cmd = {
+    .name = "zone_append",
+    .altname = "zap",
+    .cfunc = zone_append_f,
+    .argmin = 3,
+    .argmax = 3,
+    .args = "offset len [len..]",
+    .oneline = "append write a number of bytes at a specified offset",
+};
+
 static int truncate_f(BlockBackend *blk, int argc, char **argv);
 static const cmdinfo_t truncate_cmd = {
     .name       = "truncate",
@@ -2652,6 +2713,7 @@ static void __attribute((constructor)) init_qemuio_commands(void)
     qemuio_add_command(&zone_close_cmd);
     qemuio_add_command(&zone_finish_cmd);
     qemuio_add_command(&zone_reset_cmd);
+    qemuio_add_command(&zone_append_cmd);
     qemuio_add_command(&truncate_cmd);
     qemuio_add_command(&length_cmd);
     qemuio_add_command(&info_cmd);
diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-iotests/tests/zoned.out
index 0c8f96deb9..b3b139b4ec 100644
--- a/tests/qemu-iotests/tests/zoned.out
+++ b/tests/qemu-iotests/tests/zoned.out
@@ -50,4 +50,11 @@ start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:14, [type: 2]
 (5) resetting the second zone
 After resetting a zone:
 start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
+
+
+(6) append write
+After appending the first zone:
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x18, zcond:2, [type: 2]
+After appending the second zone:
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80018, zcond:2, [type: 2]
 *** done
diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-iotests/tests/zoned.sh
index fced0194c5..888711eef2 100755
--- a/tests/qemu-iotests/tests/zoned.sh
+++ b/tests/qemu-iotests/tests/zoned.sh
@@ -79,6 +79,15 @@ echo "(5) resetting the second zone"
 sudo $QEMU_IO $IMG -c "zrs 268435456 268435456"
 echo "After resetting a zone:"
 sudo $QEMU_IO $IMG -c "zrp 268435456 1"
+echo
+echo
+echo "(6) append write" # physical block size of the device is 4096
+sudo $QEMU_IO $IMG -c "zap 0 0x1000 0x2000"
+echo "After appending the first zone:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+sudo $QEMU_IO $IMG -c "zap 268435456 0x1000 0x2000"
+echo "After appending the second zone:"
+sudo $QEMU_IO $IMG -c "zrp 268435456 1"
 
 # success, all done
 echo "*** done"
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/2] file-posix: add the tracking of the zones wp
  2022-09-29  9:31 ` [PATCH v2 1/2] file-posix: add the tracking of the zones wp Sam Li
@ 2022-10-05  1:44   ` Damien Le Moal
  2022-10-05  2:30     ` Damien Le Moal
  2022-10-05  7:33     ` Damien Le Moal
  0 siblings, 2 replies; 9+ messages in thread
From: Damien Le Moal @ 2022-10-05  1:44 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: Hanna Reitz, Fam Zheng, stefanha, dmitry.fomichev, hare,
	Kevin Wolf, qemu-block

On 9/29/22 18:31, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is sequential write required.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/file-posix.c               | 138 ++++++++++++++++++++++++++++++-
>  include/block/block-common.h     |  16 ++++
>  include/block/block_int-common.h |   5 ++
>  include/block/raw-aio.h          |   4 +-
>  4 files changed, 159 insertions(+), 4 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 73656d87f2..33e81ac112 100755
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData {
>          struct {
>              struct iovec *iov;
>              int niov;
> +            int64_t *append_sector;
> +            BlockZoneWps *wps;
>          } io;
>          struct {
>              uint64_t cmd;
> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps,
> +                          unsigned int nrz) {

Maybe rename this to get_zones_wp() ?

> +    struct blk_zone *blkz;
> +    int64_t rep_size;
> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
> +    int ret, n = 0, i = 0;
> +
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;

To be cleaner, move this declaration above with the others ?

> +    rep = g_malloc(rep_size);
> +
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = sector;
> +        rep->nr_zones = nrz - n;
> +
> +        do {
> +            ret = ioctl(fd, BLKREPORTZONE, rep);
> +        } while (ret != 0 && errno == EINTR);
> +        if (ret != 0) {
> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +                    fd, offset, errno);
> +            return -errno;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> +            wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> +            sector = blkz[i].start + blkz[i].len;
> +
> +            /*
> +             * In the wp tracking, it only cares if the zone type is sequential
> +             * writes required so that the wp can advance to the right location.

Or sequential write preferred (host aware case)

> +             * Instead of the type of zone_type which is an 8-bit unsigned
> +             * integer, use the first most significant bits of the wp location
> +             * to indicate the zone type: 0 for SWR zones and 1 for the
> +             * others.
> +             */
> +            if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) {

This should be:

		if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) {

Note that the type field is not a bit-field. So you must compare values
instead of doing bit operations.

> +                wps->wp[i] += (uint64_t)1 << 63;

You can simplify this:

		   wps->wp[i] |= 1ULL << 63;

Overall, I would rewrite this like this:

for (i = 0; i < rep->nr_zones; i++, n++) {
    /*
     * The wp tracking cares only about sequential write required
     * and sequential write preferred zones so that the wp can
     * advance to the right location.
     * Use the most significant bit of the wp location
     * to indicate the zone type: 0 for SWR zones and 1 for
     * conventional zones.
     */
    if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
        wps->wp[i] = 1ULL << 63;
    else
        wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
}
sector = blkz[i - 1].start + blkz[i - 1].len;

Which I think is a lot simpler.

> +            }
> +        }
> +    }
> +
> +    return 0;
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
>              return;
>          }
> +
> +        ret = get_sysfs_long_val(&st, "physical_block_size");
> +        if (ret >= 0) {
> +            bs->bl.write_granularity = ret;
> +        }

This change seems unrelated to the wp tracking. Should this be moved to a
different patch ?

> +
> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> +        qemu_mutex_init(&bs->bl.wps->lock);

Move this initialization after the if block. Doing so, you do not need to
call mutex destroy in case of error.

> +        if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) {
> +            error_report("report wps failed");
> +            qemu_mutex_destroy(&bs->bl.wps->lock);
> +            g_free(bs->bl.wps);
> +            return;
> +        }
>      }
>  }
>  
> @@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>      ssize_t len;
>  
>      do {
> -        if (aiocb->aio_type & QEMU_AIO_WRITE)
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>              len = qemu_pwritev(aiocb->aio_fildes,
>                                 aiocb->io.iov,
>                                 aiocb->io.niov,
> @@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
>      ssize_t len;
>  
>      while (offset < aiocb->aio_nbytes) {
> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              len = pwrite(aiocb->aio_fildes,
>                           (const char *)buf + offset,
>                           aiocb->aio_nbytes - offset,
> @@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque)
>      }
>  
>      nbytes = handle_aiocb_rw_linear(aiocb, buf);
> -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
> +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
>          char *p = buf;
>          size_t count = aiocb->aio_nbytes, copy;
>          int i;
> @@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque)
>  
>  out:
>      if (nbytes == aiocb->aio_nbytes) {
> +#if defined(CONFIG_BLKZONED)
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> +            BlockZoneWps *wps = aiocb->io.wps;

Why adding this pointer to the aiocb struct ? You can get the array from
aiocb->bs->bl.wps, no ?

> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> +            if (wps) {
> +               if (BDRV_ZT_IS_SWR(wps->wp[index])) {
> +                    qemu_mutex_lock(&wps->lock);
> +                    wps->wp[index] += aiocb->aio_nbytes;
> +                    qemu_mutex_unlock(&wps->lock);
> +                }
> +
> +                if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> +                    *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS;

Bug: the append sector must be the first sector written, not the wp
(sector following the last written sector). So this must be done *before*
advancing the wp.

You need to have wps->lock held here too since you are reading
wps->wp[index]. So the mutex lock/unlock needs to be around the 2 hunks
under "if (wps) {". Also, given that there cannot be any zone append
issued to conventional zones (they will fail), you can simplify:

            if (wps) {
                qemu_mutex_lock(&wps->lock);
                if (BDRV_ZT_IS_SWR(wps->wp[index])) {
                    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
                        *aiocb->io.append_sector =
                           wps->wp[index] >> BDRV_SECTOR_BITS;
                    }
                    wps->wp[index] += aiocb->aio_nbytes;
                }
                qemu_mutex_unlock(&wps->lock);
            }

Now the last problem with this code is sequential write preferred zones.
For these, the write may actually be overwriting sectors that have already
been written, meaning that the wp may not necessarilly need to be
advanced. You can handle that case together with SWR case simply like this:

            if (wps) {
                qemu_mutex_lock(&wps->lock);
                if (BDRV_ZT_IS_SWR(wps->wp[index])) {
                    uint64_t wend_offset =
                         aiocb->aio_offset + aiocb->aio_nbytes;

                    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
                        *aiocb->io.append_sector =
                           wps->wp[index] >> BDRV_SECTOR_BITS;
                    }

                    /* Advance the wp if needed */
                    if (wend_offset > wps->wp[index])
                        wps->wp[index] = wend_offset;
                }
                qemu_mutex_unlock(&wps->lock);
            }

> +                }
> +            }
> +        }
> +#endif
>          return 0;
>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
> @@ -1737,6 +1823,19 @@ out:
>          }
>      } else {
>          assert(nbytes < 0);
> +#if defined(CONFIG_BLKZONED)
> +        if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) {

Why do this only for zone append ? Regular write error also need a refresh
of the zone wp.

> +            qemu_mutex_lock(&aiocb->bs->bl.wps->lock);
> +            if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> +                           aiocb->bs->bl.nr_zones) < 0) {
> +                error_report("report zone wp failed");
> +                qemu_mutex_destroy(&aiocb->bs->bl.wps->lock);
> +                g_free(aiocb->bs->bl.wps);
> +                return -EINVAL;
> +            }
> +            qemu_mutex_unlock(&aiocb->bs->bl.wps->lock);

This really needs to be a helper function, e.g. update_zone_wp() or
something like this. Aslo, why nuke the entire zone array if the refresh
for this zone fails ? You could simply leave it as is. The next write may
again fail and another attempt at refreshing it done.

> +        }
> +#endif
>          return nbytes;
>      }
>  }
> @@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) {
>  static int handle_aiocb_zone_mgmt(void *opaque) {
>  #if defined(CONFIG_BLKZONED)
>      RawPosixAIOData *aiocb = opaque;
> +    BlockDriverState *bs = aiocb->bs;
>      int fd = aiocb->aio_fildes;
>      int64_t sector = aiocb->aio_offset / 512;
>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
>      struct blk_zone_range range;
>      int ret;
>  
> +    BlockZoneWps *wps = bs->bl.wps;
> +    int index = aiocb->aio_offset / bs->bl.zone_size;
> +
>      /* Execute the operation */
>      range.sector = sector;
>      range.nr_sectors = nr_sectors;
> @@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
>                       errno);
>          return -errno;
>      }
> +    
> +    if (aiocb->zone_mgmt.all) {
> +        for (int i = 0; i < bs->bl.nr_zones; ++i) {
> +            qemu_mutex_lock(&wps->lock);
> +            wps->wp[i] = i * bs->bl.zone_size;
> +            qemu_mutex_unlock(&wps->lock);
> +        }
> +    } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
> +        qemu_mutex_lock(&wps->lock);
> +        wps->wp[index] = aiocb->aio_offset;
> +        qemu_mutex_unlock(&wps->lock);
> +    } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
> +        qemu_mutex_lock(&wps->lock);
> +        wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size;

This may be the last zone of the device, which may be smaller. So you need
to check that here. Same for the first case for reset all: you need to
handle the smaller last zone if there is one.

> +        qemu_mutex_unlock(&wps->lock);
> +    }

Instead of the lock/unlock for each case here, take the mutex lock before
the if () and unlock it after it. Less lines :)

Also, if the zone management command fails, you need to do a report zones
and refresh the wps array.

>      return ret;
>  #else
>      return -ENOTSUP;
> @@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
>          },
>      };
>  
> +    BlockZoneWps *wps = bs->bl.wps;
> +    acb.io.wps = wps;

You do not need the pws variable. Simply do:

       acb.io.wps = bs->bl.wps;

>      assert(qiov->size == bytes);
>      return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
>  }
> @@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs)
>      BDRVRawState *s = bs->opaque;
>  
>      if (s->fd >= 0) {
> +#if defined(CONFIG_BLKZONED)
> +        if (bs->bl.wps) {
> +            qemu_mutex_destroy(&bs->bl.wps->lock);
> +            g_free(bs->bl.wps);
> +        }
> +#endif
>          qemu_close(s->fd);
>          s->fd = -1;
>      }
> @@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>          zone_op_name = "BLKRESETZONE";
>          zone_op = BLKRESETZONE;
>          break;
> +    case BLK_ZO_RESET_ALL:
> +        zone_op_name = "BLKRESETZONE";
> +        zone_op = BLKRESETZONE;
> +        is_all = true;
> +        break;

This change seems unrelated to the wp tracking. Different patch ?

>      default:
>          g_assert_not_reached();
>      }
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index 8efb6b0c43..43bfc484eb 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -58,6 +58,7 @@ typedef enum BlockZoneOp {
>      BLK_ZO_CLOSE,
>      BLK_ZO_FINISH,
>      BLK_ZO_RESET,
> +    BLK_ZO_RESET_ALL,

same here. Adding reset all support should be a different patch.
>  } BlockZoneOp;
>  
>  typedef enum BlockZoneModel {
> @@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor {
>      BlockZoneCondition cond;
>  } BlockZoneDescriptor;
>  
> +/*
> + * Track write pointers of a zone in bytes.
> + */
> +typedef struct BlockZoneWps {
> +    QemuMutex lock;
> +    uint64_t wp[];
> +} BlockZoneWps;
> +
>  typedef struct BlockDriverInfo {
>      /* in bytes, 0 if irrelevant */
>      int cluster_size;
> @@ -209,6 +218,13 @@ typedef enum {
>  #define BDRV_SECTOR_BITS   9
>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
>  
> +/*
> + * Get the first most significant bit of WP. If it is zero, then
> + * the zone type is SWR.
> + */
> +#define BDRV_ZT_IS_SWR(WP)    ((WP & 0x8000000000000000) == 0) ? (true) : \
> +                              (false)

Simplify:

#define BDRV_ZT_IS_SWR(wp)	(!((wp) & (1ULL << 63))

But since this must be used for both SWR and SWP zones, I would reverse
this into:

#define BDRV_ZONE_IS_CONV(wp)	((wp) & (1ULL << 63))

Which is a lot simpler.

> +
>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
>                                             INT_MAX >> BDRV_SECTOR_BITS)
>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 37dddc603c..59c2d1316d 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
>  
>      /* device capacity expressed in bytes */
>      int64_t capacity;
> +
> +    /* array of write pointers' location of each zone in the zoned device. */
> +    BlockZoneWps *wps;
> +
> +    int64_t write_granularity;
>  } BlockLimits;
>  
>  typedef struct BdrvOpBlocker BdrvOpBlocker;
> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> index 3d26929cdd..f13cc1887b 100644
> --- a/include/block/raw-aio.h
> +++ b/include/block/raw-aio.h
> @@ -31,6 +31,7 @@
>  #define QEMU_AIO_TRUNCATE     0x0080
>  #define QEMU_AIO_ZONE_REPORT  0x0100
>  #define QEMU_AIO_ZONE_MGMT    0x0200
> +#define QEMU_AIO_ZONE_APPEND  0x0400
>  #define QEMU_AIO_TYPE_MASK \
>          (QEMU_AIO_READ | \
>           QEMU_AIO_WRITE | \
> @@ -41,7 +42,8 @@
>           QEMU_AIO_COPY_RANGE | \
>           QEMU_AIO_TRUNCATE  | \
>           QEMU_AIO_ZONE_REPORT | \
> -         QEMU_AIO_ZONE_MGMT)
> +         QEMU_AIO_ZONE_MGMT | \
> +         QEMU_AIO_ZONE_APPEND)

This should be introduced in patch 2. This patch should be only about zone
wp tracking with regular writes and zone management ops. The second patch
can implement zone append emulation thanks to this patch. So separate.

>  
>  /* AIO flags */
>  #define QEMU_AIO_MISALIGNED   0x1000

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 2/2] block: introduce zone append write for zoned devices
  2022-09-29  9:31 ` [PATCH v2 2/2] block: introduce zone append write for zoned devices Sam Li
@ 2022-10-05  1:54   ` Damien Le Moal
  0 siblings, 0 replies; 9+ messages in thread
From: Damien Le Moal @ 2022-10-05  1:54 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: Hanna Reitz, Fam Zheng, stefanha, dmitry.fomichev, hare,
	Kevin Wolf, qemu-block

On 9/29/22 18:31, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of the write is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/block-backend.c              | 65 ++++++++++++++++++++++++++++++
>  block/file-posix.c                 | 51 +++++++++++++++++++++++
>  block/io.c                         | 21 ++++++++++
>  block/raw-format.c                 |  7 ++++
>  include/block/block-io.h           |  3 ++
>  include/block/block_int-common.h   |  3 ++
>  include/sysemu/block-backend-io.h  |  9 +++++
>  qemu-io-cmds.c                     | 62 ++++++++++++++++++++++++++++
>  tests/qemu-iotests/tests/zoned.out |  7 ++++
>  tests/qemu-iotests/tests/zoned.sh  |  9 +++++
>  10 files changed, 237 insertions(+)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index f7f7acd6f4..07a8632af1 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>          struct {
>              BlockZoneOp op;
>          } zone_mgmt;
> +        struct {
> +            int64_t *append_sector;
> +        } zone_append;
>      };
>  } BlkRwCo;
>  
> @@ -1869,6 +1872,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>      return &acb->common;
>  }
>  
> +static void blk_aio_zone_append_entry(void *opaque) {
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
> +                                   rwco->iobuf, rwco->flags);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +        .blk    = blk,
> +        .ret    = NOT_DONE,
> +        .flags  = flags,
> +        .iobuf  = qiov,
> +        .zone_append = {
> +                .append_sector = offset,
> +        },
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> +                                         blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1921,6 +1965,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>      return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +        QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +        blk_dec_in_flight(blk);
> +        return -ENOMEDIUM;
> +    }
> +
> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 33e81ac112..24b70f1afe 100755
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -3454,6 +3454,56 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>  #endif
>  }
>  
> +

whiteline change.

> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> +                                           int64_t *offset,
> +                                           QEMUIOVector *qiov,
> +                                           BdrvRequestFlags flags) {
> +#if defined(CONFIG_BLKZONED)
> +    BDRVRawState *s = bs->opaque;
> +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> +    int64_t iov_len = 0;
> +    int64_t len = 0;
> +    RawPosixAIOData acb;
> +
> +    if (*offset & zone_size_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    int64_t wg = bs->bl.write_granularity;
> +    int64_t wg_mask = wg - 1;
> +    for (int i = 0; i < qiov->niov; i++) {
> +       iov_len = qiov->iov[i].iov_len;
> +       if (iov_len & wg_mask) {
> +           error_report("len of IOVector[%d] %" PRId64 " is not aligned to block "
> +                        "size %" PRId64 "", i, iov_len, wg);
> +           return -EINVAL;
> +       }
> +       len += iov_len;
> +    }
> +
> +    acb = (RawPosixAIOData) {
> +        .bs = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type = QEMU_AIO_ZONE_APPEND,
> +        .aio_offset = bs->bl.wps->wp[*offset / bs->bl.zone_size],

You cannot read the wps array without holding wps->lock, and the lock MUST
be held until the aio is issued. Otherwise, if another aio is issued after
the above line, the 2 AIOs may end up being issued in reverse order,
causing an unaligned write error.

Given this, I would change this to issue this aio without changing the
offset and let the lowest level function in file-posix doing the
conversion from zone append to regular write, incrementing the wp and
issue the write, all with wps lock held.

> +        .aio_nbytes = len,
> +        .io = {
> +                .iov = qiov->iov,
> +                .niov = qiov->niov,
> +                .wps = bs->bl.wps,
> +                .append_sector = offset,
> +        },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
>  static coroutine_fn int
>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                  bool blkdev)
> @@ -4229,6 +4279,7 @@ static BlockDriver bdrv_zoned_host_device = {
>      /* zone management operations */
>      .bdrv_co_zone_report = raw_co_zone_report,
>      .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = raw_co_zone_append,
>  };
>  #endif
>  
> diff --git a/block/io.c b/block/io.c
> index 5ab2d169c8..b9dfdf0709 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3239,6 +3239,27 @@ out:
>      return co.ret;
>  }
>  
> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                        QEMUIOVector *qiov,
> +                        BdrvRequestFlags flags)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +            .coroutine = qemu_coroutine_self(),
> +    };
> +    IO_CODE();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_zone_append) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
>  void *qemu_blockalign(BlockDriverState *bs, size_t size)
>  {
>      IO_CODE();
> diff --git a/block/raw-format.c b/block/raw-format.c
> index 9441536819..df8cc33467 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -325,6 +325,12 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>      return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
>  }
>  
> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                                           QEMUIOVector *qiov,
> +                                           BdrvRequestFlags flags) {
> +    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
> +}
> +
>  static int64_t raw_getlength(BlockDriverState *bs)
>  {
>      int64_t len;
> @@ -628,6 +634,7 @@ BlockDriver bdrv_raw = {
>      .bdrv_co_pdiscard     = &raw_co_pdiscard,
>      .bdrv_co_zone_report  = &raw_co_zone_report,
>      .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = &raw_co_zone_append,
>      .bdrv_co_block_status = &raw_co_block_status,
>      .bdrv_co_copy_range_from = &raw_co_copy_range_from,
>      .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index 65463b88d9..a792164018 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>                                       BlockZoneDescriptor *zones);
>  int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>                                     int64_t offset, int64_t len);
> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                                     QEMUIOVector *qiov,
> +                                     BdrvRequestFlags flags);
>  
>  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 59c2d1316d..a7e7db5646 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -701,6 +701,9 @@ struct BlockDriver {
>              BlockZoneDescriptor *zones);
>      int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
>              int64_t offset, int64_t len);
> +    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
> +            int64_t *offset, QEMUIOVector *qiov,
> +            BdrvRequestFlags flags);
>  
>      /* removable device specific */
>      bool (*bdrv_is_inserted)(BlockDriverState *bs);
> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
> index 6835525582..33e35ae5d7 100644
> --- a/include/sysemu/block-backend-io.h
> +++ b/include/sysemu/block-backend-io.h
> @@ -51,6 +51,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
>  BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                int64_t offset, int64_t len,
>                                BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque);
>  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
>                               BlockCompletionFunc *cb, void *opaque);
>  void blk_aio_cancel_async(BlockAIOCB *acb);
> @@ -172,6 +175,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                    int64_t offset, int64_t len);
>  int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                         int64_t offset, int64_t len);
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +                                    QEMUIOVector *qiov,
> +                                    BdrvRequestFlags flags);
> +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
> +                                         QEMUIOVector *qiov,
> +                                         BdrvRequestFlags flags);
>  
>  int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
>                                        int64_t bytes);
> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c

Hmmm... may be move the test stuff to a 3rd patch to facilitate review
(and make each patch smaller) ?

> index e56c8d1c30..6cb86de35b 100644
> --- a/qemu-io-cmds.c
> +++ b/qemu-io-cmds.c
> @@ -1855,6 +1855,67 @@ static const cmdinfo_t zone_reset_cmd = {
>      .oneline = "reset a zone write pointer in zone block device",
>  };
>  
> +static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov,
> +                              int64_t *offset, int flags, int *total)
> +{
> +    int async_ret = NOT_DONE;
> +
> +    blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, &async_ret);
> +    while (async_ret == NOT_DONE) {
> +        main_loop_wait(false);
> +    }
> +
> +    *total = qiov->size;
> +    return async_ret < 0 ? async_ret : 1;
> +}
> +
> +static int zone_append_f(BlockBackend *blk, int argc, char **argv) {
> +    int ret;
> +    int flags = 0;
> +    int total = 0;
> +    int64_t offset;
> +    char *buf;
> +    int nr_iov;
> +    int pattern = 0xcd;
> +    QEMUIOVector qiov;
> +
> +    if (optind > argc - 2) {
> +        return -EINVAL;
> +    }
> +    optind++;
> +    offset = cvtnum(argv[optind]);
> +    if (offset < 0) {
> +        print_cvtnum_err(offset, argv[optind]);
> +        return offset;
> +    }
> +    optind++;
> +    nr_iov = argc - optind;
> +    buf = create_iovec(blk, &qiov, &argv[optind], nr_iov, pattern);
> +    if (buf == NULL) {
> +        return -EINVAL;
> +    }
> +    ret = do_aio_zone_append(blk, &qiov, &offset, flags, &total);
> +    if (ret < 0) {
> +        printf("zone append failed: %s\n", strerror(-ret));
> +        goto out;
> +    }
> +
> +    out:
> +    qemu_iovec_destroy(&qiov);
> +    qemu_io_free(buf);
> +    return ret;
> +}
> +
> +static const cmdinfo_t zone_append_cmd = {
> +    .name = "zone_append",
> +    .altname = "zap",
> +    .cfunc = zone_append_f,
> +    .argmin = 3,
> +    .argmax = 3,
> +    .args = "offset len [len..]",
> +    .oneline = "append write a number of bytes at a specified offset",
> +};
> +
>  static int truncate_f(BlockBackend *blk, int argc, char **argv);
>  static const cmdinfo_t truncate_cmd = {
>      .name       = "truncate",
> @@ -2652,6 +2713,7 @@ static void __attribute((constructor)) init_qemuio_commands(void)
>      qemuio_add_command(&zone_close_cmd);
>      qemuio_add_command(&zone_finish_cmd);
>      qemuio_add_command(&zone_reset_cmd);
> +    qemuio_add_command(&zone_append_cmd);
>      qemuio_add_command(&truncate_cmd);
>      qemuio_add_command(&length_cmd);
>      qemuio_add_command(&info_cmd);
> diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-iotests/tests/zoned.out
> index 0c8f96deb9..b3b139b4ec 100644
> --- a/tests/qemu-iotests/tests/zoned.out
> +++ b/tests/qemu-iotests/tests/zoned.out
> @@ -50,4 +50,11 @@ start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:14, [type: 2]
>  (5) resetting the second zone
>  After resetting a zone:
>  start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
> +
> +
> +(6) append write
> +After appending the first zone:
> +start: 0x0, len 0x80000, cap 0x80000, wptr 0x18, zcond:2, [type: 2]
> +After appending the second zone:
> +start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80018, zcond:2, [type: 2]
>  *** done
> diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-iotests/tests/zoned.sh
> index fced0194c5..888711eef2 100755
> --- a/tests/qemu-iotests/tests/zoned.sh
> +++ b/tests/qemu-iotests/tests/zoned.sh
> @@ -79,6 +79,15 @@ echo "(5) resetting the second zone"
>  sudo $QEMU_IO $IMG -c "zrs 268435456 268435456"
>  echo "After resetting a zone:"
>  sudo $QEMU_IO $IMG -c "zrp 268435456 1"
> +echo
> +echo
> +echo "(6) append write" # physical block size of the device is 4096
> +sudo $QEMU_IO $IMG -c "zap 0 0x1000 0x2000"
> +echo "After appending the first zone:"
> +sudo $QEMU_IO $IMG -c "zrp 0 1"
> +sudo $QEMU_IO $IMG -c "zap 268435456 0x1000 0x2000"
> +echo "After appending the second zone:"
> +sudo $QEMU_IO $IMG -c "zrp 268435456 1"
>  
>  # success, all done
>  echo "*** done"

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/2] file-posix: add the tracking of the zones wp
  2022-10-05  1:44   ` Damien Le Moal
@ 2022-10-05  2:30     ` Damien Le Moal
  2022-10-05  7:33     ` Damien Le Moal
  1 sibling, 0 replies; 9+ messages in thread
From: Damien Le Moal @ 2022-10-05  2:30 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: Hanna Reitz, Fam Zheng, stefanha, dmitry.fomichev, hare,
	Kevin Wolf, qemu-block

On 10/5/22 10:44, Damien Le Moal wrote:
> On 9/29/22 18:31, Sam Li wrote:
>> Since Linux doesn't have a user API to issue zone append operations to
>> zoned devices from user space, the file-posix driver is modified to add
>> zone append emulation using regular writes. To do this, the file-posix
>> driver tracks the wp location of all zones of the device. It uses an
>> array of uint64_t. The most significant bit of each wp location indicates
>> if the zone type is sequential write required.
>>
>> The zones wp can be changed due to the following operations issued:
>> - zone reset: change the wp to the start offset of that zone
>> - zone finish: change to the end location of that zone
>> - write to a zone
>> - zone append
>>
>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>> ---
>>  block/file-posix.c               | 138 ++++++++++++++++++++++++++++++-
>>  include/block/block-common.h     |  16 ++++
>>  include/block/block_int-common.h |   5 ++
>>  include/block/raw-aio.h          |   4 +-
>>  4 files changed, 159 insertions(+), 4 deletions(-)
>>
>> diff --git a/block/file-posix.c b/block/file-posix.c
>> index 73656d87f2..33e81ac112 100755
>> --- a/block/file-posix.c
>> +++ b/block/file-posix.c
>> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData {
>>          struct {
>>              struct iovec *iov;
>>              int niov;
>> +            int64_t *append_sector;
>> +            BlockZoneWps *wps;
>>          } io;
>>          struct {
>>              uint64_t cmd;
>> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
>>  #endif
>>  }
>>  
>> +#if defined(CONFIG_BLKZONED)
>> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps,
>> +                          unsigned int nrz) {
> 
> Maybe rename this to get_zones_wp() ?
> 
>> +    struct blk_zone *blkz;
>> +    int64_t rep_size;
>> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
>> +    int ret, n = 0, i = 0;
>> +
>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>> +    g_autofree struct blk_zone_report *rep = NULL;
> 
> To be cleaner, move this declaration above with the others ?
> 
>> +    rep = g_malloc(rep_size);
>> +
>> +    blkz = (struct blk_zone *)(rep + 1);
>> +    while (n < nrz) {
>> +        memset(rep, 0, rep_size);
>> +        rep->sector = sector;
>> +        rep->nr_zones = nrz - n;
>> +
>> +        do {
>> +            ret = ioctl(fd, BLKREPORTZONE, rep);
>> +        } while (ret != 0 && errno == EINTR);
>> +        if (ret != 0) {
>> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
>> +                    fd, offset, errno);
>> +            return -errno;
>> +        }
>> +
>> +        if (!rep->nr_zones) {
>> +            break;
>> +        }
>> +
>> +        for (i = 0; i < rep->nr_zones; i++, n++) {
>> +            wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
>> +            sector = blkz[i].start + blkz[i].len;
>> +
>> +            /*
>> +             * In the wp tracking, it only cares if the zone type is sequential
>> +             * writes required so that the wp can advance to the right location.
> 
> Or sequential write preferred (host aware case)
> 
>> +             * Instead of the type of zone_type which is an 8-bit unsigned
>> +             * integer, use the first most significant bits of the wp location
>> +             * to indicate the zone type: 0 for SWR zones and 1 for the
>> +             * others.
>> +             */
>> +            if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) {
> 
> This should be:
> 
> 		if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) {
> 
> Note that the type field is not a bit-field. So you must compare values
> instead of doing bit operations.
> 
>> +                wps->wp[i] += (uint64_t)1 << 63;
> 
> You can simplify this:
> 
> 		   wps->wp[i] |= 1ULL << 63;
> 
> Overall, I would rewrite this like this:
> 
> for (i = 0; i < rep->nr_zones; i++, n++) {
>     /*
>      * The wp tracking cares only about sequential write required
>      * and sequential write preferred zones so that the wp can
>      * advance to the right location.
>      * Use the most significant bit of the wp location
>      * to indicate the zone type: 0 for SWR zones and 1 for
>      * conventional zones.
>      */
>     if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
>         wps->wp[i] = 1ULL << 63;
>     else
>         wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> }
> sector = blkz[i - 1].start + blkz[i - 1].len;
> 
> Which I think is a lot simpler.
> 
>> +            }
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +#endif
>> +
>>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>  {
>>      BDRVRawState *s = bs->opaque;
>> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
>>              return;
>>          }
>> +
>> +        ret = get_sysfs_long_val(&st, "physical_block_size");
>> +        if (ret >= 0) {
>> +            bs->bl.write_granularity = ret;
>> +        }
> 
> This change seems unrelated to the wp tracking. Should this be moved to a
> different patch ?
> 
>> +
>> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>> +        qemu_mutex_init(&bs->bl.wps->lock);
> 
> Move this initialization after the if block. Doing so, you do not need to
> call mutex destroy in case of error.
> 
>> +        if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) {
>> +            error_report("report wps failed");
>> +            qemu_mutex_destroy(&bs->bl.wps->lock);
>> +            g_free(bs->bl.wps);
>> +            return;
>> +        }
>>      }
>>  }
>>  
>> @@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>>      ssize_t len;
>>  
>>      do {
>> -        if (aiocb->aio_type & QEMU_AIO_WRITE)
>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>>              len = qemu_pwritev(aiocb->aio_fildes,
>>                                 aiocb->io.iov,
>>                                 aiocb->io.niov,
>> @@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
>>      ssize_t len;
>>  
>>      while (offset < aiocb->aio_nbytes) {
>> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>>              len = pwrite(aiocb->aio_fildes,
>>                           (const char *)buf + offset,
>>                           aiocb->aio_nbytes - offset,
>> @@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque)
>>      }
>>  
>>      nbytes = handle_aiocb_rw_linear(aiocb, buf);
>> -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
>> +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
>>          char *p = buf;
>>          size_t count = aiocb->aio_nbytes, copy;
>>          int i;
>> @@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque)
>>  
>>  out:
>>      if (nbytes == aiocb->aio_nbytes) {
>> +#if defined(CONFIG_BLKZONED)
>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>> +            BlockZoneWps *wps = aiocb->io.wps;
> 
> Why adding this pointer to the aiocb struct ? You can get the array from
> aiocb->bs->bl.wps, no ?
> 
>> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
>> +            if (wps) {
>> +               if (BDRV_ZT_IS_SWR(wps->wp[index])) {
>> +                    qemu_mutex_lock(&wps->lock);
>> +                    wps->wp[index] += aiocb->aio_nbytes;
>> +                    qemu_mutex_unlock(&wps->lock);
>> +                }
>> +
>> +                if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
>> +                    *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS;
> 
> Bug: the append sector must be the first sector written, not the wp
> (sector following the last written sector). So this must be done *before*
> advancing the wp.
> 
> You need to have wps->lock held here too since you are reading
> wps->wp[index]. So the mutex lock/unlock needs to be around the 2 hunks
> under "if (wps) {". Also, given that there cannot be any zone append
> issued to conventional zones (they will fail), you can simplify:
> 
>             if (wps) {
>                 qemu_mutex_lock(&wps->lock);
>                 if (BDRV_ZT_IS_SWR(wps->wp[index])) {
>                     if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
>                         *aiocb->io.append_sector =
>                            wps->wp[index] >> BDRV_SECTOR_BITS;
>                     }
>                     wps->wp[index] += aiocb->aio_nbytes;
>                 }
>                 qemu_mutex_unlock(&wps->lock);
>             }
> 
> Now the last problem with this code is sequential write preferred zones.
> For these, the write may actually be overwriting sectors that have already
> been written, meaning that the wp may not necessarilly need to be
> advanced. You can handle that case together with SWR case simply like this:
> 
>             if (wps) {
>                 qemu_mutex_lock(&wps->lock);
>                 if (BDRV_ZT_IS_SWR(wps->wp[index])) {
>                     uint64_t wend_offset =
>                          aiocb->aio_offset + aiocb->aio_nbytes;
> 
>                     if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
>                         *aiocb->io.append_sector =
>                            wps->wp[index] >> BDRV_SECTOR_BITS;
>                     }
> 
>                     /* Advance the wp if needed */
>                     if (wend_offset > wps->wp[index])
>                         wps->wp[index] = wend_offset;
>                 }
>                 qemu_mutex_unlock(&wps->lock);
>             }
> 
>> +                }
>> +            }
>> +        }
>> +#endif
>>          return 0;
>>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
>>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
>> @@ -1737,6 +1823,19 @@ out:
>>          }
>>      } else {
>>          assert(nbytes < 0);
>> +#if defined(CONFIG_BLKZONED)
>> +        if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) {
> 
> Why do this only for zone append ? Regular write error also need a refresh
> of the zone wp.
> 
>> +            qemu_mutex_lock(&aiocb->bs->bl.wps->lock);
>> +            if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
>> +                           aiocb->bs->bl.nr_zones) < 0) {
>> +                error_report("report zone wp failed");
>> +                qemu_mutex_destroy(&aiocb->bs->bl.wps->lock);
>> +                g_free(aiocb->bs->bl.wps);
>> +                return -EINVAL;
>> +            }
>> +            qemu_mutex_unlock(&aiocb->bs->bl.wps->lock);
> 
> This really needs to be a helper function, e.g. update_zone_wp() or
> something like this. Aslo, why nuke the entire zone array if the refresh
> for this zone fails ? You could simply leave it as is. The next write may
> again fail and another attempt at refreshing it done.

Another note on this: you must update the wp ONLY for the zone that you
attempted to write here, and only that zone. If you update the wp for all
zones, you will introduce problems for the zones that are being written
(write in flight) since for these, wps->wp[i] is larger than the device
current wp (they become eual only after all writes are completed).

> 
>> +        }
>> +#endif
>>          return nbytes;
>>      }
>>  }
>> @@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) {
>>  static int handle_aiocb_zone_mgmt(void *opaque) {
>>  #if defined(CONFIG_BLKZONED)
>>      RawPosixAIOData *aiocb = opaque;
>> +    BlockDriverState *bs = aiocb->bs;
>>      int fd = aiocb->aio_fildes;
>>      int64_t sector = aiocb->aio_offset / 512;
>>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
>>      struct blk_zone_range range;
>>      int ret;
>>  
>> +    BlockZoneWps *wps = bs->bl.wps;
>> +    int index = aiocb->aio_offset / bs->bl.zone_size;
>> +
>>      /* Execute the operation */
>>      range.sector = sector;
>>      range.nr_sectors = nr_sectors;
>> @@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
>>                       errno);
>>          return -errno;
>>      }
>> +    
>> +    if (aiocb->zone_mgmt.all) {
>> +        for (int i = 0; i < bs->bl.nr_zones; ++i) {
>> +            qemu_mutex_lock(&wps->lock);
>> +            wps->wp[i] = i * bs->bl.zone_size;
>> +            qemu_mutex_unlock(&wps->lock);
>> +        }
>> +    } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
>> +        qemu_mutex_lock(&wps->lock);
>> +        wps->wp[index] = aiocb->aio_offset;
>> +        qemu_mutex_unlock(&wps->lock);
>> +    } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
>> +        qemu_mutex_lock(&wps->lock);
>> +        wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size;
> 
> This may be the last zone of the device, which may be smaller. So you need
> to check that here. Same for the first case for reset all: you need to
> handle the smaller last zone if there is one.
> 
>> +        qemu_mutex_unlock(&wps->lock);
>> +    }
> 
> Instead of the lock/unlock for each case here, take the mutex lock before
> the if () and unlock it after it. Less lines :)
> 
> Also, if the zone management command fails, you need to do a report zones
> and refresh the wps array.
> 
>>      return ret;
>>  #else
>>      return -ENOTSUP;
>> @@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
>>          },
>>      };
>>  
>> +    BlockZoneWps *wps = bs->bl.wps;
>> +    acb.io.wps = wps;
> 
> You do not need the pws variable. Simply do:
> 
>        acb.io.wps = bs->bl.wps;
> 
>>      assert(qiov->size == bytes);
>>      return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
>>  }
>> @@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs)
>>      BDRVRawState *s = bs->opaque;
>>  
>>      if (s->fd >= 0) {
>> +#if defined(CONFIG_BLKZONED)
>> +        if (bs->bl.wps) {
>> +            qemu_mutex_destroy(&bs->bl.wps->lock);
>> +            g_free(bs->bl.wps);
>> +        }
>> +#endif
>>          qemu_close(s->fd);
>>          s->fd = -1;
>>      }
>> @@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>          zone_op_name = "BLKRESETZONE";
>>          zone_op = BLKRESETZONE;
>>          break;
>> +    case BLK_ZO_RESET_ALL:
>> +        zone_op_name = "BLKRESETZONE";
>> +        zone_op = BLKRESETZONE;
>> +        is_all = true;
>> +        break;
> 
> This change seems unrelated to the wp tracking. Different patch ?
> 
>>      default:
>>          g_assert_not_reached();
>>      }
>> diff --git a/include/block/block-common.h b/include/block/block-common.h
>> index 8efb6b0c43..43bfc484eb 100644
>> --- a/include/block/block-common.h
>> +++ b/include/block/block-common.h
>> @@ -58,6 +58,7 @@ typedef enum BlockZoneOp {
>>      BLK_ZO_CLOSE,
>>      BLK_ZO_FINISH,
>>      BLK_ZO_RESET,
>> +    BLK_ZO_RESET_ALL,
> 
> same here. Adding reset all support should be a different patch.
>>  } BlockZoneOp;
>>  
>>  typedef enum BlockZoneModel {
>> @@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor {
>>      BlockZoneCondition cond;
>>  } BlockZoneDescriptor;
>>  
>> +/*
>> + * Track write pointers of a zone in bytes.
>> + */
>> +typedef struct BlockZoneWps {
>> +    QemuMutex lock;
>> +    uint64_t wp[];
>> +} BlockZoneWps;
>> +
>>  typedef struct BlockDriverInfo {
>>      /* in bytes, 0 if irrelevant */
>>      int cluster_size;
>> @@ -209,6 +218,13 @@ typedef enum {
>>  #define BDRV_SECTOR_BITS   9
>>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
>>  
>> +/*
>> + * Get the first most significant bit of WP. If it is zero, then
>> + * the zone type is SWR.
>> + */
>> +#define BDRV_ZT_IS_SWR(WP)    ((WP & 0x8000000000000000) == 0) ? (true) : \
>> +                              (false)
> 
> Simplify:
> 
> #define BDRV_ZT_IS_SWR(wp)	(!((wp) & (1ULL << 63))
> 
> But since this must be used for both SWR and SWP zones, I would reverse
> this into:
> 
> #define BDRV_ZONE_IS_CONV(wp)	((wp) & (1ULL << 63))
> 
> Which is a lot simpler.
> 
>> +
>>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
>>                                             INT_MAX >> BDRV_SECTOR_BITS)
>>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
>> index 37dddc603c..59c2d1316d 100644
>> --- a/include/block/block_int-common.h
>> +++ b/include/block/block_int-common.h
>> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
>>  
>>      /* device capacity expressed in bytes */
>>      int64_t capacity;
>> +
>> +    /* array of write pointers' location of each zone in the zoned device. */
>> +    BlockZoneWps *wps;
>> +
>> +    int64_t write_granularity;
>>  } BlockLimits;
>>  
>>  typedef struct BdrvOpBlocker BdrvOpBlocker;
>> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
>> index 3d26929cdd..f13cc1887b 100644
>> --- a/include/block/raw-aio.h
>> +++ b/include/block/raw-aio.h
>> @@ -31,6 +31,7 @@
>>  #define QEMU_AIO_TRUNCATE     0x0080
>>  #define QEMU_AIO_ZONE_REPORT  0x0100
>>  #define QEMU_AIO_ZONE_MGMT    0x0200
>> +#define QEMU_AIO_ZONE_APPEND  0x0400
>>  #define QEMU_AIO_TYPE_MASK \
>>          (QEMU_AIO_READ | \
>>           QEMU_AIO_WRITE | \
>> @@ -41,7 +42,8 @@
>>           QEMU_AIO_COPY_RANGE | \
>>           QEMU_AIO_TRUNCATE  | \
>>           QEMU_AIO_ZONE_REPORT | \
>> -         QEMU_AIO_ZONE_MGMT)
>> +         QEMU_AIO_ZONE_MGMT | \
>> +         QEMU_AIO_ZONE_APPEND)
> 
> This should be introduced in patch 2. This patch should be only about zone
> wp tracking with regular writes and zone management ops. The second patch
> can implement zone append emulation thanks to this patch. So separate.
> 
>>  
>>  /* AIO flags */
>>  #define QEMU_AIO_MISALIGNED   0x1000
> 

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/2] file-posix: add the tracking of the zones wp
  2022-10-05  1:44   ` Damien Le Moal
  2022-10-05  2:30     ` Damien Le Moal
@ 2022-10-05  7:33     ` Damien Le Moal
  2022-10-05  8:30       ` Sam Li
  1 sibling, 1 reply; 9+ messages in thread
From: Damien Le Moal @ 2022-10-05  7:33 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: Hanna Reitz, Fam Zheng, stefanha, dmitry.fomichev, hare,
	Kevin Wolf, qemu-block

On 10/5/22 10:44, Damien Le Moal wrote:
> On 9/29/22 18:31, Sam Li wrote:
>> Since Linux doesn't have a user API to issue zone append operations to
>> zoned devices from user space, the file-posix driver is modified to add
>> zone append emulation using regular writes. To do this, the file-posix
>> driver tracks the wp location of all zones of the device. It uses an
>> array of uint64_t. The most significant bit of each wp location indicates
>> if the zone type is sequential write required.
>>
>> The zones wp can be changed due to the following operations issued:
>> - zone reset: change the wp to the start offset of that zone
>> - zone finish: change to the end location of that zone
>> - write to a zone
>> - zone append
>>
>> Signed-off-by: Sam Li <faithilikerun@gmail.com>

Replying to myself to add some comments that I forgot.

>> ---
>>  block/file-posix.c               | 138 ++++++++++++++++++++++++++++++-
>>  include/block/block-common.h     |  16 ++++
>>  include/block/block_int-common.h |   5 ++
>>  include/block/raw-aio.h          |   4 +-
>>  4 files changed, 159 insertions(+), 4 deletions(-)
>>
>> diff --git a/block/file-posix.c b/block/file-posix.c
>> index 73656d87f2..33e81ac112 100755
>> --- a/block/file-posix.c
>> +++ b/block/file-posix.c
>> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData {
>>          struct {
>>              struct iovec *iov;
>>              int niov;
>> +            int64_t *append_sector;
>> +            BlockZoneWps *wps;
>>          } io;
>>          struct {
>>              uint64_t cmd;
>> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
>>  #endif
>>  }
>>  
>> +#if defined(CONFIG_BLKZONED)
>> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps,
>> +                          unsigned int nrz) {
> 
> Maybe rename this to get_zones_wp() ?
> 
>> +    struct blk_zone *blkz;
>> +    int64_t rep_size;
>> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
>> +    int ret, n = 0, i = 0;
>> +
>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>> +    g_autofree struct blk_zone_report *rep = NULL;
> 
> To be cleaner, move this declaration above with the others ?
> 
>> +    rep = g_malloc(rep_size);
>> +
>> +    blkz = (struct blk_zone *)(rep + 1);
>> +    while (n < nrz) {
>> +        memset(rep, 0, rep_size);
>> +        rep->sector = sector;
>> +        rep->nr_zones = nrz - n;
>> +
>> +        do {
>> +            ret = ioctl(fd, BLKREPORTZONE, rep);
>> +        } while (ret != 0 && errno == EINTR);
>> +        if (ret != 0) {
>> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
>> +                    fd, offset, errno);
>> +            return -errno;
>> +        }
>> +
>> +        if (!rep->nr_zones) {
>> +            break;
>> +        }
>> +
>> +        for (i = 0; i < rep->nr_zones; i++, n++) {
>> +            wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
>> +            sector = blkz[i].start + blkz[i].len;
>> +
>> +            /*
>> +             * In the wp tracking, it only cares if the zone type is sequential
>> +             * writes required so that the wp can advance to the right location.
> 
> Or sequential write preferred (host aware case)
> 
>> +             * Instead of the type of zone_type which is an 8-bit unsigned
>> +             * integer, use the first most significant bits of the wp location
>> +             * to indicate the zone type: 0 for SWR zones and 1 for the
>> +             * others.
>> +             */
>> +            if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) {
> 
> This should be:
> 
> 		if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) {
> 
> Note that the type field is not a bit-field. So you must compare values
> instead of doing bit operations.
> 
>> +                wps->wp[i] += (uint64_t)1 << 63;
> 
> You can simplify this:
> 
> 		   wps->wp[i] |= 1ULL << 63;
> 
> Overall, I would rewrite this like this:
> 
> for (i = 0; i < rep->nr_zones; i++, n++) {
>     /*
>      * The wp tracking cares only about sequential write required
>      * and sequential write preferred zones so that the wp can
>      * advance to the right location.
>      * Use the most significant bit of the wp location
>      * to indicate the zone type: 0 for SWR zones and 1 for
>      * conventional zones.
>      */
>     if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
>         wps->wp[i] = 1ULL << 63;
>     else
>         wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> }
> sector = blkz[i - 1].start + blkz[i - 1].len;
> 
> Which I think is a lot simpler.
> 
>> +            }
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +#endif
>> +
>>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>  {
>>      BDRVRawState *s = bs->opaque;
>> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
>>              return;
>>          }
>> +
>> +        ret = get_sysfs_long_val(&st, "physical_block_size");
>> +        if (ret >= 0) {
>> +            bs->bl.write_granularity = ret;
>> +        }
> 
> This change seems unrelated to the wp tracking. Should this be moved to a
> different patch ?
> 
>> +
>> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>> +        qemu_mutex_init(&bs->bl.wps->lock);
> 
> Move this initialization after the if block. Doing so, you do not need to
> call mutex destroy in case of error.
> 
>> +        if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) {
>> +            error_report("report wps failed");
>> +            qemu_mutex_destroy(&bs->bl.wps->lock);
>> +            g_free(bs->bl.wps);
>> +            return;
>> +        }
>>      }
>>  }
>>  
>> @@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>>      ssize_t len;
>>  
>>      do {
>> -        if (aiocb->aio_type & QEMU_AIO_WRITE)
>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>>              len = qemu_pwritev(aiocb->aio_fildes,
>>                                 aiocb->io.iov,
>>                                 aiocb->io.niov,
>> @@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
>>      ssize_t len;
>>  
>>      while (offset < aiocb->aio_nbytes) {
>> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>>              len = pwrite(aiocb->aio_fildes,
>>                           (const char *)buf + offset,
>>                           aiocb->aio_nbytes - offset,
>> @@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque)
>>      }
>>  
>>      nbytes = handle_aiocb_rw_linear(aiocb, buf);
>> -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
>> +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
>>          char *p = buf;
>>          size_t count = aiocb->aio_nbytes, copy;
>>          int i;
>> @@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque)
>>  
>>  out:
>>      if (nbytes == aiocb->aio_nbytes) {
>> +#if defined(CONFIG_BLKZONED)
>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>> +            BlockZoneWps *wps = aiocb->io.wps;
> 
> Why adding this pointer to the aiocb struct ? You can get the array from
> aiocb->bs->bl.wps, no ?
> 
>> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
>> +            if (wps) {
>> +               if (BDRV_ZT_IS_SWR(wps->wp[index])) {
>> +                    qemu_mutex_lock(&wps->lock);
>> +                    wps->wp[index] += aiocb->aio_nbytes;
>> +                    qemu_mutex_unlock(&wps->lock);
>> +                }
>> +
>> +                if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
>> +                    *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS;
> 
> Bug: the append sector must be the first sector written, not the wp
> (sector following the last written sector). So this must be done *before*
> advancing the wp.
> 
> You need to have wps->lock held here too since you are reading
> wps->wp[index]. So the mutex lock/unlock needs to be around the 2 hunks
> under "if (wps) {". Also, given that there cannot be any zone append
> issued to conventional zones (they will fail), you can simplify:
> 
>             if (wps) {
>                 qemu_mutex_lock(&wps->lock);
>                 if (BDRV_ZT_IS_SWR(wps->wp[index])) {
>                     if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
>                         *aiocb->io.append_sector =
>                            wps->wp[index] >> BDRV_SECTOR_BITS;
>                     }
>                     wps->wp[index] += aiocb->aio_nbytes;
>                 }
>                 qemu_mutex_unlock(&wps->lock);
>             }
> 
> Now the last problem with this code is sequential write preferred zones.
> For these, the write may actually be overwriting sectors that have already
> been written, meaning that the wp may not necessarilly need to be
> advanced. You can handle that case together with SWR case simply like this:
> 
>             if (wps) {
>                 qemu_mutex_lock(&wps->lock);
>                 if (BDRV_ZT_IS_SWR(wps->wp[index])) {
>                     uint64_t wend_offset =
>                          aiocb->aio_offset + aiocb->aio_nbytes;
> 
>                     if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
>                         *aiocb->io.append_sector =
>                            wps->wp[index] >> BDRV_SECTOR_BITS;
>                     }
> 
>                     /* Advance the wp if needed */
>                     if (wend_offset > wps->wp[index])
>                         wps->wp[index] = wend_offset;
>                 }
>                 qemu_mutex_unlock(&wps->lock);
>             }
> 

Note that you should not increment the wp if the zone is already full. But
for such case, since write and zone append commands will fail, you can
fail them immediately.

>> +                }
>> +            }
>> +        }
>> +#endif
>>          return 0;
>>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
>>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
>> @@ -1737,6 +1823,19 @@ out:
>>          }
>>      } else {
>>          assert(nbytes < 0);
>> +#if defined(CONFIG_BLKZONED)
>> +        if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) {
> 
> Why do this only for zone append ? Regular write error also need a refresh
> of the zone wp.
> 
>> +            qemu_mutex_lock(&aiocb->bs->bl.wps->lock);
>> +            if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
>> +                           aiocb->bs->bl.nr_zones) < 0) {
>> +                error_report("report zone wp failed");
>> +                qemu_mutex_destroy(&aiocb->bs->bl.wps->lock);
>> +                g_free(aiocb->bs->bl.wps);
>> +                return -EINVAL;
>> +            }
>> +            qemu_mutex_unlock(&aiocb->bs->bl.wps->lock);
> 
> This really needs to be a helper function, e.g. update_zone_wp() or
> something like this. Aslo, why nuke the entire zone array if the refresh
> for this zone fails ? You could simply leave it as is. The next write may
> again fail and another attempt at refreshing it done.
> 
>> +        }
>> +#endif
>>          return nbytes;
>>      }
>>  }
>> @@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) {
>>  static int handle_aiocb_zone_mgmt(void *opaque) {
>>  #if defined(CONFIG_BLKZONED)
>>      RawPosixAIOData *aiocb = opaque;
>> +    BlockDriverState *bs = aiocb->bs;
>>      int fd = aiocb->aio_fildes;
>>      int64_t sector = aiocb->aio_offset / 512;
>>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
>>      struct blk_zone_range range;
>>      int ret;
>>  
>> +    BlockZoneWps *wps = bs->bl.wps;
>> +    int index = aiocb->aio_offset / bs->bl.zone_size;
>> +
>>      /* Execute the operation */
>>      range.sector = sector;
>>      range.nr_sectors = nr_sectors;
>> @@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
>>                       errno);
>>          return -errno;
>>      }
>> +    
>> +    if (aiocb->zone_mgmt.all) {

This case should be integrated into the
if (aiocb->zone_mgmt.zone_op == BLKRESETZONE)

>> +        for (int i = 0; i < bs->bl.nr_zones; ++i) {
>> +            qemu_mutex_lock(&wps->lock);
>> +            wps->wp[i] = i * bs->bl.zone_size;

You need to test the zone type bit and only change SWR or SWP zones.

>> +            qemu_mutex_unlock(&wps->lock);
>> +        }
>> +    } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
>> +        qemu_mutex_lock(&wps->lock);
>> +        wps->wp[index] = aiocb->aio_offset;
>> +        qemu_mutex_unlock(&wps->lock);
>> +    } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
>> +        qemu_mutex_lock(&wps->lock);
>> +        wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size;
> 
> This may be the last zone of the device, which may be smaller. So you need
> to check that here. Same for the first case for reset all: you need to
> handle the smaller last zone if there is one.

You should fail any zone management commnand targetting a conventional
zone. You are not checking that.

Also, you can avoid issuing commands by testing if a zone is already full
(no need to issue finish command) or empty (no need to do a reset).

> 
>> +        qemu_mutex_unlock(&wps->lock);
>> +    }
> 
> Instead of the lock/unlock for each case here, take the mutex lock before
> the if () and unlock it after it. Less lines :)
> 
> Also, if the zone management command fails, you need to do a report zones
> and refresh the wps array.
> 
>>      return ret;
>>  #else
>>      return -ENOTSUP;
>> @@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
>>          },
>>      };
>>  
>> +    BlockZoneWps *wps = bs->bl.wps;
>> +    acb.io.wps = wps;
> 
> You do not need the pws variable. Simply do:
> 
>        acb.io.wps = bs->bl.wps;
> 
>>      assert(qiov->size == bytes);
>>      return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
>>  }
>> @@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs)
>>      BDRVRawState *s = bs->opaque;
>>  
>>      if (s->fd >= 0) {
>> +#if defined(CONFIG_BLKZONED)
>> +        if (bs->bl.wps) {
>> +            qemu_mutex_destroy(&bs->bl.wps->lock);
>> +            g_free(bs->bl.wps);
>> +        }
>> +#endif
>>          qemu_close(s->fd);
>>          s->fd = -1;
>>      }
>> @@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>          zone_op_name = "BLKRESETZONE";
>>          zone_op = BLKRESETZONE;
>>          break;
>> +    case BLK_ZO_RESET_ALL:
>> +        zone_op_name = "BLKRESETZONE";
>> +        zone_op = BLKRESETZONE;
>> +        is_all = true;
>> +        break;
> 
> This change seems unrelated to the wp tracking. Different patch ?
> 
>>      default:
>>          g_assert_not_reached();
>>      }
>> diff --git a/include/block/block-common.h b/include/block/block-common.h
>> index 8efb6b0c43..43bfc484eb 100644
>> --- a/include/block/block-common.h
>> +++ b/include/block/block-common.h
>> @@ -58,6 +58,7 @@ typedef enum BlockZoneOp {
>>      BLK_ZO_CLOSE,
>>      BLK_ZO_FINISH,
>>      BLK_ZO_RESET,
>> +    BLK_ZO_RESET_ALL,
> 
> same here. Adding reset all support should be a different patch.
>>  } BlockZoneOp;
>>  
>>  typedef enum BlockZoneModel {
>> @@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor {
>>      BlockZoneCondition cond;
>>  } BlockZoneDescriptor;
>>  
>> +/*
>> + * Track write pointers of a zone in bytes.
>> + */
>> +typedef struct BlockZoneWps {
>> +    QemuMutex lock;
>> +    uint64_t wp[];
>> +} BlockZoneWps;
>> +
>>  typedef struct BlockDriverInfo {
>>      /* in bytes, 0 if irrelevant */
>>      int cluster_size;
>> @@ -209,6 +218,13 @@ typedef enum {
>>  #define BDRV_SECTOR_BITS   9
>>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
>>  
>> +/*
>> + * Get the first most significant bit of WP. If it is zero, then
>> + * the zone type is SWR.
>> + */
>> +#define BDRV_ZT_IS_SWR(WP)    ((WP & 0x8000000000000000) == 0) ? (true) : \
>> +                              (false)
> 
> Simplify:
> 
> #define BDRV_ZT_IS_SWR(wp)	(!((wp) & (1ULL << 63))
> 
> But since this must be used for both SWR and SWP zones, I would reverse
> this into:
> 
> #define BDRV_ZONE_IS_CONV(wp)	((wp) & (1ULL << 63))
> 
> Which is a lot simpler.
> 
>> +
>>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
>>                                             INT_MAX >> BDRV_SECTOR_BITS)
>>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
>> index 37dddc603c..59c2d1316d 100644
>> --- a/include/block/block_int-common.h
>> +++ b/include/block/block_int-common.h
>> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
>>  
>>      /* device capacity expressed in bytes */
>>      int64_t capacity;
>> +
>> +    /* array of write pointers' location of each zone in the zoned device. */
>> +    BlockZoneWps *wps;
>> +
>> +    int64_t write_granularity;
>>  } BlockLimits;
>>  
>>  typedef struct BdrvOpBlocker BdrvOpBlocker;
>> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
>> index 3d26929cdd..f13cc1887b 100644
>> --- a/include/block/raw-aio.h
>> +++ b/include/block/raw-aio.h
>> @@ -31,6 +31,7 @@
>>  #define QEMU_AIO_TRUNCATE     0x0080
>>  #define QEMU_AIO_ZONE_REPORT  0x0100
>>  #define QEMU_AIO_ZONE_MGMT    0x0200
>> +#define QEMU_AIO_ZONE_APPEND  0x0400
>>  #define QEMU_AIO_TYPE_MASK \
>>          (QEMU_AIO_READ | \
>>           QEMU_AIO_WRITE | \
>> @@ -41,7 +42,8 @@
>>           QEMU_AIO_COPY_RANGE | \
>>           QEMU_AIO_TRUNCATE  | \
>>           QEMU_AIO_ZONE_REPORT | \
>> -         QEMU_AIO_ZONE_MGMT)
>> +         QEMU_AIO_ZONE_MGMT | \
>> +         QEMU_AIO_ZONE_APPEND)
> 
> This should be introduced in patch 2. This patch should be only about zone
> wp tracking with regular writes and zone management ops. The second patch
> can implement zone append emulation thanks to this patch. So separate.
> 
>>  
>>  /* AIO flags */
>>  #define QEMU_AIO_MISALIGNED   0x1000
> 

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/2] file-posix: add the tracking of the zones wp
  2022-10-05  7:33     ` Damien Le Moal
@ 2022-10-05  8:30       ` Sam Li
  2022-10-05 21:20         ` Damien Le Moal
  0 siblings, 1 reply; 9+ messages in thread
From: Sam Li @ 2022-10-05  8:30 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: qemu-devel, Hanna Reitz, Fam Zheng, stefanha, dmitry.fomichev,
	hare, Kevin Wolf, qemu-block

Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月5日周三 15:34写道：
>
> On 10/5/22 10:44, Damien Le Moal wrote:
> > On 9/29/22 18:31, Sam Li wrote:
> >> Since Linux doesn't have a user API to issue zone append operations to
> >> zoned devices from user space, the file-posix driver is modified to add
> >> zone append emulation using regular writes. To do this, the file-posix
> >> driver tracks the wp location of all zones of the device. It uses an
> >> array of uint64_t. The most significant bit of each wp location indicates
> >> if the zone type is sequential write required.
> >>
> >> The zones wp can be changed due to the following operations issued:
> >> - zone reset: change the wp to the start offset of that zone
> >> - zone finish: change to the end location of that zone
> >> - write to a zone
> >> - zone append
> >>
> >> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>
> Replying to myself to add some comments that I forgot.
>
> >> ---
> >>  block/file-posix.c               | 138 ++++++++++++++++++++++++++++++-
> >>  include/block/block-common.h     |  16 ++++
> >>  include/block/block_int-common.h |   5 ++
> >>  include/block/raw-aio.h          |   4 +-
> >>  4 files changed, 159 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/block/file-posix.c b/block/file-posix.c
> >> index 73656d87f2..33e81ac112 100755
> >> --- a/block/file-posix.c
> >> +++ b/block/file-posix.c
> >> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData {
> >>          struct {
> >>              struct iovec *iov;
> >>              int niov;
> >> +            int64_t *append_sector;
> >> +            BlockZoneWps *wps;
> >>          } io;
> >>          struct {
> >>              uint64_t cmd;
> >> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
> >>  #endif
> >>  }
> >>
> >> +#if defined(CONFIG_BLKZONED)
> >> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps,
> >> +                          unsigned int nrz) {
> >
> > Maybe rename this to get_zones_wp() ?
> >
> >> +    struct blk_zone *blkz;
> >> +    int64_t rep_size;
> >> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
> >> +    int ret, n = 0, i = 0;
> >> +
> >> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> >> +    g_autofree struct blk_zone_report *rep = NULL;
> >
> > To be cleaner, move this declaration above with the others ?
> >
> >> +    rep = g_malloc(rep_size);
> >> +
> >> +    blkz = (struct blk_zone *)(rep + 1);
> >> +    while (n < nrz) {
> >> +        memset(rep, 0, rep_size);
> >> +        rep->sector = sector;
> >> +        rep->nr_zones = nrz - n;
> >> +
> >> +        do {
> >> +            ret = ioctl(fd, BLKREPORTZONE, rep);
> >> +        } while (ret != 0 && errno == EINTR);
> >> +        if (ret != 0) {
> >> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> >> +                    fd, offset, errno);
> >> +            return -errno;
> >> +        }
> >> +
> >> +        if (!rep->nr_zones) {
> >> +            break;
> >> +        }
> >> +
> >> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> >> +            wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> >> +            sector = blkz[i].start + blkz[i].len;
> >> +
> >> +            /*
> >> +             * In the wp tracking, it only cares if the zone type is sequential
> >> +             * writes required so that the wp can advance to the right location.
> >
> > Or sequential write preferred (host aware case)
> >
> >> +             * Instead of the type of zone_type which is an 8-bit unsigned
> >> +             * integer, use the first most significant bits of the wp location
> >> +             * to indicate the zone type: 0 for SWR zones and 1 for the
> >> +             * others.
> >> +             */
> >> +            if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) {
> >
> > This should be:
> >
> >               if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) {
> >
> > Note that the type field is not a bit-field. So you must compare values
> > instead of doing bit operations.
> >
> >> +                wps->wp[i] += (uint64_t)1 << 63;
> >
> > You can simplify this:
> >
> >                  wps->wp[i] |= 1ULL << 63;
> >
> > Overall, I would rewrite this like this:
> >
> > for (i = 0; i < rep->nr_zones; i++, n++) {
> >     /*
> >      * The wp tracking cares only about sequential write required
> >      * and sequential write preferred zones so that the wp can
> >      * advance to the right location.
> >      * Use the most significant bit of the wp location
> >      * to indicate the zone type: 0 for SWR zones and 1 for
> >      * conventional zones.
> >      */
> >     if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> >         wps->wp[i] = 1ULL << 63;
> >     else
> >         wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> > }
> > sector = blkz[i - 1].start + blkz[i - 1].len;
> >
> > Which I think is a lot simpler.
> >
> >> +            }
> >> +        }
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +#endif
> >> +
> >>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >>  {
> >>      BDRVRawState *s = bs->opaque;
> >> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >>              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
> >>              return;
> >>          }
> >> +
> >> +        ret = get_sysfs_long_val(&st, "physical_block_size");
> >> +        if (ret >= 0) {
> >> +            bs->bl.write_granularity = ret;
> >> +        }
> >
> > This change seems unrelated to the wp tracking. Should this be moved to a
> > different patch ?
> >
> >> +
> >> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> >> +        qemu_mutex_init(&bs->bl.wps->lock);
> >
> > Move this initialization after the if block. Doing so, you do not need to
> > call mutex destroy in case of error.
> >
> >> +        if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) {
> >> +            error_report("report wps failed");
> >> +            qemu_mutex_destroy(&bs->bl.wps->lock);
> >> +            g_free(bs->bl.wps);
> >> +            return;
> >> +        }
> >>      }
> >>  }
> >>
> >> @@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
> >>      ssize_t len;
> >>
> >>      do {
> >> -        if (aiocb->aio_type & QEMU_AIO_WRITE)
> >> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
> >>              len = qemu_pwritev(aiocb->aio_fildes,
> >>                                 aiocb->io.iov,
> >>                                 aiocb->io.niov,
> >> @@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
> >>      ssize_t len;
> >>
> >>      while (offset < aiocb->aio_nbytes) {
> >> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >>              len = pwrite(aiocb->aio_fildes,
> >>                           (const char *)buf + offset,
> >>                           aiocb->aio_nbytes - offset,
> >> @@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque)
> >>      }
> >>
> >>      nbytes = handle_aiocb_rw_linear(aiocb, buf);
> >> -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
> >> +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
> >>          char *p = buf;
> >>          size_t count = aiocb->aio_nbytes, copy;
> >>          int i;
> >> @@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque)
> >>
> >>  out:
> >>      if (nbytes == aiocb->aio_nbytes) {
> >> +#if defined(CONFIG_BLKZONED)
> >> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >> +            BlockZoneWps *wps = aiocb->io.wps;
> >
> > Why adding this pointer to the aiocb struct ? You can get the array from
> > aiocb->bs->bl.wps, no ?
> >
> >> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> >> +            if (wps) {
> >> +               if (BDRV_ZT_IS_SWR(wps->wp[index])) {
> >> +                    qemu_mutex_lock(&wps->lock);
> >> +                    wps->wp[index] += aiocb->aio_nbytes;
> >> +                    qemu_mutex_unlock(&wps->lock);
> >> +                }
> >> +
> >> +                if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> >> +                    *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS;
> >
> > Bug: the append sector must be the first sector written, not the wp
> > (sector following the last written sector). So this must be done *before*
> > advancing the wp.
> >
> > You need to have wps->lock held here too since you are reading
> > wps->wp[index]. So the mutex lock/unlock needs to be around the 2 hunks
> > under "if (wps) {". Also, given that there cannot be any zone append
> > issued to conventional zones (they will fail), you can simplify:
> >
> >             if (wps) {
> >                 qemu_mutex_lock(&wps->lock);
> >                 if (BDRV_ZT_IS_SWR(wps->wp[index])) {
> >                     if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> >                         *aiocb->io.append_sector =
> >                            wps->wp[index] >> BDRV_SECTOR_BITS;
> >                     }
> >                     wps->wp[index] += aiocb->aio_nbytes;
> >                 }
> >                 qemu_mutex_unlock(&wps->lock);
> >             }
> >
> > Now the last problem with this code is sequential write preferred zones.
> > For these, the write may actually be overwriting sectors that have already
> > been written, meaning that the wp may not necessarilly need to be
> > advanced. You can handle that case together with SWR case simply like this:
> >
> >             if (wps) {
> >                 qemu_mutex_lock(&wps->lock);
> >                 if (BDRV_ZT_IS_SWR(wps->wp[index])) {
> >                     uint64_t wend_offset =
> >                          aiocb->aio_offset + aiocb->aio_nbytes;
> >
> >                     if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> >                         *aiocb->io.append_sector =
> >                            wps->wp[index] >> BDRV_SECTOR_BITS;
> >                     }
> >
> >                     /* Advance the wp if needed */
> >                     if (wend_offset > wps->wp[index])
> >                         wps->wp[index] = wend_offset;
> >                 }
> >                 qemu_mutex_unlock(&wps->lock);
> >             }
> >
>
> Note that you should not increment the wp if the zone is already full. But
> for such case, since write and zone append commands will fail, you can
> fail them immediately.
>
> >> +                }
> >> +            }
> >> +        }
> >> +#endif
> >>          return 0;
> >>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
> >>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >> @@ -1737,6 +1823,19 @@ out:
> >>          }
> >>      } else {
> >>          assert(nbytes < 0);
> >> +#if defined(CONFIG_BLKZONED)
> >> +        if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) {
> >
> > Why do this only for zone append ? Regular write error also need a refresh
> > of the zone wp.
> >
> >> +            qemu_mutex_lock(&aiocb->bs->bl.wps->lock);
> >> +            if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> >> +                           aiocb->bs->bl.nr_zones) < 0) {
> >> +                error_report("report zone wp failed");
> >> +                qemu_mutex_destroy(&aiocb->bs->bl.wps->lock);
> >> +                g_free(aiocb->bs->bl.wps);
> >> +                return -EINVAL;
> >> +            }
> >> +            qemu_mutex_unlock(&aiocb->bs->bl.wps->lock);
> >
> > This really needs to be a helper function, e.g. update_zone_wp() or
> > something like this. Aslo, why nuke the entire zone array if the refresh
> > for this zone fails ? You could simply leave it as is. The next write may
> > again fail and another attempt at refreshing it done.
> >
> >> +        }
> >> +#endif
> >>          return nbytes;
> >>      }
> >>  }
> >> @@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) {
> >>  static int handle_aiocb_zone_mgmt(void *opaque) {
> >>  #if defined(CONFIG_BLKZONED)
> >>      RawPosixAIOData *aiocb = opaque;
> >> +    BlockDriverState *bs = aiocb->bs;
> >>      int fd = aiocb->aio_fildes;
> >>      int64_t sector = aiocb->aio_offset / 512;
> >>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
> >>      struct blk_zone_range range;
> >>      int ret;
> >>
> >> +    BlockZoneWps *wps = bs->bl.wps;
> >> +    int index = aiocb->aio_offset / bs->bl.zone_size;
> >> +
> >>      /* Execute the operation */
> >>      range.sector = sector;
> >>      range.nr_sectors = nr_sectors;
> >> @@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
> >>                       errno);
> >>          return -errno;
> >>      }
> >> +
> >> +    if (aiocb->zone_mgmt.all) {
>
> This case should be integrated into the
> if (aiocb->zone_mgmt.zone_op == BLKRESETZONE)

The purpose of this all flag is to pass down the RESET_ALL request to
the file-posix so that it can reset t write pointers of all zones.

>
> >> +        for (int i = 0; i < bs->bl.nr_zones; ++i) {
> >> +            qemu_mutex_lock(&wps->lock);
> >> +            wps->wp[i] = i * bs->bl.zone_size;
>
> You need to test the zone type bit and only change SWR or SWP zones.
>
> >> +            qemu_mutex_unlock(&wps->lock);
> >> +        }
> >> +    } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
> >> +        qemu_mutex_lock(&wps->lock);
> >> +        wps->wp[index] = aiocb->aio_offset;
> >> +        qemu_mutex_unlock(&wps->lock);
> >> +    } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
> >> +        qemu_mutex_lock(&wps->lock);
> >> +        wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size;
> >
> > This may be the last zone of the device, which may be smaller. So you need
> > to check that here. Same for the first case for reset all: you need to
> > handle the smaller last zone if there is one.
>
> You should fail any zone management commnand targetting a conventional
> zone. You are not checking that.
>
> Also, you can avoid issuing commands by testing if a zone is already full
> (no need to issue finish command) or empty (no need to do a reset).

Maybe the type check can be done before issuing the request and the
device can return VIRTIO_BLK_S_ZONE_INVALID_CMD to finish it.

The simplest way to do the state check I think is to compare the wp to
the start/end sector which can tell us if the zone is empty or full.
Therefore it should be done in the raw_co_zone_mgmt().

Thanks for reviewing! I can send the revision very soon.

>
> >
> >> +        qemu_mutex_unlock(&wps->lock);
> >> +    }
> >
> > Instead of the lock/unlock for each case here, take the mutex lock before
> > the if () and unlock it after it. Less lines :)
> >
> > Also, if the zone management command fails, you need to do a report zones
> > and refresh the wps array.
> >
> >>      return ret;
> >>  #else
> >>      return -ENOTSUP;
> >> @@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
> >>          },
> >>      };
> >>
> >> +    BlockZoneWps *wps = bs->bl.wps;
> >> +    acb.io.wps = wps;
> >
> > You do not need the pws variable. Simply do:
> >
> >        acb.io.wps = bs->bl.wps;
> >
> >>      assert(qiov->size == bytes);
> >>      return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> >>  }
> >> @@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs)
> >>      BDRVRawState *s = bs->opaque;
> >>
> >>      if (s->fd >= 0) {
> >> +#if defined(CONFIG_BLKZONED)
> >> +        if (bs->bl.wps) {
> >> +            qemu_mutex_destroy(&bs->bl.wps->lock);
> >> +            g_free(bs->bl.wps);
> >> +        }
> >> +#endif
> >>          qemu_close(s->fd);
> >>          s->fd = -1;
> >>      }
> >> @@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >>          zone_op_name = "BLKRESETZONE";
> >>          zone_op = BLKRESETZONE;
> >>          break;
> >> +    case BLK_ZO_RESET_ALL:
> >> +        zone_op_name = "BLKRESETZONE";
> >> +        zone_op = BLKRESETZONE;
> >> +        is_all = true;
> >> +        break;
> >
> > This change seems unrelated to the wp tracking. Different patch ?
> >
> >>      default:
> >>          g_assert_not_reached();
> >>      }
> >> diff --git a/include/block/block-common.h b/include/block/block-common.h
> >> index 8efb6b0c43..43bfc484eb 100644
> >> --- a/include/block/block-common.h
> >> +++ b/include/block/block-common.h
> >> @@ -58,6 +58,7 @@ typedef enum BlockZoneOp {
> >>      BLK_ZO_CLOSE,
> >>      BLK_ZO_FINISH,
> >>      BLK_ZO_RESET,
> >> +    BLK_ZO_RESET_ALL,
> >
> > same here. Adding reset all support should be a different patch.
> >>  } BlockZoneOp;
> >>
> >>  typedef enum BlockZoneModel {
> >> @@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor {
> >>      BlockZoneCondition cond;
> >>  } BlockZoneDescriptor;
> >>
> >> +/*
> >> + * Track write pointers of a zone in bytes.
> >> + */
> >> +typedef struct BlockZoneWps {
> >> +    QemuMutex lock;
> >> +    uint64_t wp[];
> >> +} BlockZoneWps;
> >> +
> >>  typedef struct BlockDriverInfo {
> >>      /* in bytes, 0 if irrelevant */
> >>      int cluster_size;
> >> @@ -209,6 +218,13 @@ typedef enum {
> >>  #define BDRV_SECTOR_BITS   9
> >>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
> >>
> >> +/*
> >> + * Get the first most significant bit of WP. If it is zero, then
> >> + * the zone type is SWR.
> >> + */
> >> +#define BDRV_ZT_IS_SWR(WP)    ((WP & 0x8000000000000000) == 0) ? (true) : \
> >> +                              (false)
> >
> > Simplify:
> >
> > #define BDRV_ZT_IS_SWR(wp)    (!((wp) & (1ULL << 63))
> >
> > But since this must be used for both SWR and SWP zones, I would reverse
> > this into:
> >
> > #define BDRV_ZONE_IS_CONV(wp) ((wp) & (1ULL << 63))
> >
> > Which is a lot simpler.
> >
> >> +
> >>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
> >>                                             INT_MAX >> BDRV_SECTOR_BITS)
> >>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
> >> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> >> index 37dddc603c..59c2d1316d 100644
> >> --- a/include/block/block_int-common.h
> >> +++ b/include/block/block_int-common.h
> >> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
> >>
> >>      /* device capacity expressed in bytes */
> >>      int64_t capacity;
> >> +
> >> +    /* array of write pointers' location of each zone in the zoned device. */
> >> +    BlockZoneWps *wps;
> >> +
> >> +    int64_t write_granularity;
> >>  } BlockLimits;
> >>
> >>  typedef struct BdrvOpBlocker BdrvOpBlocker;
> >> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> >> index 3d26929cdd..f13cc1887b 100644
> >> --- a/include/block/raw-aio.h
> >> +++ b/include/block/raw-aio.h
> >> @@ -31,6 +31,7 @@
> >>  #define QEMU_AIO_TRUNCATE     0x0080
> >>  #define QEMU_AIO_ZONE_REPORT  0x0100
> >>  #define QEMU_AIO_ZONE_MGMT    0x0200
> >> +#define QEMU_AIO_ZONE_APPEND  0x0400
> >>  #define QEMU_AIO_TYPE_MASK \
> >>          (QEMU_AIO_READ | \
> >>           QEMU_AIO_WRITE | \
> >> @@ -41,7 +42,8 @@
> >>           QEMU_AIO_COPY_RANGE | \
> >>           QEMU_AIO_TRUNCATE  | \
> >>           QEMU_AIO_ZONE_REPORT | \
> >> -         QEMU_AIO_ZONE_MGMT)
> >> +         QEMU_AIO_ZONE_MGMT | \
> >> +         QEMU_AIO_ZONE_APPEND)
> >
> > This should be introduced in patch 2. This patch should be only about zone
> > wp tracking with regular writes and zone management ops. The second patch
> > can implement zone append emulation thanks to this patch. So separate.
> >
> >>
> >>  /* AIO flags */
> >>  #define QEMU_AIO_MISALIGNED   0x1000
> >
>
> --
> Damien Le Moal
> Western Digital Research
>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/2] file-posix: add the tracking of the zones wp
  2022-10-05  8:30       ` Sam Li
@ 2022-10-05 21:20         ` Damien Le Moal
  0 siblings, 0 replies; 9+ messages in thread
From: Damien Le Moal @ 2022-10-05 21:20 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, Hanna Reitz, Fam Zheng, stefanha, dmitry.fomichev,
	hare, Kevin Wolf, qemu-block

On 10/5/22 17:30, Sam Li wrote:
> Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月5日周三 15:34写道：
>>
>> On 10/5/22 10:44, Damien Le Moal wrote:
>>> On 9/29/22 18:31, Sam Li wrote:
>>>> Since Linux doesn't have a user API to issue zone append operations to
>>>> zoned devices from user space, the file-posix driver is modified to add
>>>> zone append emulation using regular writes. To do this, the file-posix
>>>> driver tracks the wp location of all zones of the device. It uses an
>>>> array of uint64_t. The most significant bit of each wp location indicates
>>>> if the zone type is sequential write required.
>>>>
>>>> The zones wp can be changed due to the following operations issued:
>>>> - zone reset: change the wp to the start offset of that zone
>>>> - zone finish: change to the end location of that zone
>>>> - write to a zone
>>>> - zone append
>>>>
>>>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>>
>> Replying to myself to add some comments that I forgot.
>>
>>>> ---
>>>>  block/file-posix.c               | 138 ++++++++++++++++++++++++++++++-
>>>>  include/block/block-common.h     |  16 ++++
>>>>  include/block/block_int-common.h |   5 ++
>>>>  include/block/raw-aio.h          |   4 +-
>>>>  4 files changed, 159 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>>> index 73656d87f2..33e81ac112 100755
>>>> --- a/block/file-posix.c
>>>> +++ b/block/file-posix.c
>>>> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData {
>>>>          struct {
>>>>              struct iovec *iov;
>>>>              int niov;
>>>> +            int64_t *append_sector;
>>>> +            BlockZoneWps *wps;
>>>>          } io;
>>>>          struct {
>>>>              uint64_t cmd;
>>>> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
>>>>  #endif
>>>>  }
>>>>
>>>> +#if defined(CONFIG_BLKZONED)
>>>> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps,
>>>> +                          unsigned int nrz) {
>>>
>>> Maybe rename this to get_zones_wp() ?
>>>
>>>> +    struct blk_zone *blkz;
>>>> +    int64_t rep_size;
>>>> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
>>>> +    int ret, n = 0, i = 0;
>>>> +
>>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>>>> +    g_autofree struct blk_zone_report *rep = NULL;
>>>
>>> To be cleaner, move this declaration above with the others ?
>>>
>>>> +    rep = g_malloc(rep_size);
>>>> +
>>>> +    blkz = (struct blk_zone *)(rep + 1);
>>>> +    while (n < nrz) {
>>>> +        memset(rep, 0, rep_size);
>>>> +        rep->sector = sector;
>>>> +        rep->nr_zones = nrz - n;
>>>> +
>>>> +        do {
>>>> +            ret = ioctl(fd, BLKREPORTZONE, rep);
>>>> +        } while (ret != 0 && errno == EINTR);
>>>> +        if (ret != 0) {
>>>> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
>>>> +                    fd, offset, errno);
>>>> +            return -errno;
>>>> +        }
>>>> +
>>>> +        if (!rep->nr_zones) {
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        for (i = 0; i < rep->nr_zones; i++, n++) {
>>>> +            wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
>>>> +            sector = blkz[i].start + blkz[i].len;
>>>> +
>>>> +            /*
>>>> +             * In the wp tracking, it only cares if the zone type is sequential
>>>> +             * writes required so that the wp can advance to the right location.
>>>
>>> Or sequential write preferred (host aware case)
>>>
>>>> +             * Instead of the type of zone_type which is an 8-bit unsigned
>>>> +             * integer, use the first most significant bits of the wp location
>>>> +             * to indicate the zone type: 0 for SWR zones and 1 for the
>>>> +             * others.
>>>> +             */
>>>> +            if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) {
>>>
>>> This should be:
>>>
>>>               if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) {
>>>
>>> Note that the type field is not a bit-field. So you must compare values
>>> instead of doing bit operations.
>>>
>>>> +                wps->wp[i] += (uint64_t)1 << 63;
>>>
>>> You can simplify this:
>>>
>>>                  wps->wp[i] |= 1ULL << 63;
>>>
>>> Overall, I would rewrite this like this:
>>>
>>> for (i = 0; i < rep->nr_zones; i++, n++) {
>>>     /*
>>>      * The wp tracking cares only about sequential write required
>>>      * and sequential write preferred zones so that the wp can
>>>      * advance to the right location.
>>>      * Use the most significant bit of the wp location
>>>      * to indicate the zone type: 0 for SWR zones and 1 for
>>>      * conventional zones.
>>>      */
>>>     if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
>>>         wps->wp[i] = 1ULL << 63;
>>>     else
>>>         wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
>>> }
>>> sector = blkz[i - 1].start + blkz[i - 1].len;
>>>
>>> Which I think is a lot simpler.
>>>
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +#endif
>>>> +
>>>>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>>>  {
>>>>      BDRVRawState *s = bs->opaque;
>>>> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>>>              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
>>>>              return;
>>>>          }
>>>> +
>>>> +        ret = get_sysfs_long_val(&st, "physical_block_size");
>>>> +        if (ret >= 0) {
>>>> +            bs->bl.write_granularity = ret;
>>>> +        }
>>>
>>> This change seems unrelated to the wp tracking. Should this be moved to a
>>> different patch ?
>>>
>>>> +
>>>> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>>>> +        qemu_mutex_init(&bs->bl.wps->lock);
>>>
>>> Move this initialization after the if block. Doing so, you do not need to
>>> call mutex destroy in case of error.
>>>
>>>> +        if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) {
>>>> +            error_report("report wps failed");
>>>> +            qemu_mutex_destroy(&bs->bl.wps->lock);
>>>> +            g_free(bs->bl.wps);
>>>> +            return;
>>>> +        }
>>>>      }
>>>>  }
>>>>
>>>> @@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>>>>      ssize_t len;
>>>>
>>>>      do {
>>>> -        if (aiocb->aio_type & QEMU_AIO_WRITE)
>>>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>>>>              len = qemu_pwritev(aiocb->aio_fildes,
>>>>                                 aiocb->io.iov,
>>>>                                 aiocb->io.niov,
>>>> @@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
>>>>      ssize_t len;
>>>>
>>>>      while (offset < aiocb->aio_nbytes) {
>>>> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>>>>              len = pwrite(aiocb->aio_fildes,
>>>>                           (const char *)buf + offset,
>>>>                           aiocb->aio_nbytes - offset,
>>>> @@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque)
>>>>      }
>>>>
>>>>      nbytes = handle_aiocb_rw_linear(aiocb, buf);
>>>> -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
>>>> +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
>>>>          char *p = buf;
>>>>          size_t count = aiocb->aio_nbytes, copy;
>>>>          int i;
>>>> @@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque)
>>>>
>>>>  out:
>>>>      if (nbytes == aiocb->aio_nbytes) {
>>>> +#if defined(CONFIG_BLKZONED)
>>>> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>>>> +            BlockZoneWps *wps = aiocb->io.wps;
>>>
>>> Why adding this pointer to the aiocb struct ? You can get the array from
>>> aiocb->bs->bl.wps, no ?
>>>
>>>> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
>>>> +            if (wps) {
>>>> +               if (BDRV_ZT_IS_SWR(wps->wp[index])) {
>>>> +                    qemu_mutex_lock(&wps->lock);
>>>> +                    wps->wp[index] += aiocb->aio_nbytes;
>>>> +                    qemu_mutex_unlock(&wps->lock);
>>>> +                }
>>>> +
>>>> +                if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
>>>> +                    *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS;
>>>
>>> Bug: the append sector must be the first sector written, not the wp
>>> (sector following the last written sector). So this must be done *before*
>>> advancing the wp.
>>>
>>> You need to have wps->lock held here too since you are reading
>>> wps->wp[index]. So the mutex lock/unlock needs to be around the 2 hunks
>>> under "if (wps) {". Also, given that there cannot be any zone append
>>> issued to conventional zones (they will fail), you can simplify:
>>>
>>>             if (wps) {
>>>                 qemu_mutex_lock(&wps->lock);
>>>                 if (BDRV_ZT_IS_SWR(wps->wp[index])) {
>>>                     if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
>>>                         *aiocb->io.append_sector =
>>>                            wps->wp[index] >> BDRV_SECTOR_BITS;
>>>                     }
>>>                     wps->wp[index] += aiocb->aio_nbytes;
>>>                 }
>>>                 qemu_mutex_unlock(&wps->lock);
>>>             }
>>>
>>> Now the last problem with this code is sequential write preferred zones.
>>> For these, the write may actually be overwriting sectors that have already
>>> been written, meaning that the wp may not necessarilly need to be
>>> advanced. You can handle that case together with SWR case simply like this:
>>>
>>>             if (wps) {
>>>                 qemu_mutex_lock(&wps->lock);
>>>                 if (BDRV_ZT_IS_SWR(wps->wp[index])) {
>>>                     uint64_t wend_offset =
>>>                          aiocb->aio_offset + aiocb->aio_nbytes;
>>>
>>>                     if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
>>>                         *aiocb->io.append_sector =
>>>                            wps->wp[index] >> BDRV_SECTOR_BITS;
>>>                     }
>>>
>>>                     /* Advance the wp if needed */
>>>                     if (wend_offset > wps->wp[index])
>>>                         wps->wp[index] = wend_offset;
>>>                 }
>>>                 qemu_mutex_unlock(&wps->lock);
>>>             }
>>>
>>
>> Note that you should not increment the wp if the zone is already full. But
>> for such case, since write and zone append commands will fail, you can
>> fail them immediately.
>>
>>>> +                }
>>>> +            }
>>>> +        }
>>>> +#endif
>>>>          return 0;
>>>>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
>>>>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>>> @@ -1737,6 +1823,19 @@ out:
>>>>          }
>>>>      } else {
>>>>          assert(nbytes < 0);
>>>> +#if defined(CONFIG_BLKZONED)
>>>> +        if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) {
>>>
>>> Why do this only for zone append ? Regular write error also need a refresh
>>> of the zone wp.
>>>
>>>> +            qemu_mutex_lock(&aiocb->bs->bl.wps->lock);
>>>> +            if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
>>>> +                           aiocb->bs->bl.nr_zones) < 0) {
>>>> +                error_report("report zone wp failed");
>>>> +                qemu_mutex_destroy(&aiocb->bs->bl.wps->lock);
>>>> +                g_free(aiocb->bs->bl.wps);
>>>> +                return -EINVAL;
>>>> +            }
>>>> +            qemu_mutex_unlock(&aiocb->bs->bl.wps->lock);
>>>
>>> This really needs to be a helper function, e.g. update_zone_wp() or
>>> something like this. Aslo, why nuke the entire zone array if the refresh
>>> for this zone fails ? You could simply leave it as is. The next write may
>>> again fail and another attempt at refreshing it done.
>>>
>>>> +        }
>>>> +#endif
>>>>          return nbytes;
>>>>      }
>>>>  }
>>>> @@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) {
>>>>  static int handle_aiocb_zone_mgmt(void *opaque) {
>>>>  #if defined(CONFIG_BLKZONED)
>>>>      RawPosixAIOData *aiocb = opaque;
>>>> +    BlockDriverState *bs = aiocb->bs;
>>>>      int fd = aiocb->aio_fildes;
>>>>      int64_t sector = aiocb->aio_offset / 512;
>>>>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
>>>>      struct blk_zone_range range;
>>>>      int ret;
>>>>
>>>> +    BlockZoneWps *wps = bs->bl.wps;
>>>> +    int index = aiocb->aio_offset / bs->bl.zone_size;
>>>> +
>>>>      /* Execute the operation */
>>>>      range.sector = sector;
>>>>      range.nr_sectors = nr_sectors;
>>>> @@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
>>>>                       errno);
>>>>          return -errno;
>>>>      }
>>>> +
>>>> +    if (aiocb->zone_mgmt.all) {
>>
>> This case should be integrated into the
>> if (aiocb->zone_mgmt.zone_op == BLKRESETZONE)
> 
> The purpose of this all flag is to pass down the RESET_ALL request to
> the file-posix so that it can reset t write pointers of all zones.

I understand. My point was that you check this flag without checking that
the op is BLKRESETZONE. You need to check that to ensure that the caller
is issuing a valid request. And since you need to check that the operation
is reset and you already have that check below, merging the on-zone-reset
and all-zones-reset cases under the same "if (aiocb->zone_mgmt.zone_op ==
BLKRESETZONE)" condition will make things clear.

> 
>>
>>>> +        for (int i = 0; i < bs->bl.nr_zones; ++i) {
>>>> +            qemu_mutex_lock(&wps->lock);
>>>> +            wps->wp[i] = i * bs->bl.zone_size;
>>
>> You need to test the zone type bit and only change SWR or SWP zones.
>>
>>>> +            qemu_mutex_unlock(&wps->lock);
>>>> +        }
>>>> +    } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
>>>> +        qemu_mutex_lock(&wps->lock);
>>>> +        wps->wp[index] = aiocb->aio_offset;
>>>> +        qemu_mutex_unlock(&wps->lock);
>>>> +    } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
>>>> +        qemu_mutex_lock(&wps->lock);
>>>> +        wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size;
>>>
>>> This may be the last zone of the device, which may be smaller. So you need
>>> to check that here. Same for the first case for reset all: you need to
>>> handle the smaller last zone if there is one.
>>
>> You should fail any zone management commnand targetting a conventional
>> zone. You are not checking that.
>>
>> Also, you can avoid issuing commands by testing if a zone is already full
>> (no need to issue finish command) or empty (no need to do a reset).
> 
> Maybe the type check can be done before issuing the request and the
> device can return VIRTIO_BLK_S_ZONE_INVALID_CMD to finish it.
> 
> The simplest way to do the state check I think is to compare the wp to
> the start/end sector which can tell us if the zone is empty or full.
> Therefore it should be done in the raw_co_zone_mgmt().

Nope. You cannot do that there since you can only read the wp array
holding the lock. And you must not release the lock until you issue the
command to the drive. So you should check the zone condition here.

> 
> Thanks for reviewing! I can send the revision very soon.
> 
>>
>>>
>>>> +        qemu_mutex_unlock(&wps->lock);
>>>> +    }
>>>
>>> Instead of the lock/unlock for each case here, take the mutex lock before
>>> the if () and unlock it after it. Less lines :)
>>>
>>> Also, if the zone management command fails, you need to do a report zones
>>> and refresh the wps array.
>>>
>>>>      return ret;
>>>>  #else
>>>>      return -ENOTSUP;
>>>> @@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
>>>>          },
>>>>      };
>>>>
>>>> +    BlockZoneWps *wps = bs->bl.wps;
>>>> +    acb.io.wps = wps;
>>>
>>> You do not need the pws variable. Simply do:
>>>
>>>        acb.io.wps = bs->bl.wps;
>>>
>>>>      assert(qiov->size == bytes);
>>>>      return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
>>>>  }
>>>> @@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs)
>>>>      BDRVRawState *s = bs->opaque;
>>>>
>>>>      if (s->fd >= 0) {
>>>> +#if defined(CONFIG_BLKZONED)
>>>> +        if (bs->bl.wps) {
>>>> +            qemu_mutex_destroy(&bs->bl.wps->lock);
>>>> +            g_free(bs->bl.wps);
>>>> +        }
>>>> +#endif
>>>>          qemu_close(s->fd);
>>>>          s->fd = -1;
>>>>      }
>>>> @@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>>>          zone_op_name = "BLKRESETZONE";
>>>>          zone_op = BLKRESETZONE;
>>>>          break;
>>>> +    case BLK_ZO_RESET_ALL:
>>>> +        zone_op_name = "BLKRESETZONE";
>>>> +        zone_op = BLKRESETZONE;
>>>> +        is_all = true;
>>>> +        break;
>>>
>>> This change seems unrelated to the wp tracking. Different patch ?
>>>
>>>>      default:
>>>>          g_assert_not_reached();
>>>>      }
>>>> diff --git a/include/block/block-common.h b/include/block/block-common.h
>>>> index 8efb6b0c43..43bfc484eb 100644
>>>> --- a/include/block/block-common.h
>>>> +++ b/include/block/block-common.h
>>>> @@ -58,6 +58,7 @@ typedef enum BlockZoneOp {
>>>>      BLK_ZO_CLOSE,
>>>>      BLK_ZO_FINISH,
>>>>      BLK_ZO_RESET,
>>>> +    BLK_ZO_RESET_ALL,
>>>
>>> same here. Adding reset all support should be a different patch.
>>>>  } BlockZoneOp;
>>>>
>>>>  typedef enum BlockZoneModel {
>>>> @@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor {
>>>>      BlockZoneCondition cond;
>>>>  } BlockZoneDescriptor;
>>>>
>>>> +/*
>>>> + * Track write pointers of a zone in bytes.
>>>> + */
>>>> +typedef struct BlockZoneWps {
>>>> +    QemuMutex lock;
>>>> +    uint64_t wp[];
>>>> +} BlockZoneWps;
>>>> +
>>>>  typedef struct BlockDriverInfo {
>>>>      /* in bytes, 0 if irrelevant */
>>>>      int cluster_size;
>>>> @@ -209,6 +218,13 @@ typedef enum {
>>>>  #define BDRV_SECTOR_BITS   9
>>>>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
>>>>
>>>> +/*
>>>> + * Get the first most significant bit of WP. If it is zero, then
>>>> + * the zone type is SWR.
>>>> + */
>>>> +#define BDRV_ZT_IS_SWR(WP)    ((WP & 0x8000000000000000) == 0) ? (true) : \
>>>> +                              (false)
>>>
>>> Simplify:
>>>
>>> #define BDRV_ZT_IS_SWR(wp)    (!((wp) & (1ULL << 63))
>>>
>>> But since this must be used for both SWR and SWP zones, I would reverse
>>> this into:
>>>
>>> #define BDRV_ZONE_IS_CONV(wp) ((wp) & (1ULL << 63))
>>>
>>> Which is a lot simpler.
>>>
>>>> +
>>>>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
>>>>                                             INT_MAX >> BDRV_SECTOR_BITS)
>>>>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
>>>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
>>>> index 37dddc603c..59c2d1316d 100644
>>>> --- a/include/block/block_int-common.h
>>>> +++ b/include/block/block_int-common.h
>>>> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
>>>>
>>>>      /* device capacity expressed in bytes */
>>>>      int64_t capacity;
>>>> +
>>>> +    /* array of write pointers' location of each zone in the zoned device. */
>>>> +    BlockZoneWps *wps;
>>>> +
>>>> +    int64_t write_granularity;
>>>>  } BlockLimits;
>>>>
>>>>  typedef struct BdrvOpBlocker BdrvOpBlocker;
>>>> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
>>>> index 3d26929cdd..f13cc1887b 100644
>>>> --- a/include/block/raw-aio.h
>>>> +++ b/include/block/raw-aio.h
>>>> @@ -31,6 +31,7 @@
>>>>  #define QEMU_AIO_TRUNCATE     0x0080
>>>>  #define QEMU_AIO_ZONE_REPORT  0x0100
>>>>  #define QEMU_AIO_ZONE_MGMT    0x0200
>>>> +#define QEMU_AIO_ZONE_APPEND  0x0400
>>>>  #define QEMU_AIO_TYPE_MASK \
>>>>          (QEMU_AIO_READ | \
>>>>           QEMU_AIO_WRITE | \
>>>> @@ -41,7 +42,8 @@
>>>>           QEMU_AIO_COPY_RANGE | \
>>>>           QEMU_AIO_TRUNCATE  | \
>>>>           QEMU_AIO_ZONE_REPORT | \
>>>> -         QEMU_AIO_ZONE_MGMT)
>>>> +         QEMU_AIO_ZONE_MGMT | \
>>>> +         QEMU_AIO_ZONE_APPEND)
>>>
>>> This should be introduced in patch 2. This patch should be only about zone
>>> wp tracking with regular writes and zone management ops. The second patch
>>> can implement zone append emulation thanks to this patch. So separate.
>>>
>>>>
>>>>  /* AIO flags */
>>>>  #define QEMU_AIO_MISALIGNED   0x1000
>>>
>>
>> --
>> Damien Le Moal
>> Western Digital Research
>>

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-10-05 21:31 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-29  9:31 [PATCH v2 0/2] add zone append write for zoned device Sam Li
2022-09-29  9:31 ` [PATCH v2 1/2] file-posix: add the tracking of the zones wp Sam Li
2022-10-05  1:44   ` Damien Le Moal
2022-10-05  2:30     ` Damien Le Moal
2022-10-05  7:33     ` Damien Le Moal
2022-10-05  8:30       ` Sam Li
2022-10-05 21:20         ` Damien Le Moal
2022-09-29  9:31 ` [PATCH v2 2/2] block: introduce zone append write for zoned devices Sam Li
2022-10-05  1:54   ` Damien Le Moal

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.