qemu-devel.nongnu.org archive mirror
* [RFC v3 0/2] Add zoned storage emulation to virtio-blk driver
@ 2022-10-16 15:05 Sam Li
  2022-10-16 15:05 ` [RFC v3 1/2] include: update virtio_blk headers from Linux 5.19-rc2+ Sam Li
  2022-10-16 15:05 ` [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices Sam Li
  0 siblings, 2 replies; 9+ messages in thread
From: Sam Li @ 2022-10-16 15:05 UTC (permalink / raw)
  To: qemu-devel
  Cc: dmitry.fomichev, Michael S. Tsirkin, hare, qemu-block,
	damien.lemoal, Raphael Norwitz, Stefan Hajnoczi, Hanna Reitz,
	Kevin Wolf, Sam Li

Note: the virtio-blk headers aren't upstream in the kernel yet, so this
series is marked as an RFC.

v3:
- use qemuio_from_buffer to write the status bit [Stefan]
- avoid using req->elem directly [Stefan]
- fix error checking and a memory leak [Stefan]

v2:
- change units of emulated zone ops to correspond to the block layer APIs
- modify error checking cases [Stefan, Damien]

v1:
- add zoned storage emulation

Sam Li (2):
  include: update virtio_blk headers from Linux 5.19-rc2+
  virtio-blk: add zoned storage emulation for zoned devices

 hw/block/virtio-blk-common.c                |   2 +
 hw/block/virtio-blk.c                       | 412 +++++++++++++++++++-
 include/hw/virtio/virtio-blk.h              |  11 +-
 include/standard-headers/linux/virtio_blk.h | 109 ++++++
 4 files changed, 531 insertions(+), 3 deletions(-)

-- 
2.37.3



^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC v3 1/2] include: update virtio_blk headers from Linux 5.19-rc2+
  2022-10-16 15:05 [RFC v3 0/2] Add zoned storage emulation to virtio-blk driver Sam Li
@ 2022-10-16 15:05 ` Sam Li
  2022-10-17  0:53   ` Dmitry Fomichev
  2022-10-16 15:05 ` [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices Sam Li
  1 sibling, 1 reply; 9+ messages in thread
From: Sam Li @ 2022-10-16 15:05 UTC (permalink / raw)
  To: qemu-devel
  Cc: dmitry.fomichev, Michael S. Tsirkin, hare, qemu-block,
	damien.lemoal, Raphael Norwitz, Stefan Hajnoczi, Hanna Reitz,
	Kevin Wolf, Sam Li

Use scripts/update-linux-headers.sh to update virtio-blk headers
from Dmitry's "virtio-blk:add support for zoned block devices"
linux patch. There is a link for more information:
https://github.com/dmitry-fomichev/virtblk-zbd
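
For illustration only (not part of this patch), a consumer of the new
config fields added below could interpret them roughly as follows. The
struct here is a standalone mirror of the zoned characteristics fields,
the numeric values are made up for the example, and integers are assumed
to already be in host endianness (real code would use le32_to_cpu() or
virtio_ldl_p()):

  #include <stdint.h>
  #include <stdio.h>

  /* Zoned model values, as defined in the header below */
  #define VIRTIO_BLK_Z_NONE 0   /* regular block device */
  #define VIRTIO_BLK_Z_HM   1   /* host-managed zoned device */
  #define VIRTIO_BLK_Z_HA   2   /* host-aware zoned device */

  /* Standalone mirror of struct virtio_blk_zoned_characteristics */
  struct zoned_characteristics {
      uint32_t zone_sectors;        /* zone size in 512-byte sectors */
      uint32_t max_open_zones;
      uint32_t max_active_zones;
      uint32_t max_append_sectors;  /* zone append limit in sectors */
      uint32_t write_granularity;
      uint8_t  model;               /* one of VIRTIO_BLK_Z_* */
  };

  int main(void)
  {
      /* Example values only */
      struct zoned_characteristics z = {
          .zone_sectors = 524288,   /* 256 MiB zones */
          .max_open_zones = 14,
          .max_active_zones = 14,
          .max_append_sectors = 1024,
          .write_granularity = 4096,
          .model = VIRTIO_BLK_Z_HM,
      };

      if (z.model == VIRTIO_BLK_Z_NONE) {
          printf("regular (non-zoned) device\n");
      } else {
          printf("zoned device: %u sectors/zone, up to %u open zones, "
                 "append limit %u sectors\n",
                 z.zone_sectors, z.max_open_zones, z.max_append_sectors);
      }
      return 0;
  }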

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 include/standard-headers/linux/virtio_blk.h | 109 ++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/include/standard-headers/linux/virtio_blk.h b/include/standard-headers/linux/virtio_blk.h
index 2dcc90826a..490bd21c76 100644
--- a/include/standard-headers/linux/virtio_blk.h
+++ b/include/standard-headers/linux/virtio_blk.h
@@ -40,6 +40,7 @@
 #define VIRTIO_BLK_F_MQ		12	/* support more than one vq */
 #define VIRTIO_BLK_F_DISCARD	13	/* DISCARD is supported */
 #define VIRTIO_BLK_F_WRITE_ZEROES	14	/* WRITE ZEROES is supported */
+#define VIRTIO_BLK_F_ZONED		17	/* Zoned block device */
 
 /* Legacy feature bits */
 #ifndef VIRTIO_BLK_NO_LEGACY
@@ -119,6 +120,20 @@ struct virtio_blk_config {
 	uint8_t write_zeroes_may_unmap;
 
 	uint8_t unused1[3];
+
+	/* Secure erase fields that are defined in the virtio spec */
+	uint8_t sec_erase[12];
+
+	/* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */
+	struct virtio_blk_zoned_characteristics {
+		__virtio32 zone_sectors;
+		__virtio32 max_open_zones;
+		__virtio32 max_active_zones;
+		__virtio32 max_append_sectors;
+		__virtio32 write_granularity;
+		uint8_t model;
+		uint8_t unused2[3];
+	} zoned;
 } QEMU_PACKED;
 
 /*
@@ -153,6 +168,27 @@ struct virtio_blk_config {
 /* Write zeroes command */
 #define VIRTIO_BLK_T_WRITE_ZEROES	13
 
+/* Zone append command */
+#define VIRTIO_BLK_T_ZONE_APPEND    15
+
+/* Report zones command */
+#define VIRTIO_BLK_T_ZONE_REPORT    16
+
+/* Open zone command */
+#define VIRTIO_BLK_T_ZONE_OPEN      18
+
+/* Close zone command */
+#define VIRTIO_BLK_T_ZONE_CLOSE     20
+
+/* Finish zone command */
+#define VIRTIO_BLK_T_ZONE_FINISH    22
+
+/* Reset zone command */
+#define VIRTIO_BLK_T_ZONE_RESET     24
+
+/* Reset All zones command */
+#define VIRTIO_BLK_T_ZONE_RESET_ALL 26
+
 #ifndef VIRTIO_BLK_NO_LEGACY
 /* Barrier before this op. */
 #define VIRTIO_BLK_T_BARRIER	0x80000000
@@ -172,6 +208,72 @@ struct virtio_blk_outhdr {
 	__virtio64 sector;
 };
 
+/*
+ * Supported zoned device models.
+ */
+
+/* Regular block device */
+#define VIRTIO_BLK_Z_NONE      0
+/* Host-managed zoned device */
+#define VIRTIO_BLK_Z_HM        1
+/* Host-aware zoned device */
+#define VIRTIO_BLK_Z_HA        2
+
+/*
+ * Zone descriptor. A part of VIRTIO_BLK_T_ZONE_REPORT command reply.
+ */
+struct virtio_blk_zone_descriptor {
+	/* Zone capacity */
+	__virtio64 z_cap;
+	/* The starting sector of the zone */
+	__virtio64 z_start;
+	/* Zone write pointer position in sectors */
+	__virtio64 z_wp;
+	/* Zone type */
+	uint8_t z_type;
+	/* Zone state */
+	uint8_t z_state;
+	uint8_t reserved[38];
+};
+
+struct virtio_blk_zone_report {
+	__virtio64 nr_zones;
+	uint8_t reserved[56];
+	struct virtio_blk_zone_descriptor zones[];
+};
+
+/*
+ * Supported zone types.
+ */
+
+/* Conventional zone */
+#define VIRTIO_BLK_ZT_CONV         1
+/* Sequential Write Required zone */
+#define VIRTIO_BLK_ZT_SWR          2
+/* Sequential Write Preferred zone */
+#define VIRTIO_BLK_ZT_SWP          3
+
+/*
+ * Zone states that are available for zones of all types.
+ */
+
+/* Not a write pointer (conventional zones only) */
+#define VIRTIO_BLK_ZS_NOT_WP       0
+/* Empty */
+#define VIRTIO_BLK_ZS_EMPTY        1
+/* Implicitly Open */
+#define VIRTIO_BLK_ZS_IOPEN        2
+/* Explicitly Open */
+#define VIRTIO_BLK_ZS_EOPEN        3
+/* Closed */
+#define VIRTIO_BLK_ZS_CLOSED       4
+/* Read-Only */
+#define VIRTIO_BLK_ZS_RDONLY       13
+/* Full */
+#define VIRTIO_BLK_ZS_FULL         14
+/* Offline */
+#define VIRTIO_BLK_ZS_OFFLINE      15
+
 /* Unmap this range (only valid for write zeroes command) */
 #define VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP	0x00000001
 
@@ -198,4 +300,11 @@ struct virtio_scsi_inhdr {
 #define VIRTIO_BLK_S_OK		0
 #define VIRTIO_BLK_S_IOERR	1
 #define VIRTIO_BLK_S_UNSUPP	2
+
+/* Error codes that are specific to zoned block devices */
+#define VIRTIO_BLK_S_ZONE_INVALID_CMD     3
+#define VIRTIO_BLK_S_ZONE_UNALIGNED_WP    4
+#define VIRTIO_BLK_S_ZONE_OPEN_RESOURCE   5
+#define VIRTIO_BLK_S_ZONE_ACTIVE_RESOURCE 6
+
 #endif /* _LINUX_VIRTIO_BLK_H */
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices
  2022-10-16 15:05 [RFC v3 0/2] Add zoned storage emulation to virtio-blk driver Sam Li
  2022-10-16 15:05 ` [RFC v3 1/2] include: update virtio_blk headers from Linux 5.19-rc2+ Sam Li
@ 2022-10-16 15:05 ` Sam Li
  2022-10-17  0:59   ` Dmitry Fomichev
  1 sibling, 1 reply; 9+ messages in thread
From: Sam Li @ 2022-10-16 15:05 UTC (permalink / raw)
  To: qemu-devel
  Cc: dmitry.fomichev, Michael S. Tsirkin, hare, qemu-block,
	damien.lemoal, Raphael Norwitz, Stefan Hajnoczi, Hanna Reitz,
	Kevin Wolf, Sam Li

This patch extends virtio-blk emulation to handle zoned device commands
by calling the new block layer APIs to perform zoned device I/O on
behalf of the guest. It supports Report Zone, four zone operations (open,
close, finish, reset), and Append Zone.

The VIRTIO_BLK_F_ZONED feature bit is only set if the host device is a
zoned block device; it is not set for regular block devices (which only
contain conventional zones).

The guest OS can then use blkzone(8) to exercise those commands on zoned
devices. Zone append writes can also be tested through zonefs.
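
As a rough, standalone illustration (not guest driver code from this
series), the sizing of a zone report request mirrors the check done in
virtio_blk_handle_zone_report(): the device-writable part of the request
must hold the report header, at least one zone descriptor and the
trailing status byte. The struct sizes below are hard-coded from the
structures in this series, and the 4096-byte buffer is an arbitrary
example:

  #include <stdint.h>
  #include <stdio.h>

  #define ZONE_REPORT_HDR_SIZE 64u  /* sizeof(struct virtio_blk_zone_report) */
  #define ZONE_DESC_SIZE       64u  /* sizeof(struct virtio_blk_zone_descriptor) */
  #define INHDR_SIZE           1u   /* sizeof(struct virtio_blk_inhdr): status byte */

  int main(void)
  {
      uint32_t in_len = 4096;  /* device-writable bytes provided by the driver */

      if (in_len < INHDR_SIZE + ZONE_REPORT_HDR_SIZE + ZONE_DESC_SIZE) {
          fprintf(stderr, "in buffer too small for zone report\n");
          return 1;
      }

      /* Same formula the emulation uses to size the report */
      uint32_t nr_zones = (in_len - INHDR_SIZE - ZONE_REPORT_HDR_SIZE)
                          / ZONE_DESC_SIZE;
      printf("a %u-byte buffer holds up to %u zone descriptors\n",
             in_len, nr_zones);
      return 0;
  }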

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 hw/block/virtio-blk-common.c   |   2 +
 hw/block/virtio-blk.c          | 412 ++++++++++++++++++++++++++++++++-
 include/hw/virtio/virtio-blk.h |  11 +-
 3 files changed, 422 insertions(+), 3 deletions(-)

diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
index ac52d7c176..e2f8e2f6da 100644
--- a/hw/block/virtio-blk-common.c
+++ b/hw/block/virtio-blk-common.c
@@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
      .end = endof(struct virtio_blk_config, discard_sector_alignment)},
     {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
      .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
+    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
+     .end = endof(struct virtio_blk_config, zoned)},
     {}
 };
 
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 8131ec2dbc..58891aea31 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -26,6 +26,9 @@
 #include "hw/virtio/virtio-blk.h"
 #include "dataplane/virtio-blk.h"
 #include "scsi/constants.h"
+#if defined(CONFIG_BLKZONED)
+#include <linux/blkzoned.h>
+#endif
 #ifdef __linux__
 # include <scsi/sg.h>
 #endif
@@ -55,10 +58,29 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
 {
     VirtIOBlock *s = req->dev;
     VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    int64_t inhdr_len, n;
+    void *buf;
 
     trace_virtio_blk_req_complete(vdev, req, status);
 
-    stb_p(&req->in->status, status);
+    iov_discard_undo(&req->inhdr_undo);
+    if (virtio_ldl_p(vdev, &req->out.type) == VIRTIO_BLK_T_ZONE_APPEND) {
+        inhdr_len = sizeof(struct virtio_blk_zone_append_inhdr);
+        req->in.in_hdr->status = status;
+        buf = req->in.in_hdr;
+    } else {
+        inhdr_len = sizeof(struct virtio_blk_inhdr);
+        req->in.zone_append_inhdr->status = status;
+        buf = req->in.zone_append_inhdr;
+    }
+
+    n = iov_from_buf(req->elem.in_sg, req->elem.in_num,
+                     req->in_len - inhdr_len, buf, inhdr_len);
+    if (n != inhdr_len) {
+        virtio_error(vdev, "Driver provided input buffer less than size of "
+                     "in header");
+    }
+
     iov_discard_undo(&req->inhdr_undo);
     iov_discard_undo(&req->outhdr_undo);
     virtqueue_push(req->vq, &req->elem, req->in_len);
@@ -592,6 +614,334 @@ err:
     return err_status;
 }
 
+typedef struct ZoneCmdData {
+    VirtIOBlockReq *req;
+    union {
+        struct {
+            unsigned int nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report_data;
+        struct {
+            int64_t offset;
+        } zone_append_data;
+    };
+} ZoneCmdData;
+
+/*
+ * check zoned_request: error checking before issuing requests. If all checks
+ * passed, return true.
+ * append: true if only zone append requests issued.
+ */
+static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
+                             bool append, uint8_t *status) {
+    BlockDriverState *bs = blk_bs(s->blk);
+    int index = offset / bs->bl.zone_size;
+
+    if (offset < 0 || len < 0 || offset > bs->bl.capacity - len) {
+        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        return false;
+    }
+
+    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
+        *status = VIRTIO_BLK_S_UNSUPP;
+        return false;
+    }
+
+    if (append) {
+        if ((offset % bs->bl.write_granularity) != 0) {
+            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
+            return false;
+        }
+
+        if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
+            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            return false;
+        }
+
+        if (len / 512 > bs->bl.max_append_sectors) {
+            if (bs->bl.max_append_sectors == 0) {
+                *status = VIRTIO_BLK_S_UNSUPP;
+            } else {
+                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            }
+            return false;
+        }
+    }
+    return true;
+}
+
+static void virtio_blk_zone_report_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    struct iovec *in_iov = req->elem.in_sg;
+    unsigned in_num = req->elem.in_num;
+    int64_t zrp_size, nz, n, j = 0;
+    int8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    nz = data->zone_report_data.nr_zones;
+    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
+            .nr_zones = cpu_to_le64(nz),
+    };
+
+    zrp_size = sizeof(struct virtio_blk_zone_report)
+               + sizeof(struct virtio_blk_zone_descriptor) * nz;
+    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
+    if (n != sizeof(zrp_hdr)) {
+        virtio_error(vdev, "Driver provided intput buffer that is too small!");
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
+        i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
+        struct virtio_blk_zone_descriptor desc =
+            (struct virtio_blk_zone_descriptor) {
+                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start)
+                    >> BDRV_SECTOR_BITS,
+                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap)
+                    >> BDRV_SECTOR_BITS,
+                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp)
+                    >> BDRV_SECTOR_BITS,
+        };
+
+        switch (data->zone_report_data.zones[j].type) {
+        case BLK_ZT_CONV:
+            desc.z_type = BLK_ZONE_TYPE_CONVENTIONAL;
+            break;
+        case BLK_ZT_SWR:
+            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+            break;
+        case BLK_ZT_SWP:
+            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_PREF;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        switch (data->zone_report_data.zones[j].cond) {
+        case BLK_ZS_RDONLY:
+            desc.z_state = BLK_ZONE_COND_READONLY;
+            break;
+        case BLK_ZS_OFFLINE:
+            desc.z_state = BLK_ZONE_COND_OFFLINE;
+            break;
+        case BLK_ZS_EMPTY:
+            desc.z_state = BLK_ZONE_COND_EMPTY;
+            break;
+        case BLK_ZS_CLOSED:
+            desc.z_state = BLK_ZONE_COND_CLOSED;
+            break;
+        case BLK_ZS_FULL:
+            desc.z_state = BLK_ZONE_COND_FULL;
+            break;
+        case BLK_ZS_EOPEN:
+            desc.z_state = BLK_ZONE_COND_EXP_OPEN;
+            break;
+        case BLK_ZS_IOPEN:
+            desc.z_state = BLK_ZONE_COND_IMP_OPEN;
+            break;
+        case BLK_ZS_NOT_WP:
+            desc.z_state = BLK_ZONE_COND_NOT_WP;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        /* TODO: it takes O(n^2) time complexity. Optimizations required. */
+        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
+        if (n != sizeof(desc)) {
+            virtio_error(vdev, "Driver provided input buffer "
+                               "for descriptors that is too small!");
+            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            goto out;
+        }
+    }
+    goto out;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data->zone_report_data.zones);
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_report(VirtIOBlockReq *req)
+{
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    unsigned int nr_zones;
+    ZoneCmdData *data;
+    int64_t zone_size, offset;
+    uint8_t err_status;
+
+    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
+            sizeof(struct virtio_blk_zone_report) +
+            sizeof(struct virtio_blk_zone_descriptor)) {
+        virtio_error(vdev, "in buffer too small for zone report");
+        return -1;
+    }
+
+    /* start byte offset of the zone report */
+    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
+    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
+        goto out;
+    }
+
+    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
+                sizeof(struct virtio_blk_zone_report)) /
+               sizeof(struct virtio_blk_zone_descriptor);
+
+    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
+    data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->zone_report_data.nr_zones = nr_zones;
+    data->zone_report_data.zones = g_malloc(zone_size),
+
+    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
+                        data->zone_report_data.zones,
+                        virtio_blk_zone_report_complete, data);
+    return 0;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
+static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    int8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+    goto out;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
+{
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    BlockDriverState *bs = blk_bs(s->blk);
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
+    uint64_t len;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
+    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
+        /* Entire drive capacity */
+        offset = 0;
+        len = bs->bl.capacity;
+    } else {
+        if (bs->bl.zone_size > bs->bl.capacity - offset) {
+            /* The zoned device allows the last smaller zone. */
+            len = bs->bl.capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
+        } else {
+            len = bs->bl.zone_size;
+        }
+    }
+
+    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
+        goto out;
+    }
+
+    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    blk_aio_zone_mgmt(s->blk, op, offset, len,
+                      virtio_blk_zone_mgmt_complete, data);
+
+    return 0;
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
+static void virtio_blk_zone_append_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    struct iovec *in_iov = req->elem.in_sg;
+    unsigned in_num = req->elem.in_num;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    struct virtio_blk_zone_append_inhdr zap_hdr =
+        (struct virtio_blk_zone_append_inhdr) {
+        .append_sector = cpu_to_le64(data->zone_append_data.offset
+            >> BDRV_SECTOR_BITS),
+    };
+    iov_from_buf(in_iov, in_num, 0, &zap_hdr, sizeof(zap_hdr));
+    goto out;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
+                                         struct iovec *out_iov,
+                                         uint64_t niov) {
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
+    int64_t len = iov_size(out_iov, niov);
+
+    if (!check_zoned_request(s, offset, len, true, &err_status)) {
+        goto out;
+    }
+
+    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->zone_append_data.offset = offset;
+    qemu_iovec_init_external(&req->qiov, out_iov, niov);
+    blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
+                        virtio_blk_zone_append_complete, data);
+    return 0;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
 static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
     uint32_t type;
@@ -624,7 +974,7 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 
     /* We always touch the last byte, so just see how big in_iov is.  */
     req->in_len = iov_size(in_iov, in_num);
-    req->in = (void *)in_iov[in_num - 1].iov_base
+    req->in.in_hdr = (void *)in_iov[in_num - 1].iov_base
               + in_iov[in_num - 1].iov_len
               - sizeof(struct virtio_blk_inhdr);
     iov_discard_back_undoable(in_iov, &in_num, sizeof(struct virtio_blk_inhdr),
@@ -678,6 +1028,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     case VIRTIO_BLK_T_FLUSH:
         virtio_blk_handle_flush(req, mrb);
         break;
+    case VIRTIO_BLK_T_ZONE_REPORT:
+        virtio_blk_handle_zone_report(req);
+        break;
+    case VIRTIO_BLK_T_ZONE_OPEN:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
+        break;
+    case VIRTIO_BLK_T_ZONE_CLOSE:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
+        break;
+    case VIRTIO_BLK_T_ZONE_FINISH:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET_ALL:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
+        break;
     case VIRTIO_BLK_T_SCSI_CMD:
         virtio_blk_handle_scsi(req);
         break;
@@ -696,6 +1064,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         virtio_blk_free_request(req);
         break;
     }
+    case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
+        /*
+         * It is not safe to access req->elem.out_sg directly because it
+         * may be modified by virtio_blk_handle_request().
+         */
+        virtio_blk_handle_zone_append(req, out_iov, out_num);
+        break;
     /*
      * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
      * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
@@ -895,6 +1270,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     BlockConf *conf = &s->conf.conf;
+    BlockDriverState *bs = blk_bs(s->blk);
     struct virtio_blk_config blkcfg;
     uint64_t capacity;
     int64_t length;
@@ -954,6 +1330,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
         blkcfg.write_zeroes_may_unmap = 1;
         virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
     }
+    if (bs->bl.zoned != BLK_Z_NONE) {
+        switch (bs->bl.zoned) {
+        case BLK_Z_HM:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
+            break;
+        case BLK_Z_HA:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
+                     bs->bl.zone_size / 512);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
+                     bs->bl.max_active_zones);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
+                     bs->bl.max_open_zones);
+        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
+                     bs->bl.max_append_sectors);
+    } else {
+        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
+    }
     memcpy(config, &blkcfg, s->config_size);
 }
 
@@ -1118,6 +1518,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOBlock *s = VIRTIO_BLK(dev);
     VirtIOBlkConf *conf = &s->conf;
+    BlockDriverState *bs = blk_bs(conf->conf.blk);
     Error *err = NULL;
     unsigned i;
 
@@ -1163,6 +1564,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
+    if (bs->bl.zoned != BLK_Z_NONE) {
+        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
+        if (bs->bl.zoned == BLK_Z_HM) {
+            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
+        }
+    }
+
     if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
         (!conf->max_discard_sectors ||
          conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index d311c57cca..88901a9f55 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -30,6 +30,12 @@ struct virtio_blk_inhdr
     unsigned char status;
 };
 
+struct virtio_blk_zone_append_inhdr {
+    unsigned char status;
+    uint8_t reserved[7];
+    int64_t append_sector;
+};
+
 #define VIRTIO_BLK_AUTO_NUM_QUEUES UINT16_MAX
 
 struct VirtIOBlkConf
@@ -73,7 +79,10 @@ typedef struct VirtIOBlockReq {
     VirtQueue *vq;
     IOVDiscardUndo inhdr_undo;
     IOVDiscardUndo outhdr_undo;
-    struct virtio_blk_inhdr *in;
+    union {
+        struct virtio_blk_inhdr *in_hdr;
+        struct virtio_blk_zone_append_inhdr *zone_append_inhdr;
+    } in;
     struct virtio_blk_outhdr out;
     QEMUIOVector qiov;
     size_t in_len;
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [RFC v3 1/2] include: update virtio_blk headers from Linux 5.19-rc2+
  2022-10-16 15:05 ` [RFC v3 1/2] include: update virtio_blk headers from Linux 5.19-rc2+ Sam Li
@ 2022-10-17  0:53   ` Dmitry Fomichev
  2022-10-17  2:12     ` Damien Le Moal
  0 siblings, 1 reply; 9+ messages in thread
From: Dmitry Fomichev @ 2022-10-17  0:53 UTC (permalink / raw)
  To: faithilikerun, qemu-devel
  Cc: hreitz, hare, stefanha, qemu-block, kwolf, raphael.norwitz, mst,
	damien.lemoal

On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
> Use scripts/update-linux-headers.sh to update virtio-blk headers
> from Dmitry's "virtio-blk:add support for zoned block devices"
> linux patch. There is a link for more information:
> https://github.com/dmitry-fomichev/virtblk-zbd
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Sam Li <faithilikerun@gmail.com>

the duplicate sign-off is not needed. With this,

Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>

> [...]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices
  2022-10-16 15:05 ` [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices Sam Li
@ 2022-10-17  0:59   ` Dmitry Fomichev
  2022-10-18  8:56     ` Sam Li
  0 siblings, 1 reply; 9+ messages in thread
From: Dmitry Fomichev @ 2022-10-17  0:59 UTC (permalink / raw)
  To: faithilikerun, qemu-devel
  Cc: hreitz, hare, stefanha, qemu-block, kwolf, raphael.norwitz, mst,
	damien.lemoal

On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
> This patch extends virtio-blk emulation to handle zoned device commands
> by calling the new block layer APIs to perform zoned device I/O on
> behalf of the guest. It supports Report Zone, four zone oparations (open,
> close, finish, reset), and Append Zone.
> 
> The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> support zoned block devices. Regular block devices(conventional zones)
> will not be set.
> 
> Then the guest os can use blkzone(8) to test those commands on zoned devices.
> Furthermore, using zonefs to test zone append write is also supported.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  hw/block/virtio-blk-common.c   |   2 +
>  hw/block/virtio-blk.c          | 412 ++++++++++++++++++++++++++++++++-
>  include/hw/virtio/virtio-blk.h |  11 +-
>  3 files changed, 422 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
> index ac52d7c176..e2f8e2f6da 100644
> --- a/hw/block/virtio-blk-common.c
> +++ b/hw/block/virtio-blk-common.c
> @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
>       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
>      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
>       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> +     .end = endof(struct virtio_blk_config, zoned)},
>      {}
>  };
>  
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 8131ec2dbc..58891aea31 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -26,6 +26,9 @@
>  #include "hw/virtio/virtio-blk.h"
>  #include "dataplane/virtio-blk.h"
>  #include "scsi/constants.h"
> +#if defined(CONFIG_BLKZONED)
> +#include <linux/blkzoned.h>
> +#endif
>  #ifdef __linux__
>  # include <scsi/sg.h>
>  #endif
> @@ -55,10 +58,29 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req,
> unsigned char status)
>  {
>      VirtIOBlock *s = req->dev;
>      VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    int64_t inhdr_len, n;
> +    void *buf;
>  
>      trace_virtio_blk_req_complete(vdev, req, status);
>  
> -    stb_p(&req->in->status, status);
> +    iov_discard_undo(&req->inhdr_undo);
> +    if (virtio_ldl_p(vdev, &req->out.type) == VIRTIO_BLK_T_ZONE_APPEND) {
> +        inhdr_len = sizeof(struct virtio_blk_zone_append_inhdr);
> +        req->in.in_hdr->status = status;
> +        buf = req->in.in_hdr;
> +    } else {
> +        inhdr_len = sizeof(struct virtio_blk_inhdr);
> +        req->in.zone_append_inhdr->status = status;
> +        buf = req->in.zone_append_inhdr;
> +    }
> +
> +    n = iov_from_buf(req->elem.in_sg, req->elem.in_num,
> +                     req->in_len - inhdr_len, buf, inhdr_len);
> +    if (n != inhdr_len) {
> +        virtio_error(vdev, "Driver provided input buffer less than size of "
> +                     "in header");
> +    }
> +
>      iov_discard_undo(&req->inhdr_undo);
>      iov_discard_undo(&req->outhdr_undo);
>      virtqueue_push(req->vq, &req->elem, req->in_len);
> @@ -592,6 +614,334 @@ err:
>      return err_status;
>  }
>  
> +typedef struct ZoneCmdData {
> +    VirtIOBlockReq *req;
> +    union {
> +        struct {
> +            unsigned int nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report_data;
> +        struct {
> +            int64_t offset;
> +        } zone_append_data;
> +    };
> +} ZoneCmdData;
> +
> +/*
> + * check zoned_request: error checking before issuing requests. If all checks
> + * passed, return true.
> + * append: true if only zone append requests issued.
> + */
> +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> +                             bool append, uint8_t *status) {
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    int index = offset / bs->bl.zone_size;
> +
> +    if (offset < 0 || len < 0 || offset > bs->bl.capacity - len) {
> +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        return false;
> +    }
> +
> +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> +        *status = VIRTIO_BLK_S_UNSUPP;
> +        return false;
> +    }
> +
> +    if (append) {
> +        if ((offset % bs->bl.write_granularity) != 0) {
> +            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> +            return false;
> +        }
> +
> +        if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
> +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            return false;
> +        }
> +
> +        if (len / 512 > bs->bl.max_append_sectors) {
> +            if (bs->bl.max_append_sectors == 0) {
> +                *status = VIRTIO_BLK_S_UNSUPP;
> +            } else {
> +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            }
> +            return false;
> +        }
> +    }
> +    return true;
> +}
> +
> +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> +{
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    struct iovec *in_iov = req->elem.in_sg;
> +    unsigned in_num = req->elem.in_num;
> +    int64_t zrp_size, nz, n, j = 0;
> +    int8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (ret) {
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    nz = data->zone_report_data.nr_zones;
> +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> +            .nr_zones = cpu_to_le64(nz),
> +    };
> +
> +    zrp_size = sizeof(struct virtio_blk_zone_report)
> +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> +    if (n != sizeof(zrp_hdr)) {
> +        virtio_error(vdev, "Driver provided intput buffer that is too
> small!");

s/intput/input

> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
> +        i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> +        struct virtio_blk_zone_descriptor desc =
> +            (struct virtio_blk_zone_descriptor) {
> +                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start)
> +                    >> BDRV_SECTOR_BITS,
> +                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap)
> +                    >> BDRV_SECTOR_BITS,
> +                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp)
> +                    >> BDRV_SECTOR_BITS,

I think these should be

+                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
+                    >> BDRV_SECTOR_BITS),
+                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
+                    >> BDRV_SECTOR_BITS),
+                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
+                    >> BDRV_SECTOR_BITS),

> +        };
> +
> +        switch (data->zone_report_data.zones[j].type) {
> +        case BLK_ZT_CONV:
> +            desc.z_type = BLK_ZONE_TYPE_CONVENTIONAL;
> +            break;
> +        case BLK_ZT_SWR:
> +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_REQ;
> +            break;
> +        case BLK_ZT_SWP:
> +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_PREF;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }

why do we need to convert these to system-defined values?
The output of this function will be sent to the driver, which
uses the values defined in the virtio-zbd spec, and the conversion
to BLK_ZS_xxx/BLK_ZT_xxx has already been done in the parse_zone()
function in file-posix.c.

> +
> +        switch (data->zone_report_data.zones[j].cond) {
> +        case BLK_ZS_RDONLY:
> +            desc.z_state = BLK_ZONE_COND_READONLY;
> +            break;
> +        case BLK_ZS_OFFLINE:
> +            desc.z_state = BLK_ZONE_COND_OFFLINE;
> +            break;
> +        case BLK_ZS_EMPTY:
> +            desc.z_state = BLK_ZONE_COND_EMPTY;
> +            break;
> +        case BLK_ZS_CLOSED:
> +            desc.z_state = BLK_ZONE_COND_CLOSED;
> +            break;
> +        case BLK_ZS_FULL:
> +            desc.z_state = BLK_ZONE_COND_FULL;
> +            break;
> +        case BLK_ZS_EOPEN:
> +            desc.z_state = BLK_ZONE_COND_EXP_OPEN;
> +            break;
> +        case BLK_ZS_IOPEN:
> +            desc.z_state = BLK_ZONE_COND_IMP_OPEN;
> +            break;
> +        case BLK_ZS_NOT_WP:
> +            desc.z_state = BLK_ZONE_COND_NOT_WP;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }

same here. I think the code to form the descriptor can be reduced to

+        struct virtio_blk_zone_descriptor desc =
+            (struct virtio_blk_zone_descriptor) {
+            .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
+                                   >> BDRV_SECTOR_BITS),
+            .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
+                                 >> BDRV_SECTOR_BITS),
+            .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
+                                >> BDRV_SECTOR_BITS),
+            .z_type = data->zone_report_data.zones[j].type,
+            .z_state = data->zone_report_data.zones[j].state,
+        };

> +
> +        /* TODO: it takes O(n^2) time complexity. Optimizations required. */
> +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> +        if (n != sizeof(desc)) {
> +            virtio_error(vdev, "Driver provided input buffer "
> +                               "for descriptors that is too small!");
> +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            goto out;
> +        }
> +    }
> +    goto out;

this goto is not needed.

> [...]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v3 1/2] include: update virtio_blk headers from Linux 5.19-rc2+
  2022-10-17  0:53   ` Dmitry Fomichev
@ 2022-10-17  2:12     ` Damien Le Moal
  0 siblings, 0 replies; 9+ messages in thread
From: Damien Le Moal @ 2022-10-17  2:12 UTC (permalink / raw)
  To: Dmitry Fomichev, faithilikerun, qemu-devel
  Cc: hreitz, hare, stefanha, qemu-block, kwolf, raphael.norwitz, mst

On 10/17/22 09:53, Dmitry Fomichev wrote:
> On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
>> Use scripts/update-linux-headers.sh to update virtio-blk headers
>> from Dmitry's "virtio-blk:add support for zoned block devices"
>> linux patch. There is a link for more information:
>> https://github.com/dmitry-fomichev/virtblk-zbd
>>
>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> 
> the duplicate sign-off is not needed. With this,
> 
> Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>

The mention of the Linux kernel version should be removed from the patch
title, as the changes are not included in any upstream kernel yet.

> 
>> [...]
> 

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices
  2022-10-17  0:59   ` Dmitry Fomichev
@ 2022-10-18  8:56     ` Sam Li
  2022-10-23  2:08       ` Dmitry Fomichev
  0 siblings, 1 reply; 9+ messages in thread
From: Sam Li @ 2022-10-18  8:56 UTC (permalink / raw)
  To: Dmitry Fomichev
  Cc: qemu-devel, hreitz, hare, stefanha, qemu-block, kwolf,
	raphael.norwitz, mst, damien.lemoal

Dmitry Fomichev <Dmitry.Fomichev@wdc.com> 于2022年10月17日周一 09:01写道:
>
> On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
> > This patch extends virtio-blk emulation to handle zoned device commands
> > by calling the new block layer APIs to perform zoned device I/O on
> > behalf of the guest. It supports Report Zone, four zone oparations (open,
> > close, finish, reset), and Append Zone.
> >
> > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> > support zoned block devices. Regular block devices(conventional zones)
> > will not be set.
> >
> > Then the guest os can use blkzone(8) to test those commands on zoned devices.
> > Furthermore, using zonefs to test zone append write is also supported.
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  hw/block/virtio-blk-common.c   |   2 +
> >  hw/block/virtio-blk.c          | 412 ++++++++++++++++++++++++++++++++-
> >  include/hw/virtio/virtio-blk.h |  11 +-
> >  3 files changed, 422 insertions(+), 3 deletions(-)
> >
> > diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
> > index ac52d7c176..e2f8e2f6da 100644
> > --- a/hw/block/virtio-blk-common.c
> > +++ b/hw/block/virtio-blk-common.c
> > @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
> >       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
> >      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
> >       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> > +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> > +     .end = endof(struct virtio_blk_config, zoned)},
> >      {}
> >  };
> >
> > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> > index 8131ec2dbc..58891aea31 100644
> > --- a/hw/block/virtio-blk.c
> > +++ b/hw/block/virtio-blk.c
> > @@ -26,6 +26,9 @@
> >  #include "hw/virtio/virtio-blk.h"
> >  #include "dataplane/virtio-blk.h"
> >  #include "scsi/constants.h"
> > +#if defined(CONFIG_BLKZONED)
> > +#include <linux/blkzoned.h>
> > +#endif
> >  #ifdef __linux__
> >  # include <scsi/sg.h>
> >  #endif
> > @@ -55,10 +58,29 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req,
> > unsigned char status)
> >  {
> >      VirtIOBlock *s = req->dev;
> >      VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    int64_t inhdr_len, n;
> > +    void *buf;
> >
> >      trace_virtio_blk_req_complete(vdev, req, status);
> >
> > -    stb_p(&req->in->status, status);
> > +    iov_discard_undo(&req->inhdr_undo);
> > +    if (virtio_ldl_p(vdev, &req->out.type) == VIRTIO_BLK_T_ZONE_APPEND) {
> > +        inhdr_len = sizeof(struct virtio_blk_zone_append_inhdr);
> > +        req->in.in_hdr->status = status;
> > +        buf = req->in.in_hdr;
> > +    } else {
> > +        inhdr_len = sizeof(struct virtio_blk_inhdr);
> > +        req->in.zone_append_inhdr->status = status;
> > +        buf = req->in.zone_append_inhdr;
> > +    }
> > +
> > +    n = iov_from_buf(req->elem.in_sg, req->elem.in_num,
> > +                     req->in_len - inhdr_len, buf, inhdr_len);
> > +    if (n != inhdr_len) {
> > +        virtio_error(vdev, "Driver provided input buffer less than size of "
> > +                     "in header");
> > +    }
> > +
> >      iov_discard_undo(&req->inhdr_undo);
> >      iov_discard_undo(&req->outhdr_undo);
> >      virtqueue_push(req->vq, &req->elem, req->in_len);
> > @@ -592,6 +614,334 @@ err:
> >      return err_status;
> >  }
> >
> > +typedef struct ZoneCmdData {
> > +    VirtIOBlockReq *req;
> > +    union {
> > +        struct {
> > +            unsigned int nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } zone_report_data;
> > +        struct {
> > +            int64_t offset;
> > +        } zone_append_data;
> > +    };
> > +} ZoneCmdData;
> > +
> > +/*
> > + * check zoned_request: error checking before issuing requests. If all checks
> > + * passed, return true.
> > + * append: true if only zone append requests issued.
> > + */
> > +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> > +                             bool append, uint8_t *status) {
> > +    BlockDriverState *bs = blk_bs(s->blk);
> > +    int index = offset / bs->bl.zone_size;
> > +
> > +    if (offset < 0 || len < 0 || offset > bs->bl.capacity - len) {
> > +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        return false;
> > +    }
> > +
> > +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> > +        *status = VIRTIO_BLK_S_UNSUPP;
> > +        return false;
> > +    }
> > +
> > +    if (append) {
> > +        if ((offset % bs->bl.write_granularity) != 0) {
> > +            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> > +            return false;
> > +        }
> > +
> > +        if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
> > +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            return false;
> > +        }
> > +
> > +        if (len / 512 > bs->bl.max_append_sectors) {
> > +            if (bs->bl.max_append_sectors == 0) {
> > +                *status = VIRTIO_BLK_S_UNSUPP;
> > +            } else {
> > +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            }
> > +            return false;
> > +        }
> > +    }
> > +    return true;
> > +}
> > +
> > +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> > +{
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > +    struct iovec *in_iov = req->elem.in_sg;
> > +    unsigned in_num = req->elem.in_num;
> > +    int64_t zrp_size, nz, n, j = 0;
> > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    nz = data->zone_report_data.nr_zones;
> > +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> > +            .nr_zones = cpu_to_le64(nz),
> > +    };
> > +
> > +    zrp_size = sizeof(struct virtio_blk_zone_report)
> > +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> > +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> > +    if (n != sizeof(zrp_hdr)) {
> > +        virtio_error(vdev, "Driver provided intput buffer that is too
> > small!");
>
> s/intput/input
>
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
> > +        i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> > +        struct virtio_blk_zone_descriptor desc =
> > +            (struct virtio_blk_zone_descriptor) {
> > +                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start)
> > +                    >> BDRV_SECTOR_BITS,
> > +                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap)
> > +                    >> BDRV_SECTOR_BITS,
> > +                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp)
> > +                    >> BDRV_SECTOR_BITS,
>
> I think these should be
>
> +                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
> +                    >> BDRV_SECTOR_BITS),
> +                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
> +                    >> BDRV_SECTOR_BITS),
> +                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
> +                    >> BDRV_SECTOR_BITS),
>
> > +        };
> > +
> > +        switch (data->zone_report_data.zones[j].type) {
> > +        case BLK_ZT_CONV:
> > +            desc.z_type = BLK_ZONE_TYPE_CONVENTIONAL;
> > +            break;
> > +        case BLK_ZT_SWR:
> > +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_REQ;
> > +            break;
> > +        case BLK_ZT_SWP:
> > +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_PREF;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +        }
>
> why do we need to convert these to system-defined values?
> The output of this function will be sent to the driver that
> uses the values defined in virtio-zbd spec. And the conversion
> to BLK_ZS_xxx/BLK_ZT_xxx has already been done in parse_zone()
> function in file-posix.c

Because QEMU may use macro values that differ from Linux's, and this
avoids passing wrong values to the guest. parse_zone() converts the
Linux macros to QEMU macros, and here the QEMU macros are converted
back to the values that Linux defines.
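
For illustration, a minimal sketch of the kind of explicit mapping meant
here; it targets the VIRTIO_BLK_ZT_* constants from patch 1/2 purely as
an example, and the helper name and argument type are assumptions rather
than code from this series:

/*
 * Illustrative sketch, not from the patch: map QEMU's internal zone
 * type to an on-wire value through an explicit switch, so the two
 * value spaces stay independent even if their numbers happen to
 * coincide today.
 */
static uint8_t virtio_blk_wire_zone_type(BlockZoneType type)
{
    switch (type) {
    case BLK_ZT_CONV:
        return VIRTIO_BLK_ZT_CONV;
    case BLK_ZT_SWR:
        return VIRTIO_BLK_ZT_SWR;
    case BLK_ZT_SWP:
        return VIRTIO_BLK_ZT_SWP;
    default:
        g_assert_not_reached();
    }
}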

>
> > +
> > +        switch (data->zone_report_data.zones[j].cond) {
> > +        case BLK_ZS_RDONLY:
> > +            desc.z_state = BLK_ZONE_COND_READONLY;
> > +            break;
> > +        case BLK_ZS_OFFLINE:
> > +            desc.z_state = BLK_ZONE_COND_OFFLINE;
> > +            break;
> > +        case BLK_ZS_EMPTY:
> > +            desc.z_state = BLK_ZONE_COND_EMPTY;
> > +            break;
> > +        case BLK_ZS_CLOSED:
> > +            desc.z_state = BLK_ZONE_COND_CLOSED;
> > +            break;
> > +        case BLK_ZS_FULL:
> > +            desc.z_state = BLK_ZONE_COND_FULL;
> > +            break;
> > +        case BLK_ZS_EOPEN:
> > +            desc.z_state = BLK_ZONE_COND_EXP_OPEN;
> > +            break;
> > +        case BLK_ZS_IOPEN:
> > +            desc.z_state = BLK_ZONE_COND_IMP_OPEN;
> > +            break;
> > +        case BLK_ZS_NOT_WP:
> > +            desc.z_state = BLK_ZONE_COND_NOT_WP;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +        }
>
> same here. I think the code to form the descriptor can be reduced to
>
> +        struct virtio_blk_zone_descriptor desc = (struct
> virtio_blk_zone_descriptor) {
> +            .z_start = cpu_to_le64(data->zone_report_data.zones[j].start >>
> BDRV_SECTOR_BITS),
> +            .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap >>
> BDRV_SECTOR_BITS),
> +            .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp >>
> BDRV_SECTOR_BITS),
> +            .z_type = data->zone_report_data.zones[j].type,
> +            .z_state = data->zone_report_data.zones[j].state,
> +        };
>
> > +
> > +        /* TODO: it takes O(n^2) time complexity. Optimizations required. */
> > +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> > +        if (n != sizeof(desc)) {
> > +            virtio_error(vdev, "Driver provided input buffer "
> > +                               "for descriptors that is too small!");
> > +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            goto out;
> > +        }
> > +    }
> > +    goto out;
>
> this goto is not needed.
>
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data->zone_report_data.zones);
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req)
> > +{
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    unsigned int nr_zones;
> > +    ZoneCmdData *data;
> > +    int64_t zone_size, offset;
> > +    uint8_t err_status;
> > +
> > +    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
> > +            sizeof(struct virtio_blk_zone_report) +
> > +            sizeof(struct virtio_blk_zone_descriptor)) {
> > +        virtio_error(vdev, "in buffer too small for zone report");
> > +        return -1;
> > +    }
> > +
> > +    /* start byte offset of the zone report */
> > +    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> > +                sizeof(struct virtio_blk_zone_report)) /
> > +               sizeof(struct virtio_blk_zone_descriptor);
> > +
> > +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> > +    data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +    data->zone_report_data.nr_zones = nr_zones;
> > +    data->zone_report_data.zones = g_malloc(zone_size),
> > +
> > +    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
> > +                        data->zone_report_data.zones,
> > +                        virtio_blk_zone_report_complete, data);
> > +    return 0;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> > +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
> > +{
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +    goto out;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
> > +{
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    BlockDriverState *bs = blk_bs(s->blk);
> > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > +    uint64_t len;
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
> > +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> > +        /* Entire drive capacity */
> > +        offset = 0;
> > +        len = bs->bl.capacity;
> > +    } else {
> > +        if (bs->bl.zone_size > bs->bl.capacity - offset) {
> > +            /* The zoned device allows the last smaller zone. */
> > +            len = bs->bl.capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
> > +        } else {
> > +            len = bs->bl.zone_size;
> > +        }
> > +    }
> > +
> > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> > +                      virtio_blk_zone_mgmt_complete, data);
> > +
> > +    return 0;
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> > +static void virtio_blk_zone_append_complete(void *opaque, int ret)
> > +{
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    struct iovec *in_iov = req->elem.in_sg;
> > +    unsigned in_num = req->elem.in_num;
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    struct virtio_blk_zone_append_inhdr zap_hdr =
> > +        (struct virtio_blk_zone_append_inhdr) {
> > +        .append_sector = cpu_to_le64(data->zone_append_data.offset
> > +            >> BDRV_SECTOR_BITS),
> > +    };
> > +    iov_from_buf(in_iov, in_num, 0, &zap_hdr, sizeof(zap_hdr));
> > +    goto out;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
> > +                                         struct iovec *out_iov,
> > +                                         uint64_t niov) {
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > +    int64_t len = iov_size(out_iov, niov);
> > +
> > +    if (!check_zoned_request(s, offset, len, true, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +    data->zone_append_data.offset = offset;
> > +    qemu_iovec_init_external(&req->qiov, out_iov, niov);
> > +    blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
> > +                        virtio_blk_zone_append_complete, data);
> > +    return 0;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> >  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >  {
> >      uint32_t type;
> > @@ -624,7 +974,7 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req,
> > MultiReqBuffer *mrb)
> >
> >      /* We always touch the last byte, so just see how big in_iov is.  */
> >      req->in_len = iov_size(in_iov, in_num);
> > -    req->in = (void *)in_iov[in_num - 1].iov_base
> > +    req->in.in_hdr = (void *)in_iov[in_num - 1].iov_base
> >                + in_iov[in_num - 1].iov_len
> >                - sizeof(struct virtio_blk_inhdr);
> >      iov_discard_back_undoable(in_iov, &in_num, sizeof(struct
> > virtio_blk_inhdr),
> > @@ -678,6 +1028,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req,
> > MultiReqBuffer *mrb)
> >      case VIRTIO_BLK_T_FLUSH:
> >          virtio_blk_handle_flush(req, mrb);
> >          break;
> > +    case VIRTIO_BLK_T_ZONE_REPORT:
> > +        virtio_blk_handle_zone_report(req);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_OPEN:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_CLOSE:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_FINISH:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_RESET:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > +        break;
> >      case VIRTIO_BLK_T_SCSI_CMD:
> >          virtio_blk_handle_scsi(req);
> >          break;
> > @@ -696,6 +1064,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req,
> > MultiReqBuffer *mrb)
> >          virtio_blk_free_request(req);
> >          break;
> >      }
> > +    case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
> > +        /*
> > +         * It is not safe to access req->elem.out_sg directly because it
> > +         * may be modified by virtio_blk_handle_request().
> > +         */
> > +        virtio_blk_handle_zone_append(req, out_iov, out_num);
> > +        break;
> >      /*
> >       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
> >       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
> > @@ -895,6 +1270,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev,
> > uint8_t *config)
> >  {
> >      VirtIOBlock *s = VIRTIO_BLK(vdev);
> >      BlockConf *conf = &s->conf.conf;
> > +    BlockDriverState *bs = blk_bs(s->blk);
> >      struct virtio_blk_config blkcfg;
> >      uint64_t capacity;
> >      int64_t length;
> > @@ -954,6 +1330,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev,
> > uint8_t *config)
> >          blkcfg.write_zeroes_may_unmap = 1;
> >          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
> >      }
> > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > +        switch (bs->bl.zoned) {
> > +        case BLK_Z_HM:
> > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> > +            break;
> > +        case BLK_Z_HA:
> > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +        }
> > +
> > +        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> > +                     bs->bl.zone_size / 512);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> > +                     bs->bl.max_active_zones);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> > +                     bs->bl.max_open_zones);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> > +                     bs->bl.max_append_sectors);
> > +    } else {
> > +        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> > +    }
> >      memcpy(config, &blkcfg, s->config_size);
> >  }
> >
> > @@ -1118,6 +1518,7 @@ static void virtio_blk_device_realize(DeviceState *dev,
> > Error **errp)
> >      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> >      VirtIOBlock *s = VIRTIO_BLK(dev);
> >      VirtIOBlkConf *conf = &s->conf;
> > +    BlockDriverState *bs = blk_bs(conf->conf.blk);
> >      Error *err = NULL;
> >      unsigned i;
> >
> > @@ -1163,6 +1564,13 @@ static void virtio_blk_device_realize(DeviceState *dev,
> > Error **errp)
> >          return;
> >      }
> >
> > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> > +        if (bs->bl.zoned == BLK_Z_HM) {
> > +            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
> > +        }
> > +    }
> > +
> >      if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
> >          (!conf->max_discard_sectors ||
> >           conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
> > diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
> > index d311c57cca..88901a9f55 100644
> > --- a/include/hw/virtio/virtio-blk.h
> > +++ b/include/hw/virtio/virtio-blk.h
> > @@ -30,6 +30,12 @@ struct virtio_blk_inhdr
> >      unsigned char status;
> >  };
> >
> > +struct virtio_blk_zone_append_inhdr {
> > +    unsigned char status;
> > +    uint8_t reserved[7];
> > +    int64_t append_sector;
> > +};
> > +
> >  #define VIRTIO_BLK_AUTO_NUM_QUEUES UINT16_MAX
> >
> >  struct VirtIOBlkConf
> > @@ -73,7 +79,10 @@ typedef struct VirtIOBlockReq {
> >      VirtQueue *vq;
> >      IOVDiscardUndo inhdr_undo;
> >      IOVDiscardUndo outhdr_undo;
> > -    struct virtio_blk_inhdr *in;
> > +    union {
> > +        struct virtio_blk_inhdr *in_hdr;
> > +        struct virtio_blk_zone_append_inhdr *zone_append_inhdr;
> > +    } in;
> >      struct virtio_blk_outhdr out;
> >      QEMUIOVector qiov;
> >      size_t in_len;
>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices
  2022-10-18  8:56     ` Sam Li
@ 2022-10-23  2:08       ` Dmitry Fomichev
  2022-10-23  3:01         ` Sam Li
  0 siblings, 1 reply; 9+ messages in thread
From: Dmitry Fomichev @ 2022-10-23  2:08 UTC (permalink / raw)
  To: faithilikerun
  Cc: hreitz, hare, qemu-devel, stefanha, qemu-block, kwolf,
	raphael.norwitz, mst, damien.lemoal

On Tue, 2022-10-18 at 16:56 +0800, Sam Li wrote:
> Dmitry Fomichev <Dmitry.Fomichev@wdc.com> 于2022年10月17日周一 09:01写道:
> > 
> > On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
> > > This patch extends virtio-blk emulation to handle zoned device commands
> > > by calling the new block layer APIs to perform zoned device I/O on
> > > behalf of the guest. It supports Report Zone, four zone oparations (open,
> > > close, finish, reset), and Append Zone.
> > > 
> > > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> > > support zoned block devices. Regular block devices(conventional zones)
> > > will not be set.
> > > 
> > > Then the guest os can use blkzone(8) to test those commands on zoned
> > > devices.
> > > Furthermore, using zonefs to test zone append write is also supported.
> > > 
> > > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > > ---
> > >  hw/block/virtio-blk-common.c   |   2 +
> > >  hw/block/virtio-blk.c          | 412 ++++++++++++++++++++++++++++++++-
> > >  include/hw/virtio/virtio-blk.h |  11 +-
> > >  3 files changed, 422 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
> > > index ac52d7c176..e2f8e2f6da 100644
> > > --- a/hw/block/virtio-blk-common.c
> > > +++ b/hw/block/virtio-blk-common.c
> > > @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
> > >       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
> > >      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
> > >       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> > > +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> > > +     .end = endof(struct virtio_blk_config, zoned)},
> > >      {}
> > >  };
> > > 
> > > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> > > index 8131ec2dbc..58891aea31 100644
> > > --- a/hw/block/virtio-blk.c
> > > +++ b/hw/block/virtio-blk.c
> > > @@ -26,6 +26,9 @@
> > >  #include "hw/virtio/virtio-blk.h"
> > >  #include "dataplane/virtio-blk.h"
> > >  #include "scsi/constants.h"
> > > +#if defined(CONFIG_BLKZONED)
> > > +#include <linux/blkzoned.h>
> > > +#endif
> > >  #ifdef __linux__
> > >  # include <scsi/sg.h>
> > >  #endif
> > > @@ -55,10 +58,29 @@ static void virtio_blk_req_complete(VirtIOBlockReq
> > > *req,
> > > unsigned char status)
> > >  {
> > >      VirtIOBlock *s = req->dev;
> > >      VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > +    int64_t inhdr_len, n;
> > > +    void *buf;
> > > 
> > >      trace_virtio_blk_req_complete(vdev, req, status);
> > > 
> > > -    stb_p(&req->in->status, status);
> > > +    iov_discard_undo(&req->inhdr_undo);
> > > +    if (virtio_ldl_p(vdev, &req->out.type) == VIRTIO_BLK_T_ZONE_APPEND) {
> > > +        inhdr_len = sizeof(struct virtio_blk_zone_append_inhdr);
> > > +        req->in.in_hdr->status = status;
> > > +        buf = req->in.in_hdr;
> > > +    } else {
> > > +        inhdr_len = sizeof(struct virtio_blk_inhdr);
> > > +        req->in.zone_append_inhdr->status = status;
> > > +        buf = req->in.zone_append_inhdr;
> > > +    }
> > > +
> > > +    n = iov_from_buf(req->elem.in_sg, req->elem.in_num,
> > > +                     req->in_len - inhdr_len, buf, inhdr_len);
> > > +    if (n != inhdr_len) {
> > > +        virtio_error(vdev, "Driver provided input buffer less than size of
> > > "
> > > +                     "in header");
> > > +    }
> > > +
> > >      iov_discard_undo(&req->inhdr_undo);
> > >      iov_discard_undo(&req->outhdr_undo);
> > >      virtqueue_push(req->vq, &req->elem, req->in_len);
> > > @@ -592,6 +614,334 @@ err:
> > >      return err_status;
> > >  }
> > > 
> > > +typedef struct ZoneCmdData {
> > > +    VirtIOBlockReq *req;
> > > +    union {
> > > +        struct {
> > > +            unsigned int nr_zones;
> > > +            BlockZoneDescriptor *zones;
> > > +        } zone_report_data;
> > > +        struct {
> > > +            int64_t offset;
> > > +        } zone_append_data;
> > > +    };
> > > +} ZoneCmdData;
> > > +
> > > +/*
> > > + * check zoned_request: error checking before issuing requests. If all
> > > checks
> > > + * passed, return true.
> > > + * append: true if only zone append requests issued.
> > > + */
> > > +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t
> > > len,
> > > +                             bool append, uint8_t *status) {
> > > +    BlockDriverState *bs = blk_bs(s->blk);
> > > +    int index = offset / bs->bl.zone_size;
> > > +
> > > +    if (offset < 0 || len < 0 || offset > bs->bl.capacity - len) {
> > > +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +        return false;
> > > +    }
> > > +
> > > +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> > > +        *status = VIRTIO_BLK_S_UNSUPP;
> > > +        return false;
> > > +    }
> > > +
> > > +    if (append) {
> > > +        if ((offset % bs->bl.write_granularity) != 0) {
> > > +            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> > > +            return false;
> > > +        }
> > > +
> > > +        if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
> > > +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +            return false;
> > > +        }
> > > +
> > > +        if (len / 512 > bs->bl.max_append_sectors) {
> > > +            if (bs->bl.max_append_sectors == 0) {
> > > +                *status = VIRTIO_BLK_S_UNSUPP;
> > > +            } else {
> > > +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +            }
> > > +            return false;
> > > +        }
> > > +    }
> > > +    return true;
> > > +}
> > > +
> > > +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> > > +{
> > > +    ZoneCmdData *data = opaque;
> > > +    VirtIOBlockReq *req = data->req;
> > > +    VirtIOBlock *s = req->dev;
> > > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > > +    struct iovec *in_iov = req->elem.in_sg;
> > > +    unsigned in_num = req->elem.in_num;
> > > +    int64_t zrp_size, nz, n, j = 0;
> > > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > > +
> > > +    if (ret) {
> > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +        goto out;
> > > +    }
> > > +
> > > +    nz = data->zone_report_data.nr_zones;
> > > +    struct virtio_blk_zone_report zrp_hdr = (struct
> > > virtio_blk_zone_report) {
> > > +            .nr_zones = cpu_to_le64(nz),
> > > +    };
> > > +
> > > +    zrp_size = sizeof(struct virtio_blk_zone_report)
> > > +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> > > +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> > > +    if (n != sizeof(zrp_hdr)) {
> > > +        virtio_error(vdev, "Driver provided intput buffer that is too
> > > small!");
> > 
> > s/intput/input
> > 
> > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +        goto out;
> > > +    }
> > > +
> > > +    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
> > > +        i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> > > +        struct virtio_blk_zone_descriptor desc =
> > > +            (struct virtio_blk_zone_descriptor) {
> > > +                .z_start = cpu_to_le64(data-
> > > >zone_report_data.zones[j].start)
> > > +                    >> BDRV_SECTOR_BITS,
> > > +                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap)
> > > +                    >> BDRV_SECTOR_BITS,
> > > +                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp)
> > > +                    >> BDRV_SECTOR_BITS,
> > 
> > I think these should be
> > 
> > +                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
> > +                    >> BDRV_SECTOR_BITS),
> > +                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
> > +                    >> BDRV_SECTOR_BITS),
> > +                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
> > +                    >> BDRV_SECTOR_BITS),
> > 
> > > +        };
> > > +
> > > +        switch (data->zone_report_data.zones[j].type) {
> > > +        case BLK_ZT_CONV:
> > > +            desc.z_type = BLK_ZONE_TYPE_CONVENTIONAL;
> > > +            break;
> > > +        case BLK_ZT_SWR:
> > > +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_REQ;
> > > +            break;
> > > +        case BLK_ZT_SWP:
> > > +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_PREF;
> > > +            break;
> > > +        default:
> > > +            g_assert_not_reached();
> > > +        }
> > 
> > why do we need to convert these to system-defined values?
> > The output of this function will be sent to the driver that
> > uses the values defined in virtio-zbd spec. And the conversion
> > to BLK_ZS_xxx/BLK_ZT_xxx has already been done in parse_zone()
> > function in file-posix.c
> 
> Because QEMU may use different macro values compared to Linux. It is
> for avoiding passing wrong values to the guest. Parse_zone() converts
> Linux macros to QEMU macros and QEMU macros will change back to
> Linux's defined values here.

You are not yet passing anything to the guest OS at this point. You are
sending the zone report to the driver using the format defined in
virtio-zbd spec. The driver then will have to convert the
BLK_ZT_xxx/BLK_ZS_xxx values to the corresponding values defined in the
driver environment which, in general, may be something else besides Linux.

Perhaps you are being confused by the fact that the virtblk_parse_zone()
function in the current version of the Linux virtio_blk driver patch does
not perform the explicit conversion from VIRTIO_BLK_ZT_xxx to
BLK_ZONE_TYPE_xxx and from VIRTIO_BLK_ZS_xxx to BLK_ZONE_COND_xxx
values. It is an optimization that is possible because we know a priori
that the actual integer values for the both sets of definitions are the
same.

But that function is the appropriate place to convert from the
virtio-defined zone types and states to the system values, not this part
of your code. Perhaps I should add similar switch statements to the
driver code for the sake of code clarity and robustness. It will add
a bit of overhead for zone report processing, but these requests
are rare and the overall performance should not be affected.
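
For concreteness, a hypothetical sketch of such an explicit guest-side
conversion; the helper name is made up and this is not the actual
virtio_blk driver code:

/*
 * Hypothetical sketch of an explicit conversion in the guest driver's
 * zone report parsing, so it no longer depends on the numeric values
 * of the virtio and kernel definitions coinciding.
 */
static int virtblk_wire_zone_type_to_blk(u8 z_type)
{
	switch (z_type) {
	case VIRTIO_BLK_ZT_CONV:
		return BLK_ZONE_TYPE_CONVENTIONAL;
	case VIRTIO_BLK_ZT_SWR:
		return BLK_ZONE_TYPE_SEQWRITE_REQ;
	case VIRTIO_BLK_ZT_SWP:
		return BLK_ZONE_TYPE_SEQWRITE_PREF;
	default:
		return -EINVAL;
	}
}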

DF

> 
> > 
> > > +
> > > +        switch (data->zone_report_data.zones[j].cond) {
> > > +        case BLK_ZS_RDONLY:
> > > +            desc.z_state = BLK_ZONE_COND_READONLY;
> > > +            break;
> > > +        case BLK_ZS_OFFLINE:
> > > +            desc.z_state = BLK_ZONE_COND_OFFLINE;
> > > +            break;
> > > +        case BLK_ZS_EMPTY:
> > > +            desc.z_state = BLK_ZONE_COND_EMPTY;
> > > +            break;
> > > +        case BLK_ZS_CLOSED:
> > > +            desc.z_state = BLK_ZONE_COND_CLOSED;
> > > +            break;
> > > +        case BLK_ZS_FULL:
> > > +            desc.z_state = BLK_ZONE_COND_FULL;
> > > +            break;
> > > +        case BLK_ZS_EOPEN:
> > > +            desc.z_state = BLK_ZONE_COND_EXP_OPEN;
> > > +            break;
> > > +        case BLK_ZS_IOPEN:
> > > +            desc.z_state = BLK_ZONE_COND_IMP_OPEN;
> > > +            break;
> > > +        case BLK_ZS_NOT_WP:
> > > +            desc.z_state = BLK_ZONE_COND_NOT_WP;
> > > +            break;
> > > +        default:
> > > +            g_assert_not_reached();
> > > +        }
> > 
> > same here. I think the code to form the descriptor can be reduced to
> > 
> > +        struct virtio_blk_zone_descriptor desc = (struct
> > virtio_blk_zone_descriptor) {
> > +            .z_start = cpu_to_le64(data->zone_report_data.zones[j].start >>
> > BDRV_SECTOR_BITS),
> > +            .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap >>
> > BDRV_SECTOR_BITS),
> > +            .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp >>
> > BDRV_SECTOR_BITS),
> > +            .z_type = data->zone_report_data.zones[j].type,
> > +            .z_state = data->zone_report_data.zones[j].state,
> > +        };
> > 
> > > +
> > > +        /* TODO: it takes O(n^2) time complexity. Optimizations required.
> > > */
> > > +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> > > +        if (n != sizeof(desc)) {
> > > +            virtio_error(vdev, "Driver provided input buffer "
> > > +                               "for descriptors that is too small!");
> > > +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +            goto out;
> > > +        }
> > > +    }
> > > +    goto out;
> > 
> > this goto is not needed.
> > 
> > > +
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    g_free(data->zone_report_data.zones);
> > > +    g_free(data);
> > > +}
> > > +
> > > +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req)
> > > +{
> > > +    VirtIOBlock *s = req->dev;
> > > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > +    unsigned int nr_zones;
> > > +    ZoneCmdData *data;
> > > +    int64_t zone_size, offset;
> > > +    uint8_t err_status;
> > > +
> > > +    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
> > > +            sizeof(struct virtio_blk_zone_report) +
> > > +            sizeof(struct virtio_blk_zone_descriptor)) {
> > > +        virtio_error(vdev, "in buffer too small for zone report");
> > > +        return -1;
> > > +    }
> > > +
> > > +    /* start byte offset of the zone report */
> > > +    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > > +        goto out;
> > > +    }
> > > +
> > > +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> > > +                sizeof(struct virtio_blk_zone_report)) /
> > > +               sizeof(struct virtio_blk_zone_descriptor);
> > > +
> > > +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> > > +    data = g_malloc(sizeof(ZoneCmdData));
> > > +    data->req = req;
> > > +    data->zone_report_data.nr_zones = nr_zones;
> > > +    data->zone_report_data.zones = g_malloc(zone_size),
> > > +
> > > +    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
> > > +                        data->zone_report_data.zones,
> > > +                        virtio_blk_zone_report_complete, data);
> > > +    return 0;
> > > +
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    return err_status;
> > > +}
> > > +
> > > +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
> > > +{
> > > +    ZoneCmdData *data = opaque;
> > > +    VirtIOBlockReq *req = data->req;
> > > +    VirtIOBlock *s = req->dev;
> > > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > > +
> > > +    if (ret) {
> > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +        goto out;
> > > +    }
> > > +    goto out;
> > > +
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    g_free(data);
> > > +}
> > > +
> > > +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp
> > > op)
> > > +{
> > > +    VirtIOBlock *s = req->dev;
> > > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > +    BlockDriverState *bs = blk_bs(s->blk);
> > > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > > +    uint64_t len;
> > > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > > +
> > > +    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
> > > +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> > > +        /* Entire drive capacity */
> > > +        offset = 0;
> > > +        len = bs->bl.capacity;
> > > +    } else {
> > > +        if (bs->bl.zone_size > bs->bl.capacity - offset) {
> > > +            /* The zoned device allows the last smaller zone. */
> > > +            len = bs->bl.capacity - bs->bl.zone_size * (bs->bl.nr_zones -
> > > 1);
> > > +        } else {
> > > +            len = bs->bl.zone_size;
> > > +        }
> > > +    }
> > > +
> > > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > > +        goto out;
> > > +    }
> > > +
> > > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > > +    data->req = req;
> > > +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> > > +                      virtio_blk_zone_mgmt_complete, data);
> > > +
> > > +    return 0;
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    return err_status;
> > > +}
> > > +
> > > +static void virtio_blk_zone_append_complete(void *opaque, int ret)
> > > +{
> > > +    ZoneCmdData *data = opaque;
> > > +    VirtIOBlockReq *req = data->req;
> > > +    VirtIOBlock *s = req->dev;
> > > +    struct iovec *in_iov = req->elem.in_sg;
> > > +    unsigned in_num = req->elem.in_num;
> > > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > > +
> > > +    if (ret) {
> > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +        goto out;
> > > +    }
> > > +
> > > +    struct virtio_blk_zone_append_inhdr zap_hdr =
> > > +        (struct virtio_blk_zone_append_inhdr) {
> > > +        .append_sector = cpu_to_le64(data->zone_append_data.offset
> > > +            >> BDRV_SECTOR_BITS),
> > > +    };
> > > +    iov_from_buf(in_iov, in_num, 0, &zap_hdr, sizeof(zap_hdr));
> > > +    goto out;
> > > +
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    g_free(data);
> > > +}
> > > +
> > > +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
> > > +                                         struct iovec *out_iov,
> > > +                                         uint64_t niov) {
> > > +    VirtIOBlock *s = req->dev;
> > > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > > +
> > > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > > +    int64_t len = iov_size(out_iov, niov);
> > > +
> > > +    if (!check_zoned_request(s, offset, len, true, &err_status)) {
> > > +        goto out;
> > > +    }
> > > +
> > > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > > +    data->req = req;
> > > +    data->zone_append_data.offset = offset;
> > > +    qemu_iovec_init_external(&req->qiov, out_iov, niov);
> > > +    blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req-
> > > >qiov, 0,
> > > +                        virtio_blk_zone_append_complete, data);
> > > +    return 0;
> > > +
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    return err_status;
> > > +}
> > > +
> > >  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer
> > > *mrb)
> > >  {
> > >      uint32_t type;
> > > @@ -624,7 +974,7 @@ static int virtio_blk_handle_request(VirtIOBlockReq
> > > *req,
> > > MultiReqBuffer *mrb)
> > > 
> > >      /* We always touch the last byte, so just see how big in_iov is.  */
> > >      req->in_len = iov_size(in_iov, in_num);
> > > -    req->in = (void *)in_iov[in_num - 1].iov_base
> > > +    req->in.in_hdr = (void *)in_iov[in_num - 1].iov_base
> > >                + in_iov[in_num - 1].iov_len
> > >                - sizeof(struct virtio_blk_inhdr);
> > >      iov_discard_back_undoable(in_iov, &in_num, sizeof(struct
> > > virtio_blk_inhdr),
> > > @@ -678,6 +1028,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq
> > > *req,
> > > MultiReqBuffer *mrb)
> > >      case VIRTIO_BLK_T_FLUSH:
> > >          virtio_blk_handle_flush(req, mrb);
> > >          break;
> > > +    case VIRTIO_BLK_T_ZONE_REPORT:
> > > +        virtio_blk_handle_zone_report(req);
> > > +        break;
> > > +    case VIRTIO_BLK_T_ZONE_OPEN:
> > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> > > +        break;
> > > +    case VIRTIO_BLK_T_ZONE_CLOSE:
> > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> > > +        break;
> > > +    case VIRTIO_BLK_T_ZONE_FINISH:
> > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> > > +        break;
> > > +    case VIRTIO_BLK_T_ZONE_RESET:
> > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > > +        break;
> > > +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > > +        break;
> > >      case VIRTIO_BLK_T_SCSI_CMD:
> > >          virtio_blk_handle_scsi(req);
> > >          break;
> > > @@ -696,6 +1064,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq
> > > *req,
> > > MultiReqBuffer *mrb)
> > >          virtio_blk_free_request(req);
> > >          break;
> > >      }
> > > +    case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
> > > +        /*
> > > +         * It is not safe to access req->elem.out_sg directly because it
> > > +         * may be modified by virtio_blk_handle_request().
> > > +         */
> > > +        virtio_blk_handle_zone_append(req, out_iov, out_num);
> > > +        break;
> > >      /*
> > >       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
> > >       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch
> > > statement,
> > > @@ -895,6 +1270,7 @@ static void virtio_blk_update_config(VirtIODevice
> > > *vdev,
> > > uint8_t *config)
> > >  {
> > >      VirtIOBlock *s = VIRTIO_BLK(vdev);
> > >      BlockConf *conf = &s->conf.conf;
> > > +    BlockDriverState *bs = blk_bs(s->blk);
> > >      struct virtio_blk_config blkcfg;
> > >      uint64_t capacity;
> > >      int64_t length;
> > > @@ -954,6 +1330,30 @@ static void virtio_blk_update_config(VirtIODevice
> > > *vdev,
> > > uint8_t *config)
> > >          blkcfg.write_zeroes_may_unmap = 1;
> > >          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
> > >      }
> > > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > > +        switch (bs->bl.zoned) {
> > > +        case BLK_Z_HM:
> > > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> > > +            break;
> > > +        case BLK_Z_HA:
> > > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
> > > +            break;
> > > +        default:
> > > +            g_assert_not_reached();
> > > +        }
> > > +
> > > +        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> > > +                     bs->bl.zone_size / 512);
> > > +        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> > > +                     bs->bl.max_active_zones);
> > > +        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> > > +                     bs->bl.max_open_zones);
> > > +        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> > > +        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> > > +                     bs->bl.max_append_sectors);
> > > +    } else {
> > > +        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> > > +    }
> > >      memcpy(config, &blkcfg, s->config_size);
> > >  }
> > > 
> > > @@ -1118,6 +1518,7 @@ static void virtio_blk_device_realize(DeviceState
> > > *dev,
> > > Error **errp)
> > >      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > >      VirtIOBlock *s = VIRTIO_BLK(dev);
> > >      VirtIOBlkConf *conf = &s->conf;
> > > +    BlockDriverState *bs = blk_bs(conf->conf.blk);
> > >      Error *err = NULL;
> > >      unsigned i;
> > > 
> > > @@ -1163,6 +1564,13 @@ static void virtio_blk_device_realize(DeviceState
> > > *dev,
> > > Error **errp)
> > >          return;
> > >      }
> > > 
> > > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > > +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> > > +        if (bs->bl.zoned == BLK_Z_HM) {
> > > +            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
> > > +        }
> > > +    }
> > > +
> > >      if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
> > >          (!conf->max_discard_sectors ||
> > >           conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
> > > diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-
> > > blk.h
> > > index d311c57cca..88901a9f55 100644
> > > --- a/include/hw/virtio/virtio-blk.h
> > > +++ b/include/hw/virtio/virtio-blk.h
> > > @@ -30,6 +30,12 @@ struct virtio_blk_inhdr
> > >      unsigned char status;
> > >  };
> > > 
> > > +struct virtio_blk_zone_append_inhdr {
> > > +    unsigned char status;
> > > +    uint8_t reserved[7];
> > > +    int64_t append_sector;
> > > +};
> > > +
> > >  #define VIRTIO_BLK_AUTO_NUM_QUEUES UINT16_MAX
> > > 
> > >  struct VirtIOBlkConf
> > > @@ -73,7 +79,10 @@ typedef struct VirtIOBlockReq {
> > >      VirtQueue *vq;
> > >      IOVDiscardUndo inhdr_undo;
> > >      IOVDiscardUndo outhdr_undo;
> > > -    struct virtio_blk_inhdr *in;
> > > +    union {
> > > +        struct virtio_blk_inhdr *in_hdr;
> > > +        struct virtio_blk_zone_append_inhdr *zone_append_inhdr;
> > > +    } in;
> > >      struct virtio_blk_outhdr out;
> > >      QEMUIOVector qiov;
> > >      size_t in_len;
> > 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices
  2022-10-23  2:08       ` Dmitry Fomichev
@ 2022-10-23  3:01         ` Sam Li
  0 siblings, 0 replies; 9+ messages in thread
From: Sam Li @ 2022-10-23  3:01 UTC (permalink / raw)
  To: Dmitry Fomichev
  Cc: hreitz, hare, qemu-devel, stefanha, qemu-block, kwolf,
	raphael.norwitz, mst, damien.lemoal

Dmitry Fomichev <Dmitry.Fomichev@wdc.com> 于2022年10月23日周日 10:08写道:
>
> On Tue, 2022-10-18 at 16:56 +0800, Sam Li wrote:
> > Dmitry Fomichev <Dmitry.Fomichev@wdc.com> 于2022年10月17日周一 09:01写道:
> > >
> > > On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote:
> > > > This patch extends virtio-blk emulation to handle zoned device commands
> > > > by calling the new block layer APIs to perform zoned device I/O on
> > > > behalf of the guest. It supports Report Zone, four zone oparations (open,
> > > > close, finish, reset), and Append Zone.
> > > >
> > > > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> > > > support zoned block devices. Regular block devices(conventional zones)
> > > > will not be set.
> > > >
> > > > Then the guest os can use blkzone(8) to test those commands on zoned
> > > > devices.
> > > > Furthermore, using zonefs to test zone append write is also supported.
> > > >
> > > > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > > > ---
> > > >  hw/block/virtio-blk-common.c   |   2 +
> > > >  hw/block/virtio-blk.c          | 412 ++++++++++++++++++++++++++++++++-
> > > >  include/hw/virtio/virtio-blk.h |  11 +-
> > > >  3 files changed, 422 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
> > > > index ac52d7c176..e2f8e2f6da 100644
> > > > --- a/hw/block/virtio-blk-common.c
> > > > +++ b/hw/block/virtio-blk-common.c
> > > > @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
> > > >       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
> > > >      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
> > > >       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> > > > +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> > > > +     .end = endof(struct virtio_blk_config, zoned)},
> > > >      {}
> > > >  };
> > > >
> > > > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> > > > index 8131ec2dbc..58891aea31 100644
> > > > --- a/hw/block/virtio-blk.c
> > > > +++ b/hw/block/virtio-blk.c
> > > > @@ -26,6 +26,9 @@
> > > >  #include "hw/virtio/virtio-blk.h"
> > > >  #include "dataplane/virtio-blk.h"
> > > >  #include "scsi/constants.h"
> > > > +#if defined(CONFIG_BLKZONED)
> > > > +#include <linux/blkzoned.h>
> > > > +#endif
> > > >  #ifdef __linux__
> > > >  # include <scsi/sg.h>
> > > >  #endif
> > > > @@ -55,10 +58,29 @@ static void virtio_blk_req_complete(VirtIOBlockReq
> > > > *req,
> > > > unsigned char status)
> > > >  {
> > > >      VirtIOBlock *s = req->dev;
> > > >      VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > > +    int64_t inhdr_len, n;
> > > > +    void *buf;
> > > >
> > > >      trace_virtio_blk_req_complete(vdev, req, status);
> > > >
> > > > -    stb_p(&req->in->status, status);
> > > > +    iov_discard_undo(&req->inhdr_undo);
> > > > +    if (virtio_ldl_p(vdev, &req->out.type) == VIRTIO_BLK_T_ZONE_APPEND) {
> > > > +        inhdr_len = sizeof(struct virtio_blk_zone_append_inhdr);
> > > > +        req->in.in_hdr->status = status;
> > > > +        buf = req->in.in_hdr;
> > > > +    } else {
> > > > +        inhdr_len = sizeof(struct virtio_blk_inhdr);
> > > > +        req->in.zone_append_inhdr->status = status;
> > > > +        buf = req->in.zone_append_inhdr;
> > > > +    }
> > > > +
> > > > +    n = iov_from_buf(req->elem.in_sg, req->elem.in_num,
> > > > +                     req->in_len - inhdr_len, buf, inhdr_len);
> > > > +    if (n != inhdr_len) {
> > > > +        virtio_error(vdev, "Driver provided input buffer less than size of
> > > > "
> > > > +                     "in header");
> > > > +    }
> > > > +
> > > >      iov_discard_undo(&req->inhdr_undo);
> > > >      iov_discard_undo(&req->outhdr_undo);
> > > >      virtqueue_push(req->vq, &req->elem, req->in_len);
> > > > @@ -592,6 +614,334 @@ err:
> > > >      return err_status;
> > > >  }
> > > >
> > > > +typedef struct ZoneCmdData {
> > > > +    VirtIOBlockReq *req;
> > > > +    union {
> > > > +        struct {
> > > > +            unsigned int nr_zones;
> > > > +            BlockZoneDescriptor *zones;
> > > > +        } zone_report_data;
> > > > +        struct {
> > > > +            int64_t offset;
> > > > +        } zone_append_data;
> > > > +    };
> > > > +} ZoneCmdData;
> > > > +
> > > > +/*
> > > > + * check zoned_request: error checking before issuing requests. If all checks
> > > > + * passed, return true.
> > > > + * append: true if only zone append requests issued.
> > > > + */
> > > > +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> > > > +                             bool append, uint8_t *status) {
> > > > +    BlockDriverState *bs = blk_bs(s->blk);
> > > > +    int index = offset / bs->bl.zone_size;
> > > > +
> > > > +    if (offset < 0 || len < 0 || offset > bs->bl.capacity - len) {
> > > > +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > > +        return false;
> > > > +    }
> > > > +
> > > > +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> > > > +        *status = VIRTIO_BLK_S_UNSUPP;
> > > > +        return false;
> > > > +    }
> > > > +
> > > > +    if (append) {
> > > > +        if ((offset % bs->bl.write_granularity) != 0) {
> > > > +            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> > > > +            return false;
> > > > +        }
> > > > +
> > > > +        if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
> > > > +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > > +            return false;
> > > > +        }
> > > > +
> > > > +        if (len / 512 > bs->bl.max_append_sectors) {
> > > > +            if (bs->bl.max_append_sectors == 0) {
> > > > +                *status = VIRTIO_BLK_S_UNSUPP;
> > > > +            } else {
> > > > +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > > +            }
> > > > +            return false;
> > > > +        }
> > > > +    }
> > > > +    return true;
> > > > +}
> > > > +
> > > > +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> > > > +{
> > > > +    ZoneCmdData *data = opaque;
> > > > +    VirtIOBlockReq *req = data->req;
> > > > +    VirtIOBlock *s = req->dev;
> > > > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > > > +    struct iovec *in_iov = req->elem.in_sg;
> > > > +    unsigned in_num = req->elem.in_num;
> > > > +    int64_t zrp_size, nz, n, j = 0;
> > > > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > > > +
> > > > +    if (ret) {
> > > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    nz = data->zone_report_data.nr_zones;
> > > > +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> > > > +            .nr_zones = cpu_to_le64(nz),
> > > > +    };
> > > > +
> > > > +    zrp_size = sizeof(struct virtio_blk_zone_report)
> > > > +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> > > > +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> > > > +    if (n != sizeof(zrp_hdr)) {
> > > > +        virtio_error(vdev, "Driver provided intput buffer that is too
> > > > small!");
> > >
> > > s/intput/input
> > >
> > > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
> > > > +        i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> > > > +        struct virtio_blk_zone_descriptor desc =
> > > > +            (struct virtio_blk_zone_descriptor) {
> > > > +                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start)
> > > > +                    >> BDRV_SECTOR_BITS,
> > > > +                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap)
> > > > +                    >> BDRV_SECTOR_BITS,
> > > > +                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp)
> > > > +                    >> BDRV_SECTOR_BITS,
> > >
> > > I think these should be
> > >
> > > +                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
> > > +                    >> BDRV_SECTOR_BITS),
> > > +                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
> > > +                    >> BDRV_SECTOR_BITS),
> > > +                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
> > > +                    >> BDRV_SECTOR_BITS),
> > >
> > > > +        };
> > > > +
> > > > +        switch (data->zone_report_data.zones[j].type) {
> > > > +        case BLK_ZT_CONV:
> > > > +            desc.z_type = BLK_ZONE_TYPE_CONVENTIONAL;
> > > > +            break;
> > > > +        case BLK_ZT_SWR:
> > > > +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_REQ;
> > > > +            break;
> > > > +        case BLK_ZT_SWP:
> > > > +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_PREF;
> > > > +            break;
> > > > +        default:
> > > > +            g_assert_not_reached();
> > > > +        }
> > >
> > > why do we need to convert these to system-defined values?
> > > The output of this function will be sent to the driver that
> > > uses the values defined in virtio-zbd spec. And the conversion
> > > to BLK_ZS_xxx/BLK_ZT_xxx has already been done in parse_zone()
> > > function in file-posix.c
> >
> > Because QEMU may use different macro values than Linux, the conversion
> > avoids passing wrong values to the guest. parse_zone() converts the
> > Linux macros to QEMU macros, and the QEMU macros are converted back to
> > Linux's defined values here.
>
> You are not yet passing anything to the guest OS at this point. You are
> sending the zone report to the driver using the format defined in
> virtio-zbd spec. The driver then will have to convert the
> BLK_ZT_xxx/BLK_ZS_xxx values to the corresponding values defined in the
> driver environment which, in general, may be something else besides Linux.
>
> Perhaps you are being confused by the fact that the virtblk_parse_zone()
> function in the current version of the Linux virtio_blk driver patch does
> not perform the explicit conversion from VIRTIO_BLK_ZT_xxx to
> BLK_ZONE_TYPE_xxx and from VIRTIO_BLK_ZS_xxx to BLK_ZONE_COND_xxx
> values. It is an optimization that is possible because we know a priori
> that the actual integer values for both sets of definitions are the
> same.
>
> But it is that function that is the appropriate place to convert from
> virtio-defined zone types and states to the system values, not this part
> of your code. Perhaps I should add similar switch statements to the
> driver code for the sake of code clarity and robustness. It will add
> a bit of overhead for zone report processing, but these requests
> are rare and the overall performance should not be affected.

I see. The switch cases were added to avoid assuming that the QEMU zone
type and state values always match the virtio-blk constants. If the values
are guaranteed to match, I'll go back to the version without the switch
statements and document that assumption instead (rough sketch below).
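
For reference, here is a rough, untested sketch of what the no-switch
version could look like, assuming the VIRTIO_BLK_ZT_*/VIRTIO_BLK_ZS_*
codes from the updated virtio_blk.h header really share the integer
values of QEMU's BLK_ZT_*/BLK_ZS_* definitions (the exact constant names
below are my guess, not checked against the header):

    /*
     * Hypothetical no-switch variant for virtio_blk_zone_report_complete():
     * document the value-equality assumption with build-time checks instead
     * of per-zone switch statements. Constant names are assumed and would
     * need to match the real header definitions.
     */
    QEMU_BUILD_BUG_ON(BLK_ZT_CONV != VIRTIO_BLK_ZT_CONV);
    QEMU_BUILD_BUG_ON(BLK_ZT_SWR != VIRTIO_BLK_ZT_SWR);
    QEMU_BUILD_BUG_ON(BLK_ZT_SWP != VIRTIO_BLK_ZT_SWP);
    QEMU_BUILD_BUG_ON(BLK_ZS_EMPTY != VIRTIO_BLK_ZS_EMPTY);
    QEMU_BUILD_BUG_ON(BLK_ZS_FULL != VIRTIO_BLK_ZS_FULL);
    /* ...one check per remaining zone state... */

    /* With the checks in place, the descriptor can be filled directly. */
    desc.z_type = data->zone_report_data.zones[j].type;
    desc.z_state = data->zone_report_data.zones[j].cond;

If the two sets of values ever diverge, the build fails instead of the
device silently reporting wrong zone types/states to the driver.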

>
> DF
>
> >
> > >
> > > > +
> > > > +        switch (data->zone_report_data.zones[j].cond) {
> > > > +        case BLK_ZS_RDONLY:
> > > > +            desc.z_state = BLK_ZONE_COND_READONLY;
> > > > +            break;
> > > > +        case BLK_ZS_OFFLINE:
> > > > +            desc.z_state = BLK_ZONE_COND_OFFLINE;
> > > > +            break;
> > > > +        case BLK_ZS_EMPTY:
> > > > +            desc.z_state = BLK_ZONE_COND_EMPTY;
> > > > +            break;
> > > > +        case BLK_ZS_CLOSED:
> > > > +            desc.z_state = BLK_ZONE_COND_CLOSED;
> > > > +            break;
> > > > +        case BLK_ZS_FULL:
> > > > +            desc.z_state = BLK_ZONE_COND_FULL;
> > > > +            break;
> > > > +        case BLK_ZS_EOPEN:
> > > > +            desc.z_state = BLK_ZONE_COND_EXP_OPEN;
> > > > +            break;
> > > > +        case BLK_ZS_IOPEN:
> > > > +            desc.z_state = BLK_ZONE_COND_IMP_OPEN;
> > > > +            break;
> > > > +        case BLK_ZS_NOT_WP:
> > > > +            desc.z_state = BLK_ZONE_COND_NOT_WP;
> > > > +            break;
> > > > +        default:
> > > > +            g_assert_not_reached();
> > > > +        }
> > >
> > > same here. I think the code to form the descriptor can be reduced to
> > >
> > > +        struct virtio_blk_zone_descriptor desc = (struct virtio_blk_zone_descriptor) {
> > > +            .z_start = cpu_to_le64(data->zone_report_data.zones[j].start >> BDRV_SECTOR_BITS),
> > > +            .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap >> BDRV_SECTOR_BITS),
> > > +            .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp >> BDRV_SECTOR_BITS),
> > > +            .z_type = data->zone_report_data.zones[j].type,
> > > +            .z_state = data->zone_report_data.zones[j].state,
> > > +        };
> > >
> > > > +
> > > > +        /* TODO: it takes O(n^2) time complexity. Optimizations required. */
> > > > +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> > > > +        if (n != sizeof(desc)) {
> > > > +            virtio_error(vdev, "Driver provided input buffer "
> > > > +                               "for descriptors that is too small!");
> > > > +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > > +            goto out;
> > > > +        }
> > > > +    }
> > > > +    goto out;
> > >
> > > this goto is not needed.
> > >
> > > > +
> > > > +out:
> > > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > > +    virtio_blk_req_complete(req, err_status);
> > > > +    virtio_blk_free_request(req);
> > > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > > +    g_free(data->zone_report_data.zones);
> > > > +    g_free(data);
> > > > +}
> > > > +
> > > > +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req)
> > > > +{
> > > > +    VirtIOBlock *s = req->dev;
> > > > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > > +    unsigned int nr_zones;
> > > > +    ZoneCmdData *data;
> > > > +    int64_t zone_size, offset;
> > > > +    uint8_t err_status;
> > > > +
> > > > +    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
> > > > +            sizeof(struct virtio_blk_zone_report) +
> > > > +            sizeof(struct virtio_blk_zone_descriptor)) {
> > > > +        virtio_error(vdev, "in buffer too small for zone report");
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    /* start byte offset of the zone report */
> > > > +    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > > > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> > > > +                sizeof(struct virtio_blk_zone_report)) /
> > > > +               sizeof(struct virtio_blk_zone_descriptor);
> > > > +
> > > > +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> > > > +    data = g_malloc(sizeof(ZoneCmdData));
> > > > +    data->req = req;
> > > > +    data->zone_report_data.nr_zones = nr_zones;
> > > > +    data->zone_report_data.zones = g_malloc(zone_size),
> > > > +
> > > > +    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
> > > > +                        data->zone_report_data.zones,
> > > > +                        virtio_blk_zone_report_complete, data);
> > > > +    return 0;
> > > > +
> > > > +out:
> > > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > > +    virtio_blk_req_complete(req, err_status);
> > > > +    virtio_blk_free_request(req);
> > > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > > +    return err_status;
> > > > +}
> > > > +
> > > > +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
> > > > +{
> > > > +    ZoneCmdData *data = opaque;
> > > > +    VirtIOBlockReq *req = data->req;
> > > > +    VirtIOBlock *s = req->dev;
> > > > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > > > +
> > > > +    if (ret) {
> > > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > > +        goto out;
> > > > +    }
> > > > +    goto out;
> > > > +
> > > > +out:
> > > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > > +    virtio_blk_req_complete(req, err_status);
> > > > +    virtio_blk_free_request(req);
> > > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > > +    g_free(data);
> > > > +}
> > > > +
> > > > +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
> > > > +{
> > > > +    VirtIOBlock *s = req->dev;
> > > > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > > +    BlockDriverState *bs = blk_bs(s->blk);
> > > > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > > > +    uint64_t len;
> > > > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > > > +
> > > > +    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
> > > > +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> > > > +        /* Entire drive capacity */
> > > > +        offset = 0;
> > > > +        len = bs->bl.capacity;
> > > > +    } else {
> > > > +        if (bs->bl.zone_size > bs->bl.capacity - offset) {
> > > > +            /* The zoned device allows the last smaller zone. */
> > > > +            len = bs->bl.capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
> > > > +        } else {
> > > > +            len = bs->bl.zone_size;
> > > > +        }
> > > > +    }
> > > > +
> > > > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > > > +    data->req = req;
> > > > +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> > > > +                      virtio_blk_zone_mgmt_complete, data);
> > > > +
> > > > +    return 0;
> > > > +out:
> > > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > > +    virtio_blk_req_complete(req, err_status);
> > > > +    virtio_blk_free_request(req);
> > > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > > +    return err_status;
> > > > +}
> > > > +
> > > > +static void virtio_blk_zone_append_complete(void *opaque, int ret)
> > > > +{
> > > > +    ZoneCmdData *data = opaque;
> > > > +    VirtIOBlockReq *req = data->req;
> > > > +    VirtIOBlock *s = req->dev;
> > > > +    struct iovec *in_iov = req->elem.in_sg;
> > > > +    unsigned in_num = req->elem.in_num;
> > > > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > > > +
> > > > +    if (ret) {
> > > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    struct virtio_blk_zone_append_inhdr zap_hdr =
> > > > +        (struct virtio_blk_zone_append_inhdr) {
> > > > +        .append_sector = cpu_to_le64(data->zone_append_data.offset
> > > > +            >> BDRV_SECTOR_BITS),
> > > > +    };
> > > > +    iov_from_buf(in_iov, in_num, 0, &zap_hdr, sizeof(zap_hdr));
> > > > +    goto out;
> > > > +
> > > > +out:
> > > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > > +    virtio_blk_req_complete(req, err_status);
> > > > +    virtio_blk_free_request(req);
> > > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > > +    g_free(data);
> > > > +}
> > > > +
> > > > +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
> > > > +                                         struct iovec *out_iov,
> > > > +                                         uint64_t niov) {
> > > > +    VirtIOBlock *s = req->dev;
> > > > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > > > +
> > > > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > > > +    int64_t len = iov_size(out_iov, niov);
> > > > +
> > > > +    if (!check_zoned_request(s, offset, len, true, &err_status)) {
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > > > +    data->req = req;
> > > > +    data->zone_append_data.offset = offset;
> > > > +    qemu_iovec_init_external(&req->qiov, out_iov, niov);
> > > > +    blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
> > > > +                        virtio_blk_zone_append_complete, data);
> > > > +    return 0;
> > > > +
> > > > +out:
> > > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > > +    virtio_blk_req_complete(req, err_status);
> > > > +    virtio_blk_free_request(req);
> > > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > > +    return err_status;
> > > > +}
> > > > +
> > > >  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> > > >  {
> > > >      uint32_t type;
> > > > @@ -624,7 +974,7 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> > > >
> > > >      /* We always touch the last byte, so just see how big in_iov is.  */
> > > >      req->in_len = iov_size(in_iov, in_num);
> > > > -    req->in = (void *)in_iov[in_num - 1].iov_base
> > > > +    req->in.in_hdr = (void *)in_iov[in_num - 1].iov_base
> > > >                + in_iov[in_num - 1].iov_len
> > > >                - sizeof(struct virtio_blk_inhdr);
> > > >      iov_discard_back_undoable(in_iov, &in_num, sizeof(struct virtio_blk_inhdr),
> > > > @@ -678,6 +1028,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> > > >      case VIRTIO_BLK_T_FLUSH:
> > > >          virtio_blk_handle_flush(req, mrb);
> > > >          break;
> > > > +    case VIRTIO_BLK_T_ZONE_REPORT:
> > > > +        virtio_blk_handle_zone_report(req);
> > > > +        break;
> > > > +    case VIRTIO_BLK_T_ZONE_OPEN:
> > > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> > > > +        break;
> > > > +    case VIRTIO_BLK_T_ZONE_CLOSE:
> > > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> > > > +        break;
> > > > +    case VIRTIO_BLK_T_ZONE_FINISH:
> > > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> > > > +        break;
> > > > +    case VIRTIO_BLK_T_ZONE_RESET:
> > > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > > > +        break;
> > > > +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> > > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > > > +        break;
> > > >      case VIRTIO_BLK_T_SCSI_CMD:
> > > >          virtio_blk_handle_scsi(req);
> > > >          break;
> > > > @@ -696,6 +1064,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> > > >          virtio_blk_free_request(req);
> > > >          break;
> > > >      }
> > > > +    case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
> > > > +        /*
> > > > +         * It is not safe to access req->elem.out_sg directly because it
> > > > +         * may be modified by virtio_blk_handle_request().
> > > > +         */
> > > > +        virtio_blk_handle_zone_append(req, out_iov, out_num);
> > > > +        break;
> > > >      /*
> > > >       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
> > > >       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch
> > > > statement,
> > > > @@ -895,6 +1270,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> > > >  {
> > > >      VirtIOBlock *s = VIRTIO_BLK(vdev);
> > > >      BlockConf *conf = &s->conf.conf;
> > > > +    BlockDriverState *bs = blk_bs(s->blk);
> > > >      struct virtio_blk_config blkcfg;
> > > >      uint64_t capacity;
> > > >      int64_t length;
> > > > @@ -954,6 +1330,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> > > >          blkcfg.write_zeroes_may_unmap = 1;
> > > >          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
> > > >      }
> > > > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > > > +        switch (bs->bl.zoned) {
> > > > +        case BLK_Z_HM:
> > > > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> > > > +            break;
> > > > +        case BLK_Z_HA:
> > > > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
> > > > +            break;
> > > > +        default:
> > > > +            g_assert_not_reached();
> > > > +        }
> > > > +
> > > > +        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> > > > +                     bs->bl.zone_size / 512);
> > > > +        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> > > > +                     bs->bl.max_active_zones);
> > > > +        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> > > > +                     bs->bl.max_open_zones);
> > > > +        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> > > > +        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> > > > +                     bs->bl.max_append_sectors);
> > > > +    } else {
> > > > +        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> > > > +    }
> > > >      memcpy(config, &blkcfg, s->config_size);
> > > >  }
> > > >
> > > > @@ -1118,6 +1518,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
> > > >      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > > >      VirtIOBlock *s = VIRTIO_BLK(dev);
> > > >      VirtIOBlkConf *conf = &s->conf;
> > > > +    BlockDriverState *bs = blk_bs(conf->conf.blk);
> > > >      Error *err = NULL;
> > > >      unsigned i;
> > > >
> > > > @@ -1163,6 +1564,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
> > > >          return;
> > > >      }
> > > >
> > > > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > > > +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> > > > +        if (bs->bl.zoned == BLK_Z_HM) {
> > > > +            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
> > > > +        }
> > > > +    }
> > > > +
> > > >      if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
> > > >          (!conf->max_discard_sectors ||
> > > >           conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
> > > > diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
> > > > index d311c57cca..88901a9f55 100644
> > > > --- a/include/hw/virtio/virtio-blk.h
> > > > +++ b/include/hw/virtio/virtio-blk.h
> > > > @@ -30,6 +30,12 @@ struct virtio_blk_inhdr
> > > >      unsigned char status;
> > > >  };
> > > >
> > > > +struct virtio_blk_zone_append_inhdr {
> > > > +    unsigned char status;
> > > > +    uint8_t reserved[7];
> > > > +    int64_t append_sector;
> > > > +};
> > > > +
> > > >  #define VIRTIO_BLK_AUTO_NUM_QUEUES UINT16_MAX
> > > >
> > > >  struct VirtIOBlkConf
> > > > @@ -73,7 +79,10 @@ typedef struct VirtIOBlockReq {
> > > >      VirtQueue *vq;
> > > >      IOVDiscardUndo inhdr_undo;
> > > >      IOVDiscardUndo outhdr_undo;
> > > > -    struct virtio_blk_inhdr *in;
> > > > +    union {
> > > > +        struct virtio_blk_inhdr *in_hdr;
> > > > +        struct virtio_blk_zone_append_inhdr *zone_append_inhdr;
> > > > +    } in;
> > > >      struct virtio_blk_outhdr out;
> > > >      QEMUIOVector qiov;
> > > >      size_t in_len;
> > >
>


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-10-24  4:48 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-16 15:05 [RFC v3 0/2] Add zoned storage emulation to virtio-blk driver Sam Li
2022-10-16 15:05 ` [RFC v3 1/2] include: update virtio_blk headers from Linux 5.19-rc2+ Sam Li
2022-10-17  0:53   ` Dmitry Fomichev
2022-10-17  2:12     ` Damien Le Moal
2022-10-16 15:05 ` [RFC v3 2/2] virtio-blk: add zoned storage emulation for zoned devices Sam Li
2022-10-17  0:59   ` Dmitry Fomichev
2022-10-18  8:56     ` Sam Li
2022-10-23  2:08       ` Dmitry Fomichev
2022-10-23  3:01         ` Sam Li
