All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/2] Add zoned storage emulation to virtio-blk driver
@ 2022-09-29  9:48 Sam Li
  2022-09-29  9:48 ` [PATCH v2 1/2] include: update virtio_blk headers from Linux 5.19-rc2+ Sam Li
  2022-09-29  9:48 ` [PATCH v2 2/2] virtio-blk: add zoned storage emulation for zoned devices Sam Li
  0 siblings, 2 replies; 8+ messages in thread
From: Sam Li @ 2022-09-29  9:48 UTC (permalink / raw)
  To: qemu-devel
  Cc: dmitry.fomichev, damien.lemoal, qemu-block, stefanha,
	Hanna Reitz, Kevin Wolf, Michael S. Tsirkin, hare, Sam Li

v2:
- change units of emulated zone op corresponding to block layer APIs
- modify error checking cases [Stefan, Damien]

v1:
- add zoned storage emulation

Sam Li (2):
  include: update virtio_blk headers from Linux 5.19-rc2+
  virtio-blk: add zoned storage emulation for zoned devices

 hw/block/virtio-blk.c                       | 393 ++++++++++++++++++++
 include/standard-headers/linux/virtio_blk.h | 109 ++++++
 2 files changed, 502 insertions(+)

-- 
2.37.3



^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2 1/2] include: update virtio_blk headers from Linux 5.19-rc2+
  2022-09-29  9:48 [PATCH v2 0/2] Add zoned storage emulation to virtio-blk driver Sam Li
@ 2022-09-29  9:48 ` Sam Li
  2022-10-06 12:54   ` Stefan Hajnoczi
  2022-10-06 13:24   ` Peter Maydell
  2022-09-29  9:48 ` [PATCH v2 2/2] virtio-blk: add zoned storage emulation for zoned devices Sam Li
  1 sibling, 2 replies; 8+ messages in thread
From: Sam Li @ 2022-09-29  9:48 UTC (permalink / raw)
  To: qemu-devel
  Cc: dmitry.fomichev, damien.lemoal, qemu-block, stefanha,
	Hanna Reitz, Kevin Wolf, Michael S. Tsirkin, hare, Sam Li

Use scripts/update-linux-headers.sh to update virtio-blk headers
from Dmitry's "virtio-blk:add support for zoned block devices"
linux patch. There is a link for more information:
https://github.com/dmitry-fomichev/virtblk-zbd

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 include/standard-headers/linux/virtio_blk.h | 109 ++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/include/standard-headers/linux/virtio_blk.h b/include/standard-headers/linux/virtio_blk.h
index 2dcc90826a..490bd21c76 100644
--- a/include/standard-headers/linux/virtio_blk.h
+++ b/include/standard-headers/linux/virtio_blk.h
@@ -40,6 +40,7 @@
 #define VIRTIO_BLK_F_MQ		12	/* support more than one vq */
 #define VIRTIO_BLK_F_DISCARD	13	/* DISCARD is supported */
 #define VIRTIO_BLK_F_WRITE_ZEROES	14	/* WRITE ZEROES is supported */
+#define VIRTIO_BLK_F_ZONED		17	/* Zoned block device */
 
 /* Legacy feature bits */
 #ifndef VIRTIO_BLK_NO_LEGACY
@@ -119,6 +120,20 @@ struct virtio_blk_config {
 	uint8_t write_zeroes_may_unmap;
 
 	uint8_t unused1[3];
+
+	/* Secure erase fields that are defined in the virtio spec */
+	uint8_t sec_erase[12];
+
+	/* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */
+	struct virtio_blk_zoned_characteristics {
+		__virtio32 zone_sectors;
+		__virtio32 max_open_zones;
+		__virtio32 max_active_zones;
+		__virtio32 max_append_sectors;
+		__virtio32 write_granularity;
+		uint8_t model;
+		uint8_t unused2[3];
+	} zoned;
 } QEMU_PACKED;
 
 /*
@@ -153,6 +168,27 @@ struct virtio_blk_config {
 /* Write zeroes command */
 #define VIRTIO_BLK_T_WRITE_ZEROES	13
 
+/* Zone append command */
+#define VIRTIO_BLK_T_ZONE_APPEND    15
+
+/* Report zones command */
+#define VIRTIO_BLK_T_ZONE_REPORT    16
+
+/* Open zone command */
+#define VIRTIO_BLK_T_ZONE_OPEN      18
+
+/* Close zone command */
+#define VIRTIO_BLK_T_ZONE_CLOSE     20
+
+/* Finish zone command */
+#define VIRTIO_BLK_T_ZONE_FINISH    22
+
+/* Reset zone command */
+#define VIRTIO_BLK_T_ZONE_RESET     24
+
+/* Reset All zones command */
+#define VIRTIO_BLK_T_ZONE_RESET_ALL 26
+
 #ifndef VIRTIO_BLK_NO_LEGACY
 /* Barrier before this op. */
 #define VIRTIO_BLK_T_BARRIER	0x80000000
@@ -172,6 +208,72 @@ struct virtio_blk_outhdr {
 	__virtio64 sector;
 };
 
+/*
+ * Supported zoned device models.
+ */
+
+/* Regular block device */
+#define VIRTIO_BLK_Z_NONE      0
+/* Host-managed zoned device */
+#define VIRTIO_BLK_Z_HM        1
+/* Host-aware zoned device */
+#define VIRTIO_BLK_Z_HA        2
+
+/*
+ * Zone descriptor. A part of VIRTIO_BLK_T_ZONE_REPORT command reply.
+ */
+struct virtio_blk_zone_descriptor {
+	/* Zone capacity */
+	__virtio64 z_cap;
+	/* The starting sector of the zone */
+	__virtio64 z_start;
+	/* Zone write pointer position in sectors */
+	__virtio64 z_wp;
+	/* Zone type */
+	uint8_t z_type;
+	/* Zone state */
+	uint8_t z_state;
+	uint8_t reserved[38];
+};
+
+struct virtio_blk_zone_report {
+	__virtio64 nr_zones;
+	uint8_t reserved[56];
+	struct virtio_blk_zone_descriptor zones[];
+};
+
+/*
+ * Supported zone types.
+ */
+
+/* Conventional zone */
+#define VIRTIO_BLK_ZT_CONV         1
+/* Sequential Write Required zone */
+#define VIRTIO_BLK_ZT_SWR          2
+/* Sequential Write Preferred zone */
+#define VIRTIO_BLK_ZT_SWP          3
+
+/*
+ * Zone states that are available for zones of all types.
+ */
+
+/* Not a write pointer (conventional zones only) */
+#define VIRTIO_BLK_ZS_NOT_WP       0
+/* Empty */
+#define VIRTIO_BLK_ZS_EMPTY        1
+/* Implicitly Open */
+#define VIRTIO_BLK_ZS_IOPEN        2
+/* Explicitly Open */
+#define VIRTIO_BLK_ZS_EOPEN        3
+/* Closed */
+#define VIRTIO_BLK_ZS_CLOSED       4
+/* Read-Only */
+#define VIRTIO_BLK_ZS_RDONLY       13
+/* Full */
+#define VIRTIO_BLK_ZS_FULL         14
+/* Offline */
+#define VIRTIO_BLK_ZS_OFFLINE      15
+
 /* Unmap this range (only valid for write zeroes command) */
 #define VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP	0x00000001
 
@@ -198,4 +300,11 @@ struct virtio_scsi_inhdr {
 #define VIRTIO_BLK_S_OK		0
 #define VIRTIO_BLK_S_IOERR	1
 #define VIRTIO_BLK_S_UNSUPP	2
+
+/* Error codes that are specific to zoned block devices */
+#define VIRTIO_BLK_S_ZONE_INVALID_CMD     3
+#define VIRTIO_BLK_S_ZONE_UNALIGNED_WP    4
+#define VIRTIO_BLK_S_ZONE_OPEN_RESOURCE   5
+#define VIRTIO_BLK_S_ZONE_ACTIVE_RESOURCE 6
+
 #endif /* _LINUX_VIRTIO_BLK_H */
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v2 2/2] virtio-blk: add zoned storage emulation for zoned devices
  2022-09-29  9:48 [PATCH v2 0/2] Add zoned storage emulation to virtio-blk driver Sam Li
  2022-09-29  9:48 ` [PATCH v2 1/2] include: update virtio_blk headers from Linux 5.19-rc2+ Sam Li
@ 2022-09-29  9:48 ` Sam Li
  2022-10-06 15:04   ` Stefan Hajnoczi
  1 sibling, 1 reply; 8+ messages in thread
From: Sam Li @ 2022-09-29  9:48 UTC (permalink / raw)
  To: qemu-devel
  Cc: dmitry.fomichev, damien.lemoal, qemu-block, stefanha,
	Hanna Reitz, Kevin Wolf, Michael S. Tsirkin, hare, Sam Li

This patch extends virtio-blk emulation to handle zoned device commands
by calling the new block layer APIs to perform zoned device I/O on
behalf of the guest. It supports Report Zone, four zone operations (open,
close, finish, reset), and Append Zone.

The VIRTIO_BLK_F_ZONED feature bit will only be set if the host
supports zoned block devices. It will not be set for regular block
devices (conventional zones).

A guest OS with zoned device support can use blkzone(8) to test those
commands. Furthermore, using zonefs to test zone append writes is also
supported.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 hw/block/virtio-blk.c | 393 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 393 insertions(+)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index e9ba752f6b..1c2535bfeb 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -26,6 +26,9 @@
 #include "hw/virtio/virtio-blk.h"
 #include "dataplane/virtio-blk.h"
 #include "scsi/constants.h"
+#if defined(CONFIG_BLKZONED)
+#include <linux/blkzoned.h>
+#endif
 #ifdef __linux__
 # include <scsi/sg.h>
 #endif
@@ -46,6 +49,8 @@ static const VirtIOFeature feature_sizes[] = {
      .end = endof(struct virtio_blk_config, discard_sector_alignment)},
     {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
      .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
+    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
+     .end = endof(struct virtio_blk_config, zoned)},
     {}
 };
 
@@ -614,6 +619,340 @@ err:
     return err_status;
 }
 
+typedef struct ZoneCmdData {
+    VirtIOBlockReq *req;
+    union {
+        struct {
+            unsigned int nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report_data;
+        struct {
+            int64_t append_sector;
+        } zone_append_data;
+    };
+} ZoneCmdData;
+
+/*
+ * check zoned_request: error checking before issuing requests. If all checks
+ * passed, return true.
+ * append: true if only zone append requests issued.
+ */
+static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
+                             bool append, uint8_t *status) {
+    BlockDriverState *bs = blk_bs(s->blk);
+    int index = offset / bs->bl.zone_size;
+
+    if (offset < 0 || offset + len > bs->bl.capacity) {
+        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        return false;
+    }
+
+    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
+        *status = VIRTIO_BLK_S_UNSUPP;
+        return false;
+    }
+
+    if (append) {
+        if ((offset % bs->bl.write_granularity) != 0) {
+            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
+            return false;
+        }
+
+        if (!BDRV_ZT_IS_SWR(bs->bl.wps->wp[index])) {
+            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            return false;
+        }
+
+        if (len / 512 > bs->bl.max_append_sectors) {
+            if (bs->bl.max_append_sectors == 0) {
+                *status = VIRTIO_BLK_S_UNSUPP;
+            } else {
+                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            }
+            return false;
+        }
+    }
+    return true;
+}
+
+static void virtio_blk_zone_report_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    struct iovec *in_iov = req->elem.in_sg;
+    unsigned in_num = req->elem.in_num;
+    int64_t zrp_size, nz, n, j = 0;
+    int8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    nz = data->zone_report_data.nr_zones;
+    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
+            .nr_zones = cpu_to_le64(nz),
+    };
+
+    zrp_size = sizeof(struct virtio_blk_zone_report)
+               + sizeof(struct virtio_blk_zone_descriptor) * nz;
+    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
+    if (n != sizeof(zrp_hdr)) {
+        virtio_error(vdev, "Driver provided intput buffer that is too small!");
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    for (size_t i = sizeof(zrp_hdr); i < zrp_size; i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
+        struct virtio_blk_zone_descriptor desc =
+                (struct virtio_blk_zone_descriptor) {
+                        .z_start = cpu_to_le64(data->zone_report_data.zones[j].start) >> BDRV_SECTOR_BITS,
+                        .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap) >> BDRV_SECTOR_BITS,
+                        .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp) >> BDRV_SECTOR_BITS,
+                };
+
+        switch (data->zone_report_data.zones[j].type) {
+        case BLK_ZT_CONV:
+            desc.z_type = BLK_ZONE_TYPE_CONVENTIONAL;
+            break;
+        case BLK_ZT_SWR:
+            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+            break;
+        case BLK_ZT_SWP:
+            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_PREF;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        switch (data->zone_report_data.zones[j].cond) {
+        case BLK_ZS_RDONLY:
+            desc.z_state = BLK_ZONE_COND_READONLY;
+            break;
+        case BLK_ZS_OFFLINE:
+            desc.z_state = BLK_ZONE_COND_OFFLINE;
+            break;
+        case BLK_ZS_EMPTY:
+            desc.z_state = BLK_ZONE_COND_EMPTY;
+            break;
+        case BLK_ZS_CLOSED:
+            desc.z_state = BLK_ZONE_COND_CLOSED;
+            break;
+        case BLK_ZS_FULL:
+            desc.z_state = BLK_ZONE_COND_FULL;
+            break;
+        case BLK_ZS_EOPEN:
+            desc.z_state = BLK_ZONE_COND_EXP_OPEN;
+            break;
+        case BLK_ZS_IOPEN:
+            desc.z_state = BLK_ZONE_COND_IMP_OPEN;
+            break;
+        case BLK_ZS_NOT_WP:
+            desc.z_state = BLK_ZONE_COND_NOT_WP;
+            break;
+        default:
+            g_assert_not_reached();
+            break;
+        }
+
+        /* TODO: it takes O(n^2) time complexity. Optimizations required here. */
+        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
+        if (n != sizeof(desc)) {
+            virtio_error(vdev, "Driver provided input buffer "
+                               "for descriptors that is too small!");
+            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            goto out;
+        }
+    }
+    goto out;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data->zone_report_data.zones);
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) {
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    unsigned int nr_zones;
+    ZoneCmdData *data;
+    int64_t zone_size, offset;
+    uint8_t err_status;
+
+    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
+            sizeof(struct virtio_blk_zone_report) +
+            sizeof(struct virtio_blk_zone_descriptor)) {
+        virtio_error(vdev, "in buffer too small for zone report");
+        return -1;
+    }
+
+    /* start byte offset of the zone report */
+    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
+    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
+        goto out;
+    }
+
+    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
+                sizeof(struct virtio_blk_zone_report)) /
+               sizeof(struct virtio_blk_zone_descriptor);
+
+    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
+    data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->zone_report_data.nr_zones = nr_zones;
+    data->zone_report_data.zones = g_malloc(zone_size),
+
+    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
+                        data->zone_report_data.zones,
+                        virtio_blk_zone_report_complete, data);
+    return 0;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
+static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) {
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    int8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+    goto out;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) {
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    BlockDriverState *bs = blk_bs(s->blk);
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
+    uint64_t len;
+    uint32_t type;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
+        goto out;
+    }
+
+    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+
+    type = virtio_ldl_p(vdev, &req->out.type);
+    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
+        /* Entire drive capacity */
+        offset = 0;
+        len = bs->bl.capacity;
+    } else {
+        if (bs->bl.zone_size * bs->bl.nr_zones == bs->bl.capacity) {
+            len = bs->bl.zone_size;
+        } else {
+            /* when the SWR drive has one last small zone, calculate its len */
+            len = bs->bl.capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
+        }
+        if (offset + len > bs->bl.capacity) {
+            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            goto out;
+        }
+    }
+
+    blk_aio_zone_mgmt(s->blk, op, offset, len,
+                      virtio_blk_zone_mgmt_complete, data);
+
+    return 0;
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
+static void virtio_blk_zone_append_complete(void *opaque, int ret) {
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    int64_t append_sector, n;
+    struct iovec *out_iov = req->elem.out_sg;
+    unsigned out_num = req->elem.out_num;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    virtio_stl_p(vdev, &append_sector, data->zone_append_data.append_sector);
+    n = iov_to_buf(out_iov, out_num, 0, &append_sector, sizeof(append_sector));
+    if (n != sizeof(append_sector)) {
+        virtio_error(vdev, "Driver provided input buffer less than size of "
+                     "append_sector");
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+    goto out;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_append(VirtIOBlockReq *req) {
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    uint64_t niov = req->elem.out_num;
+    struct iovec *out_iov = req->elem.out_sg;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
+    int64_t len = 0;
+    for (int i = 1; i < niov; ++i) {
+        len += out_iov[i].iov_len;
+    }
+
+    if (!check_zoned_request(s, offset, len, true, &err_status)) {
+        goto out;
+    }
+
+    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->zone_append_data.append_sector = offset;
+    qemu_iovec_init_external(&req->qiov, &out_iov[1], niov-1);
+    blk_aio_zone_append(s->blk, &data->zone_append_data.append_sector, &req->qiov, 0,
+                        virtio_blk_zone_append_complete, data);
+    return 0;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
 static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
     uint32_t type;
@@ -700,6 +1039,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     case VIRTIO_BLK_T_FLUSH:
         virtio_blk_handle_flush(req, mrb);
         break;
+    case VIRTIO_BLK_T_ZONE_REPORT:
+        virtio_blk_handle_zone_report(req);
+        break;
+    case VIRTIO_BLK_T_ZONE_OPEN:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
+        break;
+    case VIRTIO_BLK_T_ZONE_CLOSE:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
+        break;
+    case VIRTIO_BLK_T_ZONE_FINISH:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET_ALL:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET_ALL);
+        break;
     case VIRTIO_BLK_T_SCSI_CMD:
         virtio_blk_handle_scsi(req);
         break;
@@ -718,6 +1075,9 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         virtio_blk_free_request(req);
         break;
     }
+   case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
+       virtio_blk_handle_zone_append(req);
+       break;
     /*
      * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
      * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
@@ -917,6 +1277,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     BlockConf *conf = &s->conf.conf;
+    BlockDriverState *bs = blk_bs(s->blk);
     struct virtio_blk_config blkcfg;
     uint64_t capacity;
     int64_t length;
@@ -976,6 +1337,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
         blkcfg.write_zeroes_may_unmap = 1;
         virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
     }
+    if (bs->bl.zoned != BLK_Z_NONE) {
+        switch (bs->bl.zoned) {
+        case BLK_Z_HM:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
+            break;
+        case BLK_Z_HA:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
+                     bs->bl.zone_size / 512);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
+                     bs->bl.max_active_zones);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
+                     bs->bl.max_open_zones);
+        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
+                     bs->bl.max_append_sectors);
+    } else {
+        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
+    }
     memcpy(config, &blkcfg, s->config_size);
 }
 
@@ -1140,6 +1525,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOBlock *s = VIRTIO_BLK(dev);
     VirtIOBlkConf *conf = &s->conf;
+    BlockDriverState *bs = blk_bs(conf->conf.blk);
     Error *err = NULL;
     unsigned i;
 
@@ -1185,6 +1571,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
+    if (bs->bl.zoned != BLK_Z_NONE) {
+        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
+        if (bs->bl.zoned == BLK_Z_HM) {
+            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
+        }
+    }
+
     if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
         (!conf->max_discard_sectors ||
          conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 1/2] include: update virtio_blk headers from Linux 5.19-rc2+
  2022-09-29  9:48 ` [PATCH v2 1/2] include: update virtio_blk headers from Linux 5.19-rc2+ Sam Li
@ 2022-10-06 12:54   ` Stefan Hajnoczi
  2022-10-06 13:24   ` Peter Maydell
  1 sibling, 0 replies; 8+ messages in thread
From: Stefan Hajnoczi @ 2022-10-06 12:54 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, dmitry.fomichev, damien.lemoal, qemu-block,
	Hanna Reitz, Kevin Wolf, Michael S. Tsirkin, hare

[-- Attachment #1: Type: text/plain, Size: 524 bytes --]

On Thu, Sep 29, 2022 at 05:48:20PM +0800, Sam Li wrote:
> Use scripts/update-linux-headers.sh to update virtio-blk headers
> from Dmitry's "virtio-blk:add support for zoned block devices"
> linux patch. There is a link for more information:
> https://github.com/dmitry-fomichev/virtblk-zbd
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  include/standard-headers/linux/virtio_blk.h | 109 ++++++++++++++++++++
>  1 file changed, 109 insertions(+)

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 1/2] include: update virtio_blk headers from Linux 5.19-rc2+
  2022-09-29  9:48 ` [PATCH v2 1/2] include: update virtio_blk headers from Linux 5.19-rc2+ Sam Li
  2022-10-06 12:54   ` Stefan Hajnoczi
@ 2022-10-06 13:24   ` Peter Maydell
  1 sibling, 0 replies; 8+ messages in thread
From: Peter Maydell @ 2022-10-06 13:24 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, dmitry.fomichev, damien.lemoal, qemu-block, stefanha,
	Hanna Reitz, Kevin Wolf, Michael S. Tsirkin, hare

On Thu, 29 Sept 2022 at 11:14, Sam Li <faithilikerun@gmail.com> wrote:
>
> Use scripts/update-linux-headers.sh to update virtio-blk headers
> from Dmitry's "virtio-blk:add support for zoned block devices"
> linux patch. There is a link for more information:
> https://github.com/dmitry-fomichev/virtblk-zbd
>
> Signed-off-by: Sam Li <faithilikerun@gmail.com>

Just as a process note, if that patchset isn't upstream in the kernel
yet then this QEMU patchseries should be marked as an RFC, as a guard
against our applying it to QEMU before the kernel ABI has been
fixed.

thanks
-- PMM


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 2/2] virtio-blk: add zoned storage emulation for zoned devices
  2022-09-29  9:48 ` [PATCH v2 2/2] virtio-blk: add zoned storage emulation for zoned devices Sam Li
@ 2022-10-06 15:04   ` Stefan Hajnoczi
  2022-10-09  1:54     ` Sam Li
  0 siblings, 1 reply; 8+ messages in thread
From: Stefan Hajnoczi @ 2022-10-06 15:04 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, dmitry.fomichev, damien.lemoal, qemu-block,
	Hanna Reitz, Kevin Wolf, Michael S. Tsirkin, hare

[-- Attachment #1: Type: text/plain, Size: 21093 bytes --]

On Thu, Sep 29, 2022 at 05:48:21PM +0800, Sam Li wrote:
> This patch extends virtio-blk emulation to handle zoned device commands
> by calling the new block layer APIs to perform zoned device I/O on
> behalf of the guest. It supports Report Zone, four zone oparations (open,
> close, finish, reset), and Append Zone.
> 
> The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> support zoned block devices. Regular block devices(conventional zones)
> will not be set.
> 
> The guest os having zoned device support can use blkzone(8) to test those
> commands. Furthermore, using zonefs to test zone append write is also
> supported.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  hw/block/virtio-blk.c | 393 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 393 insertions(+)
> 
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index e9ba752f6b..1c2535bfeb 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -26,6 +26,9 @@
>  #include "hw/virtio/virtio-blk.h"
>  #include "dataplane/virtio-blk.h"
>  #include "scsi/constants.h"
> +#if defined(CONFIG_BLKZONED)
> +#include <linux/blkzoned.h>
> +#endif

Why is this Linux-specific header file included? The virtio-blk
emulation code should only use QEMU block layer APIs, not Linux APIs.

>  #ifdef __linux__
>  # include <scsi/sg.h>
>  #endif
> @@ -46,6 +49,8 @@ static const VirtIOFeature feature_sizes[] = {
>       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
>      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
>       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> +     .end = endof(struct virtio_blk_config, zoned)},
>      {}
>  };
>  
> @@ -614,6 +619,340 @@ err:
>      return err_status;
>  }
>  
> +typedef struct ZoneCmdData {
> +    VirtIOBlockReq *req;
> +    union {
> +        struct {
> +            unsigned int nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report_data;
> +        struct {
> +            int64_t append_sector;
> +        } zone_append_data;
> +    };
> +} ZoneCmdData;
> +
> +/*
> + * check zoned_request: error checking before issuing requests. If all checks
> + * passed, return true.
> + * append: true if only zone append requests issued.
> + */
> +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> +                             bool append, uint8_t *status) {
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    int index = offset / bs->bl.zone_size;

This function doesn't check that offset+len is in the same zone as
offset. Maybe that's correct because some request types allow [offset,
offset+len) to cross zones?

> +
> +    if (offset < 0 || offset + len > bs->bl.capacity) {

Other cases that are not checked:
1. len < 0
2. offset >= bs->bl.capacity
3. len > bs->bl.capacity - offset (catches integer overflow)

It may be possible to combine these cases, but be careful about integer
overflow.

> +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        return false;
> +    }
> +
> +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> +        *status = VIRTIO_BLK_S_UNSUPP;
> +        return false;
> +    }
> +
> +    if (append) {
> +        if ((offset % bs->bl.write_granularity) != 0) {
> +            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> +            return false;
> +        }
> +
> +        if (!BDRV_ZT_IS_SWR(bs->bl.wps->wp[index])) {
> +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            return false;
> +        }

Where does the virtio-blk zone spec say that only SWR zones allow zone
append commands? Should it work for SWP zones too?

> +
> +        if (len / 512 > bs->bl.max_append_sectors) {
> +            if (bs->bl.max_append_sectors == 0) {
> +                *status = VIRTIO_BLK_S_UNSUPP;
> +            } else {
> +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            }
> +            return false;
> +        }
> +    }
> +    return true;
> +}
> +
> +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> +{
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    struct iovec *in_iov = req->elem.in_sg;
> +    unsigned in_num = req->elem.in_num;
> +    int64_t zrp_size, nz, n, j = 0;
> +    int8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (ret) {
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    nz = data->zone_report_data.nr_zones;
> +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> +            .nr_zones = cpu_to_le64(nz),
> +    };
> +
> +    zrp_size = sizeof(struct virtio_blk_zone_report)
> +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> +    if (n != sizeof(zrp_hdr)) {
> +        virtio_error(vdev, "Driver provided intput buffer that is too small!");
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    for (size_t i = sizeof(zrp_hdr); i < zrp_size; i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> +        struct virtio_blk_zone_descriptor desc =
> +                (struct virtio_blk_zone_descriptor) {
> +                        .z_start = cpu_to_le64(data->zone_report_data.zones[j].start) >> BDRV_SECTOR_BITS,
> +                        .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap) >> BDRV_SECTOR_BITS,
> +                        .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp) >> BDRV_SECTOR_BITS,
> +                };
> +
> +        switch (data->zone_report_data.zones[j].type) {
> +        case BLK_ZT_CONV:
> +            desc.z_type = BLK_ZONE_TYPE_CONVENTIONAL;
> +            break;
> +        case BLK_ZT_SWR:
> +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_REQ;
> +            break;
> +        case BLK_ZT_SWP:
> +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_PREF;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
> +
> +        switch (data->zone_report_data.zones[j].cond) {
> +        case BLK_ZS_RDONLY:
> +            desc.z_state = BLK_ZONE_COND_READONLY;
> +            break;
> +        case BLK_ZS_OFFLINE:
> +            desc.z_state = BLK_ZONE_COND_OFFLINE;
> +            break;
> +        case BLK_ZS_EMPTY:
> +            desc.z_state = BLK_ZONE_COND_EMPTY;
> +            break;
> +        case BLK_ZS_CLOSED:
> +            desc.z_state = BLK_ZONE_COND_CLOSED;
> +            break;
> +        case BLK_ZS_FULL:
> +            desc.z_state = BLK_ZONE_COND_FULL;
> +            break;
> +        case BLK_ZS_EOPEN:
> +            desc.z_state = BLK_ZONE_COND_EXP_OPEN;
> +            break;
> +        case BLK_ZS_IOPEN:
> +            desc.z_state = BLK_ZONE_COND_IMP_OPEN;
> +            break;
> +        case BLK_ZS_NOT_WP:
> +            desc.z_state = BLK_ZONE_COND_NOT_WP;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +            break;
> +        }
> +
> +        /* TODO: it takes O(n^2) time complexity. Optimizations required here. */
> +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> +        if (n != sizeof(desc)) {
> +            virtio_error(vdev, "Driver provided input buffer "
> +                               "for descriptors that is too small!");
> +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            goto out;
> +        }
> +    }
> +    goto out;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data->zone_report_data.zones);
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) {
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    unsigned int nr_zones;
> +    ZoneCmdData *data;
> +    int64_t zone_size, offset;
> +    uint8_t err_status;
> +
> +    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
> +            sizeof(struct virtio_blk_zone_report) +
> +            sizeof(struct virtio_blk_zone_descriptor)) {
> +        virtio_error(vdev, "in buffer too small for zone report");
> +        return -1;
> +    }
> +
> +    /* start byte offset of the zone report */
> +    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> +        goto out;
> +    }
> +
> +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> +                sizeof(struct virtio_blk_zone_report)) /
> +               sizeof(struct virtio_blk_zone_descriptor);
> +
> +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> +    data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +    data->zone_report_data.nr_zones = nr_zones;
> +    data->zone_report_data.zones = g_malloc(zone_size),
> +
> +    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
> +                        data->zone_report_data.zones,
> +                        virtio_blk_zone_report_complete, data);
> +    return 0;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
> +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) {
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    int8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (ret) {
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +    goto out;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) {
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> +    uint64_t len;
> +    uint32_t type;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> +        goto out;
> +    }
> +
> +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +
> +    type = virtio_ldl_p(vdev, &req->out.type);
> +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> +        /* Entire drive capacity */
> +        offset = 0;
> +        len = bs->bl.capacity;
> +    } else {
> +        if (bs->bl.zone_size * bs->bl.nr_zones == bs->bl.capacity) {
> +            len = bs->bl.zone_size;
> +        } else {
> +            /* when the SWR drive has one last small zone, calculate its len */
> +            len = bs->bl.capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
> +        }
> +        if (offset + len > bs->bl.capacity) {
> +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            goto out;

data is leaked here.

> +        }
> +    }
> +
> +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> +                      virtio_blk_zone_mgmt_complete, data);
> +
> +    return 0;
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
> +static void virtio_blk_zone_append_complete(void *opaque, int ret) {
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    int64_t append_sector, n;
> +    struct iovec *out_iov = req->elem.out_sg;
> +    unsigned out_num = req->elem.out_num;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (ret) {
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    virtio_stl_p(vdev, &append_sector, data->zone_append_data.append_sector);
> +    n = iov_to_buf(out_iov, out_num, 0, &append_sector, sizeof(append_sector));

out_iov contains the driver->device buffers. The device is only allowed
to read from out_iov, not write to it.

The device->driver buffers are in in_iov.

According to the spec the zone append in hdr looks like this:

  struct {
      u8 status;
      u8 reserved[7];
      le64 append_sector;
  } virtio_blk_zone_append_inhdr;

In virtio_blk_handle_request() we used iov_discard_back_undoable() to
take the last byte (the status field for non-zone append requests) from
in_iov[]. This is incorrect for zone append requests because they have
the larger struct zone_append_inhdr instead of struct
virtio_blk_inhdr.

I think it might be time to stop using req->in in virtio-blk.c and
instead use iov_from_buf() to write the status byte. For zone append
requests we also need to write reserved[] and append_sector:

  iov_discard_undo(&req->inhdr_undo);
  inhdr_len = is_zone_append ?
               sizeof(struct virtio_blk_zone_append_inhdr) :
	       sizeof(struct virtio_blk_inhdr);
  iov_from_buf(req->elem.in_sg, req->elem.in_num,
               req->in_len - inhdr_len,
	       &req->in, inhdr_len);

where req->in changes to:

  union {
      struct virtio_blk_inhdr inhdr;
      struct virtio_blk_zone_append_inhdr zone_append_inhdr;
  } in;

Most requests will just use in.inhdr but zone append will fill out the
full in.zone_append_inhdr struct.

> +    if (n != sizeof(append_sector)) {
> +        virtio_error(vdev, "Driver provided input buffer less than size of "
> +                     "append_sector");
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +    goto out;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req) {
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    uint64_t niov = req->elem.out_num;
> +    struct iovec *out_iov = req->elem.out_sg;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> +    int64_t len = 0;
> +    for (int i = 1; i < niov; ++i) {
> +        len += out_iov[i].iov_len;

Please pass in out_iov and out_num instead of using req->elem.out_sg and
req->elem.out_num. virtio_blk_handle_request() modifies the iovecs
pointed to by req->elem.out_sg using iov_discard_front_undoable() and it
is not safe to access req->elem.out_sg directly.

Also, VIRTIO devices are not allowed to make assumptions about the iovec
layout. That means skipping the first iovec in the for loop violates the
spec. The driver could send struct virtio_blk_req as two or more iovecs
instead of putting it into just 1 iovec. This is why the device is not
allowed to assume out_iov[0] is struct virtio_blk_req.

The for loop can be replaced with:

  len = iov_size(out_iov, out_num);

and out_iov[1]/niov-1 can be replaced with just out_iov and out_num (if
you pass them in from virtio_blk_handle_request()).

> +    }
> +
> +    if (!check_zoned_request(s, offset, len, true, &err_status)) {
> +        goto out;
> +    }
> +
> +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +    data->zone_append_data.append_sector = offset;
> +    qemu_iovec_init_external(&req->qiov, &out_iov[1], niov-1);
> +    blk_aio_zone_append(s->blk, &data->zone_append_data.append_sector, &req->qiov, 0,
> +                        virtio_blk_zone_append_complete, data);
> +    return 0;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
>  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>  {
>      uint32_t type;
> @@ -700,6 +1039,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>      case VIRTIO_BLK_T_FLUSH:
>          virtio_blk_handle_flush(req, mrb);
>          break;
> +    case VIRTIO_BLK_T_ZONE_REPORT:
> +        virtio_blk_handle_zone_report(req);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_OPEN:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_CLOSE:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_FINISH:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_RESET:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET_ALL);
> +        break;
>      case VIRTIO_BLK_T_SCSI_CMD:
>          virtio_blk_handle_scsi(req);
>          break;
> @@ -718,6 +1075,9 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>          virtio_blk_free_request(req);
>          break;
>      }
> +   case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:

Indentation is off. QEMU uses 4-space indentation.

> +       virtio_blk_handle_zone_append(req);
> +       break;
>      /*
>       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
>       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
> @@ -917,6 +1277,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>  {
>      VirtIOBlock *s = VIRTIO_BLK(vdev);
>      BlockConf *conf = &s->conf.conf;
> +    BlockDriverState *bs = blk_bs(s->blk);
>      struct virtio_blk_config blkcfg;
>      uint64_t capacity;
>      int64_t length;
> @@ -976,6 +1337,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>          blkcfg.write_zeroes_may_unmap = 1;
>          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
>      }
> +    if (bs->bl.zoned != BLK_Z_NONE) {
> +        switch (bs->bl.zoned) {
> +        case BLK_Z_HM:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> +            break;
> +        case BLK_Z_HA:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
> +
> +        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> +                     bs->bl.zone_size / 512);
> +        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> +                     bs->bl.max_active_zones);
> +        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> +                     bs->bl.max_open_zones);
> +        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> +        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> +                     bs->bl.max_append_sectors);
> +    } else {
> +        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> +    }
>      memcpy(config, &blkcfg, s->config_size);
>  }
>  
> @@ -1140,6 +1525,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>      VirtIOBlock *s = VIRTIO_BLK(dev);
>      VirtIOBlkConf *conf = &s->conf;
> +    BlockDriverState *bs = blk_bs(conf->conf.blk);
>      Error *err = NULL;
>      unsigned i;
>  
> @@ -1185,6 +1571,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>          return;
>      }
>  
> +    if (bs->bl.zoned != BLK_Z_NONE) {
> +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> +        if (bs->bl.zoned == BLK_Z_HM) {
> +            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
> +        }
> +    }
> +
>      if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
>          (!conf->max_discard_sectors ||
>           conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
> -- 
> 2.37.3
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 2/2] virtio-blk: add zoned storage emulation for zoned devices
  2022-10-06 15:04   ` Stefan Hajnoczi
@ 2022-10-09  1:54     ` Sam Li
  2022-10-09  2:38       ` Sam Li
  0 siblings, 1 reply; 8+ messages in thread
From: Sam Li @ 2022-10-09  1:54 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: qemu-devel, dmitry.fomichev, damien.lemoal, qemu-block,
	Hanna Reitz, Kevin Wolf, Michael S. Tsirkin, hare

Stefan Hajnoczi <stefanha@redhat.com> 于2022年10月6日周四 23:04写道:
>
> On Thu, Sep 29, 2022 at 05:48:21PM +0800, Sam Li wrote:
> > This patch extends virtio-blk emulation to handle zoned device commands
> > by calling the new block layer APIs to perform zoned device I/O on
> > behalf of the guest. It supports Report Zone, four zone oparations (open,
> > close, finish, reset), and Append Zone.
> >
> > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> > support zoned block devices. Regular block devices(conventional zones)
> > will not be set.
> >
> > The guest os having zoned device support can use blkzone(8) to test those
> > commands. Furthermore, using zonefs to test zone append write is also
> > supported.
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  hw/block/virtio-blk.c | 393 ++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 393 insertions(+)
> >
> > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> > index e9ba752f6b..1c2535bfeb 100644
> > --- a/hw/block/virtio-blk.c
> > +++ b/hw/block/virtio-blk.c
> > @@ -26,6 +26,9 @@
> >  #include "hw/virtio/virtio-blk.h"
> >  #include "dataplane/virtio-blk.h"
> >  #include "scsi/constants.h"
> > +#if defined(CONFIG_BLKZONED)
> > +#include <linux/blkzoned.h>
> > +#endif
>
> Why is this Linux-specific header file included? The virtio-blk
> emulation code should only use QEMU block layer APIs, not Linux APIs.
>
> >  #ifdef __linux__
> >  # include <scsi/sg.h>
> >  #endif
> > @@ -46,6 +49,8 @@ static const VirtIOFeature feature_sizes[] = {
> >       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
> >      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
> >       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> > +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> > +     .end = endof(struct virtio_blk_config, zoned)},
> >      {}
> >  };
> >
> > @@ -614,6 +619,340 @@ err:
> >      return err_status;
> >  }
> >
> > +typedef struct ZoneCmdData {
> > +    VirtIOBlockReq *req;
> > +    union {
> > +        struct {
> > +            unsigned int nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } zone_report_data;
> > +        struct {
> > +            int64_t append_sector;
> > +        } zone_append_data;
> > +    };
> > +} ZoneCmdData;
> > +
> > +/*
> > + * check zoned_request: error checking before issuing requests. If all checks
> > + * passed, return true.
> > + * append: true if only zone append requests issued.
> > + */
> > +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> > +                             bool append, uint8_t *status) {
> > +    BlockDriverState *bs = blk_bs(s->blk);
> > +    int index = offset / bs->bl.zone_size;
>
> This function doesn't check that offset+len is in the same zone as
> offset. Maybe that's correct because some request types allow [offset,
> offset+len) to cross zones?

Yes, zone_mgmt requests should allow that.

>
> > +
> > +    if (offset < 0 || offset + len > bs->bl.capacity) {
>
> Other cases that are not checked:
> 1. len < 0
> 2. offset >= bs->bl.capacity
> 3. len > bs->bl.capacity - offset (catches integer overflow)
>
> It may be possible to combine these cases, but be careful about integer
> overflow.

Right. Combining the above cases:

if (offset < 0 || len < 0 || offset > cap - len)

The check `offset > cap - len` covers cases #2 and #3: any offset greater
than or equal to cap is necessarily greater than cap - len, and writing the
bound this way avoids the integer overflow that `offset + len > cap` could
produce.

>
> > +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        return false;
> > +    }
> > +
> > +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> > +        *status = VIRTIO_BLK_S_UNSUPP;
> > +        return false;
> > +    }
> > +
> > +    if (append) {
> > +        if ((offset % bs->bl.write_granularity) != 0) {
> > +            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> > +            return false;
> > +        }
> > +
> > +        if (!BDRV_ZT_IS_SWR(bs->bl.wps->wp[index])) {
> > +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            return false;
> > +        }
>
> Where does the virtio-blk zone spec say that only SWR zones allow zone
> append commands? Should it work for SWP zones too?

The spec currently says it does not (SWR only), but it should work for SWP
zones as well. I'll change the check to reject only conventional zones
instead.

+If the zone specified by the VIRTIO_BLK_T_ZONE_APPEND request is not
a SWR zone,
+then the request SHALL be completed with VIRTIO_BLK_S_ZONE_INVALID_CMD
+\field{status}.

>
> > +
> > +        if (len / 512 > bs->bl.max_append_sectors) {
> > +            if (bs->bl.max_append_sectors == 0) {
> > +                *status = VIRTIO_BLK_S_UNSUPP;
> > +            } else {
> > +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            }
> > +            return false;
> > +        }
> > +    }
> > +    return true;
> > +}
> > +
> > +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> > +{
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > +    struct iovec *in_iov = req->elem.in_sg;
> > +    unsigned in_num = req->elem.in_num;
> > +    int64_t zrp_size, nz, n, j = 0;
> > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    nz = data->zone_report_data.nr_zones;
> > +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> > +            .nr_zones = cpu_to_le64(nz),
> > +    };
> > +
> > +    zrp_size = sizeof(struct virtio_blk_zone_report)
> > +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> > +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> > +    if (n != sizeof(zrp_hdr)) {
> > +        virtio_error(vdev, "Driver provided intput buffer that is too small!");
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    for (size_t i = sizeof(zrp_hdr); i < zrp_size; i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> > +        struct virtio_blk_zone_descriptor desc =
> > +                (struct virtio_blk_zone_descriptor) {
> > +                        .z_start = cpu_to_le64(data->zone_report_data.zones[j].start) >> BDRV_SECTOR_BITS,
> > +                        .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap) >> BDRV_SECTOR_BITS,
> > +                        .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp) >> BDRV_SECTOR_BITS,
> > +                };
> > +
> > +        switch (data->zone_report_data.zones[j].type) {
> > +        case BLK_ZT_CONV:
> > +            desc.z_type = BLK_ZONE_TYPE_CONVENTIONAL;
> > +            break;
> > +        case BLK_ZT_SWR:
> > +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_REQ;
> > +            break;
> > +        case BLK_ZT_SWP:
> > +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_PREF;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +        }
> > +
> > +        switch (data->zone_report_data.zones[j].cond) {
> > +        case BLK_ZS_RDONLY:
> > +            desc.z_state = BLK_ZONE_COND_READONLY;
> > +            break;
> > +        case BLK_ZS_OFFLINE:
> > +            desc.z_state = BLK_ZONE_COND_OFFLINE;
> > +            break;
> > +        case BLK_ZS_EMPTY:
> > +            desc.z_state = BLK_ZONE_COND_EMPTY;
> > +            break;
> > +        case BLK_ZS_CLOSED:
> > +            desc.z_state = BLK_ZONE_COND_CLOSED;
> > +            break;
> > +        case BLK_ZS_FULL:
> > +            desc.z_state = BLK_ZONE_COND_FULL;
> > +            break;
> > +        case BLK_ZS_EOPEN:
> > +            desc.z_state = BLK_ZONE_COND_EXP_OPEN;
> > +            break;
> > +        case BLK_ZS_IOPEN:
> > +            desc.z_state = BLK_ZONE_COND_IMP_OPEN;
> > +            break;
> > +        case BLK_ZS_NOT_WP:
> > +            desc.z_state = BLK_ZONE_COND_NOT_WP;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +            break;
> > +        }
> > +
> > +        /* TODO: it takes O(n^2) time complexity. Optimizations required here. */
> > +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> > +        if (n != sizeof(desc)) {
> > +            virtio_error(vdev, "Driver provided input buffer "
> > +                               "for descriptors that is too small!");
> > +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            goto out;
> > +        }
> > +    }
> > +    goto out;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data->zone_report_data.zones);
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) {
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    unsigned int nr_zones;
> > +    ZoneCmdData *data;
> > +    int64_t zone_size, offset;
> > +    uint8_t err_status;
> > +
> > +    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
> > +            sizeof(struct virtio_blk_zone_report) +
> > +            sizeof(struct virtio_blk_zone_descriptor)) {
> > +        virtio_error(vdev, "in buffer too small for zone report");
> > +        return -1;
> > +    }
> > +
> > +    /* start byte offset of the zone report */
> > +    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> > +                sizeof(struct virtio_blk_zone_report)) /
> > +               sizeof(struct virtio_blk_zone_descriptor);
> > +
> > +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> > +    data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +    data->zone_report_data.nr_zones = nr_zones;
> > +    data->zone_report_data.zones = g_malloc(zone_size),
> > +
> > +    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
> > +                        data->zone_report_data.zones,
> > +                        virtio_blk_zone_report_complete, data);
> > +    return 0;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> > +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) {
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +    goto out;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) {
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    BlockDriverState *bs = blk_bs(s->blk);
> > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > +    uint64_t len;
> > +    uint32_t type;
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +
> > +    type = virtio_ldl_p(vdev, &req->out.type);
> > +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> > +        /* Entire drive capacity */
> > +        offset = 0;
> > +        len = bs->bl.capacity;
> > +    } else {
> > +        if (bs->bl.zone_size * bs->bl.nr_zones == bs->bl.capacity) {
> > +            len = bs->bl.zone_size;
> > +        } else {
> > +            /* when the SWR drive has one last small zone, calculate its len */
> > +            len = bs->bl.capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
> > +        }
> > +        if (offset + len > bs->bl.capacity) {
> > +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            goto out;
>
> data is leaked here.

Move the len calculation before check_zoned_request so that data is not
allocated until the checks pass; then no memory needs to be freed here.

>
> > +        }
> > +    }
> > +
> > +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> > +                      virtio_blk_zone_mgmt_complete, data);
> > +
> > +    return 0;
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> > +static void virtio_blk_zone_append_complete(void *opaque, int ret) {
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > +    int64_t append_sector, n;
> > +    struct iovec *out_iov = req->elem.out_sg;
> > +    unsigned out_num = req->elem.out_num;
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    virtio_stl_p(vdev, &append_sector, data->zone_append_data.append_sector);
> > +    n = iov_to_buf(out_iov, out_num, 0, &append_sector, sizeof(append_sector));
>
> out_iov contains the driver->device buffers. The device is only allowed
> to read from out_iov, not write to it.
>
> The device->driver buffers are in in_iov.
>
> According to the spec the zone append in hdr looks like this:
>
>   struct {
>       u8 status;
>       u8 reserved[7];
>       le64 append_sector;
>   } virtio_blk_zone_append_inhdr;
>
> In virtio_blk_handle_request() we used iov_discard_back_undoable() to
> take the last byte (the status field for non-zone append requests) from
> in_iov[]. This is incorrect for zone append requests because they have
> the larger struct zone_append_inhdr instead of struct
> virtio_blk_inhdr.
>
> I think it might be time to stop using req->in in virtio-blk.c and
> instead use iov_from_buf() to write the status byte. For zone append
> requests we also need to write reserved[] and append_sector:
>
>   iov_discard_undo(&req->inhdr_undo);
>   inhdr_len = is_zone_append ?
>                sizeof(struct virtio_blk_zone_append_inhdr) :
>                sizeof(struct virtio_blk_inhdr);
>   iov_from_buf(req->elem.in_sg, req->elem.in_num,
>                req->in_len - inhdr_len,
>                &req->in, inhdr_len);
>
> where req->in changes to:
>
>   union {
>       struct virtio_blk_inhdr inhdr;
>       struct virtio_blk_zone_append_inhdr zone_append_inhdr;
>   } in;
>
> Most requests will just use in.inhdr but zone append will fill out the
> full in.zone_append_inhdr struct.

I made some changes along those lines, but they didn't work for
zone_append requests and broke other zonefs tests too. I wonder how
zone_append requests fill in the status field. I kept the original
virtio_blk_inhdr approach here because my attempts to remove it failed :)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 1c2535bfeb..3ecabc7fb3 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -54,6 +54,12 @@ static const VirtIOFeature feature_sizes[] = {
     {}
 };

+typedef struct virtio_blk_zone_append_inhdr {
+    uint8_t status;
+    uint8_t reserved[7];
+    int64_t append_sector;
+} virtio_blk_zone_append_inhdr;
+
 static void virtio_blk_set_config_size(VirtIOBlock *s, uint64_t host_features)
 {
     s->config_size = MAX(VIRTIO_BLK_CFG_SIZE,
@@ -82,11 +88,16 @@ static void virtio_blk_req_complete(VirtIOBlockReq
*req, unsigned char status)
 {
     VirtIOBlock *s = req->dev;
     VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    uint32_t type = virtio_ldl_p(vdev, &req->out.type);

     trace_virtio_blk_req_complete(vdev, req, status);

-    stb_p(&req->in->status, status);
     iov_discard_undo(&req->inhdr_undo);
+    size_t inhdr_len = (type == VIRTIO_BLK_T_ZONE_APPEND) ?
+
sizeof(virtio_blk_zone_append_inhdr):sizeof(struct virtio_blk_inhdr);
+    iov_from_buf(req->elem.in_sg, req->elem.in_num,
+                 req->in_len - inhdr_len, &req->in, inhdr_len);
+    stb_p(&req->in->status, status);
+
     iov_discard_undo(&req->outhdr_undo);
     virtqueue_push(req->vq, &req->elem, req->in_len);
     if (s->dataplane_started && !s->dataplane_disabled) {

>
> > +    if (n != sizeof(append_sector)) {
> > +        virtio_error(vdev, "Driver provided input buffer less than size of "
> > +                     "append_sector");
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +    goto out;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req) {
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    uint64_t niov = req->elem.out_num;
> > +    struct iovec *out_iov = req->elem.out_sg;
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > +    int64_t len = 0;
> > +    for (int i = 1; i < niov; ++i) {
> > +        len += out_iov[i].iov_len;
>
> Please pass in out_iov and out_num instead of using req->elem.out_sg and
> req->elem.out_num. virtio_blk_handle_request() modifies the iovecs
> pointed to by req->elem.out_sg using iov_discard_front_undoable() and it
> is not safe to access req->elem.out_sg directly.
>
> Also, VIRTIO devices are not allowed to make assumptions about the iovec
> layout. That means skipping the first iovec in the for loop violates the
> spec. The driver could send struct virtio_blk_req as two or more iovecs
> instead of putting it into just 1 iovec. This is why the device is not
> allowed to assume out_iov[0] is struct virtio_blk_req.
>
> The for loop can be replaced with:
>
>   len = iov_size(out_iov, out_num);
>
> and out_iov[1]/niov-1 can be replaced with just out_iov and out_num (if
> you pass them in from virtio_blk_handle_request()).

Thanks!

>
> > +    }
> > +
> > +    if (!check_zoned_request(s, offset, len, true, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +    data->zone_append_data.append_sector = offset;
> > +    qemu_iovec_init_external(&req->qiov, &out_iov[1], niov-1);
> > +    blk_aio_zone_append(s->blk, &data->zone_append_data.append_sector, &req->qiov, 0,
> > +                        virtio_blk_zone_append_complete, data);
> > +    return 0;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> >  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >  {
> >      uint32_t type;
> > @@ -700,6 +1039,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >      case VIRTIO_BLK_T_FLUSH:
> >          virtio_blk_handle_flush(req, mrb);
> >          break;
> > +    case VIRTIO_BLK_T_ZONE_REPORT:
> > +        virtio_blk_handle_zone_report(req);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_OPEN:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_CLOSE:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_FINISH:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_RESET:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET_ALL);
> > +        break;
> >      case VIRTIO_BLK_T_SCSI_CMD:
> >          virtio_blk_handle_scsi(req);
> >          break;
> > @@ -718,6 +1075,9 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >          virtio_blk_free_request(req);
> >          break;
> >      }
> > +   case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
>
> Indentation is off. QEMU uses 4-space indentation.
>
> > +       virtio_blk_handle_zone_append(req);
> > +       break;
> >      /*
> >       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
> >       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
> > @@ -917,6 +1277,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> >  {
> >      VirtIOBlock *s = VIRTIO_BLK(vdev);
> >      BlockConf *conf = &s->conf.conf;
> > +    BlockDriverState *bs = blk_bs(s->blk);
> >      struct virtio_blk_config blkcfg;
> >      uint64_t capacity;
> >      int64_t length;
> > @@ -976,6 +1337,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> >          blkcfg.write_zeroes_may_unmap = 1;
> >          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
> >      }
> > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > +        switch (bs->bl.zoned) {
> > +        case BLK_Z_HM:
> > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> > +            break;
> > +        case BLK_Z_HA:
> > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +        }
> > +
> > +        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> > +                     bs->bl.zone_size / 512);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> > +                     bs->bl.max_active_zones);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> > +                     bs->bl.max_open_zones);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> > +                     bs->bl.max_append_sectors);
> > +    } else {
> > +        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> > +    }
> >      memcpy(config, &blkcfg, s->config_size);
> >  }
> >
> > @@ -1140,6 +1525,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
> >      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> >      VirtIOBlock *s = VIRTIO_BLK(dev);
> >      VirtIOBlkConf *conf = &s->conf;
> > +    BlockDriverState *bs = blk_bs(conf->conf.blk);
> >      Error *err = NULL;
> >      unsigned i;
> >
> > @@ -1185,6 +1571,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
> >          return;
> >      }
> >
> > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> > +        if (bs->bl.zoned == BLK_Z_HM) {
> > +            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
> > +        }
> > +    }
> > +
> >      if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
> >          (!conf->max_discard_sectors ||
> >           conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
> > --
> > 2.37.3
> >


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 2/2] virtio-blk: add zoned storage emulation for zoned devices
  2022-10-09  1:54     ` Sam Li
@ 2022-10-09  2:38       ` Sam Li
  0 siblings, 0 replies; 8+ messages in thread
From: Sam Li @ 2022-10-09  2:38 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: qemu-devel, dmitry.fomichev, damien.lemoal, qemu-block,
	Hanna Reitz, Kevin Wolf, Michael S. Tsirkin, hare

Sam Li <faithilikerun@gmail.com> 于2022年10月9日周日 09:54写道:
>
> Stefan Hajnoczi <stefanha@redhat.com> 于2022年10月6日周四 23:04写道:
> >
> > On Thu, Sep 29, 2022 at 05:48:21PM +0800, Sam Li wrote:
> > > This patch extends virtio-blk emulation to handle zoned device commands
> > > by calling the new block layer APIs to perform zoned device I/O on
> > > behalf of the guest. It supports Report Zone, four zone oparations (open,
> > > close, finish, reset), and Append Zone.
> > >
> > > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> > > support zoned block devices. Regular block devices(conventional zones)
> > > will not be set.
> > >
> > > The guest os having zoned device support can use blkzone(8) to test those
> > > commands. Furthermore, using zonefs to test zone append write is also
> > > supported.
> > >
> > > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > > ---
> > >  hw/block/virtio-blk.c | 393 ++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 393 insertions(+)
> > >
> > > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> > > index e9ba752f6b..1c2535bfeb 100644
> > > --- a/hw/block/virtio-blk.c
> > > +++ b/hw/block/virtio-blk.c
> > > @@ -26,6 +26,9 @@
> > >  #include "hw/virtio/virtio-blk.h"
> > >  #include "dataplane/virtio-blk.h"
> > >  #include "scsi/constants.h"
> > > +#if defined(CONFIG_BLKZONED)
> > > +#include <linux/blkzoned.h>
> > > +#endif
> >
> > Why is this Linux-specific header file included? The virtio-blk
> > emulation code should only use QEMU block layer APIs, not Linux APIs.
> >
> > >  #ifdef __linux__
> > >  # include <scsi/sg.h>
> > >  #endif
> > > @@ -46,6 +49,8 @@ static const VirtIOFeature feature_sizes[] = {
> > >       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
> > >      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
> > >       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> > > +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> > > +     .end = endof(struct virtio_blk_config, zoned)},
> > >      {}
> > >  };
> > >
> > > @@ -614,6 +619,340 @@ err:
> > >      return err_status;
> > >  }
> > >
> > > +typedef struct ZoneCmdData {
> > > +    VirtIOBlockReq *req;
> > > +    union {
> > > +        struct {
> > > +            unsigned int nr_zones;
> > > +            BlockZoneDescriptor *zones;
> > > +        } zone_report_data;
> > > +        struct {
> > > +            int64_t append_sector;
> > > +        } zone_append_data;
> > > +    };
> > > +} ZoneCmdData;
> > > +
> > > +/*
> > > + * check zoned_request: error checking before issuing requests. If all checks
> > > + * passed, return true.
> > > + * append: true if only zone append requests issued.
> > > + */
> > > +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> > > +                             bool append, uint8_t *status) {
> > > +    BlockDriverState *bs = blk_bs(s->blk);
> > > +    int index = offset / bs->bl.zone_size;
> >
> > This function doesn't check that offset+len is in the same zone as
> > offset. Maybe that's correct because some request types allow [offset,
> > offset+len) to cross zones?
>
> Yes, zone_mgmt requests should allow that.
>
> >
> > > +
> > > +    if (offset < 0 || offset + len > bs->bl.capacity) {
> >
> > Other cases that are not checked:
> > 1. len < 0
> > 2. offset >= bs->bl.capacity
> > 3. len > bs->bl.capacity - offset (catches integer overflow)
> >
> > It may be possible to combine these cases, but be careful about integer
> > overflow.
>
> Right. Combining above cases:
>
> if (offset < 0 || len < 0 || offset > cap - len)
>
> offset > cap - len can cover for  #2, #3 cases because any offset that
> is greater than cap-len is invalid must be also invalid when it's
> greater than cap.
>
> >
> > > +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +        return false;
> > > +    }
> > > +
> > > +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> > > +        *status = VIRTIO_BLK_S_UNSUPP;
> > > +        return false;
> > > +    }
> > > +
> > > +    if (append) {
> > > +        if ((offset % bs->bl.write_granularity) != 0) {
> > > +            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> > > +            return false;
> > > +        }
> > > +
> > > +        if (!BDRV_ZT_IS_SWR(bs->bl.wps->wp[index])) {
> > > +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +            return false;
> > > +        }
> >
> > Where does the virtio-blk zone spec say that only SWR zones allow zone
> > append commands? Should it work for SWP zones too?
>
> The spec says not. But it should work for SWP zones too. I'll change
> this to check conventional zones instead.
>
> +If the zone specified by the VIRTIO_BLK_T_ZONE_APPEND request is not
> a SWR zone,
> +then the request SHALL be completed with VIRTIO_BLK_S_ZONE_INVALID_CMD
> +\field{status}.
>
> >
> > > +
> > > +        if (len / 512 > bs->bl.max_append_sectors) {
> > > +            if (bs->bl.max_append_sectors == 0) {
> > > +                *status = VIRTIO_BLK_S_UNSUPP;
> > > +            } else {
> > > +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +            }
> > > +            return false;
> > > +        }
> > > +    }
> > > +    return true;
> > > +}
> > > +
> > > +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> > > +{
> > > +    ZoneCmdData *data = opaque;
> > > +    VirtIOBlockReq *req = data->req;
> > > +    VirtIOBlock *s = req->dev;
> > > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > > +    struct iovec *in_iov = req->elem.in_sg;
> > > +    unsigned in_num = req->elem.in_num;
> > > +    int64_t zrp_size, nz, n, j = 0;
> > > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > > +
> > > +    if (ret) {
> > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +        goto out;
> > > +    }
> > > +
> > > +    nz = data->zone_report_data.nr_zones;
> > > +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> > > +            .nr_zones = cpu_to_le64(nz),
> > > +    };
> > > +
> > > +    zrp_size = sizeof(struct virtio_blk_zone_report)
> > > +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> > > +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> > > +    if (n != sizeof(zrp_hdr)) {
> > > +        virtio_error(vdev, "Driver provided intput buffer that is too small!");
> > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +        goto out;
> > > +    }
> > > +
> > > +    for (size_t i = sizeof(zrp_hdr); i < zrp_size; i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> > > +        struct virtio_blk_zone_descriptor desc =
> > > +                (struct virtio_blk_zone_descriptor) {
> > > +                        .z_start = cpu_to_le64(data->zone_report_data.zones[j].start) >> BDRV_SECTOR_BITS,
> > > +                        .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap) >> BDRV_SECTOR_BITS,
> > > +                        .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp) >> BDRV_SECTOR_BITS,
> > > +                };
> > > +
> > > +        switch (data->zone_report_data.zones[j].type) {
> > > +        case BLK_ZT_CONV:
> > > +            desc.z_type = BLK_ZONE_TYPE_CONVENTIONAL;
> > > +            break;
> > > +        case BLK_ZT_SWR:
> > > +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_REQ;
> > > +            break;
> > > +        case BLK_ZT_SWP:
> > > +            desc.z_type = BLK_ZONE_TYPE_SEQWRITE_PREF;
> > > +            break;
> > > +        default:
> > > +            g_assert_not_reached();
> > > +        }
> > > +
> > > +        switch (data->zone_report_data.zones[j].cond) {
> > > +        case BLK_ZS_RDONLY:
> > > +            desc.z_state = BLK_ZONE_COND_READONLY;
> > > +            break;
> > > +        case BLK_ZS_OFFLINE:
> > > +            desc.z_state = BLK_ZONE_COND_OFFLINE;
> > > +            break;
> > > +        case BLK_ZS_EMPTY:
> > > +            desc.z_state = BLK_ZONE_COND_EMPTY;
> > > +            break;
> > > +        case BLK_ZS_CLOSED:
> > > +            desc.z_state = BLK_ZONE_COND_CLOSED;
> > > +            break;
> > > +        case BLK_ZS_FULL:
> > > +            desc.z_state = BLK_ZONE_COND_FULL;
> > > +            break;
> > > +        case BLK_ZS_EOPEN:
> > > +            desc.z_state = BLK_ZONE_COND_EXP_OPEN;
> > > +            break;
> > > +        case BLK_ZS_IOPEN:
> > > +            desc.z_state = BLK_ZONE_COND_IMP_OPEN;
> > > +            break;
> > > +        case BLK_ZS_NOT_WP:
> > > +            desc.z_state = BLK_ZONE_COND_NOT_WP;
> > > +            break;
> > > +        default:
> > > +            g_assert_not_reached();
> > > +            break;
> > > +        }
> > > +
> > > +        /* TODO: it takes O(n^2) time complexity. Optimizations required here. */
> > > +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> > > +        if (n != sizeof(desc)) {
> > > +            virtio_error(vdev, "Driver provided input buffer "
> > > +                               "for descriptors that is too small!");
> > > +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +            goto out;
> > > +        }
> > > +    }
> > > +    goto out;
> > > +
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    g_free(data->zone_report_data.zones);
> > > +    g_free(data);
> > > +}
> > > +
> > > +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) {
> > > +    VirtIOBlock *s = req->dev;
> > > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > +    unsigned int nr_zones;
> > > +    ZoneCmdData *data;
> > > +    int64_t zone_size, offset;
> > > +    uint8_t err_status;
> > > +
> > > +    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
> > > +            sizeof(struct virtio_blk_zone_report) +
> > > +            sizeof(struct virtio_blk_zone_descriptor)) {
> > > +        virtio_error(vdev, "in buffer too small for zone report");
> > > +        return -1;
> > > +    }
> > > +
> > > +    /* start byte offset of the zone report */
> > > +    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > > +        goto out;
> > > +    }
> > > +
> > > +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> > > +                sizeof(struct virtio_blk_zone_report)) /
> > > +               sizeof(struct virtio_blk_zone_descriptor);
> > > +
> > > +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> > > +    data = g_malloc(sizeof(ZoneCmdData));
> > > +    data->req = req;
> > > +    data->zone_report_data.nr_zones = nr_zones;
> > > +    data->zone_report_data.zones = g_malloc(zone_size),
> > > +
> > > +    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
> > > +                        data->zone_report_data.zones,
> > > +                        virtio_blk_zone_report_complete, data);
> > > +    return 0;
> > > +
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    return err_status;
> > > +}
> > > +
> > > +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) {
> > > +    ZoneCmdData *data = opaque;
> > > +    VirtIOBlockReq *req = data->req;
> > > +    VirtIOBlock *s = req->dev;
> > > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > > +
> > > +    if (ret) {
> > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +        goto out;
> > > +    }
> > > +    goto out;
> > > +
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    g_free(data);
> > > +}
> > > +
> > > +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) {
> > > +    VirtIOBlock *s = req->dev;
> > > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > +    BlockDriverState *bs = blk_bs(s->blk);
> > > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > > +    uint64_t len;
> > > +    uint32_t type;
> > > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > > +
> > > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > > +        goto out;
> > > +    }
> > > +
> > > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > > +    data->req = req;
> > > +
> > > +    type = virtio_ldl_p(vdev, &req->out.type);
> > > +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> > > +        /* Entire drive capacity */
> > > +        offset = 0;
> > > +        len = bs->bl.capacity;
> > > +    } else {
> > > +        if (bs->bl.zone_size * bs->bl.nr_zones == bs->bl.capacity) {
> > > +            len = bs->bl.zone_size;
> > > +        } else {
> > > +            /* when the SWR drive has one last small zone, calculate its len */
> > > +            len = bs->bl.capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
> > > +        }
> > > +        if (offset + len > bs->bl.capacity) {
> > > +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +            goto out;
> >
> > data is leaked here.
>
> Move the len part before check_zoned_request so data is not malloced
> until it passes checks. Avoid freeing memory here.
>
> >
> > > +        }
> > > +    }
> > > +
> > > +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> > > +                      virtio_blk_zone_mgmt_complete, data);
> > > +
> > > +    return 0;
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    return err_status;
> > > +}
> > > +
> > > +static void virtio_blk_zone_append_complete(void *opaque, int ret) {
> > > +    ZoneCmdData *data = opaque;
> > > +    VirtIOBlockReq *req = data->req;
> > > +    VirtIOBlock *s = req->dev;
> > > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > > +    int64_t append_sector, n;
> > > +    struct iovec *out_iov = req->elem.out_sg;
> > > +    unsigned out_num = req->elem.out_num;
> > > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > > +
> > > +    if (ret) {
> > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +        goto out;
> > > +    }
> > > +
> > > +    virtio_stl_p(vdev, &append_sector, data->zone_append_data.append_sector);
> > > +    n = iov_to_buf(out_iov, out_num, 0, &append_sector, sizeof(append_sector));
> >
> > out_iov contains the driver->device buffers. The device is only allowed
> > to read from out_iov, not write to it.
> >
> > The device->driver buffers are in in_iov.
> >
> > According to the spec the zone append in hdr looks like this:
> >
> >   struct {
> >       u8 status;
> >       u8 reserved[7];
> >       le64 append_sector;
> >   } virtio_blk_zone_append_inhdr;
> >
> > In virtio_blk_handle_request() we used iov_discard_back_undoable() to
> > take the last byte (the status field for non-zone append requests) from
> > in_iov[]. This is incorrect for zone append requests because they have
> > the larger struct zone_append_inhdr instead of struct
> > virtio_blk_inhdr.
> >
> > I think it might be time to stop using req->in in virtio-blk.c and
> > instead use iov_from_buf() to write the status byte. For zone append
> > requests we also need to write reserved[] and append_sector:
> >
> >   iov_discard_undo(&req->inhdr_undo);
> >   inhdr_len = is_zone_append ?
> >                sizeof(struct virtio_blk_zone_append_inhdr) :
> >                sizeof(struct virtio_blk_inhdr);
> >   iov_from_buf(req->elem.in_sg, req->elem.in_num,
> >                req->in_len - inhdr_len,
> >                &req->in, inhdr_len);
> >
> > where req->in changes to:
> >
> >   union {
> >       struct virtio_blk_inhdr inhdr;
> >       struct virtio_blk_zone_append_inhdr zone_append_inhdr;
> >   } in;
> >
> > Most requests will just use in.inhdr but zone append will fill out the
> > full in.zone_append_inhdr struct.
>
> I did some changes according to that but it didn't work for
> zone_append requests and broke other zonefs-tests too. I wonder how
> zone_append requests fill the status bit. I used the original way as
> virtio_blk_inhdr here because attempts to remove it failed :)
>
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 1c2535bfeb..3ecabc7fb3 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -54,6 +54,12 @@ static const VirtIOFeature feature_sizes[] = {
>      {}
>  };
>
> +typedef struct virtio_blk_zone_append_inhdr {
> +    uint8_t status;
> +    uint8_t reserved[7];
> +    int64_t append_sector;
> +} virtio_blk_zone_append_inhdr;
> +
>  static void virtio_blk_set_config_size(VirtIOBlock *s, uint64_t host_features)
>  {
>      s->config_size = MAX(VIRTIO_BLK_CFG_SIZE,
> @@ -82,11 +88,16 @@ static void virtio_blk_req_complete(VirtIOBlockReq
> *req, unsigned char status)
>  {
>      VirtIOBlock *s = req->dev;
>      VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
>
>      trace_virtio_blk_req_complete(vdev, req, status);
>
> -    stb_p(&req->in->status, status);
>      iov_discard_undo(&req->inhdr_undo);
> +    size_t inhdr_len = (type == VIRTIO_BLK_T_ZONE_APPEND) ?
> +
> sizeof(virtio_blk_zone_append_inhdr):sizeof(struct virtio_blk_inhdr);
> +    iov_from_buf(req->elem.in_sg, req->elem.in_num,
> +                 req->in_len - inhdr_len, &req->in, inhdr_len);
> +    stb_p(&req->in->status, status);
> +
>      iov_discard_undo(&req->outhdr_undo);
>      virtqueue_push(req->vq, &req->elem, req->in_len);
>      if (s->dataplane_started && !s->dataplane_disabled) {
>
> >
> > > +    if (n != sizeof(append_sector)) {
> > > +        virtio_error(vdev, "Driver provided input buffer less than size of "
> > > +                     "append_sector");
> > > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > > +        goto out;
> > > +    }
> > > +    goto out;
> > > +
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    g_free(data);
> > > +}
> > > +
> > > +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req) {
> > > +    VirtIOBlock *s = req->dev;
> > > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > +    uint64_t niov = req->elem.out_num;
> > > +    struct iovec *out_iov = req->elem.out_sg;
> > > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > > +
> > > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > > +    int64_t len = 0;
> > > +    for (int i = 1; i < niov; ++i) {
> > > +        len += out_iov[i].iov_len;
> >
> > Please pass in out_iov and out_num instead of using req->elem.out_sg and
> > req->elem.out_num. virtio_blk_handle_request() modifies the iovecs
> > pointed to by req->elem.out_sg using iov_discard_front_undoable() and it
> > is not safe to access req->elem.out_sg directly.
> >
> > Also, VIRTIO devices are not allowed to make assumptions about the iovec
> > layout. That means skipping the first iovec in the for loop violates the
> > spec. The driver could send struct virtio_blk_req as two or more iovecs
> > instead of putting it into just 1 iovec. This is why the device is not
> > allowed to assume out_iov[0] is struct virtio_blk_req.

Even though VIRTIO devices can't make such assumptions about the iovec
layout, zone_append in the block layer still has to account for the
struct virtio_blk_req header, because the iovecs holding the header are
in most cases not aligned to the block size, and zone_append writes
need to skip those iovecs. So the problem becomes determining how many
iovecs it would take to contain struct virtio_blk_req. Can we just
skip the first N iovecs whose sizes are smaller than / not aligned to
one block size, and write the following iovecs?

> >
> > The for loop can be replaced with:
> >
> >   len = iov_size(out_iov, out_num);
> >
> > and out_iov[1]/niov-1 can be replaced with just out_iov and out_num (if
> > you pass them in from virtio_blk_handle_request()).
>
> Thanks!
>
> >
> > > +    }
> > > +
> > > +    if (!check_zoned_request(s, offset, len, true, &err_status)) {
> > > +        goto out;
> > > +    }
> > > +
> > > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > > +    data->req = req;
> > > +    data->zone_append_data.append_sector = offset;
> > > +    qemu_iovec_init_external(&req->qiov, &out_iov[1], niov-1);
> > > +    blk_aio_zone_append(s->blk, &data->zone_append_data.append_sector, &req->qiov, 0,
> > > +                        virtio_blk_zone_append_complete, data);
> > > +    return 0;
> > > +
> > > +out:
> > > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > > +    virtio_blk_req_complete(req, err_status);
> > > +    virtio_blk_free_request(req);
> > > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > > +    return err_status;
> > > +}
> > > +
> > >  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> > >  {
> > >      uint32_t type;
> > > @@ -700,6 +1039,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> > >      case VIRTIO_BLK_T_FLUSH:
> > >          virtio_blk_handle_flush(req, mrb);
> > >          break;
> > > +    case VIRTIO_BLK_T_ZONE_REPORT:
> > > +        virtio_blk_handle_zone_report(req);
> > > +        break;
> > > +    case VIRTIO_BLK_T_ZONE_OPEN:
> > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> > > +        break;
> > > +    case VIRTIO_BLK_T_ZONE_CLOSE:
> > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> > > +        break;
> > > +    case VIRTIO_BLK_T_ZONE_FINISH:
> > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> > > +        break;
> > > +    case VIRTIO_BLK_T_ZONE_RESET:
> > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > > +        break;
> > > +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> > > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET_ALL);
> > > +        break;
> > >      case VIRTIO_BLK_T_SCSI_CMD:
> > >          virtio_blk_handle_scsi(req);
> > >          break;
> > > @@ -718,6 +1075,9 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> > >          virtio_blk_free_request(req);
> > >          break;
> > >      }
> > > +   case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
> >
> > Indentation is off. QEMU uses 4-space indentation.
> >
> > > +       virtio_blk_handle_zone_append(req);
> > > +       break;
> > >      /*
> > >       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
> > >       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
> > > @@ -917,6 +1277,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> > >  {
> > >      VirtIOBlock *s = VIRTIO_BLK(vdev);
> > >      BlockConf *conf = &s->conf.conf;
> > > +    BlockDriverState *bs = blk_bs(s->blk);
> > >      struct virtio_blk_config blkcfg;
> > >      uint64_t capacity;
> > >      int64_t length;
> > > @@ -976,6 +1337,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> > >          blkcfg.write_zeroes_may_unmap = 1;
> > >          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
> > >      }
> > > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > > +        switch (bs->bl.zoned) {
> > > +        case BLK_Z_HM:
> > > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> > > +            break;
> > > +        case BLK_Z_HA:
> > > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
> > > +            break;
> > > +        default:
> > > +            g_assert_not_reached();
> > > +        }
> > > +
> > > +        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> > > +                     bs->bl.zone_size / 512);
> > > +        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> > > +                     bs->bl.max_active_zones);
> > > +        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> > > +                     bs->bl.max_open_zones);
> > > +        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> > > +        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> > > +                     bs->bl.max_append_sectors);
> > > +    } else {
> > > +        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> > > +    }
> > >      memcpy(config, &blkcfg, s->config_size);
> > >  }
> > >
> > > @@ -1140,6 +1525,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
> > >      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > >      VirtIOBlock *s = VIRTIO_BLK(dev);
> > >      VirtIOBlkConf *conf = &s->conf;
> > > +    BlockDriverState *bs = blk_bs(conf->conf.blk);
> > >      Error *err = NULL;
> > >      unsigned i;
> > >
> > > @@ -1185,6 +1571,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
> > >          return;
> > >      }
> > >
> > > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > > +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> > > +        if (bs->bl.zoned == BLK_Z_HM) {
> > > +            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
> > > +        }
> > > +    }
> > > +
> > >      if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
> > >          (!conf->max_discard_sectors ||
> > >           conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
> > > --
> > > 2.37.3
> > >


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2022-10-09  2:39 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-29  9:48 [PATCH v2 0/2] Add zoned storage emulation to virtio-blk driver Sam Li
2022-09-29  9:48 ` [PATCH v2 1/2] include: update virtio_blk headers from Linux 5.19-rc2+ Sam Li
2022-10-06 12:54   ` Stefan Hajnoczi
2022-10-06 13:24   ` Peter Maydell
2022-09-29  9:48 ` [PATCH v2 2/2] virtio-blk: add zoned storage emulation for zoned devices Sam Li
2022-10-06 15:04   ` Stefan Hajnoczi
2022-10-09  1:54     ` Sam Li
2022-10-09  2:38       ` Sam Li

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.