All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC v5 0/3] Add zoned storage emulation to virtio-blk driver
@ 2022-12-19  8:16 Sam Li
  2022-12-19  8:16 ` [RFC v5 1/3] include: update virtio_blk headers Sam Li
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Sam Li @ 2022-12-19  8:16 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, Raphael Norwitz, Hanna Reitz, damien.lemoal, hare,
	qemu-block, Stefan Hajnoczi, Michael S. Tsirkin, dmitry.fomichev,
	Sam Li

Note: the virtio-blk headers isn't upstream in the kernel yet therefore
marked as an RFC. More information can be found here:
https://patchwork.kernel.org/project/linux-block/cover/20221030043545.974223-1-dmitry.fomichev@wdc.com/

v5:
- address Stefan's review comments
  * restore the way writing zone append result to buffer
  * fix error checking case and other errands

v4:
- change the way writing zone append request result to buffer
- change zone state, zone type value of virtio_blk_zone_descriptor
- add trace events for new zone APIs

v3:
- use qemuio_from_buffer to write status bit [Stefan]
- avoid using req->elem directly [Stefan]
- fix error checkings and memory leak [Stefan]

v2:
- change units of emulated zone op coresponding to block layer APIs
- modify error checking cases [Stefan, Damien]

v1:
- add zoned storage emulation

Sam Li (3):
  include: update virtio_blk headers
  virtio-blk: add zoned storage emulation for zoned devices
  virtio-blk: add some trace events for zoned emulation

 hw/block/trace-events                       |   7 +
 hw/block/virtio-blk-common.c                |   2 +
 hw/block/virtio-blk.c                       | 395 ++++++++++++++++++++
 include/standard-headers/linux/virtio_blk.h | 158 +++++++-
 4 files changed, 546 insertions(+), 16 deletions(-)

-- 
2.38.1



^ permalink raw reply	[flat|nested] 7+ messages in thread

* [RFC v5 1/3] include: update virtio_blk headers
  2022-12-19  8:16 [RFC v5 0/3] Add zoned storage emulation to virtio-blk driver Sam Li
@ 2022-12-19  8:16 ` Sam Li
  2022-12-19  8:16 ` [RFC v5 2/3] virtio-blk: add zoned storage emulation for zoned devices Sam Li
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Sam Li @ 2022-12-19  8:16 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, Raphael Norwitz, Hanna Reitz, damien.lemoal, hare,
	qemu-block, Stefan Hajnoczi, Michael S. Tsirkin, dmitry.fomichev,
	Sam Li

Use scripts/update-linux-headers.sh to update virtio-blk headers
from Dmitry's "virtio-blk:add support for zoned block devices"
Linux patches.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
---
 include/standard-headers/linux/virtio_blk.h | 158 ++++++++++++++++++--
 1 file changed, 142 insertions(+), 16 deletions(-)

diff --git a/include/standard-headers/linux/virtio_blk.h b/include/standard-headers/linux/virtio_blk.h
index 2dcc90826a..3744e4da1b 100644
--- a/include/standard-headers/linux/virtio_blk.h
+++ b/include/standard-headers/linux/virtio_blk.h
@@ -25,10 +25,10 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE. */
-#include "standard-headers/linux/types.h"
-#include "standard-headers/linux/virtio_ids.h"
-#include "standard-headers/linux/virtio_config.h"
-#include "standard-headers/linux/virtio_types.h"
+#include <linux/types.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_types.h>
 
 /* Feature bits */
 #define VIRTIO_BLK_F_SIZE_MAX	1	/* Indicates maximum segment size */
@@ -40,6 +40,8 @@
 #define VIRTIO_BLK_F_MQ		12	/* support more than one vq */
 #define VIRTIO_BLK_F_DISCARD	13	/* DISCARD is supported */
 #define VIRTIO_BLK_F_WRITE_ZEROES	14	/* WRITE ZEROES is supported */
+#define VIRTIO_BLK_F_SECURE_ERASE	16 /* Secure Erase is supported */
+#define VIRTIO_BLK_F_ZONED		17	/* Zoned block device */
 
 /* Legacy feature bits */
 #ifndef VIRTIO_BLK_NO_LEGACY
@@ -47,8 +49,10 @@
 #define VIRTIO_BLK_F_SCSI	7	/* Supports scsi command passthru */
 #define VIRTIO_BLK_F_FLUSH	9	/* Flush command supported */
 #define VIRTIO_BLK_F_CONFIG_WCE	11	/* Writeback mode available in config */
+#ifndef __KERNEL__
 /* Old (deprecated) name for VIRTIO_BLK_F_FLUSH. */
 #define VIRTIO_BLK_F_WCE VIRTIO_BLK_F_FLUSH
+#endif
 #endif /* !VIRTIO_BLK_NO_LEGACY */
 
 #define VIRTIO_BLK_ID_BYTES	20	/* ID string length */
@@ -63,8 +67,8 @@ struct virtio_blk_config {
 	/* geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */
 	struct virtio_blk_geometry {
 		__virtio16 cylinders;
-		uint8_t heads;
-		uint8_t sectors;
+		__u8 heads;
+		__u8 sectors;
 	} geometry;
 
 	/* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */
@@ -72,17 +76,17 @@ struct virtio_blk_config {
 
 	/* the next 4 entries are guarded by VIRTIO_BLK_F_TOPOLOGY  */
 	/* exponent for physical block per logical block. */
-	uint8_t physical_block_exp;
+	__u8 physical_block_exp;
 	/* alignment offset in logical blocks. */
-	uint8_t alignment_offset;
+	__u8 alignment_offset;
 	/* minimum I/O size without performance penalty in logical blocks. */
 	__virtio16 min_io_size;
 	/* optimal sustained I/O size in logical blocks. */
 	__virtio32 opt_io_size;
 
 	/* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */
-	uint8_t wce;
-	uint8_t unused;
+	__u8 wce;
+	__u8 unused;
 
 	/* number of vqs, only available when VIRTIO_BLK_F_MQ is set */
 	__virtio16 num_queues;
@@ -116,10 +120,35 @@ struct virtio_blk_config {
 	 * Set if a VIRTIO_BLK_T_WRITE_ZEROES request may result in the
 	 * deallocation of one or more of the sectors.
 	 */
-	uint8_t write_zeroes_may_unmap;
+	__u8 write_zeroes_may_unmap;
 
-	uint8_t unused1[3];
-} QEMU_PACKED;
+	__u8 unused1[3];
+
+	/* the next 3 entries are guarded by VIRTIO_BLK_F_SECURE_ERASE */
+	/*
+	 * The maximum secure erase sectors (in 512-byte sectors) for
+	 * one segment.
+	 */
+	__virtio32 max_secure_erase_sectors;
+	/*
+	 * The maximum number of secure erase segments in a
+	 * secure erase command.
+	 */
+	__virtio32 max_secure_erase_seg;
+	/* Secure erase commands must be aligned to this number of sectors. */
+	__virtio32 secure_erase_sector_alignment;
+
+	/* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */
+	struct virtio_blk_zoned_characteristics {
+		__virtio32 zone_sectors;
+		__virtio32 max_open_zones;
+		__virtio32 max_active_zones;
+		__virtio32 max_append_sectors;
+		__virtio32 write_granularity;
+		__u8 model;
+		__u8 unused2[3];
+	} zoned;
+} __attribute__((packed));
 
 /*
  * Command types
@@ -153,6 +182,30 @@ struct virtio_blk_config {
 /* Write zeroes command */
 #define VIRTIO_BLK_T_WRITE_ZEROES	13
 
+/* Secure erase command */
+#define VIRTIO_BLK_T_SECURE_ERASE	14
+
+/* Zone append command */
+#define VIRTIO_BLK_T_ZONE_APPEND    15
+
+/* Report zones command */
+#define VIRTIO_BLK_T_ZONE_REPORT    16
+
+/* Open zone command */
+#define VIRTIO_BLK_T_ZONE_OPEN      18
+
+/* Close zone command */
+#define VIRTIO_BLK_T_ZONE_CLOSE     20
+
+/* Finish zone command */
+#define VIRTIO_BLK_T_ZONE_FINISH    22
+
+/* Reset zone command */
+#define VIRTIO_BLK_T_ZONE_RESET     24
+
+/* Reset All zones command */
+#define VIRTIO_BLK_T_ZONE_RESET_ALL 26
+
 #ifndef VIRTIO_BLK_NO_LEGACY
 /* Barrier before this op. */
 #define VIRTIO_BLK_T_BARRIER	0x80000000
@@ -172,17 +225,83 @@ struct virtio_blk_outhdr {
 	__virtio64 sector;
 };
 
+/*
+ * Supported zoned device models.
+ */
+
+/* Regular block device */
+#define VIRTIO_BLK_Z_NONE      0
+/* Host-managed zoned device */
+#define VIRTIO_BLK_Z_HM        1
+/* Host-aware zoned device */
+#define VIRTIO_BLK_Z_HA        2
+
+/*
+ * Zone descriptor. A part of VIRTIO_BLK_T_ZONE_REPORT command reply.
+ */
+struct virtio_blk_zone_descriptor {
+	/* Zone capacity */
+	__virtio64 z_cap;
+	/* The starting sector of the zone */
+	__virtio64 z_start;
+	/* Zone write pointer position in sectors */
+	__virtio64 z_wp;
+	/* Zone type */
+	__u8 z_type;
+	/* Zone state */
+	__u8 z_state;
+	__u8 reserved[38];
+};
+
+struct virtio_blk_zone_report {
+	__virtio64 nr_zones;
+	__u8 reserved[56];
+	struct virtio_blk_zone_descriptor zones[];
+};
+
+/*
+ * Supported zone types.
+ */
+
+/* Conventional zone */
+#define VIRTIO_BLK_ZT_CONV         1
+/* Sequential Write Required zone */
+#define VIRTIO_BLK_ZT_SWR          2
+/* Sequential Write Preferred zone */
+#define VIRTIO_BLK_ZT_SWP          3
+
+/*
+ * Zone states that are available for zones of all types.
+ */
+
+/* Not a write pointer (conventional zones only) */
+#define VIRTIO_BLK_ZS_NOT_WP       0
+/* Empty */
+#define VIRTIO_BLK_ZS_EMPTY        1
+/* Implicitly Open */
+#define VIRTIO_BLK_ZS_IOPEN        2
+/* Explicitly Open */
+#define VIRTIO_BLK_ZS_EOPEN        3
+/* Closed */
+#define VIRTIO_BLK_ZS_CLOSED       4
+/* Read-Only */
+#define VIRTIO_BLK_ZS_RDONLY       13
+/* Full */
+#define VIRTIO_BLK_ZS_FULL         14
+/* Offline */
+#define VIRTIO_BLK_ZS_OFFLINE      15
+
 /* Unmap this range (only valid for write zeroes command) */
 #define VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP	0x00000001
 
 /* Discard/write zeroes range for each request. */
 struct virtio_blk_discard_write_zeroes {
 	/* discard/write zeroes start sector */
-	uint64_t sector;
+	__le64 sector;
 	/* number of discard/write zeroes sectors */
-	uint32_t num_sectors;
+	__le32 num_sectors;
 	/* flags for this range */
-	uint32_t flags;
+	__le32 flags;
 };
 
 #ifndef VIRTIO_BLK_NO_LEGACY
@@ -198,4 +317,11 @@ struct virtio_scsi_inhdr {
 #define VIRTIO_BLK_S_OK		0
 #define VIRTIO_BLK_S_IOERR	1
 #define VIRTIO_BLK_S_UNSUPP	2
+
+/* Error codes that are specific to zoned block devices */
+#define VIRTIO_BLK_S_ZONE_INVALID_CMD     3
+#define VIRTIO_BLK_S_ZONE_UNALIGNED_WP    4
+#define VIRTIO_BLK_S_ZONE_OPEN_RESOURCE   5
+#define VIRTIO_BLK_S_ZONE_ACTIVE_RESOURCE 6
+
 #endif /* _LINUX_VIRTIO_BLK_H */
-- 
2.38.1



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [RFC v5 2/3] virtio-blk: add zoned storage emulation for zoned devices
  2022-12-19  8:16 [RFC v5 0/3] Add zoned storage emulation to virtio-blk driver Sam Li
  2022-12-19  8:16 ` [RFC v5 1/3] include: update virtio_blk headers Sam Li
@ 2022-12-19  8:16 ` Sam Li
  2022-12-20 21:37   ` Stefan Hajnoczi
  2022-12-19  8:16 ` [RFC v5 3/3] virtio-blk: add some trace events for zoned emulation Sam Li
  2022-12-20 21:05 ` [RFC v5 0/3] Add zoned storage emulation to virtio-blk driver Stefan Hajnoczi
  3 siblings, 1 reply; 7+ messages in thread
From: Sam Li @ 2022-12-19  8:16 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, Raphael Norwitz, Hanna Reitz, damien.lemoal, hare,
	qemu-block, Stefan Hajnoczi, Michael S. Tsirkin, dmitry.fomichev,
	Sam Li

This patch extends virtio-blk emulation to handle zoned device commands
by calling the new block layer APIs to perform zoned device I/O on
behalf of the guest. It supports Report Zone, four zone oparations (open,
close, finish, reset), and Append Zone.

The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
support zoned block devices. Regular block devices(conventional zones)
will not be set.

The guest os can use blktests, fio to test those commands on zoned devices.
Furthermore, using zonefs to test zone append write is also supported.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 hw/block/virtio-blk-common.c |   2 +
 hw/block/virtio-blk.c        | 383 +++++++++++++++++++++++++++++++++++
 2 files changed, 385 insertions(+)

diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
index ac52d7c176..e2f8e2f6da 100644
--- a/hw/block/virtio-blk-common.c
+++ b/hw/block/virtio-blk-common.c
@@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
      .end = endof(struct virtio_blk_config, discard_sector_alignment)},
     {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
      .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
+    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
+     .end = endof(struct virtio_blk_config, zoned)},
     {}
 };
 
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index f717550fdc..2157157e7d 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -601,6 +601,331 @@ err:
     return err_status;
 }
 
+typedef struct ZoneCmdData {
+    VirtIOBlockReq *req;
+    union {
+        struct {
+            unsigned int nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report_data;
+        struct {
+            int64_t offset;
+        } zone_append_data;
+    };
+} ZoneCmdData;
+
+/*
+ * check zoned_request: error checking before issuing requests. If all checks
+ * passed, return true.
+ * append: true if only zone append requests issued.
+ */
+static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
+                             bool append, uint8_t *status) {
+    BlockDriverState *bs = blk_bs(s->blk);
+    int index = offset / bs->bl.zone_size;
+
+    if (offset < 0 || len < 0 || len > (bs->total_sectors << BDRV_SECTOR_BITS)
+        || offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) {
+        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        return false;
+    }
+
+    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
+        *status = VIRTIO_BLK_S_UNSUPP;
+        return false;
+    }
+
+    if (append) {
+        if (bs->bl.write_granularity) {
+            if ((offset % bs->bl.write_granularity) != 0) {
+                *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
+                return false;
+            }
+        }
+
+        if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
+            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            return false;
+        }
+
+        if (len / 512 > bs->bl.max_append_sectors) {
+            if (bs->bl.max_append_sectors == 0) {
+                *status = VIRTIO_BLK_S_UNSUPP;
+            } else {
+                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            }
+            return false;
+        }
+    }
+    return true;
+}
+
+static void virtio_blk_zone_report_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    struct iovec *in_iov = req->elem.in_sg;
+    unsigned in_num = req->elem.in_num;
+    int64_t zrp_size, n, j = 0;
+    int64_t nz = data->zone_report_data.nr_zones;
+    int8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
+            .nr_zones = cpu_to_le64(nz),
+    };
+    zrp_size = sizeof(struct virtio_blk_zone_report)
+               + sizeof(struct virtio_blk_zone_descriptor) * nz;
+    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
+    if (n != sizeof(zrp_hdr)) {
+        virtio_error(vdev, "Driver provided intput buffer that is too small!");
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
+        i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
+        struct virtio_blk_zone_descriptor desc =
+            (struct virtio_blk_zone_descriptor) {
+                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
+                    >> BDRV_SECTOR_BITS),
+                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
+                    >> BDRV_SECTOR_BITS),
+                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
+                    >> BDRV_SECTOR_BITS),
+        };
+
+        switch (data->zone_report_data.zones[j].type) {
+        case BLK_ZT_CONV:
+            desc.z_type = VIRTIO_BLK_ZT_CONV;
+            break;
+        case BLK_ZT_SWR:
+            desc.z_type = VIRTIO_BLK_ZT_SWR;
+            break;
+        case BLK_ZT_SWP:
+            desc.z_type = VIRTIO_BLK_ZT_SWP;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        switch (data->zone_report_data.zones[j].state) {
+        case BLK_ZS_RDONLY:
+            desc.z_state = VIRTIO_BLK_ZS_RDONLY;
+            break;
+        case BLK_ZS_OFFLINE:
+            desc.z_state = VIRTIO_BLK_ZS_OFFLINE;
+            break;
+        case BLK_ZS_EMPTY:
+            desc.z_state = VIRTIO_BLK_ZS_EMPTY;
+            break;
+        case BLK_ZS_CLOSED:
+            desc.z_state = VIRTIO_BLK_ZS_CLOSED;
+            break;
+        case BLK_ZS_FULL:
+            desc.z_state = VIRTIO_BLK_ZS_FULL;
+            break;
+        case BLK_ZS_EOPEN:
+            desc.z_state = VIRTIO_BLK_ZS_EOPEN;
+            break;
+        case BLK_ZS_IOPEN:
+            desc.z_state = VIRTIO_BLK_ZS_IOPEN;
+            break;
+        case BLK_ZS_NOT_WP:
+            desc.z_state = VIRTIO_BLK_ZS_NOT_WP;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        /* TODO: it takes O(n^2) time complexity. Optimizations required. */
+        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
+        if (n != sizeof(desc)) {
+            virtio_error(vdev, "Driver provided input buffer "
+                               "for descriptors that is too small!");
+            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        }
+    }
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data->zone_report_data.zones);
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_report(VirtIOBlockReq *req)
+{
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    unsigned int nr_zones;
+    ZoneCmdData *data;
+    int64_t zone_size, offset;
+    uint8_t err_status;
+
+    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
+            sizeof(struct virtio_blk_zone_report) +
+            sizeof(struct virtio_blk_zone_descriptor)) {
+        virtio_error(vdev, "in buffer too small for zone report");
+        return -1;
+    }
+
+    /* start byte offset of the zone report */
+    offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
+    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
+        goto out;
+    }
+    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
+                sizeof(struct virtio_blk_zone_report)) /
+               sizeof(struct virtio_blk_zone_descriptor);
+
+    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
+    data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->zone_report_data.nr_zones = nr_zones;
+    data->zone_report_data.zones = g_malloc(zone_size),
+
+    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
+                        data->zone_report_data.zones,
+                        virtio_blk_zone_report_complete, data);
+    return 0;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
+static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
+{
+    VirtIOBlockReq *req = opaque;
+    VirtIOBlock *s = req->dev;
+    int8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+    }
+
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+}
+
+static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
+{
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    BlockDriverState *bs = blk_bs(s->blk);
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
+    uint64_t len;
+    uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
+    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
+        /* Entire drive capacity */
+        offset = 0;
+        len = capacity;
+    } else {
+        if (bs->bl.zone_size > capacity - offset) {
+            /* The zoned device allows the last smaller zone. */
+            len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
+        } else {
+            len = bs->bl.zone_size;
+        }
+    }
+
+    if (!check_zoned_request(s, offset, len, false, &err_status)) {
+        goto out;
+    }
+
+    blk_aio_zone_mgmt(s->blk, op, offset, len,
+                      virtio_blk_zone_mgmt_complete, req);
+
+    return 0;
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
+static void virtio_blk_zone_append_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    int64_t append_sector, n;
+    struct iovec *in_iov = req->elem.in_sg;
+    unsigned in_num = req->elem.in_num;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    virtio_stl_p(vdev, &append_sector,
+                 data->zone_append_data.offset >> BDRV_SECTOR_BITS);
+    n = iov_from_buf(in_iov, in_num, 0, &append_sector, sizeof(append_sector));
+    if (n != sizeof(append_sector)) {
+        virtio_error(vdev, "Driver provided input buffer less than size of "
+                           "append_sector");
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
+                                         struct iovec *out_iov,
+                                         uint64_t niov) {
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
+    int64_t len = iov_size(out_iov, niov);
+
+    if (!check_zoned_request(s, offset, len, true, &err_status)) {
+        goto out;
+    }
+
+    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->zone_append_data.offset = offset;
+    qemu_iovec_init_external(&req->qiov, out_iov, niov);
+    blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
+                        virtio_blk_zone_append_complete, data);
+    return 0;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
 static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
     uint32_t type;
@@ -687,6 +1012,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     case VIRTIO_BLK_T_FLUSH:
         virtio_blk_handle_flush(req, mrb);
         break;
+    case VIRTIO_BLK_T_ZONE_REPORT:
+        virtio_blk_handle_zone_report(req);
+        break;
+    case VIRTIO_BLK_T_ZONE_OPEN:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
+        break;
+    case VIRTIO_BLK_T_ZONE_CLOSE:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
+        break;
+    case VIRTIO_BLK_T_ZONE_FINISH:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET_ALL:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
+        break;
     case VIRTIO_BLK_T_SCSI_CMD:
         virtio_blk_handle_scsi(req);
         break;
@@ -705,6 +1048,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         virtio_blk_free_request(req);
         break;
     }
+    case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
+        /*
+         * It is not safe to access req->elem.out_sg directly because it
+         * may be modified by virtio_blk_handle_request().
+         */
+        virtio_blk_handle_zone_append(req, out_iov, out_num);
+        break;
     /*
      * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
      * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
@@ -904,6 +1254,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     BlockConf *conf = &s->conf.conf;
+    BlockDriverState *bs = blk_bs(s->blk);
     struct virtio_blk_config blkcfg;
     uint64_t capacity;
     int64_t length;
@@ -963,6 +1314,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
         blkcfg.write_zeroes_may_unmap = 1;
         virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
     }
+    if (bs->bl.zoned != BLK_Z_NONE) {
+        switch (bs->bl.zoned) {
+        case BLK_Z_HM:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
+            break;
+        case BLK_Z_HA:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
+                     bs->bl.zone_size / 512);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
+                     bs->bl.max_active_zones);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
+                     bs->bl.max_open_zones);
+        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
+                     bs->bl.max_append_sectors);
+    } else {
+        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
+    }
     memcpy(config, &blkcfg, s->config_size);
 }
 
@@ -1127,6 +1502,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOBlock *s = VIRTIO_BLK(dev);
     VirtIOBlkConf *conf = &s->conf;
+    BlockDriverState *bs = blk_bs(conf->conf.blk);
     Error *err = NULL;
     unsigned i;
 
@@ -1172,6 +1548,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
+    if (bs->bl.zoned != BLK_Z_NONE) {
+        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
+        if (bs->bl.zoned == BLK_Z_HM) {
+            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
+        }
+    }
+
     if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
         (!conf->max_discard_sectors ||
          conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
-- 
2.38.1



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [RFC v5 3/3] virtio-blk: add some trace events for zoned emulation
  2022-12-19  8:16 [RFC v5 0/3] Add zoned storage emulation to virtio-blk driver Sam Li
  2022-12-19  8:16 ` [RFC v5 1/3] include: update virtio_blk headers Sam Li
  2022-12-19  8:16 ` [RFC v5 2/3] virtio-blk: add zoned storage emulation for zoned devices Sam Li
@ 2022-12-19  8:16 ` Sam Li
  2022-12-20 21:05 ` [RFC v5 0/3] Add zoned storage emulation to virtio-blk driver Stefan Hajnoczi
  3 siblings, 0 replies; 7+ messages in thread
From: Sam Li @ 2022-12-19  8:16 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, Raphael Norwitz, Hanna Reitz, damien.lemoal, hare,
	qemu-block, Stefan Hajnoczi, Michael S. Tsirkin, dmitry.fomichev,
	Sam Li

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/block/trace-events |  7 +++++++
 hw/block/virtio-blk.c | 12 ++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/hw/block/trace-events b/hw/block/trace-events
index 2c45a62bd5..34be8b9135 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -44,9 +44,16 @@ pflash_write_unknown(const char *name, uint8_t cmd) "%s: unknown command 0x%02x"
 # virtio-blk.c
 virtio_blk_req_complete(void *vdev, void *req, int status) "vdev %p req %p status %d"
 virtio_blk_rw_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d"
+virtio_blk_zone_report_complete(void *vdev, void *req, unsigned int nr_zones, int ret) "vdev %p req %p nr_zones %u ret %d"
+virtio_blk_zone_mgmt_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d"
+virtio_blk_zone_append_complete(void *vdev, void *req, int64_t sector, int ret) "vdev %p req %p, append sector 0x%" PRIx64 " ret %d"
 virtio_blk_handle_write(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu"
 virtio_blk_handle_read(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu"
 virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint64_t offset, size_t size, bool is_write) "vdev %p mrb %p start %d num_reqs %d offset %"PRIu64" size %zu is_write %d"
+virtio_blk_handle_zone_report(void *vdev, void *req, int64_t sector, unsigned int nr_zones) "vdev %p req %p sector 0x%" PRIx64 " nr_zones %u"
+virtio_blk_handle_zone_mgmt(void *vdev, void *req, uint8_t op, int64_t sector, int64_t len) "vdev %p req %p op 0x%x sector 0x%" PRIx64 " len 0x%" PRIx64 ""
+virtio_blk_handle_zone_reset_all(void *vdev, void *req, int64_t sector, int64_t len) "vdev %p req %p sector 0x%" PRIx64 " cap 0x%" PRIx64 ""
+virtio_blk_handle_zone_append(void *vdev, void *req, int64_t sector) "vdev %p req %p, append sector 0x%" PRIx64 ""
 
 # hd-geometry.c
 hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 2157157e7d..f79b5cbb59 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -672,6 +672,7 @@ static void virtio_blk_zone_report_complete(void *opaque, int ret)
     int64_t nz = data->zone_report_data.nr_zones;
     int8_t err_status = VIRTIO_BLK_S_OK;
 
+    trace_virtio_blk_zone_report_complete(vdev, req, nz, ret);
     if (ret) {
         err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
         goto out;
@@ -786,6 +787,8 @@ static int virtio_blk_handle_zone_report(VirtIOBlockReq *req)
     nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
                 sizeof(struct virtio_blk_zone_report)) /
                sizeof(struct virtio_blk_zone_descriptor);
+    trace_virtio_blk_handle_zone_report(vdev, req,
+                                        offset >> BDRV_SECTOR_BITS, nr_zones);
 
     zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
     data = g_malloc(sizeof(ZoneCmdData));
@@ -810,7 +813,9 @@ static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
 {
     VirtIOBlockReq *req = opaque;
     VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
     int8_t err_status = VIRTIO_BLK_S_OK;
+    trace_virtio_blk_zone_mgmt_complete(vdev, req,ret);
 
     if (ret) {
         err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
@@ -837,6 +842,8 @@ static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
         /* Entire drive capacity */
         offset = 0;
         len = capacity;
+        trace_virtio_blk_handle_zone_reset_all(vdev, req, 0,
+                                               bs->total_sectors);
     } else {
         if (bs->bl.zone_size > capacity - offset) {
             /* The zoned device allows the last smaller zone. */
@@ -844,6 +851,9 @@ static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
         } else {
             len = bs->bl.zone_size;
         }
+        trace_virtio_blk_handle_zone_mgmt(vdev, req, op,
+                                          offset >> BDRV_SECTOR_BITS,
+                                          len >> BDRV_SECTOR_BITS);
     }
 
     if (!check_zoned_request(s, offset, len, false, &err_status)) {
@@ -887,6 +897,7 @@ static void virtio_blk_zone_append_complete(void *opaque, int ret)
         err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
         goto out;
     }
+    trace_virtio_blk_zone_append_complete(vdev, req, append_sector, ret);
 
 out:
     aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
@@ -906,6 +917,7 @@ static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
     int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
     int64_t len = iov_size(out_iov, niov);
 
+    trace_virtio_blk_handle_zone_append(vdev, req, offset >> BDRV_SECTOR_BITS);
     if (!check_zoned_request(s, offset, len, true, &err_status)) {
         goto out;
     }
-- 
2.38.1



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC v5 0/3] Add zoned storage emulation to virtio-blk driver
  2022-12-19  8:16 [RFC v5 0/3] Add zoned storage emulation to virtio-blk driver Sam Li
                   ` (2 preceding siblings ...)
  2022-12-19  8:16 ` [RFC v5 3/3] virtio-blk: add some trace events for zoned emulation Sam Li
@ 2022-12-20 21:05 ` Stefan Hajnoczi
  3 siblings, 0 replies; 7+ messages in thread
From: Stefan Hajnoczi @ 2022-12-20 21:05 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, Kevin Wolf, Raphael Norwitz, Hanna Reitz,
	damien.lemoal, hare, qemu-block, Michael S. Tsirkin,
	dmitry.fomichev

[-- Attachment #1: Type: text/plain, Size: 593 bytes --]

On Mon, Dec 19, 2022 at 04:16:41PM +0800, Sam Li wrote:
> Note: the virtio-blk headers isn't upstream in the kernel yet therefore
> marked as an RFC. More information can be found here:
> https://patchwork.kernel.org/project/linux-block/cover/20221030043545.974223-1-dmitry.fomichev@wdc.com/

The VIRTIO spec changes have been merged. The Linux virtio_blk guest
driver patches are in Michael Tsirkin's vhost tree:
https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/tree/drivers/block/virtio_blk.c?h=vhost

The "RFC" can be dropped as soon as Linus merges the changes. Almost
there!

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC v5 2/3] virtio-blk: add zoned storage emulation for zoned devices
  2022-12-19  8:16 ` [RFC v5 2/3] virtio-blk: add zoned storage emulation for zoned devices Sam Li
@ 2022-12-20 21:37   ` Stefan Hajnoczi
  2022-12-21  4:06     ` Sam Li
  0 siblings, 1 reply; 7+ messages in thread
From: Stefan Hajnoczi @ 2022-12-20 21:37 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, Kevin Wolf, Raphael Norwitz, Hanna Reitz,
	damien.lemoal, hare, qemu-block, Michael S. Tsirkin,
	dmitry.fomichev

[-- Attachment #1: Type: text/plain, Size: 19187 bytes --]

On Mon, Dec 19, 2022 at 04:16:43PM +0800, Sam Li wrote:
> This patch extends virtio-blk emulation to handle zoned device commands
> by calling the new block layer APIs to perform zoned device I/O on
> behalf of the guest. It supports Report Zone, four zone oparations (open,
> close, finish, reset), and Append Zone.
> 
> The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> support zoned block devices. Regular block devices(conventional zones)
> will not be set.
> 
> The guest os can use blktests, fio to test those commands on zoned devices.
> Furthermore, using zonefs to test zone append write is also supported.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  hw/block/virtio-blk-common.c |   2 +
>  hw/block/virtio-blk.c        | 383 +++++++++++++++++++++++++++++++++++
>  2 files changed, 385 insertions(+)
> 
> diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
> index ac52d7c176..e2f8e2f6da 100644
> --- a/hw/block/virtio-blk-common.c
> +++ b/hw/block/virtio-blk-common.c
> @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
>       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
>      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
>       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> +     .end = endof(struct virtio_blk_config, zoned)},
>      {}
>  };
>  
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index f717550fdc..2157157e7d 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -601,6 +601,331 @@ err:
>      return err_status;
>  }
>  
> +typedef struct ZoneCmdData {
> +    VirtIOBlockReq *req;
> +    union {
> +        struct {
> +            unsigned int nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report_data;
> +        struct {
> +            int64_t offset;
> +        } zone_append_data;
> +    };
> +} ZoneCmdData;
> +
> +/*
> + * check zoned_request: error checking before issuing requests. If all checks
> + * passed, return true.
> + * append: true if only zone append requests issued.
> + */
> +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> +                             bool append, uint8_t *status) {
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    int index = offset / bs->bl.zone_size;

Is bs->bl.zone_size 0 when !virtio_has_feature(s->host_features,
VIRTIO_BLK_F_ZONED)?

In that case there is a division by zero and the QEMU process crashes
with a SIGFPE signal.

The VIRTIO_BLK_F_ZONED check needs to be performed before calculating
index.

> +
> +    if (offset < 0 || len < 0 || len > (bs->total_sectors << BDRV_SECTOR_BITS)
> +        || offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) {
> +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        return false;
> +    }
> +
> +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> +        *status = VIRTIO_BLK_S_UNSUPP;
> +        return false;
> +    }
> +
> +    if (append) {
> +        if (bs->bl.write_granularity) {
> +            if ((offset % bs->bl.write_granularity) != 0) {
> +                *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> +                return false;
> +            }
> +        }
> +
> +        if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
> +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            return false;
> +        }
> +
> +        if (len / 512 > bs->bl.max_append_sectors) {
> +            if (bs->bl.max_append_sectors == 0) {
> +                *status = VIRTIO_BLK_S_UNSUPP;
> +            } else {
> +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            }
> +            return false;
> +        }
> +    }
> +    return true;
> +}
> +
> +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> +{
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    struct iovec *in_iov = req->elem.in_sg;
> +    unsigned in_num = req->elem.in_num;

in_iov/in_num must be passed in from virtio_blk_handle_request() (e.g.
via ZoneCmdData). req->elem.in_num is incorrect after
iov_discard_back_undoable() when the guest driver submits a single-byte
iovec as the final element.

> +    int64_t zrp_size, n, j = 0;
> +    int64_t nz = data->zone_report_data.nr_zones;
> +    int8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (ret) {
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> +            .nr_zones = cpu_to_le64(nz),
> +    };
> +    zrp_size = sizeof(struct virtio_blk_zone_report)
> +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> +    if (n != sizeof(zrp_hdr)) {
> +        virtio_error(vdev, "Driver provided intput buffer that is too small!");

s/intput/input/

> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
> +        i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> +        struct virtio_blk_zone_descriptor desc =
> +            (struct virtio_blk_zone_descriptor) {
> +                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
> +                    >> BDRV_SECTOR_BITS),
> +                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
> +                    >> BDRV_SECTOR_BITS),
> +                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
> +                    >> BDRV_SECTOR_BITS),
> +        };
> +
> +        switch (data->zone_report_data.zones[j].type) {
> +        case BLK_ZT_CONV:
> +            desc.z_type = VIRTIO_BLK_ZT_CONV;
> +            break;
> +        case BLK_ZT_SWR:
> +            desc.z_type = VIRTIO_BLK_ZT_SWR;
> +            break;
> +        case BLK_ZT_SWP:
> +            desc.z_type = VIRTIO_BLK_ZT_SWP;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
> +
> +        switch (data->zone_report_data.zones[j].state) {
> +        case BLK_ZS_RDONLY:
> +            desc.z_state = VIRTIO_BLK_ZS_RDONLY;
> +            break;
> +        case BLK_ZS_OFFLINE:
> +            desc.z_state = VIRTIO_BLK_ZS_OFFLINE;
> +            break;
> +        case BLK_ZS_EMPTY:
> +            desc.z_state = VIRTIO_BLK_ZS_EMPTY;
> +            break;
> +        case BLK_ZS_CLOSED:
> +            desc.z_state = VIRTIO_BLK_ZS_CLOSED;
> +            break;
> +        case BLK_ZS_FULL:
> +            desc.z_state = VIRTIO_BLK_ZS_FULL;
> +            break;
> +        case BLK_ZS_EOPEN:
> +            desc.z_state = VIRTIO_BLK_ZS_EOPEN;
> +            break;
> +        case BLK_ZS_IOPEN:
> +            desc.z_state = VIRTIO_BLK_ZS_IOPEN;
> +            break;
> +        case BLK_ZS_NOT_WP:
> +            desc.z_state = VIRTIO_BLK_ZS_NOT_WP;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
> +
> +        /* TODO: it takes O(n^2) time complexity. Optimizations required. */
> +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> +        if (n != sizeof(desc)) {
> +            virtio_error(vdev, "Driver provided input buffer "
> +                               "for descriptors that is too small!");
> +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        }
> +    }
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data->zone_report_data.zones);
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req)
> +{
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    unsigned int nr_zones;
> +    ZoneCmdData *data;
> +    int64_t zone_size, offset;
> +    uint8_t err_status;
> +
> +    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
> +            sizeof(struct virtio_blk_zone_report) +
> +            sizeof(struct virtio_blk_zone_descriptor)) {
> +        virtio_error(vdev, "in buffer too small for zone report");
> +        return -1;
> +    }
> +
> +    /* start byte offset of the zone report */
> +    offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> +        goto out;
> +    }
> +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> +                sizeof(struct virtio_blk_zone_report)) /
> +               sizeof(struct virtio_blk_zone_descriptor);
> +
> +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> +    data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +    data->zone_report_data.nr_zones = nr_zones;
> +    data->zone_report_data.zones = g_malloc(zone_size),
> +
> +    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
> +                        data->zone_report_data.zones,
> +                        virtio_blk_zone_report_complete, data);
> +    return 0;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
> +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
> +{
> +    VirtIOBlockReq *req = opaque;
> +    VirtIOBlock *s = req->dev;
> +    int8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (ret) {
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +    }
> +
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +}
> +
> +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
> +{
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> +    uint64_t len;
> +    uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
> +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> +        /* Entire drive capacity */
> +        offset = 0;
> +        len = capacity;
> +    } else {
> +        if (bs->bl.zone_size > capacity - offset) {
> +            /* The zoned device allows the last smaller zone. */
> +            len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
> +        } else {
> +            len = bs->bl.zone_size;
> +        }
> +    }
> +
> +    if (!check_zoned_request(s, offset, len, false, &err_status)) {
> +        goto out;
> +    }
> +
> +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> +                      virtio_blk_zone_mgmt_complete, req);
> +
> +    return 0;
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
> +static void virtio_blk_zone_append_complete(void *opaque, int ret)
> +{
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    int64_t append_sector, n;
> +    struct iovec *in_iov = req->elem.in_sg;
> +    unsigned in_num = req->elem.in_num;

Same as for report zones. in_iov/in_num must be passed in from
virtio_blk_handle_request().

> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (ret) {
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    virtio_stl_p(vdev, &append_sector,
> +                 data->zone_append_data.offset >> BDRV_SECTOR_BITS);

This must be stq because offset is 64 bits.

> +    n = iov_from_buf(in_iov, in_num, 0, &append_sector, sizeof(append_sector));
> +    if (n != sizeof(append_sector)) {
> +        virtio_error(vdev, "Driver provided input buffer less than size of "
> +                           "append_sector");
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
> +                                         struct iovec *out_iov,
> +                                         uint64_t niov) {
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> +    int64_t len = iov_size(out_iov, niov);
> +
> +    if (!check_zoned_request(s, offset, len, true, &err_status)) {
> +        goto out;
> +    }
> +
> +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +    data->zone_append_data.offset = offset;
> +    qemu_iovec_init_external(&req->qiov, out_iov, niov);
> +    blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
> +                        virtio_blk_zone_append_complete, data);

Accounting is missing. Other I/O request types (read, write, flush) are
tracked (see BLOCK_ACCT_*). I think a new BLOCK_ACCT_APPEND enum
constant should be introduced.

This reminds me that QEMU's I/O throttling code doesn't know about
append operations either. Guests can bypass I/O throttling by submitting
append operations instead of writes. I think this can be left as a
separate task though. It's not directly related to the virtio-blk
emulation code.

> +    return 0;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
>  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>  {
>      uint32_t type;
> @@ -687,6 +1012,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>      case VIRTIO_BLK_T_FLUSH:
>          virtio_blk_handle_flush(req, mrb);
>          break;
> +    case VIRTIO_BLK_T_ZONE_REPORT:
> +        virtio_blk_handle_zone_report(req);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_OPEN:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_CLOSE:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_FINISH:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_RESET:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> +        break;
>      case VIRTIO_BLK_T_SCSI_CMD:
>          virtio_blk_handle_scsi(req);
>          break;
> @@ -705,6 +1048,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>          virtio_blk_free_request(req);
>          break;
>      }
> +    case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
> +        /*
> +         * It is not safe to access req->elem.out_sg directly because it
> +         * may be modified by virtio_blk_handle_request().
> +         */
> +        virtio_blk_handle_zone_append(req, out_iov, out_num);
> +        break;
>      /*
>       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
>       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
> @@ -904,6 +1254,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>  {
>      VirtIOBlock *s = VIRTIO_BLK(vdev);
>      BlockConf *conf = &s->conf.conf;
> +    BlockDriverState *bs = blk_bs(s->blk);
>      struct virtio_blk_config blkcfg;
>      uint64_t capacity;
>      int64_t length;
> @@ -963,6 +1314,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>          blkcfg.write_zeroes_may_unmap = 1;
>          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
>      }
> +    if (bs->bl.zoned != BLK_Z_NONE) {
> +        switch (bs->bl.zoned) {
> +        case BLK_Z_HM:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> +            break;
> +        case BLK_Z_HA:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
> +
> +        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> +                     bs->bl.zone_size / 512);
> +        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> +                     bs->bl.max_active_zones);
> +        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> +                     bs->bl.max_open_zones);
> +        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> +        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> +                     bs->bl.max_append_sectors);
> +    } else {
> +        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> +    }
>      memcpy(config, &blkcfg, s->config_size);
>  }
>  
> @@ -1127,6 +1502,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>      VirtIOBlock *s = VIRTIO_BLK(dev);
>      VirtIOBlkConf *conf = &s->conf;
> +    BlockDriverState *bs = blk_bs(conf->conf.blk);
>      Error *err = NULL;
>      unsigned i;
>  
> @@ -1172,6 +1548,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>          return;
>      }
>  
> +    if (bs->bl.zoned != BLK_Z_NONE) {
> +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> +        if (bs->bl.zoned == BLK_Z_HM) {
> +            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
> +        }
> +    }
> +
>      if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
>          (!conf->max_discard_sectors ||
>           conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
> -- 
> 2.38.1
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC v5 2/3] virtio-blk: add zoned storage emulation for zoned devices
  2022-12-20 21:37   ` Stefan Hajnoczi
@ 2022-12-21  4:06     ` Sam Li
  0 siblings, 0 replies; 7+ messages in thread
From: Sam Li @ 2022-12-21  4:06 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: qemu-devel, Kevin Wolf, Raphael Norwitz, Hanna Reitz,
	damien.lemoal, hare, qemu-block, Michael S. Tsirkin,
	dmitry.fomichev

Stefan Hajnoczi <stefanha@redhat.com> 于2022年12月21日周三 05:37写道:
>
> On Mon, Dec 19, 2022 at 04:16:43PM +0800, Sam Li wrote:
> > This patch extends virtio-blk emulation to handle zoned device commands
> > by calling the new block layer APIs to perform zoned device I/O on
> > behalf of the guest. It supports Report Zone, four zone oparations (open,
> > close, finish, reset), and Append Zone.
> >
> > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> > support zoned block devices. Regular block devices(conventional zones)
> > will not be set.
> >
> > The guest os can use blktests, fio to test those commands on zoned devices.
> > Furthermore, using zonefs to test zone append write is also supported.
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  hw/block/virtio-blk-common.c |   2 +
> >  hw/block/virtio-blk.c        | 383 +++++++++++++++++++++++++++++++++++
> >  2 files changed, 385 insertions(+)
> >
> > diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
> > index ac52d7c176..e2f8e2f6da 100644
> > --- a/hw/block/virtio-blk-common.c
> > +++ b/hw/block/virtio-blk-common.c
> > @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
> >       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
> >      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
> >       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> > +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> > +     .end = endof(struct virtio_blk_config, zoned)},
> >      {}
> >  };
> >
> > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> > index f717550fdc..2157157e7d 100644
> > --- a/hw/block/virtio-blk.c
> > +++ b/hw/block/virtio-blk.c
> > @@ -601,6 +601,331 @@ err:
> >      return err_status;
> >  }
> >
> > +typedef struct ZoneCmdData {
> > +    VirtIOBlockReq *req;
> > +    union {
> > +        struct {
> > +            unsigned int nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } zone_report_data;
> > +        struct {
> > +            int64_t offset;
> > +        } zone_append_data;
> > +    };
> > +} ZoneCmdData;
> > +
> > +/*
> > + * check zoned_request: error checking before issuing requests. If all checks
> > + * passed, return true.
> > + * append: true if only zone append requests issued.
> > + */
> > +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> > +                             bool append, uint8_t *status) {
> > +    BlockDriverState *bs = blk_bs(s->blk);
> > +    int index = offset / bs->bl.zone_size;
>
> Is bs->bl.zone_size 0 when !virtio_has_feature(s->host_features,
> VIRTIO_BLK_F_ZONED)?
>
> In that case there is a division by zero and the QEMU process crashes
> with a SIGFPE signal.
>
> The VIRTIO_BLK_F_ZONED check needs to be performed before calculating
> index.
>
> > +
> > +    if (offset < 0 || len < 0 || len > (bs->total_sectors << BDRV_SECTOR_BITS)
> > +        || offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) {
> > +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        return false;
> > +    }
> > +
> > +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> > +        *status = VIRTIO_BLK_S_UNSUPP;
> > +        return false;
> > +    }
> > +
> > +    if (append) {
> > +        if (bs->bl.write_granularity) {
> > +            if ((offset % bs->bl.write_granularity) != 0) {
> > +                *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> > +                return false;
> > +            }
> > +        }
> > +
> > +        if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
> > +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            return false;
> > +        }
> > +
> > +        if (len / 512 > bs->bl.max_append_sectors) {
> > +            if (bs->bl.max_append_sectors == 0) {
> > +                *status = VIRTIO_BLK_S_UNSUPP;
> > +            } else {
> > +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            }
> > +            return false;
> > +        }
> > +    }
> > +    return true;
> > +}
> > +
> > +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> > +{
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > +    struct iovec *in_iov = req->elem.in_sg;
> > +    unsigned in_num = req->elem.in_num;
>
> in_iov/in_num must be passed in from virtio_blk_handle_request() (e.g.
> via ZoneCmdData). req->elem.in_num is incorrect after
> iov_discard_back_undoable() when the guest driver submits a single-byte
> iovec as the final element.
>
> > +    int64_t zrp_size, n, j = 0;
> > +    int64_t nz = data->zone_report_data.nr_zones;
> > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> > +            .nr_zones = cpu_to_le64(nz),
> > +    };
> > +    zrp_size = sizeof(struct virtio_blk_zone_report)
> > +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> > +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> > +    if (n != sizeof(zrp_hdr)) {
> > +        virtio_error(vdev, "Driver provided intput buffer that is too small!");
>
> s/intput/input/
>
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
> > +        i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> > +        struct virtio_blk_zone_descriptor desc =
> > +            (struct virtio_blk_zone_descriptor) {
> > +                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
> > +                    >> BDRV_SECTOR_BITS),
> > +                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
> > +                    >> BDRV_SECTOR_BITS),
> > +                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
> > +                    >> BDRV_SECTOR_BITS),
> > +        };
> > +
> > +        switch (data->zone_report_data.zones[j].type) {
> > +        case BLK_ZT_CONV:
> > +            desc.z_type = VIRTIO_BLK_ZT_CONV;
> > +            break;
> > +        case BLK_ZT_SWR:
> > +            desc.z_type = VIRTIO_BLK_ZT_SWR;
> > +            break;
> > +        case BLK_ZT_SWP:
> > +            desc.z_type = VIRTIO_BLK_ZT_SWP;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +        }
> > +
> > +        switch (data->zone_report_data.zones[j].state) {
> > +        case BLK_ZS_RDONLY:
> > +            desc.z_state = VIRTIO_BLK_ZS_RDONLY;
> > +            break;
> > +        case BLK_ZS_OFFLINE:
> > +            desc.z_state = VIRTIO_BLK_ZS_OFFLINE;
> > +            break;
> > +        case BLK_ZS_EMPTY:
> > +            desc.z_state = VIRTIO_BLK_ZS_EMPTY;
> > +            break;
> > +        case BLK_ZS_CLOSED:
> > +            desc.z_state = VIRTIO_BLK_ZS_CLOSED;
> > +            break;
> > +        case BLK_ZS_FULL:
> > +            desc.z_state = VIRTIO_BLK_ZS_FULL;
> > +            break;
> > +        case BLK_ZS_EOPEN:
> > +            desc.z_state = VIRTIO_BLK_ZS_EOPEN;
> > +            break;
> > +        case BLK_ZS_IOPEN:
> > +            desc.z_state = VIRTIO_BLK_ZS_IOPEN;
> > +            break;
> > +        case BLK_ZS_NOT_WP:
> > +            desc.z_state = VIRTIO_BLK_ZS_NOT_WP;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +        }
> > +
> > +        /* TODO: it takes O(n^2) time complexity. Optimizations required. */
> > +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> > +        if (n != sizeof(desc)) {
> > +            virtio_error(vdev, "Driver provided input buffer "
> > +                               "for descriptors that is too small!");
> > +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        }
> > +    }
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data->zone_report_data.zones);
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req)
> > +{
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    unsigned int nr_zones;
> > +    ZoneCmdData *data;
> > +    int64_t zone_size, offset;
> > +    uint8_t err_status;
> > +
> > +    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
> > +            sizeof(struct virtio_blk_zone_report) +
> > +            sizeof(struct virtio_blk_zone_descriptor)) {
> > +        virtio_error(vdev, "in buffer too small for zone report");
> > +        return -1;
> > +    }
> > +
> > +    /* start byte offset of the zone report */
> > +    offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > +        goto out;
> > +    }
> > +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> > +                sizeof(struct virtio_blk_zone_report)) /
> > +               sizeof(struct virtio_blk_zone_descriptor);
> > +
> > +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> > +    data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +    data->zone_report_data.nr_zones = nr_zones;
> > +    data->zone_report_data.zones = g_malloc(zone_size),
> > +
> > +    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
> > +                        data->zone_report_data.zones,
> > +                        virtio_blk_zone_report_complete, data);
> > +    return 0;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> > +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
> > +{
> > +    VirtIOBlockReq *req = opaque;
> > +    VirtIOBlock *s = req->dev;
> > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +    }
> > +
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +}
> > +
> > +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
> > +{
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    BlockDriverState *bs = blk_bs(s->blk);
> > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> > +    uint64_t len;
> > +    uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
> > +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> > +        /* Entire drive capacity */
> > +        offset = 0;
> > +        len = capacity;
> > +    } else {
> > +        if (bs->bl.zone_size > capacity - offset) {
> > +            /* The zoned device allows the last smaller zone. */
> > +            len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
> > +        } else {
> > +            len = bs->bl.zone_size;
> > +        }
> > +    }
> > +
> > +    if (!check_zoned_request(s, offset, len, false, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> > +                      virtio_blk_zone_mgmt_complete, req);
> > +
> > +    return 0;
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> > +static void virtio_blk_zone_append_complete(void *opaque, int ret)
> > +{
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > +    int64_t append_sector, n;
> > +    struct iovec *in_iov = req->elem.in_sg;
> > +    unsigned in_num = req->elem.in_num;
>
> Same as for report zones. in_iov/in_num must be passed in from
> virtio_blk_handle_request().
>
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    virtio_stl_p(vdev, &append_sector,
> > +                 data->zone_append_data.offset >> BDRV_SECTOR_BITS);
>
> This must be stq because offset is 64 bits.

Will change it to virtio_ldq_p().

>
> > +    n = iov_from_buf(in_iov, in_num, 0, &append_sector, sizeof(append_sector));
> > +    if (n != sizeof(append_sector)) {
> > +        virtio_error(vdev, "Driver provided input buffer less than size of "
> > +                           "append_sector");
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
> > +                                         struct iovec *out_iov,
> > +                                         uint64_t niov) {
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> > +    int64_t len = iov_size(out_iov, niov);
> > +
> > +    if (!check_zoned_request(s, offset, len, true, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +    data->zone_append_data.offset = offset;
> > +    qemu_iovec_init_external(&req->qiov, out_iov, niov);
> > +    blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
> > +                        virtio_blk_zone_append_complete, data);
>
> Accounting is missing. Other I/O request types (read, write, flush) are
> tracked (see BLOCK_ACCT_*). I think a new BLOCK_ACCT_APPEND enum
> constant should be introduced.
>
> This reminds me that QEMU's I/O throttling code doesn't know about
> append operations either. Guests can bypass I/O throttling by submitting
> append operations instead of writes. I think this can be left as a
> separate task though. It's not directly related to the virtio-blk
> emulation code.

Thanks! I'll see to it.

>
> > +    return 0;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> >  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >  {
> >      uint32_t type;
> > @@ -687,6 +1012,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >      case VIRTIO_BLK_T_FLUSH:
> >          virtio_blk_handle_flush(req, mrb);
> >          break;
> > +    case VIRTIO_BLK_T_ZONE_REPORT:
> > +        virtio_blk_handle_zone_report(req);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_OPEN:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_CLOSE:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_FINISH:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_RESET:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > +        break;
> >      case VIRTIO_BLK_T_SCSI_CMD:
> >          virtio_blk_handle_scsi(req);
> >          break;
> > @@ -705,6 +1048,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >          virtio_blk_free_request(req);
> >          break;
> >      }
> > +    case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
> > +        /*
> > +         * It is not safe to access req->elem.out_sg directly because it
> > +         * may be modified by virtio_blk_handle_request().
> > +         */
> > +        virtio_blk_handle_zone_append(req, out_iov, out_num);
> > +        break;
> >      /*
> >       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
> >       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
> > @@ -904,6 +1254,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> >  {
> >      VirtIOBlock *s = VIRTIO_BLK(vdev);
> >      BlockConf *conf = &s->conf.conf;
> > +    BlockDriverState *bs = blk_bs(s->blk);
> >      struct virtio_blk_config blkcfg;
> >      uint64_t capacity;
> >      int64_t length;
> > @@ -963,6 +1314,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> >          blkcfg.write_zeroes_may_unmap = 1;
> >          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
> >      }
> > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > +        switch (bs->bl.zoned) {
> > +        case BLK_Z_HM:
> > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> > +            break;
> > +        case BLK_Z_HA:
> > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +        }
> > +
> > +        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> > +                     bs->bl.zone_size / 512);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> > +                     bs->bl.max_active_zones);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> > +                     bs->bl.max_open_zones);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> > +                     bs->bl.max_append_sectors);
> > +    } else {
> > +        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> > +    }
> >      memcpy(config, &blkcfg, s->config_size);
> >  }
> >
> > @@ -1127,6 +1502,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
> >      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> >      VirtIOBlock *s = VIRTIO_BLK(dev);
> >      VirtIOBlkConf *conf = &s->conf;
> > +    BlockDriverState *bs = blk_bs(conf->conf.blk);
> >      Error *err = NULL;
> >      unsigned i;
> >
> > @@ -1172,6 +1548,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
> >          return;
> >      }
> >
> > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> > +        if (bs->bl.zoned == BLK_Z_HM) {
> > +            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
> > +        }
> > +    }
> > +
> >      if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
> >          (!conf->max_discard_sectors ||
> >           conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
> > --
> > 2.38.1
> >


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-12-21  4:07 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-19  8:16 [RFC v5 0/3] Add zoned storage emulation to virtio-blk driver Sam Li
2022-12-19  8:16 ` [RFC v5 1/3] include: update virtio_blk headers Sam Li
2022-12-19  8:16 ` [RFC v5 2/3] virtio-blk: add zoned storage emulation for zoned devices Sam Li
2022-12-20 21:37   ` Stefan Hajnoczi
2022-12-21  4:06     ` Sam Li
2022-12-19  8:16 ` [RFC v5 3/3] virtio-blk: add some trace events for zoned emulation Sam Li
2022-12-20 21:05 ` [RFC v5 0/3] Add zoned storage emulation to virtio-blk driver Stefan Hajnoczi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.