* [RFC patch 0/1] block: vhost-blk backend
@ 2022-07-25 20:55 Andrey Zhadchenko via
  2022-07-25 20:55 ` [RFC PATCH 1/1] block: add " Andrey Zhadchenko via
                   ` (4 more replies)
  0 siblings, 5 replies; 19+ messages in thread
From: Andrey Zhadchenko via @ 2022-07-25 20:55 UTC (permalink / raw)
  To: qemu-block
  Cc: qemu-devel, kwolf, hreitz, mst, stefanha, den, andrey.zhadchenko

Although QEMU virtio-blk is quite fast, there is still some room for
improvement. Disk latency can be reduced if we handle virtio-blk requests
in the host kernel so we avoid a lot of syscalls and context switches.

The biggest disadvantage of this vhost-blk flavor is that it supports only
the raw format. Luckily Kirill Thai proposed a device mapper driver for the
QCOW2 format to attach files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html

Also, by using kernel modules we can bypass the iothread limitation and finally
scale block requests with CPUs for high-performance devices. This is planned
to be implemented in the next version.

Linux kernel module part:
https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadchenko@virtuozzo.com/

test setups and results:
fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
QEMU drive options: cache=none
filesystem: xfs

SSD:
               | randread, IOPS  | randwrite, IOPS |
Host           |      95.8k	 |	85.3k	   |
QEMU virtio    |      57.5k	 |	79.4k	   |
QEMU vhost-blk |      95.6k	 |	84.3k	   |

RAMDISK (vq == vcpu):
                 | randread, IOPS | randwrite, IOPS |
virtio, 1vcpu    |	123k	  |	 129k       |
virtio, 2vcpu    |	253k (??) |	 250k (??)  |
virtio, 4vcpu    |	158k	  |	 154k       |
vhost-blk, 1vcpu |	110k	  |	 113k       |
vhost-blk, 2vcpu |	247k	  |	 252k       |
vhost-blk, 4vcpu |	576k	  |	 567k       |

Andrey Zhadchenko (1):
  block: add vhost-blk backend

 configure                     |  13 ++
 hw/block/Kconfig              |   5 +
 hw/block/meson.build          |   1 +
 hw/block/vhost-blk.c          | 395 ++++++++++++++++++++++++++++++++++
 hw/virtio/meson.build         |   1 +
 hw/virtio/vhost-blk-pci.c     | 102 +++++++++
 include/hw/virtio/vhost-blk.h |  44 ++++
 linux-headers/linux/vhost.h   |   3 +
 8 files changed, 564 insertions(+)
 create mode 100644 hw/block/vhost-blk.c
 create mode 100644 hw/virtio/vhost-blk-pci.c
 create mode 100644 include/hw/virtio/vhost-blk.h

-- 
2.31.1



* [RFC PATCH 1/1] block: add vhost-blk backend
  2022-07-25 20:55 [RFC patch 0/1] block: vhost-blk backend Andrey Zhadchenko via
@ 2022-07-25 20:55 ` Andrey Zhadchenko via
  2022-10-04 18:45   ` Stefan Hajnoczi
  2022-07-26 13:51 ` [RFC patch 0/1] block: " Michael S. Tsirkin
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 19+ messages in thread
From: Andrey Zhadchenko via @ 2022-07-25 20:55 UTC (permalink / raw)
  To: qemu-block
  Cc: qemu-devel, kwolf, hreitz, mst, stefanha, den, andrey.zhadchenko

Although QEMU virtio is quite fast, there is still some room for
improvement. Disk latency can be reduced if we handle virtio-blk requests
in the host kernel instead of passing them to QEMU. The patch adds a vhost-blk
backend which sets up the vhost-blk kernel module to process requests.

test setup and results:
fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
QEMU drive options: cache=none
filesystem: xfs

SSD:
               | randread, IOPS  | randwrite, IOPS |
Host           |      95.8k	 |	85.3k	   |
QEMU virtio    |      57.5k	 |	79.4k	   |
QEMU vhost-blk |      95.6k	 |	84.3k	   |

RAMDISK (vq == vcpu):
                 | randread, IOPS | randwrite, IOPS |
virtio, 1vcpu    |	123k	  |	 129k       |
virtio, 2vcpu    |	253k (??) |	 250k (??)  |
virtio, 4vcpu    |	158k	  |	 154k       |
vhost-blk, 1vcpu |	110k	  |	 113k       |
vhost-blk, 2vcpu |	247k	  |	 252k       |
vhost-blk, 4vcpu |	576k	  |	 567k       |

Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
---
 hw/block/Kconfig              |   5 +
 hw/block/meson.build          |   4 +
 hw/block/vhost-blk.c          | 394 ++++++++++++++++++++++++++++++++++
 hw/virtio/meson.build         |   3 +
 hw/virtio/vhost-blk-pci.c     | 102 +++++++++
 include/hw/virtio/vhost-blk.h |  50 +++++
 meson.build                   |   5 +
 7 files changed, 563 insertions(+)
 create mode 100644 hw/block/vhost-blk.c
 create mode 100644 hw/virtio/vhost-blk-pci.c
 create mode 100644 include/hw/virtio/vhost-blk.h

diff --git a/hw/block/Kconfig b/hw/block/Kconfig
index 9e8f28f982..b4286ad10e 100644
--- a/hw/block/Kconfig
+++ b/hw/block/Kconfig
@@ -36,6 +36,11 @@ config VIRTIO_BLK
     default y
     depends on VIRTIO
 
+config VHOST_BLK
+    bool
+    default n
+    depends on VIRTIO && LINUX
+
 config VHOST_USER_BLK
     bool
     # Only PCI devices are provided for now
diff --git a/hw/block/meson.build b/hw/block/meson.build
index 2389326112..caf9bedff3 100644
--- a/hw/block/meson.build
+++ b/hw/block/meson.build
@@ -19,4 +19,8 @@ softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c'))
 specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
 specific_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk.c'))
 
+if have_vhost_blk
+  specific_ss.add(files('vhost-blk.c'))
+endif
+
 subdir('dataplane')
diff --git a/hw/block/vhost-blk.c b/hw/block/vhost-blk.c
new file mode 100644
index 0000000000..33d90af270
--- /dev/null
+++ b/hw/block/vhost-blk.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2022 Virtuozzo International GmbH.
+ * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
+ *
+ * vhost-blk is a host kernel accelerator for virtio-blk.
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qom/object.h"
+#include "hw/qdev-core.h"
+#include "hw/boards.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-blk.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-blk.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/virtio-pci.h"
+#include "sysemu/sysemu.h"
+#include "linux-headers/linux/vhost.h"
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+
+static int vhost_blk_start(VirtIODevice *vdev)
+{
+    VHostBlk *s = VHOST_BLK(vdev);
+    struct vhost_vring_file backend;
+    int ret, i;
+    int *fd = blk_bs(s->conf.conf.blk)->file->bs->opaque;
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+    if (!k->set_guest_notifiers) {
+        error_report("vhost-blk: binding does not support guest notifiers");
+        return -ENOSYS;
+    }
+
+    if (s->vhost_started) {
+        return 0;
+    }
+
+    if (ioctl(s->vhostfd, VHOST_SET_OWNER, NULL)) {
+        error_report("vhost-blk: unable to set owner");
+        return -ENOSYS;
+    }
+
+    ret = vhost_dev_enable_notifiers(&s->dev, vdev);
+    if (ret < 0) {
+        error_report("vhost-blk: unable to enable dev notifiers: %d", errno);
+        return ret;
+    }
+
+    s->dev.acked_features = vdev->guest_features & s->dev.backend_features;
+
+    ret = vhost_dev_start(&s->dev, vdev);
+    if (ret < 0) {
+        error_report("vhost-blk: unable to start vhost dev");
+        return ret;
+    }
+
+    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true);
+    if (ret < 0) {
+        error_report("vhost-blk: unable to bind guest notifiers");
+        goto out;
+    }
+
+    memset(&backend, 0, sizeof(backend));
+    backend.index = 0;
+    backend.fd = *fd;
+    if (ioctl(s->vhostfd, VHOST_BLK_SET_BACKEND, &backend)) {
+        error_report("vhost-blk: unable to set backend");
+        ret = -errno;
+        goto out;
+    }
+
+    for (i = 0; i < s->dev.nvqs; i++) {
+        vhost_virtqueue_mask(&s->dev, vdev, i, false);
+    }
+
+    event_notifier_set(virtio_queue_get_host_notifier(virtio_get_queue(vdev, 0)));
+
+    s->vhost_started = true;
+
+    return 0;
+
+out:
+    vhost_dev_stop(&s->dev, vdev);
+    return ret;
+
+}
+
+static void vhost_blk_stop(VirtIODevice *vdev)
+{
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    VHostBlk *s = VHOST_BLK(vdev);
+    int ret;
+
+    if (!s->vhost_started) {
+        return;
+    }
+
+    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
+    if (ret < 0) {
+        error_report("vhost-blk: unable to unbind guest notifiers");
+    }
+    vhost_dev_disable_notifiers(&s->dev, vdev);
+    vhost_dev_stop(&s->dev, vdev);
+
+    s->vhost_started = false;
+}
+
+static void vhost_blk_reset(VirtIODevice *vdev)
+{
+    VHostBlk *s = VHOST_BLK(vdev);
+    int ret;
+
+    vhost_blk_stop(vdev);
+    ret = ioctl(s->vhostfd, VHOST_RESET_OWNER, NULL);
+    if (ret && errno != EPERM) {
+            error_report("vhost-blk: failed to reset owner %d", errno);
+    }
+}
+
+static void vhost_blk_set_status(VirtIODevice *vdev, uint8_t status)
+{
+    if (status & (VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)) {
+        vhost_blk_stop(vdev);
+        return;
+    }
+
+    if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+        return;
+    }
+
+    if (vhost_blk_start(vdev)) {
+        error_report("vhost-blk: failed to start");
+    }
+}
+
+static void vhost_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+}
+
+static void vhost_blk_device_realize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostBlk *s = VHOST_BLK(vdev);
+    VhostBlkConf *conf = &s->conf;
+    int i, ret;
+
+    if (!conf->conf.blk) {
+        error_setg(errp, "vhost-blk: drive property not set");
+        return;
+    }
+
+    if (!blk_is_inserted(conf->conf.blk)) {
+        error_setg(errp, "vhost-blk: device needs media, but drive is empty");
+        return;
+    }
+
+    if (conf->num_queues == VHOST_BLK_AUTO_NUM_QUEUES) {
+        conf->num_queues = MIN(virtio_pci_optimal_num_queues(0),
+                               VHOST_BLK_MAX_QUEUES);
+    }
+
+    if (!conf->num_queues) {
+        error_setg(errp, "vhost-blk: num-queues property must be larger than 0");
+        return;
+    }
+
+    if (conf->queue_size <= 2) {
+        error_setg(errp, "vhost-blk: invalid queue-size property (%" PRIu16 "), "
+                   "must be > 2", conf->queue_size);
+        return;
+    }
+
+    if (!is_power_of_2(conf->queue_size) ||
+        conf->queue_size > VIRTQUEUE_MAX_SIZE) {
+        error_setg(errp, "vhost_blk: invalid queue-size property (%" PRIu16 "), "
+                   "must be a power of 2 (max %d)",
+                   conf->queue_size, VIRTQUEUE_MAX_SIZE);
+        return;
+    }
+
+    if (!blkconf_apply_backend_options(&conf->conf,
+                                       !blk_supports_write_perm(conf->conf.blk),
+                                       true, errp)) {
+        return;
+    }
+
+    if (!blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, errp)) {
+        return;
+    }
+
+    if (!blkconf_blocksizes(&conf->conf, errp)) {
+        return;
+    }
+
+    s->dev.nvqs = conf->num_queues;
+    s->dev.max_queues = conf->num_queues;
+    s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs);
+    s->dev.vq_index = 0;
+
+    virtio_init(vdev, VIRTIO_ID_BLOCK, sizeof(struct virtio_blk_config));
+
+    for (i = 0; i < conf->num_queues; i++) {
+        virtio_add_queue(vdev, conf->queue_size, vhost_blk_handle_output);
+    }
+
+    s->vhostfd = open("/dev/vhost-blk", O_RDWR);
+    if (s->vhostfd < 0) {
+        error_setg(errp, "vhost-blk: unable to open /dev/vhost-blk");
+        goto cleanup;
+    }
+
+    s->dev.acked_features = 0;
+    ret = ioctl(s->vhostfd, VHOST_GET_FEATURES, &s->dev.backend_features);
+    if (ret < 0) {
+        error_setg(errp, "vhost-blk: unable to get backend features");
+        goto cleanup;
+    }
+
+    ret = vhost_dev_init(&s->dev, (void *)((size_t)s->vhostfd),
+                         VHOST_BACKEND_TYPE_KERNEL, 0, false);
+    if (ret < 0) {
+        error_setg(errp, "vhost-blk: vhost initialization failed: %s",
+                strerror(-ret));
+        goto cleanup;
+    }
+
+    return;
+
+cleanup:
+    g_free(s->dev.vqs);
+    close(s->vhostfd);
+    for (i = 0; i < conf->num_queues; i++) {
+        virtio_del_queue(vdev, i);
+    }
+    virtio_cleanup(vdev);
+    return;
+}
+
+static void vhost_blk_device_unrealize(DeviceState *dev)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostBlk *s = VHOST_BLK(dev);
+
+    vhost_blk_set_status(vdev, 0);
+    vhost_dev_cleanup(&s->dev);
+    g_free(s->dev.vqs);
+    virtio_cleanup(vdev);
+}
+
+static const int user_feature_bits[] = {
+    VIRTIO_BLK_F_FLUSH,
+    VIRTIO_RING_F_INDIRECT_DESC,
+    VIRTIO_RING_F_EVENT_IDX,
+    VHOST_INVALID_FEATURE_BIT
+};
+
+
+static uint64_t vhost_blk_get_features(VirtIODevice *vdev,
+                                            uint64_t features,
+                                            Error **errp)
+{
+    VHostBlk *s = VHOST_BLK(vdev);
+    uint64_t res;
+
+    features |= s->host_features;
+
+    virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
+    virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
+    virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
+    virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
+    virtio_add_feature(&features, VIRTIO_BLK_F_SIZE_MAX);
+
+    virtio_add_feature(&features, VIRTIO_F_VERSION_1);
+
+    if (!blk_is_writable(s->conf.conf.blk)) {
+        virtio_add_feature(&features, VIRTIO_BLK_F_RO);
+    }
+
+    if (s->conf.num_queues > 1) {
+        virtio_add_feature(&features, VIRTIO_BLK_F_MQ);
+    }
+
+    res = vhost_get_features(&s->dev, user_feature_bits, features);
+
+    return res;
+}
+
+static void vhost_blk_update_config(VirtIODevice *vdev, uint8_t *config)
+{
+    VHostBlk *s = VHOST_BLK(vdev);
+    BlockConf *conf = &s->conf.conf;
+    struct virtio_blk_config blkcfg;
+    uint64_t capacity;
+    int64_t length;
+    int blk_size = conf->logical_block_size;
+
+    blk_get_geometry(s->conf.conf.blk, &capacity);
+    memset(&blkcfg, 0, sizeof(blkcfg));
+    virtio_stq_p(vdev, &blkcfg.capacity, capacity);
+    virtio_stl_p(vdev, &blkcfg.seg_max, s->conf.queue_size - 2);
+    virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
+    virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
+    blkcfg.geometry.heads = conf->heads;
+
+    length = blk_getlength(s->conf.conf.blk);
+    if (length > 0 && length / conf->heads / conf->secs % blk_size) {
+        unsigned short mask;
+
+        mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
+        blkcfg.geometry.sectors = conf->secs & ~mask;
+    } else {
+        blkcfg.geometry.sectors = conf->secs;
+    }
+
+    blkcfg.size_max = 0;
+    blkcfg.physical_block_exp = get_physical_block_exp(conf);
+    blkcfg.alignment_offset = 0;
+    virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
+
+    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
+}
+
+static Property vhost_blk_properties[] = {
+    DEFINE_BLOCK_PROPERTIES(VHostBlk, conf.conf),
+    DEFINE_PROP_UINT16("num-queues", VHostBlk, conf.num_queues,
+                       VHOST_BLK_AUTO_NUM_QUEUES),
+    DEFINE_PROP_UINT16("queue-size", VHostBlk, conf.queue_size, 256),
+/* Discard and write-zeroes not yet implemented in kernel module */
+    DEFINE_PROP_BIT64("discard", VHostBlk, host_features,
+                      VIRTIO_BLK_F_DISCARD, false),
+    DEFINE_PROP_BIT64("write-zeroes", VHostBlk, host_features,
+                      VIRTIO_BLK_F_WRITE_ZEROES, false),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static const VMStateDescription vmstate_vhost_blk = {
+    .name = "vhost-blk",
+    .minimum_version_id = 1,
+    .version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_VIRTIO_DEVICE,
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static void vhost_blk_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+    device_class_set_props(dc, vhost_blk_properties);
+    dc->vmsd = &vmstate_vhost_blk;
+    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+    vdc->realize = vhost_blk_device_realize;
+    vdc->unrealize = vhost_blk_device_unrealize;
+    vdc->get_config = vhost_blk_update_config;
+    vdc->get_features = vhost_blk_get_features;
+    vdc->set_status = vhost_blk_set_status;
+    vdc->reset = vhost_blk_reset;
+}
+
+static void vhost_blk_instance_init(Object *obj)
+{
+    VHostBlk *s = VHOST_BLK(obj);
+
+    device_add_bootindex_property(obj, &s->conf.conf.bootindex,
+                                  "bootindex", "/disk@0,0",
+                                  DEVICE(obj));
+}
+
+static const TypeInfo vhost_blk_info = {
+    .name = TYPE_VHOST_BLK,
+    .parent = TYPE_VIRTIO_DEVICE,
+    .instance_size = sizeof(VHostBlk),
+    .instance_init = vhost_blk_instance_init,
+    .class_init = vhost_blk_class_init,
+};
+
+static void virtio_register_types(void)
+{
+    type_register_static(&vhost_blk_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index 7e8877fd64..fb2c0e7242 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -40,6 +40,9 @@ virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng-
 virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_SCSI', if_true: files('vhost-user-scsi-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-pci.c'))
+if have_vhost_blk
+  virtio_ss.add(files('vhost-blk-pci.c'))
+endif
 
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_INPUT_HOST', if_true: files('virtio-input-host-pci.c'))
diff --git a/hw/virtio/vhost-blk-pci.c b/hw/virtio/vhost-blk-pci.c
new file mode 100644
index 0000000000..f3b6e112b4
--- /dev/null
+++ b/hw/virtio/vhost-blk-pci.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2022 Virtuozzo International GmbH.
+ * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
+ *
+ * vhost-blk PCI bindings
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "standard-headers/linux/virtio_pci.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/vhost-blk.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+typedef struct VHostBlkPCI VHostBlkPCI;
+
+/*
+ * vhost-blk-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VHOST_BLK_PCI "vhost-blk-pci-base"
+DECLARE_INSTANCE_CHECKER(VHostBlkPCI, VHOST_BLK_PCI,
+                         TYPE_VHOST_BLK_PCI)
+
+struct VHostBlkPCI {
+    VirtIOPCIProxy parent_obj;
+    VHostBlk vdev;
+};
+
+static Property vhost_blk_pci_properties[] = {
+    DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+    DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+                       DEV_NVECTORS_UNSPECIFIED),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+    VHostBlkPCI *dev = VHOST_BLK_PCI(vpci_dev);
+    DeviceState *vdev = DEVICE(&dev->vdev);
+
+    if (dev->vdev.conf.num_queues == VHOST_BLK_AUTO_NUM_QUEUES) {
+        dev->vdev.conf.num_queues = MIN(virtio_pci_optimal_num_queues(0),
+                                        VHOST_BLK_MAX_QUEUES);
+    }
+
+    if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+        vpci_dev->nvectors = dev->vdev.conf.num_queues + 1;
+    }
+
+    qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_blk_pci_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+    device_class_set_props(dc, vhost_blk_pci_properties);
+    k->realize = vhost_blk_pci_realize;
+    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK;
+    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+    pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
+}
+
+static void vhost_blk_pci_instance_init(Object *obj)
+{
+    VHostBlkPCI *dev = VHOST_BLK_PCI(obj);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VHOST_BLK);
+    object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+                              "bootindex");
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_blk_pci_info = {
+    .base_name               = TYPE_VHOST_BLK_PCI,
+    .generic_name            = "vhost-blk-pci",
+    .transitional_name       = "vhost-blk-pci-transitional",
+    .non_transitional_name   = "vhost-blk-pci-non-transitional",
+    .instance_size  = sizeof(VHostBlkPCI),
+    .instance_init  = vhost_blk_pci_instance_init,
+    .class_init     = vhost_blk_pci_class_init,
+};
+
+static void vhost_blk_pci_register(void)
+{
+    virtio_pci_types_register(&vhost_blk_pci_info);
+}
+
+type_init(vhost_blk_pci_register)
diff --git a/include/hw/virtio/vhost-blk.h b/include/hw/virtio/vhost-blk.h
new file mode 100644
index 0000000000..76ece4215d
--- /dev/null
+++ b/include/hw/virtio/vhost-blk.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2022 Virtuozzo International GmbH.
+ * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
+ *
+ * vhost-blk is a host kernel accelerator for virtio-blk.
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#ifndef VHOST_BLK_H
+#define VHOST_BLK_H
+
+#include "standard-headers/linux/virtio_blk.h"
+#include "hw/block/block.h"
+#include "hw/virtio/vhost.h"
+#include "sysemu/block-backend.h"
+
+#define TYPE_VHOST_BLK "vhost-blk"
+#define VHOST_BLK(obj) \
+        OBJECT_CHECK(VHostBlk, (obj), TYPE_VHOST_BLK)
+
+#define VHOST_BLK_AUTO_NUM_QUEUES UINT16_MAX
+#define VHOST_BLK_MAX_QUEUES 8
+
+/*
+ * Normally this should be visible from imported headers.
+ * Temporarily defined here to simplify development.
+ */
+#define VHOST_BLK_SET_BACKEND          _IOW(VHOST_VIRTIO, 0x90, \
+                                            struct vhost_vring_file)
+
+typedef struct VhostBlkConf {
+    BlockConf conf;
+    uint16_t num_queues;
+    uint16_t queue_size;
+} VhostBlkConf;
+
+typedef struct VHostBlk {
+    VirtIODevice parent_obj;
+    VhostBlkConf conf;
+    uint64_t host_features;
+    uint64_t decided_features;
+    struct virtio_blk_config blkcfg;
+    int vhostfd;
+    struct vhost_dev dev;
+    bool vhost_started;
+} VHostBlk;
+
+#endif
diff --git a/meson.build b/meson.build
index 8a8c415fc1..7c00a8ce89 100644
--- a/meson.build
+++ b/meson.build
@@ -336,6 +336,9 @@ have_vhost_kernel = get_option('vhost_kernel') \
 have_vhost_user_crypto = get_option('vhost_crypto') \
   .require(have_vhost_user,
            error_message: 'vhost-crypto requires vhost-user to be enabled').allowed()
+have_vhost_blk = get_option('vhost_blk') \
+  .require(targetos == 'linux',
+           error_message: 'vhost-blk is only available on Linux').allowed()
 
 have_vhost = have_vhost_user or have_vhost_vdpa or have_vhost_kernel
 
@@ -1814,6 +1817,7 @@ config_host_data.set('CONFIG_VHOST_KERNEL', have_vhost_kernel)
 config_host_data.set('CONFIG_VHOST_USER', have_vhost_user)
 config_host_data.set('CONFIG_VHOST_CRYPTO', have_vhost_user_crypto)
 config_host_data.set('CONFIG_VHOST_VDPA', have_vhost_vdpa)
+config_host_data.set('CONFIG_VHOST_BLK', have_vhost_blk)
 config_host_data.set('CONFIG_VMNET', vmnet.found())
 config_host_data.set('CONFIG_VHOST_USER_BLK_SERVER', have_vhost_user_blk_server)
 config_host_data.set('CONFIG_VDUSE_BLK_EXPORT', have_vduse_blk_export)
@@ -3756,6 +3760,7 @@ summary_info += {'vhost-user support': have_vhost_user}
 summary_info += {'vhost-user-crypto support': have_vhost_user_crypto}
 summary_info += {'vhost-user-blk server support': have_vhost_user_blk_server}
 summary_info += {'vhost-vdpa support': have_vhost_vdpa}
+summary_info += {'vhost-blk support': have_vhost_blk}
 summary_info += {'build guest agent': have_ga}
 summary(summary_info, bool_yn: true, section: 'Configurable features')
 
-- 
2.31.1



* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-07-25 20:55 [RFC patch 0/1] block: vhost-blk backend Andrey Zhadchenko via
  2022-07-25 20:55 ` [RFC PATCH 1/1] block: add " Andrey Zhadchenko via
@ 2022-07-26 13:51 ` Michael S. Tsirkin
  2022-07-26 14:15   ` Denis V. Lunev
  2022-10-04 18:13 ` Stefan Hajnoczi
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 19+ messages in thread
From: Michael S. Tsirkin @ 2022-07-26 13:51 UTC (permalink / raw)
  To: Andrey Zhadchenko; +Cc: qemu-block, qemu-devel, kwolf, hreitz, stefanha, den

On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
> Although QEMU virtio-blk is quite fast, there is still some room for
> improvement. Disk latency can be reduced if we handle virtio-blk requests
> in the host kernel so we avoid a lot of syscalls and context switches.
> 
> The biggest disadvantage of this vhost-blk flavor is that it supports only
> the raw format. Luckily Kirill Thai proposed a device mapper driver for the
> QCOW2 format to attach files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html

That one seems stalled. Do you plan to work on that too?

> Also, by using kernel modules we can bypass the iothread limitation and finally
> scale block requests with CPUs for high-performance devices. This is planned
> to be implemented in the next version.
> 
> Linux kernel module part:
> https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadchenko@virtuozzo.com/
> 
> test setups and results:
> fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
> QEMU drive options: cache=none
> filesystem: xfs
> 
> SSD:
>                | randread, IOPS  | randwrite, IOPS |
> Host           |      95.8k	 |	85.3k	   |
> QEMU virtio    |      57.5k	 |	79.4k	   |
> QEMU vhost-blk |      95.6k	 |	84.3k	   |
> 
> RAMDISK (vq == vcpu):
>                  | randread, IOPS | randwrite, IOPS |
> virtio, 1vcpu    |	123k	  |	 129k       |
> virtio, 2vcpu    |	253k (??) |	 250k (??)  |
> virtio, 4vcpu    |	158k	  |	 154k       |
> vhost-blk, 1vcpu |	110k	  |	 113k       |
> vhost-blk, 2vcpu |	247k	  |	 252k       |
> vhost-blk, 4vcpu |	576k	  |	 567k       |
> 
> Andrey Zhadchenko (1):
>   block: add vhost-blk backend


From the vhost/virtio side the patchset looks OK. But let's see what the
block devs think about it.


>  configure                     |  13 ++
>  hw/block/Kconfig              |   5 +
>  hw/block/meson.build          |   1 +
>  hw/block/vhost-blk.c          | 395 ++++++++++++++++++++++++++++++++++
>  hw/virtio/meson.build         |   1 +
>  hw/virtio/vhost-blk-pci.c     | 102 +++++++++
>  include/hw/virtio/vhost-blk.h |  44 ++++
>  linux-headers/linux/vhost.h   |   3 +
>  8 files changed, 564 insertions(+)
>  create mode 100644 hw/block/vhost-blk.c
>  create mode 100644 hw/virtio/vhost-blk-pci.c
>  create mode 100644 include/hw/virtio/vhost-blk.h
> 
> -- 
> 2.31.1



* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-07-26 13:51 ` [RFC patch 0/1] block: " Michael S. Tsirkin
@ 2022-07-26 14:15   ` Denis V. Lunev
  2022-07-27 13:06     ` Stefano Garzarella
  0 siblings, 1 reply; 19+ messages in thread
From: Denis V. Lunev @ 2022-07-26 14:15 UTC (permalink / raw)
  To: Michael S. Tsirkin, Andrey Zhadchenko
  Cc: qemu-block, qemu-devel, kwolf, hreitz, stefanha

On 26.07.2022 15:51, Michael S. Tsirkin wrote:
> On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
>> Although QEMU virtio-blk is quite fast, there is still some room for
>> improvement. Disk latency can be reduced if we handle virtio-blk requests
>> in the host kernel so we avoid a lot of syscalls and context switches.
>>
>> The biggest disadvantage of this vhost-blk flavor is that it supports only
>> the raw format. Luckily Kirill Thai proposed a device mapper driver for the
>> QCOW2 format to attach files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
> That one seems stalled. Do you plan to work on that too?
We have to. The difference in the numbers, as you can see below, is much
too large. We have been waiting for this patch to be sent to keep pushing.

It should be noted that perhaps a talk at OSS this year could also push a bit.

Den


>> Also, by using kernel modules we can bypass the iothread limitation and finally
>> scale block requests with CPUs for high-performance devices. This is planned
>> to be implemented in the next version.
>>
>> Linux kernel module part:
>> https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadchenko@virtuozzo.com/
>>
>> test setups and results:
>> fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
>> QEMU drive options: cache=none
>> filesystem: xfs
>>
>> SSD:
>>                 | randread, IOPS  | randwrite, IOPS |
>> Host           |      95.8k	 |	85.3k	   |
>> QEMU virtio    |      57.5k	 |	79.4k	   |
>> QEMU vhost-blk |      95.6k	 |	84.3k	   |
>>
>> RAMDISK (vq == vcpu):
>>                   | randread, IOPS | randwrite, IOPS |
>> virtio, 1vcpu    |	123k	  |	 129k       |
>> virtio, 2vcpu    |	253k (??) |	 250k (??)  |
>> virtio, 4vcpu    |	158k	  |	 154k       |
>> vhost-blk, 1vcpu |	110k	  |	 113k       |
>> vhost-blk, 2vcpu |	247k	  |	 252k       |
>> vhost-blk, 4vcpu |	576k	  |	 567k       |
>>
>> Andrey Zhadchenko (1):
>>    block: add vhost-blk backend
>
>  From the vhost/virtio side the patchset looks OK. But let's see what the
> block devs think about it.
>
>
>>   configure                     |  13 ++
>>   hw/block/Kconfig              |   5 +
>>   hw/block/meson.build          |   1 +
>>   hw/block/vhost-blk.c          | 395 ++++++++++++++++++++++++++++++++++
>>   hw/virtio/meson.build         |   1 +
>>   hw/virtio/vhost-blk-pci.c     | 102 +++++++++
>>   include/hw/virtio/vhost-blk.h |  44 ++++
>>   linux-headers/linux/vhost.h   |   3 +
>>   8 files changed, 564 insertions(+)
>>   create mode 100644 hw/block/vhost-blk.c
>>   create mode 100644 hw/virtio/vhost-blk-pci.c
>>   create mode 100644 include/hw/virtio/vhost-blk.h
>>
>> -- 
>> 2.31.1



* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-07-26 14:15   ` Denis V. Lunev
@ 2022-07-27 13:06     ` Stefano Garzarella
  2022-07-28  5:28       ` Andrey Zhadchenko
  0 siblings, 1 reply; 19+ messages in thread
From: Stefano Garzarella @ 2022-07-27 13:06 UTC (permalink / raw)
  To: Denis V. Lunev
  Cc: Michael S. Tsirkin, Andrey Zhadchenko, qemu-block, qemu-devel,
	kwolf, hreitz, stefanha

On Tue, Jul 26, 2022 at 04:15:48PM +0200, Denis V. Lunev wrote:
>On 26.07.2022 15:51, Michael S. Tsirkin wrote:
>>On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
>>>Although QEMU virtio-blk is quite fast, there is still some room for
>>>improvement. Disk latency can be reduced if we handle virtio-blk requests
>>>in the host kernel so we avoid a lot of syscalls and context switches.
>>>
>>>The biggest disadvantage of this vhost-blk flavor is that it supports only
>>>the raw format. Luckily Kirill Thai proposed a device mapper driver for the
>>>QCOW2 format to attach files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
>>That one seems stalled. Do you plan to work on that too?
>We have to. The difference in the numbers, as you can see below, is much
>too large. We have been waiting for this patch to be sent to keep pushing.
>
>It should be noted that perhaps a talk at OSS this year could also push a bit.

Cool, the results are similar to what I saw when I compared vhost-blk 
and io_uring passthrough with NVMe (Slide 7 here: [1]).

About QEMU block layer support, we recently started to work on libblkio 
[2]. Stefan also sent an RFC [3] to implement the QEMU BlockDriver.
Currently it supports virtio-blk devices using vhost-vdpa and 
vhost-user.
We could add support for vhost (kernel) as well, though we were 
thinking of leveraging vDPA to implement an in-kernel software device 
instead.

That way we could reuse a lot of the code to support both hardware and 
software accelerators.

In the talk [1] I describe the idea a little bit, and a few months ago I 
did a PoC (unsubmitted RFC) to see if it was feasible and the numbers 
were in line with vhost-blk.

Do you think we could join forces and just have an in-kernel vdpa-blk 
software device?

Of course we could have both vhost-blk and vdpa-blk, but with vDPA 
perhaps we can have one software stack to maintain for both HW and 
software accelerators.

Thanks,
Stefano

[1] 
https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-software-offload-for-virtio-blk-stefano-garzarella-red-hat
[2] https://gitlab.com/libblkio/libblkio
[3] 
https://lore.kernel.org/qemu-devel/20220708041737.1768521-1-stefanha@redhat.com/



* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-07-27 13:06     ` Stefano Garzarella
@ 2022-07-28  5:28       ` Andrey Zhadchenko
  2022-07-28 15:40         ` Stefano Garzarella
  0 siblings, 1 reply; 19+ messages in thread
From: Andrey Zhadchenko @ 2022-07-28  5:28 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: Michael S. Tsirkin, qemu-block, qemu-devel, kwolf, hreitz, stefanha, den


On 7/27/22 16:06, Stefano Garzarella wrote:
> On Tue, Jul 26, 2022 at 04:15:48PM +0200, Denis V. Lunev wrote:
>> On 26.07.2022 15:51, Michael S. Tsirkin wrote:
>>> On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
>>>> Although QEMU virtio-blk is quite fast, there is still some room for
>>>> improvement. Disk latency can be reduced if we handle virtio-blk
>>>> requests in the host kernel so we avoid a lot of syscalls and context
>>>> switches.
>>>>
>>>> The biggest disadvantage of this vhost-blk flavor is that it supports
>>>> only the raw format. Luckily Kirill Thai proposed a device mapper driver
>>>> for the QCOW2 format to attach files as block devices:
>>>> https://www.spinics.net/lists/kernel/msg4292965.html
>>> That one seems stalled. Do you plan to work on that too?
>> We have to. The difference in the numbers, as you can see below, is much
>> too large. We have been waiting for this patch to be sent to keep pushing.
>>
>> It should be noted that perhaps a talk at OSS this year could also push a
>> bit.
> 
> Cool, the results are similar to what I saw when I compared vhost-blk 
> and io_uring passthrough with NVMe (Slide 7 here: [1]).
> 
> About QEMU block layer support, we recently started to work on libblkio 
> [2]. Stefan also sent an RFC [3] to implement the QEMU BlockDriver.
> Currently it supports virtio-blk devices using vhost-vdpa and vhost-user.
> We could add support for vhost (kernel) as well, though we were 
> thinking of leveraging vDPA to implement an in-kernel software device instead.
> 
> That way we could reuse a lot of the code to support both hardware and 
> software accelerators.
> 
> In the talk [1] I describe the idea a little bit, and a few months ago I 
> did a PoC (unsubmitted RFC) to see if it was feasible and the numbers 
> were in line with vhost-blk.
> 
> Do you think we could join forces and just have an in-kernel vdpa-blk 
> software device?

This seems worth trying. Why double the efforts to do the same? Yet I 
would like to play a bit with your vdpa-blk PoC beforehand. Can you send 
it to me with some instructions on how to run it?

> 
> Of course we could have both vhost-blk and vdpa-blk, but with vDPA 
> perhaps we can have one software stack to maintain for both HW and 
> software accelerators.
> 
> Thanks,
> Stefano
> 
> [1] 
> https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-software-offload-for-virtio-blk-stefano-garzarella-red-hat 
> 
> [2] https://gitlab.com/libblkio/libblkio
> [3] 
> https://lore.kernel.org/qemu-devel/20220708041737.1768521-1-stefanha@redhat.com/ 
> 
> 


* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-07-28  5:28       ` Andrey Zhadchenko
@ 2022-07-28 15:40         ` Stefano Garzarella
  0 siblings, 0 replies; 19+ messages in thread
From: Stefano Garzarella @ 2022-07-28 15:40 UTC (permalink / raw)
  To: Andrey Zhadchenko
  Cc: Michael S. Tsirkin, qemu-block, qemu-devel, kwolf, hreitz, stefanha, den

On Thu, Jul 28, 2022 at 7:28 AM Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com> wrote:
> On 7/27/22 16:06, Stefano Garzarella wrote:
> > On Tue, Jul 26, 2022 at 04:15:48PM +0200, Denis V. Lunev wrote:
> >> On 26.07.2022 15:51, Michael S. Tsirkin wrote:
> >>> On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
> >>>> Although QEMU virtio-blk is quite fast, there is still some room for
> >>>> improvement. Disk latency can be reduced if we handle virtio-blk
> >>>> requests in the host kernel so we avoid a lot of syscalls and context
> >>>> switches.
> >>>>
> >>>> The biggest disadvantage of this vhost-blk flavor is that it supports
> >>>> only the raw format. Luckily Kirill Thai proposed a device mapper
> >>>> driver for the QCOW2 format to attach files as block devices:
> >>>> https://www.spinics.net/lists/kernel/msg4292965.html
> >>> That one seems stalled. Do you plan to work on that too?
> >> We have to. The difference in the numbers, as you can see below, is much
> >> too large. We have been waiting for this patch to be sent to keep pushing.
> >>
> >> It should be noted that perhaps a talk at OSS this year could also push a
> >> bit.
> >
> > Cool, the results are similar to what I saw when I compared vhost-blk
> > and io_uring passthrough with NVMe (Slide 7 here: [1]).
> >
> > About QEMU block layer support, we recently started to work on libblkio
> > [2]. Stefan also sent an RFC [3] to implement the QEMU BlockDriver.
> > Currently it supports virtio-blk devices using vhost-vdpa and vhost-user.
> > We could add support for vhost (kernel) as well, though we were
> > thinking of leveraging vDPA to implement an in-kernel software device instead.
> >
> > That way we could reuse a lot of the code to support both hardware and
> > software accelerators.
> >
> > In the talk [1] I describe the idea a little bit, and a few months ago I
> > did a PoC (unsubmitted RFC) to see if it was feasible and the numbers
> > were in line with vhost-blk.
> >
> > Do you think we could join forces and just have an in-kernel vdpa-blk
> > software device?
>
> This seems worth trying. Why double the efforts to do the same? Yet I
> would like to play a bit with your vdpa-blk PoC beforehand.

Great :-)

> Can you send it to me with some instructions on how to run it?

Yep, sure!

The PoC is available here: 
https://gitlab.com/sgarzarella/linux/-/tree/vdpa-sw-blk-poc

The tree was based on Linux v5.16, but I had some issues rebuilding with 
a newer gcc, so I rebased on v5.16.20 (not tested). Configs needed:
CONFIG_VDPA_SW_BLOCK=m + CONFIG_VHOST_VDPA=m + dependencies.

It contains:
  - patches required for QEMU generic vhost-vdpa support
  - patches to support blk_mq_ops->poll() (to use io_uring iopoll) in
    the guest virtio-blk driver (I used the same kernel on guest and
    host)
  - some improvements for vringh (not completed, it could be a
    bottleneck)
  - vdpa-sw and vdpa-sw-blk patches (and hacks)

It is based on the vDPA simulator framework already merged upstream. The 
idea is to generalize the simulator to share the code between both 
software devices and simulators. The code needs a lot of work; I was 
focusing just on a working virtio-blk device emulation, but more work 
on the generic part is still needed.
In the code there are a couple of defines to control polling.

About the vdpa-blk device, you need iproute2's vdpa tool available 
upstream:
  https://wiki.linuxfoundation.org/networking/iproute2

Once the device is instantiated (see instructions later), the backend 
(raw file or device) can be set through a device attribute (not robust, 
but it was a PoC): /sys/bus/vdpa/devices/$dev_name/backend_fd

I wrote a simple python script available here: 
https://github.com/stefano-garzarella/vm-build/blob/main/vm-tools/vdpa_set_backend_fd.py

For QEMU, we are working on libblkio to support both slow path (when 
QEMU block layer is needed) and fast path (vqs passed directly to the 
device). For now libblkio supports only slow path, so to test the fast 
path you can use Longpeng's patches (not yet merged upstream) with 
generic vhost-vdpa support: 
https://lore.kernel.org/qemu-devel/20220514041107.1980-1-longpeng2@huawei.com/

Steps:
  # load vDPA block in-kernel sw device module
  modprobe vdpa_sw_blk

  # load nvme module with poll_queues set if you want to use iopoll
  modprobe nvme poll_queues=15

  # instantiate a new vdpa-blk device
  vdpa dev add mgmtdev vdpasw_blk name blk0

  # set backend (/dev/nvme0n1)
  vdpa_set_backend_fd.py -b /dev/nvme0n1 blk0

  # load vhost vDPA bus ...
  modprobe vhost_vdpa

  # ... and vhost-vdpa device will appear
  ls -l /dev/vhost-vdpa-0
  crw-------. 1 root root 510, 0 Jul 28 17:06 /dev/vhost-vdpa-0

  # start QEMU patched with generic vhost-vdpa
  qemu-system-x86_64 ... \
  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-0

I haven't tested it recently, so I'm not sure it all works, but in the 
next few days I'll try. For anything else, feel free to reach me here or 
on IRC (sgarzare on #qemu).

Thanks,
Stefano



* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-07-25 20:55 [RFC patch 0/1] block: vhost-blk backend Andrey Zhadchenko via
  2022-07-25 20:55 ` [RFC PATCH 1/1] block: add " Andrey Zhadchenko via
  2022-07-26 13:51 ` [RFC patch 0/1] block: " Michael S. Tsirkin
@ 2022-10-04 18:13 ` Stefan Hajnoczi
  2022-10-05  9:14   ` Andrey Zhadchenko
  2022-10-04 18:26 ` Stefan Hajnoczi
  2022-10-04 19:00 ` Stefan Hajnoczi
  4 siblings, 1 reply; 19+ messages in thread
From: Stefan Hajnoczi @ 2022-10-04 18:13 UTC (permalink / raw)
  To: Andrey Zhadchenko; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den

On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
> Although QEMU virtio-blk is quite fast, there is still some room for
> improvement. Disk latency can be reduced if we handle virtio-blk requests
> in the host kernel so we avoid a lot of syscalls and context switches.
> 
> The biggest disadvantage of this vhost-blk flavor is that it supports only
> the raw format. Luckily Kirill Thai proposed a device mapper driver for the
> QCOW2 format to attach files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
> 
> Also, by using kernel modules we can bypass the iothread limitation and finally
> scale block requests with CPUs for high-performance devices. This is planned
> to be implemented in the next version.

Hi Andrey,
Do you have a new version of this patch series that uses multiple
threads?

I have been playing with vq-IOThread mapping in QEMU and would like to
benchmark vhost-blk vs QEMU virtio-blk mq IOThreads:
https://gitlab.com/stefanha/qemu/-/tree/virtio-blk-mq-iothread-prototype

Thanks,
Stefan


* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-07-25 20:55 [RFC patch 0/1] block: vhost-blk backend Andrey Zhadchenko via
                   ` (2 preceding siblings ...)
  2022-10-04 18:13 ` Stefan Hajnoczi
@ 2022-10-04 18:26 ` Stefan Hajnoczi
  2022-10-05 10:28   ` Andrey Zhadchenko
  2022-10-04 19:00 ` Stefan Hajnoczi
  4 siblings, 1 reply; 19+ messages in thread
From: Stefan Hajnoczi @ 2022-10-04 18:26 UTC (permalink / raw)
  To: Andrey Zhadchenko; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den

On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
> Although QEMU virtio-blk is quite fast, there is still some room for
> improvement. Disk latency can be reduced if we handle virtio-blk requests
> in the host kernel so we avoid a lot of syscalls and context switches.
> 
> The biggest disadvantage of this vhost-blk flavor is that it supports only
> the raw format. Luckily Kirill Thai proposed a device mapper driver for the
> QCOW2 format to attach files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
> 
> Also, by using kernel modules we can bypass the iothread limitation and finally
> scale block requests with CPUs for high-performance devices. This is planned
> to be implemented in the next version.
> 
> Linux kernel module part:
> https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadchenko@virtuozzo.com/
> 
> test setups and results:
> fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128

> QEMU drive options: cache=none
> filesystem: xfs

Please post the full QEMU command-line so it's clear exactly what this
is benchmarking.

A preallocated raw image file is a good baseline with:

  --object iothread,id=iothread0 \
  --blockdev file,filename=test.img,cache.direct=on,aio=native,node-name=drive0 \
  --device virtio-blk-pci,drive=drive0,iothread=iothread0

(BTW QEMU's default vq size is 256 descriptors and the number of vqs is
the number of vCPUs.)

> 
> SSD:
>                | randread, IOPS  | randwrite, IOPS |
> Host           |      95.8k	 |	85.3k	   |
> QEMU virtio    |      57.5k	 |	79.4k	   |
> QEMU vhost-blk |      95.6k	 |	84.3k	   |
> 
> RAMDISK (vq == vcpu):

With fio numjobs=vcpu here?

>                  | randread, IOPS | randwrite, IOPS |
> virtio, 1vcpu    |	123k	  |	 129k       |
> virtio, 2vcpu    |	253k (??) |	 250k (??)  |

QEMU's aio=threads (default) gets around the single IOThread. It beats
aio=native for this reason in some cases. Were you using aio=native or
aio=threads?

> virtio, 4vcpu    |	158k	  |	 154k       |
> vhost-blk, 1vcpu |	110k	  |	 113k       |
> vhost-blk, 2vcpu |	247k	  |	 252k       |


* Re: [RFC PATCH 1/1] block: add vhost-blk backend
  2022-07-25 20:55 ` [RFC PATCH 1/1] block: add " Andrey Zhadchenko via
@ 2022-10-04 18:45   ` Stefan Hajnoczi
  2022-10-05 13:06     ` Andrey Zhadchenko
  0 siblings, 1 reply; 19+ messages in thread
From: Stefan Hajnoczi @ 2022-10-04 18:45 UTC (permalink / raw)
  To: Andrey Zhadchenko; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den

On Mon, Jul 25, 2022 at 11:55:27PM +0300, Andrey Zhadchenko wrote:
> Although QEMU virtio is quite fast, there is still some room for
> improvement. Disk latency can be reduced if we handle virtio-blk requests
> in the host kernel instead of passing them to QEMU. The patch adds a vhost-blk
> backend which sets up the vhost-blk kernel module to process requests.
> 
> test setup and results:
> fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
> QEMU drive options: cache=none
> filesystem: xfs
> 
> SSD:
>                | randread, IOPS  | randwrite, IOPS |
> Host           |      95.8k	 |	85.3k	   |
> QEMU virtio    |      57.5k	 |	79.4k	   |
> QEMU vhost-blk |      95.6k	 |	84.3k	   |
> 
> RAMDISK (vq == vcpu):
>                  | randread, IOPS | randwrite, IOPS |
> virtio, 1vcpu    |	123k	  |	 129k       |
> virtio, 2vcpu    |	253k (??) |	 250k (??)  |
> virtio, 4vcpu    |	158k	  |	 154k       |
> vhost-blk, 1vcpu |	110k	  |	 113k       |
> vhost-blk, 2vcpu |	247k	  |	 252k       |
> vhost-blk, 4vcpu |	576k	  |	 567k       |
> 
> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
> ---
>  hw/block/Kconfig              |   5 +
>  hw/block/meson.build          |   4 +
>  hw/block/vhost-blk.c          | 394 ++++++++++++++++++++++++++++++++++
>  hw/virtio/meson.build         |   3 +
>  hw/virtio/vhost-blk-pci.c     | 102 +++++++++
>  include/hw/virtio/vhost-blk.h |  50 +++++
>  meson.build                   |   5 +
>  7 files changed, 563 insertions(+)
>  create mode 100644 hw/block/vhost-blk.c
>  create mode 100644 hw/virtio/vhost-blk-pci.c
>  create mode 100644 include/hw/virtio/vhost-blk.h
> 
> diff --git a/hw/block/Kconfig b/hw/block/Kconfig
> index 9e8f28f982..b4286ad10e 100644
> --- a/hw/block/Kconfig
> +++ b/hw/block/Kconfig
> @@ -36,6 +36,11 @@ config VIRTIO_BLK
>      default y
>      depends on VIRTIO
>  
> +config VHOST_BLK
> +    bool
> +    default n

Feel free to enable it by default. That way it gets more CI/build
coverage.

> +    depends on VIRTIO && LINUX
> +
>  config VHOST_USER_BLK
>      bool
>      # Only PCI devices are provided for now
> diff --git a/hw/block/meson.build b/hw/block/meson.build
> index 2389326112..caf9bedff3 100644
> --- a/hw/block/meson.build
> +++ b/hw/block/meson.build
> @@ -19,4 +19,8 @@ softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c'))
>  specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
>  specific_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk.c'))
>  
> +if have_vhost_blk
> +  specific_ss.add(files('vhost-blk.c'))
> +endif

Can this use the same add(when: 'CONFIG_VHOST_BLK', ...) syntax as the
other conditional builds above?
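
I.e., assuming a CONFIG_VHOST_BLK symbol is wired up like the others:

  specific_ss.add(when: 'CONFIG_VHOST_BLK', if_true: files('vhost-blk.c'))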

> +
>  subdir('dataplane')
> diff --git a/hw/block/vhost-blk.c b/hw/block/vhost-blk.c
> new file mode 100644
> index 0000000000..33d90af270
> --- /dev/null
> +++ b/hw/block/vhost-blk.c
> @@ -0,0 +1,394 @@
> +/*
> + * Copyright (c) 2022 Virtuozzo International GmbH.
> + * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
> + *
> + * vhost-blk is a host kernel accelerator for virtio-blk.
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qemu/error-report.h"
> +#include "qom/object.h"
> +#include "hw/qdev-core.h"
> +#include "hw/boards.h"
> +#include "hw/virtio/vhost.h"
> +#include "hw/virtio/vhost-blk.h"
> +#include "hw/virtio/virtio.h"
> +#include "hw/virtio/virtio-blk.h"
> +#include "hw/virtio/virtio-bus.h"
> +#include "hw/virtio/virtio-access.h"
> +#include "hw/virtio/virtio-pci.h"
> +#include "sysemu/sysemu.h"
> +#include "linux-headers/linux/vhost.h"
> +#include <sys/ioctl.h>
> +#include <linux/fs.h>
> +
> +static int vhost_blk_start(VirtIODevice *vdev)
> +{
> +    VHostBlk *s = VHOST_BLK(vdev);
> +    struct vhost_vring_file backend;
> +    int ret, i;
> +    int *fd = blk_bs(s->conf.conf.blk)->file->bs->opaque;

This needs a clean API so vhost-blk.c doesn't make assumptions about the
file-posix BlockDriver's internal state memory layout.
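
A minimal sketch of the kind of helper I mean (hypothetical -- neither this
function nor a bdrv_get_host_fd() BlockDriver callback exists in QEMU today,
the names are made up):

  /* let the block layer hand out the underlying host fd instead of
   * vhost-blk reaching into BDRVRawState through bs->opaque */
  static int blk_get_host_fd(BlockBackend *blk)
  {
      BlockDriverState *bs = blk_bs(blk);

      if (bs && bs->drv && bs->drv->bdrv_get_host_fd) {
          return bs->drv->bdrv_get_host_fd(bs); /* hypothetical callback */
      }
      return -ENOTSUP;
  }

file-posix would implement the callback by returning its fd; format and
filter drivers could forward it to their file child.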

> +    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
> +    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
> +
> +    if (!k->set_guest_notifiers) {
> +        error_report("vhost-blk: binding does not support guest notifiers");
> +        return -ENOSYS;
> +    }
> +
> +    if (s->vhost_started) {
> +        return 0;
> +    }
> +
> +    if (ioctl(s->vhostfd, VHOST_SET_OWNER, NULL)) {
> +        error_report("vhost-blk: unable to set owner");
> +        return -ENOSYS;
> +    }
> +
> +    ret = vhost_dev_enable_notifiers(&s->dev, vdev);
> +    if (ret < 0) {
> +        error_report("vhost-blk: unable to enable dev notifiers: %d", errno);
> +        return ret;
> +    }
> +
> +    s->dev.acked_features = vdev->guest_features & s->dev.backend_features;
> +
> +    ret = vhost_dev_start(&s->dev, vdev);
> +    if (ret < 0) {
> +        error_report("vhost-blk: unable to start vhost dev");
> +        return ret;
> +    }
> +
> +    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true);
> +    if (ret < 0) {
> +        error_report("vhost-blk: unable to bind guest notifiers");
> +        goto out;
> +    }
> +
> +    memset(&backend, 0, sizeof(backend));
> +    backend.index = 0;
> +    backend.fd = *fd;
> +    if (ioctl(s->vhostfd, VHOST_BLK_SET_BACKEND, &backend)) {
> +        error_report("vhost-blk: unable to set backend");
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    for (i = 0; i < s->dev.nvqs; i++) {
> +        vhost_virtqueue_mask(&s->dev, vdev, i, false);
> +    }
> +
> +    event_notifier_set(virtio_queue_get_host_notifier(virtio_get_queue(vdev, 0)));

Should this be done for all vqs?
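
I.e. presumably (untested sketch, same identifiers as in the patch):

  for (i = 0; i < s->dev.nvqs; i++) {
      /* kick the backend once per vq in case requests are already queued */
      event_notifier_set(virtio_queue_get_host_notifier(
                             virtio_get_queue(vdev, i)));
  }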

> +
> +    s->vhost_started = true;
> +
> +    return 0;
> +
> +out:
> +    vhost_dev_stop(&s->dev, vdev);
> +    return ret;
> +
> +}
> +
> +static void vhost_blk_stop(VirtIODevice *vdev)
> +{
> +    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
> +    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
> +    VHostBlk *s = VHOST_BLK(vdev);
> +    int ret;
> +
> +    if (!s->vhost_started) {
> +        return;
> +    }
> +
> +    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
> +    if (ret < 0) {
> +        error_report("vhost-blk: unable to unbind guest notifiers");
> +    }
> +    vhost_dev_disable_notifiers(&s->dev, vdev);
> +    vhost_dev_stop(&s->dev, vdev);
> +
> +    s->vhost_started = false;
> +}
> +
> +static void vhost_blk_reset(VirtIODevice *vdev)
> +{
> +    VHostBlk *s = VHOST_BLK(vdev);
> +    int ret;
> +
> +    vhost_blk_stop(vdev);
> +    ret = ioctl(s->vhostfd, VHOST_RESET_OWNER, NULL);
> +    if (ret && errno != EPERM) {
> +            error_report("vhost-blk: failed to reset owner %d", errno);
> +    }
> +}
> +
> +static void vhost_blk_set_status(VirtIODevice *vdev, uint8_t status)
> +{
> +    if (status & (VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)) {
> +        vhost_blk_stop(vdev);
> +        return;
> +    }
> +
> +    if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
> +        return;
> +    }
> +
> +    if (vhost_blk_start(vdev)) {
> +        error_report("vhost-blk: failed to start");
> +    }
> +}
> +
> +static void vhost_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
> +{
> +}
> +
> +static void vhost_blk_device_realize(DeviceState *dev, Error **errp)
> +{
> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> +    VHostBlk *s = VHOST_BLK(vdev);
> +    VhostBlkConf *conf = &s->conf;
> +    int i, ret;
> +
> +    if (!conf->conf.blk) {
> +        error_setg(errp, "vhost-blk: drive property not set");
> +        return;
> +    }
> +
> +    if (!blk_is_inserted(conf->conf.blk)) {
> +        error_setg(errp, "vhost-blk: device needs media, but drive is empty");
> +        return;
> +    }
> +
> +    if (conf->num_queues == VHOST_BLK_AUTO_NUM_QUEUES) {
> +        conf->num_queues = MIN(virtio_pci_optimal_num_queues(0),
> +                               VHOST_BLK_MAX_QUEUES);

Why is 8 (VHOST_BLK_MAX_QUEUES) queues the maximum?

> +    }
> +
> +    if (!conf->num_queues) {
> +        error_setg(errp, "vhost-blk: num-queues property must be larger than 0");
> +        return;
> +    }
> +
> +    if (conf->queue_size <= 2) {
> +        error_setg(errp, "vhost-blk: invalid queue-size property (%" PRIu16 "), "
> +                   "must be > 2", conf->queue_size);
> +        return;
> +    }
> +
> +    if (!is_power_of_2(conf->queue_size) ||
> +        conf->queue_size > VIRTQUEUE_MAX_SIZE) {
> +        error_setg(errp, "vhost_blk: invalid queue-size property (%" PRIu16 "), "
> +                   "must be a power of 2 (max %d)",
> +                   conf->queue_size, VIRTQUEUE_MAX_SIZE);
> +        return;
> +    }
> +
> +    if (!blkconf_apply_backend_options(&conf->conf,
> +                                       !blk_supports_write_perm(conf->conf.blk),
> +                                       true, errp)) {
> +        return;
> +    }
> +
> +    if (!blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, errp)) {
> +        return;
> +    }
> +
> +    if (!blkconf_blocksizes(&conf->conf, errp)) {
> +        return;
> +    }
> +
> +    s->dev.nvqs = conf->num_queues;
> +    s->dev.max_queues = conf->num_queues;
> +    s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs);
> +    s->dev.vq_index = 0;
> +
> +    virtio_init(vdev, VIRTIO_ID_BLOCK, sizeof(struct virtio_blk_config));
> +
> +    for (i = 0; i < conf->num_queues; i++) {
> +        virtio_add_queue(vdev, conf->queue_size, vhost_blk_handle_output);
> +    }
> +
> +    s->vhostfd = open("/dev/vhost-blk", O_RDWR);
> +    if (s->vhostfd < 0) {
> +        error_setg(errp, "vhost-blk: unable to open /dev/vhost-blk");
> +        goto cleanup;
> +    }
> +
> +    s->dev.acked_features = 0;
> +    ret = ioctl(s->vhostfd, VHOST_GET_FEATURES, &s->dev.backend_features);
> +    if (ret < 0) {
> +        error_setg(errp, "vhost-blk: unable to get backend features");
> +        goto cleanup;
> +    }
> +
> +    ret = vhost_dev_init(&s->dev, (void *)((size_t)s->vhostfd),
> +                         VHOST_BACKEND_TYPE_KERNEL, 0, false);

The last parameter should be NULL (Error **) instead of false.

> +    if (ret < 0) {
> +        error_setg(errp, "vhost-blk: vhost initialization failed: %s",
> +                strerror(-ret));
> +        goto cleanup;
> +    }
> +
> +    return;
> +
> +cleanup:
> +    g_free(s->dev.vqs);
> +    close(s->vhostfd);
> +    for (i = 0; i < conf->num_queues; i++) {
> +        virtio_del_queue(vdev, i);
> +    }
> +    virtio_cleanup(vdev);
> +    return;
> +}
> +
> +static void vhost_blk_device_unrealize(DeviceState *dev)
> +{
> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> +    VHostBlk *s = VHOST_BLK(dev);
> +
> +    vhost_blk_set_status(vdev, 0);
> +    vhost_dev_cleanup(&s->dev);
> +    g_free(s->dev.vqs);
> +    virtio_cleanup(vdev);
> +}
> +
> +static const int user_feature_bits[] = {
> +    VIRTIO_BLK_F_FLUSH,
> +    VIRTIO_RING_F_INDIRECT_DESC,
> +    VIRTIO_RING_F_EVENT_IDX,
> +    VHOST_INVALID_FEATURE_BIT
> +};
> +
> +
> +static uint64_t vhost_blk_get_features(VirtIODevice *vdev,
> +                                            uint64_t features,
> +                                            Error **errp)
> +{
> +    VHostBlk *s = VHOST_BLK(vdev);
> +    uint64_t res;
> +
> +    features |= s->host_features;
> +
> +    virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
> +    virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
> +    virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
> +    virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
> +    virtio_add_feature(&features, VIRTIO_BLK_F_SIZE_MAX);
> +
> +    virtio_add_feature(&features, VIRTIO_F_VERSION_1);
> +
> +    if (!blk_is_writable(s->conf.conf.blk)) {
> +        virtio_add_feature(&features, VIRTIO_BLK_F_RO);
> +    }
> +
> +    if (s->conf.num_queues > 1) {
> +        virtio_add_feature(&features, VIRTIO_BLK_F_MQ);
> +    }
> +
> +    res = vhost_get_features(&s->dev, user_feature_bits, features);
> +
> +    return res;
> +}
> +
> +static void vhost_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> +{
> +    VHostBlk *s = VHOST_BLK(vdev);
> +    BlockConf *conf = &s->conf.conf;
> +    struct virtio_blk_config blkcfg;
> +    uint64_t capacity;
> +    int64_t length;
> +    int blk_size = conf->logical_block_size;
> +
> +    blk_get_geometry(s->conf.conf.blk, &capacity);
> +    memset(&blkcfg, 0, sizeof(blkcfg));
> +    virtio_stq_p(vdev, &blkcfg.capacity, capacity);
> +    virtio_stl_p(vdev, &blkcfg.seg_max, s->conf.queue_size - 2);
> +    virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
> +    virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
> +    blkcfg.geometry.heads = conf->heads;
> +
> +    length = blk_getlength(s->conf.conf.blk);
> +    if (length > 0 && length / conf->heads / conf->secs % blk_size) {
> +        unsigned short mask;
> +
> +        mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
> +        blkcfg.geometry.sectors = conf->secs & ~mask;
> +    } else {
> +        blkcfg.geometry.sectors = conf->secs;
> +    }
> +
> +    blkcfg.size_max = 0;
> +    blkcfg.physical_block_exp = get_physical_block_exp(conf);
> +    blkcfg.alignment_offset = 0;
> +    virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
> +
> +    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
> +}
> +
> +static Property vhost_blk_properties[] = {
> +    DEFINE_BLOCK_PROPERTIES(VHostBlk, conf.conf),
> +    DEFINE_PROP_UINT16("num-queues", VHostBlk, conf.num_queues,
> +                       VHOST_BLK_AUTO_NUM_QUEUES),
> +    DEFINE_PROP_UINT16("queue-size", VHostBlk, conf.queue_size, 256),
> +/* Discard and write-zeroes not yet implemented in kernel module */
> +    DEFINE_PROP_BIT64("discard", VHostBlk, host_features,
> +                      VIRTIO_BLK_F_DISCARD, false),
> +    DEFINE_PROP_BIT64("write-zeroes", VHostBlk, host_features,
> +                      VIRTIO_BLK_F_WRITE_ZEROES, false),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static const VMStateDescription vmstate_vhost_blk = {
> +    .name = "vhost-blk",
> +    .minimum_version_id = 1,
> +    .version_id = 1,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_VIRTIO_DEVICE,
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
> +static void vhost_blk_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
> +
> +    device_class_set_props(dc, vhost_blk_properties);
> +    dc->vmsd = &vmstate_vhost_blk;

Does this really support live migration? Are in-flight requests drained
when pausing for live migration handover?

> +    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
> +    vdc->realize = vhost_blk_device_realize;
> +    vdc->unrealize = vhost_blk_device_unrealize;
> +    vdc->get_config = vhost_blk_update_config;
> +    vdc->get_features = vhost_blk_get_features;
> +    vdc->set_status = vhost_blk_set_status;
> +    vdc->reset = vhost_blk_reset;
> +}
> +
> +static void vhost_blk_instance_init(Object *obj)
> +{
> +    VHostBlk *s = VHOST_BLK(obj);
> +
> +    device_add_bootindex_property(obj, &s->conf.conf.bootindex,
> +                                  "bootindex", "/disk@0,0",
> +                                  DEVICE(obj));
> +}
> +
> +static const TypeInfo vhost_blk_info = {
> +    .name = TYPE_VHOST_BLK,
> +    .parent = TYPE_VIRTIO_DEVICE,
> +    .instance_size = sizeof(VHostBlk),
> +    .instance_init = vhost_blk_instance_init,
> +    .class_init = vhost_blk_class_init,
> +};
> +
> +static void virtio_register_types(void)
> +{
> +    type_register_static(&vhost_blk_info);
> +}
> +
> +type_init(virtio_register_types)
> diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
> index 7e8877fd64..fb2c0e7242 100644
> --- a/hw/virtio/meson.build
> +++ b/hw/virtio/meson.build
> @@ -40,6 +40,9 @@ virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng-
>  virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_SCSI', if_true: files('vhost-user-scsi-pci.c'))
>  virtio_pci_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-pci.c'))
>  virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-pci.c'))
> +if have_vhost_blk
> +  virtio_ss.add(files('vhost-blk-pci.c'))
> +endif
>  
>  virtio_pci_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto-pci.c'))
>  virtio_pci_ss.add(when: 'CONFIG_VIRTIO_INPUT_HOST', if_true: files('virtio-input-host-pci.c'))
> diff --git a/hw/virtio/vhost-blk-pci.c b/hw/virtio/vhost-blk-pci.c
> new file mode 100644
> index 0000000000..f3b6e112b4
> --- /dev/null
> +++ b/hw/virtio/vhost-blk-pci.c
> @@ -0,0 +1,102 @@
> +/*
> + * Copyright (c) 2022 Virtuozzo International GmbH.
> + * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
> + *
> + * vhost-blk PCI bindings
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +
> +#include "standard-headers/linux/virtio_pci.h"
> +#include "hw/virtio/virtio.h"
> +#include "hw/virtio/vhost-blk.h"
> +#include "hw/pci/pci.h"
> +#include "hw/qdev-properties.h"
> +#include "qapi/error.h"
> +#include "qemu/error-report.h"
> +#include "qemu/module.h"
> +#include "hw/virtio/virtio-pci.h"
> +#include "qom/object.h"
> +
> +typedef struct VHostBlkPCI VHostBlkPCI;
> +
> +/*
> + * vhost-blk-pci: This extends VirtioPCIProxy.
> + */
> +#define TYPE_VHOST_BLK_PCI "vhost-blk-pci-base"
> +DECLARE_INSTANCE_CHECKER(VHostBlkPCI, VHOST_BLK_PCI,
> +                         TYPE_VHOST_BLK_PCI)
> +
> +struct VHostBlkPCI {
> +    VirtIOPCIProxy parent_obj;
> +    VHostBlk vdev;
> +};
> +
> +static Property vhost_blk_pci_properties[] = {
> +    DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
> +    DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
> +                       DEV_NVECTORS_UNSPECIFIED),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void vhost_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> +{
> +    VHostBlkPCI *dev = VHOST_BLK_PCI(vpci_dev);
> +    DeviceState *vdev = DEVICE(&dev->vdev);
> +
> +    if (dev->vdev.conf.num_queues == VHOST_BLK_AUTO_NUM_QUEUES) {
> +        dev->vdev.conf.num_queues = MIN(virtio_pci_optimal_num_queues(0),
> +                                        VHOST_BLK_MAX_QUEUES);
> +    }
> +
> +    if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
> +        vpci_dev->nvectors = dev->vdev.conf.num_queues + 1;
> +    }
> +
> +    qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
> +}
> +
> +static void vhost_blk_pci_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
> +    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
> +
> +    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
> +    device_class_set_props(dc, vhost_blk_pci_properties);
> +    k->realize = vhost_blk_pci_realize;
> +    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
> +    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK;
> +    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
> +    pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
> +}
> +
> +static void vhost_blk_pci_instance_init(Object *obj)
> +{
> +    VHostBlkPCI *dev = VHOST_BLK_PCI(obj);
> +
> +    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
> +                                TYPE_VHOST_BLK);
> +    object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
> +                              "bootindex");
> +}
> +
> +static const VirtioPCIDeviceTypeInfo vhost_blk_pci_info = {
> +    .base_name               = TYPE_VHOST_BLK_PCI,
> +    .generic_name            = "vhost-blk-pci",
> +    .transitional_name       = "vhost-blk-pci-transitional",
> +    .non_transitional_name   = "vhost-blk-pci-non-transitional",
> +    .instance_size  = sizeof(VHostBlkPCI),
> +    .instance_init  = vhost_blk_pci_instance_init,
> +    .class_init     = vhost_blk_pci_class_init,
> +};
> +
> +static void vhost_blk_pci_register(void)
> +{
> +    virtio_pci_types_register(&vhost_blk_pci_info);
> +}
> +
> +type_init(vhost_blk_pci_register)
> diff --git a/include/hw/virtio/vhost-blk.h b/include/hw/virtio/vhost-blk.h
> new file mode 100644
> index 0000000000..76ece4215d
> --- /dev/null
> +++ b/include/hw/virtio/vhost-blk.h
> @@ -0,0 +1,50 @@
> +/*
> + * Copyright (c) 2022 Virtuozzo International GmbH.
> + * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
> + *
> + * vhost-blk is a host kernel accelerator for virtio-blk.
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + */
> +
> +#ifndef VHOST_BLK_H
> +#define VHOST_BLK_H
> +
> +#include "standard-headers/linux/virtio_blk.h"
> +#include "hw/block/block.h"
> +#include "hw/virtio/vhost.h"
> +#include "sysemu/block-backend.h"
> +
> +#define TYPE_VHOST_BLK "vhost-blk"
> +#define VHOST_BLK(obj) \
> +        OBJECT_CHECK(VHostBlk, (obj), TYPE_VHOST_BLK)
> +
> +#define VHOST_BLK_AUTO_NUM_QUEUES UINT16_MAX
> +#define VHOST_BLK_MAX_QUEUES 8
> +
> +/*
> + * normally this would be visible from imported headers;
> + * temporarily defined here to simplify development
> + */
> +#define VHOST_BLK_SET_BACKEND          _IOW(VHOST_VIRTIO, 0x90, \
> +                                            struct vhost_vring_file)
> +
> +typedef struct VhostBlkConf {
> +    BlockConf conf;

What is the purpose of using BlockConf but bypassing the QEMU block
layer?

If the file is a qcow2 file or there are additional block drivers like
the luks crypto driver then this doesn't work. If block jobs and other
block operations are performed on the QMP monitor then the image can be
corrupted.

> +    uint16_t num_queues;
> +    uint16_t queue_size;
> +} VhostBlkConf;
> +
> +typedef struct VHostBlk {
> +    VirtIODevice parent_obj;
> +    VhostBlkConf conf;
> +    uint64_t host_features;
> +    uint64_t decided_features;
> +    struct virtio_blk_config blkcfg;
> +    int vhostfd;
> +    struct vhost_dev dev;
> +    bool vhost_started;
> +} VHostBlk;
> +
> +#endif
> diff --git a/meson.build b/meson.build
> index 8a8c415fc1..7c00a8ce89 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -336,6 +336,9 @@ have_vhost_kernel = get_option('vhost_kernel') \
>  have_vhost_user_crypto = get_option('vhost_crypto') \
>    .require(have_vhost_user,
>             error_message: 'vhost-crypto requires vhost-user to be enabled').allowed()
> +have_vhost_blk = get_option('vhost_blk') \
> +  .require(targetos == 'linux',
> +           error_message: 'vhost-blk is only available on Linux').allowed()
>  
>  have_vhost = have_vhost_user or have_vhost_vdpa or have_vhost_kernel
>  
> @@ -1814,6 +1817,7 @@ config_host_data.set('CONFIG_VHOST_KERNEL', have_vhost_kernel)
>  config_host_data.set('CONFIG_VHOST_USER', have_vhost_user)
>  config_host_data.set('CONFIG_VHOST_CRYPTO', have_vhost_user_crypto)
>  config_host_data.set('CONFIG_VHOST_VDPA', have_vhost_vdpa)
> +config_host_data.set('CONFIG_VHOST_BLK', have_vhost_blk)
>  config_host_data.set('CONFIG_VMNET', vmnet.found())
>  config_host_data.set('CONFIG_VHOST_USER_BLK_SERVER', have_vhost_user_blk_server)
>  config_host_data.set('CONFIG_VDUSE_BLK_EXPORT', have_vduse_blk_export)
> @@ -3756,6 +3760,7 @@ summary_info += {'vhost-user support': have_vhost_user}
>  summary_info += {'vhost-user-crypto support': have_vhost_user_crypto}
>  summary_info += {'vhost-user-blk server support': have_vhost_user_blk_server}
>  summary_info += {'vhost-vdpa support': have_vhost_vdpa}
> +summary_info += {'vhost-blk support': have_vhost_blk}
>  summary_info += {'build guest agent': have_ga}
>  summary(summary_info, bool_yn: true, section: 'Configurable features')
>  
> -- 
> 2.31.1
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-07-25 20:55 [RFC patch 0/1] block: vhost-blk backend Andrey Zhadchenko via
                   ` (3 preceding siblings ...)
  2022-10-04 18:26 ` Stefan Hajnoczi
@ 2022-10-04 19:00 ` Stefan Hajnoczi
  2022-10-05 11:50   ` Andrey Zhadchenko
  4 siblings, 1 reply; 19+ messages in thread
From: Stefan Hajnoczi @ 2022-10-04 19:00 UTC (permalink / raw)
  To: Andrey Zhadchenko; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den

On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
> Although QEMU virtio-blk is quite fast, there is still some room for
> improvements. Disk latency can be reduced if we handle virtio-blk requests
> in host kernel so we avoid a lot of syscalls and context switches.
> 
> The biggest disadvantage of this vhost-blk flavor is raw format.
> Luckily Kirill Thai proposed device mapper driver for QCOW2 format to attach
> files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
> 
> Also by using kernel modules we can bypass iothread limitation and finally scale
> block requests with cpus for high-performance devices. This is planned to be
> implemented in next version.
> 
> Linux kernel module part:
> https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadchenko@virtuozzo.com/
> 
> test setups and results:
> fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
> QEMU drive options: cache=none
> filesystem: xfs
> 
> SSD:
>                | randread, IOPS  | randwrite, IOPS |
> Host           |      95.8k	 |	85.3k	   |
> QEMU virtio    |      57.5k	 |	79.4k	   |
> QEMU vhost-blk |      95.6k	 |	84.3k	   |
> 
> RAMDISK (vq == vcpu):
>                  | randread, IOPS | randwrite, IOPS |
> virtio, 1vcpu    |	123k	  |	 129k       |
> virtio, 2vcpu    |	253k (??) |	 250k (??)  |
> virtio, 4vcpu    |	158k	  |	 154k       |
> vhost-blk, 1vcpu |	110k	  |	 113k       |
> vhost-blk, 2vcpu |	247k	  |	 252k       |
> vhost-blk, 4vcpu |	576k	  |	 567k       |
> 
> Andrey Zhadchenko (1):
>   block: add vhost-blk backend
> 
>  configure                     |  13 ++
>  hw/block/Kconfig              |   5 +
>  hw/block/meson.build          |   1 +
>  hw/block/vhost-blk.c          | 395 ++++++++++++++++++++++++++++++++++
>  hw/virtio/meson.build         |   1 +
>  hw/virtio/vhost-blk-pci.c     | 102 +++++++++
>  include/hw/virtio/vhost-blk.h |  44 ++++
>  linux-headers/linux/vhost.h   |   3 +
>  8 files changed, 564 insertions(+)
>  create mode 100644 hw/block/vhost-blk.c
>  create mode 100644 hw/virtio/vhost-blk-pci.c
>  create mode 100644 include/hw/virtio/vhost-blk.h

vhost-blk has been tried several times in the past. That doesn't mean it
cannot be merged this time, but past arguments should be addressed:

- What makes it necessary to move the code into the kernel? In the past
  the performance results were not very convincing. The fastest
  implementations actually tend to be userspace NVMe PCI drivers that
  bypass the kernel! Bypassing the VFS and submitting block requests
  directly was not a huge boost. The syscall/context switch argument
  sounds okay but the numbers didn't really show that kernel block I/O
  is much faster than userspace block I/O.

  I've asked for more details on the QEMU command-line to understand
  what your numbers show. Maybe something has changed since previous
  times when vhost-blk has been tried.

  The only argument I see is QEMU's current 1 IOThread per virtio-blk
  device limitation, which is currently being worked on. If that's the
  only reason for vhost-blk then is it worth doing all the work of
  getting vhost-blk shipped (kernel, QEMU, and libvirt changes)? It
  seems like a short-term solution.

- The security impact of bugs in kernel vhost-blk code is more serious
  than bugs in a QEMU userspace process.

- The management stack needs to be changed to use vhost-blk whereas
  QEMU can be optimized without affecting other layers.

Stefan

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-10-04 18:13 ` Stefan Hajnoczi
@ 2022-10-05  9:14   ` Andrey Zhadchenko
  2022-10-05 15:18     ` Stefan Hajnoczi
  0 siblings, 1 reply; 19+ messages in thread
From: Andrey Zhadchenko @ 2022-10-05  9:14 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den

On 10/4/22 21:13, Stefan Hajnoczi wrote:
> On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
>> Although QEMU virtio-blk is quite fast, there is still some room for
>> improvements. Disk latency can be reduced if we handle virtio-blk requests
>> in host kernel so we avoid a lot of syscalls and context switches.
>>
>> The biggest disadvantage of this vhost-blk flavor is raw format.
>> Luckily Kirill Thai proposed device mapper driver for QCOW2 format to attach
>> files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
>>
>> Also by using kernel modules we can bypass iothread limitation and finally scale
>> block requests with cpus for high-performance devices. This is planned to be
>> implemented in next version.
> 
> Hi Andrey,
> Do you have a new version of this patch series that uses multiple
> threads?
> 
> I have been playing with vq-IOThread mapping in QEMU and would like to
> benchmark vhost-blk vs QEMU virtio-blk mq IOThreads:
> https://gitlab.com/stefanha/qemu/-/tree/virtio-blk-mq-iothread-prototype
> 
> Thanks,
> Stefan

Hi Stefan
For now my multi-threaded version is only available for the Red Hat 9
5.14.0 kernel. If you really want you can grab it from here:
https://lists.openvz.org/pipermail/devel/2022-September/079951.html (kernel)
For the QEMU part, all you need is to add something like this to vhost_blk_start:

#define VHOST_SET_NWORKERS _IOW(VHOST_VIRTIO, 0x1F, int)
ioctl(s->vhostfd, VHOST_SET_NWORKERS, &nworkers);
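
For completeness, here is roughly how it fits in, with error handling. This
is a sketch only: VHOST_SET_NWORKERS is a downstream ioctl number, and
nworkers is an int I assume comes from a device property.

    int nworkers = s->conf.num_threads; /* hypothetical property */

    if (ioctl(s->vhostfd, VHOST_SET_NWORKERS, &nworkers) < 0) {
        error_report("vhost-blk: unable to set worker threads: %d", errno);
        return -errno;
    }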

Or you can wait a bit. I should be able to send the second version by the
end of the week (Monday in the worst case).

Thanks,
Andrey


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-10-04 18:26 ` Stefan Hajnoczi
@ 2022-10-05 10:28   ` Andrey Zhadchenko
  2022-10-05 15:30     ` Stefan Hajnoczi
  0 siblings, 1 reply; 19+ messages in thread
From: Andrey Zhadchenko @ 2022-10-05 10:28 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den



On 10/4/22 21:26, Stefan Hajnoczi wrote:
> On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
>> Although QEMU virtio-blk is quite fast, there is still some room for
>> improvements. Disk latency can be reduced if we handle virtio-blk requests
>> in host kernel so we avoid a lot of syscalls and context switches.
>>
>> The biggest disadvantage of this vhost-blk flavor is raw format.
>> Luckily Kirill Thai proposed device mapper driver for QCOW2 format to attach
>> files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
>>
>> Also by using kernel modules we can bypass iothread limitation and finally scale
>> block requests with cpus for high-performance devices. This is planned to be
>> implemented in next version.
>>
>> Linux kernel module part:
>> https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadchenko@virtuozzo.com/
>>
>> test setups and results:
>> fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
> 
>> QEMU drive options: cache=none
>> filesystem: xfs
> 
> Please post the full QEMU command-line so it's clear exactly what this
> is benchmarking.

The full command for vhost is this:
qemu-system-x86_64 \
-kernel bzImage -nographic -append "console=ttyS0 root=/dev/sdb rw 
systemd.unified_cgroup_hierarchy=0 nokaslr" \
-m 1024 -s --enable-kvm -smp $2 \
-drive id=main_drive,file=debian_sid.img,media=disk,format=raw \
-drive id=vhost_drive,file=$1,media=disk,format=raw,if=none \
-device vhost-blk-pci,drive=vhost_drive,num-threads=$3

(the num-threads option for vhost-blk-pci was not used)

For virtio I used this:
qemu-system-x86_64 \
-kernel bzImage -nographic -append "console=ttyS0 root=/dev/sdb rw 
systemd.unified_cgroup_hierarchy=0 nokaslr" \
-m 1024 -s --enable-kvm -smp $2 \
-drive file=debian_sid.img,media=disk \
-drive file=$1,media=disk,if=virtio,cache=none,if=none,id=d1,aio=threads\
-device virtio-blk-pci,drive=d1

> 
> A preallocated raw image file is a good baseline with:
> 
>    --object iothread,id=iothread0 \
>    --blockdev file,filename=test.img,cache.direct=on,aio=native,node-name=drive0 \
>    --device virtio-blk-pci,drive=drive0,iothread=iothread0

The image I used was a preallocated qcow2 image set up with dm-qcow2,
because this vhost-blk version directly uses the bio interface and cannot
work with regular files.
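
For reference, the attach step goes through device-mapper; the sketch below
is only illustrative, since the target name and table parameters are my
placeholders and the dm-qcow2 patchset documents the real table format:

  # hypothetical table layout, sized in 512-byte sectors
  dmsetup create vhost_disk --table "0 <num_sectors> qcow2 /path/to/image.qcow2"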

> 
> (BTW QEMU's default vq size is 256 descriptors and the number of vqs is
> the number of vCPUs.)
> 
>>
>> SSD:
>>                 | randread, IOPS  | randwrite, IOPS |
>> Host           |      95.8k	 |	85.3k	   |
>> QEMU virtio    |      57.5k	 |	79.4k	   |

Adding iothread0 and using a raw file instead of the qcow2 + dm-qcow2
setup brings the numbers to
                   |      60.4k   |      84.3k      |

>> QEMU vhost-blk |      95.6k	 |	84.3k	   |
>>
>> RAMDISK (vq == vcpu):
> 
> With fio numjobs=vcpu here?

Yes

> 
>>                   | randread, IOPS | randwrite, IOPS |
>> virtio, 1vcpu    |	123k	  |	 129k       |
>> virtio, 2vcpu    |	253k (??) |	 250k (??)  |
> 
> QEMU's aio=threads (default) gets around the single IOThread. It beats
> aio=native for this reason in some cases. Were you using aio=native or
> aio=threads?

At some point I started specifying aio=threads (before that I did not use
this option). I am not sure exactly when. I will re-measure all cases for
the next submission.

> 
>> virtio, 4vcpu    |	158k	  |	 154k       |
>> vhost-blk, 1vcpu |	110k	  |	 113k       |
>> vhost-blk, 2vcpu |	247k	  |	 252k       |


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-10-04 19:00 ` Stefan Hajnoczi
@ 2022-10-05 11:50   ` Andrey Zhadchenko
  2022-10-05 15:40     ` Stefan Hajnoczi
  0 siblings, 1 reply; 19+ messages in thread
From: Andrey Zhadchenko @ 2022-10-05 11:50 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den



On 10/4/22 22:00, Stefan Hajnoczi wrote:
> On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
>> Although QEMU virtio-blk is quite fast, there is still some room for
>> improvements. Disk latency can be reduced if we handle virtio-blk requests
>> in host kernel so we avoid a lot of syscalls and context switches.
>>
>> The biggest disadvantage of this vhost-blk flavor is raw format.
>> Luckily Kirill Thai proposed device mapper driver for QCOW2 format to attach
>> files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
>>
>> Also by using kernel modules we can bypass iothread limitation and finally scale
>> block requests with cpus for high-performance devices. This is planned to be
>> implemented in next version.
>>
>> Linux kernel module part:
>> https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadchenko@virtuozzo.com/
>>
>> test setups and results:
>> fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
>> QEMU drive options: cache=none
>> filesystem: xfs
>>
>> SSD:
>>                 | randread, IOPS  | randwrite, IOPS |
>> Host           |      95.8k	 |	85.3k	   |
>> QEMU virtio    |      57.5k	 |	79.4k	   |
>> QEMU vhost-blk |      95.6k	 |	84.3k	   |
>>
>> RAMDISK (vq == vcpu):
>>                   | randread, IOPS | randwrite, IOPS |
>> virtio, 1vcpu    |	123k	  |	 129k       |
>> virtio, 2vcpu    |	253k (??) |	 250k (??)  |
>> virtio, 4vcpu    |	158k	  |	 154k       |
>> vhost-blk, 1vcpu |	110k	  |	 113k       |
>> vhost-blk, 2vcpu |	247k	  |	 252k       |
>> vhost-blk, 4vcpu |	576k	  |	 567k       |
>>
>> Andrey Zhadchenko (1):
>>    block: add vhost-blk backend
>>
>>   configure                     |  13 ++
>>   hw/block/Kconfig              |   5 +
>>   hw/block/meson.build          |   1 +
>>   hw/block/vhost-blk.c          | 395 ++++++++++++++++++++++++++++++++++
>>   hw/virtio/meson.build         |   1 +
>>   hw/virtio/vhost-blk-pci.c     | 102 +++++++++
>>   include/hw/virtio/vhost-blk.h |  44 ++++
>>   linux-headers/linux/vhost.h   |   3 +
>>   8 files changed, 564 insertions(+)
>>   create mode 100644 hw/block/vhost-blk.c
>>   create mode 100644 hw/virtio/vhost-blk-pci.c
>>   create mode 100644 include/hw/virtio/vhost-blk.h
> 
> vhost-blk has been tried several times in the past. That doesn't mean it
> cannot be merged this time, but past arguments should be addressed:
> 
> - What makes it necessary to move the code into the kernel? In the past
>    the performance results were not very convincing. The fastest
>    implementations actually tend to be userspace NVMe PCI drivers that
>    bypass the kernel! Bypassing the VFS and submitting block requests
>    directly was not a huge boost. The syscall/context switch argument
>    sounds okay but the numbers didn't really show that kernel block I/O
>    is much faster than userspace block I/O.
> 
>    I've asked for more details on the QEMU command-line to understand
>    what your numbers show. Maybe something has changed since previous
>    times when vhost-blk has been tried.
> 
>    The only argument I see is QEMU's current 1 IOThread per virtio-blk
>    device limitation, which is currently being worked on. If that's the
>    only reason for vhost-blk then is it worth doing all the work of
>    getting vhost-blk shipped (kernel, QEMU, and libvirt changes)? It
>    seems like a short-term solution.
> 
> - The security impact of bugs in kernel vhost-blk code is more serious
>    than bugs in a QEMU userspace process.
> 
> - The management stack needs to be changed to use vhost-blk whereas
>    QEMU can be optimized without affecting other layers.
> 
> Stefan

Indeed there were several vhost-blk attempts, but from what I found in 
the mailing lists only Asias' attempt got some attention and discussion. 
The ramdisk performance results were great, but a ramdisk is more of a 
benchmark than a real use case. I didn't find out why Asias dropped his 
version, beyond a vague "he concluded the performance results were not 
worth it". Storage speed is very important for vhost-blk performance, as 
there is no point in cutting cpu costs from 1ms to 0.1ms if the request 
needs 50ms to complete on the actual disk. I think that 10 years ago NVMe 
was non-existent, and SATA SSDs were probably a lot faster than HDDs but 
still not fast enough to take advantage of this technology.

The tests I did gave me 60k IOPS randread for the VM and 95k for the host. And
vhost-blk is able to close the gap even using only 1 thread/vq/vcpu. And 
unlike the current QEMU single IOThread, it can be easily scaled with the 
number of cpus/vcpus. For sure this can be solved by lifting the IOThread 
limitations, but that will probably be an even more disruptive amount of 
changes (and adding vhost-blk won't break old setups!).

Probably the only undisputed advantage of vhost-blk is the syscall 
reduction. And again the benefit really depends on storage speed, as 
the saving has to be comparable with the syscall time. I must also note 
that this may be good for high-density servers with a lot of VMs. But 
for now I do not have exact numbers showing how much time we really 
win per request on average.

Overall, vhost-blk will only become more attractive as storage speeds 
increase.

I must also note that all the arguments above apply to vdpa-blk as well. 
And unlike vhost-blk, which needs its own QEMU code, vdpa-blk can be set 
up with the generic virtio-vdpa QEMU code (I am not sure if it is merged 
yet, but still). Although vdpa-blk has its own problems for now.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH 1/1] block: add vhost-blk backend
  2022-10-04 18:45   ` Stefan Hajnoczi
@ 2022-10-05 13:06     ` Andrey Zhadchenko
  2022-10-05 15:50       ` Stefan Hajnoczi
  0 siblings, 1 reply; 19+ messages in thread
From: Andrey Zhadchenko @ 2022-10-05 13:06 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den

Thanks for the review!

On 10/4/22 21:45, Stefan Hajnoczi wrote:
> On Mon, Jul 25, 2022 at 11:55:27PM +0300, Andrey Zhadchenko wrote:
>> Although QEMU virtio is quite fast, there is still some room for
>> improvements. Disk latency can be reduced if we handle virtio-blk requests
>> in host kernel instead of passing them to QEMU. The patch adds vhost-blk
>> backend which sets up vhost-blk kernel module to process requests.
>>
>> test setup and results:
>> fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
>> QEMU drive options: cache=none
>> filesystem: xfs
>>
>> SSD:
>>                 | randread, IOPS  | randwrite, IOPS |
>> Host           |      95.8k	 |	85.3k	   |
>> QEMU virtio    |      57.5k	 |	79.4k	   |
>> QEMU vhost-blk |      95.6k	 |	84.3k	   |
>>
>> RAMDISK (vq == vcpu):
>>                   | randread, IOPS | randwrite, IOPS |
>> virtio, 1vcpu    |	123k	  |	 129k       |
>> virtio, 2vcpu    |	253k (??) |	 250k (??)  |
>> virtio, 4vcpu    |	158k	  |	 154k       |
>> vhost-blk, 1vcpu |	110k	  |	 113k       |
>> vhost-blk, 2vcpu |	247k	  |	 252k       |
>> vhost-blk, 4vcpu |	576k	  |	 567k       |
>>
>> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
>> ---
>>   hw/block/Kconfig              |   5 +
>>   hw/block/meson.build          |   4 +
>>   hw/block/vhost-blk.c          | 394 ++++++++++++++++++++++++++++++++++
>>   hw/virtio/meson.build         |   3 +
>>   hw/virtio/vhost-blk-pci.c     | 102 +++++++++
>>   include/hw/virtio/vhost-blk.h |  50 +++++
>>   meson.build                   |   5 +
>>   7 files changed, 563 insertions(+)
>>   create mode 100644 hw/block/vhost-blk.c
>>   create mode 100644 hw/virtio/vhost-blk-pci.c
>>   create mode 100644 include/hw/virtio/vhost-blk.h
>>
>> diff --git a/hw/block/Kconfig b/hw/block/Kconfig
>> index 9e8f28f982..b4286ad10e 100644
>> --- a/hw/block/Kconfig
>> +++ b/hw/block/Kconfig
>> @@ -36,6 +36,11 @@ config VIRTIO_BLK
>>       default y
>>       depends on VIRTIO
>>   
>> +config VHOST_BLK
>> +    bool
>> +    default n
> 
> Feel free to enable it by default. That way it gets more CI/build
> coverage.
> 
>> +    depends on VIRTIO && LINUX
>> +
>>   config VHOST_USER_BLK
>>       bool
>>       # Only PCI devices are provided for now
>> diff --git a/hw/block/meson.build b/hw/block/meson.build
>> index 2389326112..caf9bedff3 100644
>> --- a/hw/block/meson.build
>> +++ b/hw/block/meson.build
>> @@ -19,4 +19,8 @@ softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c'))
>>   specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
>>   specific_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk.c'))
>>   
>> +if have_vhost_blk
>> +  specific_ss.add(files('vhost-blk.c'))
>> +endif
> 
> Can this use the same add(when: 'CONFIG_VHOST_BLK', ...) syntax as the
> other conditional builds above?
I tried, but it didn't go well for me :)
I will check this again.
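
For reference, matching the neighbouring entries it would presumably look
like this (untested):

specific_ss.add(when: 'CONFIG_VHOST_BLK', if_true: files('vhost-blk.c'))

and the same pattern for vhost-blk-pci.c on the hw/virtio side.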

> 
>> +
>>   subdir('dataplane')
>> diff --git a/hw/block/vhost-blk.c b/hw/block/vhost-blk.c
>> new file mode 100644
>> index 0000000000..33d90af270
>> --- /dev/null
>> +++ b/hw/block/vhost-blk.c
>> @@ -0,0 +1,394 @@
>> +/*
>> + * Copyright (c) 2022 Virtuozzo International GmbH.
>> + * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
>> + *
>> + * vhost-blk is a host kernel accelerator for virtio-blk.
>> + *
>> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
>> + * See the COPYING.LIB file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qapi/error.h"
>> +#include "qemu/error-report.h"
>> +#include "qom/object.h"
>> +#include "hw/qdev-core.h"
>> +#include "hw/boards.h"
>> +#include "hw/virtio/vhost.h"
>> +#include "hw/virtio/vhost-blk.h"
>> +#include "hw/virtio/virtio.h"
>> +#include "hw/virtio/virtio-blk.h"
>> +#include "hw/virtio/virtio-bus.h"
>> +#include "hw/virtio/virtio-access.h"
>> +#include "hw/virtio/virtio-pci.h"
>> +#include "sysemu/sysemu.h"
>> +#include "linux-headers/linux/vhost.h"
>> +#include <sys/ioctl.h>
>> +#include <linux/fs.h>
>> +
>> +static int vhost_blk_start(VirtIODevice *vdev)
>> +{
>> +    VHostBlk *s = VHOST_BLK(vdev);
>> +    struct vhost_vring_file backend;
>> +    int ret, i;
>> +    int *fd = blk_bs(s->conf.conf.blk)->file->bs->opaque;
> 
> This needs a clean API so vhost-blk.c doesn't make assumptions about the
> file-posix BlockDriver's internal state memory layout.
I thought it did have some API, but I didn't manage to find it. Does it 
exist?
Probably I could just open the file by name? But I didn't really want to 
manage flags like read-only. And if the BlockDriver already opens this fd, 
why bother with another copy?
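
For illustration, the kind of accessor I have in mind would look roughly
like this; raw_get_fd() is a made-up name, nothing like it exists upstream
today, so file-posix would have to export it:

/* hypothetical helper in block/file-posix.c */
int raw_get_fd(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;

    return s->fd;
}

Then vhost-blk.c could call that instead of poking into the opaque state.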

> 
>> +    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
>> +    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
>> +
>> +    if (!k->set_guest_notifiers) {
>> +        error_report("vhost-blk: binding does not support guest notifiers");
>> +        return -ENOSYS;
>> +    }
>> +
>> +    if (s->vhost_started) {
>> +        return 0;
>> +    }
>> +
>> +    if (ioctl(s->vhostfd, VHOST_SET_OWNER, NULL)) {
>> +        error_report("vhost-blk: unable to set owner");
>> +        return -ENOSYS;
>> +    }
>> +
>> +    ret = vhost_dev_enable_notifiers(&s->dev, vdev);
>> +    if (ret < 0) {
>> +        error_report("vhost-blk: unable to enable dev notifiers", errno);
>> +        return ret;
>> +    }
>> +
>> +    s->dev.acked_features = vdev->guest_features & s->dev.backend_features;
>> +
>> +    ret = vhost_dev_start(&s->dev, vdev);
>> +    if (ret < 0) {
>> +        error_report("vhost-blk: unable to start vhost dev");
>> +        return ret;
>> +    }
>> +
>> +    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true);
>> +    if (ret < 0) {
>> +        error_report("vhost-blk: unable to bind guest notifiers");
>> +        goto out;
>> +    }
>> +
>> +    memset(&backend, 0, sizeof(backend));
>> +    backend.index = 0;
>> +    backend.fd = *fd;
>> +    if (ioctl(s->vhostfd, VHOST_BLK_SET_BACKEND, &backend)) {
>> +        error_report("vhost-blk: unable to set backend");
>> +        ret = -errno;
>> +        goto out;
>> +    }
>> +
>> +    for (i = 0; i < s->dev.nvqs; i++) {
>> +        vhost_virtqueue_mask(&s->dev, vdev, i, false);
>> +    }
>> +
>> +    event_notifier_set(virtio_queue_get_host_notifier(virtio_get_queue(vdev, 0)));
> 
> Should this be done for all vqs?
> 
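
Probably yes: each virtqueue's host notifier likely needs a kick after the
switchover, not just vq 0. An untested sketch using the same helpers the
patch already calls:

    for (i = 0; i < s->dev.nvqs; i++) {
        event_notifier_set(
            virtio_queue_get_host_notifier(virtio_get_queue(vdev, i)));
    }

I will double-check this for the next version.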
>> +
>> +    s->vhost_started = true;
>> +
>> +    return 0;
>> +
>> +out:
>> +    vhost_dev_stop(&s->dev, vdev);
>> +    return ret;
>> +
>> +}
>> +
>> +static void vhost_blk_stop(VirtIODevice *vdev)
>> +{
>> +    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
>> +    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
>> +    VHostBlk *s = VHOST_BLK(vdev);
>> +    int ret;
>> +
>> +    if (!s->vhost_started) {
>> +        return;
>> +    }
>> +
>> +    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
>> +    if (ret < 0) {
>> +        error_report("vhost-blk: unable to unbind guest notifiers");
>> +    }
>> +    vhost_dev_disable_notifiers(&s->dev, vdev);
>> +    vhost_dev_stop(&s->dev, vdev);
>> +
>> +    s->vhost_started = false;
>> +}
>> +
>> +static void vhost_blk_reset(VirtIODevice *vdev)
>> +{
>> +    VHostBlk *s = VHOST_BLK(vdev);
>> +    int ret;
>> +
>> +    vhost_blk_stop(vdev);
>> +    ret = ioctl(s->vhostfd, VHOST_RESET_OWNER, NULL);
>> +    if (ret && errno != EPERM) {
>> +        error_report("vhost-blk: failed to reset owner %d", errno);
>> +    }
>> +}
>> +
>> +static void vhost_blk_set_status(VirtIODevice *vdev, uint8_t status)
>> +{
>> +    if (status & (VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)) {
>> +        vhost_blk_stop(vdev);
>> +        return;
>> +    }
>> +
>> +    if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
>> +        return;
>> +    }
>> +
>> +    if (vhost_blk_start(vdev)) {
>> +        error_report("vhost-blk: failed to start");
>> +    }
>> +}
>> +
>> +static void vhost_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
>> +{
>> +}
>> +
>> +static void vhost_blk_device_realize(DeviceState *dev, Error **errp)
>> +{
>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>> +    VHostBlk *s = VHOST_BLK(vdev);
>> +    VhostBlkConf *conf = &s->conf;
>> +    int i, ret;
>> +
>> +    if (!conf->conf.blk) {
>> +        error_setg(errp, "vhost-blk: drive property not set");
>> +        return;
>> +    }
>> +
>> +    if (!blk_is_inserted(conf->conf.blk)) {
>> +        error_setg(errp, "vhost-blk: device needs media, but drive is empty");
>> +        return;
>> +    }
>> +
>> +    if (conf->num_queues == VHOST_BLK_AUTO_NUM_QUEUES) {
>> +        conf->num_queues = MIN(virtio_pci_optimal_num_queues(0),
>> +                               VHOST_BLK_MAX_QUEUES);
> 
> Why is 8 (VHOST_BLK_MAX_QUEUES) queues the maximum?

An arbitrary number for the RFC patch to simplify memory management in 
the kernel. I will redo it if we get this series past RFC.

> 
>> +    }
>> +
>> +    if (!conf->num_queues) {
>> +        error_setg(errp, "vhost-blk: num-queues property must be larger than 0");
>> +        return;
>> +    }
>> +
>> +    if (conf->queue_size <= 2) {
>> +        error_setg(errp, "vhost-blk: invalid queue-size property (%" PRIu16 "), "
>> +                   "must be > 2", conf->queue_size);
>> +        return;
>> +    }
>> +
>> +    if (!is_power_of_2(conf->queue_size) ||
>> +        conf->queue_size > VIRTQUEUE_MAX_SIZE) {
>> +        error_setg(errp, "vhost_blk: invalid queue-size property (%" PRIu16 "), "
>> +                   "must be a power of 2 (max %d)",
>> +                   conf->queue_size, VIRTQUEUE_MAX_SIZE);
>> +        return;
>> +    }
>> +
>> +    if (!blkconf_apply_backend_options(&conf->conf,
>> +                                       !blk_supports_write_perm(conf->conf.blk),
>> +                                       true, errp)) {
>> +        return;
>> +    }
>> +
>> +    if (!blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, errp)) {
>> +        return;
>> +    }
>> +
>> +    if (!blkconf_blocksizes(&conf->conf, errp)) {
>> +        return;
>> +    }
>> +
>> +    s->dev.nvqs = conf->num_queues;
>> +    s->dev.max_queues = conf->num_queues;
>> +    s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs);
>> +    s->dev.vq_index = 0;
>> +
>> +    virtio_init(vdev, VIRTIO_ID_BLOCK, sizeof(struct virtio_blk_config));
>> +
>> +    for (i = 0; i < conf->num_queues; i++) {
>> +        virtio_add_queue(vdev, conf->queue_size, vhost_blk_handle_output);
>> +    }
>> +
>> +    s->vhostfd = open("/dev/vhost-blk", O_RDWR);
>> +    if (s->vhostfd < 0) {
>> +        error_setg(errp, "vhost-blk: unable to open /dev/vhost-blk");
>> +        goto cleanup;
>> +    }
>> +
>> +    s->dev.acked_features = 0;
>> +    ret = ioctl(s->vhostfd, VHOST_GET_FEATURES, &s->dev.backend_features);
>> +    if (ret < 0) {
>> +        error_setg(errp, "vhost-blk: unable to get backend features");
>> +        goto cleanup;
>> +    }
>> +
>> +    ret = vhost_dev_init(&s->dev, (void *)((size_t)s->vhostfd),
>> +                         VHOST_BACKEND_TYPE_KERNEL, 0, false);
> 
> The last parameter should be NULL (Error **) instead of false.
> 
>> +    if (ret < 0) {
>> +        error_setg(errp, "vhost-blk: vhost initialization failed: %s",
>> +                strerror(-ret));
>> +        goto cleanup;
>> +    }
>> +
>> +    return;
>> +
>> +cleanup:
>> +    g_free(s->dev.vqs);
>> +    close(s->vhostfd);
>> +    for (i = 0; i < conf->num_queues; i++) {
>> +        virtio_del_queue(vdev, i);
>> +    }
>> +    virtio_cleanup(vdev);
>> +    return;
>> +}
>> +
>> +static void vhost_blk_device_unrealize(DeviceState *dev)
>> +{
>> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>> +    VHostBlk *s = VHOST_BLK(dev);
>> +
>> +    vhost_blk_set_status(vdev, 0);
>> +    vhost_dev_cleanup(&s->dev);
>> +    g_free(s->dev.vqs);
>> +    virtio_cleanup(vdev);
>> +}
>> +
>> +static const int user_feature_bits[] = {
>> +    VIRTIO_BLK_F_FLUSH,
>> +    VIRTIO_RING_F_INDIRECT_DESC,
>> +    VIRTIO_RING_F_EVENT_IDX,
>> +    VHOST_INVALID_FEATURE_BIT
>> +};
>> +
>> +
>> +static uint64_t vhost_blk_get_features(VirtIODevice *vdev,
>> +                                            uint64_t features,
>> +                                            Error **errp)
>> +{
>> +    VHostBlk *s = VHOST_BLK(vdev);
>> +    uint64_t res;
>> +
>> +    features |= s->host_features;
>> +
>> +    virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
>> +    virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
>> +    virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
>> +    virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
>> +    virtio_add_feature(&features, VIRTIO_BLK_F_SIZE_MAX);
>> +
>> +    virtio_add_feature(&features, VIRTIO_F_VERSION_1);
>> +
>> +    if (!blk_is_writable(s->conf.conf.blk)) {
>> +        virtio_add_feature(&features, VIRTIO_BLK_F_RO);
>> +    }
>> +
>> +    if (s->conf.num_queues > 1) {
>> +        virtio_add_feature(&features, VIRTIO_BLK_F_MQ);
>> +    }
>> +
>> +    res = vhost_get_features(&s->dev, user_feature_bits, features);
>> +
>> +    return res;
>> +}
>> +
>> +static void vhost_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>> +{
>> +    VHostBlk *s = VHOST_BLK(vdev);
>> +    BlockConf *conf = &s->conf.conf;
>> +    struct virtio_blk_config blkcfg;
>> +    uint64_t capacity;
>> +    int64_t length;
>> +    int blk_size = conf->logical_block_size;
>> +
>> +    blk_get_geometry(s->conf.conf.blk, &capacity);
>> +    memset(&blkcfg, 0, sizeof(blkcfg));
>> +    virtio_stq_p(vdev, &blkcfg.capacity, capacity);
>> +    virtio_stl_p(vdev, &blkcfg.seg_max, s->conf.queue_size - 2);
>> +    virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
>> +    virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
>> +    blkcfg.geometry.heads = conf->heads;
>> +
>> +    length = blk_getlength(s->conf.conf.blk);
>> +    if (length > 0 && length / conf->heads / conf->secs % blk_size) {
>> +        unsigned short mask;
>> +
>> +        mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
>> +        blkcfg.geometry.sectors = conf->secs & ~mask;
>> +    } else {
>> +        blkcfg.geometry.sectors = conf->secs;
>> +    }
>> +
>> +    blkcfg.size_max = 0;
>> +    blkcfg.physical_block_exp = get_physical_block_exp(conf);
>> +    blkcfg.alignment_offset = 0;
>> +    virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
>> +
>> +    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
>> +}
>> +
>> +static Property vhost_blk_properties[] = {
>> +    DEFINE_BLOCK_PROPERTIES(VHostBlk, conf.conf),
>> +    DEFINE_PROP_UINT16("num-queues", VHostBlk, conf.num_queues,
>> +                       VHOST_BLK_AUTO_NUM_QUEUES),
>> +    DEFINE_PROP_UINT16("queue-size", VHostBlk, conf.queue_size, 256),
>> +/* Discard and write-zeroes not yet implemented in kernel module */
>> +    DEFINE_PROP_BIT64("discard", VHostBlk, host_features,
>> +                      VIRTIO_BLK_F_DISCARD, false),
>> +    DEFINE_PROP_BIT64("write-zeroes", VHostBlk, host_features,
>> +                      VIRTIO_BLK_F_WRITE_ZEROES, false),
>> +    DEFINE_PROP_END_OF_LIST(),
>> +};
>> +
>> +static const VMStateDescription vmstate_vhost_blk = {
>> +    .name = "vhost-blk",
>> +    .minimum_version_id = 1,
>> +    .version_id = 1,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_VIRTIO_DEVICE,
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>> +static void vhost_blk_class_init(ObjectClass *klass, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
>> +
>> +    device_class_set_props(dc, vhost_blk_properties);
>> +    dc->vmsd = &vmstate_vhost_blk;
> 
> Does this really support live migration? Are in-flight requests drained
> when pausing for live migration handover?

Not yet, but it doesn't look hard to implement. Resetting the device will 
drain all requests.
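
A minimal sketch of the idea (the hook name and wiring are my assumptions;
VMStateDescription does have a .pre_save callback for this):

/* drain in-flight requests before the migration handover */
static int vhost_blk_pre_save(void *opaque)
{
    VHostBlk *s = opaque;

    vhost_blk_stop(VIRTIO_DEVICE(s));
    return 0;
}

plus .pre_save = vhost_blk_pre_save in vmstate_vhost_blk.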

> 
>> +    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
>> +    vdc->realize = vhost_blk_device_realize;
>> +    vdc->unrealize = vhost_blk_device_unrealize;
>> +    vdc->get_config = vhost_blk_update_config;
>> +    vdc->get_features = vhost_blk_get_features;
>> +    vdc->set_status = vhost_blk_set_status;
>> +    vdc->reset = vhost_blk_reset;
>> +}
>> +
>> +static void vhost_blk_instance_init(Object *obj)
>> +{
>> +    VHostBlk *s = VHOST_BLK(obj);
>> +
>> +    device_add_bootindex_property(obj, &s->conf.conf.bootindex,
>> +                                  "bootindex", "/disk@0,0",
>> +                                  DEVICE(obj));
>> +}
>> +
>> +static const TypeInfo vhost_blk_info = {
>> +    .name = TYPE_VHOST_BLK,
>> +    .parent = TYPE_VIRTIO_DEVICE,
>> +    .instance_size = sizeof(VHostBlk),
>> +    .instance_init = vhost_blk_instance_init,
>> +    .class_init = vhost_blk_class_init,
>> +};
>> +
>> +static void virtio_register_types(void)
>> +{
>> +    type_register_static(&vhost_blk_info);
>> +}
>> +
>> +type_init(virtio_register_types)
>> diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
>> index 7e8877fd64..fb2c0e7242 100644
>> --- a/hw/virtio/meson.build
>> +++ b/hw/virtio/meson.build
>> @@ -40,6 +40,9 @@ virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng-
>>   virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_SCSI', if_true: files('vhost-user-scsi-pci.c'))
>>   virtio_pci_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-pci.c'))
>>   virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-pci.c'))
>> +if have_vhost_blk
>> +  virtio_ss.add(files('vhost-blk-pci.c'))
>> +endif
>>   
>>   virtio_pci_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto-pci.c'))
>>   virtio_pci_ss.add(when: 'CONFIG_VIRTIO_INPUT_HOST', if_true: files('virtio-input-host-pci.c'))
>> diff --git a/hw/virtio/vhost-blk-pci.c b/hw/virtio/vhost-blk-pci.c
>> new file mode 100644
>> index 0000000000..f3b6e112b4
>> --- /dev/null
>> +++ b/hw/virtio/vhost-blk-pci.c
>> @@ -0,0 +1,102 @@
>> +/*
>> + * Copyright (c) 2022 Virtuozzo International GmbH.
>> + * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
>> + *
>> + * vhost-blk PCI bindings
>> + *
>> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
>> + * See the COPYING.LIB file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +
>> +#include "standard-headers/linux/virtio_pci.h"
>> +#include "hw/virtio/virtio.h"
>> +#include "hw/virtio/vhost-blk.h"
>> +#include "hw/pci/pci.h"
>> +#include "hw/qdev-properties.h"
>> +#include "qapi/error.h"
>> +#include "qemu/error-report.h"
>> +#include "qemu/module.h"
>> +#include "hw/virtio/virtio-pci.h"
>> +#include "qom/object.h"
>> +
>> +typedef struct VHostBlkPCI VHostBlkPCI;
>> +
>> +/*
>> + * vhost-blk-pci: This extends VirtioPCIProxy.
>> + */
>> +#define TYPE_VHOST_BLK_PCI "vhost-blk-pci-base"
>> +DECLARE_INSTANCE_CHECKER(VHostBlkPCI, VHOST_BLK_PCI,
>> +                         TYPE_VHOST_BLK_PCI)
>> +
>> +struct VHostBlkPCI {
>> +    VirtIOPCIProxy parent_obj;
>> +    VHostBlk vdev;
>> +};
>> +
>> +static Property vhost_blk_pci_properties[] = {
>> +    DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
>> +    DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
>> +                       DEV_NVECTORS_UNSPECIFIED),
>> +    DEFINE_PROP_END_OF_LIST(),
>> +};
>> +
>> +static void vhost_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
>> +{
>> +    VHostBlkPCI *dev = VHOST_BLK_PCI(vpci_dev);
>> +    DeviceState *vdev = DEVICE(&dev->vdev);
>> +
>> +    if (dev->vdev.conf.num_queues == VHOST_BLK_AUTO_NUM_QUEUES) {
>> +        dev->vdev.conf.num_queues = MIN(virtio_pci_optimal_num_queues(0),
>> +                                        VHOST_BLK_MAX_QUEUES);
>> +    }
>> +
>> +    if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
>> +        vpci_dev->nvectors = dev->vdev.conf.num_queues + 1;
>> +    }
>> +
>> +    qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
>> +}
>> +
>> +static void vhost_blk_pci_class_init(ObjectClass *klass, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
>> +    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
>> +
>> +    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
>> +    device_class_set_props(dc, vhost_blk_pci_properties);
>> +    k->realize = vhost_blk_pci_realize;
>> +    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
>> +    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK;
>> +    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
>> +    pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
>> +}
>> +
>> +static void vhost_blk_pci_instance_init(Object *obj)
>> +{
>> +    VHostBlkPCI *dev = VHOST_BLK_PCI(obj);
>> +
>> +    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
>> +                                TYPE_VHOST_BLK);
>> +    object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
>> +                              "bootindex");
>> +}
>> +
>> +static const VirtioPCIDeviceTypeInfo vhost_blk_pci_info = {
>> +    .base_name               = TYPE_VHOST_BLK_PCI,
>> +    .generic_name            = "vhost-blk-pci",
>> +    .transitional_name       = "vhost-blk-pci-transitional",
>> +    .non_transitional_name   = "vhost-blk-pci-non-transitional",
>> +    .instance_size  = sizeof(VHostBlkPCI),
>> +    .instance_init  = vhost_blk_pci_instance_init,
>> +    .class_init     = vhost_blk_pci_class_init,
>> +};
>> +
>> +static void vhost_blk_pci_register(void)
>> +{
>> +    virtio_pci_types_register(&vhost_blk_pci_info);
>> +}
>> +
>> +type_init(vhost_blk_pci_register)
>> diff --git a/include/hw/virtio/vhost-blk.h b/include/hw/virtio/vhost-blk.h
>> new file mode 100644
>> index 0000000000..76ece4215d
>> --- /dev/null
>> +++ b/include/hw/virtio/vhost-blk.h
>> @@ -0,0 +1,50 @@
>> +/*
>> + * Copyright (c) 2022 Virtuozzo International GmbH.
>> + * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
>> + *
>> + * vhost-blk is a host kernel accelerator for virtio-blk.
>> + *
>> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
>> + * See the COPYING.LIB file in the top-level directory.
>> + */
>> +
>> +#ifndef VHOST_BLK_H
>> +#define VHOST_BLK_H
>> +
>> +#include "standard-headers/linux/virtio_blk.h"
>> +#include "hw/block/block.h"
>> +#include "hw/virtio/vhost.h"
>> +#include "sysemu/block-backend.h"
>> +
>> +#define TYPE_VHOST_BLK "vhost-blk"
>> +#define VHOST_BLK(obj) \
>> +        OBJECT_CHECK(VHostBlk, (obj), TYPE_VHOST_BLK)
>> +
>> +#define VHOST_BLK_AUTO_NUM_QUEUES UINT16_MAX
>> +#define VHOST_BLK_MAX_QUEUES 8
>> +
>> +/*
>> + * normally this would be visible from imported headers;
>> + * temporarily defined here to simplify development
>> + */
>> +#define VHOST_BLK_SET_BACKEND          _IOW(VHOST_VIRTIO, 0x90, \
>> +                                            struct vhost_vring_file)
>> +
>> +typedef struct VhostBlkConf {
>> +    BlockConf conf;
> 
> What is the purpose of using BlockConf but bypassing the QEMU block
> layer?

I did not want to:
- declare several properties BlockConf already has
- duplicate configuration helpers like blkconf_geometry() for the above
These were the main reasons.

> 
> If the file is a qcow2 file or there are additional block drivers like
> the luks crypto driver then this doesn't work. If block jobs and other
> block operations are performed on the QMP monitor then the image can be
> corrupted.

Thank you for pointing this out. Unfortunately I know little about the 
QEMU block layer and did not realize such things could happen in this 
case. I will try to re-evaluate this.
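
Until then, one way to at least fail cleanly would be to verify that the
backend really is a raw image on a file-posix node before grabbing the fd.
A sketch (the exact check is my assumption; only the fields themselves,
BlockDriverState::drv and BlockDriver::format_name, are real):

    BlockDriverState *bs = blk_bs(s->conf.conf.blk);

    if (!bs->file || strcmp(bs->file->bs->drv->format_name, "file")) {
        error_setg(errp, "vhost-blk: only raw file-posix images are supported");
        return;
    }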

> 
>> +    uint16_t num_queues;
>> +    uint16_t queue_size;
>> +} VhostBlkConf;
>> +
>> +typedef struct VHostBlk {
>> +    VirtIODevice parent_obj;
>> +    VhostBlkConf conf;
>> +    uint64_t host_features;
>> +    uint64_t decided_features;
>> +    struct virtio_blk_config blkcfg;
>> +    int vhostfd;
>> +    struct vhost_dev dev;
>> +    bool vhost_started;
>> +} VHostBlk;
>> +
>> +#endif
>> diff --git a/meson.build b/meson.build
>> index 8a8c415fc1..7c00a8ce89 100644
>> --- a/meson.build
>> +++ b/meson.build
>> @@ -336,6 +336,9 @@ have_vhost_kernel = get_option('vhost_kernel') \
>>   have_vhost_user_crypto = get_option('vhost_crypto') \
>>     .require(have_vhost_user,
>>              error_message: 'vhost-crypto requires vhost-user to be enabled').allowed()
>> +have_vhost_blk = get_option('vhost_blk') \
>> +  .require(targetos == 'linux',
>> +           error_message: 'vhost-blk is only available on Linux').allowed()
>>   
>>   have_vhost = have_vhost_user or have_vhost_vdpa or have_vhost_kernel
>>   
>> @@ -1814,6 +1817,7 @@ config_host_data.set('CONFIG_VHOST_KERNEL', have_vhost_kernel)
>>   config_host_data.set('CONFIG_VHOST_USER', have_vhost_user)
>>   config_host_data.set('CONFIG_VHOST_CRYPTO', have_vhost_user_crypto)
>>   config_host_data.set('CONFIG_VHOST_VDPA', have_vhost_vdpa)
>> +config_host_data.set('CONFIG_VHOST_BLK', have_vhost_blk)
>>   config_host_data.set('CONFIG_VMNET', vmnet.found())
>>   config_host_data.set('CONFIG_VHOST_USER_BLK_SERVER', have_vhost_user_blk_server)
>>   config_host_data.set('CONFIG_VDUSE_BLK_EXPORT', have_vduse_blk_export)
>> @@ -3756,6 +3760,7 @@ summary_info += {'vhost-user support': have_vhost_user}
>>   summary_info += {'vhost-user-crypto support': have_vhost_user_crypto}
>>   summary_info += {'vhost-user-blk server support': have_vhost_user_blk_server}
>>   summary_info += {'vhost-vdpa support': have_vhost_vdpa}
>> +summary_info += {'vhost-blk support': have_vhost_blk}
>>   summary_info += {'build guest agent': have_ga}
>>   summary(summary_info, bool_yn: true, section: 'Configurable features')
>>   
>> -- 
>> 2.31.1
>>


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-10-05  9:14   ` Andrey Zhadchenko
@ 2022-10-05 15:18     ` Stefan Hajnoczi
  0 siblings, 0 replies; 19+ messages in thread
From: Stefan Hajnoczi @ 2022-10-05 15:18 UTC (permalink / raw)
  To: Andrey Zhadchenko; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den

On Wed, Oct 05, 2022 at 12:14:18PM +0300, Andrey Zhadchenko wrote:
> On 10/4/22 21:13, Stefan Hajnoczi wrote:
> > On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
> > > Although QEMU virtio-blk is quite fast, there is still some room for
> > > improvements. Disk latency can be reduced if we handle virtio-blk requests
> > > in the host kernel so we avoid a lot of syscalls and context switches.
> > > 
> > > The biggest disadvantage of this vhost-blk flavor is raw format.
> > > Luckily Kirill Thai proposed a device mapper driver for the QCOW2 format to attach
> > > files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
> > > 
> > > Also by using kernel modules we can bypass the iothread limitation and finally
> > > scale block requests with cpus for high-performance devices. This is planned
> > > to be implemented in the next version.
> > 
> > Hi Andrey,
> > Do you have a new version of this patch series that uses multiple
> > threads?
> > 
> > I have been playing with vq-IOThread mapping in QEMU and would like to
> > benchmark vhost-blk vs QEMU virtio-blk mq IOThreads:
> > https://gitlab.com/stefanha/qemu/-/tree/virtio-blk-mq-iothread-prototype
> > 
> > Thanks,
> > Stefan
> 
> Hi Stefan
> For now my multi-threaded version is only available for the Red Hat 9
> 5.14.0 kernel. If you really want it, you can grab it from here:
> https://lists.openvz.org/pipermail/devel/2022-September/079951.html (kernel)
> For the QEMU part, all you need is to add something like this to vhost_blk_start():
> 
> #define VHOST_SET_NWORKERS _IOW(VHOST_VIRTIO, 0x1F, int)
> ioctl(s->vhostfd, VHOST_SET_NWORKERS, &nworkers);
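> 
> In context, the call might sit in vhost_blk_start() roughly like this
> (an untested sketch; it assumes nworkers comes from the num-threads
> property and that the kernel module accepts the ioctl):
> 
>     int nworkers = s->conf.num_threads;
> 
>     if (ioctl(s->vhostfd, VHOST_SET_NWORKERS, &nworkers) < 0) {
>         error_report("vhost-blk: unable to set %d workers: %d",
>                      nworkers, errno);
>         return -errno;
>     }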
> 
> Or you can wait a bit. I should be able to send the second version by the end
> of the week (Monday in the worst case).

Thanks, I will wait for the next revision.

Stefan

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-10-05 10:28   ` Andrey Zhadchenko
@ 2022-10-05 15:30     ` Stefan Hajnoczi
  0 siblings, 0 replies; 19+ messages in thread
From: Stefan Hajnoczi @ 2022-10-05 15:30 UTC (permalink / raw)
  To: Andrey Zhadchenko; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den

On Wed, Oct 05, 2022 at 01:28:14PM +0300, Andrey Zhadchenko wrote:
> 
> 
> On 10/4/22 21:26, Stefan Hajnoczi wrote:
> > On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
> > > Although QEMU virtio-blk is quite fast, there is still some room for
> > > improvements. Disk latency can be reduced if we handle virtio-blk requests
> > > in the host kernel so we avoid a lot of syscalls and context switches.
> > > 
> > > The biggest disadvantage of this vhost-blk flavor is raw format.
> > > Luckily Kirill Thai proposed a device mapper driver for the QCOW2 format to attach
> > > files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
> > > 
> > > Also by using kernel modules we can bypass the iothread limitation and finally
> > > scale block requests with cpus for high-performance devices. This is planned
> > > to be implemented in the next version.
> > > 
> > > Linux kernel module part:
> > > https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadchenko@virtuozzo.com/
> > > 
> > > test setups and results:
> > > fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
> > 
> > > QEMU drive options: cache=none
> > > filesystem: xfs
> > 
> > Please post the full QEMU command-line so it's clear exactly what this
> > is benchmarking.
> 
> The full command for vhost is this:
> qemu-system-x86_64 \
> -kernel bzImage -nographic -append "console=ttyS0 root=/dev/sdb rw
> systemd.unified_cgroup_hierarchy=0 nokaslr" \
> -m 1024 -s --enable-kvm -smp $2 \
> -drive id=main_drive,file=debian_sid.img,media=disk,format=raw \
> -drive id=vhost_drive,file=$1,media=disk,format=raw,if=none \

No cache=none because vhost-blk directly submits bios in the kernel?

> -device vhost-blk-pci,drive=vhost_drive,num-threads=$3
> 
> (num-threads option for vhost-blk-pci was not used)
> 
> For virtio I used this:
> qemu-system-x86_64 \
> -kernel bzImage -nographic -append "console=ttyS0 root=/dev/sdb rw
> systemd.unified_cgroup_hierarchy=0 nokaslr" \
> -m 1024 -s --enable-kvm -smp $2 \
> -drive file=debian_sid.img,media=disk \
> -drive file=$1,media=disk,if=virtio,cache=none,if=none,id=d1,aio=threads\
> -device virtio-blk-pci,drive=d1
> 
> > 
> > A preallocated raw image file is a good baseline with:
> > 
> >    --object iothread,id=iothread0 \
> >    --blockdev file,filename=test.img,cache.direct=on,aio=native,node-name=drive0 \
> >    --device virtio-blk-pci,drive=drive0,iothread=iothread0
> The image I used was a preallocated qcow2 image set up with dm-qcow2, because
> this vhost-blk version directly uses the bio interface and can't work with
> regular files.

I see. 

> 
> > 
> > (BTW QEMU's default vq size is 256 descriptors and the number of vqs is
> > the number of vCPUs.)
> > 
> > > 
> > > SSD:
> > >                 | randread, IOPS  | randwrite, IOPS |
> > > Host           |      95.8k	 |	85.3k	   |
> > > QEMU virtio    |      57.5k	 |	79.4k	   |
> 
> Adding iothread0 and using a raw file instead of the qcow2 + dm-qcow2 setup
> brings the numbers to
>                   |      60.4k   |      84.3k      |
> 
> > > QEMU vhost-blk |      95.6k	 |	84.3k	   |
> > > 
> > > RAMDISK (vq == vcpu):
> > 
> > With fio numjobs=vcpu here?
> 
> Yes
> 
> > 
> > >                   | randread, IOPS | randwrite, IOPS |
> > > virtio, 1vcpu    |	123k	  |	 129k       |
> > > virtio, 2vcpu    |	253k (??) |	 250k (??)  |
> > 
> > QEMU's aio=threads (default) gets around the single IOThread. It beats
> > aio=native for this reason in some cases. Were you using aio=native or
> > aio=threads?
> 
> At some point I started to specify aio=threads (before that I did not use
> this option). I am not sure exactly when. I will re-measure all cases for
> the next submission.

aio=native is usually recommended. aio=threads is less optimized.

aio=native should have lower latency than aio=threads although it scales
worse on hosts with free CPUs because it's limited to a single thread.

> 
> > 
> > > virtio, 4vcpu    |	158k	  |	 154k       |
> > > vhost-blk, 1vcpu |	110k	  |	 113k       |
> > > vhost-blk, 2vcpu |	247k	  |	 252k       |
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC patch 0/1] block: vhost-blk backend
  2022-10-05 11:50   ` Andrey Zhadchenko
@ 2022-10-05 15:40     ` Stefan Hajnoczi
  0 siblings, 0 replies; 19+ messages in thread
From: Stefan Hajnoczi @ 2022-10-05 15:40 UTC (permalink / raw)
  To: Andrey Zhadchenko; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den

On Wed, Oct 05, 2022 at 02:50:06PM +0300, Andrey Zhadchenko wrote:
> 
> 
> On 10/4/22 22:00, Stefan Hajnoczi wrote:
> > On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
> > > Although QEMU virtio-blk is quite fast, there is still some room for
> > > improvements. Disk latency can be reduced if we handle virtio-blk requests
> > > in the host kernel so we avoid a lot of syscalls and context switches.
> > > 
> > > The biggest disadvantage of this vhost-blk flavor is raw format.
> > > Luckily Kirill Thai proposed a device mapper driver for the QCOW2 format to attach
> > > files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
> > > 
> > > Also by using kernel modules we can bypass the iothread limitation and finally
> > > scale block requests with cpus for high-performance devices. This is planned
> > > to be implemented in the next version.
> > > 
> > > Linux kernel module part:
> > > https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadchenko@virtuozzo.com/
> > > 
> > > test setups and results:
> > > fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
> > > QEMU drive options: cache=none
> > > filesystem: xfs
> > > 
> > > SSD:
> > >                 | randread, IOPS  | randwrite, IOPS |
> > > Host           |      95.8k	 |	85.3k	   |
> > > QEMU virtio    |      57.5k	 |	79.4k	   |
> > > QEMU vhost-blk |      95.6k	 |	84.3k	   |
> > > 
> > > RAMDISK (vq == vcpu):
> > >                   | randread, IOPS | randwrite, IOPS |
> > > virtio, 1vcpu    |	123k	  |	 129k       |
> > > virtio, 2vcpu    |	253k (??) |	 250k (??)  |
> > > virtio, 4vcpu    |	158k	  |	 154k       |
> > > vhost-blk, 1vcpu |	110k	  |	 113k       |
> > > vhost-blk, 2vcpu |	247k	  |	 252k       |
> > > vhost-blk, 4vcpu |	576k	  |	 567k       |
> > > 
> > > Andrey Zhadchenko (1):
> > >    block: add vhost-blk backend
> > > 
> > >   configure                     |  13 ++
> > >   hw/block/Kconfig              |   5 +
> > >   hw/block/meson.build          |   1 +
> > >   hw/block/vhost-blk.c          | 395 ++++++++++++++++++++++++++++++++++
> > >   hw/virtio/meson.build         |   1 +
> > >   hw/virtio/vhost-blk-pci.c     | 102 +++++++++
> > >   include/hw/virtio/vhost-blk.h |  44 ++++
> > >   linux-headers/linux/vhost.h   |   3 +
> > >   8 files changed, 564 insertions(+)
> > >   create mode 100644 hw/block/vhost-blk.c
> > >   create mode 100644 hw/virtio/vhost-blk-pci.c
> > >   create mode 100644 include/hw/virtio/vhost-blk.h
> > 
> > vhost-blk has been tried several times in the past. That doesn't mean it
> > cannot be merged this time, but past arguments should be addressed:
> > 
> > - What makes it necessary to move the code into the kernel? In the past
> >    the performance results were not very convincing. The fastest
> >    implementations actually tend to be userspace NVMe PCI drivers that
> >    bypass the kernel! Bypassing the VFS and submitting block requests
> >    directly was not a huge boost. The syscall/context switch argument
> >    sounds okay but the numbers didn't really show that kernel block I/O
> >    is much faster than userspace block I/O.
> > 
> >    I've asked for more details on the QEMU command-line to understand
> >    what your numbers show. Maybe something has changed since previous
> >    times when vhost-blk has been tried.
> > 
> >    The only argument I see is QEMU's current 1 IOThread per virtio-blk
> >    device limitation, which is currently being worked on. If that's the
> >    only reason for vhost-blk then is it worth doing all the work of
> >    getting vhost-blk shipped (kernel, QEMU, and libvirt changes)? It
> >    seems like a short-term solution.
> > 
> > - The security impact of bugs in kernel vhost-blk code is more serious
> >    than bugs in a QEMU userspace process.
> > 
> > - The management stack needs to be changed to use vhost-blk whereas
> >    QEMU can be optimized without affecting other layers.
> > 
> > Stefan
> 
> Indeed there were several vhost-blk attempts, but from what I found in the
> mailing lists only Asias' attempt got some attention and discussion. Ramdisk
> performance results were great, but a ramdisk is more of a benchmark than a
> real use case. I didn't find out why Asias dropped his version beyond a vague
> "he concluded the performance results were not worth it". Storage speed is
> very important for vhost-blk performance, as there is no point in cutting cpu
> costs from 1ms to 0.1ms if the request needs 50ms to complete on the actual
> disk. I think that 10 years ago NVMe was non-existent and SSD + SATA was
> probably a lot faster than HDD, but still not fast enough to make this
> technology worthwhile.

Yes, it's possible that latency improvements are more noticeable now.
Thank you for posting the benchmark results. I will also run benchmarks
so we can compare vhost-blk with today's QEMU as well as multiqueue
IOThreads QEMU (for which I only have a hacky prototype) on a local NVMe
PCI SSD.

> The tests I did gave me 60k IOPS randread for the VM and 95k for the host,
> and vhost-blk is able to close that gap even using only 1 thread/vq/vcpu.
> And unlike QEMU's current single IOThread, it can easily be scaled with the
> number of cpus/vcpus. This could also be solved by lifting the IOThread
> limitation, but that will probably require an even larger amount of changes
> (and adding vhost-blk won't break old setups!).
> 
> Probably the only undisputed advantage of vhost-blk is syscall reduction,
> and again the benefit really depends on storage speed, since the savings
> only matter if the storage latency is comparable to the syscall time. I
> must also note that this may be good for high-density servers with a lot
> of VMs. But for now I do not have exact numbers showing how much time we
> really win per request on average.
> 
> Overall, vhost-blk will only become more attractive as storage speeds
> increase.
> 
> I must also note that all the arguments above apply to vdpa-blk. And unlike
> vhost-blk, which needs its own QEMU code, vdpa-blk can be set up with the
> generic virtio-vdpa QEMU code (I am not sure whether that is merged yet).
> Although vdpa-blk has its own problems for now.

Yes, I think that's why Stefano hasn't pushed for a software vdpa-blk
device yet despite having played with it and is more focused on
hardware enablement. vdpa-blk has the same issues as vhost-blk.

Stefan

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH 1/1] block: add vhost-blk backend
  2022-10-05 13:06     ` Andrey Zhadchenko
@ 2022-10-05 15:50       ` Stefan Hajnoczi
  0 siblings, 0 replies; 19+ messages in thread
From: Stefan Hajnoczi @ 2022-10-05 15:50 UTC (permalink / raw)
  To: Andrey Zhadchenko; +Cc: qemu-block, qemu-devel, kwolf, hreitz, mst, den

On Wed, Oct 05, 2022 at 04:06:58PM +0300, Andrey Zhadchenko wrote:
> Thanks for the review!
> 
> On 10/4/22 21:45, Stefan Hajnoczi wrote:
> > On Mon, Jul 25, 2022 at 11:55:27PM +0300, Andrey Zhadchenko wrote:
> > > Although QEMU virtio is quite fast, there is still some room for
> > > improvements. Disk latency can be reduced if we handle virtio-blk requests
> > > in the host kernel instead of passing them to QEMU. The patch adds a vhost-blk
> > > backend which sets up the vhost-blk kernel module to process requests.
> > > 
> > > test setup and results:
> > > fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
> > > QEMU drive options: cache=none
> > > filesystem: xfs
> > > 
> > > SSD:
> > >                 | randread, IOPS  | randwrite, IOPS |
> > > Host           |      95.8k	 |	85.3k	   |
> > > QEMU virtio    |      57.5k	 |	79.4k	   |
> > > QEMU vhost-blk |      95.6k	 |	84.3k	   |
> > > 
> > > RAMDISK (vq == vcpu):
> > >                   | randread, IOPS | randwrite, IOPS |
> > > virtio, 1vcpu    |	123k	  |	 129k       |
> > > virtio, 2vcpu    |	253k (??) |	 250k (??)  |
> > > virtio, 4vcpu    |	158k	  |	 154k       |
> > > vhost-blk, 1vcpu |	110k	  |	 113k       |
> > > vhost-blk, 2vcpu |	247k	  |	 252k       |
> > > vhost-blk, 4vcpu |	576k	  |	 567k       |
> > > 
> > > Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
> > > ---
> > >   hw/block/Kconfig              |   5 +
> > >   hw/block/meson.build          |   4 +
> > >   hw/block/vhost-blk.c          | 394 ++++++++++++++++++++++++++++++++++
> > >   hw/virtio/meson.build         |   3 +
> > >   hw/virtio/vhost-blk-pci.c     | 102 +++++++++
> > >   include/hw/virtio/vhost-blk.h |  50 +++++
> > >   meson.build                   |   5 +
> > >   7 files changed, 563 insertions(+)
> > >   create mode 100644 hw/block/vhost-blk.c
> > >   create mode 100644 hw/virtio/vhost-blk-pci.c
> > >   create mode 100644 include/hw/virtio/vhost-blk.h
> > > 
> > > diff --git a/hw/block/Kconfig b/hw/block/Kconfig
> > > index 9e8f28f982..b4286ad10e 100644
> > > --- a/hw/block/Kconfig
> > > +++ b/hw/block/Kconfig
> > > @@ -36,6 +36,11 @@ config VIRTIO_BLK
> > >       default y
> > >       depends on VIRTIO
> > > +config VHOST_BLK
> > > +    bool
> > > +    default n
> > 
> > Feel free to enable it by default. That way it gets more CI/build
> > coverage.
> > 
> > > +    depends on VIRTIO && LINUX
> > > +
> > >   config VHOST_USER_BLK
> > >       bool
> > >       # Only PCI devices are provided for now
> > > diff --git a/hw/block/meson.build b/hw/block/meson.build
> > > index 2389326112..caf9bedff3 100644
> > > --- a/hw/block/meson.build
> > > +++ b/hw/block/meson.build
> > > @@ -19,4 +19,8 @@ softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c'))
> > >   specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
> > >   specific_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk.c'))
> > > +if have_vhost_blk
> > > +  specific_ss.add(files('vhost-blk.c'))
> > > +endif
> > 
> > Can this use the same add(when: 'CONFIG_VHOST_BLK', ...) syntax as the
> > other conditional builds above?
> I tried but it didn't go well for me :)
> I will check this again

Maybe a line needs to be added to meson_options.txt.
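
Presumably something along these lines in meson_options.txt, mirroring
the existing vhost options (untested):

    option('vhost_blk', type: 'feature', value: 'auto',
           description: 'vhost-blk kernel backend support')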

> 
> > 
> > > +
> > >   subdir('dataplane')
> > > diff --git a/hw/block/vhost-blk.c b/hw/block/vhost-blk.c
> > > new file mode 100644
> > > index 0000000000..33d90af270
> > > --- /dev/null
> > > +++ b/hw/block/vhost-blk.c
> > > @@ -0,0 +1,394 @@
> > > +/*
> > > + * Copyright (c) 2022 Virtuozzo International GmbH.
> > > + * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
> > > + *
> > > + * vhost-blk is host kernel accelerator for virtio-blk.
> > > + *
> > > + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> > > + * See the COPYING.LIB file in the top-level directory.
> > > + */
> > > +
> > > +#include "qemu/osdep.h"
> > > +#include "qapi/error.h"
> > > +#include "qemu/error-report.h"
> > > +#include "qom/object.h"
> > > +#include "hw/qdev-core.h"
> > > +#include "hw/boards.h"
> > > +#include "hw/virtio/vhost.h"
> > > +#include "hw/virtio/vhost-blk.h"
> > > +#include "hw/virtio/virtio.h"
> > > +#include "hw/virtio/virtio-blk.h"
> > > +#include "hw/virtio/virtio-bus.h"
> > > +#include "hw/virtio/virtio-access.h"
> > > +#include "hw/virtio/virtio-pci.h"
> > > +#include "sysemu/sysemu.h"
> > > +#include "linux-headers/linux/vhost.h"
> > > +#include <sys/ioctl.h>
> > > +#include <linux/fs.h>
> > > +
> > > +static int vhost_blk_start(VirtIODevice *vdev)
> > > +{
> > > +    VHostBlk *s = VHOST_BLK(vdev);
> > > +    struct vhost_vring_file backend;
> > > +    int ret, i;
> > > +    int *fd = blk_bs(s->conf.conf.blk)->file->bs->opaque;
> > 
> > This needs a clean API so vhost-blk.c doesn't make assumptions about the
> > file-posix BlockDriver's internal state memory layout.
> I thought it did have some API but I didn't manage to find it. Does it
> exist?
> Probably I could just open the file by name? But I didn't really want to
> manage flags like read-only. And if the BlockDriver already opens this fd,
> why bother with another copy?

I thought I added something like this in the past, but now I can't find
an API. One approach is to add a public API to block/file-posix.c like
int raw_get_fd(BlockDriverState *bs) that verifies that bs->drv is one
of the file-posix.c BlockDrivers and then returns
((BDRVRawState*)bs->opaque)->fd.
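
A rough sketch of that helper (untested; it assumes the function lives
in block/file-posix.c, where BDRVRawState and the file-posix
BlockDrivers are visible):

    int raw_get_fd(BlockDriverState *bs)
    {
        /* only the file-posix BlockDrivers keep an fd in bs->opaque */
        if (bs->drv != &bdrv_file && bs->drv != &bdrv_host_device) {
            return -ENOTSUP;
        }

        return ((BDRVRawState *)bs->opaque)->fd;
    }

vhost-blk could then call raw_get_fd(blk_bs(s->conf.conf.blk)->file->bs)
instead of dereferencing bs->opaque directly.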

> > 
> > > +    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
> > > +    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
> > > +
> > > +    if (!k->set_guest_notifiers) {
> > > +        error_report("vhost-blk: binding does not support guest notifiers");
> > > +        return -ENOSYS;
> > > +    }
> > > +
> > > +    if (s->vhost_started) {
> > > +        return 0;
> > > +    }
> > > +
> > > +    if (ioctl(s->vhostfd, VHOST_SET_OWNER, NULL)) {
> > > +        error_report("vhost-blk: unable to set owner");
> > > +        return -ENOSYS;
> > > +    }
> > > +
> > > +    ret = vhost_dev_enable_notifiers(&s->dev, vdev);
> > > +    if (ret < 0) {
> > > +        error_report("vhost-blk: unable to enable dev notifiers: %d", errno);
> > > +        return ret;
> > > +    }
> > > +
> > > +    s->dev.acked_features = vdev->guest_features & s->dev.backend_features;
> > > +
> > > +    ret = vhost_dev_start(&s->dev, vdev);
> > > +    if (ret < 0) {
> > > +        error_report("vhost-blk: unable to start vhost dev");
> > > +        return ret;
> > > +    }
> > > +
> > > +    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true);
> > > +    if (ret < 0) {
> > > +        error_report("vhost-blk: unable to bind guest notifiers");
> > > +        goto out;
> > > +    }
> > > +
> > > +    memset(&backend, 0, sizeof(backend));
> > > +    backend.index = 0;
> > > +    backend.fd = *fd;
> > > +    if (ioctl(s->vhostfd, VHOST_BLK_SET_BACKEND, &backend)) {
> > > +        error_report("vhost-blk: unable to set backend");
> > > +        ret = -errno;
> > > +        goto out;
> > > +    }
> > > +
> > > +    for (i = 0; i < s->dev.nvqs; i++) {
> > > +        vhost_virtqueue_mask(&s->dev, vdev, i, false);
> > > +    }
> > > +
> > > +    event_notifier_set(virtio_queue_get_host_notifier(virtio_get_queue(vdev, 0)));
> > 
> > Should this be done for all vqs?
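> > 
> > Something like the following, perhaps (a sketch; it assumes every
> > queue wants the initial kick):
> > 
> >     for (i = 0; i < s->dev.nvqs; i++) {
> >         event_notifier_set(
> >             virtio_queue_get_host_notifier(virtio_get_queue(vdev, i)));
> >     }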
> > 
> > > +
> > > +    s->vhost_started = true;
> > > +
> > > +    return 0;
> > > +
> > > +out:
> > > +    vhost_dev_stop(&s->dev, vdev);
> > > +    return ret;
> > > +
> > > +}
> > > +
> > > +static void vhost_blk_stop(VirtIODevice *vdev)
> > > +{
> > > +    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
> > > +    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
> > > +    VHostBlk *s = VHOST_BLK(vdev);
> > > +    int ret;
> > > +
> > > +    if (!s->vhost_started) {
> > > +        return;
> > > +    }
> > > +
> > > +    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
> > > +    if (ret < 0) {
> > > +        error_report("vhost-blk: unable to unbind guest notifiers");
> > > +    }
> > > +    vhost_dev_disable_notifiers(&s->dev, vdev);
> > > +    vhost_dev_stop(&s->dev, vdev);
> > > +
> > > +    s->vhost_started = false;
> > > +}
> > > +
> > > +static void vhost_blk_reset(VirtIODevice *vdev)
> > > +{
> > > +    VHostBlk *s = VHOST_BLK(vdev);
> > > +    int ret;
> > > +
> > > +    vhost_blk_stop(vdev);
> > > +    ret = ioctl(s->vhostfd, VHOST_RESET_OWNER, NULL);
> > > +    if (ret && errno != EPERM) {
> > > +            error_report("vhost-blk: failed to reset owner %d", errno);
> > > +    }
> > > +}
> > > +
> > > +static void vhost_blk_set_status(VirtIODevice *vdev, uint8_t status)
> > > +{
> > > +    if (status & (VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)) {
> > > +        vhost_blk_stop(vdev);
> > > +        return;
> > > +    }
> > > +
> > > +    if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
> > > +        return;
> > > +    }
> > > +
> > > +    if (vhost_blk_start(vdev)) {
> > > +        error_report("vhost-blk: failed to start");
> > > +    }
> > > +}
> > > +
> > > +static void vhost_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
> > > +{
> > > +}
> > > +
> > > +static void vhost_blk_device_realize(DeviceState *dev, Error **errp)
> > > +{
> > > +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > > +    VHostBlk *s = VHOST_BLK(vdev);
> > > +    VhostBlkConf *conf = &s->conf;
> > > +    int i, ret;
> > > +
> > > +    if (!conf->conf.blk) {
> > > +        error_setg(errp, "vhost-blk: drive property not set");
> > > +        return;
> > > +    }
> > > +
> > > +    if (!blk_is_inserted(conf->conf.blk)) {
> > > +        error_setg(errp, "vhost-blk: device needs media, but drive is empty");
> > > +        return;
> > > +    }
> > > +
> > > +    if (conf->num_queues == VHOST_BLK_AUTO_NUM_QUEUES) {
> > > +        conf->num_queues = MIN(virtio_pci_optimal_num_queues(0),
> > > +                               VHOST_BLK_MAX_QUEUES);
> > 
> > Why is 8 (VHOST_BLK_MAX_QUEUES) queues the maximum?
> 
> An arbitrary number for the RFC patch, to simplify memory management in the
> kernel. I will redo it if we get this series past RFC.
> 
> > 
> > > +    }
> > > +
> > > +    if (!conf->num_queues) {
> > > +        error_setg(errp, "vhost-blk: num-queues property must be larger than 0");
> > > +        return;
> > > +    }
> > > +
> > > +    if (conf->queue_size <= 2) {
> > > +        error_setg(errp, "vhost-blk: invalid queue-size property (%" PRIu16 "), "
> > > +                   "must be > 2", conf->queue_size);
> > > +        return;
> > > +    }
> > > +
> > > +    if (!is_power_of_2(conf->queue_size) ||
> > > +        conf->queue_size > VIRTQUEUE_MAX_SIZE) {
> > > +        error_setg(errp, "vhost_blk: invalid queue-size property (%" PRIu16 "), "
> > > +                   "must be a power of 2 (max %d)",
> > > +                   conf->queue_size, VIRTQUEUE_MAX_SIZE);
> > > +        return;
> > > +    }
> > > +
> > > +    if (!blkconf_apply_backend_options(&conf->conf,
> > > +                                       !blk_supports_write_perm(conf->conf.blk),
> > > +                                       true, errp)) {
> > > +        return;
> > > +    }
> > > +
> > > +    if (!blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, errp)) {
> > > +        return;
> > > +    }
> > > +
> > > +    if (!blkconf_blocksizes(&conf->conf, errp)) {
> > > +        return;
> > > +    }
> > > +
> > > +    s->dev.nvqs = conf->num_queues;
> > > +    s->dev.max_queues = conf->num_queues;
> > > +    s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs);
> > > +    s->dev.vq_index = 0;
> > > +
> > > +    virtio_init(vdev, VIRTIO_ID_BLOCK, sizeof(struct virtio_blk_config));
> > > +
> > > +    for (i = 0; i < conf->num_queues; i++) {
> > > +        virtio_add_queue(vdev, conf->queue_size, vhost_blk_handle_output);
> > > +    }
> > > +
> > > +    s->vhostfd = open("/dev/vhost-blk", O_RDWR);
> > > +    if (s->vhostfd < 0) {
> > > +        error_setg(errp, "vhost-blk: unable to open /dev/vhost-blk");
> > > +        goto cleanup;
> > > +    }
> > > +
> > > +    s->dev.acked_features = 0;
> > > +    ret = ioctl(s->vhostfd, VHOST_GET_FEATURES, &s->dev.backend_features);
> > > +    if (ret < 0) {
> > > +        error_setg(errp, "vhost-blk: unable to get backend features");
> > > +        goto cleanup;
> > > +    }
> > > +
> > > +    ret = vhost_dev_init(&s->dev, (void *)((size_t)s->vhostfd),
> > > +                         VHOST_BACKEND_TYPE_KERNEL, 0, false);
> > 
> > The last parameter should be NULL (Error **) instead of false.
> > 
> > > +    if (ret < 0) {
> > > +        error_setg(errp, "vhost-blk: vhost initialization failed: %s",
> > > +                strerror(-ret));
> > > +        goto cleanup;
> > > +    }
> > > +
> > > +    return;
> > > +
> > > +cleanup:
> > > +    g_free(s->dev.vqs);
> > > +    close(s->vhostfd);
> > > +    for (i = 0; i < conf->num_queues; i++) {
> > > +        virtio_del_queue(vdev, i);
> > > +    }
> > > +    virtio_cleanup(vdev);
> > > +    return;
> > > +}
> > > +
> > > +static void vhost_blk_device_unrealize(DeviceState *dev)
> > > +{
> > > +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > > +    VHostBlk *s = VHOST_BLK(dev);
> > > +
> > > +    vhost_blk_set_status(vdev, 0);
> > > +    vhost_dev_cleanup(&s->dev);
> > > +    g_free(s->dev.vqs);
> > > +    virtio_cleanup(vdev);
> > > +}
> > > +
> > > +static const int user_feature_bits[] = {
> > > +    VIRTIO_BLK_F_FLUSH,
> > > +    VIRTIO_RING_F_INDIRECT_DESC,
> > > +    VIRTIO_RING_F_EVENT_IDX,
> > > +    VHOST_INVALID_FEATURE_BIT
> > > +};
> > > +
> > > +
> > > +static uint64_t vhost_blk_get_features(VirtIODevice *vdev,
> > > +                                            uint64_t features,
> > > +                                            Error **errp)
> > > +{
> > > +    VHostBlk *s = VHOST_BLK(vdev);
> > > +    uint64_t res;
> > > +
> > > +    features |= s->host_features;
> > > +
> > > +    virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
> > > +    virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
> > > +    virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
> > > +    virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
> > > +    virtio_add_feature(&features, VIRTIO_BLK_F_SIZE_MAX);
> > > +
> > > +    virtio_add_feature(&features, VIRTIO_F_VERSION_1);
> > > +
> > > +    if (!blk_is_writable(s->conf.conf.blk)) {
> > > +        virtio_add_feature(&features, VIRTIO_BLK_F_RO);
> > > +    }
> > > +
> > > +    if (s->conf.num_queues > 1) {
> > > +        virtio_add_feature(&features, VIRTIO_BLK_F_MQ);
> > > +    }
> > > +
> > > +    res = vhost_get_features(&s->dev, user_feature_bits, features);
> > > +
> > > +    return res;
> > > +}
> > > +
> > > +static void vhost_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> > > +{
> > > +    VHostBlk *s = VHOST_BLK(vdev);
> > > +    BlockConf *conf = &s->conf.conf;
> > > +    struct virtio_blk_config blkcfg;
> > > +    uint64_t capacity;
> > > +    int64_t length;
> > > +    int blk_size = conf->logical_block_size;
> > > +
> > > +    blk_get_geometry(s->conf.conf.blk, &capacity);
> > > +    memset(&blkcfg, 0, sizeof(blkcfg));
> > > +    virtio_stq_p(vdev, &blkcfg.capacity, capacity);
> > > +    virtio_stl_p(vdev, &blkcfg.seg_max, s->conf.queue_size - 2);
> > > +    virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
> > > +    virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
> > > +    blkcfg.geometry.heads = conf->heads;
> > > +
> > > +    length = blk_getlength(s->conf.conf.blk);
> > > +    if (length > 0 && length / conf->heads / conf->secs % blk_size) {
> > > +        unsigned short mask;
> > > +
> > > +        mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
> > > +        blkcfg.geometry.sectors = conf->secs & ~mask;
> > > +    } else {
> > > +        blkcfg.geometry.sectors = conf->secs;
> > > +    }
> > > +
> > > +    blkcfg.size_max = 0;
> > > +    blkcfg.physical_block_exp = get_physical_block_exp(conf);
> > > +    blkcfg.alignment_offset = 0;
> > > +    virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
> > > +
> > > +    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
> > > +}
> > > +
> > > +static Property vhost_blk_properties[] = {
> > > +    DEFINE_BLOCK_PROPERTIES(VHostBlk, conf.conf),
> > > +    DEFINE_PROP_UINT16("num-queues", VHostBlk, conf.num_queues,
> > > +                       VHOST_BLK_AUTO_NUM_QUEUES),
> > > +    DEFINE_PROP_UINT16("queue-size", VHostBlk, conf.queue_size, 256),
> > > +/* Discard and write-zeroes not yet implemented in kernel module */
> > > +    DEFINE_PROP_BIT64("discard", VHostBlk, host_features,
> > > +                      VIRTIO_BLK_F_DISCARD, false),
> > > +    DEFINE_PROP_BIT64("write-zeroes", VHostBlk, host_features,
> > > +                      VIRTIO_BLK_F_WRITE_ZEROES, false),
> > > +    DEFINE_PROP_END_OF_LIST(),
> > > +};
> > > +
> > > +static const VMStateDescription vmstate_vhost_blk = {
> > > +    .name = "vhost-blk",
> > > +    .minimum_version_id = 1,
> > > +    .version_id = 1,
> > > +    .fields = (VMStateField[]) {
> > > +        VMSTATE_VIRTIO_DEVICE,
> > > +        VMSTATE_END_OF_LIST()
> > > +    },
> > > +};
> > > +
> > > +static void vhost_blk_class_init(ObjectClass *klass, void *data)
> > > +{
> > > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > > +    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
> > > +
> > > +    device_class_set_props(dc, vhost_blk_properties);
> > > +    dc->vmsd = &vmstate_vhost_blk;
> > 
> > Does this really support live migration? Are in-flight requests drained
> > when pausing for live migration handover?
> 
> Not yet, but it doesn't look hard to implement. Resetting the device will
> drain all requests.
> 
> > 
> > > +    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
> > > +    vdc->realize = vhost_blk_device_realize;
> > > +    vdc->unrealize = vhost_blk_device_unrealize;
> > > +    vdc->get_config = vhost_blk_update_config;
> > > +    vdc->get_features = vhost_blk_get_features;
> > > +    vdc->set_status = vhost_blk_set_status;
> > > +    vdc->reset = vhost_blk_reset;
> > > +}
> > > +
> > > +static void vhost_blk_instance_init(Object *obj)
> > > +{
> > > +    VHostBlk *s = VHOST_BLK(obj);
> > > +
> > > +    device_add_bootindex_property(obj, &s->conf.conf.bootindex,
> > > +                                  "bootindex", "/disk@0,0",
> > > +                                  DEVICE(obj));
> > > +}
> > > +
> > > +static const TypeInfo vhost_blk_info = {
> > > +    .name = TYPE_VHOST_BLK,
> > > +    .parent = TYPE_VIRTIO_DEVICE,
> > > +    .instance_size = sizeof(VHostBlk),
> > > +    .instance_init = vhost_blk_instance_init,
> > > +    .class_init = vhost_blk_class_init,
> > > +};
> > > +
> > > +static void virtio_register_types(void)
> > > +{
> > > +    type_register_static(&vhost_blk_info);
> > > +}
> > > +
> > > +type_init(virtio_register_types)
> > > diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
> > > index 7e8877fd64..fb2c0e7242 100644
> > > --- a/hw/virtio/meson.build
> > > +++ b/hw/virtio/meson.build
> > > @@ -40,6 +40,9 @@ virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng-
> > >   virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_SCSI', if_true: files('vhost-user-scsi-pci.c'))
> > >   virtio_pci_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-pci.c'))
> > >   virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-pci.c'))
> > > +if have_vhost_blk
> > > +  virtio_ss.add(files('vhost-blk-pci.c'))
> > > +endif
> > >   virtio_pci_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto-pci.c'))
> > >   virtio_pci_ss.add(when: 'CONFIG_VIRTIO_INPUT_HOST', if_true: files('virtio-input-host-pci.c'))
> > > diff --git a/hw/virtio/vhost-blk-pci.c b/hw/virtio/vhost-blk-pci.c
> > > new file mode 100644
> > > index 0000000000..f3b6e112b4
> > > --- /dev/null
> > > +++ b/hw/virtio/vhost-blk-pci.c
> > > @@ -0,0 +1,102 @@
> > > +/*
> > > + * Copyright (c) 2022 Virtuozzo International GmbH.
> > > + * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
> > > + *
> > > + * vhost-blk PCI bindings
> > > + *
> > > + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> > > + * See the COPYING.LIB file in the top-level directory.
> > > + */
> > > +
> > > +#include "qemu/osdep.h"
> > > +
> > > +#include "standard-headers/linux/virtio_pci.h"
> > > +#include "hw/virtio/virtio.h"
> > > +#include "hw/virtio/vhost-blk.h"
> > > +#include "hw/pci/pci.h"
> > > +#include "hw/qdev-properties.h"
> > > +#include "qapi/error.h"
> > > +#include "qemu/error-report.h"
> > > +#include "qemu/module.h"
> > > +#include "hw/virtio/virtio-pci.h"
> > > +#include "qom/object.h"
> > > +
> > > +typedef struct VHostBlkPCI VHostBlkPCI;
> > > +
> > > +/*
> > > + * vhost-blk-pci: This extends VirtioPCIProxy.
> > > + */
> > > +#define TYPE_VHOST_BLK_PCI "vhost-blk-pci-base"
> > > +DECLARE_INSTANCE_CHECKER(VHostBlkPCI, VHOST_BLK_PCI,
> > > +                         TYPE_VHOST_BLK_PCI)
> > > +
> > > +struct VHostBlkPCI {
> > > +    VirtIOPCIProxy parent_obj;
> > > +    VHostBlk vdev;
> > > +};
> > > +
> > > +static Property vhost_blk_pci_properties[] = {
> > > +    DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
> > > +    DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
> > > +                       DEV_NVECTORS_UNSPECIFIED),
> > > +    DEFINE_PROP_END_OF_LIST(),
> > > +};
> > > +
> > > +static void vhost_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> > > +{
> > > +    VHostBlkPCI *dev = VHOST_BLK_PCI(vpci_dev);
> > > +    DeviceState *vdev = DEVICE(&dev->vdev);
> > > +
> > > +    if (dev->vdev.conf.num_queues == VHOST_BLK_AUTO_NUM_QUEUES) {
> > > +        dev->vdev.conf.num_queues = MIN(virtio_pci_optimal_num_queues(0),
> > > +                                        VHOST_BLK_MAX_QUEUES);
> > > +    }
> > > +
> > > +    if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
> > > +        vpci_dev->nvectors = dev->vdev.conf.num_queues + 1;
> > > +    }
> > > +
> > > +    qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
> > > +}
> > > +
> > > +static void vhost_blk_pci_class_init(ObjectClass *klass, void *data)
> > > +{
> > > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > > +    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
> > > +    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
> > > +
> > > +    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
> > > +    device_class_set_props(dc, vhost_blk_pci_properties);
> > > +    k->realize = vhost_blk_pci_realize;
> > > +    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
> > > +    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK;
> > > +    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
> > > +    pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
> > > +}
> > > +
> > > +static void vhost_blk_pci_instance_init(Object *obj)
> > > +{
> > > +    VHostBlkPCI *dev = VHOST_BLK_PCI(obj);
> > > +
> > > +    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
> > > +                                TYPE_VHOST_BLK);
> > > +    object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
> > > +                              "bootindex");
> > > +}
> > > +
> > > +static const VirtioPCIDeviceTypeInfo vhost_blk_pci_info = {
> > > +    .base_name               = TYPE_VHOST_BLK_PCI,
> > > +    .generic_name            = "vhost-blk-pci",
> > > +    .transitional_name       = "vhost-blk-pci-transitional",
> > > +    .non_transitional_name   = "vhost-blk-pci-non-transitional",
> > > +    .instance_size  = sizeof(VHostBlkPCI),
> > > +    .instance_init  = vhost_blk_pci_instance_init,
> > > +    .class_init     = vhost_blk_pci_class_init,
> > > +};
> > > +
> > > +static void vhost_blk_pci_register(void)
> > > +{
> > > +    virtio_pci_types_register(&vhost_blk_pci_info);
> > > +}
> > > +
> > > +type_init(vhost_blk_pci_register)
> > > diff --git a/include/hw/virtio/vhost-blk.h b/include/hw/virtio/vhost-blk.h
> > > new file mode 100644
> > > index 0000000000..76ece4215d
> > > --- /dev/null
> > > +++ b/include/hw/virtio/vhost-blk.h
> > > @@ -0,0 +1,50 @@
> > > +/*
> > > + * Copyright (c) 2022 Virtuozzo International GmbH.
> > > + * Author: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
> > > + *
> > > + * vhost-blk is host kernel accelerator for virtio-blk.
> > > + *
> > > + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> > > + * See the COPYING.LIB file in the top-level directory.
> > > + */
> > > +
> > > +#ifndef VHOST_BLK_H
> > > +#define VHOST_BLK_H
> > > +
> > > +#include "standard-headers/linux/virtio_blk.h"
> > > +#include "hw/block/block.h"
> > > +#include "hw/virtio/vhost.h"
> > > +#include "sysemu/block-backend.h"
> > > +
> > > +#define TYPE_VHOST_BLK "vhost-blk"
> > > +#define VHOST_BLK(obj) \
> > > +        OBJECT_CHECK(VHostBlk, (obj), TYPE_VHOST_BLK)
> > > +
> > > +#define VHOST_BLK_AUTO_NUM_QUEUES UINT16_MAX
> > > +#define VHOST_BLK_MAX_QUEUES 8
> > > +
> > > +/*
> > > + * normally should be visible from imported headers
> > > + * temporary define here to simplify development
> > > + */
> > > +#define VHOST_BLK_SET_BACKEND          _IOW(VHOST_VIRTIO, 0x90, \
> > > +                                            struct vhost_vring_file)
> > > +
> > > +typedef struct VhostBlkConf {
> > > +    BlockConf conf;
> > 
> > What is the purpose of using BlockConf but bypassing the QEMU block
> > layer?
> 
> I did not want to:
> - declare several properties BlockConf already has
> - duplicate configuration helpers like blkconf_geometry() for the above
> These were the main reasons.
> 
> > 
> > If the file is a qcow2 file or there are additional block drivers like
> > the luks crypto driver then this doesn't work. If block jobs and other
> > block operations are performed on the QMP monitor then the image can be
> > corrupted.
> 
> Thank you for pointing this out. Unfortunately I know little about the QEMU
> block layer and did not realize such things could happen in this case. I
> will try to re-evaluate this.

Kevin may have an opinion on whether to use the QEMU block layer or to
have filename=<str>/fd=<fdnum> options for setting the file/block
device. I think in the past the suggestion was to avoid using the QEMU
block layer if I/O won't actually go through it.
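
If the block layer ends up being bypassed, the device properties could
look something like this instead of DEFINE_BLOCK_PROPERTIES (an
illustrative sketch only; it assumes a char *filename field is added to
VhostBlkConf):

    static Property vhost_blk_properties[] = {
        DEFINE_PROP_STRING("filename", VHostBlk, conf.filename),
        DEFINE_PROP_UINT16("num-queues", VHostBlk, conf.num_queues,
                           VHOST_BLK_AUTO_NUM_QUEUES),
        DEFINE_PROP_UINT16("queue-size", VHostBlk, conf.queue_size, 256),
        DEFINE_PROP_END_OF_LIST(),
    };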

> 
> > 
> > > +    uint16_t num_queues;
> > > +    uint16_t queue_size;
> > > +} VhostBlkConf;
> > > +
> > > +typedef struct VHostBlk {
> > > +    VirtIODevice parent_obj;
> > > +    VhostBlkConf conf;
> > > +    uint64_t host_features;
> > > +    uint64_t decided_features;
> > > +    struct virtio_blk_config blkcfg;
> > > +    int vhostfd;
> > > +    struct vhost_dev dev;
> > > +    bool vhost_started;
> > > +} VHostBlk;
> > > +
> > > +#endif
> > > diff --git a/meson.build b/meson.build
> > > index 8a8c415fc1..7c00a8ce89 100644
> > > --- a/meson.build
> > > +++ b/meson.build
> > > @@ -336,6 +336,9 @@ have_vhost_kernel = get_option('vhost_kernel') \
> > >   have_vhost_user_crypto = get_option('vhost_crypto') \
> > >     .require(have_vhost_user,
> > >              error_message: 'vhost-crypto requires vhost-user to be enabled').allowed()
> > > +have_vhost_blk = get_option('vhost_blk') \
> > > +  .require(targetos == 'linux',
> > > +           error_message: 'vhost-blk is only available on Linux').allowed()
> > >   have_vhost = have_vhost_user or have_vhost_vdpa or have_vhost_kernel
> > > @@ -1814,6 +1817,7 @@ config_host_data.set('CONFIG_VHOST_KERNEL', have_vhost_kernel)
> > >   config_host_data.set('CONFIG_VHOST_USER', have_vhost_user)
> > >   config_host_data.set('CONFIG_VHOST_CRYPTO', have_vhost_user_crypto)
> > >   config_host_data.set('CONFIG_VHOST_VDPA', have_vhost_vdpa)
> > > +config_host_data.set('CONFIG_VHOST_BLK', have_vhost_blk)
> > >   config_host_data.set('CONFIG_VMNET', vmnet.found())
> > >   config_host_data.set('CONFIG_VHOST_USER_BLK_SERVER', have_vhost_user_blk_server)
> > >   config_host_data.set('CONFIG_VDUSE_BLK_EXPORT', have_vduse_blk_export)
> > > @@ -3756,6 +3760,7 @@ summary_info += {'vhost-user support': have_vhost_user}
> > >   summary_info += {'vhost-user-crypto support': have_vhost_user_crypto}
> > >   summary_info += {'vhost-user-blk server support': have_vhost_user_blk_server}
> > >   summary_info += {'vhost-vdpa support': have_vhost_vdpa}
> > > +summary_info += {'vhost-blk support': have_vhost_blk}
> > >   summary_info += {'build guest agent': have_ga}
> > >   summary(summary_info, bool_yn: true, section: 'Configurable features')
> > > -- 
> > > 2.31.1
> > > 
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2022-10-05 15:53 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-25 20:55 [RFC patch 0/1] block: vhost-blk backend Andrey Zhadchenko via
2022-07-25 20:55 ` [RFC PATCH 1/1] block: add " Andrey Zhadchenko via
2022-10-04 18:45   ` Stefan Hajnoczi
2022-10-05 13:06     ` Andrey Zhadchenko
2022-10-05 15:50       ` Stefan Hajnoczi
2022-07-26 13:51 ` [RFC patch 0/1] block: " Michael S. Tsirkin
2022-07-26 14:15   ` Denis V. Lunev
2022-07-27 13:06     ` Stefano Garzarella
2022-07-28  5:28       ` Andrey Zhadchenko
2022-07-28 15:40         ` Stefano Garzarella
2022-10-04 18:13 ` Stefan Hajnoczi
2022-10-05  9:14   ` Andrey Zhadchenko
2022-10-05 15:18     ` Stefan Hajnoczi
2022-10-04 18:26 ` Stefan Hajnoczi
2022-10-05 10:28   ` Andrey Zhadchenko
2022-10-05 15:30     ` Stefan Hajnoczi
2022-10-04 19:00 ` Stefan Hajnoczi
2022-10-05 11:50   ` Andrey Zhadchenko
2022-10-05 15:40     ` Stefan Hajnoczi
