* [PATCH 0/1] vhost: add vhost_blk driver
@ 2018-11-02 18:21 Vitaly Mayatskikh
  2018-11-02 18:21 ` [PATCH 1/1] Add " Vitaly Mayatskikh
                   ` (3 more replies)
  0 siblings, 4 replies; 27+ messages in thread
From: Vitaly Mayatskikh @ 2018-11-02 18:21 UTC (permalink / raw)
  To: Michael S . Tsirkin
  Cc: Jason Wang, Paolo Bonzini, kvm, virtualization, linux-kernel,
	Vitaly Mayatskikh

vhost_blk is a host-side kernel mode accelerator for virtio-blk. The
driver allows a VM to reach near bare-metal disk performance. See the
IOPS numbers below (fio --rw=randread --bs=4k).

This implementation uses the kiocb interface. It is slightly slower than
going directly through bio, but it is simpler and also works with disk
images placed on a file system.
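
For illustration, a minimal userspace sketch (hypothetical helper, not part
of the patch; error handling omitted): open /dev/vhost-blk, size the vrings
with the standard vhost ioctls, then attach the backend fd with the new
VHOST_BLK_SET_BACKEND ioctl added below. The memory table and vring
address/kick/call setup required before attaching a backend is elided.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* same value as in the patch below */
#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)

/* hypothetical helper, not part of the patch */
static int vhost_blk_attach(int backend_fd, int num_queues, int queue_size)
{
	struct vhost_vring_state s;
	int i, vhost_fd = open("/dev/vhost-blk", O_RDWR);

	if (vhost_fd < 0)
		return -1;
	ioctl(vhost_fd, VHOST_SET_OWNER);
	for (i = 0; i < num_queues; i++) {
		s.index = i;
		s.num = queue_size;
		/* the driver derives num_queues from the highest vring index */
		ioctl(vhost_fd, VHOST_SET_VRING_NUM, &s);
	}
	/* ... VHOST_SET_MEM_TABLE, VHOST_SET_VRING_ADDR/KICK/CALL ... */
	ioctl(vhost_fd, VHOST_BLK_SET_BACKEND, &backend_fd);
	return vhost_fd;
}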

# fio num-jobs
# A: bare metal over block
# B: bare metal over file
# C: virtio-blk over block
# D: virtio-blk over file
# E: vhost-blk bio over block
# F: vhost-blk kiocb over block
# G: vhost-blk kiocb over file
#
#  A     B     C    D    E     F     G

1  171k  151k  148k 151k 195k  187k  175k
2  328k  302k  249k 241k 349k  334k  296k
3  479k  437k  179k 174k 501k  464k  404k
4  622k  568k  143k 183k 620k  580k  492k
5  755k  697k  136k 128k 737k  693k  579k
6  887k  808k  131k 120k 830k  782k  640k
7  1004k 926k  126k 131k 926k  863k  693k
8  1099k 1015k 117k 115k 1001k 931k  712k
9  1194k 1119k 115k 111k 1055k 991k  711k
10 1278k 1207k 109k 114k 1130k 1046k 695k
11 1345k 1280k 110k 108k 1119k 1091k 663k
12 1411k 1356k 104k 106k 1201k 1142k 629k
13 1466k 1423k 106k 106k 1260k 1170k 607k
14 1517k 1486k 103k 106k 1296k 1179k 589k
15 1552k 1543k 102k 102k 1322k 1191k 571k
16 1480k 1506k 101k 102k 1346k 1202k 566k

Vitaly Mayatskikh (1):
  Add vhost_blk driver

 drivers/vhost/Kconfig  |  13 ++
 drivers/vhost/Makefile |   3 +
 drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 526 insertions(+)
 create mode 100644 drivers/vhost/blk.c

-- 
2.17.1



* [PATCH 1/1] Add vhost_blk driver
  2018-11-02 18:21 [PATCH 0/1] vhost: add vhost_blk driver Vitaly Mayatskikh
@ 2018-11-02 18:21 ` Vitaly Mayatskikh
  2018-11-02 18:36   ` Michael S. Tsirkin
                     ` (2 more replies)
  2018-11-02 18:26 ` [PATCH 0/1] vhost: add " Michael S. Tsirkin
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 27+ messages in thread
From: Vitaly Mayatskikh @ 2018-11-02 18:21 UTC (permalink / raw)
  To: Michael S . Tsirkin
  Cc: Jason Wang, Paolo Bonzini, kvm, virtualization, linux-kernel,
	Vitaly Mayatskikh

This driver accelerates the host side of virtio-blk.

Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>
---
 drivers/vhost/Kconfig  |  13 ++
 drivers/vhost/Makefile |   3 +
 drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 526 insertions(+)
 create mode 100644 drivers/vhost/blk.c

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index b580885243f7..c4980d6af0ea 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -53,3 +53,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
 	  adds some overhead, it is disabled by default.
 
 	  If unsure, say "N".
+
+config VHOST_BLK
+	tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
+	depends on BLOCK && EVENTFD
+	select VHOST
+	default n
+	help
+	 This kernel module can be loaded in host kernel to accelerate
+	 guest block with virtio_blk. Not to be confused with virtio_blk
+	 module itself which needs to be loaded in guest kernel.
+
+	 To compile this driver as a module, choose M here: the module will
+	 be called vhost_blk.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 6c6df24f770c..c8be36cd9214 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -8,6 +8,9 @@ vhost_scsi-y := scsi.o
 obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
 vhost_vsock-y := vsock.o
 
+obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
+vhost_blk-y := blk.o
+
 obj-$(CONFIG_VHOST_RING) += vringh.o
 
 obj-$(CONFIG_VHOST)	+= vhost.o
diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
new file mode 100644
index 000000000000..aefb9a61fa0f
--- /dev/null
+++ b/drivers/vhost/blk.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 IBM Corporation
+ * Author: Vitaly Mayatskikh <v.mayatskih@gmail.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * virtio-blk server in host kernel.
+ */
+
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/virtio_blk.h>
+#include <linux/vhost.h>
+#include <linux/fs.h>
+#include "vhost.h"
+
+enum {
+	VHOST_BLK_FEATURES =
+	VHOST_FEATURES |
+	(1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+	(1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+	(1ULL << VIRTIO_RING_F_EVENT_IDX) |
+	(1ULL << VIRTIO_BLK_F_MQ)
+};
+
+#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)
+
+enum {
+	VHOST_BLK_VQ_MAX = 16,
+	VHOST_BLK_VQ_MAX_REQS = 128,
+};
+
+struct vhost_blk_req {
+	struct llist_node list;
+	int index;
+	struct vhost_blk_queue *q;
+	struct virtio_blk_outhdr hdr;
+	struct iovec *out_iov;
+	struct iovec *in_iov;
+	u8 out_num;
+	u8 in_num;
+	long len;
+	struct kiocb iocb;
+	struct iov_iter i;
+	int res;
+	void __user *status;
+};
+
+struct vhost_blk_queue {
+	int index;
+	struct vhost_blk *blk;
+	struct vhost_virtqueue vq;
+	struct vhost_work w;
+	struct llist_head wl;
+	struct vhost_blk_req req[VHOST_BLK_VQ_MAX_REQS];
+};
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct file *backend;
+	int num_queues;
+	struct vhost_virtqueue *vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_blk_queue queue[VHOST_BLK_VQ_MAX];
+};
+
+static void vhost_blk_flush(struct vhost_blk *blk)
+{
+	int i;
+
+	for (i = 0; i < blk->num_queues; i++)
+		vhost_poll_flush(&blk->queue[i].vq.poll);
+}
+
+
+static void vhost_blk_stop(struct vhost_blk *blk)
+{
+	struct vhost_virtqueue *vq;
+	int i;
+
+	for (i = 0; i < blk->num_queues; i++) {
+		vq = &blk->queue[i].vq;
+		mutex_lock(&vq->mutex);
+		rcu_assign_pointer(vq->private_data, NULL);
+		mutex_unlock(&vq->mutex);
+	}
+}
+
+static int vhost_blk_req_done(struct vhost_blk_req *req, unsigned char status)
+{
+	int ret;
+	int len = req->len;
+
+	pr_debug("%s vq[%d] req->index %d status %d len %d\n", __func__,
+		 req->q->index, req->index, status, len);
+	ret = put_user(status, (unsigned char __user *)req->status);
+
+	WARN(ret, "%s: vq[%d] req->index %d failed to write status\n", __func__,
+	     req->q->index, req->index);
+
+	vhost_add_used(&req->q->vq, req->index, len);
+
+	return ret;
+}
+
+static void vhost_blk_io_done_work(struct vhost_work *w)
+{
+	struct vhost_blk_queue *q = container_of(w, struct vhost_blk_queue, w);
+	struct llist_node *node;
+	struct vhost_blk_req *req, *tmp;
+
+	node = llist_del_all(&q->wl);
+	llist_for_each_entry_safe(req, tmp, node, list) {
+		vhost_blk_req_done(req, req->res);
+	}
+	vhost_signal(&q->blk->dev, &q->vq);
+}
+
+static void vhost_blk_iocb_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	struct vhost_blk_req *req = container_of(iocb, struct vhost_blk_req,
+						 iocb);
+
+	pr_debug("%s vq[%d] req->index %d ret %ld ret2 %ld\n", __func__,
+		 req->q->index, req->index, ret, ret2);
+
+	req->res = (ret == req->len) ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
+	llist_add(&req->list, &req->q->wl);
+	vhost_vq_work_queue(&req->q->vq, &req->q->w);
+}
+
+static int vhost_blk_req_handle(struct vhost_blk_req *req)
+{
+	struct vhost_blk *blk = req->q->blk;
+	struct vhost_virtqueue *vq = &req->q->vq;
+	int type = le32_to_cpu(req->hdr.type);
+	int ret;
+	u8 status;
+
+	if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
+		bool write = (type == VIRTIO_BLK_T_OUT);
+		int nr_seg = (write ? req->out_num : req->in_num) - 1;
+		unsigned long sector = le64_to_cpu(req->hdr.sector);
+		ssize_t len, rem_len;
+
+		if (!req->q->blk->backend) {
+			vq_err(vq, "blk %p no backend!\n", req->q->blk);
+			ret = -EINVAL;
+			goto out_err;
+		}
+
+		len = iov_length(&vq->iov[1], nr_seg);
+		pr_debug("%s: [pid:%d %s] %s sector %lld, len %ld\n",
+			 __func__, current->pid, current->comm,
+			 write ? "WRITE" : "READ", req->hdr.sector, len);
+
+		req->len = len;
+		rem_len = len;
+		iov_iter_init(&req->i, (write ? WRITE : READ),
+			      write ? &req->out_iov[0] : &req->in_iov[0],
+			      nr_seg, len);
+
+		req->iocb.ki_pos = sector << 9;
+		req->iocb.ki_filp = blk->backend;
+		req->iocb.ki_complete = vhost_blk_iocb_complete;
+		req->iocb.ki_flags = IOCB_DIRECT;
+
+		if (write)
+			ret = call_write_iter(blk->backend, &req->iocb,
+					      &req->i);
+		else
+			ret = call_read_iter(blk->backend, &req->iocb,
+					     &req->i);
+
+		if (ret != -EIOCBQUEUED)
+			vhost_blk_iocb_complete(&req->iocb, ret, 0);
+
+		ret = 0;
+		goto out;
+	}
+
+	if (type == VIRTIO_BLK_T_GET_ID) {
+		char s[] = "vhost_blk";
+		size_t len = min_t(size_t, req->in_iov[0].iov_len,
+				   strlen(s));
+
+		ret = copy_to_user(req->in_iov[0].iov_base, s, len);
+		status = ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+		if (put_user(status, (unsigned char __user *)req->status)) {
+			ret = -EFAULT;
+			goto out_err;
+		}
+		vhost_add_used_and_signal(&blk->dev, vq, req->index, 1);
+		ret = 0;
+		goto out;
+	} else {
+		pr_warn("Unsupported request type %d\n", type);
+		vhost_discard_vq_desc(vq, 1);
+		ret = -EINVAL;
+		return ret;
+	}
+out_err:
+	vhost_discard_vq_desc(vq, 1);
+out:
+	return ret;
+}
+
+static void vhost_blk_handle_guest_kick(struct vhost_work *work)
+{
+	struct vhost_virtqueue *vq;
+	struct vhost_blk_queue *q;
+	struct vhost_blk *blk;
+	struct vhost_blk_req *req;
+	int in, out;
+	int head;
+
+	vq = container_of(work, struct vhost_virtqueue, poll.work);
+	q = container_of(vq, struct vhost_blk_queue, vq);
+	blk = container_of(vq->dev, struct vhost_blk, dev);
+
+	vhost_disable_notify(&blk->dev, vq);
+	for (;;) {
+		in = out = -1;
+
+		head = vhost_get_vq_desc(vq, vq->iov,
+					 ARRAY_SIZE(vq->iov),
+					 &out, &in, NULL, NULL);
+
+		if (head < 0)
+			break;
+
+		if (head == vq->num) {
+			if (vhost_enable_notify(&blk->dev, vq)) {
+				vhost_disable_notify(&blk->dev, vq);
+				continue;
+			}
+			break;
+		}
+
+		req = &q->req[head];
+		req->index = head;
+		req->out_num = out;
+		req->in_num = in;
+		req->out_iov = &vq->iov[1];
+		req->in_iov = &vq->iov[out];
+		req->status = vq->iov[out + in - 1].iov_base;
+
+		if (copy_from_user(&req->hdr, vq->iov[0].iov_base,
+				   sizeof(req->hdr))) {
+			vq_err(vq, "Failed to get block header!\n");
+			vhost_discard_vq_desc(vq, 1);
+			continue;
+		}
+		if (vhost_blk_req_handle(req) < 0)
+			break;
+	}
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *file)
+{
+	struct vhost_blk *blk;
+	struct vhost_blk_queue *q;
+	int i, j;
+
+	blk = kvzalloc(sizeof(*blk), GFP_KERNEL);
+	if (!blk)
+		return -ENOMEM;
+
+	for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
+		q = &blk->queue[i];
+		q->index = i;
+		q->blk = blk;
+		q->vq.handle_kick = vhost_blk_handle_guest_kick;
+		vhost_work_init(&q->w, vhost_blk_io_done_work);
+		blk->vqs[i] = &q->vq;
+		for (j = 0; j < VHOST_BLK_VQ_MAX_REQS; j++) {
+			q->req[j].index = j;
+			q->req[j].q = q;
+		}
+	}
+	vhost_dev_init(&blk->dev, (struct vhost_virtqueue **)&blk->vqs,
+		       VHOST_BLK_VQ_MAX);
+	file->private_data = blk;
+
+	return 0;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *blk = f->private_data;
+
+	vhost_blk_stop(blk);
+	mutex_lock(&blk->dev.mutex);
+	vhost_blk_flush(blk);
+	vhost_dev_stop(&blk->dev);
+	vhost_dev_cleanup(&blk->dev);
+	vhost_blk_flush(blk);
+
+	if (blk->backend) {
+		fput(blk->backend);
+		blk->backend = NULL;
+	}
+
+	mutex_unlock(&blk->dev.mutex);
+	kvfree(blk);
+
+	return 0;
+}
+
+static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
+{
+	int i;
+	int ret = -EFAULT;
+
+	mutex_lock(&blk->dev.mutex);
+	if ((features & (1 << VHOST_F_LOG_ALL)) &&
+	    !vhost_log_access_ok(&blk->dev))
+		goto out_unlock;
+
+	if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
+		if (vhost_init_device_iotlb(&blk->dev, true))
+			goto out_unlock;
+	}
+
+	for (i = 0; i < VHOST_BLK_VQ_MAX; ++i) {
+		struct vhost_virtqueue *vq = blk->vqs[i];
+
+		mutex_lock(&vq->mutex);
+		vq->acked_features = features & VHOST_BLK_FEATURES;
+		mutex_unlock(&vq->mutex);
+	}
+	ret = 0;
+out_unlock:
+	mutex_unlock(&blk->dev.mutex);
+
+	return ret;
+}
+
+static long vhost_blk_reset_owner(struct vhost_blk *blk)
+{
+	long err;
+	struct vhost_umem *umem;
+
+	mutex_lock(&blk->dev.mutex);
+	err = vhost_dev_check_owner(&blk->dev);
+	if (err)
+		goto done;
+	umem = vhost_dev_reset_owner_prepare();
+	if (!umem) {
+		err = -ENOMEM;
+		goto done;
+	}
+	vhost_blk_stop(blk);
+	vhost_blk_flush(blk);
+	vhost_dev_reset_owner(&blk->dev, umem);
+done:
+	mutex_unlock(&blk->dev.mutex);
+	return err;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *blk, int fd)
+{
+	struct file *backend;
+	int ret, i;
+	struct vhost_virtqueue *vq;
+
+	mutex_lock(&blk->dev.mutex);
+	ret = vhost_dev_check_owner(&blk->dev);
+	if (ret)
+		goto out_dev;
+
+	backend = fget(fd);
+	if (IS_ERR(backend)) {
+		ret = PTR_ERR(backend);
+		goto out_dev;
+	}
+
+	if (backend == blk->backend) {
+		ret = 0;
+		goto out_file;
+	}
+
+	if (blk->backend)
+		fput(blk->backend);
+	blk->backend = backend;
+	for (i = 0; i < blk->num_queues; i++) {
+		vq = &blk->queue[i].vq;
+		if (!vhost_vq_access_ok(vq)) {
+			ret = -EFAULT;
+			goto out_file;
+		}
+		mutex_lock(&vq->mutex);
+		rcu_assign_pointer(vq->private_data, backend);
+		ret = vhost_vq_init_access(vq);
+		mutex_unlock(&vq->mutex);
+		if (ret) {
+			pr_err("vhost_vq_init_access failed: %d\n", ret);
+			goto out_file;
+		}
+
+	}
+	ret = 0;
+	goto out_dev;
+out_file:
+	fput(backend);
+	blk->backend = NULL;
+out_dev:
+	mutex_unlock(&blk->dev.mutex);
+	vhost_blk_flush(blk);
+	return ret;
+}
+
+static long vhost_blk_pass_ioctl(struct vhost_blk *blk, unsigned int ioctl,
+				 void __user *argp)
+{
+	long ret;
+
+	mutex_lock(&blk->dev.mutex);
+	ret = vhost_dev_ioctl(&blk->dev, ioctl, argp);
+	if (ret == -ENOIOCTLCMD)
+		ret = vhost_vring_ioctl(&blk->dev, ioctl, argp);
+	else
+		vhost_blk_flush(blk);
+	mutex_unlock(&blk->dev.mutex);
+	return ret;
+}
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+			    unsigned long arg)
+{
+	struct vhost_blk *blk = f->private_data;
+	void __user *argp = (void __user *)arg;
+	int fd;
+	u64 __user *featurep = argp;
+	u64 features;
+	long ret;
+	struct vhost_vring_state s;
+
+	switch (ioctl) {
+	case VHOST_SET_MEM_TABLE:
+		vhost_blk_stop(blk);
+		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
+		break;
+	case VHOST_SET_VRING_NUM:
+		if (copy_from_user(&s, argp, sizeof(s)))
+			return -EFAULT;
+		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
+		if (!ret)
+			blk->num_queues = s.index + 1;
+		break;
+	case VHOST_BLK_SET_BACKEND:
+		if (copy_from_user(&fd, argp, sizeof(fd)))
+			return -EFAULT;
+		ret = vhost_blk_set_backend(blk, fd);
+		break;
+	case VHOST_GET_FEATURES:
+		features = VHOST_BLK_FEATURES;
+		if (copy_to_user(featurep, &features, sizeof(features)))
+			return -EFAULT;
+		ret = 0;
+		break;
+	case VHOST_SET_FEATURES:
+		if (copy_from_user(&features, featurep, sizeof(features)))
+			return -EFAULT;
+		if (features & ~VHOST_BLK_FEATURES)
+			return -EOPNOTSUPP;
+		ret = vhost_blk_set_features(blk, features);
+		break;
+	case VHOST_RESET_OWNER:
+		ret = vhost_blk_reset_owner(blk);
+		break;
+	default:
+		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
+		break;
+	}
+	return ret;
+}
+
+static const struct file_operations vhost_blk_fops = {
+	.owner          = THIS_MODULE,
+	.open           = vhost_blk_open,
+	.release        = vhost_blk_release,
+	.llseek		= noop_llseek,
+	.unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+	MISC_DYNAMIC_MINOR,
+	"vhost-blk",
+	&vhost_blk_fops,
+};
+
+static int vhost_blk_init(void)
+{
+	return misc_register(&vhost_blk_misc);
+}
+module_init(vhost_blk_init);
+
+static void vhost_blk_exit(void)
+{
+	misc_deregister(&vhost_blk_misc);
+}
+
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("1.0");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Vitaly Mayatskikh");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");
+MODULE_ALIAS("devname:vhost-blk");
-- 
2.17.1



* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-02 18:21 [PATCH 0/1] vhost: add vhost_blk driver Vitaly Mayatskikh
  2018-11-02 18:21 ` [PATCH 1/1] Add " Vitaly Mayatskikh
@ 2018-11-02 18:26 ` Michael S. Tsirkin
  2018-11-02 18:31   ` Vitaly Mayatskih
                     ` (2 more replies)
  2018-11-04 11:57 ` Maxim Levitsky
  2018-11-05  3:00 ` Jason Wang
  3 siblings, 3 replies; 27+ messages in thread
From: Michael S. Tsirkin @ 2018-11-02 18:26 UTC (permalink / raw)
  To: Vitaly Mayatskikh
  Cc: Jason Wang, Paolo Bonzini, kvm, virtualization, linux-kernel

On Fri, Nov 02, 2018 at 06:21:22PM +0000, Vitaly Mayatskikh wrote:
> vhost_blk is a host-side kernel mode accelerator for virtio-blk. The
> driver allows VM to reach a near bare-metal disk performance. See IOPS
> numbers below (fio --rw=randread --bs=4k).
> 
> This implementation uses kiocb interface. It is slightly slower than
> going directly through bio, but is simpler and also works with disk
> images placed on a file system.
> 
> # fio num-jobs
> # A: bare metal over block
> # B: bare metal over file
> # C: virtio-blk over block
> # D: virtio-blk over file
> # E: vhost-blk bio over block
> # F: vhost-blk kiocb over block
> # G: vhost-blk kiocb over file
> #
> #  A     B     C    D    E     F     G
> 
> 1  171k  151k  148k 151k 195k  187k  175k
> 2  328k  302k  249k 241k 349k  334k  296k
> 3  479k  437k  179k 174k 501k  464k  404k
> 4  622k  568k  143k 183k 620k  580k  492k
> 5  755k  697k  136k 128k 737k  693k  579k
> 6  887k  808k  131k 120k 830k  782k  640k
> 7  1004k 926k  126k 131k 926k  863k  693k
> 8  1099k 1015k 117k 115k 1001k 931k  712k
> 9  1194k 1119k 115k 111k 1055k 991k  711k
> 10 1278k 1207k 109k 114k 1130k 1046k 695k
> 11 1345k 1280k 110k 108k 1119k 1091k 663k
> 12 1411k 1356k 104k 106k 1201k 1142k 629k
> 13 1466k 1423k 106k 106k 1260k 1170k 607k
> 14 1517k 1486k 103k 106k 1296k 1179k 589k
> 15 1552k 1543k 102k 102k 1322k 1191k 571k
> 16 1480k 1506k 101k 102k 1346k 1202k 566k
> 
> Vitaly Mayatskikh (1):
>   Add vhost_blk driver


Thanks!
Before merging this, I'd like to get some acks from userspace that it's
actually going to be used - e.g. QEMU block maintainers.

>  drivers/vhost/Kconfig  |  13 ++
>  drivers/vhost/Makefile |   3 +
>  drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 526 insertions(+)
>  create mode 100644 drivers/vhost/blk.c
> 
> -- 
> 2.17.1


* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-02 18:26 ` [PATCH 0/1] vhost: add " Michael S. Tsirkin
@ 2018-11-02 18:31   ` Vitaly Mayatskih
  2018-11-05 14:02   ` Christian Borntraeger
  2018-11-06 15:40   ` Stefan Hajnoczi
  2 siblings, 0 replies; 27+ messages in thread
From: Vitaly Mayatskih @ 2018-11-02 18:31 UTC (permalink / raw)
  To: Michael S . Tsirkin
  Cc: Jason Wang, Paolo Bonzini, kvm, virtualization, linux-kernel

Sure! It needs a new QEMU driver (vhost-blk-pci); I'm preparing it to
be sent out.
On Fri, Nov 2, 2018 at 2:26 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Nov 02, 2018 at 06:21:22PM +0000, Vitaly Mayatskikh wrote:
> > vhost_blk is a host-side kernel mode accelerator for virtio-blk. The
> > driver allows VM to reach a near bare-metal disk performance. See IOPS
> > numbers below (fio --rw=randread --bs=4k).
> >
> > This implementation uses kiocb interface. It is slightly slower than
> > going directly through bio, but is simpler and also works with disk
> > images placed on a file system.
> >
> > # fio num-jobs
> > # A: bare metal over block
> > # B: bare metal over file
> > # C: virtio-blk over block
> > # D: virtio-blk over file
> > # E: vhost-blk bio over block
> > # F: vhost-blk kiocb over block
> > # G: vhost-blk kiocb over file
> > #
> > #  A     B     C    D    E     F     G
> >
> > 1  171k  151k  148k 151k 195k  187k  175k
> > 2  328k  302k  249k 241k 349k  334k  296k
> > 3  479k  437k  179k 174k 501k  464k  404k
> > 4  622k  568k  143k 183k 620k  580k  492k
> > 5  755k  697k  136k 128k 737k  693k  579k
> > 6  887k  808k  131k 120k 830k  782k  640k
> > 7  1004k 926k  126k 131k 926k  863k  693k
> > 8  1099k 1015k 117k 115k 1001k 931k  712k
> > 9  1194k 1119k 115k 111k 1055k 991k  711k
> > 10 1278k 1207k 109k 114k 1130k 1046k 695k
> > 11 1345k 1280k 110k 108k 1119k 1091k 663k
> > 12 1411k 1356k 104k 106k 1201k 1142k 629k
> > 13 1466k 1423k 106k 106k 1260k 1170k 607k
> > 14 1517k 1486k 103k 106k 1296k 1179k 589k
> > 15 1552k 1543k 102k 102k 1322k 1191k 571k
> > 16 1480k 1506k 101k 102k 1346k 1202k 566k
> >
> > Vitaly Mayatskikh (1):
> >   Add vhost_blk driver
>
>
> Thanks!
> Before merging this, I'd like to get some acks from userspace that it's
> actually going to be used - e.g. QEMU block maintainers.
>
> >  drivers/vhost/Kconfig  |  13 ++
> >  drivers/vhost/Makefile |   3 +
> >  drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 526 insertions(+)
> >  create mode 100644 drivers/vhost/blk.c
> >
> > --
> > 2.17.1



-- 
wbr, Vitaly


* Re: [PATCH 1/1] Add vhost_blk driver
  2018-11-02 18:21 ` [PATCH 1/1] Add " Vitaly Mayatskikh
@ 2018-11-02 18:36   ` Michael S. Tsirkin
  2018-11-02 19:24     ` Vitaly Mayatskih
  2018-11-03  2:50   ` kbuild test robot
  2018-11-06 16:03   ` Stefan Hajnoczi
  2 siblings, 1 reply; 27+ messages in thread
From: Michael S. Tsirkin @ 2018-11-02 18:36 UTC (permalink / raw)
  To: Vitaly Mayatskikh
  Cc: Jason Wang, Paolo Bonzini, kvm, virtualization, linux-kernel

On Fri, Nov 02, 2018 at 06:21:23PM +0000, Vitaly Mayatskikh wrote:
> This driver accelerates host side of virtio-blk.
> 
> Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>
> ---
>  drivers/vhost/Kconfig  |  13 ++
>  drivers/vhost/Makefile |   3 +
>  drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 526 insertions(+)
>  create mode 100644 drivers/vhost/blk.c
> 
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index b580885243f7..c4980d6af0ea 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -53,3 +53,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
>  	  adds some overhead, it is disabled by default.
>  
>  	  If unsure, say "N".
> +
> +config VHOST_BLK
> +	tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
> +	depends on BLOCK && EVENTFD
> +	select VHOST
> +	default n
> +	help
> +	 This kernel module can be loaded in host kernel to accelerate
> +	 guest block with virtio_blk. Not to be confused with virtio_blk
> +	 module itself which needs to be loaded in guest kernel.
> +
> +	 To compile this driver as a module, choose M here: the module will
> +	 be called vhost_blk.
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index 6c6df24f770c..c8be36cd9214 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -8,6 +8,9 @@ vhost_scsi-y := scsi.o
>  obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
>  vhost_vsock-y := vsock.o
>  
> +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
> +vhost_blk-y := blk.o
> +
>  obj-$(CONFIG_VHOST_RING) += vringh.o
>  
>  obj-$(CONFIG_VHOST)	+= vhost.o
> diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
> new file mode 100644
> index 000000000000..aefb9a61fa0f
> --- /dev/null
> +++ b/drivers/vhost/blk.c
> @@ -0,0 +1,510 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2018 IBM Corporation
> + * Author: Vitaly Mayatskikh <v.mayatskih@gmail.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + *
> + * virtio-blk server in host kernel.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/miscdevice.h>
> +#include <linux/virtio_blk.h>
> +#include <linux/vhost.h>
> +#include <linux/fs.h>
> +#include "vhost.h"
> +
> +enum {
> +	VHOST_BLK_FEATURES =
> +	VHOST_FEATURES |
> +	(1ULL << VIRTIO_F_IOMMU_PLATFORM) |
> +	(1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
> +	(1ULL << VIRTIO_RING_F_EVENT_IDX) |
> +	(1ULL << VIRTIO_BLK_F_MQ)
> +};
> +
> +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)
> +
> +enum {
> +	VHOST_BLK_VQ_MAX = 16,
> +	VHOST_BLK_VQ_MAX_REQS = 128,
> +};
> +
> +struct vhost_blk_req {
> +	struct llist_node list;
> +	int index;
> +	struct vhost_blk_queue *q;
> +	struct virtio_blk_outhdr hdr;
> +	struct iovec *out_iov;
> +	struct iovec *in_iov;
> +	u8 out_num;
> +	u8 in_num;
> +	long len;
> +	struct kiocb iocb;
> +	struct iov_iter i;
> +	int res;
> +	void __user *status;
> +};
> +
> +struct vhost_blk_queue {
> +	int index;
> +	struct vhost_blk *blk;
> +	struct vhost_virtqueue vq;
> +	struct vhost_work w;
> +	struct llist_head wl;
> +	struct vhost_blk_req req[VHOST_BLK_VQ_MAX_REQS];
> +};
> +
> +struct vhost_blk {
> +	struct vhost_dev dev;
> +	struct file *backend;
> +	int num_queues;
> +	struct vhost_virtqueue *vqs[VHOST_BLK_VQ_MAX];
> +	struct vhost_blk_queue queue[VHOST_BLK_VQ_MAX];
> +};
> +
> +static void vhost_blk_flush(struct vhost_blk *blk)
> +{
> +	int i;
> +
> +	for (i = 0; i < blk->num_queues; i++)
> +		vhost_poll_flush(&blk->queue[i].vq.poll);
> +}
> +
> +
> +static void vhost_blk_stop(struct vhost_blk *blk)
> +{
> +	struct vhost_virtqueue *vq;
> +	int i;
> +
> +	for (i = 0; i < blk->num_queues; i++) {
> +		vq = &blk->queue[i].vq;
> +		mutex_lock(&vq->mutex);
> +		rcu_assign_pointer(vq->private_data, NULL);
> +		mutex_unlock(&vq->mutex);
> +	}
> +}
> +
> +static int vhost_blk_req_done(struct vhost_blk_req *req, unsigned char status)
> +{
> +	int ret;
> +	int len = req->len;
> +
> +	pr_debug("%s vq[%d] req->index %d status %d len %d\n", __func__,
> +		 req->q->index, req->index, status, len);
> +	ret = put_user(status, (unsigned char __user *)req->status);

I'd make it u8 and not unsigned char. Also why don't you change
req->status type so you don't need a cast?
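
E.g. (sketch): declare the field as "u8 __user *status" in struct
vhost_blk_req and pass the status as u8, then the call is simply

	ret = put_user(status, req->status);	/* no cast needed */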

> +
> +	WARN(ret, "%s: vq[%d] req->index %d failed to write status\n", __func__,
> +	     req->q->index, req->index);

Kernel warnings and debug messages that are guest-triggerable lead to
disk-full errors on the host. This applies elsewhere too; you want trace
events instead.
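
E.g. a dedicated trace event could look roughly like this (sketch; the file
and event names are made up, the usual trace-header plumbing shown for
completeness):

/* drivers/vhost/vhost_blk_trace.h -- hypothetical file name */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vhost_blk

#if !defined(_VHOST_BLK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _VHOST_BLK_TRACE_H

#include <linux/tracepoint.h>

TRACE_EVENT(vhost_blk_status_fault,
	TP_PROTO(int qindex, int ridx),
	TP_ARGS(qindex, ridx),
	TP_STRUCT__entry(
		__field(int, qindex)
		__field(int, ridx)
	),
	TP_fast_assign(
		__entry->qindex = qindex;
		__entry->ridx = ridx;
	),
	TP_printk("vq[%d] req %d: failed to write status",
		  __entry->qindex, __entry->ridx)
);

#endif /* _VHOST_BLK_TRACE_H */

/* must stay outside the include guard */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE vhost_blk_trace
#include <trace/define_trace.h>

blk.c would then #define CREATE_TRACE_POINTS before including that header
and call trace_vhost_blk_status_fault(req->q->index, req->index) where the
WARN is today.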

> +
> +	vhost_add_used(&req->q->vq, req->index, len);


This can fail too.

> +
> +	return ret;
> +}
> +
> +static void vhost_blk_io_done_work(struct vhost_work *w)
> +{
> +	struct vhost_blk_queue *q = container_of(w, struct vhost_blk_queue, w);
> +	struct llist_node *node;
> +	struct vhost_blk_req *req, *tmp;
> +
> +	node = llist_del_all(&q->wl);
> +	llist_for_each_entry_safe(req, tmp, node, list) {
> +		vhost_blk_req_done(req, req->res);
> +	}
> +	vhost_signal(&q->blk->dev, &q->vq);
> +}
> +
> +static void vhost_blk_iocb_complete(struct kiocb *iocb, long ret, long ret2)
> +{
> +	struct vhost_blk_req *req = container_of(iocb, struct vhost_blk_req,
> +						 iocb);
> +
> +	pr_debug("%s vq[%d] req->index %d ret %ld ret2 %ld\n", __func__,
> +		 req->q->index, req->index, ret, ret2);
> +
> +	req->res = (ret == req->len) ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
> +	llist_add(&req->list, &req->q->wl);
> +	vhost_vq_work_queue(&req->q->vq, &req->q->w);
> +}
> +
> +static int vhost_blk_req_handle(struct vhost_blk_req *req)
> +{
> +	struct vhost_blk *blk = req->q->blk;
> +	struct vhost_virtqueue *vq = &req->q->vq;
> +	int type = le32_to_cpu(req->hdr.type);
> +	int ret;
> +	u8 status;
> +
> +	if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
> +		bool write = (type == VIRTIO_BLK_T_OUT);
> +		int nr_seg = (write ? req->out_num : req->in_num) - 1;
> +		unsigned long sector = le64_to_cpu(req->hdr.sector);
> +		ssize_t len, rem_len;
> +
> +		if (!req->q->blk->backend) {
> +			vq_err(vq, "blk %p no backend!\n", req->q->blk);
> +			ret = -EINVAL;
> +			goto out_err;
> +		}
> +
> +		len = iov_length(&vq->iov[1], nr_seg);
> +		pr_debug("%s: [pid:%d %s] %s sector %lld, len %ld\n",
> +			 __func__, current->pid, current->comm,
> +			 write ? "WRITE" : "READ", req->hdr.sector, len);
> +
> +		req->len = len;
> +		rem_len = len;
> +		iov_iter_init(&req->i, (write ? WRITE : READ),
> +			      write ? &req->out_iov[0] : &req->in_iov[0],
> +			      nr_seg, len);
> +
> +		req->iocb.ki_pos = sector << 9;
> +		req->iocb.ki_filp = blk->backend;
> +		req->iocb.ki_complete = vhost_blk_iocb_complete;
> +		req->iocb.ki_flags = IOCB_DIRECT;
> +
> +		if (write)
> +			ret = call_write_iter(blk->backend, &req->iocb,
> +					      &req->i);
> +		else
> +			ret = call_read_iter(blk->backend, &req->iocb,
> +					     &req->i);
> +
> +		if (ret != -EIOCBQUEUED)
> +			vhost_blk_iocb_complete(&req->iocb, ret, 0);
> +
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	if (type == VIRTIO_BLK_T_GET_ID) {
> +		char s[] = "vhost_blk";

Isn't this supposed to return the serial #?

> +		size_t len = min_t(size_t, req->in_iov[0].iov_len,
> +				   strlen(s));
> +
> +		ret = copy_to_user(req->in_iov[0].iov_base, s, len);

I don't think we should assume there's no scatter list here.
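
E.g. (sketch, untested) the ID could be filled through an iov_iter, the same
way the data path already does; the last in iov is the status byte, so the
ID buffer spans in_iov[0]..in_iov[in_num - 2]:

		struct iov_iter iter;
		size_t idlen = min_t(size_t, strlen(s) + 1,
				     VIRTIO_BLK_ID_BYTES);

		iov_iter_init(&iter, READ, req->in_iov, req->in_num - 1,
			      idlen);
		status = (copy_to_iter(s, idlen, &iter) == idlen) ?
			 VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;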

> +		status = ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> +		if (put_user(status, (unsigned char __user *)req->status)) {
> +			ret = -EFAULT;
> +			goto out_err;
> +		}
> +		vhost_add_used_and_signal(&blk->dev, vq, req->index, 1);
> +		ret = 0;
> +		goto out;
> +	} else {
> +		pr_warn("Unsupported request type %d\n", type);
> +		vhost_discard_vq_desc(vq, 1);
> +		ret = -EINVAL;
> +		return ret;
> +	}
> +out_err:
> +	vhost_discard_vq_desc(vq, 1);
> +out:
> +	return ret;
> +}
> +
> +static void vhost_blk_handle_guest_kick(struct vhost_work *work)
> +{
> +	struct vhost_virtqueue *vq;
> +	struct vhost_blk_queue *q;
> +	struct vhost_blk *blk;
> +	struct vhost_blk_req *req;
> +	int in, out;
> +	int head;
> +
> +	vq = container_of(work, struct vhost_virtqueue, poll.work);
> +	q = container_of(vq, struct vhost_blk_queue, vq);
> +	blk = container_of(vq->dev, struct vhost_blk, dev);
> +
> +	vhost_disable_notify(&blk->dev, vq);
> +	for (;;) {
> +		in = out = -1;
> +
> +		head = vhost_get_vq_desc(vq, vq->iov,
> +					 ARRAY_SIZE(vq->iov),
> +					 &out, &in, NULL, NULL);
> +
> +		if (head < 0)
> +			break;
> +
> +		if (head == vq->num) {
> +			if (vhost_enable_notify(&blk->dev, vq)) {
> +				vhost_disable_notify(&blk->dev, vq);
> +				continue;
> +			}
> +			break;
> +		}
> +
> +		req = &q->req[head];
> +		req->index = head;
> +		req->out_num = out;
> +		req->in_num = in;
> +		req->out_iov = &vq->iov[1];
> +		req->in_iov = &vq->iov[out];
> +		req->status = vq->iov[out + in - 1].iov_base;

Shouldn't we validate that there's actually an 'in' descriptor?
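
E.g. something like this (sketch) right after vhost_get_vq_desc(), before
the iovecs are dereferenced:

		if (out < 1 || in < 1) {
			vq_err(vq, "Unexpected descriptor layout: out %d in %d\n",
			       out, in);
			vhost_discard_vq_desc(vq, 1);
			continue;
		}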

> +
> +		if (copy_from_user(&req->hdr, vq->iov[0].iov_base,
> +				   sizeof(req->hdr))) {
> +			vq_err(vq, "Failed to get block header!\n");
> +			vhost_discard_vq_desc(vq, 1);
> +			continue;
> +		}

It's better to avoid assuming that the header is in a single iov entry;
use an iterator.
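
Roughly (sketch, untested):

		struct iov_iter iter;

		iov_iter_init(&iter, WRITE, vq->iov, out, sizeof(req->hdr));
		if (copy_from_iter(&req->hdr, sizeof(req->hdr), &iter) !=
		    sizeof(req->hdr)) {
			vq_err(vq, "Failed to get block header!\n");
			vhost_discard_vq_desc(vq, 1);
			continue;
		}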

> +		if (vhost_blk_req_handle(req) < 0)
> +			break;
> +	}
> +}
> +
> +static int vhost_blk_open(struct inode *inode, struct file *file)
> +{
> +	struct vhost_blk *blk;
> +	struct vhost_blk_queue *q;
> +	int i, j;
> +
> +	blk = kvzalloc(sizeof(*blk), GFP_KERNEL);
> +	if (!blk)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
> +		q = &blk->queue[i];
> +		q->index = i;
> +		q->blk = blk;
> +		q->vq.handle_kick = vhost_blk_handle_guest_kick;
> +		vhost_work_init(&q->w, vhost_blk_io_done_work);
> +		blk->vqs[i] = &q->vq;
> +		for (j = 0; j < VHOST_BLK_VQ_MAX_REQS; j++) {
> +			q->req[j].index = j;
> +			q->req[j].q = q;
> +		}
> +	}
> +	vhost_dev_init(&blk->dev, (struct vhost_virtqueue **)&blk->vqs,
> +		       VHOST_BLK_VQ_MAX);
> +	file->private_data = blk;
> +
> +	return 0;
> +}
> +
> +static int vhost_blk_release(struct inode *inode, struct file *f)
> +{
> +	struct vhost_blk *blk = f->private_data;
> +
> +	vhost_blk_stop(blk);
> +	mutex_lock(&blk->dev.mutex);
> +	vhost_blk_flush(blk);
> +	vhost_dev_stop(&blk->dev);
> +	vhost_dev_cleanup(&blk->dev);
> +	vhost_blk_flush(blk);
> +
> +	if (blk->backend) {
> +		fput(blk->backend);
> +		blk->backend = NULL;
> +	}
> +
> +	mutex_unlock(&blk->dev.mutex);
> +	kvfree(blk);
> +
> +	return 0;
> +}
> +
> +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
> +{
> +	int i;
> +	int ret = -EFAULT;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	if ((features & (1 << VHOST_F_LOG_ALL)) &&
> +	    !vhost_log_access_ok(&blk->dev))
> +		goto out_unlock;
> +
> +	if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
> +		if (vhost_init_device_iotlb(&blk->dev, true))
> +			goto out_unlock;
> +	}
> +
> +	for (i = 0; i < VHOST_BLK_VQ_MAX; ++i) {
> +		struct vhost_virtqueue *vq = blk->vqs[i];
> +
> +		mutex_lock(&vq->mutex);
> +		vq->acked_features = features & VHOST_BLK_FEATURES;
> +		mutex_unlock(&vq->mutex);
> +	}
> +	ret = 0;
> +out_unlock:
> +	mutex_unlock(&blk->dev.mutex);
> +
> +	return ret;
> +}
> +
> +static long vhost_blk_reset_owner(struct vhost_blk *blk)
> +{
> +	long err;
> +	struct vhost_umem *umem;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	err = vhost_dev_check_owner(&blk->dev);
> +	if (err)
> +		goto done;
> +	umem = vhost_dev_reset_owner_prepare();
> +	if (!umem) {
> +		err = -ENOMEM;
> +		goto done;
> +	}
> +	vhost_blk_stop(blk);
> +	vhost_blk_flush(blk);
> +	vhost_dev_reset_owner(&blk->dev, umem);
> +done:
> +	mutex_unlock(&blk->dev.mutex);
> +	return err;
> +}
> +
> +static long vhost_blk_set_backend(struct vhost_blk *blk, int fd)
> +{
> +	struct file *backend;
> +	int ret, i;
> +	struct vhost_virtqueue *vq;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	ret = vhost_dev_check_owner(&blk->dev);
> +	if (ret)
> +		goto out_dev;
> +
> +	backend = fget(fd);
> +	if (IS_ERR(backend)) {
> +		ret = PTR_ERR(backend);
> +		goto out_dev;
> +	}
> +
> +	if (backend == blk->backend) {
> +		ret = 0;
> +		goto out_file;
> +	}
> +
> +	if (blk->backend)
> +		fput(blk->backend);
> +	blk->backend = backend;
> +	for (i = 0; i < blk->num_queues; i++) {
> +		vq = &blk->queue[i].vq;
> +		if (!vhost_vq_access_ok(vq)) {
> +			ret = -EFAULT;
> +			goto out_file;
> +		}
> +		mutex_lock(&vq->mutex);
> +		rcu_assign_pointer(vq->private_data, backend);
> +		ret = vhost_vq_init_access(vq);
> +		mutex_unlock(&vq->mutex);
> +		if (ret) {
> +			pr_err("vhost_vq_init_access failed: %d\n", ret);
> +			goto out_file;
> +		}
> +
> +	}
> +	ret = 0;
> +	goto out_dev;
> +out_file:
> +	fput(backend);
> +	blk->backend = NULL;
> +out_dev:
> +	mutex_unlock(&blk->dev.mutex);
> +	vhost_blk_flush(blk);
> +	return ret;
> +}
> +
> +static long vhost_blk_pass_ioctl(struct vhost_blk *blk, unsigned int ioctl,
> +				 void __user *argp)
> +{
> +	long ret;
> +
> +	mutex_lock(&blk->dev.mutex);
> +	ret = vhost_dev_ioctl(&blk->dev, ioctl, argp);
> +	if (ret == -ENOIOCTLCMD)
> +		ret = vhost_vring_ioctl(&blk->dev, ioctl, argp);
> +	else
> +		vhost_blk_flush(blk);
> +	mutex_unlock(&blk->dev.mutex);
> +	return ret;
> +}
> +
> +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
> +			    unsigned long arg)
> +{
> +	struct vhost_blk *blk = f->private_data;
> +	void __user *argp = (void __user *)arg;
> +	int fd;
> +	u64 __user *featurep = argp;
> +	u64 features;
> +	long ret;
> +	struct vhost_vring_state s;
> +
> +	switch (ioctl) {
> +	case VHOST_SET_MEM_TABLE:
> +		vhost_blk_stop(blk);
> +		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +		break;
> +	case VHOST_SET_VRING_NUM:
> +		if (copy_from_user(&s, argp, sizeof(s)))
> +			return -EFAULT;
> +		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +		if (!ret)
> +			blk->num_queues = s.index + 1;
> +		break;
> +	case VHOST_BLK_SET_BACKEND:
> +		if (copy_from_user(&fd, argp, sizeof(fd)))
> +			return -EFAULT;
> +		ret = vhost_blk_set_backend(blk, fd);
> +		break;
> +	case VHOST_GET_FEATURES:
> +		features = VHOST_BLK_FEATURES;
> +		if (copy_to_user(featurep, &features, sizeof(features)))
> +			return -EFAULT;
> +		ret = 0;
> +		break;
> +	case VHOST_SET_FEATURES:
> +		if (copy_from_user(&features, featurep, sizeof(features)))
> +			return -EFAULT;
> +		if (features & ~VHOST_BLK_FEATURES)
> +			return -EOPNOTSUPP;
> +		ret = vhost_blk_set_features(blk, features);
> +		break;
> +	case VHOST_RESET_OWNER:
> +		ret = vhost_blk_reset_owner(blk);
> +		break;
> +	default:
> +		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +		break;
> +	}
> +	return ret;
> +}
> +
> +static const struct file_operations vhost_blk_fops = {
> +	.owner          = THIS_MODULE,
> +	.open           = vhost_blk_open,
> +	.release        = vhost_blk_release,
> +	.llseek		= noop_llseek,
> +	.unlocked_ioctl = vhost_blk_ioctl,
> +};
> +
> +static struct miscdevice vhost_blk_misc = {
> +	MISC_DYNAMIC_MINOR,
> +	"vhost-blk",
> +	&vhost_blk_fops,
> +};
> +
> +static int vhost_blk_init(void)
> +{
> +	return misc_register(&vhost_blk_misc);
> +}
> +module_init(vhost_blk_init);
> +
> +static void vhost_blk_exit(void)
> +{
> +	misc_deregister(&vhost_blk_misc);
> +}
> +
> +module_exit(vhost_blk_exit);
> +
> +MODULE_VERSION("1.0");
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Vitaly Mayatskikh");
> +MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");
> +MODULE_ALIAS("devname:vhost-blk");
> -- 
> 2.17.1


* Re: [PATCH 1/1] Add vhost_blk driver
  2018-11-02 18:36   ` Michael S. Tsirkin
@ 2018-11-02 19:24     ` Vitaly Mayatskih
  2018-11-02 20:38       ` Michael S. Tsirkin
  0 siblings, 1 reply; 27+ messages in thread
From: Vitaly Mayatskih @ 2018-11-02 19:24 UTC (permalink / raw)
  To: Michael S . Tsirkin
  Cc: Jason Wang, Paolo Bonzini, kvm, virtualization, linux-kernel

On Fri, Nov 2, 2018 at 2:36 PM Michael S. Tsirkin <mst@redhat.com> wrote:

> > +     if (type == VIRTIO_BLK_T_GET_ID) {
> > +             char s[] = "vhost_blk";
>
> Isn't this supposed to return the serial #?

Yes, that gets a bit tricky here... The disk serial number is specified in
the QEMU command line parameters, so it needs to be passed to vhost_blk
when QEMU attaches the disk backend. That can be done (now? in a
following incremental patch?).
Also, other vhost work does the same with GET_ID, if that matters.

I'll fix the rest, thanks for review!

-- 
wbr, Vitaly


* Re: [PATCH 1/1] Add vhost_blk driver
  2018-11-02 19:24     ` Vitaly Mayatskih
@ 2018-11-02 20:38       ` Michael S. Tsirkin
  0 siblings, 0 replies; 27+ messages in thread
From: Michael S. Tsirkin @ 2018-11-02 20:38 UTC (permalink / raw)
  To: Vitaly Mayatskih
  Cc: Jason Wang, Paolo Bonzini, kvm, virtualization, linux-kernel

On Fri, Nov 02, 2018 at 03:24:13PM -0400, Vitaly Mayatskih wrote:
> On Fri, Nov 2, 2018 at 2:36 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> 
> > > +     if (type == VIRTIO_BLK_T_GET_ID) {
> > > +             char s[] = "vhost_blk";
> >
> > Isn't this supposed to return the serial #?
> 
> Yes, that gets a bit tricky here... Disk serial number is specified in
> QEMU command line parameters, so it needs to be passed to vhost_blk
> when qemu attaches the disk backend. That can be done (now? in a
> following incremental patch?).

I'd add an ioctl now.
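
E.g. (hypothetical sketch, names made up; assumes a new
"char serial[VIRTIO_BLK_ID_BYTES]" field in struct vhost_blk):

#define VHOST_BLK_SET_SERIAL _IOW(VHOST_VIRTIO, 0x51, char[VIRTIO_BLK_ID_BYTES])

	/* in vhost_blk_ioctl(); blk->serial is the hypothetical new field */
	case VHOST_BLK_SET_SERIAL:
		if (copy_from_user(blk->serial, argp, VIRTIO_BLK_ID_BYTES))
			return -EFAULT;
		ret = 0;
		break;

VIRTIO_BLK_T_GET_ID would then copy blk->serial instead of the hardcoded
"vhost_blk" string.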

> Also other vhost work does the same with GET_ID, if that's matter.
> 
> I'll fix the rest, thanks for review!
> 
> -- 
> wbr, Vitaly


* Re: [PATCH 1/1] Add vhost_blk driver
  2018-11-02 18:21 ` [PATCH 1/1] Add " Vitaly Mayatskikh
  2018-11-02 18:36   ` Michael S. Tsirkin
@ 2018-11-03  2:50   ` kbuild test robot
  2018-11-06 16:03   ` Stefan Hajnoczi
  2 siblings, 0 replies; 27+ messages in thread
From: kbuild test robot @ 2018-11-03  2:50 UTC (permalink / raw)
  To: Vitaly Mayatskikh
  Cc: kbuild-all, Michael S . Tsirkin, Jason Wang, Paolo Bonzini, kvm,
	virtualization, linux-kernel, Vitaly Mayatskikh

Hi Vitaly,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on vhost/linux-next]
[also build test ERROR on v4.19 next-20181102]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Vitaly-Mayatskikh/vhost-add-vhost_blk-driver/20181103-084141
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next
config: mips-allyesconfig (attached as .config)
compiler: mips-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=mips 

All error/warnings (new ones prefixed by >>):

   drivers/vhost/blk.c: In function 'vhost_blk_iocb_complete':
>> drivers/vhost/blk.c:129:2: error: implicit declaration of function 'vhost_vq_work_queue'; did you mean 'vhost_work_queue'? [-Werror=implicit-function-declaration]
     vhost_vq_work_queue(&req->q->vq, &req->q->w);
     ^~~~~~~~~~~~~~~~~~~
     vhost_work_queue
   In file included from include/linux/kernel.h:14:0,
                    from include/linux/list.h:9,
                    from include/linux/module.h:9,
                    from drivers/vhost/blk.c:11:
   drivers/vhost/blk.c: In function 'vhost_blk_req_handle':
>> drivers/vhost/blk.c:153:12: warning: format '%ld' expects argument of type 'long int', but argument 8 has type 'ssize_t {aka int}' [-Wformat=]
      pr_debug("%s: [pid:%d %s] %s sector %lld, len %ld\n",
               ^
   include/linux/printk.h:292:21: note: in definition of macro 'pr_fmt'
    #define pr_fmt(fmt) fmt
                        ^~~
   include/linux/printk.h:340:2: note: in expansion of macro 'dynamic_pr_debug'
     dynamic_pr_debug(fmt, ##__VA_ARGS__)
     ^~~~~~~~~~~~~~~~
>> drivers/vhost/blk.c:153:3: note: in expansion of macro 'pr_debug'
      pr_debug("%s: [pid:%d %s] %s sector %lld, len %ld\n",
      ^~~~~~~~
   cc1: some warnings being treated as errors

vim +129 drivers/vhost/blk.c

   118	
   119	static void vhost_blk_iocb_complete(struct kiocb *iocb, long ret, long ret2)
   120	{
   121		struct vhost_blk_req *req = container_of(iocb, struct vhost_blk_req,
   122							 iocb);
   123	
   124		pr_debug("%s vq[%d] req->index %d ret %ld ret2 %ld\n", __func__,
   125			 req->q->index, req->index, ret, ret2);
   126	
   127		req->res = (ret == req->len) ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
   128		llist_add(&req->list, &req->q->wl);
 > 129		vhost_vq_work_queue(&req->q->vq, &req->q->w);
   130	}
   131	
   132	static int vhost_blk_req_handle(struct vhost_blk_req *req)
   133	{
   134		struct vhost_blk *blk = req->q->blk;
   135		struct vhost_virtqueue *vq = &req->q->vq;
   136		int type = le32_to_cpu(req->hdr.type);
   137		int ret;
   138		u8 status;
   139	
   140		if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
   141			bool write = (type == VIRTIO_BLK_T_OUT);
   142			int nr_seg = (write ? req->out_num : req->in_num) - 1;
   143			unsigned long sector = le64_to_cpu(req->hdr.sector);
   144			ssize_t len, rem_len;
   145	
   146			if (!req->q->blk->backend) {
   147				vq_err(vq, "blk %p no backend!\n", req->q->blk);
   148				ret = -EINVAL;
   149				goto out_err;
   150			}
   151	
   152			len = iov_length(&vq->iov[1], nr_seg);
 > 153			pr_debug("%s: [pid:%d %s] %s sector %lld, len %ld\n",
   154				 __func__, current->pid, current->comm,
   155				 write ? "WRITE" : "READ", req->hdr.sector, len);
   156	
   157			req->len = len;
   158			rem_len = len;
   159			iov_iter_init(&req->i, (write ? WRITE : READ),
   160				      write ? &req->out_iov[0] : &req->in_iov[0],
   161				      nr_seg, len);
   162	
   163			req->iocb.ki_pos = sector << 9;
   164			req->iocb.ki_filp = blk->backend;
   165			req->iocb.ki_complete = vhost_blk_iocb_complete;
   166			req->iocb.ki_flags = IOCB_DIRECT;
   167	
   168			if (write)
   169				ret = call_write_iter(blk->backend, &req->iocb,
   170						      &req->i);
   171			else
   172				ret = call_read_iter(blk->backend, &req->iocb,
   173						     &req->i);
   174	
   175			if (ret != -EIOCBQUEUED)
   176				vhost_blk_iocb_complete(&req->iocb, ret, 0);
   177	
   178			ret = 0;
   179			goto out;
   180		}
   181	
   182		if (type == VIRTIO_BLK_T_GET_ID) {
   183			char s[] = "vhost_blk";
   184			size_t len = min_t(size_t, req->in_iov[0].iov_len,
   185					   strlen(s));
   186	
   187			ret = copy_to_user(req->in_iov[0].iov_base, s, len);
   188			status = ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
   189			if (put_user(status, (unsigned char __user *)req->status)) {
   190				ret = -EFAULT;
   191				goto out_err;
   192			}
   193			vhost_add_used_and_signal(&blk->dev, vq, req->index, 1);
   194			ret = 0;
   195			goto out;
   196		} else {
   197			pr_warn("Unsupported request type %d\n", type);
   198			vhost_discard_vq_desc(vq, 1);
   199			ret = -EINVAL;
   200			return ret;
   201		}
   202	out_err:
   203		vhost_discard_vq_desc(vq, 1);
   204	out:
   205		return ret;
   206	}
   207	
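
The format warning has a straightforward fix (ssize_t takes %zd), and the
implicit-declaration error means the driver expects a vhost_vq_work_queue()
helper that is not present in the tree the robot built against. A fallback
sketch using the existing vhost API could look like:

		/* %zd is the specifier for ssize_t */
		pr_debug("%s: [pid:%d %s] %s sector %lld, len %zd\n",
			 __func__, current->pid, current->comm,
			 write ? "WRITE" : "READ", req->hdr.sector, len);

	/* queue the completion work on the device-wide vhost worker */
	vhost_work_queue(&req->q->blk->dev, &req->q->w);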

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 57563 bytes --]


* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-02 18:21 [PATCH 0/1] vhost: add vhost_blk driver Vitaly Mayatskikh
  2018-11-02 18:21 ` [PATCH 1/1] Add " Vitaly Mayatskikh
  2018-11-02 18:26 ` [PATCH 0/1] vhost: add " Michael S. Tsirkin
@ 2018-11-04 11:57 ` Maxim Levitsky
  2018-11-04 16:40   ` Vitaly Mayatskih
  2018-11-05  3:00 ` Jason Wang
  3 siblings, 1 reply; 27+ messages in thread
From: Maxim Levitsky @ 2018-11-04 11:57 UTC (permalink / raw)
  To: Vitaly Mayatskikh, Michael S . Tsirkin
  Cc: Jason Wang, Paolo Bonzini, kvm, virtualization, linux-kernel

On Fri, 2018-11-02 at 18:21 +0000, Vitaly Mayatskikh wrote:
> vhost_blk is a host-side kernel mode accelerator for virtio-blk. The
> driver allows VM to reach a near bare-metal disk performance. See IOPS
> numbers below (fio --rw=randread --bs=4k).
> 
> This implementation uses kiocb interface. It is slightly slower than
> going directly through bio, but is simpler and also works with disk
> images placed on a file system.
> 
> # fio num-jobs
> # A: bare metal over block
> # B: bare metal over file
> # C: virtio-blk over block
> # D: virtio-blk over file
> # E: vhost-blk bio over block
> # F: vhost-blk kiocb over block
> # G: vhost-blk kiocb over file
> #
> #  A     B     C    D    E     F     G
> 

Hi!
I am also working in this area, and so I am very interested in this driver.

> 1  171k  151k  148k 151k 195k  187k  175k
If I understand correctly, this is fio --numjobs=1?
It looks like you are getting better than native performance over bare metal in
E, F, G (the vhost-blk cases, in fact). Is this correct?

Could you share the full fio command line you have used?
Which IO device did you use for the test? NVME?
Which system (cpu model/number of cores/etc) did you test on?

Best regards,
      Maxim Levitsky




* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-04 11:57 ` Maxim Levitsky
@ 2018-11-04 16:40   ` Vitaly Mayatskih
  2018-11-05 11:55     ` Maxim Levitsky
  0 siblings, 1 reply; 27+ messages in thread
From: Vitaly Mayatskih @ 2018-11-04 16:40 UTC (permalink / raw)
  To: mlevitsk
  Cc: Michael S . Tsirkin, Jason Wang, Paolo Bonzini, kvm,
	virtualization, linux-kernel

On Sun, Nov 4, 2018 at 6:57 AM Maxim Levitsky <mlevitsk@redhat.com> wrote:

> Hi!
> I am also working in this area, and so I am very intersted in this driver.
>
> > 1  171k  151k  148k 151k 195k  187k  175k
> If I understand correctly this is fio --numjobs=1?
> It looks like you are getting better that native performance over bare metal in
> E,F,G (vhost-blk cases in fact). Is this correct?

Yes. At such speeds it is a matter of how the workers are scheduled,
i.e. how good the batching is. There are other factors in why vhost-blk is on
par with or slightly above fio running in userspace on bare metal,
but from my observation the right batching all through the stack is
more important.

> Could you share the full fio command line you have used?

sysctl -w vm.nr_hugepages=8300; numactl -p 1 -N 1 ./qemu-system-x86_64
-enable-kvm -cpu host -smp 16 -mem-prealloc -mem-path
/dev/hugepages/foo -m 8G -nographic -drive
if=none,id=drive0,format=raw,file=/dev/mapper/mirror-hello,cache=none
-device virtio-blk-pci,id=blk0,drive=drive0,num-queues=16 -drive
if=none,id=drive1,format=raw,file=/dev/mapper/mirror-volume,cache=none
-device vhost-blk-pci,id=blk1,drive=drive1,num-queues=16

for i in `seq 1 16`; do echo -n "$i "; ./fio --direct=1 --rw=randread
--ioengine=libaio --bs=4k --iodepth=128 --numjobs=$i --name=foo
--time_based --runtime=15 --group_reporting --filename=/dev/vda
--size=10g | grep -Po 'IOPS=[0-9\.]*k'; done

> Which IO device did you use for the test? NVME?

That was an LVM mirror over 2 network disks. On the target side it was
an LVM stripe over a few NVMe's.

> Which system (cpu model/number of cores/etc) did you test on?

Dual socket: "model name    : Intel(R) Xeon(R) Gold 6142 CPU @
2.60GHz" with HT enabled, so 64 logical cores in total. The network
was something from Intel with 53 Gbps PHY and served by fm10k driver.

-- 
wbr, Vitaly


* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-02 18:21 [PATCH 0/1] vhost: add vhost_blk driver Vitaly Mayatskikh
                   ` (2 preceding siblings ...)
  2018-11-04 11:57 ` Maxim Levitsky
@ 2018-11-05  3:00 ` Jason Wang
  2018-11-05  3:23   ` Vitaly Mayatskih
  2018-11-05 15:48   ` Christian Borntraeger
  3 siblings, 2 replies; 27+ messages in thread
From: Jason Wang @ 2018-11-05  3:00 UTC (permalink / raw)
  To: Vitaly Mayatskikh, Michael S . Tsirkin
  Cc: Paolo Bonzini, kvm, virtualization, linux-kernel, Stefan Hajnoczi


On 2018/11/3 2:21 AM, Vitaly Mayatskikh wrote:
> vhost_blk is a host-side kernel mode accelerator for virtio-blk. The
> driver allows VM to reach a near bare-metal disk performance. See IOPS
> numbers below (fio --rw=randread --bs=4k).
>
> This implementation uses kiocb interface. It is slightly slower than
> going directly through bio, but is simpler and also works with disk
> images placed on a file system.
>
> # fio num-jobs
> # A: bare metal over block
> # B: bare metal over file
> # C: virtio-blk over block
> # D: virtio-blk over file
> # E: vhost-blk bio over block
> # F: vhost-blk kiocb over block
> # G: vhost-blk kiocb over file
> #
> #  A     B     C    D    E     F     G
>
> 1  171k  151k  148k 151k 195k  187k  175k
> 2  328k  302k  249k 241k 349k  334k  296k
> 3  479k  437k  179k 174k 501k  464k  404k
> 4  622k  568k  143k 183k 620k  580k  492k
> 5  755k  697k  136k 128k 737k  693k  579k
> 6  887k  808k  131k 120k 830k  782k  640k
> 7  1004k 926k  126k 131k 926k  863k  693k
> 8  1099k 1015k 117k 115k 1001k 931k  712k
> 9  1194k 1119k 115k 111k 1055k 991k  711k
> 10 1278k 1207k 109k 114k 1130k 1046k 695k
> 11 1345k 1280k 110k 108k 1119k 1091k 663k
> 12 1411k 1356k 104k 106k 1201k 1142k 629k
> 13 1466k 1423k 106k 106k 1260k 1170k 607k
> 14 1517k 1486k 103k 106k 1296k 1179k 589k
> 15 1552k 1543k 102k 102k 1322k 1191k 571k
> 16 1480k 1506k 101k 102k 1346k 1202k 566k
>
> Vitaly Mayatskikh (1):
>    Add vhost_blk driver
>
>   drivers/vhost/Kconfig  |  13 ++
>   drivers/vhost/Makefile |   3 +
>   drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
>   3 files changed, 526 insertions(+)
>   create mode 100644 drivers/vhost/blk.c
>

Hi:

Thanks for the patches.

This is not the first attempt for having vhost-blk:

- Badari's version: https://lwn.net/Articles/379864/

- Asias' version: https://lwn.net/Articles/519880/

It's better to describe the differences (kiocb vs bio? performance?).
E.g. if my memory is correct, Asias said it doesn't give much improvement
compared with userspace qemu.

And what's more important, I believe we tend to use virtio-scsi nowadays.
So what are the advantages of vhost-blk over vhost-scsi?

Thanks



* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-05  3:00 ` Jason Wang
@ 2018-11-05  3:23   ` Vitaly Mayatskih
  2018-11-06  2:45     ` Jason Wang
  2018-11-05 15:48   ` Christian Borntraeger
  1 sibling, 1 reply; 27+ messages in thread
From: Vitaly Mayatskih @ 2018-11-05  3:23 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S . Tsirkin, Paolo Bonzini, kvm, virtualization,
	linux-kernel, stefanha

On Sun, Nov 4, 2018 at 10:00 PM Jason Wang <jasowang@redhat.com> wrote:

> > # fio num-jobs
> > # A: bare metal over block
> > # B: bare metal over file
> > # C: virtio-blk over block
> > # D: virtio-blk over file
> > # E: vhost-blk bio over block
> > # F: vhost-blk kiocb over block
> > # G: vhost-blk kiocb over file
> > #
> > #  A     B     C    D    E     F     G

> > 16 1480k 1506k 101k 102k 1346k 1202k 566k

> Hi:
>
> Thanks for the patches.
>
> This is not the first attempt for having vhost-blk:
>
> - Badari's version: https://lwn.net/Articles/379864/
>
> - Asias' version: https://lwn.net/Articles/519880/
>
> It's better to describe the differences (kiocb vs bio? performance?).
> E.g if my memory is correct, Asias said it doesn't give much improvement
> compared with userspace qemu.
>
> And what's more important, I believe we tend to use virtio-scsi nowdays.
> So what's the advantages of vhost-blk over vhost-scsi?

Hi,

Yes, I saw both. Frankly, my implementation is not that different,
because the whole thing has only about twice the LOC of vhost/test.c.

I posted my numbers (see the 16-queue case in the quoted text above):
IOPS goes from ~100k to 1.2M and almost reaches the physical
limit of the backend.

submit_bio() is a bit faster, but can't be used for disk images placed
on a file system. I have that submit_bio implementation too.

The storage industry is shifting away from SCSI, which has a scaling
problem. I can compare vhost-scsi vs vhost-blk if you are curious.

Thanks!
-- 
wbr, Vitaly


* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-04 16:40   ` Vitaly Mayatskih
@ 2018-11-05 11:55     ` Maxim Levitsky
  0 siblings, 0 replies; 27+ messages in thread
From: Maxim Levitsky @ 2018-11-05 11:55 UTC (permalink / raw)
  To: Vitaly Mayatskih
  Cc: Michael S . Tsirkin, Jason Wang, Paolo Bonzini, kvm,
	virtualization, linux-kernel

On Sun, 2018-11-04 at 11:40 -0500, Vitaly Mayatskih wrote:
> On Sun, Nov 4, 2018 at 6:57 AM Maxim Levitsky <mlevitsk@redhat.com> wrote:
> 
> > Hi!
> > I am also working in this area, and so I am very intersted in this driver.
> > 
> > > 1  171k  151k  148k 151k 195k  187k  175k
> > 
> > If I understand correctly this is fio --numjobs=1?
> > It looks like you are getting better that native performance over bare metal
> > in
> > E,F,G (vhost-blk cases in fact). Is this correct?
> 
> Yes. At such speeds it is a matter of how the workers are scheduled,
> i.e. how good is batching. There are other factors why vhost-blk is on
> par or slightly higher than fio running in userspace on bare metal,
> but from my observation the right batching all through the stack is
> more important.

I completely agree with you on that.
I am currently learning profiling/tracing to understand the batching (or lack
of it) in the tests I run.

My focus currently is mostly on spdk + native nvme.
While for multiple threads the performance is very close to bare metal, on a
single thread I see significant overhead which probably relates to batching as
well.

> 
> > Could you share the full fio command line you have used?
> 
> sysctl -w vm.nr_hugepages=8300; numactl -p 1 -N 1 ./qemu-system-x86_64
> -enable-kvm -cpu host -smp 16 -mem-prealloc -mem-path
> /dev/hugepages/foo -m 8G -nographic -drive
> if=none,id=drive0,format=raw,file=/dev/mapper/mirror-hello,cache=none
> -device virtio-blk-pci,id=blk0,drive=drive0,num-queues=16 -drive
> if=none,id=drive1,format=raw,file=/dev/mapper/mirror-volume,cache=none
> -device vhost-blk-pci,id=blk1,drive=drive1,num-queues=16
Thanks!

> 
> for i in `seq 1 16`; do echo -n "$i "; ./fio --direct=1 --rw=randread
> --ioengine=libaio --bs=4k --iodepth=128 --numjobs=$i --name=foo
> --time_based --runtime=15 --group_reporting --filename=/dev/vda
> --size=10g | grep -Po 'IOPS=[0-9\.]*k'; done
> 
> > Which IO device did you use for the test? NVME?
> 
> That was an LVM mirror over 2 network disks. On the target side it
> was an LVM stripe over a few NVMes.
Have you tried to test this over a directly connected NVMe device too?
The networking might naturally improve batching, I think.

> 
> > Which system (cpu model/number of cores/etc) did you test on?
> 
> Dual socket: "model name    : Intel(R) Xeon(R) Gold 6142 CPU @
> 2.60GHz" with HT enabled, so 64 logical cores in total. The network
> was something from Intel with 53 Gbps PHY and served by fm10k driver.


All right, thanks!
I'll test your driver on my system where I tested most of the current solutions.

Best regards,
	Maxim Levitsky




^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-02 18:26 ` [PATCH 0/1] vhost: add " Michael S. Tsirkin
  2018-11-02 18:31   ` Vitaly Mayatskih
@ 2018-11-05 14:02   ` Christian Borntraeger
  2018-11-05 14:21     ` Vitaly Mayatskih
  2018-11-06 15:40   ` Stefan Hajnoczi
  2 siblings, 1 reply; 27+ messages in thread
From: Christian Borntraeger @ 2018-11-05 14:02 UTC (permalink / raw)
  To: Michael S. Tsirkin, Vitaly Mayatskikh
  Cc: Paolo Bonzini, linux-kernel, kvm, virtualization

On 11/02/2018 07:26 PM, Michael S. Tsirkin wrote:
> On Fri, Nov 02, 2018 at 06:21:22PM +0000, Vitaly Mayatskikh wrote:
>> vhost_blk is a host-side kernel mode accelerator for virtio-blk. The
>> driver allows VM to reach a near bare-metal disk performance. See IOPS
>> numbers below (fio --rw=randread --bs=4k).
>>
>> This implementation uses kiocb interface. It is slightly slower than
>> going directly through bio, but is simpler and also works with disk
>> images placed on a file system.


This should also work with other transports like virtio-ccw (instead of virtio-pci).
Correct?

>>
>> # fio num-jobs
>> # A: bare metal over block
>> # B: bare metal over file
>> # C: virtio-blk over block
>> # D: virtio-blk over file
>> # E: vhost-blk bio over block
>> # F: vhost-blk kiocb over block
>> # G: vhost-blk kiocb over file
>> #
>> #  A     B     C    D    E     F     G
>>
>> 1  171k  151k  148k 151k 195k  187k  175k
>> 2  328k  302k  249k 241k 349k  334k  296k
>> 3  479k  437k  179k 174k 501k  464k  404k
>> 4  622k  568k  143k 183k 620k  580k  492k
>> 5  755k  697k  136k 128k 737k  693k  579k
>> 6  887k  808k  131k 120k 830k  782k  640k
>> 7  1004k 926k  126k 131k 926k  863k  693k
>> 8  1099k 1015k 117k 115k 1001k 931k  712k
>> 9  1194k 1119k 115k 111k 1055k 991k  711k
>> 10 1278k 1207k 109k 114k 1130k 1046k 695k
>> 11 1345k 1280k 110k 108k 1119k 1091k 663k
>> 12 1411k 1356k 104k 106k 1201k 1142k 629k
>> 13 1466k 1423k 106k 106k 1260k 1170k 607k
>> 14 1517k 1486k 103k 106k 1296k 1179k 589k
>> 15 1552k 1543k 102k 102k 1322k 1191k 571k
>> 16 1480k 1506k 101k 102k 1346k 1202k 566k
>>
>> Vitaly Mayatskikh (1):
>>   Add vhost_blk driver
> 
> 
> Thanks!
> Before merging this, I'd like to get some acks from userspace that it's
> actually going to be used - e.g. QEMU block maintainers.
> 
>>  drivers/vhost/Kconfig  |  13 ++
>>  drivers/vhost/Makefile |   3 +
>>  drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 526 insertions(+)
>>  create mode 100644 drivers/vhost/blk.c
>>
>> -- 
>> 2.17.1
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-05 14:02   ` Christian Borntraeger
@ 2018-11-05 14:21     ` Vitaly Mayatskih
  0 siblings, 0 replies; 27+ messages in thread
From: Vitaly Mayatskih @ 2018-11-05 14:21 UTC (permalink / raw)
  To: borntraeger
  Cc: Michael S . Tsirkin, Paolo Bonzini, linux-kernel, kvm, virtualization

> This should also work with other transports like virtio-ccw (instead of virtio-pci).
> Correct?

I guess it can. There's nothing in the code specifically asking for or
depending on PCI.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-05  3:00 ` Jason Wang
  2018-11-05  3:23   ` Vitaly Mayatskih
@ 2018-11-05 15:48   ` Christian Borntraeger
  2018-11-05 16:15     ` Vitaly Mayatskih
  1 sibling, 1 reply; 27+ messages in thread
From: Christian Borntraeger @ 2018-11-05 15:48 UTC (permalink / raw)
  To: Jason Wang, Vitaly Mayatskikh, Michael S . Tsirkin
  Cc: Paolo Bonzini, Stefan Hajnoczi, linux-kernel, kvm, virtualization



On 11/05/2018 04:00 AM, Jason Wang wrote:
> 
> On 2018/11/3 上午2:21, Vitaly Mayatskikh wrote:
>> vhost_blk is a host-side kernel mode accelerator for virtio-blk. The
>> driver allows VM to reach a near bare-metal disk performance. See IOPS
>> numbers below (fio --rw=randread --bs=4k).
>>
>> This implementation uses kiocb interface. It is slightly slower than
>> going directly through bio, but is simpler and also works with disk
>> images placed on a file system.
>>
>> # fio num-jobs
>> # A: bare metal over block
>> # B: bare metal over file
>> # C: virtio-blk over block
>> # D: virtio-blk over file
>> # E: vhost-blk bio over block
>> # F: vhost-blk kiocb over block
>> # G: vhost-blk kiocb over file
>> #
>> #  A     B     C    D    E     F     G
>>
>> 1  171k  151k  148k 151k 195k  187k  175k
>> 2  328k  302k  249k 241k 349k  334k  296k
>> 3  479k  437k  179k 174k 501k  464k  404k
>> 4  622k  568k  143k 183k 620k  580k  492k
>> 5  755k  697k  136k 128k 737k  693k  579k
>> 6  887k  808k  131k 120k 830k  782k  640k
>> 7  1004k 926k  126k 131k 926k  863k  693k
>> 8  1099k 1015k 117k 115k 1001k 931k  712k
>> 9  1194k 1119k 115k 111k 1055k 991k  711k
>> 10 1278k 1207k 109k 114k 1130k 1046k 695k
>> 11 1345k 1280k 110k 108k 1119k 1091k 663k
>> 12 1411k 1356k 104k 106k 1201k 1142k 629k
>> 13 1466k 1423k 106k 106k 1260k 1170k 607k
>> 14 1517k 1486k 103k 106k 1296k 1179k 589k
>> 15 1552k 1543k 102k 102k 1322k 1191k 571k
>> 16 1480k 1506k 101k 102k 1346k 1202k 566k
>>
>> Vitaly Mayatskikh (1):
>>    Add vhost_blk driver
>>
>>   drivers/vhost/Kconfig  |  13 ++
>>   drivers/vhost/Makefile |   3 +
>>   drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 526 insertions(+)
>>   create mode 100644 drivers/vhost/blk.c
>>
> 
> Hi:
> 
> Thanks for the patches.
> 
> This is not the first attempt for having vhost-blk:
> 
> - Badari's version: https://lwn.net/Articles/379864/
> 
> - Asias' version: https://lwn.net/Articles/519880/
> 
> It's better to describe the differences (kiocb vs bio? performance?). E.g. if my memory is correct, Asias said it doesn't give much improvement compared with userspace qemu.
> 
> And what's more important, I believe we tend to use virtio-scsi nowadays. So what are the advantages of vhost-blk over vhost-scsi?


For the record, we still do use virtio-blk a lot. As we see new things like discard/write zeroes
support, it seems that others do as well.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-05 15:48   ` Christian Borntraeger
@ 2018-11-05 16:15     ` Vitaly Mayatskih
  2018-11-06 15:21       ` Stefan Hajnoczi
  0 siblings, 1 reply; 27+ messages in thread
From: Vitaly Mayatskih @ 2018-11-05 16:15 UTC (permalink / raw)
  To: borntraeger
  Cc: Jason Wang, Michael S . Tsirkin, Paolo Bonzini, stefanha,
	linux-kernel, kvm, virtualization

On Mon, Nov 5, 2018 at 10:48 AM Christian Borntraeger
<borntraeger@de.ibm.com> wrote:

> For the record, we still do use virtio-blk a lot. As we see new things like discard/write zero
> support it seems that others do as well.

Yes, trim/discard and writesame support is planned, at least for the
submit_bio path (not in this patch).
-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-05  3:23   ` Vitaly Mayatskih
@ 2018-11-06  2:45     ` Jason Wang
  2018-11-06  2:56       ` Vitaly Mayatskih
  2018-11-06  7:13       ` Christoph Hellwig
  0 siblings, 2 replies; 27+ messages in thread
From: Jason Wang @ 2018-11-06  2:45 UTC (permalink / raw)
  To: Vitaly Mayatskih
  Cc: Michael S . Tsirkin, Paolo Bonzini, kvm, virtualization,
	linux-kernel, stefanha


On 2018/11/5 上午11:23, Vitaly Mayatskih wrote:
> On Sun, Nov 4, 2018 at 10:00 PM Jason Wang <jasowang@redhat.com> wrote:
>
>>> # fio num-jobs
>>> # A: bare metal over block
>>> # B: bare metal over file
>>> # C: virtio-blk over block
>>> # D: virtio-blk over file
>>> # E: vhost-blk bio over block
>>> # F: vhost-blk kiocb over block
>>> # G: vhost-blk kiocb over file
>>> #
>>> #  A     B     C    D    E     F     G
>>> 16 1480k 1506k 101k 102k 1346k 1202k 566k
>> Hi:
>>
>> Thanks for the patches.
>>
>> This is not the first attempt for having vhost-blk:
>>
>> - Badari's version: https://lwn.net/Articles/379864/
>>
>> - Asias' version: https://lwn.net/Articles/519880/
>>
>> It's better to describe the differences (kiocb vs bio? performance?).
>> E.g. if my memory is correct, Asias said it doesn't give much improvement
>> compared with userspace qemu.
>>
>> And what's more important, I believe we tend to use virtio-scsi nowadays.
>> So what are the advantages of vhost-blk over vhost-scsi?
> Hi,
>
> Yes, I saw both. Frankly, my implementation is not that different,
> because the whole thing has only about twice the LOC of vhost/test.c.
>
> I posted my numbers (see in quoted text above the 16 queues case),
> IOPS goes from ~100k to 1.2M and almost reaches the physical
> limitation of the backend.
>
> submit_bio() is a bit faster, but can't be used for disk images placed
> on a file system. I have that submit_bio implementation too.
>
> Storage industry is shifting away from SCSI, which has a scaling
> problem.


I know little about storage. For scaling, do you mean the SCSI protocol
itself? If not, it's probably not a real issue for virtio-scsi itself.


>   I can compare vhost-scsi vs vhost-blk if you are curious.


It would be very helpful to see the performance comparison.


Thanks


> Thanks!

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-06  2:45     ` Jason Wang
@ 2018-11-06  2:56       ` Vitaly Mayatskih
  2018-11-06  7:13       ` Christoph Hellwig
  1 sibling, 0 replies; 27+ messages in thread
From: Vitaly Mayatskih @ 2018-11-06  2:56 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S . Tsirkin, Paolo Bonzini, kvm, virtualization,
	linux-kernel, stefanha

On Mon, Nov 5, 2018 at 9:45 PM Jason Wang <jasowang@redhat.com> wrote:

> > Storage industry is shifting away from SCSI, which has a scaling
> > problem.
>
>
> Know little about storage. For scaling, do you mean SCSI protocol
> itself? If not, it's probably not a real issue for virtio-scsi itself.

Yes, the protocol.

> >   I can compare vhost-scsi vs vhost-blk if you are curious.
>
>
> It would be very helpful to see the performance comparison.

I will try to find comparable hw with fast enough SAS SSDs. That one
was done with NVMes sitting at the very bottom of the storage topology.


-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-06  2:45     ` Jason Wang
  2018-11-06  2:56       ` Vitaly Mayatskih
@ 2018-11-06  7:13       ` Christoph Hellwig
  2018-11-06 21:41         ` Paolo Bonzini
  1 sibling, 1 reply; 27+ messages in thread
From: Christoph Hellwig @ 2018-11-06  7:13 UTC (permalink / raw)
  To: Jason Wang
  Cc: Vitaly Mayatskih, Michael S . Tsirkin, Paolo Bonzini, kvm,
	virtualization, linux-kernel, stefanha

On Tue, Nov 06, 2018 at 10:45:08AM +0800, Jason Wang wrote:
> > Storage industry is shifting away from SCSI, which has a scaling
> > problem.
> 
> 
> Know little about storage. For scaling, do you mean SCSI protocol itself? If
> not, it's probably not a real issue for virtio-scsi itself.

The above is utter bullshit.  There is a big NVMe hype, but it is not
because "SCSI has a scaling problem", but because the industry can sell
a new thing, and the standardization body seems easier to work with.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-05 16:15     ` Vitaly Mayatskih
@ 2018-11-06 15:21       ` Stefan Hajnoczi
  0 siblings, 0 replies; 27+ messages in thread
From: Stefan Hajnoczi @ 2018-11-06 15:21 UTC (permalink / raw)
  To: Vitaly Mayatskih
  Cc: borntraeger, Jason Wang, Michael S . Tsirkin, Paolo Bonzini,
	linux-kernel, kvm, virtualization

On Mon, Nov 05, 2018 at 11:15:04AM -0500, Vitaly Mayatskih wrote:
> On Mon, Nov 5, 2018 at 10:48 AM Christian Borntraeger
> <borntraeger@de.ibm.com> wrote:
> 
> > For the record, we still do use virtio-blk a lot. As we see new things like discard/write zero
> > support it seems that others do as well.
> 
> Yes, trim/discard and writesame support is planned, at least for
> submit_bio path (not in this patch).

I agree, virtio_blk is quite popular and not deprecated/frozen.

Stefan

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-02 18:26 ` [PATCH 0/1] vhost: add " Michael S. Tsirkin
  2018-11-02 18:31   ` Vitaly Mayatskih
  2018-11-05 14:02   ` Christian Borntraeger
@ 2018-11-06 15:40   ` Stefan Hajnoczi
  2018-11-06 18:46     ` Denis Lunev
  2018-11-06 20:08     ` Vitaly Mayatskih
  2 siblings, 2 replies; 27+ messages in thread
From: Stefan Hajnoczi @ 2018-11-06 15:40 UTC (permalink / raw)
  To: Vitaly Mayatskikh
  Cc: Jason Wang, Paolo Bonzini, kvm, virtualization, linux-kernel,
	Kevin Wolf, Michael S. Tsirkin, den

On Fri, Nov 02, 2018 at 02:26:00PM -0400, Michael S. Tsirkin wrote:
> On Fri, Nov 02, 2018 at 06:21:22PM +0000, Vitaly Mayatskikh wrote:
> > vhost_blk is a host-side kernel mode accelerator for virtio-blk. The
> > driver allows VM to reach a near bare-metal disk performance. See IOPS
> > numbers below (fio --rw=randread --bs=4k).
> > 
> > This implementation uses kiocb interface. It is slightly slower than
> > going directly through bio, but is simpler and also works with disk
> > images placed on a file system.
> > 
> > # fio num-jobs
> > # A: bare metal over block
> > # B: bare metal over file
> > # C: virtio-blk over block
> > # D: virtio-blk over file
> > # E: vhost-blk bio over block
> > # F: vhost-blk kiocb over block
> > # G: vhost-blk kiocb over file
> > #
> > #  A     B     C    D    E     F     G
> > 
> > 1  171k  151k  148k 151k 195k  187k  175k
> > 2  328k  302k  249k 241k 349k  334k  296k
> > 3  479k  437k  179k 174k 501k  464k  404k
> > 4  622k  568k  143k 183k 620k  580k  492k
> > 5  755k  697k  136k 128k 737k  693k  579k
> > 6  887k  808k  131k 120k 830k  782k  640k
> > 7  1004k 926k  126k 131k 926k  863k  693k
> > 8  1099k 1015k 117k 115k 1001k 931k  712k
> > 9  1194k 1119k 115k 111k 1055k 991k  711k
> > 10 1278k 1207k 109k 114k 1130k 1046k 695k
> > 11 1345k 1280k 110k 108k 1119k 1091k 663k
> > 12 1411k 1356k 104k 106k 1201k 1142k 629k
> > 13 1466k 1423k 106k 106k 1260k 1170k 607k
> > 14 1517k 1486k 103k 106k 1296k 1179k 589k
> > 15 1552k 1543k 102k 102k 1322k 1191k 571k
> > 16 1480k 1506k 101k 102k 1346k 1202k 566k
> > 
> > Vitaly Mayatskikh (1):
> >   Add vhost_blk driver
> 
> 
> Thanks!
> Before merging this, I'd like to get some acks from userspace that it's
> actually going to be used - e.g. QEMU block maintainers.

I have CCed Kevin, who is the overall QEMU block layer maintainer.

Also CCing Denis since I think someone was working on a QEMU userspace
multiqueue virtio-blk device for maximum performance.

Previously vhost_blk.ko implementations were basically the same thing as
the QEMU x-data-plane=on (dedicated thread using Linux AIO), except they
were using a kernel thread and maybe submitted bios.

The performance differences weren't convincing enough that it seemed
worthwhile maintaining another code path which loses live migration, I/O
throttling, image file formats, etc (all the things that QEMU's block
layer supports).

Two changes since then:

1. x-data-plane=on has been replaced with a full trip down QEMU's block
layer (-object iothread,id=iothread0 -device
virtio-blk-pci,iothread=iothread0,...).  It's slower and not truly
multiqueue (yet!).

So from this perspective vhost_blk.ko might be more attractive again, at
least until further QEMU block layer work eliminates the multiqueue and
performance overheads.

2. SPDK has become available for users who want the best I/O performance
and are willing to sacrifice CPU cores for polling.

If you want better performance and don't care about QEMU block layer
features, could you use SPDK?  People who are the target market for
vhost_blk.ko would probably be willing to use SPDK and it already
exists...

From the QEMU userspace perspective, I think the best way to integrate
vhost_blk.ko is to transparently switch to it when possible.  If the
user enables QEMU block layer features that are incompatible with
vhost_blk.ko, then it should fall back to the QEMU block layer
transparently.

I'm not keen on yet another code path with its own set of limitations
and having to educate users about how to make the choice.  But if it can
be integrated transparently as an "accelerator", then it could be
valuable.

Stefan

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/1] Add vhost_blk driver
  2018-11-02 18:21 ` [PATCH 1/1] Add " Vitaly Mayatskikh
  2018-11-02 18:36   ` Michael S. Tsirkin
  2018-11-03  2:50   ` kbuild test robot
@ 2018-11-06 16:03   ` Stefan Hajnoczi
  2018-11-06 19:47     ` Vitaly Mayatskih
  2 siblings, 1 reply; 27+ messages in thread
From: Stefan Hajnoczi @ 2018-11-06 16:03 UTC (permalink / raw)
  To: Vitaly Mayatskikh
  Cc: Michael S . Tsirkin, Jason Wang, Paolo Bonzini, kvm,
	virtualization, linux-kernel

On Fri, Nov 02, 2018 at 06:21:23PM +0000, Vitaly Mayatskikh wrote:
> This driver accelerates host side of virtio-blk.

Did you look at vhost-user-blk?  It does things slightly differently:
more of the virtio-blk device model is handled by the vhost-user device
(e.g. config space).  That might be necessary to implement
virtio_blk_config.writeback properly.

> +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)

Needs to be in the uapi header so userspace can use it.
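
For illustration, that would mean moving the define into
include/uapi/linux/vhost.h so userspace actually sees it, e.g.
(the exact ioctl number is of course up for review):

/* Set the backing file descriptor for a vhost-blk device. */
#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)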

> +
> +enum {
> +	VHOST_BLK_VQ_MAX = 16,
> +	VHOST_BLK_VQ_MAX_REQS = 128,
> +};

These limits seem arbitrary and probably too low.

> +
> +struct vhost_blk_req {
> +	struct llist_node list;
> +	int index;
> +	struct vhost_blk_queue *q;
> +	struct virtio_blk_outhdr hdr;
> +	struct iovec *out_iov;
> +	struct iovec *in_iov;
> +	u8 out_num;
> +	u8 in_num;
> +	long len;
> +	struct kiocb iocb;
> +	struct iov_iter i;
> +	int res;
> +	void __user *status;
> +};
> +
> +struct vhost_blk_queue {
> +	int index;
> +	struct vhost_blk *blk;
> +	struct vhost_virtqueue vq;
> +	struct vhost_work w;
> +	struct llist_head wl;

What is this?  Please use clear names and comments. :)

> +static int vhost_blk_req_handle(struct vhost_blk_req *req)
> +{
> +	struct vhost_blk *blk = req->q->blk;
> +	struct vhost_virtqueue *vq = &req->q->vq;
> +	int type = le32_to_cpu(req->hdr.type);
> +	int ret;
> +	u8 status;
> +
> +	if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
> +		bool write = (type == VIRTIO_BLK_T_OUT);
> +		int nr_seg = (write ? req->out_num : req->in_num) - 1;
> +		unsigned long sector = le64_to_cpu(req->hdr.sector);

Using little-endian instead of the virtio types means that only VIRTIO
1.0 modern devices are supported (older devices may not be
little-endian!).  In that case you'd need to put the VIRTIO_1 feature
bit into the features mask.
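
A sketch of what that would look like with the vhost byte-order helpers
from drivers/vhost/vhost.h, which honour whatever endianness was
negotiated for the vq (illustrative only):

	/* Works for legacy (possibly big-endian) and VIRTIO 1.0 guests. */
	int type = vhost32_to_cpu(vq, req->hdr.type);
	u64 sector = vhost64_to_cpu(vq, req->hdr.sector);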

> +		req = &q->req[head];
> +		req->index = head;
> +		req->out_num = out;
> +		req->in_num = in;
> +		req->out_iov = &vq->iov[1];
> +		req->in_iov = &vq->iov[out];
> +		req->status = vq->iov[out + in - 1].iov_base;

The VIRTIO spec explicitly requires that devices support arbitrary
descriptor layouts, so you cannot assume a particular iov[] layout.
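
One layout-agnostic way to handle this (purely a sketch, similar to what
vhost-scsi does) is to pull the header through an iov_iter spanning all
out descriptors, and to locate the status byte at the tail of the in
descriptors the same way, instead of indexing iov[] directly:

	struct iov_iter out_iter;
	size_t out_len = iov_length(vq->iov, out);

	/* The header may be split across several out descriptors. */
	iov_iter_init(&out_iter, WRITE, vq->iov, out, out_len);
	if (copy_from_iter(&req->hdr, sizeof(req->hdr), &out_iter) !=
	    sizeof(req->hdr))
		return -EFAULT;
	/* Whatever is left in out_iter is the write payload; for reads,
	 * the status byte is the last byte of the in descriptors and is
	 * best written back through a second iov_iter, not a fixed index. */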

> +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
> +			    unsigned long arg)
> +{
> +	struct vhost_blk *blk = f->private_data;
> +	void __user *argp = (void __user *)arg;
> +	int fd;
> +	u64 __user *featurep = argp;
> +	u64 features;
> +	long ret;
> +	struct vhost_vring_state s;
> +
> +	switch (ioctl) {
> +	case VHOST_SET_MEM_TABLE:
> +		vhost_blk_stop(blk);
> +		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +		break;

Why is this necessary?  Existing vhost devices pass the ioctl through
without an explicit case for it.

> +	case VHOST_SET_VRING_NUM:
> +		if (copy_from_user(&s, argp, sizeof(s)))
> +			return -EFAULT;
> +		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +		if (!ret)
> +			blk->num_queues = s.index + 1;

Where is this input checked against ARRAY_SIZE(blk->queue)?

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-06 15:40   ` Stefan Hajnoczi
@ 2018-11-06 18:46     ` Denis Lunev
  2018-11-06 20:08     ` Vitaly Mayatskih
  1 sibling, 0 replies; 27+ messages in thread
From: Denis Lunev @ 2018-11-06 18:46 UTC (permalink / raw)
  To: Stefan Hajnoczi, Vitaly Mayatskikh
  Cc: Jason Wang, Paolo Bonzini, kvm, virtualization, linux-kernel,
	Kevin Wolf, Michael S. Tsirkin

On 11/6/18 6:40 PM, Stefan Hajnoczi wrote:
> On Fri, Nov 02, 2018 at 02:26:00PM -0400, Michael S. Tsirkin wrote:
>> On Fri, Nov 02, 2018 at 06:21:22PM +0000, Vitaly Mayatskikh wrote:
>>> vhost_blk is a host-side kernel mode accelerator for virtio-blk. The
>>> driver allows VM to reach a near bare-metal disk performance. See IOPS
>>> numbers below (fio --rw=randread --bs=4k).
>>>
>>> This implementation uses kiocb interface. It is slightly slower than
>>> going directly through bio, but is simpler and also works with disk
>>> images placed on a file system.
>>>
>>> # fio num-jobs
>>> # A: bare metal over block
>>> # B: bare metal over file
>>> # C: virtio-blk over block
>>> # D: virtio-blk over file
>>> # E: vhost-blk bio over block
>>> # F: vhost-blk kiocb over block
>>> # G: vhost-blk kiocb over file
>>> #
>>> #  A     B     C    D    E     F     G
>>>
>>> 1  171k  151k  148k 151k 195k  187k  175k
>>> 2  328k  302k  249k 241k 349k  334k  296k
>>> 3  479k  437k  179k 174k 501k  464k  404k
>>> 4  622k  568k  143k 183k 620k  580k  492k
>>> 5  755k  697k  136k 128k 737k  693k  579k
>>> 6  887k  808k  131k 120k 830k  782k  640k
>>> 7  1004k 926k  126k 131k 926k  863k  693k
>>> 8  1099k 1015k 117k 115k 1001k 931k  712k
>>> 9  1194k 1119k 115k 111k 1055k 991k  711k
>>> 10 1278k 1207k 109k 114k 1130k 1046k 695k
>>> 11 1345k 1280k 110k 108k 1119k 1091k 663k
>>> 12 1411k 1356k 104k 106k 1201k 1142k 629k
>>> 13 1466k 1423k 106k 106k 1260k 1170k 607k
>>> 14 1517k 1486k 103k 106k 1296k 1179k 589k
>>> 15 1552k 1543k 102k 102k 1322k 1191k 571k
>>> 16 1480k 1506k 101k 102k 1346k 1202k 566k
>>>
>>> Vitaly Mayatskikh (1):
>>>   Add vhost_blk driver
>>
>> Thanks!
>> Before merging this, I'd like to get some acks from userspace that it's
>> actually going to be used - e.g. QEMU block maintainers.
> I have CCed Kevin, who is the overall QEMU block layer maintainer.
>
> Also CCing Denis since I think someone was working on a QEMU userspace
> multiqueue virtio-blk device for maximum performance.
>
> Previously vhost_blk.ko implementations were basically the same thing as
> the QEMU x-data-plane=on (dedicated thread using Linux AIO), except they
> were using a kernel thread and maybe submitted bios.
>
> The performance differences weren't convincing enough that it seemed
> worthwhile maintaining another code path which loses live migration, I/O
> throttling, image file formats, etc (all the things that QEMU's block
> layer supports).
>
> Two changes since then:
>
> 1. x-data-plane=on has been replaced with a full trip down QEMU's block
> layer (-object iothread,id=iothread0 -device
> virtio-blk-pci,iothread=iothread0,...).  It's slower and not truly
> multiqueue (yet!).
>
> So from this perspective vhost_blk.ko might be more attractive again, at
> least until further QEMU block layer work eliminates the multiqueue and
> performance overheads.
>
> 2. SPDK has become available for users who want the best I/O performance
> and are willing to sacrifice CPU cores for polling.
>
> If you want better performance and don't care about QEMU block layer
> features, could you use SPDK?  People who are the target market for
> vhost_blk.ko would probably be willing to use SPDK and it already
> exists...
>
> From the QEMU userspace perspective, I think the best way to integrate
> vhost_blk.ko is to transparently switch to it when possible.  If the
> user enables QEMU block layer features that are incompatible with
> vhost_blk.ko, then it should fall back to the QEMU block layer
> transparently.
>
> I'm not keen on yet another code path with its own set of limitations
> and having to educate users about how to make the choice.  But if it can
> be integrated transparently as an "accelerator", then it could be
> valuable.
>
> Stefan
Stefan, thank you very much for adding me into the loop :)
The patches themselves are very interesting and worth
discussing.

First of all, I am not a very big fan of the SPDK approach
and like the kernel implementation much more.
The reason for this is very simple: we are not
discussing the overhead of the emulation. It would be
very interesting to compare the CPU usage of each approach
and compare the overhead.

The problem is that we cannot afford too costly
an emulation for most scenarios.

I do think that kernel-based approaches generate not
only more IOPS but less overhead too.

Den

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/1] Add vhost_blk driver
  2018-11-06 16:03   ` Stefan Hajnoczi
@ 2018-11-06 19:47     ` Vitaly Mayatskih
  0 siblings, 0 replies; 27+ messages in thread
From: Vitaly Mayatskih @ 2018-11-06 19:47 UTC (permalink / raw)
  To: stefanha
  Cc: Michael S . Tsirkin, Jason Wang, Paolo Bonzini, kvm,
	virtualization, linux-kernel

On Tue, Nov 6, 2018 at 11:03 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:

> Did you look at vhost-user-blk?  It does things slightly differently:
> more of the virtio-blk device model is handled by the vhost-user device
> (e.g. config space).  That might be necessary to implement
> virtio_blk_config.writeback properly.

Yes, vhost-user-blk was used as a template.

> > +enum {
> > +     VHOST_BLK_VQ_MAX = 16,
> > +     VHOST_BLK_VQ_MAX_REQS = 128,
> > +};
>
> These limits seem arbitrary and probably too low.

It fits the cache and TLB, since the data structures are statically
allocated. I saw worse performance with bigger max-reqs. I'll make
it configurable.
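
If it ends up configurable, a module parameter is probably the simplest
form. A sketch with illustrative names and defaults:

/* Upper bound on in-flight requests per virtqueue (tunable at load). */
static unsigned int vhost_blk_max_reqs = 128;
module_param(vhost_blk_max_reqs, uint, 0444);
MODULE_PARM_DESC(vhost_blk_max_reqs,
		 "Maximum number of in-flight requests per virtqueue");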

> > +     if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
> > +             bool write = (type == VIRTIO_BLK_T_OUT);
> > +             int nr_seg = (write ? req->out_num : req->in_num) - 1;
> > +             unsigned long sector = le64_to_cpu(req->hdr.sector);
>
> Using little-endian instead of the virtio types means that only VIRTIO
> 1.0 modern devices are supported (older devices may not be
> little-endian!).  In that case you'd need to put the VIRTIO_1 feature
> bit into the features mask.

Yeah, I'm taking my first baby steps in virtio ;) Thanks!

> > +     switch (ioctl) {
> > +     case VHOST_SET_MEM_TABLE:
> > +             vhost_blk_stop(blk);
> > +             ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> > +             break;
>
> Why is this necessary?  Existing vhost devices pass the ioctl through
> without an explicit case for it.

vq->private_data is populated, so vhost_set_vring_num returns -EBUSY
if the ioctl is passed through as is. It could be a bug in vhost, too,
but I don't have enough knowledge.

diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
index aefb9a61fa0f..da3eb041a975 100644
--- a/drivers/vhost/blk.c
+++ b/drivers/vhost/blk.c
@@ -438,7 +438,7 @@ static long vhost_blk_ioctl(struct file *f,
unsigned int ioctl,

        switch (ioctl) {
        case VHOST_SET_MEM_TABLE:
-               vhost_blk_stop(blk);
+//             vhost_blk_stop(blk);
                ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
                break;
        case VHOST_SET_VRING_NUM:

./x86_64-softmmu/qemu-system-x86_64 --enable-kvm -cpu host -smp 4 -m
2G -vnc 192.168.122.104:5 --drive
if=none,id=drive0,format=raw,file=/dev/vde -device
vhost-blk-pci,id=blk0,drive=drive0,num-queues=4
qemu-system-x86_64: vhost_set_vring_num failed: Device or resource busy (16)
qemu-system-x86_64: Error starting vhost: 16

> > +     case VHOST_SET_VRING_NUM:
> > +             if (copy_from_user(&s, argp, sizeof(s)))
> > +                     return -EFAULT;
> > +             ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> > +             if (!ret)
> > +                     blk->num_queues = s.index + 1;
>
> Where is this input checked against ARRAY_SIZE(blk->queue)?

In vhost itself. I capture the parameter only if the vhost ioctl
completes without errors. Perhaps worth a comment.
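
Or even an explicit guard before recording it, a sketch that reuses the
array name from your question (ARRAY_SIZE(blk->queue)):

	case VHOST_SET_VRING_NUM:
		if (copy_from_user(&s, argp, sizeof(s)))
			return -EFAULT;
		/* vhost rejects out-of-range indices as well; the explicit
		 * check just documents the assumption. */
		if (s.index >= ARRAY_SIZE(blk->queue))
			return -EINVAL;
		ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
		if (!ret)
			blk->num_queues = s.index + 1;
		break;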

Thanks for the review, this is exactly what I was hoping for!

--
wbr, Vitaly

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-06 15:40   ` Stefan Hajnoczi
  2018-11-06 18:46     ` Denis Lunev
@ 2018-11-06 20:08     ` Vitaly Mayatskih
  1 sibling, 0 replies; 27+ messages in thread
From: Vitaly Mayatskih @ 2018-11-06 20:08 UTC (permalink / raw)
  To: stefanha
  Cc: Jason Wang, Paolo Bonzini, kvm, virtualization, linux-kernel,
	Kevin Wolf, Michael S . Tsirkin, den

On Tue, Nov 6, 2018 at 10:40 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:

> Previously vhost_blk.ko implementations were basically the same thing as
> the QEMU x-data-plane=on (dedicated thread using Linux AIO), except they
> were using a kernel thread and maybe submitted bios.
>
> The performance differences weren't convincing enough that it seemed
> worthwhile maintaining another code path which loses live migration, I/O
> throttling, image file formats, etc (all the things that QEMU's block
> layer supports).
>
> Two changes since then:
>
> 1. x-data-plane=on has been replaced with a full trip down QEMU's block
> layer (-object iothread,id=iothread0 -device
> virtio-blk-pci,iothread=iothread0,...).  It's slower and not truly
> multiqueue (yet!).
>
> So from this perspective vhost_blk.ko might be more attractive again, at
> least until further QEMU block layer work eliminates the multiqueue and
> performance overheads.

Yes, this work is a direct consequence of the insufficient performance
of virtio-blk's host side. I'm working on a storage driver, but there's
no good way to feed all these IOs into one disk of one VM. The nature
of the storage design dictates the need for very high IOPS as seen by
the VM. This is only one tiny use case of course, but the vhost/QEMU
change is small enough to share.

> 2. SPDK has become available for users who want the best I/O performance
> and are willing to sacrifice CPU cores for polling.
>
> If you want better performance and don't care about QEMU block layer
> features, could you use SPDK?  People who are the target market for
> vhost_blk.ko would probably be willing to use SPDK and it already
> exists...

Yes. Though in my experience SPDK creates more problems most of the
time than it solves ;) What I find very compelling in using a plain
Linux block device is that it is really fast these days (blk-mq) and
the device mapper can be used for even greater flexibility. Device
mapper is less than perfect performance-wise and at some point will
need some work for sure, but it can still push a few million IOPS
through. And it's all standard code with decades-old user APIs.

In fact, the Linux kernel is so good now that our pure-software
solution can push IO at rates up to the limits of fat hardware
(x00 GbE, a bunch of NVMes) without an apparent need for hardware
acceleration. And, without hardware dependencies, it is much more
flexible. The disk interface between the host and VM was the only
major bottleneck.

> From the QEMU userspace perspective, I think the best way to integrate
> vhost_blk.ko is to transparently switch to it when possible.  If the
> user enables QEMU block layer features that are incompatible with
> vhost_blk.ko, then it should fall back to the QEMU block layer
> transparently.

Sounds like an excellent idea! I'll do that. Most of the vhost-blk
support in QEMU is boilerplate code anyway.

> I'm not keen on yet another code path with its own set of limitations
> and having to educate users about how to make the choice.  But if it can
> be integrated transparently as an "accelerator", then it could be
> valuable.

Understood. Agree.

Thanks!
-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/1] vhost: add vhost_blk driver
  2018-11-06  7:13       ` Christoph Hellwig
@ 2018-11-06 21:41         ` Paolo Bonzini
  0 siblings, 0 replies; 27+ messages in thread
From: Paolo Bonzini @ 2018-11-06 21:41 UTC (permalink / raw)
  To: Christoph Hellwig, Jason Wang
  Cc: Vitaly Mayatskih, Michael S . Tsirkin, kvm, virtualization,
	linux-kernel, stefanha

On 06/11/2018 08:13, Christoph Hellwig wrote:
> On Tue, Nov 06, 2018 at 10:45:08AM +0800, Jason Wang wrote:
>>> Storage industry is shifting away from SCSI, which has a scaling
>>> problem.
>>
>> Know little about storage. For scaling, do you mean SCSI protocol itself? If
>> not, it's probably not a real issue for virtio-scsi itself.
> 
> The above is utter bullshit.  There is a big NVMe hype, but it is not
> because "SCSI has a scaling problem", but because the industry can sell
> a new thing, and the standardization body seems easier to work with.

In the case of Linux, there is indeed some performance to be gained by
skipping the SCSI layer in the guest and to a lesser extent in QEMU,
which is why virtio-blk is still in use.  But there's no scaling
problem, it's just extra constant overhead.

Paolo

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2018-11-06 21:41 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-02 18:21 [PATCH 0/1] vhost: add vhost_blk driver Vitaly Mayatskikh
2018-11-02 18:21 ` [PATCH 1/1] Add " Vitaly Mayatskikh
2018-11-02 18:36   ` Michael S. Tsirkin
2018-11-02 19:24     ` Vitaly Mayatskih
2018-11-02 20:38       ` Michael S. Tsirkin
2018-11-03  2:50   ` kbuild test robot
2018-11-06 16:03   ` Stefan Hajnoczi
2018-11-06 19:47     ` Vitaly Mayatskih
2018-11-02 18:26 ` [PATCH 0/1] vhost: add " Michael S. Tsirkin
2018-11-02 18:31   ` Vitaly Mayatskih
2018-11-05 14:02   ` Christian Borntraeger
2018-11-05 14:21     ` Vitaly Mayatskih
2018-11-06 15:40   ` Stefan Hajnoczi
2018-11-06 18:46     ` Denis Lunev
2018-11-06 20:08     ` Vitaly Mayatskih
2018-11-04 11:57 ` Maxim Levitsky
2018-11-04 16:40   ` Vitaly Mayatskih
2018-11-05 11:55     ` Maxim Levitsky
2018-11-05  3:00 ` Jason Wang
2018-11-05  3:23   ` Vitaly Mayatskih
2018-11-06  2:45     ` Jason Wang
2018-11-06  2:56       ` Vitaly Mayatskih
2018-11-06  7:13       ` Christoph Hellwig
2018-11-06 21:41         ` Paolo Bonzini
2018-11-05 15:48   ` Christian Borntraeger
2018-11-05 16:15     ` Vitaly Mayatskih
2018-11-06 15:21       ` Stefan Hajnoczi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).