All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] vhost-blk implementation
@ 2010-03-23  1:00 Badari Pulavarty
  2010-03-23  1:16 ` Anthony Liguori
                   ` (3 more replies)
  0 siblings, 4 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-23  1:00 UTC (permalink / raw)
  To: kvm

[-- Attachment #1: Type: text/plain, Size: 31 bytes --]

Forgot to CC: KVM list earlier

[-- Attachment #2: Forwarded message - [RFC] vhost-blk implementation --]
[-- Type: message/rfc822, Size: 8076 bytes --]

From: Badari Pulavarty <pbadari@us.ibm.com>
To: virtualization@lists.linux-foundation.org, qemu-devel@nongnu.org
Subject: [RFC] vhost-blk implementation
Date: Mon, 22 Mar 2010 17:34:06 -0700
Message-ID: <1269304444.7931.68.camel@badari-desktop>

Hi,

Inspired by vhost-net implementation, I did initial prototype 
of vhost-blk to see if it provides any benefits over QEMU virtio-blk.
I haven't handled all the error cases, fixed naming conventions etc.,
but the implementation is stable to play with. I tried not to deviate
from vhost-net implementation where possible.

NOTE:  Only change I had to make to vhost core code is to 
increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h 

Performance:
=============

I have done simple tests to see how it performs. I got very
encouraging results on sequential read tests. But on sequential
write tests, I see degrade over virtio-blk. I can't figure out and
explain why. Can some one shed light on whats happening here ?

Read Results:
=============
Test does read of 84GB file from the host (through virtio). I unmount
and mount the filesystem on the host to make sure there is nothing
in the page cache..


with vhost-blk:
----------------

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
640000+0 records in
640000+0 records out
83886080000 bytes (84 GB) copied, 126.135 seconds, 665 MB/s

real    2m6.137s
user    0m0.281s
sys     0m14.725s

without vhost-blk: (virtio)
---------------------------

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
640000+0 records in
640000+0 records out
83886080000 bytes (84 GB) copied, 275.466 seconds, 305 MB/s

real    4m35.468s
user    0m0.373s
sys     0m48.074s



Write Results:
==============

I see degraded IO performance when doing sequential IO write
tests with vhost-blk compared to virtio-blk.

# time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct

I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
vhost-blk. Wondering why ?

Comments/flames ? 

Thanks,
Badari


vhost-blk is in-kernel accelerator for virtio-blk. 
At this time, this is a prototype based on virtio-net.
Lots of error handling and clean up needs to be done.
Read performance is pretty good over QEMU virtio-blk, but
write performance is not anywhere close to QEMU virtio-blk.
Why ?

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
---
 drivers/vhost/blk.c |  242 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 242 insertions(+)

Index: net-next/drivers/vhost/blk.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ net-next/drivers/vhost/blk.c	2010-03-22 18:07:18.156584400 -0400
@@ -0,0 +1,242 @@
+ /*
+  * virtio-block server in host kernel.
+  * Inspired by vhost-net and shamlessly ripped code from it :)
+  */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+static int do_handle_io(struct file *file, uint32_t type, uint64_t sector,
+			struct iovec *iov, int in)
+{
+	loff_t pos = sector << 8;
+	int ret = 0;
+
+	if (type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(file, file->f_path.dentry, 1);
+	} else if (type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(file, iov, in, &pos);
+	} else {
+		ret = vfs_readv(file, iov, in, &pos);
+	}
+	return ret;
+}
+
+static void handle_blk(struct vhost_blk *blk)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	unsigned head, out, in;
+	struct virtio_blk_outhdr hdr;
+	int r, nvecs;
+	uint8_t status = 0;
+
+	use_mm(blk->dev.mm);
+	mutex_lock(&vq->mutex);
+
+	vhost_disable_notify(vq);
+
+	for (;;) {
+		head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+					 ARRAY_SIZE(vq->iov),
+					 &out, &in, NULL, NULL);
+		if (head == vq->num) {
+			if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
+				continue;
+			}
+			break;
+		}
+
+		BUG_ON(vq->iov[0].iov_len != 16);
+
+		r = copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr);
+		if (r < 0) {
+			printk("copy from user failed\n");
+			vhost_discard_vq_desc(vq);
+			break;
+		}
+
+		nvecs = out - 1;
+		if (hdr.type == VIRTIO_BLK_T_IN)
+			nvecs = in - 1;
+
+		r = do_handle_io(vq->private_data, hdr.type, hdr.sector, &vq->iov[1], nvecs);
+		status = (r < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+
+		nvecs++;
+		BUG_ON(vq->iov[nvecs].iov_len != 1);
+
+		if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status) < 0) {
+			printk("copy to user failed\n");
+			vhost_discard_vq_desc(vq);
+			break;
+		}
+		vhost_add_used_and_signal(&blk->dev, vq, head, r);
+	}
+        mutex_unlock(&vq->mutex);
+        unuse_mm(blk->dev.mm);
+}
+
+static void vhost_blk_flush(struct vhost_blk *n)
+{
+	vhost_poll_flush(n->poll);
+	vhost_poll_flush(&n->dev.vqs[0].poll);
+}
+
+static void handle_blk_kick(struct work_struct *work)
+{
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	vq = container_of(work, struct vhost_virtqueue, poll.work);
+	blk = container_of(vq->dev, struct vhost_blk, dev);
+	handle_blk(blk);
+}
+
+static void handle_rq_blk(struct work_struct *work)
+{
+	struct vhost_blk *blk;
+	blk = container_of(work, struct vhost_blk, poll[0].work);
+	handle_blk(blk);
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = kmalloc(sizeof *n, GFP_KERNEL);
+	int r;
+	if (!n)
+		return -ENOMEM;
+	n->vqs[0].handle_kick = handle_blk_kick;
+	r = vhost_dev_init(&n->dev, n->vqs, VHOST_BLK_VQ_MAX);
+	if (r < 0) {
+		kfree(n);
+		return r;
+	}
+
+	vhost_poll_init(n->poll, handle_rq_blk, POLLOUT|POLLIN);
+	f->private_data = n;
+	return 0;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = f->private_data;
+
+	fput(n->vqs->private_data);
+	kfree(n);
+	return 0;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
+{
+	struct file *file;
+	struct vhost_virtqueue *vq;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	vq = n->vqs + index;
+	mutex_lock(&vq->mutex);
+	rcu_assign_pointer(vq->private_data, file);
+	mutex_unlock(&vq->mutex);
+	return 0;
+}
+
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+                            unsigned long arg)
+{
+	struct vhost_blk *n = f->private_data;
+	void __user *argp = (void __user *)arg;
+	struct vhost_vring_file backend;
+	int r;
+
+	switch (ioctl) {
+        case VHOST_NET_SET_BACKEND:
+                r = copy_from_user(&backend, argp, sizeof backend);
+                if (r < 0)
+                        return r;
+                return vhost_blk_set_backend(n, backend.index, backend.fd);
+	default:
+		mutex_lock(&n->dev.mutex);
+		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+		vhost_blk_flush(n);
+		mutex_unlock(&n->dev.mutex);
+		return r;
+	}
+}
+
+const static struct file_operations vhost_blk_fops = {
+	.owner          = THIS_MODULE,
+	.release        = vhost_blk_release,
+	.open           = vhost_blk_open,
+	.unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+	234,
+	"vhost-blk",
+	&vhost_blk_fops,
+};
+
+int vhost_blk_init(void)
+{
+	int r = vhost_init();
+	if (r)
+		goto err_init;
+	r = misc_register(&vhost_blk_misc);
+	if (r)
+		goto err_reg;
+	return 0;
+err_reg:
+	vhost_cleanup();
+err_init:
+	return r;
+
+}
+module_init(vhost_blk_init);
+
+void vhost_blk_exit(void)
+{
+	misc_deregister(&vhost_blk_misc);
+	vhost_cleanup();
+}
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("0.0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23  1:00 [RFC] vhost-blk implementation Badari Pulavarty
@ 2010-03-23  1:16 ` Anthony Liguori
  2010-03-23  1:45   ` Badari Pulavarty
  2010-03-23 10:03 ` Avi Kivity
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 48+ messages in thread
From: Anthony Liguori @ 2010-03-23  1:16 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: kvm

On 03/22/2010 08:00 PM, Badari Pulavarty wrote:
> Forgot to CC: KVM list earlier
>    


These virtio results are still with a 2.6.18 kernel with no aio, right?

Regards,

Anthony LIguori

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23  1:16 ` Anthony Liguori
@ 2010-03-23  1:45   ` Badari Pulavarty
  2010-03-23  2:00     ` Anthony Liguori
  0 siblings, 1 reply; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-23  1:45 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm

Anthony Liguori wrote:
> On 03/22/2010 08:00 PM, Badari Pulavarty wrote:
>> Forgot to CC: KVM list earlier
>>    
>
>
> These virtio results are still with a 2.6.18 kernel with no aio, right?
Results are on 2.6.33-rc8-net-next kernel. But not using AIO.

Thanks,
Badari



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23  1:45   ` Badari Pulavarty
@ 2010-03-23  2:00     ` Anthony Liguori
  2010-03-23  2:50       ` Badari Pulavarty
  0 siblings, 1 reply; 48+ messages in thread
From: Anthony Liguori @ 2010-03-23  2:00 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: kvm

On 03/22/2010 08:45 PM, Badari Pulavarty wrote:
> Anthony Liguori wrote:
>> On 03/22/2010 08:00 PM, Badari Pulavarty wrote:
>>> Forgot to CC: KVM list earlier
>>
>>
>> These virtio results are still with a 2.6.18 kernel with no aio, right?
> Results are on 2.6.33-rc8-net-next kernel. But not using AIO.

Can you try with aio and cache=none?

Regards,

Anthony Liguori

> Thanks,
> Badari
>
>


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23  2:00     ` Anthony Liguori
@ 2010-03-23  2:50       ` Badari Pulavarty
  2010-03-23 10:05         ` Avi Kivity
  0 siblings, 1 reply; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-23  2:50 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm

Anthony Liguori wrote:
> On 03/22/2010 08:45 PM, Badari Pulavarty wrote:
>> Anthony Liguori wrote:
>>> On 03/22/2010 08:00 PM, Badari Pulavarty wrote:
>>>> Forgot to CC: KVM list earlier
>>>
>>>
>>> These virtio results are still with a 2.6.18 kernel with no aio, right?
>> Results are on 2.6.33-rc8-net-next kernel. But not using AIO.
>
> Can you try with aio and cache=none?

aio-native,cache=none

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
640000+0 records in
640000+0 records out
83886080000 bytes (84 GB) copied, 470.588 seconds, 170 MB/s

Thanks,
Badari





^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23  1:00 [RFC] vhost-blk implementation Badari Pulavarty
  2010-03-23  1:16 ` Anthony Liguori
@ 2010-03-23 10:03 ` Avi Kivity
  2010-03-23 14:55   ` Badari Pulavarty
  2010-03-24 20:05   ` Christoph Hellwig
  2010-03-23 10:09 ` [RFC] vhost-blk implementation Eran Rom
  2010-03-24 20:04 ` Christoph Hellwig
  3 siblings, 2 replies; 48+ messages in thread
From: Avi Kivity @ 2010-03-23 10:03 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: kvm

On 03/23/2010 03:00 AM, Badari Pulavarty wrote:
> Forgot to CC: KVM list earlier
>    
>
> [RFC] vhost-blk implementation.eml
>
> Subject:
> [RFC] vhost-blk implementation
> From:
> Badari Pulavarty <pbadari@us.ibm.com>
> Date:
> Mon, 22 Mar 2010 17:34:06 -0700
>
> To:
> virtualization@lists.linux-foundation.org, qemu-devel@nongnu.org
>
>
> Hi,
>
> Inspired by vhost-net implementation, I did initial prototype
> of vhost-blk to see if it provides any benefits over QEMU virtio-blk.
> I haven't handled all the error cases, fixed naming conventions etc.,
> but the implementation is stable to play with. I tried not to deviate
> from vhost-net implementation where possible.
>
> NOTE:  Only change I had to make to vhost core code is to
> increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h
>
> Performance:
> =============
>
> I have done simple tests to see how it performs. I got very
> encouraging results on sequential read tests. But on sequential
> write tests, I see degrade over virtio-blk. I can't figure out and
> explain why. Can some one shed light on whats happening here ?
>
> Read Results:
> =============
> Test does read of 84GB file from the host (through virtio). I unmount
> and mount the filesystem on the host to make sure there is nothing
> in the page cache..
>
> +#define VHOST_BLK_VQ_MAX 1
> +
> +struct vhost_blk {
> +	struct vhost_dev dev;
> +	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
> +	struct vhost_poll poll[VHOST_BLK_VQ_MAX];
> +};
> +
> +static int do_handle_io(struct file *file, uint32_t type, uint64_t sector,
> +			struct iovec *iov, int in)
> +{
> +	loff_t pos = sector<<  8;
> +	int ret = 0;
> +
> +	if (type&  VIRTIO_BLK_T_FLUSH)  {
> +		ret = vfs_fsync(file, file->f_path.dentry, 1);
> +	} else if (type&  VIRTIO_BLK_T_OUT) {
> +		ret = vfs_writev(file, iov, in,&pos);
> +	} else {
> +		ret = vfs_readv(file, iov, in,&pos);
> +	}
> +	return ret;
> +}
>    

This should be done asynchronously.  That is likely the cause of write 
performance degradation.  For reads, readahead means that that you're 
async anyway, but writes/syncs are still synchronous.

I also think it should be done at the bio layer.  File I/O is going to 
be slower, if we do vhost-blk we should concentrate on maximum 
performance.  The block layer also exposes more functionality we can use 
(asynchronous barriers for example).

btw, for fairness, cpu measurements should be done from the host side 
and include the vhost thread.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23  2:50       ` Badari Pulavarty
@ 2010-03-23 10:05         ` Avi Kivity
  2010-03-23 14:48           ` Badari Pulavarty
  0 siblings, 1 reply; 48+ messages in thread
From: Avi Kivity @ 2010-03-23 10:05 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Anthony Liguori, kvm

On 03/23/2010 04:50 AM, Badari Pulavarty wrote:
> Anthony Liguori wrote:
>> On 03/22/2010 08:45 PM, Badari Pulavarty wrote:
>>> Anthony Liguori wrote:
>>>> On 03/22/2010 08:00 PM, Badari Pulavarty wrote:
>>>>> Forgot to CC: KVM list earlier
>>>>
>>>>
>>>> These virtio results are still with a 2.6.18 kernel with no aio, 
>>>> right?
>>> Results are on 2.6.33-rc8-net-next kernel. But not using AIO.
>>
>> Can you try with aio and cache=none?
>
> aio-native,cache=none
>
> # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
> 640000+0 records in
> 640000+0 records out
> 83886080000 bytes (84 GB) copied, 470.588 seconds, 170 MB/s
>

You're effectively killing readahead with this.  Please use fio with a 
sequential pattern, queue depth > 1.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23  1:00 [RFC] vhost-blk implementation Badari Pulavarty
  2010-03-23  1:16 ` Anthony Liguori
  2010-03-23 10:03 ` Avi Kivity
@ 2010-03-23 10:09 ` Eran Rom
  2010-03-24 20:04 ` Christoph Hellwig
  3 siblings, 0 replies; 48+ messages in thread
From: Eran Rom @ 2010-03-23 10:09 UTC (permalink / raw)
  To: kvm

> with vhost-blk:
> ----------------
> 
> # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
> 640000+0 records in
> 640000+0 records out
> 83886080000 bytes (84 GB) copied, 126.135 seconds, 665 MB/s
> 
> # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct
> 
Can you please check both read & write with cache=off, bs=128k 
Thanks very much,
Eran 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23 10:05         ` Avi Kivity
@ 2010-03-23 14:48           ` Badari Pulavarty
  0 siblings, 0 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-23 14:48 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Anthony Liguori, kvm

Avi Kivity wrote:
> On 03/23/2010 04:50 AM, Badari Pulavarty wrote:
>> Anthony Liguori wrote:
>>> On 03/22/2010 08:45 PM, Badari Pulavarty wrote:
>>>> Anthony Liguori wrote:
>>>>> On 03/22/2010 08:00 PM, Badari Pulavarty wrote:
>>>>>> Forgot to CC: KVM list earlier
>>>>>
>>>>>
>>>>> These virtio results are still with a 2.6.18 kernel with no aio, 
>>>>> right?
>>>> Results are on 2.6.33-rc8-net-next kernel. But not using AIO.
>>>
>>> Can you try with aio and cache=none?
>>
>> aio-native,cache=none
>>
>> # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
>> 640000+0 records in
>> 640000+0 records out
>> 83886080000 bytes (84 GB) copied, 470.588 seconds, 170 MB/s
>>
>
> You're effectively killing readahead with this.  

Yes. cache=none, aio=native will force it hit the disk all the time. We 
loose all the benefits of
readahead.
> Please use fio with a sequential pattern, queue depth > 1.
>
Will do.

Thanks,
Badari


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23 10:03 ` Avi Kivity
@ 2010-03-23 14:55   ` Badari Pulavarty
  2010-03-23 16:53     ` Avi Kivity
  2010-03-24 20:05   ` Christoph Hellwig
  1 sibling, 1 reply; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-23 14:55 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm

Avi Kivity wrote:
> On 03/23/2010 03:00 AM, Badari Pulavarty wrote:
>> Forgot to CC: KVM list earlier
>>   
>> [RFC] vhost-blk implementation.eml
>>
>> Subject:
>> [RFC] vhost-blk implementation
>> From:
>> Badari Pulavarty <pbadari@us.ibm.com>
>> Date:
>> Mon, 22 Mar 2010 17:34:06 -0700
>>
>> To:
>> virtualization@lists.linux-foundation.org, qemu-devel@nongnu.org
>>
>>
>> Hi,
>>
>> Inspired by vhost-net implementation, I did initial prototype
>> of vhost-blk to see if it provides any benefits over QEMU virtio-blk.
>> I haven't handled all the error cases, fixed naming conventions etc.,
>> but the implementation is stable to play with. I tried not to deviate
>> from vhost-net implementation where possible.
>>
>> NOTE:  Only change I had to make to vhost core code is to
>> increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h
>>
>> Performance:
>> =============
>>
>> I have done simple tests to see how it performs. I got very
>> encouraging results on sequential read tests. But on sequential
>> write tests, I see degrade over virtio-blk. I can't figure out and
>> explain why. Can some one shed light on whats happening here ?
>>
>> Read Results:
>> =============
>> Test does read of 84GB file from the host (through virtio). I unmount
>> and mount the filesystem on the host to make sure there is nothing
>> in the page cache..
>>
>> +#define VHOST_BLK_VQ_MAX 1
>> +
>> +struct vhost_blk {
>> +    struct vhost_dev dev;
>> +    struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
>> +    struct vhost_poll poll[VHOST_BLK_VQ_MAX];
>> +};
>> +
>> +static int do_handle_io(struct file *file, uint32_t type, uint64_t 
>> sector,
>> +            struct iovec *iov, int in)
>> +{
>> +    loff_t pos = sector<<  8;
>> +    int ret = 0;
>> +
>> +    if (type&  VIRTIO_BLK_T_FLUSH)  {
>> +        ret = vfs_fsync(file, file->f_path.dentry, 1);
>> +    } else if (type&  VIRTIO_BLK_T_OUT) {
>> +        ret = vfs_writev(file, iov, in,&pos);
>> +    } else {
>> +        ret = vfs_readv(file, iov, in,&pos);
>> +    }
>> +    return ret;
>> +}
>>    
>
> This should be done asynchronously.  That is likely the cause of write 
> performance degradation.  For reads, readahead means that that you're 
> async anyway, but writes/syncs are still synchronous.
I am not sure what you mean by async here. Even if I use 
f_op->aio_write() its still synchronous (except for DIO).  Since we are 
writing to pagecache and not waiting for write()
to complete, this is the best we can do here.

Do you mean offload write() handling to another thread ?
>
> I also think it should be done at the bio layer.  
I am not sure what you meant here. Do you want to do submit_bio() 
directly ? Its not going to be that simple. Since the sector# is offset 
within the file, one have to do getblocks() on it
to find the real-disk-block#s + we have to do get_user_pages() on these 
iovecs before submitting them to bio.. All of this work is done by 
vfs_write()/vfs_read() anyway.. I am not
sure what you are suggesting here..

> File I/O is going to be slower, if we do vhost-blk we should 
> concentrate on maximum performance.  The block layer also exposes more 
> functionality we can use (asynchronous barriers for example).

>
> btw, for fairness, cpu measurements should be done from the host side 
> and include the vhost thread.
>
Will do.

Thanks,
Badari


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23 14:55   ` Badari Pulavarty
@ 2010-03-23 16:53     ` Avi Kivity
  0 siblings, 0 replies; 48+ messages in thread
From: Avi Kivity @ 2010-03-23 16:53 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: kvm

On 03/23/2010 04:55 PM, Badari Pulavarty wrote:
>> This should be done asynchronously.  That is likely the cause of 
>> write performance degradation.  For reads, readahead means that that 
>> you're async anyway, but writes/syncs are still synchronous.
>
> I am not sure what you mean by async here. Even if I use 
> f_op->aio_write() its still synchronous (except for DIO).  Since we 
> are writing to pagecache and not waiting for write()
> to complete, this is the best we can do here.

It fails with O_DIRECT or O_DSYNC, or if the write cache is full and we 
need to wait on writeout.  Reads that don't hit pagecache will also be 
synchronous.  You could use threads (like qemu) or use the block layer 
(which is async).

>
> Do you mean offload write() handling to another thread ?

Yeah.  Read too.

>>
>> I also think it should be done at the bio layer. 
> I am not sure what you meant here. Do you want to do submit_bio() 
> directly ? Its not going to be that simple. Since the sector# is 
> offset within the file, one have to do getblocks() on it
> to find the real-disk-block#s + we have to do get_user_pages() on 
> these iovecs before submitting them to bio.. All of this work is done 
> by vfs_write()/vfs_read() anyway.. I am not
> sure what you are suggesting here..

For the case where the file is actually a partition, use submit_bio().  
When the file is a file, keep it in qemu, that path is going to be 
slower anyway.


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23  1:00 [RFC] vhost-blk implementation Badari Pulavarty
                   ` (2 preceding siblings ...)
  2010-03-23 10:09 ` [RFC] vhost-blk implementation Eran Rom
@ 2010-03-24 20:04 ` Christoph Hellwig
  2010-03-24 20:22   ` Badari Pulavarty
                     ` (2 more replies)
  3 siblings, 3 replies; 48+ messages in thread
From: Christoph Hellwig @ 2010-03-24 20:04 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: kvm

> Inspired by vhost-net implementation, I did initial prototype 
> of vhost-blk to see if it provides any benefits over QEMU virtio-blk.
> I haven't handled all the error cases, fixed naming conventions etc.,
> but the implementation is stable to play with. I tried not to deviate
> from vhost-net implementation where possible.

Can you also send the qemu side of it?

> with vhost-blk:
> ----------------
> 
> # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
> 640000+0 records in
> 640000+0 records out
> 83886080000 bytes (84 GB) copied, 126.135 seconds, 665 MB/s
> 
> real    2m6.137s
> user    0m0.281s
> sys     0m14.725s
> 
> without vhost-blk: (virtio)
> ---------------------------
> 
> # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
> 640000+0 records in
> 640000+0 records out
> 83886080000 bytes (84 GB) copied, 275.466 seconds, 305 MB/s
> 
> real    4m35.468s
> user    0m0.373s
> sys     0m48.074s

Which caching mode is this?  I assume data=writeback, because otherwise
you'd be doing synchronous I/O directly from the handler.

> +static int do_handle_io(struct file *file, uint32_t type, uint64_t sector,
> +			struct iovec *iov, int in)
> +{
> +	loff_t pos = sector << 8;
> +	int ret = 0;
> +
> +	if (type & VIRTIO_BLK_T_FLUSH)  {
> +		ret = vfs_fsync(file, file->f_path.dentry, 1);
> +	} else if (type & VIRTIO_BLK_T_OUT) {
> +		ret = vfs_writev(file, iov, in, &pos);
> +	} else {
> +		ret = vfs_readv(file, iov, in, &pos);
> +	}
> +	return ret;

I have to admit I don't understand the vhost architecture at all, but
where do the actual data pointers used by the iovecs reside?
vfs_readv/writev expect both the iovec itself and the buffers
pointed to by it to reside in userspace, so just using kernel buffers
here will break badly on architectures with different user/kernel
mappings.  A lot of this is fixable using simple set_fs & co tricks,
but for direct I/O which uses get_user_pages even that will fail badly.

Also it seems like you're doing all the I/O synchronous here?  For
data=writeback operations that could explain the read speedup
as you're avoiding context switches, but for actual write I/O
which has to get data to disk (either directly from vfs_writev or
later through vfs_fsync) this seems like a really bad idea stealing
a lot of guest time that should happen in the background.


Other than that the code seems quite nice and simple, but one huge
problem is that it'll only support raw images, and thus misses out
on all the "nice" image formats used in qemu deployments, especially
qcow2.  It's also missing the ioctl magic we're having in various
places, both for controlling host devices like cdroms and SG
passthrough.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23 10:03 ` Avi Kivity
  2010-03-23 14:55   ` Badari Pulavarty
@ 2010-03-24 20:05   ` Christoph Hellwig
  2010-03-25  6:29     ` Avi Kivity
  2010-03-25 15:00     ` Asdo
  1 sibling, 2 replies; 48+ messages in thread
From: Christoph Hellwig @ 2010-03-24 20:05 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Badari Pulavarty, kvm

On Tue, Mar 23, 2010 at 12:03:14PM +0200, Avi Kivity wrote:
> I also think it should be done at the bio layer.  File I/O is going to  
> be slower, if we do vhost-blk we should concentrate on maximum  
> performance.  The block layer also exposes more functionality we can use  
> (asynchronous barriers for example).

The block layer is more flexible, but that limits you to only stack
directly ontop of a block device, which is extremly inflexible.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-24 20:04 ` Christoph Hellwig
@ 2010-03-24 20:22   ` Badari Pulavarty
  2010-03-25  7:57     ` Avi Kivity
                       ` (2 more replies)
  2010-03-24 20:27   ` Badari Pulavarty
  2010-03-29 15:41   ` Badari Pulavarty
  2 siblings, 3 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-24 20:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: kvm

Christoph Hellwig wrote:
>> Inspired by vhost-net implementation, I did initial prototype 
>> of vhost-blk to see if it provides any benefits over QEMU virtio-blk.
>> I haven't handled all the error cases, fixed naming conventions etc.,
>> but the implementation is stable to play with. I tried not to deviate
>> from vhost-net implementation where possible.
>>     
>
> Can you also send the qemu side of it?
>
>   
>> with vhost-blk:
>> ----------------
>>
>> # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
>> 640000+0 records in
>> 640000+0 records out
>> 83886080000 bytes (84 GB) copied, 126.135 seconds, 665 MB/s
>>
>> real    2m6.137s
>> user    0m0.281s
>> sys     0m14.725s
>>
>> without vhost-blk: (virtio)
>> ---------------------------
>>
>> # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
>> 640000+0 records in
>> 640000+0 records out
>> 83886080000 bytes (84 GB) copied, 275.466 seconds, 305 MB/s
>>
>> real    4m35.468s
>> user    0m0.373s
>> sys     0m48.074s
>>     
>
> Which caching mode is this?  I assume data=writeback, because otherwise
> you'd be doing synchronous I/O directly from the handler.
>   

Yes. This is with default (writeback) cache model. As mentioned earlier, 
readhead is helping here
and most cases, data would be ready in the pagecache.
>   
>> +static int do_handle_io(struct file *file, uint32_t type, uint64_t sector,
>> +			struct iovec *iov, int in)
>> +{
>> +	loff_t pos = sector << 8;
>> +	int ret = 0;
>> +
>> +	if (type & VIRTIO_BLK_T_FLUSH)  {
>> +		ret = vfs_fsync(file, file->f_path.dentry, 1);
>> +	} else if (type & VIRTIO_BLK_T_OUT) {
>> +		ret = vfs_writev(file, iov, in, &pos);
>> +	} else {
>> +		ret = vfs_readv(file, iov, in, &pos);
>> +	}
>> +	return ret;
>>     
>
> I have to admit I don't understand the vhost architecture at all, but
> where do the actual data pointers used by the iovecs reside?
> vfs_readv/writev expect both the iovec itself and the buffers
> pointed to by it to reside in userspace, so just using kernel buffers
> here will break badly on architectures with different user/kernel
> mappings.  A lot of this is fixable using simple set_fs & co tricks,
> but for direct I/O which uses get_user_pages even that will fail badly.
>   
iovecs and buffers are user-space pointers (from the host kernel point 
of view). They are
guest address. So, I don't need to do any set_fs tricks.
> Also it seems like you're doing all the I/O synchronous here?  For
> data=writeback operations that could explain the read speedup
> as you're avoiding context switches, but for actual write I/O
> which has to get data to disk (either directly from vfs_writev or
> later through vfs_fsync) this seems like a really bad idea stealing
> a lot of guest time that should happen in the background.
>   
Yes. QEMU virtio-blk is batching up all the writes and handing of the 
work to another
thread. When the writes() are complete, its sending a status completion. 
Since I am
doing everything synchronous (even though its write to pagecache) one 
request at a
time, that explains the slow down. We need to find a way to

1) batch IO writes together
2) hand off to another thread to do the IO, so that vhost-thread can handle
next set of requests
3) update the status on the completion

What do should I do here ? I can create bunch of kernel threads to do 
the IO for me.
Or some how fit and reuse AIO io_submit() mechanism. Whats the best way 
here ?
I hate do duplicate all the code VFS is doing.
>
> Other than that the code seems quite nice and simple, but one huge
> problem is that it'll only support raw images, and thus misses out
> on all the "nice" image formats used in qemu deployments, especially
> qcow2.  It's also missing the ioctl magic we're having in various
> places, both for controlling host devices like cdroms and SG
> passthrough.
>   
True... unfortunately, I don't understand all of those (qcow2) details 
yet !! I need to read up on those,
to even make a comment :(

Thanks,
Badari



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-24 20:04 ` Christoph Hellwig
  2010-03-24 20:22   ` Badari Pulavarty
@ 2010-03-24 20:27   ` Badari Pulavarty
  2010-03-29 15:41   ` Badari Pulavarty
  2 siblings, 0 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-24 20:27 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: kvm

Christoph Hellwig wrote:
>> Inspired by vhost-net implementation, I did initial prototype 
>> of vhost-blk to see if it provides any benefits over QEMU virtio-blk.
>> I haven't handled all the error cases, fixed naming conventions etc.,
>> but the implementation is stable to play with. I tried not to deviate
>> from vhost-net implementation where possible.
>>     
>
> Can you also send the qemu side of it?
>   
Its pretty hacky and based it on old patch (vhost-net) from MST for 
simplicity.
I haven't focused on cleaning it up and I will re-base it on MST's 
latest code
once it gets into QEMU.

Thanks,
Badari

---
 hw/virtio-blk.c |  199 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 199 insertions(+)

Index: vhost/hw/virtio-blk.c
===================================================================
--- vhost.orig/hw/virtio-blk.c	2010-02-25 16:47:04.000000000 -0500
+++ vhost/hw/virtio-blk.c	2010-03-17 14:07:26.477430740 -0400
@@ -18,6 +18,7 @@
 #ifdef __linux__
 # include <scsi/sg.h>
 #endif
+#include <kvm.h>

 typedef struct VirtIOBlock
 {
@@ -28,8 +29,13 @@
     char serial_str[BLOCK_SERIAL_STRLEN + 1];
     QEMUBH *bh;
     size_t config_size;
+    uint8_t vhost_started;
 } VirtIOBlock;

+typedef struct BDRVRawState {
+    int fd;
+} BDRVRawState;
+
 static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
 {
     return (VirtIOBlock *)vdev;
@@ -501,6 +507,198 @@
     return 0;
 }

+#if 1
+#include "linux/vhost.h"
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include "vhost.h"
+
+int vhost_blk_fd;
+
+struct slot_info {
+        unsigned long phys_addr;
+        unsigned long len;
+        unsigned long userspace_addr;
+        unsigned flags;
+        int logging_count;
+};
+
+extern struct slot_info slots[KVM_MAX_NUM_MEM_REGIONS];
+
+static int vhost_blk_start(struct VirtIODevice *vdev)
+{
+	target_phys_addr_t s, l, a;
+	int r, num, idx = 0;
+	struct vhost_vring_state state;
+	struct vhost_vring_file file;
+       struct vhost_vring_addr addr;
+	unsigned long long used_phys;
+	void *desc, *avail, *used;
+	int i, n =0;
+	struct VirtQueue *q = virtio_queue(vdev, idx);
+    	VirtIOBlock *vb = to_virtio_blk(vdev);
+	struct vhost_memory *mem;
+	BDRVRawState *st = vb->bs->opaque;
+
+	vhost_blk_fd = open("/dev/vhost-blk", O_RDWR);
+	if (vhost_blk_fd < 0) {
+		fprintf(stderr, "unable to open vhost-blk\n");
+		return -errno;
+	}
+
+	r = ioctl(vhost_blk_fd, VHOST_SET_OWNER, NULL);
+        if (r < 0) {
+		fprintf(stderr, "ioctl VHOST_SET_OWNER failed\n");
+                return -errno;
+	}
+
+        for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
+                if (!slots[i].len ||
+			(slots[i].flags & KVM_MEM_LOG_DIRTY_PAGES)) {
+		                       continue;
+                }
+                ++n;
+        }
+
+        mem = qemu_mallocz(offsetof(struct vhost_memory, regions) +
+                           n * sizeof(struct vhost_memory_region));
+        if (!mem)
+                return -ENOMEM;
+
+        mem->nregions = n;
+        n = 0;
+        for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
+                if (!slots[i].len || (slots[i].flags &
+			KVM_MEM_LOG_DIRTY_PAGES)) {
+                        continue;
+                }
+                mem->regions[n].guest_phys_addr = slots[i].phys_addr;
+                mem->regions[n].memory_size = slots[i].len;
+                mem->regions[n].userspace_addr = slots[i].userspace_addr;
+                ++n;
+        }
+
+        r = ioctl(vhost_blk_fd, VHOST_SET_MEM_TABLE, mem);
+        if (r < 0)
+                return -errno;
+
+	state.index = idx;
+	num = state.num = virtio_queue_get_num(vdev, idx);
+	r = ioctl(vhost_blk_fd, VHOST_SET_VRING_NUM, &state);
+        if (r) {
+		fprintf(stderr, "ioctl VHOST_SET_VRING_NUM failed\n");
+                return -errno;
+        }
+
+	state.num = virtio_queue_last_avail_idx(vdev, idx);
+       r = ioctl(vhost_blk_fd, VHOST_SET_VRING_BASE, &state);
+       if (r) {
+		fprintf(stderr, "ioctl VHOST_SET_VRING_BASE failed\n");
+                return -errno;
+       }
+
+	s = l = sizeof(struct vring_desc) * num;
+	a = virtio_queue_get_desc(vdev, idx);
+	desc = cpu_physical_memory_map(a, &l, 0);
+       if (!desc || l != s) {
+                r = -ENOMEM;
+                goto fail_alloc;
+       }
+       s = l = offsetof(struct vring_avail, ring) +
+                sizeof(u_int64_t) * num;
+        a = virtio_queue_get_avail(vdev, idx);
+        avail = cpu_physical_memory_map(a, &l, 0);
+        if (!avail || l != s) {
+                r = -ENOMEM;
+                goto fail_alloc;
+        }
+        s = l = offsetof(struct vring_used, ring) +
+                sizeof(struct vring_used_elem) * num;
+        used_phys = a = virtio_queue_get_used(vdev, idx);
+        used = cpu_physical_memory_map(a, &l, 1);
+        if (!used || l != s) {
+                r = -ENOMEM;
+                goto fail_alloc;
+        }
+
+ 	addr.index = idx,
+	addr.desc_user_addr = (u_int64_t)(unsigned long)desc,
+	addr.avail_user_addr = (u_int64_t)(unsigned long)avail,
+	addr.used_user_addr = (u_int64_t)(unsigned long)used,
+	addr.log_guest_addr = used_phys,
+	addr.flags = 0;
+        r = ioctl(vhost_blk_fd, VHOST_SET_VRING_ADDR, &addr);
+        if (r < 0) {
+		fprintf(stderr, "ioctl VHOST_SET_VRING_ADDR failed\n");
+		r = -errno;
+		goto fail_alloc;
+        }
+	if (!vdev->binding->guest_notifier || !vdev->binding->host_notifier) {
+                fprintf(stderr, "binding does not support irqfd/queuefd\n");
+                r = -ENOSYS;
+                goto fail_alloc;
+        }
+        r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, true);
+        if (r < 0) {
+                fprintf(stderr, "Error binding guest notifier: %d\n", -r);
+                goto fail_guest_notifier;
+        }
+
+        r = vdev->binding->host_notifier(vdev->binding_opaque, idx, true);
+        if (r < 0) {
+                fprintf(stderr, "Error binding host notifier: %d\n", -r);
+                goto fail_host_notifier;
+        }
+
+	 file.index = idx;
+        file.fd = event_notifier_get_fd(virtio_queue_host_notifier(q));
+        r = ioctl(vhost_blk_fd, VHOST_SET_VRING_KICK, &file);
+        if (r) {
+                goto fail_kick;
+        }
+
+        file.fd = event_notifier_get_fd(virtio_queue_guest_notifier(q));
+        r = ioctl(vhost_blk_fd, VHOST_SET_VRING_CALL, &file);
+        if (r) {
+                goto fail_call;
+        }
+        file.fd =  st->fd;
+        r = ioctl(vhost_blk_fd, VHOST_NET_SET_BACKEND, &file);
+        if (r) {
+		r = -errno;
+                goto fail_call;
+	}
+	return 0;
+fail_call:
+fail_kick:
+        vdev->binding->host_notifier(vdev->binding_opaque, idx, false);
+fail_host_notifier:
+        vdev->binding->guest_notifier(vdev->binding_opaque, idx, false);
+fail_guest_notifier:
+fail_alloc:
+        return r;
+}
+
+static void virtio_blk_set_status(struct VirtIODevice *vdev)
+{
+	VirtIOBlock *s = to_virtio_blk(vdev);
+
+	if (s->vhost_started)
+		return;
+
+	if (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) {
+		int r = vhost_blk_start(vdev);
+		if (r < 0) {
+			fprintf(stderr, "unable to start vhost blk: %d\n", r);
+		} else {
+			s->vhost_started = 1;
+		}
+	}
+}
+
+#endif
+
 VirtIODevice *virtio_blk_init(DeviceState *dev, DriveInfo *dinfo)
 {
     VirtIOBlock *s;
@@ -517,6 +715,7 @@
     s->config_size = size;
     s->vdev.get_config = virtio_blk_update_config;
     s->vdev.get_features = virtio_blk_get_features;
+    s->vdev.set_status = virtio_blk_set_status;
     s->vdev.reset = virtio_blk_reset;
     s->bs = dinfo->bdrv;
     s->rq = NULL;





^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-24 20:05   ` Christoph Hellwig
@ 2010-03-25  6:29     ` Avi Kivity
  2010-03-25 15:48       ` Christoph Hellwig
  2010-03-25 15:00     ` Asdo
  1 sibling, 1 reply; 48+ messages in thread
From: Avi Kivity @ 2010-03-25  6:29 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Badari Pulavarty, kvm

On 03/24/2010 10:05 PM, Christoph Hellwig wrote:
> On Tue, Mar 23, 2010 at 12:03:14PM +0200, Avi Kivity wrote:
>    
>> I also think it should be done at the bio layer.  File I/O is going to
>> be slower, if we do vhost-blk we should concentrate on maximum
>> performance.  The block layer also exposes more functionality we can use
>> (asynchronous barriers for example).
>>      
> The block layer is more flexible, but that limits you to only stack
> directly ontop of a block device, which is extremly inflexible.
>    

We still have a virtio implementation in userspace for file-based images.

In any case, the file APIs are not asynchronous so we'll need a thread 
pool.  That will probably minimize the difference in performance between 
the userspace and kernel implementations.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-24 20:22   ` Badari Pulavarty
@ 2010-03-25  7:57     ` Avi Kivity
  2010-03-25 14:36       ` Badari Pulavarty
  2010-03-25 15:57     ` Christoph Hellwig
  2010-04-05 19:23     ` Christoph Hellwig
  2 siblings, 1 reply; 48+ messages in thread
From: Avi Kivity @ 2010-03-25  7:57 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Christoph Hellwig, kvm

On 03/24/2010 10:22 PM, Badari Pulavarty wrote:
>> Which caching mode is this?  I assume data=writeback, because otherwise
>> you'd be doing synchronous I/O directly from the handler.
>
>
> Yes. This is with default (writeback) cache model. As mentioned 
> earlier, readhead is helping here
> and most cases, data would be ready in the pagecache.


btw, relying on readahead is problematic.  The guest already issues 
readahead requests, and the host will issue readahead based on that 
readahead, reading far into the future.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-25  7:57     ` Avi Kivity
@ 2010-03-25 14:36       ` Badari Pulavarty
  0 siblings, 0 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-25 14:36 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Christoph Hellwig, kvm

Avi Kivity wrote:
> On 03/24/2010 10:22 PM, Badari Pulavarty wrote:
>>> Which caching mode is this?  I assume data=writeback, because otherwise
>>> you'd be doing synchronous I/O directly from the handler.
>>
>>
>> Yes. This is with default (writeback) cache model. As mentioned 
>> earlier, readhead is helping here
>> and most cases, data would be ready in the pagecache.
>
>
> btw, relying on readahead is problematic.  The guest already issues 
> readahead requests, and the host will issue readahead based on that 
> readahead, reading far into the future.
>
Sure. In my tests, I did O_DIRECT in the guest - so there is no 
readahead in the guest.

Thanks,
Badari


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-24 20:05   ` Christoph Hellwig
  2010-03-25  6:29     ` Avi Kivity
@ 2010-03-25 15:00     ` Asdo
  2010-04-05 19:59       ` Christoph Hellwig
  1 sibling, 1 reply; 48+ messages in thread
From: Asdo @ 2010-03-25 15:00 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Avi Kivity, Badari Pulavarty, kvm

Avi Kivity wrote:

> For the case where the file is actually a partition, use 
> submit_bio().  When the file is a file, keep it in qemu, that path is 
> going to be slower anyway. 
[CUT]

Christoph Hellwig wrote:
> On Tue, Mar 23, 2010 at 12:03:14PM +0200, Avi Kivity wrote:
>   
>> I also think it should be done at the bio layer.  File I/O is going to  
>> be slower, if we do vhost-blk we should concentrate on maximum  
>> performance.  The block layer also exposes more functionality we can use  
>> (asynchronous barriers for example).
>>     
>
> The block layer is more flexible, but that limits you to only stack
> directly ontop of a block device, which is extremly inflexible.
>   

Would the loop device provide the features of a block device? I recall 
barrier support at least has been added recently.
Is it recommended to run kvm on a loopback mounted file compared to on a 
raw file?


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-25  6:29     ` Avi Kivity
@ 2010-03-25 15:48       ` Christoph Hellwig
  2010-03-25 15:51         ` Avi Kivity
  0 siblings, 1 reply; 48+ messages in thread
From: Christoph Hellwig @ 2010-03-25 15:48 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Christoph Hellwig, Badari Pulavarty, kvm

On Thu, Mar 25, 2010 at 08:29:03AM +0200, Avi Kivity wrote:
> We still have a virtio implementation in userspace for file-based images.
>
> In any case, the file APIs are not asynchronous so we'll need a thread  
> pool.  That will probably minimize the difference in performance between  
> the userspace and kernel implementations.

The kernel has "real" aio when doing direct I/O, although currently we
can't easily use it from kernelspace.  Thejre's a few people who've been
looking into making it usable for that, though.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-25 15:48       ` Christoph Hellwig
@ 2010-03-25 15:51         ` Avi Kivity
  0 siblings, 0 replies; 48+ messages in thread
From: Avi Kivity @ 2010-03-25 15:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Badari Pulavarty, kvm

On 03/25/2010 05:48 PM, Christoph Hellwig wrote:
> On Thu, Mar 25, 2010 at 08:29:03AM +0200, Avi Kivity wrote:
>    
>> We still have a virtio implementation in userspace for file-based images.
>>
>> In any case, the file APIs are not asynchronous so we'll need a thread
>> pool.  That will probably minimize the difference in performance between
>> the userspace and kernel implementations.
>>      
> The kernel has "real" aio when doing direct I/O, although currently we
> can't easily use it from kernelspace.  Thejre's a few people who've been
> looking into making it usable for that, though.
>    

Interesting.  Are there limitations on supported filesystems, or 
allocating vs. non-allocating writes?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-24 20:22   ` Badari Pulavarty
  2010-03-25  7:57     ` Avi Kivity
@ 2010-03-25 15:57     ` Christoph Hellwig
  2010-03-26 18:53       ` Eran Rom
  2010-04-05 19:23     ` Christoph Hellwig
  2 siblings, 1 reply; 48+ messages in thread
From: Christoph Hellwig @ 2010-03-25 15:57 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Christoph Hellwig, kvm

On Wed, Mar 24, 2010 at 01:22:37PM -0700, Badari Pulavarty wrote:
> Yes. This is with default (writeback) cache model. As mentioned earlier,  
> readhead is helping here
> and most cases, data would be ready in the pagecache.

Ok.  cache=writeback performance is something I haven't bothered looking
at at all.  For cache=none any streaming write or random workload with
large enough record sizes got basically the same performance as native
using kernel aio, and same for write but slightly degraded for reads
using the thread pool.  See my attached JLS presentation for some
numbers.

>>   
> iovecs and buffers are user-space pointers (from the host kernel point  
> of view). They are
> guest address. So, I don't need to do any set_fs tricks.

Right now they're not ceclared as such, so sparse would complain.

> Yes. QEMU virtio-blk is batching up all the writes and handing of the  
> work to another
> thread.

If using the thread pool.  If using kernel aio it performs the io_submit
system call directly.

> What do should I do here ? I can create bunch of kernel threads to do  
> the IO for me.
> Or some how fit and reuse AIO io_submit() mechanism. Whats the best way  
> here ?
> I hate do duplicate all the code VFS is doing.

The only thing you can do currently is adding a thread pool.  It might
be able to use io_submit for the O_DIRECT case with some refactoring.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-25 15:57     ` Christoph Hellwig
@ 2010-03-26 18:53       ` Eran Rom
  2010-04-08 16:17         ` Stefan Hajnoczi
  0 siblings, 1 reply; 48+ messages in thread
From: Eran Rom @ 2010-03-26 18:53 UTC (permalink / raw)
  To: kvm

Christoph Hellwig <hch <at> infradead.org> writes:


> Ok.  cache=writeback performance is something I haven't bothered looking
> at at all.  For cache=none any streaming write or random workload with
> large enough record sizes got basically the same performance as native
> using kernel aio, and same for write but slightly degraded for reads
> using the thread pool.  See my attached JLS presentation for some
> numbers.
> 
Looks like the presentation did not make it...
Thanks,
Eran


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-24 20:04 ` Christoph Hellwig
  2010-03-24 20:22   ` Badari Pulavarty
  2010-03-24 20:27   ` Badari Pulavarty
@ 2010-03-29 15:41   ` Badari Pulavarty
  2010-03-29 18:20     ` Chris Wright
  2010-04-05 14:22     ` Stefan Hajnoczi
  2 siblings, 2 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-29 15:41 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: kvm

Hi Christoph,

I am wondering if you can provide your thoughts here..

I modified my vhost-blk implementation to offload work to
work_queues instead of doing synchronously. Infact, I tried
to spread the work across all the CPUs. But to my surprise,
this did not improve the performance compared to virtio-blk.

I see vhost-blk taking more interrupts and context switches
compared to virtio-blk. What is virtio-blk doing which I
am not able to from vhost-blk ???

Thanks,
Badari


vhost-blk

procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 3  1   8920  56076  20760 5603556    0  104   196 79826 17164 13912  0  5 65 30  0
 2  4   9488  57216  20744 5605616    0  114   195 81120 17397 13824  0  5 65 30  0
 2  2  10028  68476  20728 5594764    0  108   206 80318 17162 13845  0  5 65 30  0
 0  4  10560  70856  20708 5593088    0  106   205 82363 17402 13904  0  5 65 30  0
 1  3  10948  80380  20672 5584452    0   78   178 79714 17113 13875  0  5 66 29  0

qemu virtio-blk:

procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  1  14124  57456   5144 4924060    0    0   139 142546 11287 9312  1  4 80 15  0
 0  2  14124  56736   5148 4927396    0    0   146 142968 11283 9248  1  4 80 15  0
 0  1  14124  56712   5384 4927020    0    0    74 150738 11182 9327  1  4 80 16  0
 1  1  14124  55496   5392 4927904    0    0     2 159902 11172 9401  1  3 79 17  0
 0  1  14124  55968   5408 4927232    0    0     0 159202 11212 9325  1  3 80 16  0

---
 drivers/vhost/blk.c |  310 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 310 insertions(+)

Index: net-next/drivers/vhost/blk.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ net-next/drivers/vhost/blk.c	2010-03-25 20:06:57.484054770 -0400
@@ -0,0 +1,310 @@
+ /*
+  * virtio-block server in host kernel.
+  * Inspired by vhost-net and shamlessly ripped code from it :)
+  */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+
+#if 0
+#define myprintk(fmt, ...) printk(pr_fmt(fmt), ##__VA_ARGS__)
+#else
+#define myprintk(fmt, ...)
+#endif
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+struct vhost_blk_io {
+	struct work_struct work;
+	struct vhost_blk *blk;
+	struct file *file;
+	int head;
+	uint32_t type;
+	uint64_t sector;
+	struct iovec *iov;
+	int nvecs;
+};
+
+static struct workqueue_struct *vblk_workqueue;
+
+static void handle_io_work(struct work_struct *work)
+{
+	struct vhost_blk_io *vbio;
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	int i, ret = 0;
+	loff_t pos;
+	uint8_t status = 0;
+
+	vbio = container_of(work, struct vhost_blk_io, work);
+	blk = vbio->blk;
+	vq = &blk->dev.vqs[0];
+	pos = vbio->sector << 8;
+
+	use_mm(blk->dev.mm);
+
+	if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
+	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	} else {
+		ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	}
+
+	status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+	if (copy_to_user(vbio->iov[vbio->nvecs].iov_base, &status, sizeof status) < 0) {
+		printk("copy to user failed\n");
+		vhost_discard_vq_desc(vq);
+        	unuse_mm(blk->dev.mm);
+		return;
+	}
+	mutex_lock(&vq->mutex);
+	vhost_add_used_and_signal(&blk->dev, vq, vbio->head, ret);
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+	kfree(vbio);
+}
+
+static int cpu = 0;
+static int handoff_io(struct vhost_blk *blk, int head,
+			uint32_t type, uint64_t sector,
+			struct iovec *iov, int nvecs)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	struct vhost_blk_io *vbio;
+
+	vbio = kmalloc(sizeof(struct vhost_blk_io), GFP_KERNEL);
+	if (!vbio)
+		return -ENOMEM;
+
+	INIT_WORK(&vbio->work, handle_io_work);
+	vbio->blk = blk;
+	vbio->file = vq->private_data;
+	vbio->head = head;
+	vbio->type = type;
+	vbio->sector = sector;
+	vbio->iov = iov;
+	vbio->nvecs = nvecs;
+
+	cpu = cpumask_next(cpu, cpu_online_mask);
+	if (cpu >= nr_cpu_ids)
+		cpu = cpumask_first(cpu_online_mask);
+	queue_work_on(cpu, vblk_workqueue, &vbio->work);
+
+	return 0;
+}
+
+
+static void handle_blk(struct vhost_blk *blk)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	unsigned head, out, in;
+	struct virtio_blk_outhdr hdr;
+	int r, nvecs;
+
+	use_mm(blk->dev.mm);
+	mutex_lock(&vq->mutex);
+
+	vhost_disable_notify(vq);
+
+	for (;;) {
+		head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+					 ARRAY_SIZE(vq->iov),
+					 &out, &in, NULL, NULL);
+		if (head == vq->num) {
+			if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
+				continue;
+			}
+			break;
+		}
+
+		BUG_ON(vq->iov[0].iov_len != 16);
+
+		r = copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr);
+		if (r < 0) {
+			printk("copy from user failed\n");
+			vhost_discard_vq_desc(vq);
+			break;
+		}
+
+		nvecs = out - 1;
+		if (hdr.type == VIRTIO_BLK_T_IN)
+			nvecs = in - 1;
+
+		BUG_ON(vq->iov[nvecs+1].iov_len != 1);
+		r = handoff_io(blk, head, hdr.type, hdr.sector, &vq->iov[1], nvecs);
+		if (r < 0) {
+			vhost_discard_vq_desc(vq);
+			break;
+		}
+	}
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+}
+
+static void vhost_blk_flush(struct vhost_blk *n)
+{
+	vhost_poll_flush(n->poll);
+	vhost_poll_flush(&n->dev.vqs[0].poll);
+}
+
+static void handle_blk_kick(struct work_struct *work)
+{
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	vq = container_of(work, struct vhost_virtqueue, poll.work);
+	blk = container_of(vq->dev, struct vhost_blk, dev);
+	handle_blk(blk);
+}
+
+static void handle_rq_blk(struct work_struct *work)
+{
+	struct vhost_blk *blk;
+	blk = container_of(work, struct vhost_blk, poll[0].work);
+	handle_blk(blk);
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = kmalloc(sizeof *n, GFP_KERNEL);
+	int r;
+	if (!n)
+		return -ENOMEM;
+	n->vqs[0].handle_kick = handle_blk_kick;
+	r = vhost_dev_init(&n->dev, n->vqs, VHOST_BLK_VQ_MAX);
+	if (r < 0) {
+		kfree(n);
+		return r;
+	}
+
+	vhost_poll_init(n->poll, handle_rq_blk, POLLOUT|POLLIN);
+	f->private_data = n;
+	return 0;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = f->private_data;
+
+	fput(n->vqs->private_data);
+	kfree(n);
+	return 0;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
+{
+	struct file *file;
+	struct vhost_virtqueue *vq;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	vq = n->vqs + index;
+	mutex_lock(&vq->mutex);
+	rcu_assign_pointer(vq->private_data, file);
+	mutex_unlock(&vq->mutex);
+	return 0;
+}
+
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+                            unsigned long arg)
+{
+	struct vhost_blk *n = f->private_data;
+	void __user *argp = (void __user *)arg;
+	struct vhost_vring_file backend;
+	int r;
+
+	switch (ioctl) {
+        case VHOST_NET_SET_BACKEND:
+		r = copy_from_user(&backend, argp, sizeof backend);
+		if (r <	0)
+			return r;
+		return vhost_blk_set_backend(n, backend.index, backend.fd);
+	default:
+		mutex_lock(&n->dev.mutex);
+		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+		vhost_blk_flush(n);
+		mutex_unlock(&n->dev.mutex);
+		return r;
+	}
+}
+
+const static struct file_operations vhost_blk_fops = {
+	.owner          = THIS_MODULE,
+	.release        = vhost_blk_release,
+	.open           = vhost_blk_open,
+	.unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+	234,
+	"vhost-blk",
+	&vhost_blk_fops,
+};
+
+int vhost_blk_init(void)
+{
+	int r = vhost_init();
+	if (r)
+		goto err_init;
+
+	vblk_workqueue = create_workqueue("vblk");
+	if (!vblk_workqueue) {
+		r = -ENOMEM;
+		goto err_vblk;
+	}
+
+	r = misc_register(&vhost_blk_misc);
+	if (r)
+		goto err_reg;
+	return 0;
+err_reg:
+	destroy_workqueue(vblk_workqueue);
+err_vblk:
+	vhost_cleanup();
+err_init:
+	return r;
+
+}
+module_init(vhost_blk_init);
+
+void vhost_blk_exit(void)
+{
+	misc_deregister(&vhost_blk_misc);
+	destroy_workqueue(vblk_workqueue);
+	vhost_cleanup();
+}
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("0.0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-29 15:41   ` Badari Pulavarty
@ 2010-03-29 18:20     ` Chris Wright
  2010-03-29 20:37       ` Avi Kivity
  2010-04-05 14:22     ` Stefan Hajnoczi
  1 sibling, 1 reply; 48+ messages in thread
From: Chris Wright @ 2010-03-29 18:20 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Christoph Hellwig, kvm

* Badari Pulavarty (pbadari@us.ibm.com) wrote:
> I modified my vhost-blk implementation to offload work to
> work_queues instead of doing synchronously. Infact, I tried
> to spread the work across all the CPUs. But to my surprise,
> this did not improve the performance compared to virtio-blk.
> 
> I see vhost-blk taking more interrupts and context switches
> compared to virtio-blk. What is virtio-blk doing which I
> am not able to from vhost-blk ???

Your io wait time is twice as long and your throughput is about half.
I think the qmeu block submission does an extra attempt at merging
requests.  Does blktrace tell you anything interesting?

> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
>  r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
>  3  1   8920  56076  20760 5603556    0  104   196 79826 17164 13912  0  5 65 30  0
>  2  4   9488  57216  20744 5605616    0  114   195 81120 17397 13824  0  5 65 30  0
>  2  2  10028  68476  20728 5594764    0  108   206 80318 17162 13845  0  5 65 30  0
>  0  4  10560  70856  20708 5593088    0  106   205 82363 17402 13904  0  5 65 30  0
>  1  3  10948  80380  20672 5584452    0   78   178 79714 17113 13875  0  5 66 29  0
> 
> qemu virtio-blk:
> 
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
>  r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
>  0  1  14124  57456   5144 4924060    0    0   139 142546 11287 9312  1  4 80 15  0
>  0  2  14124  56736   5148 4927396    0    0   146 142968 11283 9248  1  4 80 15  0
>  0  1  14124  56712   5384 4927020    0    0    74 150738 11182 9327  1  4 80 16  0
>  1  1  14124  55496   5392 4927904    0    0     2 159902 11172 9401  1  3 79 17  0
>  0  1  14124  55968   5408 4927232    0    0     0 159202 11212 9325  1  3 80 16  0

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-29 18:20     ` Chris Wright
@ 2010-03-29 20:37       ` Avi Kivity
  2010-03-29 22:51         ` Badari Pulavarty
  0 siblings, 1 reply; 48+ messages in thread
From: Avi Kivity @ 2010-03-29 20:37 UTC (permalink / raw)
  To: Chris Wright; +Cc: Badari Pulavarty, Christoph Hellwig, kvm

On 03/29/2010 09:20 PM, Chris Wright wrote:
> * Badari Pulavarty (pbadari@us.ibm.com) wrote:
>    
>> I modified my vhost-blk implementation to offload work to
>> work_queues instead of doing synchronously. Infact, I tried
>> to spread the work across all the CPUs. But to my surprise,
>> this did not improve the performance compared to virtio-blk.
>>
>> I see vhost-blk taking more interrupts and context switches
>> compared to virtio-blk. What is virtio-blk doing which I
>> am not able to from vhost-blk ???
>>      
> Your io wait time is twice as long and your throughput is about half.
> I think the qmeu block submission does an extra attempt at merging
> requests.  Does blktrace tell you anything interesting?
>    

It does.  I suggest using fio O_DIRECT random access patterns to avoid 
such issues.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-29 20:37       ` Avi Kivity
@ 2010-03-29 22:51         ` Badari Pulavarty
  2010-03-29 23:56           ` Chris Wright
  2010-03-30 12:43           ` Avi Kivity
  0 siblings, 2 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-29 22:51 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Chris Wright, Christoph Hellwig, kvm

On Mon, 2010-03-29 at 23:37 +0300, Avi Kivity wrote:
> On 03/29/2010 09:20 PM, Chris Wright wrote:
> > * Badari Pulavarty (pbadari@us.ibm.com) wrote:
> >    
> >> I modified my vhost-blk implementation to offload work to
> >> work_queues instead of doing synchronously. Infact, I tried
> >> to spread the work across all the CPUs. But to my surprise,
> >> this did not improve the performance compared to virtio-blk.
> >>
> >> I see vhost-blk taking more interrupts and context switches
> >> compared to virtio-blk. What is virtio-blk doing which I
> >> am not able to from vhost-blk ???
> >>      
> > Your io wait time is twice as long and your throughput is about half.
> > I think the qmeu block submission does an extra attempt at merging
> > requests.  Does blktrace tell you anything interesting?
> >    

Yes. I see that in my testcase (2M writes) - QEMU is pickup 512K
requests from the virtio ring and merging them back to 2M before
submitting them. 

Unfortunately, I can't do that quite easily in vhost-blk. QEMU
does re-creates iovecs for the merged IO. I have to come up with
a scheme to do this :(

> It does.  I suggest using fio O_DIRECT random access patterns to avoid 
> such issues.

Well, I am not trying to come up with a test case where vhost-blk
performs better than virtio-blk. I am trying to understand where
and why vhost-blk performnce worse than virtio-blk.


Thanks,
Badari



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-29 22:51         ` Badari Pulavarty
@ 2010-03-29 23:56           ` Chris Wright
  2010-03-30 12:43           ` Avi Kivity
  1 sibling, 0 replies; 48+ messages in thread
From: Chris Wright @ 2010-03-29 23:56 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Avi Kivity, Chris Wright, Christoph Hellwig, kvm

* Badari Pulavarty (pbadari@us.ibm.com) wrote:
> On Mon, 2010-03-29 at 23:37 +0300, Avi Kivity wrote:
> > On 03/29/2010 09:20 PM, Chris Wright wrote:
> > > Your io wait time is twice as long and your throughput is about half.
> > > I think the qmeu block submission does an extra attempt at merging
> > > requests.  Does blktrace tell you anything interesting?
> 
> Yes. I see that in my testcase (2M writes) - QEMU is pickup 512K
> requests from the virtio ring and merging them back to 2M before
> submitting them. 
> 
> Unfortunately, I can't do that quite easily in vhost-blk. QEMU
> does re-creates iovecs for the merged IO. I have to come up with
> a scheme to do this :(

Is close cooperator logic kicking in at all?  Alternatively, using same
io_context.

> > It does.  I suggest using fio O_DIRECT random access patterns to avoid 
> > such issues.
> 
> Well, I am not trying to come up with a test case where vhost-blk
> performs better than virtio-blk. I am trying to understand where
> and why vhost-blk performnce worse than virtio-blk.

It would just level the playing field.  Alternatively, commenting out
merging in qemu to further validate it's the source, something as simple
as this in block.c:

-    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
+//    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);


Although, from above, sounds like you've already verified this is the
difference.  IIRC, the sync write path is at odds w/ cfq elevator merging
(so cache=none would have trouble, but you were doing cache=writeback,
right?), but if you can hack your worker threads to share an io_context,
like you had done CLONE_IO,  you might get some merging back (again,
just a hack to pinpoint merging as the culprit).

thanks,
-chris

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-29 22:51         ` Badari Pulavarty
  2010-03-29 23:56           ` Chris Wright
@ 2010-03-30 12:43           ` Avi Kivity
  1 sibling, 0 replies; 48+ messages in thread
From: Avi Kivity @ 2010-03-30 12:43 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Chris Wright, Christoph Hellwig, kvm

On 03/30/2010 01:51 AM, Badari Pulavarty wrote:
>
>    
>>> Your io wait time is twice as long and your throughput is about half.
>>> I think the qmeu block submission does an extra attempt at merging
>>> requests.  Does blktrace tell you anything interesting?
>>>
>>>        
> Yes. I see that in my testcase (2M writes) - QEMU is pickup 512K
> requests from the virtio ring and merging them back to 2M before
> submitting them.
>
> Unfortunately, I can't do that quite easily in vhost-blk. QEMU
> does re-creates iovecs for the merged IO. I have to come up with
> a scheme to do this :(
>    

I don't think that either vhost-blk or virtio-blk should do this.  
Merging increases latency, and in the case of streaming writes makes it 
impossible for the guest to prepare new requests while earlier ones are 
being serviced (in effect it reduces the queue depth to 1).

qcow2 does benefit from merging, but it should do so itself without 
impacting raw.

>> It does.  I suggest using fio O_DIRECT random access patterns to avoid
>> such issues.
>>      
> Well, I am not trying to come up with a test case where vhost-blk
> performs better than virtio-blk. I am trying to understand where
> and why vhost-blk performnce worse than virtio-blk.
>    

In this case qemu-virtio is making an incorrect tradeoff.  The guest 
could easily merge those requests itself.  If you want larger writes, 
tune the guest to issue them.

Another way to look at it:  merging improved bandwidth but increased 
latency, yet you are only measuring bandwidth.  If you measured only 
latency you'd find that vhost-blk is better.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-29 15:41   ` Badari Pulavarty
  2010-03-29 18:20     ` Chris Wright
@ 2010-04-05 14:22     ` Stefan Hajnoczi
  2010-04-06  2:27       ` Badari Pulavarty
  1 sibling, 1 reply; 48+ messages in thread
From: Stefan Hajnoczi @ 2010-04-05 14:22 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: kvm

On Mon, Mar 29, 2010 at 4:41 PM, Badari Pulavarty <pbadari@us.ibm.com> wrote:
> +static void handle_io_work(struct work_struct *work)
> +{
> +       struct vhost_blk_io *vbio;
> +       struct vhost_virtqueue *vq;
> +       struct vhost_blk *blk;
> +       int i, ret = 0;
> +       loff_t pos;
> +       uint8_t status = 0;
> +
> +       vbio = container_of(work, struct vhost_blk_io, work);
> +       blk = vbio->blk;
> +       vq = &blk->dev.vqs[0];
> +       pos = vbio->sector << 8;
> +
> +       use_mm(blk->dev.mm);
> +
> +       if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
> +               ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
> +       } else if (vbio->type & VIRTIO_BLK_T_OUT) {
> +               ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
> +       } else {
> +               ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
> +       }
> +
> +       status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> +       if (copy_to_user(vbio->iov[vbio->nvecs].iov_base, &status, sizeof status) < 0) {
> +               printk("copy to user failed\n");
> +               vhost_discard_vq_desc(vq);
> +               unuse_mm(blk->dev.mm);
> +               return;

Do you need to kfree(vbio) here?

> +static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
> +{
> +       struct file *file;
> +       struct vhost_virtqueue *vq;
> +
> +       file = fget(fd);
> +       if (!file)
> +               return -EBADF;
> +
> +       vq = n->vqs + index;
> +       mutex_lock(&vq->mutex);
> +       rcu_assign_pointer(vq->private_data, file);
> +       mutex_unlock(&vq->mutex);
> +       return 0;
> +}
> +
> +
> +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
> +                            unsigned long arg)
> +{
> +       struct vhost_blk *n = f->private_data;
> +       void __user *argp = (void __user *)arg;
> +       struct vhost_vring_file backend;
> +       int r;
> +
> +       switch (ioctl) {
> +        case VHOST_NET_SET_BACKEND:
> +               r = copy_from_user(&backend, argp, sizeof backend);
> +               if (r < 0)
> +                       return r;
> +               return vhost_blk_set_backend(n, backend.index, backend.fd);

I don't see backend.index being checked against VHOST_BLK_VQ_MAX.

Stefan

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-24 20:22   ` Badari Pulavarty
  2010-03-25  7:57     ` Avi Kivity
  2010-03-25 15:57     ` Christoph Hellwig
@ 2010-04-05 19:23     ` Christoph Hellwig
  2010-04-05 23:17       ` Badari Pulavarty
  2 siblings, 1 reply; 48+ messages in thread
From: Christoph Hellwig @ 2010-04-05 19:23 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Christoph Hellwig, kvm

On Wed, Mar 24, 2010 at 01:22:37PM -0700, Badari Pulavarty wrote:
> iovecs and buffers are user-space pointers (from the host kernel point  
> of view). They are
> guest address. So, I don't need to do any set_fs tricks.

>From verifying the code and using the sparse annotations it appears
that the actual buffers are userspace pointers, but the iovecs in
the virtqueue are kernel level pointers, so you would need some
annotations.

While we're at it here is a patch fixing the remaining sparse 
warnings in vhost-blk:


Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: linux-2.6/drivers/vhost/blk.c
===================================================================
--- linux-2.6.orig/drivers/vhost/blk.c	2010-04-05 21:15:11.638004250 +0200
+++ linux-2.6/drivers/vhost/blk.c	2010-04-05 21:16:13.238004599 +0200
@@ -86,7 +86,7 @@ static void handle_blk(struct vhost_blk
 		nvecs++;
 		BUG_ON(vq->iov[nvecs].iov_len != 1);
 
-		if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status) < 0) {
+		if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status)) {
 			printk("copy to user failed\n");
 			vhost_discard_vq_desc(vq);
 			break;
@@ -199,7 +199,7 @@ static struct miscdevice vhost_blk_misc
 	&vhost_blk_fops,
 };
 
-int vhost_blk_init(void)
+static int vhost_blk_init(void)
 {
 	int r = vhost_init();
 	if (r)
@@ -216,7 +216,7 @@ err_init:
 }
 module_init(vhost_blk_init);
 
-void vhost_blk_exit(void)
+static void vhost_blk_exit(void)
 {
 	misc_deregister(&vhost_blk_misc);
 	vhost_cleanup();

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-25 15:00     ` Asdo
@ 2010-04-05 19:59       ` Christoph Hellwig
  2010-04-07  0:36           ` [Qemu-devel] " Badari Pulavarty
  2010-04-07  0:36         ` Badari Pulavarty
  0 siblings, 2 replies; 48+ messages in thread
From: Christoph Hellwig @ 2010-04-05 19:59 UTC (permalink / raw)
  To: Asdo; +Cc: Christoph Hellwig, Avi Kivity, Badari Pulavarty, kvm

On Thu, Mar 25, 2010 at 04:00:56PM +0100, Asdo wrote:
> Would the loop device provide the features of a block device? I recall  
> barrier support at least has been added recently.

It does, but not in a very efficient way.

> Is it recommended to run kvm on a loopback mounted file compared to on a  
> raw file?

No.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-04-05 19:23     ` Christoph Hellwig
@ 2010-04-05 23:17       ` Badari Pulavarty
  0 siblings, 0 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-04-05 23:17 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: kvm

On Mon, 2010-04-05 at 15:23 -0400, Christoph Hellwig wrote:
> On Wed, Mar 24, 2010 at 01:22:37PM -0700, Badari Pulavarty wrote:
> > iovecs and buffers are user-space pointers (from the host kernel point  
> > of view). They are
> > guest address. So, I don't need to do any set_fs tricks.
> 
> From verifying the code and using the sparse annotations it appears
> that the actual buffers are userspace pointers, but the iovecs in
> the virtqueue are kernel level pointers, so you would need some
> annotations.

Yes. Thats correct. I will add appropriate annotations.

> 
> While we're at it here is a patch fixing the remaining sparse 
> warnings in vhost-blk:

Applied.

Thanks,
Badari

> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Index: linux-2.6/drivers/vhost/blk.c
> ===================================================================
> --- linux-2.6.orig/drivers/vhost/blk.c	2010-04-05 21:15:11.638004250 +0200
> +++ linux-2.6/drivers/vhost/blk.c	2010-04-05 21:16:13.238004599 +0200
> @@ -86,7 +86,7 @@ static void handle_blk(struct vhost_blk
>  		nvecs++;
>  		BUG_ON(vq->iov[nvecs].iov_len != 1);
> 
> -		if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status) < 0) {
> +		if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status)) {
>  			printk("copy to user failed\n");
>  			vhost_discard_vq_desc(vq);
>  			break;
> @@ -199,7 +199,7 @@ static struct miscdevice vhost_blk_misc
>  	&vhost_blk_fops,
>  };
> 
> -int vhost_blk_init(void)
> +static int vhost_blk_init(void)
>  {
>  	int r = vhost_init();
>  	if (r)
> @@ -216,7 +216,7 @@ err_init:
>  }
>  module_init(vhost_blk_init);
> 
> -void vhost_blk_exit(void)
> +static void vhost_blk_exit(void)
>  {
>  	misc_deregister(&vhost_blk_misc);
>  	vhost_cleanup();


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-04-05 14:22     ` Stefan Hajnoczi
@ 2010-04-06  2:27       ` Badari Pulavarty
  0 siblings, 0 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-04-06  2:27 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: kvm

On Mon, 2010-04-05 at 15:22 +0100, Stefan Hajnoczi wrote:
> On Mon, Mar 29, 2010 at 4:41 PM, Badari Pulavarty <pbadari@us.ibm.com> wrote:
> > +static void handle_io_work(struct work_struct *work)
> > +{
> > +       struct vhost_blk_io *vbio;
> > +       struct vhost_virtqueue *vq;
> > +       struct vhost_blk *blk;
> > +       int i, ret = 0;
> > +       loff_t pos;
> > +       uint8_t status = 0;
> > +
> > +       vbio = container_of(work, struct vhost_blk_io, work);
> > +       blk = vbio->blk;
> > +       vq = &blk->dev.vqs[0];
> > +       pos = vbio->sector << 8;
> > +
> > +       use_mm(blk->dev.mm);
> > +
> > +       if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
> > +               ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
> > +       } else if (vbio->type & VIRTIO_BLK_T_OUT) {
> > +               ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
> > +       } else {
> > +               ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
> > +       }
> > +
> > +       status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> > +       if (copy_to_user(vbio->iov[vbio->nvecs].iov_base, &status, sizeof status) < 0) {
> > +               printk("copy to user failed\n");
> > +               vhost_discard_vq_desc(vq);
> > +               unuse_mm(blk->dev.mm);
> > +               return;
> 
> Do you need to kfree(vbio) here?

Yes. I do. As mentioned earlier, I haven't fixed error handling yet :(
> 
> > +static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
> > +{
> > +       struct file *file;
> > +       struct vhost_virtqueue *vq;
> > +
> > +       file = fget(fd);
> > +       if (!file)
> > +               return -EBADF;
> > +
> > +       vq = n->vqs + index;
> > +       mutex_lock(&vq->mutex);
> > +       rcu_assign_pointer(vq->private_data, file);
> > +       mutex_unlock(&vq->mutex);
> > +       return 0;
> > +}
> > +
> > +
> > +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
> > +                            unsigned long arg)
> > +{
> > +       struct vhost_blk *n = f->private_data;
> > +       void __user *argp = (void __user *)arg;
> > +       struct vhost_vring_file backend;
> > +       int r;
> > +
> > +       switch (ioctl) {
> > +        case VHOST_NET_SET_BACKEND:
> > +               r = copy_from_user(&backend, argp, sizeof backend);
> > +               if (r < 0)
> > +                       return r;
> > +               return vhost_blk_set_backend(n, backend.index, backend.fd);
> 
> I don't see backend.index being checked against VHOST_BLK_VQ_MAX.

Yep. You are right. I will add these checks for my next revision.

Thanks,
Badari


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [RFC] vhost-blk implementation (v2)
  2010-04-05 19:59       ` Christoph Hellwig
@ 2010-04-07  0:36           ` Badari Pulavarty
  2010-04-07  0:36         ` Badari Pulavarty
  1 sibling, 0 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-04-07  0:36 UTC (permalink / raw)
  To: kvm, virtualization, qemu-devel

Hi All,

Here is the latest version of vhost-blk implementation.
Major difference from my previous implementation is that, I
now merge all contiguous requests (both read and write), before
submitting them. This significantly improved IO performance.
I am still collecting performance numbers, I will be posting
in next few days.

Comments ?

Todo:
- Address hch's comments on annontations
- Implement per device read/write queues
- Finish up error handling

Thanks,
Badari

---
 drivers/vhost/blk.c |  445 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 445 insertions(+)

Index: net-next/drivers/vhost/blk.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ net-next/drivers/vhost/blk.c	2010-04-06 16:38:03.563847905 -0400
@@ -0,0 +1,445 @@
+ /*
+  * virtio-block server in host kernel.
+  * Inspired by vhost-net and shamlessly ripped code from it :)
+  */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+#define SECTOR_SHIFT 9
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+struct vhost_blk_io {
+	struct list_head list;
+	struct work_struct work;
+	struct vhost_blk *blk;
+	struct file *file;
+	int head;
+	uint32_t type;
+	uint32_t nvecs;
+	uint64_t sector;
+	uint64_t len;
+	struct iovec iov[0];
+};
+
+static struct workqueue_struct *vblk_workqueue;
+static LIST_HEAD(write_queue);
+static LIST_HEAD(read_queue);
+
+static void handle_io_work(struct work_struct *work)
+{
+	struct vhost_blk_io *vbio, *entry;
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	struct list_head single, *head, *node, *tmp;
+
+	int i, need_free, ret = 0;
+	loff_t pos;
+	uint8_t status = 0;
+
+	vbio = container_of(work, struct vhost_blk_io, work);
+	blk = vbio->blk;
+	vq = &blk->dev.vqs[0];
+	pos = vbio->sector << 8;
+
+	use_mm(blk->dev.mm);
+	if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
+	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	} else {
+		ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	}
+	status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+	if (vbio->head != -1) {
+		INIT_LIST_HEAD(&single);
+		list_add(&vbio->list, &single);
+		head = &single;
+		need_free = 0;
+	} else {
+		head = &vbio->list;
+		need_free = 1;
+	}
+	list_for_each_entry(entry, head, list) {
+		copy_to_user(entry->iov[entry->nvecs].iov_base, &status, sizeof status);
+	}
+	mutex_lock(&vq->mutex);
+	list_for_each_safe(node, tmp, head) {
+		entry = list_entry(node, struct vhost_blk_io, list);
+		vhost_add_used_and_signal(&blk->dev, vq, entry->head, ret);
+		list_del(node);
+		kfree(entry);
+	}
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+	if (need_free)
+		kfree(vbio);
+}
+
+static struct vhost_blk_io *allocate_vbio(int nvecs)
+{
+	struct vhost_blk_io *vbio;
+	int size = sizeof(struct vhost_blk_io) + nvecs * sizeof(struct iovec);
+	vbio = kmalloc(size, GFP_KERNEL);
+	if (vbio) {
+		INIT_WORK(&vbio->work, handle_io_work);
+		INIT_LIST_HEAD(&vbio->list);
+	}
+	return vbio;
+}
+
+static void merge_and_handoff_work(struct list_head *queue)
+{
+	struct vhost_blk_io *vbio, *entry;
+	int nvecs = 0;
+	int entries = 0;
+
+	list_for_each_entry(entry, queue, list) {
+		nvecs += entry->nvecs;
+		entries++;
+	}
+
+	if (entries == 1) {
+		vbio = list_first_entry(queue, struct vhost_blk_io, list);
+		list_del(&vbio->list);
+		queue_work(vblk_workqueue, &vbio->work);
+		return;
+	}
+
+	vbio = allocate_vbio(nvecs);
+	if (!vbio) {
+		/* Unable to allocate memory - submit IOs individually */
+		list_for_each_entry(vbio, queue, list) {
+			queue_work(vblk_workqueue, &vbio->work);
+		}
+		INIT_LIST_HEAD(queue);
+		return;
+	}
+
+	entry = list_first_entry(queue, struct vhost_blk_io, list);
+	vbio->nvecs = nvecs;
+	vbio->blk = entry->blk;
+	vbio->file = entry->file;
+	vbio->type = entry->type;
+	vbio->sector = entry->sector;
+	vbio->head = -1;
+	vbio->len = 0;
+	nvecs = 0;
+
+	list_for_each_entry(entry, queue, list) {
+		memcpy(&vbio->iov[nvecs], entry->iov, entry->nvecs * sizeof(struct iovec));
+		nvecs += entry->nvecs;
+		vbio->len += entry->len;
+	}
+	list_replace_init(queue, &vbio->list);
+	queue_work(vblk_workqueue, &vbio->work);
+}
+
+static void start_io(struct list_head *queue)
+{
+	struct list_head start;
+	struct vhost_blk_io *vbio = NULL, *entry;
+
+	if (list_empty(queue))
+                return;
+
+	list_for_each_entry(entry, queue, list) {
+		if (!vbio) {
+			vbio = entry;
+			continue;
+		}
+		if (vbio->sector + (vbio->len >> SECTOR_SHIFT) == entry->sector) {
+			vbio = entry;
+		} else {
+			INIT_LIST_HEAD(&start);
+			list_cut_position(&start, queue, &vbio->list);
+			merge_and_handoff_work(&start);
+			vbio = entry;
+		}
+	}
+	if (!list_empty(queue))
+		merge_and_handoff_work(queue);
+}
+
+static uint64_t calculate_len(struct iovec *iov, int nvecs)
+{
+	uint64_t len = 0;
+	int i;
+
+	for (i=0; i<nvecs; i++)
+		len += iov[i].iov_len;
+	return len;
+}
+
+static void insert_to_queue(struct vhost_blk_io *vbio,
+			struct list_head *queue)
+{
+	struct vhost_blk_io *entry;
+
+	list_for_each_entry(entry, queue, list) {
+		if (entry->sector > vbio->sector)
+			break;
+	}
+	list_add_tail(&vbio->list, &entry->list);
+}
+
+static int handoff_io(struct vhost_blk *blk, int head,
+			uint32_t type, uint64_t sector,
+			struct iovec *iov, int nvecs)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	struct vhost_blk_io *vbio;
+
+	vbio = allocate_vbio(nvecs+1);
+	if (!vbio) {
+		return -ENOMEM;
+	}
+	vbio->blk = blk;
+	vbio->head = head;
+	vbio->file = vq->private_data;
+	vbio->type = type;
+	vbio->sector = sector;
+	vbio->nvecs = nvecs;
+	vbio->len = calculate_len(iov, nvecs);
+	memcpy(vbio->iov, iov, (nvecs + 1) * sizeof(struct iovec));
+
+	if (vbio->type & VIRTIO_BLK_T_FLUSH) {
+#if 0
+		/* Sync called - do I need to submit IOs in the queue ? */
+		start_io(&read_queue);
+		start_io(&write_queue);
+#endif
+		queue_work(vblk_workqueue, &vbio->work);
+	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
+		insert_to_queue(vbio, &write_queue);
+	} else {
+		insert_to_queue(vbio, &read_queue);
+	}
+	return 0;
+}
+
+
+static void handle_blk(struct vhost_blk *blk)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	unsigned head, out, in;
+	struct virtio_blk_outhdr hdr;
+	int nvecs;
+
+	use_mm(blk->dev.mm);
+	mutex_lock(&vq->mutex);
+
+	vhost_disable_notify(vq);
+
+	for (;;) {
+		head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+					 ARRAY_SIZE(vq->iov),
+					 &out, &in, NULL, NULL);
+		if (head == vq->num) {
+			if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
+				continue;
+			}
+			start_io(&read_queue);
+			start_io(&write_queue);
+			break;
+		}
+
+		BUG_ON(vq->iov[0].iov_len != 16);
+
+		if (copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr)) {
+			vhost_discard_vq_desc(vq);
+			continue;
+		}
+
+		nvecs = out - 1;
+		if (hdr.type == VIRTIO_BLK_T_IN)
+			nvecs = in - 1;
+
+		BUG_ON(vq->iov[nvecs+1].iov_len != 1);
+		if (handoff_io(blk, head, hdr.type, hdr.sector, &vq->iov[1], nvecs) < 0) {
+			vhost_discard_vq_desc(vq);
+			continue;
+		}
+	}
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+}
+
+static void vhost_blk_flush(struct vhost_blk *n)
+{
+	vhost_poll_flush(n->poll);
+	vhost_poll_flush(&n->dev.vqs[0].poll);
+}
+
+static void handle_blk_kick(struct work_struct *work)
+{
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	vq = container_of(work, struct vhost_virtqueue, poll.work);
+	blk = container_of(vq->dev, struct vhost_blk, dev);
+	handle_blk(blk);
+}
+
+static void handle_rq_blk(struct work_struct *work)
+{
+	struct vhost_blk *blk;
+	blk = container_of(work, struct vhost_blk, poll[0].work);
+	handle_blk(blk);
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = kmalloc(sizeof *n, GFP_KERNEL);
+	int r;
+	if (!n)
+		return -ENOMEM;
+	n->vqs[0].handle_kick = handle_blk_kick;
+	r = vhost_dev_init(&n->dev, n->vqs, VHOST_BLK_VQ_MAX);
+	if (r < 0) {
+		kfree(n);
+		return r;
+	}
+
+	vhost_poll_init(n->poll, handle_rq_blk, POLLOUT|POLLIN);
+	f->private_data = n;
+	return 0;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = f->private_data;
+
+	fput(n->vqs->private_data);
+	kfree(n);
+	return 0;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
+{
+	struct file *file;
+	struct vhost_virtqueue *vq;
+
+	if (index >= VHOST_BLK_VQ_MAX)
+		return -ENOBUFS;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	vq = n->vqs + index;
+	mutex_lock(&vq->mutex);
+	rcu_assign_pointer(vq->private_data, file);
+	mutex_unlock(&vq->mutex);
+	return 0;
+}
+
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+                            unsigned long arg)
+{
+	struct vhost_blk *n = f->private_data;
+	void __user *argp = (void __user *)arg;
+	struct vhost_vring_file backend;
+	int r;
+
+	switch (ioctl) {
+        case VHOST_NET_SET_BACKEND:
+		r = copy_from_user(&backend, argp, sizeof backend);
+		if (r <	0)
+			return r;
+		return vhost_blk_set_backend(n, backend.index, backend.fd);
+	default:
+		mutex_lock(&n->dev.mutex);
+		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+		vhost_blk_flush(n);
+		mutex_unlock(&n->dev.mutex);
+		return r;
+	}
+}
+
+const static struct file_operations vhost_blk_fops = {
+	.owner          = THIS_MODULE,
+	.release        = vhost_blk_release,
+	.open           = vhost_blk_open,
+	.unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+	234,
+	"vhost-blk",
+	&vhost_blk_fops,
+};
+
+static int vhost_blk_init(void)
+{
+	int r = vhost_init();
+	if (r)
+		goto err_init;
+
+	vblk_workqueue = create_workqueue("vblk");
+	if (!vblk_workqueue) {
+		r = -ENOMEM;
+		goto err_vblk;
+	}
+
+	r = misc_register(&vhost_blk_misc);
+	if (r)
+		goto err_reg;
+	return 0;
+err_reg:
+	destroy_workqueue(vblk_workqueue);
+err_vblk:
+	vhost_cleanup();
+err_init:
+	return r;
+
+}
+module_init(vhost_blk_init);
+
+static void vhost_blk_exit(void)
+{
+	misc_deregister(&vhost_blk_misc);
+	destroy_workqueue(vblk_workqueue);
+	vhost_cleanup();
+}
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("0.0.2");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");



^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Qemu-devel] [RFC] vhost-blk implementation (v2)
@ 2010-04-07  0:36           ` Badari Pulavarty
  0 siblings, 0 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-04-07  0:36 UTC (permalink / raw)
  To: kvm, virtualization, qemu-devel

Hi All,

Here is the latest version of vhost-blk implementation.
Major difference from my previous implementation is that, I
now merge all contiguous requests (both read and write), before
submitting them. This significantly improved IO performance.
I am still collecting performance numbers, I will be posting
in next few days.

Comments ?

Todo:
- Address hch's comments on annontations
- Implement per device read/write queues
- Finish up error handling

Thanks,
Badari

---
 drivers/vhost/blk.c |  445 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 445 insertions(+)

Index: net-next/drivers/vhost/blk.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ net-next/drivers/vhost/blk.c	2010-04-06 16:38:03.563847905 -0400
@@ -0,0 +1,445 @@
+ /*
+  * virtio-block server in host kernel.
+  * Inspired by vhost-net and shamlessly ripped code from it :)
+  */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+#define SECTOR_SHIFT 9
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+struct vhost_blk_io {
+	struct list_head list;
+	struct work_struct work;
+	struct vhost_blk *blk;
+	struct file *file;
+	int head;
+	uint32_t type;
+	uint32_t nvecs;
+	uint64_t sector;
+	uint64_t len;
+	struct iovec iov[0];
+};
+
+static struct workqueue_struct *vblk_workqueue;
+static LIST_HEAD(write_queue);
+static LIST_HEAD(read_queue);
+
+static void handle_io_work(struct work_struct *work)
+{
+	struct vhost_blk_io *vbio, *entry;
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	struct list_head single, *head, *node, *tmp;
+
+	int i, need_free, ret = 0;
+	loff_t pos;
+	uint8_t status = 0;
+
+	vbio = container_of(work, struct vhost_blk_io, work);
+	blk = vbio->blk;
+	vq = &blk->dev.vqs[0];
+	pos = vbio->sector << 8;
+
+	use_mm(blk->dev.mm);
+	if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
+	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	} else {
+		ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	}
+	status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+	if (vbio->head != -1) {
+		INIT_LIST_HEAD(&single);
+		list_add(&vbio->list, &single);
+		head = &single;
+		need_free = 0;
+	} else {
+		head = &vbio->list;
+		need_free = 1;
+	}
+	list_for_each_entry(entry, head, list) {
+		copy_to_user(entry->iov[entry->nvecs].iov_base, &status, sizeof status);
+	}
+	mutex_lock(&vq->mutex);
+	list_for_each_safe(node, tmp, head) {
+		entry = list_entry(node, struct vhost_blk_io, list);
+		vhost_add_used_and_signal(&blk->dev, vq, entry->head, ret);
+		list_del(node);
+		kfree(entry);
+	}
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+	if (need_free)
+		kfree(vbio);
+}
+
+static struct vhost_blk_io *allocate_vbio(int nvecs)
+{
+	struct vhost_blk_io *vbio;
+	int size = sizeof(struct vhost_blk_io) + nvecs * sizeof(struct iovec);
+	vbio = kmalloc(size, GFP_KERNEL);
+	if (vbio) {
+		INIT_WORK(&vbio->work, handle_io_work);
+		INIT_LIST_HEAD(&vbio->list);
+	}
+	return vbio;
+}
+
+static void merge_and_handoff_work(struct list_head *queue)
+{
+	struct vhost_blk_io *vbio, *entry;
+	int nvecs = 0;
+	int entries = 0;
+
+	list_for_each_entry(entry, queue, list) {
+		nvecs += entry->nvecs;
+		entries++;
+	}
+
+	if (entries == 1) {
+		vbio = list_first_entry(queue, struct vhost_blk_io, list);
+		list_del(&vbio->list);
+		queue_work(vblk_workqueue, &vbio->work);
+		return;
+	}
+
+	vbio = allocate_vbio(nvecs);
+	if (!vbio) {
+		/* Unable to allocate memory - submit IOs individually */
+		list_for_each_entry(vbio, queue, list) {
+			queue_work(vblk_workqueue, &vbio->work);
+		}
+		INIT_LIST_HEAD(queue);
+		return;
+	}
+
+	entry = list_first_entry(queue, struct vhost_blk_io, list);
+	vbio->nvecs = nvecs;
+	vbio->blk = entry->blk;
+	vbio->file = entry->file;
+	vbio->type = entry->type;
+	vbio->sector = entry->sector;
+	vbio->head = -1;
+	vbio->len = 0;
+	nvecs = 0;
+
+	list_for_each_entry(entry, queue, list) {
+		memcpy(&vbio->iov[nvecs], entry->iov, entry->nvecs * sizeof(struct iovec));
+		nvecs += entry->nvecs;
+		vbio->len += entry->len;
+	}
+	list_replace_init(queue, &vbio->list);
+	queue_work(vblk_workqueue, &vbio->work);
+}
+
+static void start_io(struct list_head *queue)
+{
+	struct list_head start;
+	struct vhost_blk_io *vbio = NULL, *entry;
+
+	if (list_empty(queue))
+                return;
+
+	list_for_each_entry(entry, queue, list) {
+		if (!vbio) {
+			vbio = entry;
+			continue;
+		}
+		if (vbio->sector + (vbio->len >> SECTOR_SHIFT) == entry->sector) {
+			vbio = entry;
+		} else {
+			INIT_LIST_HEAD(&start);
+			list_cut_position(&start, queue, &vbio->list);
+			merge_and_handoff_work(&start);
+			vbio = entry;
+		}
+	}
+	if (!list_empty(queue))
+		merge_and_handoff_work(queue);
+}
+
+static uint64_t calculate_len(struct iovec *iov, int nvecs)
+{
+	uint64_t len = 0;
+	int i;
+
+	for (i=0; i<nvecs; i++)
+		len += iov[i].iov_len;
+	return len;
+}
+
+static void insert_to_queue(struct vhost_blk_io *vbio,
+			struct list_head *queue)
+{
+	struct vhost_blk_io *entry;
+
+	list_for_each_entry(entry, queue, list) {
+		if (entry->sector > vbio->sector)
+			break;
+	}
+	list_add_tail(&vbio->list, &entry->list);
+}
+
+static int handoff_io(struct vhost_blk *blk, int head,
+			uint32_t type, uint64_t sector,
+			struct iovec *iov, int nvecs)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	struct vhost_blk_io *vbio;
+
+	vbio = allocate_vbio(nvecs+1);
+	if (!vbio) {
+		return -ENOMEM;
+	}
+	vbio->blk = blk;
+	vbio->head = head;
+	vbio->file = vq->private_data;
+	vbio->type = type;
+	vbio->sector = sector;
+	vbio->nvecs = nvecs;
+	vbio->len = calculate_len(iov, nvecs);
+	memcpy(vbio->iov, iov, (nvecs + 1) * sizeof(struct iovec));
+
+	if (vbio->type & VIRTIO_BLK_T_FLUSH) {
+#if 0
+		/* Sync called - do I need to submit IOs in the queue ? */
+		start_io(&read_queue);
+		start_io(&write_queue);
+#endif
+		queue_work(vblk_workqueue, &vbio->work);
+	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
+		insert_to_queue(vbio, &write_queue);
+	} else {
+		insert_to_queue(vbio, &read_queue);
+	}
+	return 0;
+}
+
+
+static void handle_blk(struct vhost_blk *blk)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	unsigned head, out, in;
+	struct virtio_blk_outhdr hdr;
+	int nvecs;
+
+	use_mm(blk->dev.mm);
+	mutex_lock(&vq->mutex);
+
+	vhost_disable_notify(vq);
+
+	for (;;) {
+		head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+					 ARRAY_SIZE(vq->iov),
+					 &out, &in, NULL, NULL);
+		if (head == vq->num) {
+			if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
+				continue;
+			}
+			start_io(&read_queue);
+			start_io(&write_queue);
+			break;
+		}
+
+		BUG_ON(vq->iov[0].iov_len != 16);
+
+		if (copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr)) {
+			vhost_discard_vq_desc(vq);
+			continue;
+		}
+
+		nvecs = out - 1;
+		if (hdr.type == VIRTIO_BLK_T_IN)
+			nvecs = in - 1;
+
+		BUG_ON(vq->iov[nvecs+1].iov_len != 1);
+		if (handoff_io(blk, head, hdr.type, hdr.sector, &vq->iov[1], nvecs) < 0) {
+			vhost_discard_vq_desc(vq);
+			continue;
+		}
+	}
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+}
+
+static void vhost_blk_flush(struct vhost_blk *n)
+{
+	vhost_poll_flush(n->poll);
+	vhost_poll_flush(&n->dev.vqs[0].poll);
+}
+
+static void handle_blk_kick(struct work_struct *work)
+{
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	vq = container_of(work, struct vhost_virtqueue, poll.work);
+	blk = container_of(vq->dev, struct vhost_blk, dev);
+	handle_blk(blk);
+}
+
+static void handle_rq_blk(struct work_struct *work)
+{
+	struct vhost_blk *blk;
+	blk = container_of(work, struct vhost_blk, poll[0].work);
+	handle_blk(blk);
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = kmalloc(sizeof *n, GFP_KERNEL);
+	int r;
+	if (!n)
+		return -ENOMEM;
+	n->vqs[0].handle_kick = handle_blk_kick;
+	r = vhost_dev_init(&n->dev, n->vqs, VHOST_BLK_VQ_MAX);
+	if (r < 0) {
+		kfree(n);
+		return r;
+	}
+
+	vhost_poll_init(n->poll, handle_rq_blk, POLLOUT|POLLIN);
+	f->private_data = n;
+	return 0;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = f->private_data;
+
+	fput(n->vqs->private_data);
+	kfree(n);
+	return 0;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
+{
+	struct file *file;
+	struct vhost_virtqueue *vq;
+
+	if (index >= VHOST_BLK_VQ_MAX)
+		return -ENOBUFS;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	vq = n->vqs + index;
+	mutex_lock(&vq->mutex);
+	rcu_assign_pointer(vq->private_data, file);
+	mutex_unlock(&vq->mutex);
+	return 0;
+}
+
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+                            unsigned long arg)
+{
+	struct vhost_blk *n = f->private_data;
+	void __user *argp = (void __user *)arg;
+	struct vhost_vring_file backend;
+	int r;
+
+	switch (ioctl) {
+        case VHOST_NET_SET_BACKEND:
+		r = copy_from_user(&backend, argp, sizeof backend);
+		if (r <	0)
+			return r;
+		return vhost_blk_set_backend(n, backend.index, backend.fd);
+	default:
+		mutex_lock(&n->dev.mutex);
+		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+		vhost_blk_flush(n);
+		mutex_unlock(&n->dev.mutex);
+		return r;
+	}
+}
+
+const static struct file_operations vhost_blk_fops = {
+	.owner          = THIS_MODULE,
+	.release        = vhost_blk_release,
+	.open           = vhost_blk_open,
+	.unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+	234,
+	"vhost-blk",
+	&vhost_blk_fops,
+};
+
+static int vhost_blk_init(void)
+{
+	int r = vhost_init();
+	if (r)
+		goto err_init;
+
+	vblk_workqueue = create_workqueue("vblk");
+	if (!vblk_workqueue) {
+		r = -ENOMEM;
+		goto err_vblk;
+	}
+
+	r = misc_register(&vhost_blk_misc);
+	if (r)
+		goto err_reg;
+	return 0;
+err_reg:
+	destroy_workqueue(vblk_workqueue);
+err_vblk:
+	vhost_cleanup();
+err_init:
+	return r;
+
+}
+module_init(vhost_blk_init);
+
+static void vhost_blk_exit(void)
+{
+	misc_deregister(&vhost_blk_misc);
+	destroy_workqueue(vblk_workqueue);
+	vhost_cleanup();
+}
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("0.0.2");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [RFC] vhost-blk implementation (v2)
  2010-04-05 19:59       ` Christoph Hellwig
  2010-04-07  0:36           ` [Qemu-devel] " Badari Pulavarty
@ 2010-04-07  0:36         ` Badari Pulavarty
  1 sibling, 0 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-04-07  0:36 UTC (permalink / raw)
  To: kvm, virtualization, qemu-devel

Hi All,

Here is the latest version of vhost-blk implementation.
Major difference from my previous implementation is that, I
now merge all contiguous requests (both read and write), before
submitting them. This significantly improved IO performance.
I am still collecting performance numbers, I will be posting
in next few days.

Comments ?

Todo:
- Address hch's comments on annontations
- Implement per device read/write queues
- Finish up error handling

Thanks,
Badari

---
 drivers/vhost/blk.c |  445 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 445 insertions(+)

Index: net-next/drivers/vhost/blk.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ net-next/drivers/vhost/blk.c	2010-04-06 16:38:03.563847905 -0400
@@ -0,0 +1,445 @@
+ /*
+  * virtio-block server in host kernel.
+  * Inspired by vhost-net and shamlessly ripped code from it :)
+  */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+#define SECTOR_SHIFT 9
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+struct vhost_blk_io {
+	struct list_head list;
+	struct work_struct work;
+	struct vhost_blk *blk;
+	struct file *file;
+	int head;
+	uint32_t type;
+	uint32_t nvecs;
+	uint64_t sector;
+	uint64_t len;
+	struct iovec iov[0];
+};
+
+static struct workqueue_struct *vblk_workqueue;
+static LIST_HEAD(write_queue);
+static LIST_HEAD(read_queue);
+
+static void handle_io_work(struct work_struct *work)
+{
+	struct vhost_blk_io *vbio, *entry;
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	struct list_head single, *head, *node, *tmp;
+
+	int i, need_free, ret = 0;
+	loff_t pos;
+	uint8_t status = 0;
+
+	vbio = container_of(work, struct vhost_blk_io, work);
+	blk = vbio->blk;
+	vq = &blk->dev.vqs[0];
+	pos = vbio->sector << 8;
+
+	use_mm(blk->dev.mm);
+	if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
+	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	} else {
+		ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	}
+	status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+	if (vbio->head != -1) {
+		INIT_LIST_HEAD(&single);
+		list_add(&vbio->list, &single);
+		head = &single;
+		need_free = 0;
+	} else {
+		head = &vbio->list;
+		need_free = 1;
+	}
+	list_for_each_entry(entry, head, list) {
+		copy_to_user(entry->iov[entry->nvecs].iov_base, &status, sizeof status);
+	}
+	mutex_lock(&vq->mutex);
+	list_for_each_safe(node, tmp, head) {
+		entry = list_entry(node, struct vhost_blk_io, list);
+		vhost_add_used_and_signal(&blk->dev, vq, entry->head, ret);
+		list_del(node);
+		kfree(entry);
+	}
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+	if (need_free)
+		kfree(vbio);
+}
+
+static struct vhost_blk_io *allocate_vbio(int nvecs)
+{
+	struct vhost_blk_io *vbio;
+	int size = sizeof(struct vhost_blk_io) + nvecs * sizeof(struct iovec);
+	vbio = kmalloc(size, GFP_KERNEL);
+	if (vbio) {
+		INIT_WORK(&vbio->work, handle_io_work);
+		INIT_LIST_HEAD(&vbio->list);
+	}
+	return vbio;
+}
+
+static void merge_and_handoff_work(struct list_head *queue)
+{
+	struct vhost_blk_io *vbio, *entry;
+	int nvecs = 0;
+	int entries = 0;
+
+	list_for_each_entry(entry, queue, list) {
+		nvecs += entry->nvecs;
+		entries++;
+	}
+
+	if (entries == 1) {
+		vbio = list_first_entry(queue, struct vhost_blk_io, list);
+		list_del(&vbio->list);
+		queue_work(vblk_workqueue, &vbio->work);
+		return;
+	}
+
+	vbio = allocate_vbio(nvecs);
+	if (!vbio) {
+		/* Unable to allocate memory - submit IOs individually */
+		list_for_each_entry(vbio, queue, list) {
+			queue_work(vblk_workqueue, &vbio->work);
+		}
+		INIT_LIST_HEAD(queue);
+		return;
+	}
+
+	entry = list_first_entry(queue, struct vhost_blk_io, list);
+	vbio->nvecs = nvecs;
+	vbio->blk = entry->blk;
+	vbio->file = entry->file;
+	vbio->type = entry->type;
+	vbio->sector = entry->sector;
+	vbio->head = -1;
+	vbio->len = 0;
+	nvecs = 0;
+
+	list_for_each_entry(entry, queue, list) {
+		memcpy(&vbio->iov[nvecs], entry->iov, entry->nvecs * sizeof(struct iovec));
+		nvecs += entry->nvecs;
+		vbio->len += entry->len;
+	}
+	list_replace_init(queue, &vbio->list);
+	queue_work(vblk_workqueue, &vbio->work);
+}
+
+static void start_io(struct list_head *queue)
+{
+	struct list_head start;
+	struct vhost_blk_io *vbio = NULL, *entry;
+
+	if (list_empty(queue))
+                return;
+
+	list_for_each_entry(entry, queue, list) {
+		if (!vbio) {
+			vbio = entry;
+			continue;
+		}
+		if (vbio->sector + (vbio->len >> SECTOR_SHIFT) == entry->sector) {
+			vbio = entry;
+		} else {
+			INIT_LIST_HEAD(&start);
+			list_cut_position(&start, queue, &vbio->list);
+			merge_and_handoff_work(&start);
+			vbio = entry;
+		}
+	}
+	if (!list_empty(queue))
+		merge_and_handoff_work(queue);
+}
+
+static uint64_t calculate_len(struct iovec *iov, int nvecs)
+{
+	uint64_t len = 0;
+	int i;
+
+	for (i=0; i<nvecs; i++)
+		len += iov[i].iov_len;
+	return len;
+}
+
+static void insert_to_queue(struct vhost_blk_io *vbio,
+			struct list_head *queue)
+{
+	struct vhost_blk_io *entry;
+
+	list_for_each_entry(entry, queue, list) {
+		if (entry->sector > vbio->sector)
+			break;
+	}
+	list_add_tail(&vbio->list, &entry->list);
+}
+
+static int handoff_io(struct vhost_blk *blk, int head,
+			uint32_t type, uint64_t sector,
+			struct iovec *iov, int nvecs)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	struct vhost_blk_io *vbio;
+
+	vbio = allocate_vbio(nvecs+1);
+	if (!vbio) {
+		return -ENOMEM;
+	}
+	vbio->blk = blk;
+	vbio->head = head;
+	vbio->file = vq->private_data;
+	vbio->type = type;
+	vbio->sector = sector;
+	vbio->nvecs = nvecs;
+	vbio->len = calculate_len(iov, nvecs);
+	memcpy(vbio->iov, iov, (nvecs + 1) * sizeof(struct iovec));
+
+	if (vbio->type & VIRTIO_BLK_T_FLUSH) {
+#if 0
+		/* Sync called - do I need to submit IOs in the queue ? */
+		start_io(&read_queue);
+		start_io(&write_queue);
+#endif
+		queue_work(vblk_workqueue, &vbio->work);
+	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
+		insert_to_queue(vbio, &write_queue);
+	} else {
+		insert_to_queue(vbio, &read_queue);
+	}
+	return 0;
+}
+
+
+static void handle_blk(struct vhost_blk *blk)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	unsigned head, out, in;
+	struct virtio_blk_outhdr hdr;
+	int nvecs;
+
+	use_mm(blk->dev.mm);
+	mutex_lock(&vq->mutex);
+
+	vhost_disable_notify(vq);
+
+	for (;;) {
+		head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+					 ARRAY_SIZE(vq->iov),
+					 &out, &in, NULL, NULL);
+		if (head == vq->num) {
+			if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
+				continue;
+			}
+			start_io(&read_queue);
+			start_io(&write_queue);
+			break;
+		}
+
+		BUG_ON(vq->iov[0].iov_len != 16);
+
+		if (copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr)) {
+			vhost_discard_vq_desc(vq);
+			continue;
+		}
+
+		nvecs = out - 1;
+		if (hdr.type == VIRTIO_BLK_T_IN)
+			nvecs = in - 1;
+
+		BUG_ON(vq->iov[nvecs+1].iov_len != 1);
+		if (handoff_io(blk, head, hdr.type, hdr.sector, &vq->iov[1], nvecs) < 0) {
+			vhost_discard_vq_desc(vq);
+			continue;
+		}
+	}
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+}
+
+static void vhost_blk_flush(struct vhost_blk *n)
+{
+	vhost_poll_flush(n->poll);
+	vhost_poll_flush(&n->dev.vqs[0].poll);
+}
+
+static void handle_blk_kick(struct work_struct *work)
+{
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	vq = container_of(work, struct vhost_virtqueue, poll.work);
+	blk = container_of(vq->dev, struct vhost_blk, dev);
+	handle_blk(blk);
+}
+
+static void handle_rq_blk(struct work_struct *work)
+{
+	struct vhost_blk *blk;
+	blk = container_of(work, struct vhost_blk, poll[0].work);
+	handle_blk(blk);
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = kmalloc(sizeof *n, GFP_KERNEL);
+	int r;
+	if (!n)
+		return -ENOMEM;
+	n->vqs[0].handle_kick = handle_blk_kick;
+	r = vhost_dev_init(&n->dev, n->vqs, VHOST_BLK_VQ_MAX);
+	if (r < 0) {
+		kfree(n);
+		return r;
+	}
+
+	vhost_poll_init(n->poll, handle_rq_blk, POLLOUT|POLLIN);
+	f->private_data = n;
+	return 0;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = f->private_data;
+
+	fput(n->vqs->private_data);
+	kfree(n);
+	return 0;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
+{
+	struct file *file;
+	struct vhost_virtqueue *vq;
+
+	if (index >= VHOST_BLK_VQ_MAX)
+		return -ENOBUFS;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	vq = n->vqs + index;
+	mutex_lock(&vq->mutex);
+	rcu_assign_pointer(vq->private_data, file);
+	mutex_unlock(&vq->mutex);
+	return 0;
+}
+
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+                            unsigned long arg)
+{
+	struct vhost_blk *n = f->private_data;
+	void __user *argp = (void __user *)arg;
+	struct vhost_vring_file backend;
+	int r;
+
+	switch (ioctl) {
+        case VHOST_NET_SET_BACKEND:
+		r = copy_from_user(&backend, argp, sizeof backend);
+		if (r <	0)
+			return r;
+		return vhost_blk_set_backend(n, backend.index, backend.fd);
+	default:
+		mutex_lock(&n->dev.mutex);
+		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+		vhost_blk_flush(n);
+		mutex_unlock(&n->dev.mutex);
+		return r;
+	}
+}
+
+const static struct file_operations vhost_blk_fops = {
+	.owner          = THIS_MODULE,
+	.release        = vhost_blk_release,
+	.open           = vhost_blk_open,
+	.unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+	234,
+	"vhost-blk",
+	&vhost_blk_fops,
+};
+
+static int vhost_blk_init(void)
+{
+	int r = vhost_init();
+	if (r)
+		goto err_init;
+
+	vblk_workqueue = create_workqueue("vblk");
+	if (!vblk_workqueue) {
+		r = -ENOMEM;
+		goto err_vblk;
+	}
+
+	r = misc_register(&vhost_blk_misc);
+	if (r)
+		goto err_reg;
+	return 0;
+err_reg:
+	destroy_workqueue(vblk_workqueue);
+err_vblk:
+	vhost_cleanup();
+err_init:
+	return r;
+
+}
+module_init(vhost_blk_init);
+
+static void vhost_blk_exit(void)
+{
+	misc_deregister(&vhost_blk_misc);
+	destroy_workqueue(vblk_workqueue);
+	vhost_cleanup();
+}
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("0.0.2");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-26 18:53       ` Eran Rom
@ 2010-04-08 16:17         ` Stefan Hajnoczi
  0 siblings, 0 replies; 48+ messages in thread
From: Stefan Hajnoczi @ 2010-04-08 16:17 UTC (permalink / raw)
  To: Eran Rom; +Cc: kvm

On Fri, Mar 26, 2010 at 6:53 PM, Eran Rom <eranr@il.ibm.com> wrote:
> Christoph Hellwig <hch <at> infradead.org> writes:
>
>
>> Ok.  cache=writeback performance is something I haven't bothered looking
>> at at all.  For cache=none any streaming write or random workload with
>> large enough record sizes got basically the same performance as native
>> using kernel aio, and same for write but slightly degraded for reads
>> using the thread pool.  See my attached JLS presentation for some
>> numbers.
>>
> Looks like the presentation did not make it...

I am interested in the JLS presentation too.  Here is what I found,
hope it's the one you meant, Christoph:

http://events.linuxfoundation.org/images/stories/slides/jls09/jls09_hellwig.odp

Stefan

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-24 17:58           ` Badari Pulavarty
@ 2010-03-24 18:28             ` Michael S. Tsirkin
  0 siblings, 0 replies; 48+ messages in thread
From: Michael S. Tsirkin @ 2010-03-24 18:28 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: qemu-devel, virtualization

On Wed, Mar 24, 2010 at 10:58:50AM -0700, Badari Pulavarty wrote:
> Michael S. Tsirkin wrote:
>> On Tue, Mar 23, 2010 at 12:55:07PM -0700, Badari Pulavarty wrote:
>>   
>>> Michael S. Tsirkin wrote:
>>>     
>>>> On Tue, Mar 23, 2010 at 10:57:33AM -0700, Badari Pulavarty wrote:
>>>>         
>>>>> Michael S. Tsirkin wrote:
>>>>>             
>>>>>> On Mon, Mar 22, 2010 at 05:34:04PM -0700, Badari Pulavarty wrote:
>>>>>>                   
>>>>>>> Write Results:
>>>>>>> ==============
>>>>>>>
>>>>>>> I see degraded IO performance when doing sequential IO write
>>>>>>> tests with vhost-blk compared to virtio-blk.
>>>>>>>
>>>>>>> # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct
>>>>>>>
>>>>>>> I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
>>>>>>> vhost-blk. Wondering why ?
>>>>>>>                         
>>>>>> Try to look and number of interrupts and/or number of exits.
>>>>>>                   
>>>>> I checked interrupts and IO exits - there is no major noticeable  
>>>>>  difference between
>>>>> vhost-blk and virtio-blk scenerios.
>>>>>             
>>>>>> It could also be that you are overrunning some queue.
>>>>>>
>>>>>> I don't see any exit mitigation strategy in your patch:
>>>>>> when there are already lots of requests in a queue, it's usually
>>>>>> a good idea to disable notifications and poll the
>>>>>> queue as requests complete. That could help performance.
>>>>>>                   
>>>>> Do you mean poll eventfd for new requests instead of waiting for 
>>>>> new  notifications ?
>>>>> Where do you do that in vhost-net code ?
>>>>>             
>>>> vhost_disable_notify does this.
>>>>
>>>>         
>>>>> Unlike network socket, since we are dealing with a file, there is 
>>>>> no  ->poll support for it.
>>>>> So I can't poll for the data. And also, Issue I am having is on 
>>>>> the   write() side.
>>>>>             
>>>> Not sure I understand.
>>>>
>>>>         
>>>>> I looked at it some more - I see 512K write requests on the
>>>>> virtio-queue  in both vhost-blk and virtio-blk cases. Both qemu or
>>>>> vhost is doing synchronous  writes to page cache (there is no write
>>>>> batching in qemu that is affecting this  case).  I still puzzled on
>>>>> why virtio-blk outperforms vhost-blk.
>>>>>
>>>>> Thanks,
>>>>> Badari
>>>>>             
>>>> If you say the number of requests is the same, we are left with:
>>>> - requests are smaller for some reason?
>>>> - something is causing retries?
>>>>         
>>> No. IO requests sizes are exactly same (512K) in both cases. There 
>>> are  no retries or
>>> errors in both cases. One thing I am not clear is - for some reason   
>>> guest kernel
>>> could push more data into virtio-ring in case of virtio-blk vs   
>>> vhost-blk. Is this possible ?
>>> Does guest gets to run much sooner in virtio-blk case than vhost-blk 
>>> ?  Sorry, if its dumb question -
>>> I don't understand  all the vhost details :(
>>>
>>> Thanks,
>>> Badari
>>>     
>>
>> BTW, did you put the backend in non-blocking mode?
>> As I said, vhost net passes non-blocking flag to
>> socket backend, but vfs_write/read that you use does
>> not have an option to do this.
>>
>> So we'll need to extend the backend to fix that,
>> but just for demo purposes, you could set non-blocking
>> mode on the file from userspace.
>>
>>   
> Michael,
>
> Atleast I understand why the performance difference now.. My debug
> code is changed the behaviour of virtio-blk which confused me.
>
> 1) virtio-blk is able to batch up writes from various requests.
> 2) virtio-blk offloads the writes to different thread
>
> Where as vhost-blk, I do each request syncrhonously. Hence
> the difference. You are right - i have to make backend async.
> I will working on handing off work to in-kernel threads.
> I am not sure about IO completion handling and calling
> vhost_add_used_and_signal() out of context. But let
> me take a stab at it and ask your help if I run into
> issues.
>
> Thanks,
> Badari
>


The way I did it for vhost net, requests are synchronous
but non-blocking. So if it can't be done directly,
I delay it.

-- 
MST

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-24 11:23         ` Michael S. Tsirkin
@ 2010-03-24 17:58           ` Badari Pulavarty
  2010-03-24 18:28             ` Michael S. Tsirkin
  0 siblings, 1 reply; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-24 17:58 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel, virtualization

Michael S. Tsirkin wrote:
> On Tue, Mar 23, 2010 at 12:55:07PM -0700, Badari Pulavarty wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>> On Tue, Mar 23, 2010 at 10:57:33AM -0700, Badari Pulavarty wrote:
>>>   
>>>       
>>>> Michael S. Tsirkin wrote:
>>>>     
>>>>         
>>>>> On Mon, Mar 22, 2010 at 05:34:04PM -0700, Badari Pulavarty wrote:
>>>>>         
>>>>>           
>>>>>> Write Results:
>>>>>> ==============
>>>>>>
>>>>>> I see degraded IO performance when doing sequential IO write
>>>>>> tests with vhost-blk compared to virtio-blk.
>>>>>>
>>>>>> # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct
>>>>>>
>>>>>> I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
>>>>>> vhost-blk. Wondering why ?
>>>>>>             
>>>>>>             
>>>>> Try to look and number of interrupts and/or number of exits.
>>>>>         
>>>>>           
>>>> I checked interrupts and IO exits - there is no major noticeable   
>>>> difference between
>>>> vhost-blk and virtio-blk scenerios.
>>>>     
>>>>         
>>>>> It could also be that you are overrunning some queue.
>>>>>
>>>>> I don't see any exit mitigation strategy in your patch:
>>>>> when there are already lots of requests in a queue, it's usually
>>>>> a good idea to disable notifications and poll the
>>>>> queue as requests complete. That could help performance.
>>>>>         
>>>>>           
>>>> Do you mean poll eventfd for new requests instead of waiting for new  
>>>> notifications ?
>>>> Where do you do that in vhost-net code ?
>>>>     
>>>>         
>>> vhost_disable_notify does this.
>>>
>>>   
>>>       
>>>> Unlike network socket, since we are dealing with a file, there is no  
>>>> ->poll support for it.
>>>> So I can't poll for the data. And also, Issue I am having is on the   
>>>> write() side.
>>>>     
>>>>         
>>> Not sure I understand.
>>>
>>>   
>>>       
>>>> I looked at it some more - I see 512K write requests on the
>>>> virtio-queue  in both vhost-blk and virtio-blk cases. Both qemu or
>>>> vhost is doing synchronous  writes to page cache (there is no write
>>>> batching in qemu that is affecting this  case).  I still puzzled on
>>>> why virtio-blk outperforms vhost-blk.
>>>>
>>>> Thanks,
>>>> Badari
>>>>     
>>>>         
>>> If you say the number of requests is the same, we are left with:
>>> - requests are smaller for some reason?
>>> - something is causing retries?
>>>   
>>>       
>> No. IO requests sizes are exactly same (512K) in both cases. There are  
>> no retries or
>> errors in both cases. One thing I am not clear is - for some reason  
>> guest kernel
>> could push more data into virtio-ring in case of virtio-blk vs  
>> vhost-blk. Is this possible ?
>> Does guest gets to run much sooner in virtio-blk case than vhost-blk ?  
>> Sorry, if its dumb question -
>> I don't understand  all the vhost details :(
>>
>> Thanks,
>> Badari
>>     
>
> BTW, did you put the backend in non-blocking mode?
> As I said, vhost net passes non-blocking flag to
> socket backend, but vfs_write/read that you use does
> not have an option to do this.
>
> So we'll need to extend the backend to fix that,
> but just for demo purposes, you could set non-blocking
> mode on the file from userspace.
>
>   
Michael,

Atleast I understand why the performance difference now.. My debug
code is changed the behaviour of virtio-blk which confused me.

1) virtio-blk is able to batch up writes from various requests.
2) virtio-blk offloads the writes to different thread

Where as vhost-blk, I do each request syncrhonously. Hence
the difference. You are right - i have to make backend async.
I will working on handing off work to in-kernel threads.
I am not sure about IO completion handling and calling
vhost_add_used_and_signal() out of context. But let
me take a stab at it and ask your help if I run into
issues.

Thanks,
Badari

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23 19:55       ` Badari Pulavarty
  2010-03-24  9:52         ` Michael S. Tsirkin
@ 2010-03-24 11:23         ` Michael S. Tsirkin
  2010-03-24 17:58           ` Badari Pulavarty
  1 sibling, 1 reply; 48+ messages in thread
From: Michael S. Tsirkin @ 2010-03-24 11:23 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: qemu-devel, virtualization

On Tue, Mar 23, 2010 at 12:55:07PM -0700, Badari Pulavarty wrote:
> Michael S. Tsirkin wrote:
>> On Tue, Mar 23, 2010 at 10:57:33AM -0700, Badari Pulavarty wrote:
>>   
>>> Michael S. Tsirkin wrote:
>>>     
>>>> On Mon, Mar 22, 2010 at 05:34:04PM -0700, Badari Pulavarty wrote:
>>>>         
>>>>> Write Results:
>>>>> ==============
>>>>>
>>>>> I see degraded IO performance when doing sequential IO write
>>>>> tests with vhost-blk compared to virtio-blk.
>>>>>
>>>>> # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct
>>>>>
>>>>> I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
>>>>> vhost-blk. Wondering why ?
>>>>>             
>>>> Try to look and number of interrupts and/or number of exits.
>>>>         
>>> I checked interrupts and IO exits - there is no major noticeable   
>>> difference between
>>> vhost-blk and virtio-blk scenerios.
>>>     
>>>> It could also be that you are overrunning some queue.
>>>>
>>>> I don't see any exit mitigation strategy in your patch:
>>>> when there are already lots of requests in a queue, it's usually
>>>> a good idea to disable notifications and poll the
>>>> queue as requests complete. That could help performance.
>>>>         
>>> Do you mean poll eventfd for new requests instead of waiting for new  
>>> notifications ?
>>> Where do you do that in vhost-net code ?
>>>     
>>
>> vhost_disable_notify does this.
>>
>>   
>>> Unlike network socket, since we are dealing with a file, there is no  
>>> ->poll support for it.
>>> So I can't poll for the data. And also, Issue I am having is on the   
>>> write() side.
>>>     
>>
>> Not sure I understand.
>>
>>   
>>> I looked at it some more - I see 512K write requests on the
>>> virtio-queue  in both vhost-blk and virtio-blk cases. Both qemu or
>>> vhost is doing synchronous  writes to page cache (there is no write
>>> batching in qemu that is affecting this  case).  I still puzzled on
>>> why virtio-blk outperforms vhost-blk.
>>>
>>> Thanks,
>>> Badari
>>>     
>>
>> If you say the number of requests is the same, we are left with:
>> - requests are smaller for some reason?
>> - something is causing retries?
>>   
> No. IO requests sizes are exactly same (512K) in both cases. There are  
> no retries or
> errors in both cases. One thing I am not clear is - for some reason  
> guest kernel
> could push more data into virtio-ring in case of virtio-blk vs  
> vhost-blk. Is this possible ?
> Does guest gets to run much sooner in virtio-blk case than vhost-blk ?  
> Sorry, if its dumb question -
> I don't understand  all the vhost details :(
>
> Thanks,
> Badari

BTW, did you put the backend in non-blocking mode?
As I said, vhost net passes non-blocking flag to
socket backend, but vfs_write/read that you use does
not have an option to do this.

So we'll need to extend the backend to fix that,
but just for demo purposes, you could set non-blocking
mode on the file from userspace.

-- 
MST

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23 19:55       ` Badari Pulavarty
@ 2010-03-24  9:52         ` Michael S. Tsirkin
  2010-03-24 11:23         ` Michael S. Tsirkin
  1 sibling, 0 replies; 48+ messages in thread
From: Michael S. Tsirkin @ 2010-03-24  9:52 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: qemu-devel, virtualization

On Tue, Mar 23, 2010 at 12:55:07PM -0700, Badari Pulavarty wrote:
> Michael S. Tsirkin wrote:
>> On Tue, Mar 23, 2010 at 10:57:33AM -0700, Badari Pulavarty wrote:
>>   
>>> Michael S. Tsirkin wrote:
>>>     
>>>> On Mon, Mar 22, 2010 at 05:34:04PM -0700, Badari Pulavarty wrote:
>>>>         
>>>>> Write Results:
>>>>> ==============
>>>>>
>>>>> I see degraded IO performance when doing sequential IO write
>>>>> tests with vhost-blk compared to virtio-blk.
>>>>>
>>>>> # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct
>>>>>
>>>>> I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
>>>>> vhost-blk. Wondering why ?
>>>>>             
>>>> Try to look and number of interrupts and/or number of exits.
>>>>         
>>> I checked interrupts and IO exits - there is no major noticeable   
>>> difference between
>>> vhost-blk and virtio-blk scenerios.
>>>     
>>>> It could also be that you are overrunning some queue.
>>>>
>>>> I don't see any exit mitigation strategy in your patch:
>>>> when there are already lots of requests in a queue, it's usually
>>>> a good idea to disable notifications and poll the
>>>> queue as requests complete. That could help performance.
>>>>         
>>> Do you mean poll eventfd for new requests instead of waiting for new  
>>> notifications ?
>>> Where do you do that in vhost-net code ?
>>>     
>>
>> vhost_disable_notify does this.
>>
>>   
>>> Unlike network socket, since we are dealing with a file, there is no  
>>> ->poll support for it.
>>> So I can't poll for the data. And also, Issue I am having is on the   
>>> write() side.
>>>     
>>
>> Not sure I understand.
>>
>>   
>>> I looked at it some more - I see 512K write requests on the
>>> virtio-queue  in both vhost-blk and virtio-blk cases. Both qemu or
>>> vhost is doing synchronous  writes to page cache (there is no write
>>> batching in qemu that is affecting this  case).  I still puzzled on
>>> why virtio-blk outperforms vhost-blk.
>>>
>>> Thanks,
>>> Badari
>>>     
>>
>> If you say the number of requests is the same, we are left with:
>> - requests are smaller for some reason?
>> - something is causing retries?
>>   
> No. IO requests sizes are exactly same (512K) in both cases. There are  
> no retries or
> errors in both cases. One thing I am not clear is - for some reason  
> guest kernel
> could push more data into virtio-ring in case of virtio-blk vs  
> vhost-blk. Is this possible ?
> Does guest gets to run much sooner in virtio-blk case than vhost-blk ?  
> Sorry, if its dumb question -
> I don't understand  all the vhost details :(
>
> Thanks,
> Badari
>

You said you observed same number of requests in userspace versus kernel above.
And request size is the same as well. But somehow more data is
transferred? I'm confused.

-- 
MST

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23 18:06     ` Michael S. Tsirkin
@ 2010-03-23 19:55       ` Badari Pulavarty
  2010-03-24  9:52         ` Michael S. Tsirkin
  2010-03-24 11:23         ` Michael S. Tsirkin
  0 siblings, 2 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-23 19:55 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel, virtualization

Michael S. Tsirkin wrote:
> On Tue, Mar 23, 2010 at 10:57:33AM -0700, Badari Pulavarty wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>> On Mon, Mar 22, 2010 at 05:34:04PM -0700, Badari Pulavarty wrote:
>>>   
>>>       
>>>> Write Results:
>>>> ==============
>>>>
>>>> I see degraded IO performance when doing sequential IO write
>>>> tests with vhost-blk compared to virtio-blk.
>>>>
>>>> # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct
>>>>
>>>> I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
>>>> vhost-blk. Wondering why ?
>>>>     
>>>>         
>>> Try to look and number of interrupts and/or number of exits.
>>>   
>>>       
>> I checked interrupts and IO exits - there is no major noticeable  
>> difference between
>> vhost-blk and virtio-blk scenerios.
>>     
>>> It could also be that you are overrunning some queue.
>>>
>>> I don't see any exit mitigation strategy in your patch:
>>> when there are already lots of requests in a queue, it's usually
>>> a good idea to disable notifications and poll the
>>> queue as requests complete. That could help performance.
>>>   
>>>       
>> Do you mean poll eventfd for new requests instead of waiting for new  
>> notifications ?
>> Where do you do that in vhost-net code ?
>>     
>
> vhost_disable_notify does this.
>
>   
>> Unlike network socket, since we are dealing with a file, there is no  
>> ->poll support for it.
>> So I can't poll for the data. And also, Issue I am having is on the  
>> write() side.
>>     
>
> Not sure I understand.
>
>   
>> I looked at it some more - I see 512K write requests on the
>> virtio-queue  in both vhost-blk and virtio-blk cases. Both qemu or
>> vhost is doing synchronous  writes to page cache (there is no write
>> batching in qemu that is affecting this  case).  I still puzzled on
>> why virtio-blk outperforms vhost-blk.
>>
>> Thanks,
>> Badari
>>     
>
> If you say the number of requests is the same, we are left with:
> - requests are smaller for some reason?
> - something is causing retries?
>   
No. IO requests sizes are exactly same (512K) in both cases. There are 
no retries or
errors in both cases. One thing I am not clear is - for some reason 
guest kernel
could push more data into virtio-ring in case of virtio-blk vs 
vhost-blk. Is this possible ?
Does guest gets to run much sooner in virtio-blk case than vhost-blk ? 
Sorry, if its dumb question -
I don't understand  all the vhost details :(

Thanks,
Badari

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23 17:57   ` Badari Pulavarty
@ 2010-03-23 18:06     ` Michael S. Tsirkin
  2010-03-23 19:55       ` Badari Pulavarty
  0 siblings, 1 reply; 48+ messages in thread
From: Michael S. Tsirkin @ 2010-03-23 18:06 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Discussion of kvm memory usage, qemu-devel, virtualization

On Tue, Mar 23, 2010 at 10:57:33AM -0700, Badari Pulavarty wrote:
> Michael S. Tsirkin wrote:
>> On Mon, Mar 22, 2010 at 05:34:04PM -0700, Badari Pulavarty wrote:
>>   
>>> Write Results:
>>> ==============
>>>
>>> I see degraded IO performance when doing sequential IO write
>>> tests with vhost-blk compared to virtio-blk.
>>>
>>> # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct
>>>
>>> I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
>>> vhost-blk. Wondering why ?
>>>     
>>
>> Try to look and number of interrupts and/or number of exits.
>>   
>
> I checked interrupts and IO exits - there is no major noticeable  
> difference between
> vhost-blk and virtio-blk scenerios.
>> It could also be that you are overrunning some queue.
>>
>> I don't see any exit mitigation strategy in your patch:
>> when there are already lots of requests in a queue, it's usually
>> a good idea to disable notifications and poll the
>> queue as requests complete. That could help performance.
>>   
> Do you mean poll eventfd for new requests instead of waiting for new  
> notifications ?
> Where do you do that in vhost-net code ?

vhost_disable_notify does this.

> Unlike network socket, since we are dealing with a file, there is no  
> ->poll support for it.
> So I can't poll for the data. And also, Issue I am having is on the  
> write() side.

Not sure I understand.

> I looked at it some more - I see 512K write requests on the
> virtio-queue  in both vhost-blk and virtio-blk cases. Both qemu or
> vhost is doing synchronous  writes to page cache (there is no write
> batching in qemu that is affecting this  case).  I still puzzled on
> why virtio-blk outperforms vhost-blk.
>
> Thanks,
> Badari

If you say the number of requests is the same, we are left with:
- requests are smaller for some reason?
- something is causing retries?

-- 
MST

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23 12:39 ` Michael S. Tsirkin
  2010-03-23 14:47   ` Badari Pulavarty
@ 2010-03-23 17:57   ` Badari Pulavarty
  2010-03-23 18:06     ` Michael S. Tsirkin
  1 sibling, 1 reply; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-23 17:57 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Discussion of kvm memory usage, qemu-devel, virtualization

Michael S. Tsirkin wrote:
> On Mon, Mar 22, 2010 at 05:34:04PM -0700, Badari Pulavarty wrote:
>   
>> Write Results:
>> ==============
>>
>> I see degraded IO performance when doing sequential IO write
>> tests with vhost-blk compared to virtio-blk.
>>
>> # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct
>>
>> I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
>> vhost-blk. Wondering why ?
>>     
>
> Try to look and number of interrupts and/or number of exits.
>   

I checked interrupts and IO exits - there is no major noticeable 
difference between
vhost-blk and virtio-blk scenerios.
> It could also be that you are overrunning some queue.
>
> I don't see any exit mitigation strategy in your patch:
> when there are already lots of requests in a queue, it's usually
> a good idea to disable notifications and poll the
> queue as requests complete. That could help performance.
>   
Do you mean poll eventfd for new requests instead of waiting for new 
notifications ?
Where do you do that in vhost-net code ?

Unlike network socket, since we are dealing with a file, there is no 
->poll support for it.
So I can't poll for the data. And also, Issue I am having is on the 
write() side.

I looked at it some more - I see 512K write requests on the virtio-queue 
in both
vhost-blk and virtio-blk cases. Both qemu or vhost is doing synchronous 
writes
to page cache (there is no write batching in qemu that is affecting this 
case).
I still puzzled on why virtio-blk outperforms vhost-blk.

Thanks,
Badari

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23 12:39 ` Michael S. Tsirkin
@ 2010-03-23 14:47   ` Badari Pulavarty
  2010-03-23 17:57   ` Badari Pulavarty
  1 sibling, 0 replies; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-23 14:47 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel, virtualization

Michael S. Tsirkin wrote:
> On Mon, Mar 22, 2010 at 05:34:04PM -0700, Badari Pulavarty wrote:
>   
>> Write Results:
>> ==============
>>
>> I see degraded IO performance when doing sequential IO write
>> tests with vhost-blk compared to virtio-blk.
>>
>> # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct
>>
>> I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
>> vhost-blk. Wondering why ?
>>     
>
> Try to look and number of interrupts and/or number of exits.
>
> It could also be that you are overrunning some queue.
>   

Yeah.
> I don't see any exit mitigation strategy in your patch:
> when there are already lots of requests in a queue, it's usually
> a good idea to disable notifications and poll the
> queue as requests complete. That could help performance.
>   

Thanks for the suggestions. I will take a closer look.

Thanks,
Badari

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] vhost-blk implementation
  2010-03-23  0:34 Badari Pulavarty
@ 2010-03-23 12:39 ` Michael S. Tsirkin
  2010-03-23 14:47   ` Badari Pulavarty
  2010-03-23 17:57   ` Badari Pulavarty
  0 siblings, 2 replies; 48+ messages in thread
From: Michael S. Tsirkin @ 2010-03-23 12:39 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: qemu-devel, virtualization

On Mon, Mar 22, 2010 at 05:34:04PM -0700, Badari Pulavarty wrote:
> Write Results:
> ==============
> 
> I see degraded IO performance when doing sequential IO write
> tests with vhost-blk compared to virtio-blk.
> 
> # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct
> 
> I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
> vhost-blk. Wondering why ?

Try to look and number of interrupts and/or number of exits.

It could also be that you are overrunning some queue.

I don't see any exit mitigation strategy in your patch:
when there are already lots of requests in a queue, it's usually
a good idea to disable notifications and poll the
queue as requests complete. That could help performance.


-- 
MST

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [RFC] vhost-blk implementation
@ 2010-03-23  0:34 Badari Pulavarty
  2010-03-23 12:39 ` Michael S. Tsirkin
  0 siblings, 1 reply; 48+ messages in thread
From: Badari Pulavarty @ 2010-03-23  0:34 UTC (permalink / raw)
  To: virtualization, qemu-devel

Hi,

Inspired by vhost-net implementation, I did initial prototype 
of vhost-blk to see if it provides any benefits over QEMU virtio-blk.
I haven't handled all the error cases, fixed naming conventions etc.,
but the implementation is stable to play with. I tried not to deviate
from vhost-net implementation where possible.

NOTE:  Only change I had to make to vhost core code is to 
increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h 

Performance:
=============

I have done simple tests to see how it performs. I got very
encouraging results on sequential read tests. But on sequential
write tests, I see degrade over virtio-blk. I can't figure out and
explain why. Can some one shed light on whats happening here ?

Read Results:
=============
Test does read of 84GB file from the host (through virtio). I unmount
and mount the filesystem on the host to make sure there is nothing
in the page cache..


with vhost-blk:
----------------

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
640000+0 records in
640000+0 records out
83886080000 bytes (84 GB) copied, 126.135 seconds, 665 MB/s

real    2m6.137s
user    0m0.281s
sys     0m14.725s

without vhost-blk: (virtio)
---------------------------

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
640000+0 records in
640000+0 records out
83886080000 bytes (84 GB) copied, 275.466 seconds, 305 MB/s

real    4m35.468s
user    0m0.373s
sys     0m48.074s



Write Results:
==============

I see degraded IO performance when doing sequential IO write
tests with vhost-blk compared to virtio-blk.

# time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct

I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
vhost-blk. Wondering why ?

Comments/flames ? 

Thanks,
Badari


vhost-blk is in-kernel accelerator for virtio-blk. 
At this time, this is a prototype based on virtio-net.
Lots of error handling and clean up needs to be done.
Read performance is pretty good over QEMU virtio-blk, but
write performance is not anywhere close to QEMU virtio-blk.
Why ?

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
---
 drivers/vhost/blk.c |  242 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 242 insertions(+)

Index: net-next/drivers/vhost/blk.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ net-next/drivers/vhost/blk.c	2010-03-22 18:07:18.156584400 -0400
@@ -0,0 +1,242 @@
+ /*
+  * virtio-block server in host kernel.
+  * Inspired by vhost-net and shamlessly ripped code from it :)
+  */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+static int do_handle_io(struct file *file, uint32_t type, uint64_t sector,
+			struct iovec *iov, int in)
+{
+	loff_t pos = sector << 8;
+	int ret = 0;
+
+	if (type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(file, file->f_path.dentry, 1);
+	} else if (type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(file, iov, in, &pos);
+	} else {
+		ret = vfs_readv(file, iov, in, &pos);
+	}
+	return ret;
+}
+
+static void handle_blk(struct vhost_blk *blk)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	unsigned head, out, in;
+	struct virtio_blk_outhdr hdr;
+	int r, nvecs;
+	uint8_t status = 0;
+
+	use_mm(blk->dev.mm);
+	mutex_lock(&vq->mutex);
+
+	vhost_disable_notify(vq);
+
+	for (;;) {
+		head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+					 ARRAY_SIZE(vq->iov),
+					 &out, &in, NULL, NULL);
+		if (head == vq->num) {
+			if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
+				continue;
+			}
+			break;
+		}
+
+		BUG_ON(vq->iov[0].iov_len != 16);
+
+		r = copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr);
+		if (r < 0) {
+			printk("copy from user failed\n");
+			vhost_discard_vq_desc(vq);
+			break;
+		}
+
+		nvecs = out - 1;
+		if (hdr.type == VIRTIO_BLK_T_IN)
+			nvecs = in - 1;
+
+		r = do_handle_io(vq->private_data, hdr.type, hdr.sector, &vq->iov[1], nvecs);
+		status = (r < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+
+		nvecs++;
+		BUG_ON(vq->iov[nvecs].iov_len != 1);
+
+		if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status) < 0) {
+			printk("copy to user failed\n");
+			vhost_discard_vq_desc(vq);
+			break;
+		}
+		vhost_add_used_and_signal(&blk->dev, vq, head, r);
+	}
+        mutex_unlock(&vq->mutex);
+        unuse_mm(blk->dev.mm);
+}
+
+static void vhost_blk_flush(struct vhost_blk *n)
+{
+	vhost_poll_flush(n->poll);
+	vhost_poll_flush(&n->dev.vqs[0].poll);
+}
+
+static void handle_blk_kick(struct work_struct *work)
+{
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	vq = container_of(work, struct vhost_virtqueue, poll.work);
+	blk = container_of(vq->dev, struct vhost_blk, dev);
+	handle_blk(blk);
+}
+
+static void handle_rq_blk(struct work_struct *work)
+{
+	struct vhost_blk *blk;
+	blk = container_of(work, struct vhost_blk, poll[0].work);
+	handle_blk(blk);
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = kmalloc(sizeof *n, GFP_KERNEL);
+	int r;
+	if (!n)
+		return -ENOMEM;
+	n->vqs[0].handle_kick = handle_blk_kick;
+	r = vhost_dev_init(&n->dev, n->vqs, VHOST_BLK_VQ_MAX);
+	if (r < 0) {
+		kfree(n);
+		return r;
+	}
+
+	vhost_poll_init(n->poll, handle_rq_blk, POLLOUT|POLLIN);
+	f->private_data = n;
+	return 0;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = f->private_data;
+
+	fput(n->vqs->private_data);
+	kfree(n);
+	return 0;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
+{
+	struct file *file;
+	struct vhost_virtqueue *vq;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	vq = n->vqs + index;
+	mutex_lock(&vq->mutex);
+	rcu_assign_pointer(vq->private_data, file);
+	mutex_unlock(&vq->mutex);
+	return 0;
+}
+
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+                            unsigned long arg)
+{
+	struct vhost_blk *n = f->private_data;
+	void __user *argp = (void __user *)arg;
+	struct vhost_vring_file backend;
+	int r;
+
+	switch (ioctl) {
+        case VHOST_NET_SET_BACKEND:
+                r = copy_from_user(&backend, argp, sizeof backend);
+                if (r < 0)
+                        return r;
+                return vhost_blk_set_backend(n, backend.index, backend.fd);
+	default:
+		mutex_lock(&n->dev.mutex);
+		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+		vhost_blk_flush(n);
+		mutex_unlock(&n->dev.mutex);
+		return r;
+	}
+}
+
+const static struct file_operations vhost_blk_fops = {
+	.owner          = THIS_MODULE,
+	.release        = vhost_blk_release,
+	.open           = vhost_blk_open,
+	.unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+	234,
+	"vhost-blk",
+	&vhost_blk_fops,
+};
+
+int vhost_blk_init(void)
+{
+	int r = vhost_init();
+	if (r)
+		goto err_init;
+	r = misc_register(&vhost_blk_misc);
+	if (r)
+		goto err_reg;
+	return 0;
+err_reg:
+	vhost_cleanup();
+err_init:
+	return r;
+
+}
+module_init(vhost_blk_init);
+
+void vhost_blk_exit(void)
+{
+	misc_deregister(&vhost_blk_misc);
+	vhost_cleanup();
+}
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("0.0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2010-04-08 16:17 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-03-23  1:00 [RFC] vhost-blk implementation Badari Pulavarty
2010-03-23  1:16 ` Anthony Liguori
2010-03-23  1:45   ` Badari Pulavarty
2010-03-23  2:00     ` Anthony Liguori
2010-03-23  2:50       ` Badari Pulavarty
2010-03-23 10:05         ` Avi Kivity
2010-03-23 14:48           ` Badari Pulavarty
2010-03-23 10:03 ` Avi Kivity
2010-03-23 14:55   ` Badari Pulavarty
2010-03-23 16:53     ` Avi Kivity
2010-03-24 20:05   ` Christoph Hellwig
2010-03-25  6:29     ` Avi Kivity
2010-03-25 15:48       ` Christoph Hellwig
2010-03-25 15:51         ` Avi Kivity
2010-03-25 15:00     ` Asdo
2010-04-05 19:59       ` Christoph Hellwig
2010-04-07  0:36         ` [RFC] vhost-blk implementation (v2) Badari Pulavarty
2010-04-07  0:36           ` [Qemu-devel] " Badari Pulavarty
2010-04-07  0:36         ` Badari Pulavarty
2010-03-23 10:09 ` [RFC] vhost-blk implementation Eran Rom
2010-03-24 20:04 ` Christoph Hellwig
2010-03-24 20:22   ` Badari Pulavarty
2010-03-25  7:57     ` Avi Kivity
2010-03-25 14:36       ` Badari Pulavarty
2010-03-25 15:57     ` Christoph Hellwig
2010-03-26 18:53       ` Eran Rom
2010-04-08 16:17         ` Stefan Hajnoczi
2010-04-05 19:23     ` Christoph Hellwig
2010-04-05 23:17       ` Badari Pulavarty
2010-03-24 20:27   ` Badari Pulavarty
2010-03-29 15:41   ` Badari Pulavarty
2010-03-29 18:20     ` Chris Wright
2010-03-29 20:37       ` Avi Kivity
2010-03-29 22:51         ` Badari Pulavarty
2010-03-29 23:56           ` Chris Wright
2010-03-30 12:43           ` Avi Kivity
2010-04-05 14:22     ` Stefan Hajnoczi
2010-04-06  2:27       ` Badari Pulavarty
  -- strict thread matches above, loose matches on Subject: below --
2010-03-23  0:34 Badari Pulavarty
2010-03-23 12:39 ` Michael S. Tsirkin
2010-03-23 14:47   ` Badari Pulavarty
2010-03-23 17:57   ` Badari Pulavarty
2010-03-23 18:06     ` Michael S. Tsirkin
2010-03-23 19:55       ` Badari Pulavarty
2010-03-24  9:52         ` Michael S. Tsirkin
2010-03-24 11:23         ` Michael S. Tsirkin
2010-03-24 17:58           ` Badari Pulavarty
2010-03-24 18:28             ` Michael S. Tsirkin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.