linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH V8 4/4 net-next] vhost: vhost TX zero-copy support
@ 2011-07-06 22:28 Shirley Ma
  2011-07-11 22:04 ` [PATCH RFC] vhost: address fixme in " Michael S. Tsirkin
  0 siblings, 1 reply; 3+ messages in thread
From: Shirley Ma @ 2011-07-06 22:28 UTC (permalink / raw)
  To: David Miller, mst; +Cc: netdev, kvm, linux-kernel

This patch maintains the outstanding userspace buffers in the 
sequence they are delivered to vhost. The outstanding userspace buffers 
will be marked as done once the lower device's buffer DMA has finished. 
This is monitored through the last-reference kfree_skb callback. Two
buffer indexes are used for this purpose.

The vhost passes the userspace buffer info to the lower device skb 
through message control. Since there may be some completed DMAs when
entering vhost handle_tx, the worst case is that all buffers in the vq
are in pending/done status, so we need to notify the guest to release 
DMA-done buffers first before getting any new buffers from the vq.

The busywait is waiting for fix in clean up and ring changes.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 drivers/vhost/net.c   |   45 ++++++++++++++++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |   15 +++++++++++++++
 3 files changed, 107 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e224a92..7de0c6e 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,10 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+/* MAX number of TX used buffers for outstanding zerocopy */
+#define VHOST_MAX_PEND 128
+#define VHOST_GOODCOPY_LEN 256
+
 enum {
 	VHOST_NET_VQ_RX = 0,
 	VHOST_NET_VQ_TX = 1,
@@ -151,6 +155,10 @@ static void handle_tx(struct vhost_net *net)
 	hdr_size = vq->vhost_hlen;
 
 	for (;;) {
+		/* Release DMAs done buffers first */
+		if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND)
+			vhost_zerocopy_signal_used(vq);
+
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
 					 &out, &in,
@@ -166,6 +174,12 @@ static void handle_tx(struct vhost_net *net)
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
+			/* If more outstanding DMAs, queue the work */
+			if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND) {
+				tx_poll_start(net, sock);
+				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+				break;
+			}
 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				vhost_disable_notify(&net->dev, vq);
 				continue;
@@ -188,6 +202,26 @@ static void handle_tx(struct vhost_net *net)
 			       iov_length(vq->hdr, s), hdr_size);
 			break;
 		}
+		/* use msg_control to pass vhost zerocopy ubuf info to skb */
+		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
+			vq->heads[vq->upend_idx].id = head;
+			if (len < VHOST_GOODCOPY_LEN)
+				/* copy don't need to wait for DMA done */
+				vq->heads[vq->upend_idx].len =
+							VHOST_DMA_DONE_LEN;
+			else {
+				struct ubuf_info *ubuf = &vq->ubuf_info[head];
+
+				vq->heads[vq->upend_idx].len = len;
+				ubuf->callback = vhost_zerocopy_callback;
+				ubuf->arg = vq;
+				ubuf->desc = vq->upend_idx;
+				msg.msg_control = ubuf;
+				msg.msg_controllen = sizeof(ubuf);
+			}
+			atomic_inc(&vq->refcnt);
+			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
@@ -198,12 +232,21 @@ static void handle_tx(struct vhost_net *net)
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+			vhost_add_used_and_signal(&net->dev, vq, head, 0);
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
 			break;
 		}
+		/* if upend_idx is full, then wait for free more */
+/*
+		if (unlikely(vq->upend_idx == vq->done_idx)) {
+			tx_poll_start(net, sock);
+			set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+				break;
+		}
+*/
 	}
 
 	mutex_unlock(&vq->mutex);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ea966b3..db242b1 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -179,6 +179,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	vq->upend_idx = 0;
+	vq->done_idx = 0;
+	atomic_set(&vq->refcnt, 0);
 }
 
 static int vhost_worker(void *data)
@@ -237,6 +240,8 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 					  GFP_KERNEL);
 		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
 					    UIO_MAXIOV, GFP_KERNEL);
+		dev->vqs[i].ubuf_info = kmalloc(sizeof *dev->vqs[i].ubuf_info *
+					    UIO_MAXIOV, GFP_KERNEL);
 
 		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
 			!dev->vqs[i].heads)
@@ -249,6 +254,7 @@ err_nomem:
 		kfree(dev->vqs[i].indirect);
 		kfree(dev->vqs[i].log);
 		kfree(dev->vqs[i].heads);
+		kfree(dev->vqs[i].ubuf_info);
 	}
 	return -ENOMEM;
 }
@@ -390,6 +396,30 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
 	return 0;
 }
 
+/* In case of DMA done not in order in lower device driver for some reason.
+ * upend_idx is used to track end of used idx, done_idx is used to track head
+ * of used idx. Once lower device DMA done contiguously, we will signal KVM
+ * guest used idx.
+ */
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
+{
+	int i, j = 0;
+
+	for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
+		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN)) {
+			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+			vhost_add_used_and_signal(vq->dev, vq,
+						  vq->heads[i].id, 0);
+			++j;
+		} else
+			break;
+	}
+	if (j) {
+		vq->done_idx = i;
+		atomic_sub(j, &vq->refcnt);
+	}
+}
+
 /* Caller should have device mutex */
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
@@ -400,6 +430,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 			vhost_poll_stop(&dev->vqs[i].poll);
 			vhost_poll_flush(&dev->vqs[i].poll);
 		}
+		/* Wait for all lower device DMAs done (busywait FIXME) */
+		while (atomic_read(&dev->vqs[i].refcnt))
+			vhost_zerocopy_signal_used(&dev->vqs[i]);
 		if (dev->vqs[i].error_ctx)
 			eventfd_ctx_put(dev->vqs[i].error_ctx);
 		if (dev->vqs[i].error)
@@ -612,6 +645,11 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 
 	mutex_lock(&vq->mutex);
 
+	/* clean up lower device outstanding DMAs, before setting ring
+	   busywait FIXME */
+	while (atomic_read(&vq->refcnt))
+		vhost_zerocopy_signal_used(vq);
+
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		/* Resizing ring with an active backend?
@@ -1486,3 +1524,13 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 			       &vq->used->flags, r);
 	}
 }
+
+void vhost_zerocopy_callback(void *arg)
+{
+	struct ubuf_info *ubuf = (struct ubuf_info *)arg;
+	struct vhost_virtqueue *vq;
+
+	vq = (struct vhost_virtqueue *)ubuf->arg;
+	/* set len = 1 to mark this desc buffers done DMA */
+	vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8e03379..883688c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -13,6 +13,11 @@
 #include <linux/virtio_ring.h>
 #include <asm/atomic.h>
 
+/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
+ * done */
+#define VHOST_DMA_DONE_LEN	1
+#define VHOST_DMA_CLEAR_LEN	0
+
 struct vhost_device;
 
 struct vhost_work;
@@ -114,6 +119,14 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	/* vhost zerocopy support */
+	atomic_t refcnt; /* num of outstanding zerocopy DMAs */
+	/* last used idx for outstanding DMA zerocopy buffers */
+	int upend_idx;
+	/* first used idx for DMA done zerocopy buffers */
+	int done_idx;
+	/* an array of userspace buffers info */
+	struct ubuf_info *ubuf_info;
 };
 
 struct vhost_dev {
@@ -160,6 +173,8 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
+void vhost_zerocopy_callback(void *arg);
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \



^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH RFC] vhost: address fixme in vhost TX zero-copy support
  2011-07-06 22:28 [PATCH V8 4/4 net-next] vhost: vhost TX zero-copy support Shirley Ma
@ 2011-07-11 22:04 ` Michael S. Tsirkin
  2011-07-12  0:37   ` Shirley Ma
  0 siblings, 1 reply; 3+ messages in thread
From: Michael S. Tsirkin @ 2011-07-11 22:04 UTC (permalink / raw)
  To: Shirley Ma; +Cc: David Miller, netdev, kvm, linux-kernel

So the following should do it, on top of Shirley's patch, I think.  I'm
a bit unsure about using vq->upend_idx - vq->done_idx to check the
number of outstanding DMAs, Shirley, what do you think?
Untested.

I'm also thinking about making the use of this conditional
on a module parameter, off by default to reduce
stability risk while still enabling more people to
test the feature.
Thoughts?

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7de0c6e..cf8deb3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -156,8 +156,7 @@ static void handle_tx(struct vhost_net *net)
 
 	for (;;) {
 		/* Release DMAs done buffers first */
-		if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND)
-			vhost_zerocopy_signal_used(vq);
+		vhost_zerocopy_signal_used(vq);
 
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
@@ -175,7 +174,7 @@ static void handle_tx(struct vhost_net *net)
 				break;
 			}
 			/* If more outstanding DMAs, queue the work */
-			if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND) {
+			if (vq->upend_idx - vq->done_idx > VHOST_MAX_PEND) {
 				tx_poll_start(net, sock);
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
@@ -214,12 +213,12 @@ static void handle_tx(struct vhost_net *net)
 
 				vq->heads[vq->upend_idx].len = len;
 				ubuf->callback = vhost_zerocopy_callback;
-				ubuf->arg = vq;
+				ubuf->arg = vq->ubufs;
 				ubuf->desc = vq->upend_idx;
 				msg.msg_control = ubuf;
 				msg.msg_controllen = sizeof(ubuf);
+				kref_get(&vq->ubufs->kref);
 			}
-			atomic_inc(&vq->refcnt);
 			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
 		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
@@ -646,6 +645,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 {
 	struct socket *sock, *oldsock;
 	struct vhost_virtqueue *vq;
+	struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
 	int r;
 
 	mutex_lock(&n->dev.mutex);
@@ -675,6 +675,13 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	oldsock = rcu_dereference_protected(vq->private_data,
 					    lockdep_is_held(&vq->mutex));
 	if (sock != oldsock) {
+		ubufs = vhost_ubuf_alloc(vq, sock);
+		if (IS_ERR(ubufs)) {
+			r = PTR_ERR(ubufs);
+			goto err_ubufs;
+		}
+		oldubufs = vq->ubufs;
+		vq->ubufs = ubufs;
 		vhost_net_disable_vq(n, vq);
 		rcu_assign_pointer(vq->private_data, sock);
 		vhost_net_enable_vq(n, vq);
@@ -682,6 +689,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 
 	mutex_unlock(&vq->mutex);
 
+	if (oldubufs)
+		vhost_ubuf_put_and_wait(oldubufs);
+
 	if (oldsock) {
 		vhost_net_flush_vq(n, index);
 		fput(oldsock->file);
@@ -690,6 +700,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	mutex_unlock(&n->dev.mutex);
 	return 0;
 
+err_ubufs:
+	fput(sock);
 err_vq:
 	mutex_unlock(&vq->mutex);
 err:
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index db242b1..81b1dd7 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -181,7 +181,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->log_ctx = NULL;
 	vq->upend_idx = 0;
 	vq->done_idx = 0;
-	atomic_set(&vq->refcnt, 0);
+	vq->ubufs = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -401,7 +401,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
  * of used idx. Once lower device DMA done contiguously, we will signal KVM
  * guest used idx.
  */
-void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
+int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
 {
 	int i, j = 0;
 
@@ -414,10 +414,9 @@ void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
 		} else
 			break;
 	}
-	if (j) {
+	if (j)
 		vq->done_idx = i;
-		atomic_sub(j, &vq->refcnt);
-	}
+	return j;
 }
 
 /* Caller should have device mutex */
@@ -430,9 +429,13 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 			vhost_poll_stop(&dev->vqs[i].poll);
 			vhost_poll_flush(&dev->vqs[i].poll);
 		}
-		/* Wait for all lower device DMAs done (busywait FIXME) */
-		while (atomic_read(&dev->vqs[i].refcnt))
-			vhost_zerocopy_signal_used(&dev->vqs[i]);
+		/* Wait for all lower device DMAs done. */
+		if (dev->vqs[i].ubufs)
+			vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
+
+		/* Signal guest as appropriate. */
+		vhost_zerocopy_signal_used(&dev->vqs[i]);
+
 		if (dev->vqs[i].error_ctx)
 			eventfd_ctx_put(dev->vqs[i].error_ctx);
 		if (dev->vqs[i].error)
@@ -645,11 +648,6 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 
 	mutex_lock(&vq->mutex);
 
-	/* clean up lower device outstanding DMAs, before setting ring
-	   busywait FIXME */
-	while (atomic_read(&vq->refcnt))
-		vhost_zerocopy_signal_used(vq);
-
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		/* Resizing ring with an active backend?
@@ -1525,12 +1523,46 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	}
 }
 
+static void vhost_zerocopy_done_signal(struct kref *kref)
+{
+	struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
+						    kref);
+	wake_up(&ubufs->wait);
+}
+
+struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
+					void * private_data)
+{
+	struct vhost_ubuf_ref *ubufs;
+	/* No backend? Nothing to count. */
+	if (!private_data)
+		return NULL;
+	ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL);
+	if (!ubufs)
+		return ERR_PTR(-ENOMEM);
+	kref_init(&ubufs->kref);
+	kref_get(&ubufs->kref);
+	init_waitqueue_head(&ubufs->wait);
+	ubufs->vq = vq;
+	return ubufs;
+}
+
+void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
+{
+	kref_put(&ubufs->kref, vhost_zerocopy_done_signal); 
+	wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
+	kfree(ubufs);
+}
+
 void vhost_zerocopy_callback(void *arg)
 {
 	struct ubuf_info *ubuf = (struct ubuf_info *)arg;
+	struct vhost_ubuf_ref *ubufs;
 	struct vhost_virtqueue *vq;
 
-	vq = (struct vhost_virtqueue *)ubuf->arg;
+	ubufs = ubuf->arg;
+	vq = ubufs->vq;
 	/* set len = 1 to mark this desc buffers done DMA */
 	vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
+	kref_put(&ubufs->kref, vhost_zerocopy_done_signal); 
 }
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 883688c..b42b126 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -55,6 +55,17 @@ struct vhost_log {
 	u64 len;
 };
 
+struct vhost_virtqueue;
+
+struct vhost_ubuf_ref {
+	struct kref kref;
+	wait_queue_head_t wait;
+	struct vhost_virtqueue *vq;
+};
+
+struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *, void *);
+void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -127,6 +138,9 @@ struct vhost_virtqueue {
 	int done_idx;
 	/* an array of userspace buffers info */
 	struct ubuf_info *ubuf_info;
+	/* Reference counting for outstanding ubufs.
+	 * Protected by vq mutex. Writers must also take device mutex. */
+	struct vhost_ubuf_ref *ubufs;
 };
 
 struct vhost_dev {
@@ -174,7 +188,7 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
 void vhost_zerocopy_callback(void *arg);
-void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
+int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH RFC] vhost: address fixme in vhost TX zero-copy support
  2011-07-11 22:04 ` [PATCH RFC] vhost: address fixme in " Michael S. Tsirkin
@ 2011-07-12  0:37   ` Shirley Ma
  0 siblings, 0 replies; 3+ messages in thread
From: Shirley Ma @ 2011-07-12  0:37 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, kvm, linux-kernel

On Tue, 2011-07-12 at 01:04 +0300, Michael S. Tsirkin wrote:
> So the following should do it, on top of Shirleys's patch, I think.
> I'm
> a bit not sure about using vq->upend_idx - vq->done_idx to check the
> number of outstanding DMA, Shirley, what do you think?

Yes, you can use this to track # outstanding DMAs.

> Untested.
> 
> I'm also thinking about making the use of this conditinal
> on a module parameter, off by default to reduce
> stability risk while still enabling more people to
> test the feature.
> Thoughts?

Agreed.

> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 7de0c6e..cf8deb3 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -156,8 +156,7 @@ static void handle_tx(struct vhost_net *net)
> 
>         for (;;) {
>                 /* Release DMAs done buffers first */
> -               if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND)
> -                       vhost_zerocopy_signal_used(vq);
> +               vhost_zerocopy_signal_used(vq);
> 
>                 head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>                                          ARRAY_SIZE(vq->iov),
> @@ -175,7 +174,7 @@ static void handle_tx(struct vhost_net *net)
>                                 break;
>                         }
>                         /* If more outstanding DMAs, queue the work */
> -                       if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND)
> {
> +                       if (vq->upend_idx - vq->done_idx >
> VHOST_MAX_PEND) {
>                                 tx_poll_start(net, sock);
>                                 set_bit(SOCK_ASYNC_NOSPACE,
> &sock->flags);
>                                 break;
> @@ -214,12 +213,12 @@ static void handle_tx(struct vhost_net *net)
> 
>                                 vq->heads[vq->upend_idx].len = len;
>                                 ubuf->callback =
> vhost_zerocopy_callback;
> -                               ubuf->arg = vq;
> +                               ubuf->arg = vq->ubufs;
>                                 ubuf->desc = vq->upend_idx;
>                                 msg.msg_control = ubuf;
>                                 msg.msg_controllen = sizeof(ubuf);
> +                               kref_get(&vq->ubufs->kref);
>                         }
> -                       atomic_inc(&vq->refcnt);
>                         vq->upend_idx = (vq->upend_idx + 1) %
> UIO_MAXIOV;
>                 }
>                 /* TODO: Check specific error and bomb out unless
> ENOBUFS? */
> @@ -646,6 +645,7 @@ static long vhost_net_set_backend(struct vhost_net
> *n, unsigned index, int fd)
>  {
>         struct socket *sock, *oldsock;
>         struct vhost_virtqueue *vq;
> +       struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
>         int r;
> 
>         mutex_lock(&n->dev.mutex);
> @@ -675,6 +675,13 @@ static long vhost_net_set_backend(struct
> vhost_net *n, unsigned index, int fd)
>         oldsock = rcu_dereference_protected(vq->private_data,
> 
> lockdep_is_held(&vq->mutex));
>         if (sock != oldsock) {
> +               ubufs = vhost_ubuf_alloc(vq, sock);
> +               if (IS_ERR(ubufs)) {
> +                       r = PTR_ERR(ubufs);
> +                       goto err_ubufs;
> +               }
> +               oldubufs = vq->ubufs;
> +               vq->ubufs = ubufs;
>                 vhost_net_disable_vq(n, vq);
>                 rcu_assign_pointer(vq->private_data, sock);
>                 vhost_net_enable_vq(n, vq);
> @@ -682,6 +689,9 @@ static long vhost_net_set_backend(struct vhost_net
> *n, unsigned index, int fd)
> 
>         mutex_unlock(&vq->mutex);
> 
> +       if (oldubufs)
> +               vhost_ubuf_put_and_wait(oldubufs);
> +
>         if (oldsock) {
>                 vhost_net_flush_vq(n, index);
>                 fput(oldsock->file);
> @@ -690,6 +700,8 @@ static long vhost_net_set_backend(struct vhost_net
> *n, unsigned index, int fd)
>         mutex_unlock(&n->dev.mutex);
>         return 0;
> 
> +err_ubufs:
> +       fput(sock);
>  err_vq:
>         mutex_unlock(&vq->mutex);
>  err:
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index db242b1..81b1dd7 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -181,7 +181,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>         vq->log_ctx = NULL;
>         vq->upend_idx = 0;
>         vq->done_idx = 0;
> -       atomic_set(&vq->refcnt, 0);
> +       vq->ubufs = NULL;
>  }
> 
>  static int vhost_worker(void *data)
> @@ -401,7 +401,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
>   * of used idx. Once lower device DMA done contiguously, we will
> signal KVM
>   * guest used idx.
>   */
> -void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
> +int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
>  {
>         int i, j = 0;
> 
> @@ -414,10 +414,9 @@ void vhost_zerocopy_signal_used(struct
> vhost_virtqueue *vq)
>                 } else
>                         break;
>         }
> -       if (j) {
> +       if (j)
>                 vq->done_idx = i;
> -               atomic_sub(j, &vq->refcnt);
> -       }
> +       return j;
>  }
> 
>  /* Caller should have device mutex */
> @@ -430,9 +429,13 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>                         vhost_poll_stop(&dev->vqs[i].poll);
>                         vhost_poll_flush(&dev->vqs[i].poll);
>                 }
> -               /* Wait for all lower device DMAs done (busywait
> FIXME) */
> -               while (atomic_read(&dev->vqs[i].refcnt))
> -                       vhost_zerocopy_signal_used(&dev->vqs[i]);
> +               /* Wait for all lower device DMAs done. */
> +               if (dev->vqs[i].ubufs)
> +                       vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
> +
> +               /* Signal guest as appropriate. */
> +               vhost_zerocopy_signal_used(&dev->vqs[i]);
> +
>                 if (dev->vqs[i].error_ctx)
>                         eventfd_ctx_put(dev->vqs[i].error_ctx);
>                 if (dev->vqs[i].error)
> @@ -645,11 +648,6 @@ static long vhost_set_vring(struct vhost_dev *d,
> int ioctl, void __user *argp)
> 
>         mutex_lock(&vq->mutex);
> 
> -       /* clean up lower device outstanding DMAs, before setting ring
> -          busywait FIXME */
> -       while (atomic_read(&vq->refcnt))
> -               vhost_zerocopy_signal_used(vq);
> -

We need to clear up outstanding DMAs here too when we set vring.
Otherwise, KVM guest remove/reload virtio_net module vring would be out
of sync with vhost.

>         switch (ioctl) {
>         case VHOST_SET_VRING_NUM:
>                 /* Resizing ring with an active backend?
> @@ -1525,12 +1523,46 @@ void vhost_disable_notify(struct vhost_dev
> *dev, struct vhost_virtqueue *vq)
>         }
>  }
> 
> +static void vhost_zerocopy_done_signal(struct kref *kref)
> +{
> +       struct vhost_ubuf_ref *ubufs = container_of(kref, struct
> vhost_ubuf_ref,
> +                                                   kref);
> +       wake_up(&ubufs->wait);
> +}
> +
> +struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
> +                                       void * private_data)
> +{
> +       struct vhost_ubuf_ref *ubufs;
> +       /* No backend? Nothing to count. */
> +       if (!private_data)
> +               return NULL;
> +       ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL);
> +       if (!ubufs)
> +               return ERR_PTR(-ENOMEM);
> +       kref_init(&ubufs->kref);
> +       kref_get(&ubufs->kref);
> +       init_waitqueue_head(&ubufs->wait);
> +       ubufs->vq = vq;
> +       return ubufs;
> +}
> +
> +void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
> +{
> +       kref_put(&ubufs->kref, vhost_zerocopy_done_signal); 
> +       wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
> +       kfree(ubufs);
> +}
> +
>  void vhost_zerocopy_callback(void *arg)
>  {
>         struct ubuf_info *ubuf = (struct ubuf_info *)arg;
> +       struct vhost_ubuf_ref *ubufs;
>         struct vhost_virtqueue *vq;
> 
> -       vq = (struct vhost_virtqueue *)ubuf->arg;
> +       ubufs = ubuf->arg;
> +       vq = ubufs->vq;
>         /* set len = 1 to mark this desc buffers done DMA */
>         vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
> +       kref_put(&ubufs->kref, vhost_zerocopy_done_signal); 
>  }
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 883688c..b42b126 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -55,6 +55,17 @@ struct vhost_log {
>         u64 len;
>  };
> 
> +struct vhost_virtqueue;
> +
> +struct vhost_ubuf_ref {
> +       struct kref kref;
> +       wait_queue_head_t wait;
> +       struct vhost_virtqueue *vq;
> +};
> +
> +struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *,
> void *);
> +void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
> +
>  /* The virtqueue structure describes a queue attached to a device. */
>  struct vhost_virtqueue {
>         struct vhost_dev *dev;
> @@ -127,6 +138,9 @@ struct vhost_virtqueue {
>         int done_idx;
>         /* an array of userspace buffers info */
>         struct ubuf_info *ubuf_info;
> +       /* Reference counting for outstanding ubufs.
> +        * Protected by vq mutex. Writers must also take device mutex.
> */
> +       struct vhost_ubuf_ref *ubufs;
>  };
> 
>  struct vhost_dev {
> @@ -174,7 +188,7 @@ bool vhost_enable_notify(struct vhost_dev *,
> struct vhost_virtqueue *);
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log
> *log,
>                     unsigned int log_num, u64 len);
>  void vhost_zerocopy_callback(void *arg);
> -void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
> +int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
> 
>  #define vq_err(vq, fmt, ...) do {                                  \
>                 pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> -- 


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2011-07-12  0:37 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-06 22:28 [PATCH V8 4/4 net-next] vhost: vhost TX zero-copy support Shirley Ma
2011-07-11 22:04 ` [PATCH RFC] vhost: address fixme in " Michael S. Tsirkin
2011-07-12  0:37   ` Shirley Ma

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).