linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Michael S. Tsirkin" <mst@redhat.com>
To: Shirley Ma <mashirle@us.ibm.com>
Cc: David Miller <davem@davemloft.net>,
	netdev@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH RFC] vhost: address fixme in vhost TX zero-copy support
Date: Tue, 12 Jul 2011 01:04:50 +0300	[thread overview]
Message-ID: <20110711220449.GA8346@redhat.com> (raw)
In-Reply-To: <1309991321.10209.26.camel@localhost.localdomain>

So the following should do it, on top of Shirleys's patch, I think.  I'm
a bit not sure about using vq->upend_idx - vq->done_idx to check the
number of outstanding DMA, Shirley, what do you think?
Untested.

I'm also thinking about making the use of this conditinal
on a module parameter, off by default to reduce
stability risk while still enabling more people to
test the feature.
Thoughts?

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7de0c6e..cf8deb3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -156,8 +156,7 @@ static void handle_tx(struct vhost_net *net)
 
 	for (;;) {
 		/* Release DMAs done buffers first */
-		if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND)
-			vhost_zerocopy_signal_used(vq);
+		vhost_zerocopy_signal_used(vq);
 
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
@@ -175,7 +174,7 @@ static void handle_tx(struct vhost_net *net)
 				break;
 			}
 			/* If more outstanding DMAs, queue the work */
-			if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND) {
+			if (vq->upend_idx - vq->done_idx > VHOST_MAX_PEND) {
 				tx_poll_start(net, sock);
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
@@ -214,12 +213,12 @@ static void handle_tx(struct vhost_net *net)
 
 				vq->heads[vq->upend_idx].len = len;
 				ubuf->callback = vhost_zerocopy_callback;
-				ubuf->arg = vq;
+				ubuf->arg = vq->ubufs;
 				ubuf->desc = vq->upend_idx;
 				msg.msg_control = ubuf;
 				msg.msg_controllen = sizeof(ubuf);
+				kref_get(&vq->ubufs->kref);
 			}
-			atomic_inc(&vq->refcnt);
 			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
 		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
@@ -646,6 +645,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 {
 	struct socket *sock, *oldsock;
 	struct vhost_virtqueue *vq;
+	struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
 	int r;
 
 	mutex_lock(&n->dev.mutex);
@@ -675,6 +675,13 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	oldsock = rcu_dereference_protected(vq->private_data,
 					    lockdep_is_held(&vq->mutex));
 	if (sock != oldsock) {
+		ubufs = vhost_ubuf_alloc(vq, sock);
+		if (IS_ERR(ubufs)) {
+			r = PTR_ERR(ubufs);
+			goto err_ubufs;
+		}
+		oldubufs = vq->ubufs;
+		vq->ubufs = ubufs;
 		vhost_net_disable_vq(n, vq);
 		rcu_assign_pointer(vq->private_data, sock);
 		vhost_net_enable_vq(n, vq);
@@ -682,6 +689,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 
 	mutex_unlock(&vq->mutex);
 
+	if (oldbufs)
+		vhost_ubuf_put_and_wait(oldbufs);
+
 	if (oldsock) {
 		vhost_net_flush_vq(n, index);
 		fput(oldsock->file);
@@ -690,6 +700,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	mutex_unlock(&n->dev.mutex);
 	return 0;
 
+err_ubufs:
+	fput(sock);
 err_vq:
 	mutex_unlock(&vq->mutex);
 err:
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index db242b1..81b1dd7 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -181,7 +181,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->log_ctx = NULL;
 	vq->upend_idx = 0;
 	vq->done_idx = 0;
-	atomic_set(&vq->refcnt, 0);
+	vq->ubufs = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -401,7 +401,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
  * of used idx. Once lower device DMA done contiguously, we will signal KVM
  * guest used idx.
  */
-void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
+int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
 {
 	int i, j = 0;
 
@@ -414,10 +414,9 @@ void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
 		} else
 			break;
 	}
-	if (j) {
+	if (j)
 		vq->done_idx = i;
-		atomic_sub(j, &vq->refcnt);
-	}
+	return j;
 }
 
 /* Caller should have device mutex */
@@ -430,9 +429,13 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 			vhost_poll_stop(&dev->vqs[i].poll);
 			vhost_poll_flush(&dev->vqs[i].poll);
 		}
-		/* Wait for all lower device DMAs done (busywait FIXME) */
-		while (atomic_read(&dev->vqs[i].refcnt))
-			vhost_zerocopy_signal_used(&dev->vqs[i]);
+		/* Wait for all lower device DMAs done. */
+		if (dev->vqs[i].ubufs)
+			vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
+
+		/* Signal guest as appropriate. */
+		vhost_zerocopy_signal_used(&dev->vqs[i]);
+
 		if (dev->vqs[i].error_ctx)
 			eventfd_ctx_put(dev->vqs[i].error_ctx);
 		if (dev->vqs[i].error)
@@ -645,11 +648,6 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 
 	mutex_lock(&vq->mutex);
 
-	/* clean up lower device outstanding DMAs, before setting ring
-	   busywait FIXME */
-	while (atomic_read(&vq->refcnt))
-		vhost_zerocopy_signal_used(vq);
-
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		/* Resizing ring with an active backend?
@@ -1525,12 +1523,46 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	}
 }
 
+static void vhost_zerocopy_done_signal(struct kref *kref)
+{
+	struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
+						    kref);
+	wake_up(&ubufs->wait);
+}
+
+struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
+					void * private_data)
+{
+	struct vhost_ubuf_ref *ubufs;
+	/* No backend? Nothing to count. */
+	if (!private_data)
+		return NULL;
+	ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL);
+	if (!ubufs)
+		return ERR_PTR(-ENOMEM);
+	kref_init(&ubufs->kref);
+	kref_get(&ubufs->kref);
+	init_waitqueue_head(&ubufs->wait);
+	ubufs->vq = vq;
+	return ubufs;
+}
+
+void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
+{
+	kref_put(&ubufs->kref, vhost_zerocopy_done_signal); 
+	wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
+	kfree(ubufs);
+}
+
 void vhost_zerocopy_callback(void *arg)
 {
 	struct ubuf_info *ubuf = (struct ubuf_info *)arg;
+	struct vhost_ubuf_ref *ubufs;
 	struct vhost_virtqueue *vq;
 
-	vq = (struct vhost_virtqueue *)ubuf->arg;
+	ubufs = ubuf->arg;
+	vq = ubufs->vq;
 	/* set len = 1 to mark this desc buffers done DMA */
 	vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
+	kref_put(&ubufs->kref, vhost_zerocopy_done_signal); 
 }
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 883688c..b42b126 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -55,6 +55,17 @@ struct vhost_log {
 	u64 len;
 };
 
+struct vhost_virtqueue;
+
+struct vhost_ubuf_ref {
+	struct kref kref;
+	wait_queue_t wait;
+	struct vhost_virtqueue *vq;
+};
+
+struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *, void *);
+void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -127,6 +138,9 @@ struct vhost_virtqueue {
 	int done_idx;
 	/* an array of userspace buffers info */
 	struct ubuf_info *ubuf_info;
+	/* Reference counting for outstanding ubufs.
+	 * Protected by vq mutex. Writers must also take device mutex. */
+	struct vhost_ubuf_ref *ubufs;
 };
 
 struct vhost_dev {
@@ -174,7 +188,7 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
 void vhost_zerocopy_callback(void *arg);
-void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
+int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \

  reply	other threads:[~2011-07-11 22:04 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-07-06 22:28 [PATCH V8 4/4 net-next] vhost: vhost TX zero-copy support Shirley Ma
2011-07-11 22:04 ` Michael S. Tsirkin [this message]
2011-07-12  0:37   ` [PATCH RFC] vhost: address fixme in " Shirley Ma

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110711220449.GA8346@redhat.com \
    --to=mst@redhat.com \
    --cc=davem@davemloft.net \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mashirle@us.ibm.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).