* [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
@ 2011-05-16 19:34 Shirley Ma
  2011-05-16 20:45 ` Michael S. Tsirkin
  0 siblings, 1 reply; 23+ messages in thread
From: Shirley Ma @ 2011-05-16 19:34 UTC (permalink / raw)
  To: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann
  Cc: netdev, kvm, linux-kernel

This patch maintains the outstanding userspace buffers in the
sequence in which they are delivered to vhost. An outstanding
userspace buffer is marked as done once the lower device has
finished the DMA on it; this is detected through the kfree_skb
callback invoked when the last reference to the skb is dropped.
Two buffer indices are used for this purpose.

vhost passes the userspace buffer info to the lower device skb
through msg_control. There may already be completed DMAs when
entering vhost handle_tx; in the worst case all buffers in the vq
are in pending/done status, so we need to notify the guest to
release DMA-done buffers first before getting any new buffers from
the vq.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 drivers/vhost/net.c   |   39 +++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.c |   58 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |   12 ++++++++++
 3 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2f7c76a..c964000 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,10 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+/* MAX number of TX used buffers for outstanding zerocopy */
+#define VHOST_MAX_PEND 128
+#define VHOST_GOODCOPY_LEN 256
+
 enum {
 	VHOST_NET_VQ_RX = 0,
 	VHOST_NET_VQ_TX = 1,
@@ -129,6 +133,7 @@ static void handle_tx(struct vhost_net *net)
 	int err, wmem;
 	size_t hdr_size;
 	struct socket *sock;
+	struct skb_ubuf_info pend;
 
 	/* TODO: check that we are running from vhost_worker? */
 	sock = rcu_dereference_check(vq->private_data, 1);
@@ -151,6 +156,10 @@ static void handle_tx(struct vhost_net *net)
 	hdr_size = vq->vhost_hlen;
 
 	for (;;) {
+		/* Release DMAs done buffers first */
+		if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND)
+			vhost_zerocopy_signal_used(vq, false);
+
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
 					 &out, &in,
@@ -166,6 +175,12 @@ static void handle_tx(struct vhost_net *net)
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
+			/* If more outstanding DMAs, queue the work */
+			if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND) {
+				tx_poll_start(net, sock);
+				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+				break;
+			}
 			if (unlikely(vhost_enable_notify(vq))) {
 				vhost_disable_notify(vq);
 				continue;
@@ -188,17 +203,37 @@ static void handle_tx(struct vhost_net *net)
 			       iov_length(vq->hdr, s), hdr_size);
 			break;
 		}
+		/* use msg_control to pass vhost zerocopy ubuf info to skb */
+		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
+			vq->heads[vq->upend_idx].id = head;
+			if (len <= VHOST_GOODCOPY_LEN)
+				/* small copy: no need to wait for DMA done */
+				vq->heads[vq->upend_idx].len =
+							VHOST_DMA_DONE_LEN;
+			else {
+				vq->heads[vq->upend_idx].len = len;
+				pend.callback = vhost_zerocopy_callback;
+				pend.arg = vq;
+				pend.desc = vq->upend_idx;
+				msg.msg_control = &pend;
+				msg.msg_controllen = sizeof(pend);
+			}
+			atomic_inc(&vq->refcnt);
+			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq, 1);
+			if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+				vhost_discard_vq_desc(vq, 1);
 			tx_poll_start(net, sock);
 			break;
 		}
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+			vhost_add_used_and_signal(&net->dev, vq, head, 0);
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ab2912..1a40310 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	vq->upend_idx = 0;
+	vq->done_idx = 0;
+	atomic_set(&vq->refcnt, 0);
 }
 
 static int vhost_worker(void *data)
@@ -385,16 +388,62 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
 	return 0;
 }
 
+/* We need to keep the used_idx order the same as the avail_idx order, but the
+ * lower device driver may complete DMAs out of order for some reason. To
+ * prevent used_idx from going out of order, upend_idx tracks the avail_idx
+ * order and done_idx tracks the used_idx order. Once the lower device DMA is
+ * done, done_idx can catch up to upend_idx.
+ */
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown)
+{
+	int i, j = 0;
+
+	i = vq->done_idx;
+	while (i != vq->upend_idx) {
+		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) || shutdown) {
+			/* reset len: report a used len of 0 for TX */
+			vq->heads[i].len = 0;
+			i = (i + 1) % UIO_MAXIOV;
+			++j;
+		} else
+			break;
+	}
+	if (j) {
+		/* heads is an array and i is the new done_idx; if i wrapped
+		 * around below the old done_idx, the done entries are not
+		 * contiguous and add_used_n must be called twice */
+		if (i > vq->done_idx)
+			vhost_add_used_n(vq, &vq->heads[vq->done_idx], j);
+		else {
+			vhost_add_used_n(vq, &vq->heads[vq->done_idx],
+					 UIO_MAXIOV - vq->done_idx);
+			vhost_add_used_n(vq, vq->heads, i);
+		}
+		vq->done_idx = i;
+		vhost_signal(vq->dev, vq);
+		atomic_sub(j, &vq->refcnt);
+	}
+}
+
 /* Caller should have device mutex */
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
+	unsigned long begin = jiffies;
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
 			vhost_poll_stop(&dev->vqs[i].poll);
 			vhost_poll_flush(&dev->vqs[i].poll);
 		}
+		/* Wait for all lower device DMAs to finish, then notify
+		 * virtio_net -- or should we notify without waiting here,
+		 * in case some DMA never completes? */
+		if (atomic_read(&dev->vqs[i].refcnt)) {
+			/* how long should we wait ? */
+			msleep(1000);
+			vhost_zerocopy_signal_used(&dev->vqs[i], true);
+		}
 		if (dev->vqs[i].error_ctx)
 			eventfd_ctx_put(dev->vqs[i].error_ctx);
 		if (dev->vqs[i].error)
@@ -1416,3 +1465,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
 		vq_err(vq, "Failed to enable notification at %p: %d\n",
 		       &vq->used->flags, r);
 }
+
+void vhost_zerocopy_callback(struct sk_buff *skb)
+{
+	int idx = skb_shinfo(skb)->ubuf.desc;
+	struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
+
+	/* mark this descriptor's buffers as DMA done */
+	vq->heads[idx].len = VHOST_DMA_DONE_LEN;
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b3363ae..8e3ecc7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -13,6 +13,10 @@
 #include <linux/virtio_ring.h>
 #include <asm/atomic.h>
 
+/* This is for zerocopy: the used buffer len is set to 1 once the lower device
+ * DMA is done */
+#define VHOST_DMA_DONE_LEN	1
+
 struct vhost_device;
 
 struct vhost_work;
@@ -108,6 +112,12 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	/* vhost zerocopy support */
+	atomic_t refcnt; /* num of outstanding zerocopy DMAs */
+	/* copy of avail idx to monitor outstanding DMA zerocopy buffers */
+	int upend_idx;
+	/* copy of used idx to monitor DMA done zerocopy buffers */
+	int done_idx;
 };
 
 struct vhost_dev {
@@ -154,6 +164,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
+void vhost_zerocopy_callback(struct sk_buff *skb);
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \



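For context: struct skb_ubuf_info above comes from the companion zerocopy
patches in this series, not from the upstream tree of the time. A minimal
sketch, reconstructed only from the fields handle_tx() actually uses
(callback, arg, desc) -- the real definition in the series may differ:

struct skb_ubuf_info {
	/* called from kfree_skb() when the last reference is dropped,
	 * i.e. when the lower device has finished DMA on the buffers */
	void	(*callback)(struct sk_buff *skb);
	void	*arg;	/* here: the vhost_virtqueue */
	int	desc;	/* here: index into vq->heads[] (upend_idx) */
};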

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-16 19:34 [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support Shirley Ma
@ 2011-05-16 20:45 ` Michael S. Tsirkin
  2011-05-16 20:56   ` Shirley Ma
  0 siblings, 1 reply; 23+ messages in thread
From: Michael S. Tsirkin @ 2011-05-16 20:45 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

> +/* We need to keep the used_idx order the same as the avail_idx order, but the
> + * lower device driver may complete DMAs out of order for some reason. To
> + * prevent used_idx from going out of order, upend_idx tracks the avail_idx
> + * order and done_idx tracks the used_idx order. Once the lower device DMA is
> + * done, done_idx can catch up to upend_idx.

Could you clarify this please? virtio explicitly allows out of order
completion of requests. Does it simplify code that we try to keep
used index updates in-order? Because if not, this is not
really a requirement.

-- 
MST


* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-16 20:45 ` Michael S. Tsirkin
@ 2011-05-16 20:56   ` Shirley Ma
  2011-05-16 21:24     ` Michael S. Tsirkin
  2011-05-16 21:27     ` Michael S. Tsirkin
  0 siblings, 2 replies; 23+ messages in thread
From: Shirley Ma @ 2011-05-16 20:56 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Mon, 2011-05-16 at 23:45 +0300, Michael S. Tsirkin wrote:
> > +/* We need to keep the used_idx order the same as the avail_idx order, but
> > + * the lower device driver may complete DMAs out of order for some reason.
> > + * To prevent used_idx from going out of order, upend_idx tracks the
> > + * avail_idx order and done_idx tracks the used_idx order. Once the lower
> > + * device DMA is done, done_idx can catch up to upend_idx.
> 
> Could you clarify this please? virtio explicitly allows out of order
> completion of requests. Does it simplify code that we try to keep
> used index updates in-order? Because if not, this is not
> really a requirement.

Hello Mike,

Based on my testing, vhost_add_used() must be in order from
vhost_get_vq_desc(). Otherwise, the virtio_net ring seems to get double
freed. I didn't spend time on debugging this.

in virtqueue_get_buf

        if (unlikely(!vq->data[i])) {
                BAD_RING(vq, "id %u is not a head!\n", i);
                return NULL;
        }

That's the reason I created the upend_idx and done_idx.

Thanks
Shirley


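For reference, the reason a repeated id trips that check: on the guest side,
detach_buf() clears data[id] the first time the id is completed. A simplified
sketch from that era's drivers/virtio/virtio_ring.c:

static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
{
	unsigned int i;

	/* Clear data ptr -- a second completion of the same id will now
	 * hit the !vq->data[i] check in virtqueue_get_buf(). */
	vq->data[head] = NULL;

	/* Put the descriptor chain back on the free list. */
	i = head;
	while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
		i = vq->vring.desc[i].next;
		vq->num_free++;
	}
	vq->vring.desc[i].next = vq->free_head;
	vq->free_head = head;
	vq->num_free++;	/* plus the final descriptor */
}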

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-16 20:56   ` Shirley Ma
@ 2011-05-16 21:24     ` Michael S. Tsirkin
  2011-05-16 21:30       ` Shirley Ma
  2011-05-17  4:31       ` Shirley Ma
  2011-05-16 21:27     ` Michael S. Tsirkin
  1 sibling, 2 replies; 23+ messages in thread
From: Michael S. Tsirkin @ 2011-05-16 21:24 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Mon, May 16, 2011 at 01:56:54PM -0700, Shirley Ma wrote:
> On Mon, 2011-05-16 at 23:45 +0300, Michael S. Tsirkin wrote:
> > > +/* We need to keep the used_idx order the same as the avail_idx order,
> > > + * but the lower device driver may complete DMAs out of order for some
> > > + * reason. To prevent used_idx from going out of order, upend_idx tracks
> > > + * the avail_idx order and done_idx tracks the used_idx order. Once the
> > > + * lower device DMA is done, done_idx can catch up to upend_idx.
> > 
> > Could you clarify this please? virtio explicitly allows out of order
> > completion of requests. Does it simplify code that we try to keep
> > used index updates in-order? Because if not, this is not
> > really a requirement.
> 
> Hello Mike,
> 
> Based on my testing, vhost_add_used() must be in order from
> vhost_get_vq_desc(). Otherwise, the virtio_net ring seems to get double
> freed.

Double-freed or you get NULL below?

> I didn't spend time on debugging this.
> 
> in virtqueue_get_buf
> 
>         if (unlikely(!vq->data[i])) {
>                 BAD_RING(vq, "id %u is not a head!\n", i);
>                 return NULL;
>         }

Yes, but the i used here is the head that we read from the
ring, not the ring index itself.
	i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id
we must complete any id only once, but in any order.

> That's the reason I created the upend_idx and done_idx.
> 
> Thanks
> Shirley

Very strange, it sounds like a bug, but I can't tell where: in
host or in guest. If it's in the guest, we must fix it.
If in host, we should only fix it if it makes life simpler for us.
Could you try to nail it down pls?  Another question: will code get
simpler or more complex if that restriction's removed?

-- 
MST

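For reference, the surrounding guest code, simplified from that era's
drivers/virtio/virtio_ring.c -- only an out-of-range or duplicate id trips
BAD_RING, not ordering:

void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	unsigned int i;
	void *ret;

	if (!more_used(vq))
		return NULL;

	/* id and len come from the used ring entry written by the host */
	i = vq->vring.used->ring[vq->last_used_idx % vq->vring.num].id;
	*len = vq->vring.used->ring[vq->last_used_idx % vq->vring.num].len;

	if (unlikely(i >= vq->vring.num)) {
		BAD_RING(vq, "id %u out of range!\n", i);
		return NULL;
	}
	if (unlikely(!vq->data[i])) {
		BAD_RING(vq, "id %u is not a head!\n", i);
		return NULL;
	}

	ret = vq->data[i];
	detach_buf(vq, i);	/* clears data[i], frees the chain */
	vq->last_used_idx++;
	return ret;
}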

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-16 20:56   ` Shirley Ma
  2011-05-16 21:24     ` Michael S. Tsirkin
@ 2011-05-16 21:27     ` Michael S. Tsirkin
  1 sibling, 0 replies; 23+ messages in thread
From: Michael S. Tsirkin @ 2011-05-16 21:27 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Mon, May 16, 2011 at 01:56:54PM -0700, Shirley Ma wrote:
> On Mon, 2011-05-16 at 23:45 +0300, Michael S. Tsirkin wrote:
> > > +/* We need to keep the used_idx order the same as the avail_idx order,
> > > + * but the lower device driver may complete DMAs out of order for some
> > > + * reason. To prevent used_idx from going out of order, upend_idx tracks
> > > + * the avail_idx order and done_idx tracks the used_idx order. Once the
> > > + * lower device DMA is done, done_idx can catch up to upend_idx.
> > 
> > Could you clarify this please? virtio explicitly allows out of order
> > completion of requests. Does it simplify code that we try to keep
> > used index updates in-order? Because if not, this is not
> > really a requirement.
> 
> Hello Mike,
> 
> Based on my testing, vhost_add_used() must be in order from
> vhost_get_vq_desc(). Otherwise, the virtio_net ring seems to get double
> freed. I didn't spend time on debugging this.
> 
> in virtqueue_get_buf
> 
>         if (unlikely(!vq->data[i])) {
>                 BAD_RING(vq, "id %u is not a head!\n", i);
>                 return NULL;
>         }
> 
> That's the reason I created the upend_idx and done_idx.
> 
> Thanks
> Shirley

One thing of note: it's possible that this actually works
better than trying to complete out of order, as the
ring just keeps going which should be good for cache
utilization. OTOH, this might explain why
you are over-running the TX ring much more with this patch.

So I don't say this should block merging the patch,
but I very much would like to understand the issue,
and it's interesting to experiment with fixing it
and seeing what it does to performance and to code size.

-- 
MST


* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-16 21:24     ` Michael S. Tsirkin
@ 2011-05-16 21:30       ` Shirley Ma
  2011-05-17  4:31       ` Shirley Ma
  1 sibling, 0 replies; 23+ messages in thread
From: Shirley Ma @ 2011-05-16 21:30 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, 2011-05-17 at 00:24 +0300, Michael S. Tsirkin wrote:
> Very strange, it sounds like a bug, but I can't tell where: in
> host or in guest. If it's in the guest, we must fix it.
> If in host, we should only fix it if it makes life simpler for us.
> Could you try to nail it down pls?  Another question: will code get
> simpler or more complex if that restriction's removed? 

It should be similar. We still need to maintain the pending list and
mark the DMA-done ids.

I will try to narrow this issue down.

Thanks
Shirley



* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-16 21:24     ` Michael S. Tsirkin
  2011-05-16 21:30       ` Shirley Ma
@ 2011-05-17  4:31       ` Shirley Ma
  2011-05-17  5:55         ` Michael S. Tsirkin
  1 sibling, 1 reply; 23+ messages in thread
From: Shirley Ma @ 2011-05-17  4:31 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, 2011-05-17 at 00:24 +0300, Michael S. Tsirkin wrote:
> On Mon, May 16, 2011 at 01:56:54PM -0700, Shirley Ma wrote:
> > On Mon, 2011-05-16 at 23:45 +0300, Michael S. Tsirkin wrote:
> > > > +/* We need to keep the used_idx order the same as the avail_idx
> > > > + * order, but the lower device driver may complete DMAs out of order
> > > > + * for some reason. To prevent used_idx from going out of order,
> > > > + * upend_idx tracks the avail_idx order and done_idx tracks the
> > > > + * used_idx order. Once the lower device DMA is done, done_idx can
> > > > + * catch up to upend_idx.
> > > 
> > > Could you clarify this please? virtio explicitly allows out of order
> > > completion of requests. Does it simplify code that we try to keep
> > > used index updates in-order? Because if not, this is not
> > > really a requirement.
> > 
> > Hello Mike,
> > 
> > Based on my testing, vhost_add_used() must be in order from
> > vhost_get_vq_desc(). Otherwise, the virtio_net ring seems to get double
> > freed.
> 
> Double-freed or you get NULL below?

More likely it is NULL.

> > I didn't spend time on debugging this.
> > 
> > in virtqueue_get_buf
> > 
> >         if (unlikely(!vq->data[i])) {
> >                 BAD_RING(vq, "id %u is not a head!\n", i);
> >                 return NULL;
> >         }
> 
> Yes, but the i used here is the head that we read from the
> ring, not the ring index itself.
>         i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id
> we must complete any id only once, but in any order.

The ids can be in any order in the ring, but they must be in the order
in which "head" is returned from vhost_get_vq_desc(). Any clue?


Thanks
Shirley



* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-17  4:31       ` Shirley Ma
@ 2011-05-17  5:55         ` Michael S. Tsirkin
  2011-05-17 15:22           ` Shirley Ma
  0 siblings, 1 reply; 23+ messages in thread
From: Michael S. Tsirkin @ 2011-05-17  5:55 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Mon, May 16, 2011 at 09:31:23PM -0700, Shirley Ma wrote:
> On Tue, 2011-05-17 at 00:24 +0300, Michael S. Tsirkin wrote:
> > On Mon, May 16, 2011 at 01:56:54PM -0700, Shirley Ma wrote:
> > > On Mon, 2011-05-16 at 23:45 +0300, Michael S. Tsirkin wrote:
> > > > > +/* We need to keep the used_idx order the same as the avail_idx
> > > > > + * order, but the lower device driver may complete DMAs out of
> > > > > + * order for some reason. To prevent used_idx from going out of
> > > > > + * order, upend_idx tracks the avail_idx order and done_idx tracks
> > > > > + * the used_idx order. Once the lower device DMA is done, done_idx
> > > > > + * can catch up to upend_idx.
> > > > 
> > > > Could you clarify this please? virtio explicitly allows out of order
> > > > completion of requests. Does it simplify code that we try to keep
> > > > used index updates in-order? Because if not, this is not
> > > > really a requirement.
> > > 
> > > Hello Mike,
> > > 
> > > Based on my testing, vhost_add_used() must be in order from
> > > vhost_get_vq_desc(). Otherwise, the virtio_net ring seems to get double
> > > freed.
> > 
> > Double-freed or you get NULL below?
> 
> More likely it is NULL.
> 
> > > I didn't spend time on debugging this.
> > > 
> > > in virtqueue_get_buf
> > > 
> > >         if (unlikely(!vq->data[i])) {
> > >                 BAD_RING(vq, "id %u is not a head!\n", i);
> > >                 return NULL;
> > >         }
> > 
> > Yes, but the i used here is the head that we read from the
> > ring, not the ring index itself.
> >         i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id
> > we must complete any id only once, but in any order.
> 
> The ids can be in any order in the ring, but they must be in the order
> in which "head" is returned from vhost_get_vq_desc(). Any clue?
> 
> 
> Thanks
> Shirley


Something in your patch that overwrites the id in vhost
and makes it put the wrong id in the used ring?

By the way, need to keep in mind that a guest can
give us the same head twice, need to make sure this
at least does not corrupt host memory.

-- 
MST

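For reference, the only bound vhost itself enforces on a guest-supplied head
is the range check in vhost_get_vq_desc() (paraphrased from the
drivers/vhost/vhost.c of the time). A duplicate head passes this check,
though in the patch above the head is only stored at a host-controlled index
(upend_idx), not used as an index itself:

	/* If their number is silly, that's an error. */
	if (unlikely(head >= vq->num)) {
		vq_err(vq, "Guest says index %u > %u is available",
		       head, vq->num);
		return -EINVAL;
	}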

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-17  5:55         ` Michael S. Tsirkin
@ 2011-05-17 15:22           ` Shirley Ma
  2011-05-17 15:28             ` Michael S. Tsirkin
  0 siblings, 1 reply; 23+ messages in thread
From: Shirley Ma @ 2011-05-17 15:22 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, 2011-05-17 at 08:55 +0300, Michael S. Tsirkin wrote:
> Something in your patch that overwrites the id in vhost
> and makes it put the wrong id in the used ring?
> 
> By the way, need to keep in mind that a guest can
> give us the same head twice, need to make sure this
> at least does not corrupt host memory.

I think I didn't explain the problem very well here.

This patch doesn't overwrite the id. It just passes the heads to
vhost_add_used in the same sequence in which they were returned
from vhost_get_vq_desc().

The same ids can be used many times once they are passed back to the
guest via vhost_add_used. There is no problem there; the zero-copy
patch doesn't have any issue.

The problem is that the heads returned from vhost_get_vq_desc must be
in the same sequence when they are passed to vhost_add_used.

The original code has no problem, because it gets one head and passes
that head to vhost_add_used, one by one, once the copy is done. So it's
in sequence.

This issue can easily be recreated without the zerocopy patch by simply
changing the order of the heads returned from vhost_get_vq_desc when
passing them to vhost_add_used.

Thanks
Shirley


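For reference, the guest consumes TX completions in exactly the order the
host wrote them into the used ring. Simplified from the
drivers/net/virtio_net.c of roughly that vintage:

static void free_old_xmit_skbs(struct virtnet_info *vi)
{
	struct sk_buff *skb;
	unsigned int len;

	/* each virtqueue_get_buf() consumes the next used ring entry */
	while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);
		vi->dev->stats.tx_bytes += skb->len;
		vi->dev->stats.tx_packets++;
		dev_kfree_skb_any(skb);
	}
}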

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-17 15:22           ` Shirley Ma
@ 2011-05-17 15:28             ` Michael S. Tsirkin
  2011-05-17 15:34               ` Shirley Ma
  0 siblings, 1 reply; 23+ messages in thread
From: Michael S. Tsirkin @ 2011-05-17 15:28 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, May 17, 2011 at 08:22:14AM -0700, Shirley Ma wrote:
> On Tue, 2011-05-17 at 08:55 +0300, Michael S. Tsirkin wrote:
> > Something in your patch that overwrites the id in vhost
> > and makes it put the wrong id in the used ring?
> > 
> > By the way, need to keep in mind that a guest can
> > give us the same head twice, need to make sure this
> > at least does not corrupt host memory.
> 
> I think I didn't explain the problem very well here.
> 
> This patch doesn't overwrite the id. It just passes the heads to
> vhost_add_used in the same sequence in which they were returned
> from vhost_get_vq_desc().
> 
> The same ids can be used many times once they are passed back to the
> guest via vhost_add_used. There is no problem there; the zero-copy
> patch doesn't have any issue.
> 
> The problem is that the heads returned from vhost_get_vq_desc must be
> in the same sequence when they are passed to vhost_add_used.

Which is the order the descriptors are put on avail ring.
By design, guest should not depend on used ring entries
being in order with avail ring (and btw with virtio block,
they aren't). If it does, it's a bug I think.

> The original code has no problem, because it gets one head and passes
> that head to vhost_add_used, one by one, once the copy is done. So it's
> in sequence.
> 
> This issue can easily be recreated without the zerocopy patch by simply
> changing the order of the heads returned from vhost_get_vq_desc when
> passing them to vhost_add_used.
> 
> Thanks
> Shirley

Ah, did you try that? Could you post this patch pls?
This seems to imply a bug in guest virtio.

-- 
MST


* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-17 15:28             ` Michael S. Tsirkin
@ 2011-05-17 15:34               ` Shirley Ma
  2011-05-17 20:46                 ` [TEST PATCH net-next] vhost: accumulate multiple used and signal in vhost TX test Shirley Ma
  2011-05-17 20:50                 ` [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support Shirley Ma
  0 siblings, 2 replies; 23+ messages in thread
From: Shirley Ma @ 2011-05-17 15:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, 2011-05-17 at 18:28 +0300, Michael S. Tsirkin wrote:
> Which is the order the descriptors are put on avail ring.
> By design, guest should not depend on used ring entries
> being in order with avail ring (and btw with virtio block,
> they aren't). If it does, it's a bug I think.

OK, I thought the order should be maintained.

> > The original code has no problem, because it gets one head and passes
> > that head to vhost_add_used, one by one, once the copy is done. So
> > it's in sequence.
> > 
> > This issue can easily be recreated without the zerocopy patch by
> > simply changing the order of the heads returned from
> > vhost_get_vq_desc when passing them to vhost_add_used.
> > 
> > Thanks
> > Shirley
> 
> Ah, did you try that? Could you post this patch pls?
> This seems to imply a bug in guest virtio. 

I am creating the patch against net-next for you to test today.

Thanks
Shirley



* [TEST PATCH net-next] vhost: accumulate multiple used and signal in vhost TX test
  2011-05-17 15:34               ` Shirley Ma
@ 2011-05-17 20:46                 ` Shirley Ma
  2011-05-17 20:52                   ` Michael S. Tsirkin
  2011-05-17 20:50                 ` [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support Shirley Ma
  1 sibling, 1 reply; 23+ messages in thread
From: Shirley Ma @ 2011-05-17 20:46 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

Hello Michael,

Here is the patch I used to test out of order before: it adds used
entries to a pend array and swaps the last two ids.

I used to hit an issue, but now it seems to work well.

This won't impact the zero-copy patch, since we need to maintain the
pending used ids anyway.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
 drivers/vhost/net.c   |   24 +++++++++++++++++++++++-
 drivers/vhost/vhost.c |   11 +++++++++++
 drivers/vhost/vhost.h |    1 +
 3 files changed, 35 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2f7c76a..19e1baa 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,8 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+#define VHOST_MAX_PEND	128
+
 enum {
 	VHOST_NET_VQ_RX = 0,
 	VHOST_NET_VQ_TX = 1,
@@ -198,13 +200,33 @@ static void handle_tx(struct vhost_net *net)
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		vq->heads[vq->pend_idx].id = head;
+		vq->heads[vq->pend_idx].len = 0;
+		++vq->pend_idx;
+		if (vq->pend_idx >= VHOST_MAX_PEND) {
+			int id;
+			id = vq->heads[vq->pend_idx-1].id;
+			vq->heads[vq->pend_idx-1].id = vq->heads[vq->pend_idx-2].id; 
+			vq->heads[vq->pend_idx-2].id = id;
+			vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+						    vq->pend_idx);
+			vq->pend_idx = 0;
+		}
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
 			break;
 		}
 	}
+	if (vq->pend_idx >= VHOST_MAX_PEND) {
+		int id;
+		id = vq->heads[vq->pend_idx-1].id;
+		vq->heads[vq->pend_idx-1].id = vq->heads[vq->pend_idx-2].id;
+		vq->heads[vq->pend_idx-2].id = id;
+		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+					    vq->pend_idx);
+		vq->pend_idx = 0;
+	}
 
 	mutex_unlock(&vq->mutex);
 }
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ab2912..7eea6b3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -174,6 +174,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	vq->pend_idx = 0;
 }
 
 static int vhost_worker(void *data)
@@ -395,6 +396,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 			vhost_poll_stop(&dev->vqs[i].poll);
 			vhost_poll_flush(&dev->vqs[i].poll);
 		}
+		if (dev->vqs[i].pend_idx != 0) {
+			vhost_add_used_and_signal_n(dev, &dev->vqs[i],
+				dev->vqs[i].heads, dev->vqs[i].pend_idx);
+			dev->vqs[i].pend_idx = 0;
+		}
 		if (dev->vqs[i].error_ctx)
 			eventfd_ctx_put(dev->vqs[i].error_ctx);
 		if (dev->vqs[i].error)
@@ -603,6 +609,11 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 
 	mutex_lock(&vq->mutex);
 
+	if (vq->pend_idx != 0) {
+		vhost_add_used_and_signal_n(d, vq, vq->heads, vq->pend_idx);
+		vq->pend_idx = 0;
+	}
+
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		/* Resizing ring with an active backend?
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b3363ae..44a412d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -108,6 +108,7 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	int pend_idx;
 };
 
 struct vhost_dev {




* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-17 15:34               ` Shirley Ma
  2011-05-17 20:46                 ` [TEST PATCH net-next] vhost: accumulate multiple used and signal in vhost TX test Shirley Ma
@ 2011-05-17 20:50                 ` Shirley Ma
  2011-05-17 20:58                   ` Michael S. Tsirkin
  2011-05-17 21:28                   ` Michael S. Tsirkin
  1 sibling, 2 replies; 23+ messages in thread
From: Shirley Ma @ 2011-05-17 20:50 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

Resubmitting the patch with the latest updates. This patch passed some
live-migration tests against RHEL6.2. I will run more stress tests with
live migration.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 drivers/vhost/net.c   |   37 +++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.c |   55 ++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.h |   12 ++++++++++
 3 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2f7c76a..6bd6e28 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,9 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+/* MAX number of TX used buffers for outstanding zerocopy */
+#define VHOST_MAX_ZEROCOPY_PEND 128 
+
 enum {
 	VHOST_NET_VQ_RX = 0,
 	VHOST_NET_VQ_TX = 1,
@@ -129,6 +132,7 @@ static void handle_tx(struct vhost_net *net)
 	int err, wmem;
 	size_t hdr_size;
 	struct socket *sock;
+	struct skb_ubuf_info pend;
 
 	/* TODO: check that we are running from vhost_worker? */
 	sock = rcu_dereference_check(vq->private_data, 1);
@@ -151,6 +155,10 @@ static void handle_tx(struct vhost_net *net)
 	hdr_size = vq->vhost_hlen;
 
 	for (;;) {
+		/* Release DMAs done buffers first */
+		if (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)
+			vhost_zerocopy_signal_used(vq, false);
+
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
 					 &out, &in,
@@ -166,6 +174,13 @@ static void handle_tx(struct vhost_net *net)
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
+			/* If more outstanding DMAs, queue the work */
+			if (sock_flag(sock->sk, SOCK_ZEROCOPY) &&
+			    (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)) {
+				tx_poll_start(net, sock);
+				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+				break;
+			}
 			if (unlikely(vhost_enable_notify(vq))) {
 				vhost_disable_notify(vq);
 				continue;
@@ -188,17 +203,35 @@ static void handle_tx(struct vhost_net *net)
 			       iov_length(vq->hdr, s), hdr_size);
 			break;
 		}
+		/* use msg_control to pass vhost zerocopy ubuf info to skb */
+		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
+			vq->heads[vq->upend_idx].id = head;
+			if (len <= 128)
+				vq->heads[vq->upend_idx].len = VHOST_DMA_DONE_LEN;
+			else {
+				vq->heads[vq->upend_idx].len = len;
+				pend.callback = vhost_zerocopy_callback;
+				pend.arg = vq;
+				pend.desc = vq->upend_idx;
+				msg.msg_control = &pend;
+				msg.msg_controllen = sizeof(pend);
+			}
+			atomic_inc(&vq->refcnt);
+			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq, 1);
+			if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+				vhost_discard_vq_desc(vq, 1);
 			tx_poll_start(net, sock);
 			break;
 		}
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+			vhost_add_used_and_signal(&net->dev, vq, head, 0);
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ab2912..ce799d6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	vq->upend_idx = 0;
+	vq->done_idx = 0;
+	atomic_set(&vq->refcnt, 0);
 }
 
 static int vhost_worker(void *data)
@@ -230,7 +233,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 					       UIO_MAXIOV, GFP_KERNEL);
 		dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
 					  GFP_KERNEL);
-		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
+		dev->vqs[i].heads = kzalloc(sizeof *dev->vqs[i].heads *
 					    UIO_MAXIOV, GFP_KERNEL);
 
 		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
@@ -385,6 +388,38 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
 	return 0;
 }
 
+/* 
+	comments
+*/
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown)
+{
+	int i, j = 0;
+
+	i = vq->done_idx;
+	while (i != vq->upend_idx) {
+		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) || shutdown) {
+			/* reset len = 0 */
+			vq->heads[i].len = 0;
+			i = (i + 1) % UIO_MAXIOV;
+			++j;
+		} else
+			break;
+	}
+	if (j) {
+		/* comments */
+		if (i > vq->done_idx)
+			vhost_add_used_n(vq, &vq->heads[vq->done_idx], j);
+		else {
+			vhost_add_used_n(vq, &vq->heads[vq->done_idx],
+					 UIO_MAXIOV - vq->done_idx);
+			vhost_add_used_n(vq, vq->heads, i);
+		}
+		vq->done_idx = i;
+		vhost_signal(vq->dev, vq);
+		atomic_sub(j, &vq->refcnt);
+	}
+}
+
 /* Caller should have device mutex */
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
@@ -395,6 +430,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 			vhost_poll_stop(&dev->vqs[i].poll);
 			vhost_poll_flush(&dev->vqs[i].poll);
 		}
+		/* wait for all lower device DMAs done, then notify guest */
+		if (atomic_read(&dev->vqs[i].refcnt)) {
+			msleep(1000);
+			vhost_zerocopy_signal_used(&dev->vqs[i], true);
+		}
 		if (dev->vqs[i].error_ctx)
 			eventfd_ctx_put(dev->vqs[i].error_ctx);
 		if (dev->vqs[i].error)
@@ -603,6 +643,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 
 	mutex_lock(&vq->mutex);
 
+	/* force all lower device DMAs done */
+	if (atomic_read(&vq->refcnt)) 
+		vhost_zerocopy_signal_used(vq, true);
+
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		/* Resizing ring with an active backend?
@@ -1416,3 +1460,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
 		vq_err(vq, "Failed to enable notification at %p: %d\n",
 		       &vq->used->flags, r);
 }
+
+void vhost_zerocopy_callback(struct sk_buff *skb)
+{
+	int idx = skb_shinfo(skb)->ubuf.desc;
+	struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
+
+	/* set len = 1 to mark this desc buffers done DMA */
+	vq->heads[idx].len = VHOST_DMA_DONE_LEN;
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b3363ae..8e3ecc7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -13,6 +13,10 @@
 #include <linux/virtio_ring.h>
 #include <asm/atomic.h>
 
+/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
+ * done */
+#define VHOST_DMA_DONE_LEN	1
+
 struct vhost_device;
 
 struct vhost_work;
@@ -108,6 +112,12 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	/* vhost zerocopy support */
+	atomic_t refcnt; /* num of outstanding zerocopy DMAs */
+	/* copy of avail idx to monitor outstanding DMA zerocopy buffers */
+	int upend_idx;
+	/* copy of used idx to monitor DMA done zerocopy buffers */
+	int done_idx;
 };
 
 struct vhost_dev {
@@ -154,6 +164,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
+void vhost_zerocopy_callback(struct sk_buff *skb);
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \




* Re: [TEST PATCH net-next] vhost: accumulate multiple used and signal in vhost TX test
  2011-05-17 20:46                 ` [TEST PATCH net-next] vhost: accumulate multiple used and signal in vhost TX test Shirley Ma
@ 2011-05-17 20:52                   ` Michael S. Tsirkin
  0 siblings, 0 replies; 23+ messages in thread
From: Michael S. Tsirkin @ 2011-05-17 20:52 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, May 17, 2011 at 01:46:21PM -0700, Shirley Ma wrote:
> Hello Michael,
> 
> Here is the patch I used to test out of order before: it adds used
> entries to a pend array and swaps the last two ids.
> 
> I used to hit an issue, but now it seems to work well.

Aha, so if I apply this, the guest will *not* crash? :)

> This won't impact the zero-copy patch, since we need to maintain the
> pending used ids anyway.

Yes, but now we can mark them used immediately as they are
completed - and I am guessing this will relieve the
pressure on tx ring that you see?

> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
>  drivers/vhost/net.c   |   24 +++++++++++++++++++++++-
>  drivers/vhost/vhost.c |   11 +++++++++++
>  drivers/vhost/vhost.h |    1 +
>  3 files changed, 35 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 2f7c76a..19e1baa 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -32,6 +32,8 @@
>   * Using this limit prevents one virtqueue from starving others. */
>  #define VHOST_NET_WEIGHT 0x80000
>  
> +#define VHOST_MAX_PEND	128
> +
>  enum {
>  	VHOST_NET_VQ_RX = 0,
>  	VHOST_NET_VQ_TX = 1,
> @@ -198,13 +200,33 @@ static void handle_tx(struct vhost_net *net)
>  		if (err != len)
>  			pr_debug("Truncated TX packet: "
>  				 " len %d != %zd\n", err, len);
> -		vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +		vq->heads[vq->pend_idx].id = head;
> +		vq->heads[vq->pend_idx].len = 0;
> +		++vq->pend_idx;
> +		if (vq->pend_idx >= VHOST_MAX_PEND) {
> +			int id;
> +			id = vq->heads[vq->pend_idx-1].id;
> +			vq->heads[vq->pend_idx-1].id = vq->heads[vq->pend_idx-2].id; 
> +			vq->heads[vq->pend_idx-2].id = id;
> +			vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> +						    vq->pend_idx);
> +			vq->pend_idx = 0;
> +		}
>  		total_len += len;
>  		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  			vhost_poll_queue(&vq->poll);
>  			break;
>  		}
>  	}
> +	if (vq->pend_idx >= VHOST_MAX_PEND) {
> +		int id;
> +		id = vq->heads[vq->pend_idx-1].id;
> +		vq->heads[vq->pend_idx-1].id = vq->heads[vq->pend_idx-2].id;
> +		vq->heads[vq->pend_idx-2].id = id;
> +		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> +					    vq->pend_idx);
> +		vq->pend_idx = 0;
> +	}
>  
>  	mutex_unlock(&vq->mutex);
>  }
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 2ab2912..7eea6b3 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -174,6 +174,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call_ctx = NULL;
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
> +	vq->pend_idx = 0;
>  }
>  
>  static int vhost_worker(void *data)
> @@ -395,6 +396,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  			vhost_poll_stop(&dev->vqs[i].poll);
>  			vhost_poll_flush(&dev->vqs[i].poll);
>  		}
> +		if (dev->vqs[i].pend_idx != 0) {
> +			vhost_add_used_and_signal_n(dev, &dev->vqs[i],
> +				dev->vqs[i].heads, dev->vqs[i].pend_idx);
> +			dev->vqs[i].pend_idx = 0;
> +		}
>  		if (dev->vqs[i].error_ctx)
>  			eventfd_ctx_put(dev->vqs[i].error_ctx);
>  		if (dev->vqs[i].error)
> @@ -603,6 +609,11 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
>  
>  	mutex_lock(&vq->mutex);
>  
> +	if (vq->pend_idx != 0) {
> +		vhost_add_used_and_signal_n(d, vq, vq->heads, vq->pend_idx);
> +		vq->pend_idx = 0;
> +	}
> +
>  	switch (ioctl) {
>  	case VHOST_SET_VRING_NUM:
>  		/* Resizing ring with an active backend?
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index b3363ae..44a412d 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -108,6 +108,7 @@ struct vhost_virtqueue {
>  	/* Log write descriptors */
>  	void __user *log_base;
>  	struct vhost_log *log;
> +	int pend_idx;
>  };
>  
>  struct vhost_dev {
> 


* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-17 20:50                 ` [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support Shirley Ma
@ 2011-05-17 20:58                   ` Michael S. Tsirkin
  2011-05-17 21:01                     ` Shirley Ma
  2011-05-17 21:28                   ` Michael S. Tsirkin
  1 sibling, 1 reply; 23+ messages in thread
From: Michael S. Tsirkin @ 2011-05-17 20:58 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, May 17, 2011 at 01:50:19PM -0700, Shirley Ma wrote:
> Resubmitting the patch with the latest updates. This patch passed some
> live-migration tests against RHEL6.2. I will run more stress tests with
> live migration.

What changed from the last version?

> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
> 
>  drivers/vhost/net.c   |   37 +++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.c |   55 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.h |   12 ++++++++++
>  3 files changed, 101 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 2f7c76a..6bd6e28 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -32,6 +32,9 @@
>   * Using this limit prevents one virtqueue from starving others. */
>  #define VHOST_NET_WEIGHT 0x80000
>  
> +/* MAX number of TX used buffers for outstanding zerocopy */
> +#define VHOST_MAX_ZEROCOPY_PEND 128 
> +
>  enum {
>  	VHOST_NET_VQ_RX = 0,
>  	VHOST_NET_VQ_TX = 1,
> @@ -129,6 +132,7 @@ static void handle_tx(struct vhost_net *net)
>  	int err, wmem;
>  	size_t hdr_size;
>  	struct socket *sock;
> +	struct skb_ubuf_info pend;
>  
>  	/* TODO: check that we are running from vhost_worker? */
>  	sock = rcu_dereference_check(vq->private_data, 1);
> @@ -151,6 +155,10 @@ static void handle_tx(struct vhost_net *net)
>  	hdr_size = vq->vhost_hlen;
>  
>  	for (;;) {
> +		/* Release DMAs done buffers first */
> +		if (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)
> +			vhost_zerocopy_signal_used(vq, false);
> +
>  		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>  					 ARRAY_SIZE(vq->iov),
>  					 &out, &in,
> @@ -166,6 +174,13 @@ static void handle_tx(struct vhost_net *net)
>  				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
>  				break;
>  			}
> +			/* If more outstanding DMAs, queue the work */
> +			if (sock_flag(sock->sk, SOCK_ZEROCOPY) &&
> +			    (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)) {
> +				tx_poll_start(net, sock);
> +				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
> +				break;
> +			}
>  			if (unlikely(vhost_enable_notify(vq))) {
>  				vhost_disable_notify(vq);
>  				continue;
> @@ -188,17 +203,35 @@ static void handle_tx(struct vhost_net *net)
>  			       iov_length(vq->hdr, s), hdr_size);
>  			break;
>  		}
> +		/* use msg_control to pass vhost zerocopy ubuf info to skb */
> +		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
> +			vq->heads[vq->upend_idx].id = head;
> +			if (len <= 128)
> +				vq->heads[vq->upend_idx].len = VHOST_DMA_DONE_LEN;
> +			else {
> +				vq->heads[vq->upend_idx].len = len;
> +				pend.callback = vhost_zerocopy_callback;
> +				pend.arg = vq;
> +				pend.desc = vq->upend_idx;
> +				msg.msg_control = &pend;
> +				msg.msg_controllen = sizeof(pend);
> +			}
> +			atomic_inc(&vq->refcnt);
> +			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
> +		}
>  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
>  		err = sock->ops->sendmsg(NULL, sock, &msg, len);
>  		if (unlikely(err < 0)) {
> -			vhost_discard_vq_desc(vq, 1);
> +			if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
> +				vhost_discard_vq_desc(vq, 1);
>  			tx_poll_start(net, sock);
>  			break;
>  		}
>  		if (err != len)
>  			pr_debug("Truncated TX packet: "
>  				 " len %d != %zd\n", err, len);
> -		vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +		if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
> +			vhost_add_used_and_signal(&net->dev, vq, head, 0);
>  		total_len += len;
>  		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  			vhost_poll_queue(&vq->poll);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 2ab2912..ce799d6 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call_ctx = NULL;
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
> +	vq->upend_idx = 0;
> +	vq->done_idx = 0;
> +	atomic_set(&vq->refcnt, 0);
>  }
>  
>  static int vhost_worker(void *data)
> @@ -230,7 +233,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
>  					       UIO_MAXIOV, GFP_KERNEL);
>  		dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
>  					  GFP_KERNEL);
> -		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
> +		dev->vqs[i].heads = kzalloc(sizeof *dev->vqs[i].heads *
>  					    UIO_MAXIOV, GFP_KERNEL);
>  
>  		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
> @@ -385,6 +388,38 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
>  	return 0;
>  }
>  
> +/* 
> +	comments
> +*/
> +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown)
> +{
> +	int i, j = 0;
> +
> +	i = vq->done_idx;
> +	while (i != vq->upend_idx) {
> +		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) || shutdown) {
> +			/* reset len = 0 */
> +			vq->heads[i].len = 0;
> +			i = (i + 1) % UIO_MAXIOV;
> +			++j;
> +		} else
> +			break;
> +	}
> +	if (j) {
> +		/* comments */
> +		if (i > vq->done_idx)
> +			vhost_add_used_n(vq, &vq->heads[vq->done_idx], j);
> +		else {
> +			vhost_add_used_n(vq, &vq->heads[vq->done_idx],
> +					 UIO_MAXIOV - vq->done_idx);
> +			vhost_add_used_n(vq, vq->heads, i);
> +		}
> +		vq->done_idx = i;
> +		vhost_signal(vq->dev, vq);
> +		atomic_sub(j, &vq->refcnt);
> +	}
> +}
> +
>  /* Caller should have device mutex */
>  void vhost_dev_cleanup(struct vhost_dev *dev)
>  {
> @@ -395,6 +430,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  			vhost_poll_stop(&dev->vqs[i].poll);
>  			vhost_poll_flush(&dev->vqs[i].poll);
>  		}
> +		/* wait for all lower device DMAs done, then notify guest */
> +		if (atomic_read(&dev->vqs[i].refcnt)) {
> +			msleep(1000);
> +			vhost_zerocopy_signal_used(&dev->vqs[i], true);
> +		}
>  		if (dev->vqs[i].error_ctx)
>  			eventfd_ctx_put(dev->vqs[i].error_ctx);
>  		if (dev->vqs[i].error)
> @@ -603,6 +643,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
>  
>  	mutex_lock(&vq->mutex);
>  
> +	/* force all lower device DMAs done */
> +	if (atomic_read(&vq->refcnt)) 
> +		vhost_zerocopy_signal_used(vq, true);
> +
>  	switch (ioctl) {
>  	case VHOST_SET_VRING_NUM:
>  		/* Resizing ring with an active backend?
> @@ -1416,3 +1460,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
>  		vq_err(vq, "Failed to enable notification at %p: %d\n",
>  		       &vq->used->flags, r);
>  }
> +
> +void vhost_zerocopy_callback(struct sk_buff *skb)
> +{
> +	int idx = skb_shinfo(skb)->ubuf.desc;
> +	struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
> +
> +	/* set len = 1 to mark this desc buffers done DMA */
> +	vq->heads[idx].len = VHOST_DMA_DONE_LEN;
> +}
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index b3363ae..8e3ecc7 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -13,6 +13,10 @@
>  #include <linux/virtio_ring.h>
>  #include <asm/atomic.h>
>  
> +/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
> + * done */
> +#define VHOST_DMA_DONE_LEN	1
> +
>  struct vhost_device;
>  
>  struct vhost_work;
> @@ -108,6 +112,12 @@ struct vhost_virtqueue {
>  	/* Log write descriptors */
>  	void __user *log_base;
>  	struct vhost_log *log;
> +	/* vhost zerocopy support */
> +	atomic_t refcnt; /* num of outstanding zerocopy DMAs */
> +	/* copy of avail idx to monitor outstanding DMA zerocopy buffers */
> +	int upend_idx;
> +	/* copy of used idx to monitor DMA done zerocopy buffers */
> +	int done_idx;
>  };
>  
>  struct vhost_dev {
> @@ -154,6 +164,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
>  
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>  		    unsigned int log_num, u64 len);
> +void vhost_zerocopy_callback(struct sk_buff *skb);
> +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown);
>  
>  #define vq_err(vq, fmt, ...) do {                                  \
>  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> 


* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-17 20:58                   ` Michael S. Tsirkin
@ 2011-05-17 21:01                     ` Shirley Ma
  0 siblings, 0 replies; 23+ messages in thread
From: Shirley Ma @ 2011-05-17 21:01 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, 2011-05-17 at 23:58 +0300, Michael S. Tsirkin wrote:
> On Tue, May 17, 2011 at 01:50:19PM -0700, Shirley Ma wrote:
> > Resubmitting the patch with the latest updates. This patch passed some
> > live-migration tests against RHEL6.2. I will run more stress tests with
> > live migration.
> 
> What changed from the last version?

Sorry, I forgot to mention: it adds cleaning up of pending DMAs before set_vring.

Thanks
Shirley



* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-17 20:50                 ` [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support Shirley Ma
  2011-05-17 20:58                   ` Michael S. Tsirkin
@ 2011-05-17 21:28                   ` Michael S. Tsirkin
  2011-05-17 22:21                     ` Shirley Ma
  2011-05-18  5:14                     ` Shirley Ma
  1 sibling, 2 replies; 23+ messages in thread
From: Michael S. Tsirkin @ 2011-05-17 21:28 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, May 17, 2011 at 01:50:19PM -0700, Shirley Ma wrote:
> Resubmitting the patch with the latest updates. This patch passed some
> live-migration tests against RHEL6.2. I will run more stress tests with
> live migration.
> 
> Signed-off-by: Shirley Ma <xma@us.ibm.com>

Cool. The cleanup path needs a fix - are you sure you can
not use kobj or some other existing refcounting?

Is the perf regression caused by tx ring overrun gone now?

I added some comments about how we might be able
to complete requests out of order but it's not a must.

> ---
> 
>  drivers/vhost/net.c   |   37 +++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.c |   55 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.h |   12 ++++++++++
>  3 files changed, 101 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 2f7c76a..6bd6e28 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -32,6 +32,9 @@
>   * Using this limit prevents one virtqueue from starving others. */
>  #define VHOST_NET_WEIGHT 0x80000
>  
> +/* MAX number of TX used buffers for outstanding zerocopy */
> +#define VHOST_MAX_ZEROCOPY_PEND 128 
> +
>  enum {
>  	VHOST_NET_VQ_RX = 0,
>  	VHOST_NET_VQ_TX = 1,
> @@ -129,6 +132,7 @@ static void handle_tx(struct vhost_net *net)
>  	int err, wmem;
>  	size_t hdr_size;
>  	struct socket *sock;
> +	struct skb_ubuf_info pend;
>  
>  	/* TODO: check that we are running from vhost_worker? */
>  	sock = rcu_dereference_check(vq->private_data, 1);
> @@ -151,6 +155,10 @@ static void handle_tx(struct vhost_net *net)
>  	hdr_size = vq->vhost_hlen;
>  
>  	for (;;) {
> +		/* Release DMAs done buffers first */
> +		if (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)
> +			vhost_zerocopy_signal_used(vq, false);
> +
>  		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>  					 ARRAY_SIZE(vq->iov),
>  					 &out, &in,
> @@ -166,6 +174,13 @@ static void handle_tx(struct vhost_net *net)
>  				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
>  				break;
>  			}
> +			/* If more outstanding DMAs, queue the work */
> +			if (sock_flag(sock->sk, SOCK_ZEROCOPY) &&
> +			    (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)) {
> +				tx_poll_start(net, sock);
> +				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
> +				break;
> +			}
>  			if (unlikely(vhost_enable_notify(vq))) {
>  				vhost_disable_notify(vq);
>  				continue;
> @@ -188,17 +203,35 @@ static void handle_tx(struct vhost_net *net)
>  			       iov_length(vq->hdr, s), hdr_size);
>  			break;
>  		}
> +		/* use msg_control to pass vhost zerocopy ubuf info to skb */
> +		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
> +			vq->heads[vq->upend_idx].id = head;
> +			if (len <= 128)

I thought we have a constant for that?

> +				vq->heads[vq->upend_idx].len = VHOST_DMA_DONE_LEN;
> +			else {
> +				vq->heads[vq->upend_idx].len = len;
> +				pend.callback = vhost_zerocopy_callback;
> +				pend.arg = vq;
> +				pend.desc = vq->upend_idx;
> +				msg.msg_control = &pend;
> +				msg.msg_controllen = sizeof(pend);
> +			}
> +			atomic_inc(&vq->refcnt);
> +			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;

Ok, so we deal with a cyclic ring apparently? What guarantees we don't
overrun it?


> +		}
>  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
>  		err = sock->ops->sendmsg(NULL, sock, &msg, len);
>  		if (unlikely(err < 0)) {
> -			vhost_discard_vq_desc(vq, 1);
> +			if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
> +				vhost_discard_vq_desc(vq, 1);

How are errors handled with zerocopy?


>  			tx_poll_start(net, sock);
>  			break;
>  		}
>  		if (err != len)
>  			pr_debug("Truncated TX packet: "
>  				 " len %d != %zd\n", err, len);
> -		vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +		if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
> +			vhost_add_used_and_signal(&net->dev, vq, head, 0);
>  		total_len += len;
>  		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  			vhost_poll_queue(&vq->poll);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 2ab2912..ce799d6 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call_ctx = NULL;
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
> +	vq->upend_idx = 0;
> +	vq->done_idx = 0;
> +	atomic_set(&vq->refcnt, 0);
>  }
>  
>  static int vhost_worker(void *data)
> @@ -230,7 +233,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
>  					       UIO_MAXIOV, GFP_KERNEL);
>  		dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
>  					  GFP_KERNEL);
> -		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
> +		dev->vqs[i].heads = kzalloc(sizeof *dev->vqs[i].heads *
>  					    UIO_MAXIOV, GFP_KERNEL);

Which fields need to be initialized actually?

>  
>  		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
> @@ -385,6 +388,38 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
>  	return 0;
>  }
>  
> +/* 
> +	comments
> +*/

Hmm.

> +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown)
> +{
> +	int i, j = 0;
> +
> +	i = vq->done_idx;
> +	while (i != vq->upend_idx) {

A for loop might be clearer.

> +		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) || shutdown) {

On shutdown, we signal all buffers used to the guest?
Why?


> +			/* reset len = 0 */

comment not very helpful.
Could you explain what this does instead?
Or better use some constant instead of 0 ...

> +			vq->heads[i].len = 0;
> +			i = (i + 1) % UIO_MAXIOV;
> +			++j;
> +		} else
> +			break;

Hmm so if the 1st entry does not complete, you do not signal anything?

> +	}

Looking at this loop, done idx is the consumer and used idx
is the producer, right?

> +	if (j) {
> +		/* comments */

Yes?

> +		if (i > vq->done_idx)
> +			vhost_add_used_n(vq, &vq->heads[vq->done_idx], j);
> +		else {
> +			vhost_add_used_n(vq, &vq->heads[vq->done_idx],
> +					 UIO_MAXIOV - vq->done_idx);
> +			vhost_add_used_n(vq, vq->heads, i);
> +		}
> +		vq->done_idx = i;
> +		vhost_signal(vq->dev, vq);
> +		atomic_sub(j, &vq->refcnt);

Code will likely be simpler if you call vhost_add_used once for
each head in the first loop. Possibly add_used_signal might be
a good idea too.

> +	}
> +}
> +
>  /* Caller should have device mutex */
>  void vhost_dev_cleanup(struct vhost_dev *dev)
>  {
> @@ -395,6 +430,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  			vhost_poll_stop(&dev->vqs[i].poll);
>  			vhost_poll_flush(&dev->vqs[i].poll);
>  		}
> +		/* wait for all lower device DMAs done, then notify guest */
> +		if (atomic_read(&dev->vqs[i].refcnt)) {
> +			msleep(1000);
> +			vhost_zerocopy_signal_used(&dev->vqs[i], true);
> +		}

This needs to be fixed somehow. Use a completion object and wait
on it?

>  		if (dev->vqs[i].error_ctx)
>  			eventfd_ctx_put(dev->vqs[i].error_ctx);
>  		if (dev->vqs[i].error)
> @@ -603,6 +643,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
>  
>  	mutex_lock(&vq->mutex);
>  
> +	/* force all lower device DMAs done */
> +	if (atomic_read(&vq->refcnt)) 
> +		vhost_zerocopy_signal_used(vq, true);
> +
>  	switch (ioctl) {
>  	case VHOST_SET_VRING_NUM:
>  		/* Resizing ring with an active backend?
> @@ -1416,3 +1460,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
>  		vq_err(vq, "Failed to enable notification at %p: %d\n",
>  		       &vq->used->flags, r);
>  }
> +
> +void vhost_zerocopy_callback(struct sk_buff *skb)
> +{
> +	int idx = skb_shinfo(skb)->ubuf.desc;
> +	struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
> +
> +	/* set len = 1 to mark this desc buffers done DMA */

this comment can now go.

> +	vq->heads[idx].len = VHOST_DMA_DONE_LEN;
> +}
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index b3363ae..8e3ecc7 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -13,6 +13,10 @@
>  #include <linux/virtio_ring.h>
>  #include <asm/atomic.h>
>  
> +/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
> + * done */
> +#define VHOST_DMA_DONE_LEN	1
> +
>  struct vhost_device;
>  
>  struct vhost_work;
> @@ -108,6 +112,12 @@ struct vhost_virtqueue {
>  	/* Log write descriptors */
>  	void __user *log_base;
>  	struct vhost_log *log;
> +	/* vhost zerocopy support */
> +	atomic_t refcnt; /* num of outstanding zerocopy DMAs */

future enhancement idea: this is used apparently under vq lock
so no need for an atomic?

> +	/* copy of avail idx to monitor outstanding DMA zerocopy buffers */

looking at code upend_idx seems to be calculated independently
of guest avail idx - could you clarify pls?

> +	int upend_idx;
> +	/* copy of used idx to monintor DMA done zerocopy buffers */

monitor

> +	int done_idx;


I think in reality these are just the producer and consumer indexes
into the heads structure, which for zero copy is used as a ring.



>  };
>  
>  struct vhost_dev {
> @@ -154,6 +164,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
>  
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>  		    unsigned int log_num, u64 len);
> +void vhost_zerocopy_callback(struct sk_buff *skb);
> +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown);
>  
>  #define vq_err(vq, fmt, ...) do {                                  \
>  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-17 21:28                   ` Michael S. Tsirkin
@ 2011-05-17 22:21                     ` Shirley Ma
  2011-05-18  5:14                     ` Shirley Ma
  1 sibling, 0 replies; 23+ messages in thread
From: Shirley Ma @ 2011-05-17 22:21 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Wed, 2011-05-18 at 00:28 +0300, Michael S. Tsirkin wrote:
> On Tue, May 17, 2011 at 01:50:19PM -0700, Shirley Ma wrote:
> > Resubmitted the patch with the latest updates. This patch passed some
> > live-migration tests against RHEL6.2. I will run more stress tests
> > with live migration.
> > 
> > Signed-off-by: Shirley Ma <xma@us.ibm.com>
> 
> Cool. cleanup path needs a fix - are you sure you cannot
> use kobj or some other existing refcounting?
> 
> Is the perf regression caused by tx ring overrun gone now?
> 
> I added some comments about how we might be able
> to complete requests out of order but it's not a must.

Oops, I used the old version of the vhost patch instead of the most
updated one.

I will integrate your comments below and submit a V6 patch instead.

thanks
Shirley


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-17 21:28                   ` Michael S. Tsirkin
  2011-05-17 22:21                     ` Shirley Ma
@ 2011-05-18  5:14                     ` Shirley Ma
  2011-05-18  6:16                       ` [PATCH V6 " Shirley Ma
                                         ` (2 more replies)
  1 sibling, 3 replies; 23+ messages in thread
From: Shirley Ma @ 2011-05-18  5:14 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Wed, 2011-05-18 at 00:28 +0300, Michael S. Tsirkin wrote:
> On Tue, May 17, 2011 at 01:50:19PM -0700, Shirley Ma wrote:
> > Resubmitted the patch with the latest updates. This patch passed some
> > live-migration tests against RHEL6.2. I will run more stress tests
> > with live migration.
> > 
> > Signed-off-by: Shirley Ma <xma@us.ibm.com>
> 
> Cool. cleanup path needs a fix - are you sure you cannot
> use kobj or some other existing refcounting?
I will look at this.

> Is the perf regression caused by tx ring overrun gone now?
Nope.

> I added some comments about how we might be able
> to complete requests out of order but it's not a must.
Ok.

> > ---
> > 
> >  drivers/vhost/net.c   |   37 +++++++++++++++++++++++++++++++-
> >  drivers/vhost/vhost.c |   55
> ++++++++++++++++++++++++++++++++++++++++++++++++-
> >  drivers/vhost/vhost.h |   12 ++++++++++
> >  3 files changed, 101 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 2f7c76a..6bd6e28 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -32,6 +32,9 @@
> >   * Using this limit prevents one virtqueue from starving others. */
> >  #define VHOST_NET_WEIGHT 0x80000
> >  
> > +/* MAX number of TX used buffers for outstanding zerocopy */
> > +#define VHOST_MAX_ZEROCOPY_PEND 128 
> > +
> >  enum {
> >       VHOST_NET_VQ_RX = 0,
> >       VHOST_NET_VQ_TX = 1,
> > @@ -129,6 +132,7 @@ static void handle_tx(struct vhost_net *net)
> >       int err, wmem;
> >       size_t hdr_size;
> >       struct socket *sock;
> > +     struct skb_ubuf_info pend;
> >  
> >       /* TODO: check that we are running from vhost_worker? */
> >       sock = rcu_dereference_check(vq->private_data, 1);
> > @@ -151,6 +155,10 @@ static void handle_tx(struct vhost_net *net)
> >       hdr_size = vq->vhost_hlen;
> >  
> >       for (;;) {
> > +             /* Release DMAs done buffers first */
> > +             if (atomic_read(&vq->refcnt) >
> VHOST_MAX_ZEROCOPY_PEND)
> > +                     vhost_zerocopy_signal_used(vq, false);
> > +
> >               head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> >                                        ARRAY_SIZE(vq->iov),
> >                                        &out, &in,
> > @@ -166,6 +174,13 @@ static void handle_tx(struct vhost_net *net)
> >                               set_bit(SOCK_ASYNC_NOSPACE,
> &sock->flags);
> >                               break;
> >                       }
> > +                     /* If more outstanding DMAs, queue the work */
> > +                     if (sock_flag(sock->sk, SOCK_ZEROCOPY) &&
> > +                         (atomic_read(&vq->refcnt) >
> VHOST_MAX_ZEROCOPY_PEND)) {
> > +                             tx_poll_start(net, sock);
> > +                             set_bit(SOCK_ASYNC_NOSPACE,
> &sock->flags);
> > +                             break;
> > +                     }
> >                       if (unlikely(vhost_enable_notify(vq))) {
> >                               vhost_disable_notify(vq);
> >                               continue;
> > @@ -188,17 +203,35 @@ static void handle_tx(struct vhost_net *net)
> >                              iov_length(vq->hdr, s), hdr_size);
> >                       break;
> >               }
> > +             /* use msg_control to pass vhost zerocopy ubuf info to
> skb */
> > +             if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
> > +                     vq->heads[vq->upend_idx].id = head;
> > +                     if (len <= 128)
> 
> I thought we have a constant for that?

Yes, fixed it already. I might change it to PAGE_SIZE for now, given
the small-message-size regression issue.

> > +                             vq->heads[vq->upend_idx].len =
> VHOST_DMA_DONE_LEN;
> > +                     else {
> > +                             vq->heads[vq->upend_idx].len = len;
> > +                             pend.callback =
> vhost_zerocopy_callback;
> > +                             pend.arg = vq;
> > +                             pend.desc = vq->upend_idx;
> > +                             msg.msg_control = &pend;
> > +                             msg.msg_controllen = sizeof(pend);
> > +                     }
> > +                     atomic_inc(&vq->refcnt);
> > +                     vq->upend_idx = (vq->upend_idx + 1) %
> UIO_MAXIOV;
> 
> Ok, so we deal with a cyclic ring apparently? What guarantees we don't
> overrun it?

VHOST_MAX_PEND (which is 128) should prevent it from overrunning
normally. In the case where no DMAs complete at all, the maximum number
of entries is the ring size, which is much smaller than, or equal to,
UIO_MAXIOV in the current implementation.

Maybe I should add a condition check that exits if it overruns?

> 
> > +             }
> >               /* TODO: Check specific error and bomb out unless
> ENOBUFS? */
> >               err = sock->ops->sendmsg(NULL, sock, &msg, len);
> >               if (unlikely(err < 0)) {
> > -                     vhost_discard_vq_desc(vq, 1);
> > +                     if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
> > +                             vhost_discard_vq_desc(vq, 1);
> 
> How are errors handled with zerocopy?

I thought about it before: if the error occurs after the macvtap skb
is allocated, that skb has a callback which will set DMA done for
add_used; if the error occurs before the macvtap skb is allocated, it
can be reverted as in the copy case. To avoid the complexity of that
check, I decided not to handle it.

> 
> >                       tx_poll_start(net, sock);
> >                       break;
> >               }
> >               if (err != len)
> >                       pr_debug("Truncated TX packet: "
> >                                " len %d != %zd\n", err, len);
> > -             vhost_add_used_and_signal(&net->dev, vq, head, 0);
> > +             if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
> > +                     vhost_add_used_and_signal(&net->dev, vq, head,
> 0);
> >               total_len += len;
> >               if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> >                       vhost_poll_queue(&vq->poll);
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index 2ab2912..ce799d6 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev
> *dev,
> >       vq->call_ctx = NULL;
> >       vq->call = NULL;
> >       vq->log_ctx = NULL;
> > +     vq->upend_idx = 0;
> > +     vq->done_idx = 0;
> > +     atomic_set(&vq->refcnt, 0);
> >  }
> >  
> >  static int vhost_worker(void *data)
> > @@ -230,7 +233,7 @@ static long vhost_dev_alloc_iovecs(struct
> vhost_dev *dev)
> >                                              UIO_MAXIOV,
> GFP_KERNEL);
> >               dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log *
> UIO_MAXIOV,
> >                                         GFP_KERNEL);
> > -             dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads
> *
> > +             dev->vqs[i].heads = kzalloc(sizeof *dev->vqs[i].heads
> *
> >                                           UIO_MAXIOV, GFP_KERNEL);
> 
> Which fields need to be initialized actually?

None; I already fixed it by going back to kmalloc.

> >  
> >               if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
> > @@ -385,6 +388,38 @@ long vhost_dev_reset_owner(struct vhost_dev
> *dev)
> >       return 0;
> >  }
> >  
> > +/* 
> > +     comments
> > +*/
> 
> Hmm.

Fixed it in the previous patch already.

> > +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool
> shutdown)
> > +{
> > +     int i, j = 0;
> > +
> > +     i = vq->done_idx;
> > +     while (i != vq->upend_idx) {
> 
> A for loop might be clearer.
Ok.

> > +             if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) ||
> shutdown) {
> 
> On shutdown, we signal all buffers used to the guest?
> Why?

We signal all outstanding DMAs in case the driver has some DMA issue
that prevents us from shutting down. I am afraid vhost cleanup could
wait forever.

> > +                     /* reset len = 0 */
> 
> comment not very helpful.
> Could you explain what this does instead?
> Or better use some constant instead of 0 ...

Fixed it already.

> > +                     vq->heads[i].len = 0;
> > +                     i = (i + 1) % UIO_MAXIOV;
> > +                     ++j;
> > +             } else
> > +                     break;
> 
> Hmm so if the 1st entry does not complete, you do not signal anything?

With no used buffers for the guest, should we still signal?


> > +     }
> 
> Looking at this loop, done idx is the consumer and used idx
> is the producer, right?

Yes.

> > +     if (j) {
> > +             /* comments */
> 
> Yes?
> 
> > +             if (i > vq->done_idx)
> > +                     vhost_add_used_n(vq, &vq->heads[vq->done_idx],
> j);
> > +             else {
> > +                     vhost_add_used_n(vq, &vq->heads[vq->done_idx],
> > +                                      UIO_MAXIOV - vq->done_idx);
> > +                     vhost_add_used_n(vq, vq->heads, i);
> > +             }
> > +             vq->done_idx = i;
> > +             vhost_signal(vq->dev, vq);
> > +             atomic_sub(j, &vq->refcnt);
> 
> Code will likely be simpler if you call vhost_add_used once for
> each head in the first loop. Possibly add_used_signal might be
> a good idea too.

Ok.

> > +     }
> > +}
> > +
> >  /* Caller should have device mutex */
> >  void vhost_dev_cleanup(struct vhost_dev *dev)
> >  {
> > @@ -395,6 +430,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> >                       vhost_poll_stop(&dev->vqs[i].poll);
> >                       vhost_poll_flush(&dev->vqs[i].poll);
> >               }
> > +             /* wait for all lower device DMAs done, then notify
> guest */
> > +             if (atomic_read(&dev->vqs[i].refcnt)) {
> > +                     msleep(1000);
> > +                     vhost_zerocopy_signal_used(&dev->vqs[i],
> true);
> > +             }
> 
> This needs to be fixed somehow. Use a completion object and wait
> on it?

I am worried about the case where the driver has some DMA issue, which
would prevent vhost from shutting down.

> >               if (dev->vqs[i].error_ctx)
> >                       eventfd_ctx_put(dev->vqs[i].error_ctx);
> >               if (dev->vqs[i].error)
> > @@ -603,6 +643,10 @@ static long vhost_set_vring(struct vhost_dev
> *d, int ioctl, void __user *argp)
> >  
> >       mutex_lock(&vq->mutex);
> >  
> > +     /* force all lower device DMAs done */
> > +     if (atomic_read(&vq->refcnt)) 
> > +             vhost_zerocopy_signal_used(vq, true);
> > +
> >       switch (ioctl) {
> >       case VHOST_SET_VRING_NUM:
> >               /* Resizing ring with an active backend?
> > @@ -1416,3 +1460,12 @@ void vhost_disable_notify(struct
> vhost_virtqueue *vq)
> >               vq_err(vq, "Failed to enable notification at %p: %d
> \n",
> >                      &vq->used->flags, r);
> >  }
> > +
> > +void vhost_zerocopy_callback(struct sk_buff *skb)
> > +{
> > +     int idx = skb_shinfo(skb)->ubuf.desc;
> > +     struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
> > +
> > +     /* set len = 1 to mark this desc buffers done DMA */
> 
> this comment can now go.

Yes, it's gone already.

> > +     vq->heads[idx].len = VHOST_DMA_DONE_LEN;
> > +}
> > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > index b3363ae..8e3ecc7 100644
> > --- a/drivers/vhost/vhost.h
> > +++ b/drivers/vhost/vhost.h
> > @@ -13,6 +13,10 @@
> >  #include <linux/virtio_ring.h>
> >  #include <asm/atomic.h>
> >  
> > +/* This is for zerocopy, used buffer len is set to 1 when lower
> device DMA
> > + * done */
> > +#define VHOST_DMA_DONE_LEN   1
> > +
> >  struct vhost_device;
> >  
> >  struct vhost_work;
> > @@ -108,6 +112,12 @@ struct vhost_virtqueue {
> >       /* Log write descriptors */
> >       void __user *log_base;
> >       struct vhost_log *log;
> > +     /* vhost zerocopy support */
> > +     atomic_t refcnt; /* num of outstanding zerocopy DMAs */
> 
> future enhancement idea: this is used apparently under vq lock
> so no need for an atomic?

It is also used in the skb vhost zerocopy callback.

> > +     /* copy of avail idx to monitor outstanding DMA zerocopy
> buffers */
> 
> looking at code upend_idx seems to be calculated independently
> of guest avail idx - could you clarify pls?

Yes, you are right. Should I change it to: upend_idx is used to track
vring ids for outstanding zero-copy DMA buffers?

> > +     int upend_idx;
> > +     /* copy of used idx to monintor DMA done zerocopy buffers */
> 
> monitor

Ok.

> > +     int done_idx;
> 
> 
> I think in reality these are just the producer and consumer indexes
> into the heads structure, which for zero copy is used as a ring.
Yes.

> 
> >  };
> >  
> >  struct vhost_dev {
> > @@ -154,6 +164,8 @@ bool vhost_enable_notify(struct vhost_virtqueue
> *);
> >  
> >  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log
> *log,
> >                   unsigned int log_num, u64 len);
> > +void vhost_zerocopy_callback(struct sk_buff *skb);
> > +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool
> shutdown);
> >  
> >  #define vq_err(vq, fmt, ...) do {
> \
> >               pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> > 


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V6 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-18  5:14                     ` Shirley Ma
@ 2011-05-18  6:16                       ` Shirley Ma
  2011-05-18  8:43                         ` Michael S. Tsirkin
  2011-05-18  8:32                       ` [PATCH V5 " Michael S. Tsirkin
  2011-05-18  8:45                       ` Michael S. Tsirkin
  2 siblings, 1 reply; 23+ messages in thread
From: Shirley Ma @ 2011-05-18  6:16 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

Hello Michael,

Here is the updated patch based on all of your review comments, except
the completion/wait for cleanup, since I am worried that outstanding
DMAs would prevent vhost from shutting down. I am sending this out for
your review, and will test it out later.

For error handling, I updated macvtap.c so we can discard the desc even
in the zero-copy case.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
 drivers/vhost/net.c   |   42 +++++++++++++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.c |   49 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |   13 +++++++++++++
 3 files changed, 103 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2f7c76a..e87a1f8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,10 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+/* MAX number of TX used buffers for outstanding zerocopy */
+#define VHOST_MAX_PEND 128
+#define VHOST_GOODCOPY_LEN PAGE_SIZE
+
 enum {
 	VHOST_NET_VQ_RX = 0,
 	VHOST_NET_VQ_TX = 1,
@@ -129,6 +133,7 @@ static void handle_tx(struct vhost_net *net)
 	int err, wmem;
 	size_t hdr_size;
 	struct socket *sock;
+	struct skb_ubuf_info pend;
 
 	/* TODO: check that we are running from vhost_worker? */
 	sock = rcu_dereference_check(vq->private_data, 1);
@@ -151,6 +156,10 @@ static void handle_tx(struct vhost_net *net)
 	hdr_size = vq->vhost_hlen;
 
 	for (;;) {
+		/* Release DMAs done buffers first */
+		if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND)
+			vhost_zerocopy_signal_used(vq, false);
+
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
 					 &out, &in,
@@ -166,6 +175,12 @@ static void handle_tx(struct vhost_net *net)
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
+			/* If more outstanding DMAs, queue the work */
+			if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND) {
+				tx_poll_start(net, sock);
+				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+				break;
+			}
 			if (unlikely(vhost_enable_notify(vq))) {
 				vhost_disable_notify(vq);
 				continue;
@@ -188,6 +203,30 @@ static void handle_tx(struct vhost_net *net)
 			       iov_length(vq->hdr, s), hdr_size);
 			break;
 		}
+		/* use msg_control to pass vhost zerocopy ubuf info to skb */
+		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
+			vq->heads[vq->upend_idx].id = head;
+			if (len < VHOST_GOODCOPY_LEN)
+				/* a copy doesn't need to wait for DMA done */
+				vq->heads[vq->upend_idx].len =
+							VHOST_DMA_DONE_LEN;
+			else {
+				vq->heads[vq->upend_idx].len = len;
+				pend.callback = vhost_zerocopy_callback;
+				pend.arg = vq;
+				pend.desc = vq->upend_idx;
+				msg.msg_control = &pend;
+				msg.msg_controllen = sizeof(pend);
+			}
+			atomic_inc(&vq->refcnt);
+			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+			/* if the pending ring is full, wait for entries to be freed */
+			if (vq->upend_idx == vq->done_idx) {
+				tx_poll_start(net, sock);
+				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+				break;
+			}
+		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
@@ -198,7 +237,8 @@ static void handle_tx(struct vhost_net *net)
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+			vhost_add_used_and_signal(&net->dev, vq, head, 0);
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ab2912..f4c2730 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	vq->upend_idx = 0;
+	vq->done_idx = 0;
+	atomic_set(&vq->refcnt, 0);
 }
 
 static int vhost_worker(void *data)
@@ -385,16 +388,49 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
 	return 0;
 }
 
+/* Lower device DMAs may complete out of order for some reason.
+ * upend_idx tracks the producer end of the used idx ring and done_idx
+ * tracks its head. Once lower device DMAs have completed contiguously,
+ * we signal the used idx to the KVM guest.
+ */
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown)
+{
+	int i, j = 0;
+
+	for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
+		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) || shutdown) {
+			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+			vhost_add_used_and_signal(vq->dev, vq,
+						  vq->heads[i].id, 0);
+			++j;
+		} else
+			break;
+	}
+	if (j) {
+		vq->done_idx = i;
+		atomic_sub(j, &vq->refcnt);
+	}
+}
+
 /* Caller should have device mutex */
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
+	unsigned long begin = jiffies;
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
 			vhost_poll_stop(&dev->vqs[i].poll);
 			vhost_poll_flush(&dev->vqs[i].poll);
 		}
+		/* Wait for all lower device DMAs to finish, then notify
+		 * virtio_net. Or should we just notify it without waiting
+		 * here, in case some DMAs never complete for some reason? */
+		if (atomic_read(&dev->vqs[i].refcnt)) {
+			/* how long should we wait ? */
+			msleep(1000);
+			vhost_zerocopy_signal_used(&dev->vqs[i], true);
+		}
 		if (dev->vqs[i].error_ctx)
 			eventfd_ctx_put(dev->vqs[i].error_ctx);
 		if (dev->vqs[i].error)
@@ -603,6 +639,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 
 	mutex_lock(&vq->mutex);
 
+	/* clean up lower device outstanding DMAs, before setting ring */
+	if (atomic_read(&vq->refcnt))
+		vhost_zerocopy_signal_used(vq, true);
+
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		/* Resizing ring with an active backend?
@@ -1416,3 +1456,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
 		vq_err(vq, "Failed to enable notification at %p: %d\n",
 		       &vq->used->flags, r);
 }
+
+void vhost_zerocopy_callback(struct sk_buff *skb)
+{
+	int idx = skb_shinfo(skb)->ubuf.desc;
+	struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
+
+	/* set len = 1 to mark this desc buffers done DMA */
+	vq->heads[idx].len = VHOST_DMA_DONE_LEN;
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b3363ae..d0e7ac6 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -13,6 +13,11 @@
 #include <linux/virtio_ring.h>
 #include <asm/atomic.h>
 
+/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
+ * done */
+#define VHOST_DMA_DONE_LEN	1
+#define VHOST_DMA_CLEAR_LEN	0
+
 struct vhost_device;
 
 struct vhost_work;
@@ -108,6 +113,12 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	/* vhost zerocopy support */
+	atomic_t refcnt; /* num of outstanding zerocopy DMAs */
+	/* last used idx for outstanding DMA zerocopy buffers */
+	int upend_idx;
+	/* first used idx for DMA done zerocopy buffers */
+	int done_idx;
 };
 
 struct vhost_dev {
@@ -154,6 +165,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
+void vhost_zerocopy_callback(struct sk_buff *skb);
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \




^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-18  5:14                     ` Shirley Ma
  2011-05-18  6:16                       ` [PATCH V6 " Shirley Ma
@ 2011-05-18  8:32                       ` Michael S. Tsirkin
  2011-05-18  8:45                       ` Michael S. Tsirkin
  2 siblings, 0 replies; 23+ messages in thread
From: Michael S. Tsirkin @ 2011-05-18  8:32 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, May 17, 2011 at 10:14:34PM -0700, Shirley Ma wrote:
> On Wed, 2011-05-18 at 00:28 +0300, Michael S. Tsirkin wrote:
> > On Tue, May 17, 2011 at 01:50:19PM -0700, Shirley Ma wrote:
> > > Resubmitted the patch with the latest updates. This patch passed some
> > > live-migration tests against RHEL6.2. I will run more stress tests
> > > with live migration.
> > > 
> > > Signed-off-by: Shirley Ma <xma@us.ibm.com>
> > 
> > Cool. cleanup path needs a fix - are you sure you cannot
> > use kobj or some other existing refcounting?
> I will look at this.
> 
> > Is the perf regression caused by tx ring overrun gone now?
> Nope.
> 
> > I added some comments about how we might be able
> > to complete requests out of order but it's not a must.
> Ok.
> 
> > > ---
> > > 
> > >  drivers/vhost/net.c   |   37 +++++++++++++++++++++++++++++++-
> > >  drivers/vhost/vhost.c |   55
> > ++++++++++++++++++++++++++++++++++++++++++++++++-
> > >  drivers/vhost/vhost.h |   12 ++++++++++
> > >  3 files changed, 101 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index 2f7c76a..6bd6e28 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -32,6 +32,9 @@
> > >   * Using this limit prevents one virtqueue from starving others. */
> > >  #define VHOST_NET_WEIGHT 0x80000
> > >  
> > > +/* MAX number of TX used buffers for outstanding zerocopy */
> > > +#define VHOST_MAX_ZEROCOPY_PEND 128 
> > > +
> > >  enum {
> > >       VHOST_NET_VQ_RX = 0,
> > >       VHOST_NET_VQ_TX = 1,
> > > @@ -129,6 +132,7 @@ static void handle_tx(struct vhost_net *net)
> > >       int err, wmem;
> > >       size_t hdr_size;
> > >       struct socket *sock;
> > > +     struct skb_ubuf_info pend;
> > >  
> > >       /* TODO: check that we are running from vhost_worker? */
> > >       sock = rcu_dereference_check(vq->private_data, 1);
> > > @@ -151,6 +155,10 @@ static void handle_tx(struct vhost_net *net)
> > >       hdr_size = vq->vhost_hlen;
> > >  
> > >       for (;;) {
> > > +             /* Release DMAs done buffers first */
> > > +             if (atomic_read(&vq->refcnt) >
> > VHOST_MAX_ZEROCOPY_PEND)
> > > +                     vhost_zerocopy_signal_used(vq, false);
> > > +
> > >               head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > >                                        ARRAY_SIZE(vq->iov),
> > >                                        &out, &in,
> > > @@ -166,6 +174,13 @@ static void handle_tx(struct vhost_net *net)
> > >                               set_bit(SOCK_ASYNC_NOSPACE,
> > &sock->flags);
> > >                               break;
> > >                       }
> > > +                     /* If more outstanding DMAs, queue the work */
> > > +                     if (sock_flag(sock->sk, SOCK_ZEROCOPY) &&
> > > +                         (atomic_read(&vq->refcnt) >
> > VHOST_MAX_ZEROCOPY_PEND)) {
> > > +                             tx_poll_start(net, sock);
> > > +                             set_bit(SOCK_ASYNC_NOSPACE,
> > &sock->flags);
> > > +                             break;
> > > +                     }
> > >                       if (unlikely(vhost_enable_notify(vq))) {
> > >                               vhost_disable_notify(vq);
> > >                               continue;
> > > @@ -188,17 +203,35 @@ static void handle_tx(struct vhost_net *net)
> > >                              iov_length(vq->hdr, s), hdr_size);
> > >                       break;
> > >               }
> > > +             /* use msg_control to pass vhost zerocopy ubuf info to
> > skb */
> > > +             if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
> > > +                     vq->heads[vq->upend_idx].id = head;
> > > +                     if (len <= 128)
> > 
> > I thought we have a constant for that?
> 
> Yes, fixed it already. I might change it to PAGE_SIZE for now, given
> the small-message-size regression issue.
> 
> > > +                             vq->heads[vq->upend_idx].len =
> > VHOST_DMA_DONE_LEN;
> > > +                     else {
> > > +                             vq->heads[vq->upend_idx].len = len;
> > > +                             pend.callback =
> > vhost_zerocopy_callback;
> > > +                             pend.arg = vq;
> > > +                             pend.desc = vq->upend_idx;
> > > +                             msg.msg_control = &pend;
> > > +                             msg.msg_controllen = sizeof(pend);
> > > +                     }
> > > +                     atomic_inc(&vq->refcnt);
> > > +                     vq->upend_idx = (vq->upend_idx + 1) %
> > UIO_MAXIOV;
> > 
> > Ok, so we deal with a cyclic ring apparently? What guarantees we don't
> > overrun it?
> 
> VHOST_MAX_PEND (which is 128) should prevent it from overrunning
> normally. In the case where no DMAs complete at all, the maximum number
> of entries is the ring size, which is much smaller than, or equal to,
> UIO_MAXIOV in the current implementation.
> 
> Maybe I should add a condition check that exits if it overruns?

If this can't happen, maybe add a comment why. A check + WARN_ON
can't hurt either.
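
Something along these lines, perhaps - an untested sketch, reusing only
the fields this patch already adds:

	/* in handle_tx(), right after advancing vq->upend_idx */
	if (WARN_ON_ONCE(vq->upend_idx == vq->done_idx)) {
		/* pending ring full: should not happen given
		 * VHOST_MAX_ZEROCOPY_PEND, so warn and back off
		 * until some DMAs complete */
		tx_poll_start(net, sock);
		set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
		break;
	}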

> > 
> > > +             }
> > >               /* TODO: Check specific error and bomb out unless
> > ENOBUFS? */
> > >               err = sock->ops->sendmsg(NULL, sock, &msg, len);
> > >               if (unlikely(err < 0)) {
> > > -                     vhost_discard_vq_desc(vq, 1);
> > > +                     if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
> > > +                             vhost_discard_vq_desc(vq, 1);
> > 
> > How are errors handled with zerocopy?
> 
> I thought about it before: if the error occurs after the macvtap skb
> is allocated, that skb has a callback which will set DMA done for
> add_used; if the error occurs before the macvtap skb is allocated, it
> can be reverted as in the copy case. To avoid the complexity of that
> check, I decided not to handle it.

I think we need to handle it; errors do happen.
macvtap should either not invoke the callback on error, or return success.
Then it's simple.
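
E.g., roughly (a sketch only, assuming macvtap is changed so the
callback is never invoked when sendmsg fails):

	err = sock->ops->sendmsg(NULL, sock, &msg, len);
	if (unlikely(err < 0)) {
		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
			/* roll back the pending entry added above */
			vq->upend_idx = (vq->upend_idx + UIO_MAXIOV - 1)
					% UIO_MAXIOV;
			atomic_dec(&vq->refcnt);
		}
		vhost_discard_vq_desc(vq, 1);
		tx_poll_start(net, sock);
		break;
	}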

> > 
> > >                       tx_poll_start(net, sock);
> > >                       break;
> > >               }
> > >               if (err != len)
> > >                       pr_debug("Truncated TX packet: "
> > >                                " len %d != %zd\n", err, len);
> > > -             vhost_add_used_and_signal(&net->dev, vq, head, 0);
> > > +             if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
> > > +                     vhost_add_used_and_signal(&net->dev, vq, head,
> > 0);
> > >               total_len += len;
> > >               if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> > >                       vhost_poll_queue(&vq->poll);
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index 2ab2912..ce799d6 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev
> > *dev,
> > >       vq->call_ctx = NULL;
> > >       vq->call = NULL;
> > >       vq->log_ctx = NULL;
> > > +     vq->upend_idx = 0;
> > > +     vq->done_idx = 0;
> > > +     atomic_set(&vq->refcnt, 0);
> > >  }
> > >  
> > >  static int vhost_worker(void *data)
> > > @@ -230,7 +233,7 @@ static long vhost_dev_alloc_iovecs(struct
> > vhost_dev *dev)
> > >                                              UIO_MAXIOV,
> > GFP_KERNEL);
> > >               dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log *
> > UIO_MAXIOV,
> > >                                         GFP_KERNEL);
> > > -             dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads
> > *
> > > +             dev->vqs[i].heads = kzalloc(sizeof *dev->vqs[i].heads
> > *
> > >                                           UIO_MAXIOV, GFP_KERNEL);
> > 
> > Which fields need to be initialized actually?
> 
> None; I already fixed it by going back to kmalloc.
> 
> > >  
> > >               if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
> > > @@ -385,6 +388,38 @@ long vhost_dev_reset_owner(struct vhost_dev
> > *dev)
> > >       return 0;
> > >  }
> > >  
> > > +/* 
> > > +     comments
> > > +*/
> > 
> > Hmm.
> 
> Fixed it in the previous patch already.
> 
> > > +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool
> > shutdown)
> > > +{
> > > +     int i, j = 0;
> > > +
> > > +     i = vq->done_idx;
> > > +     while (i != vq->upend_idx) {
> > 
> > A for loop might be clearer.
> Ok.
> 
> > > +             if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) ||
> > shutdown) {
> > 
> > On shutdown, we signal all buffers used to the guest?
> > Why?
> 
> We signal all outstanding DMAs in case the driver has some DMA issue
> that prevents us from shutting down. I am afraid vhost cleanup could
> wait forever.

Yes, but then:
1. the guest can reuse the buffers for something else before the device reads them
2. the vhost-net module can go away
etc.

IMO, that this happens in finite time is one of the things the driver
should guarantee when it sets the zcopy flag.

BTW we must flush these on memory table change too, right?

> > > +                     /* reset len = 0 */
> > 
> > comment not very helpful.
> > Could you explain what this does instead?
> > Or better use some constant instead of 0 ...
> 
> Fixed it already.
> 
> > > +                     vq->heads[i].len = 0;
> > > +                     i = (i + 1) % UIO_MAXIOV;
> > > +                     ++j;
> > > +             } else
> > > +                     break;
> > 
> > Hmm so if the 1st entry does not complete, you do not signal anything?
> 
> With no used buffers for the guest, should we still signal?

This is just me noting that we still force used entries to be in
order. It's OK from a correctness POV, but it is likely at least part
of the reason you still see TX ring overruns.

> > > +     }
> > 
> > Looking at this loop, done idx is the consumer and used idx
> > is the producer, right?
> 
> Yes.
> 
> > > +     if (j) {
> > > +             /* comments */
> > 
> > Yes?
> > 
> > > +             if (i > vq->done_idx)
> > > +                     vhost_add_used_n(vq, &vq->heads[vq->done_idx],
> > j);
> > > +             else {
> > > +                     vhost_add_used_n(vq, &vq->heads[vq->done_idx],
> > > +                                      UIO_MAXIOV - vq->done_idx);
> > > +                     vhost_add_used_n(vq, vq->heads, i);
> > > +             }
> > > +             vq->done_idx = i;
> > > +             vhost_signal(vq->dev, vq);
> > > +             atomic_sub(j, &vq->refcnt);
> > 
> > Code will likely be simpler if you call vhost_add_used once for
> > each head in the first loop. Possibly add_used_signal might be
> > a good idea too.
> 
> Ok.
> 
> > > +     }
> > > +}
> > > +
> > >  /* Caller should have device mutex */
> > >  void vhost_dev_cleanup(struct vhost_dev *dev)
> > >  {
> > > @@ -395,6 +430,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > >                       vhost_poll_stop(&dev->vqs[i].poll);
> > >                       vhost_poll_flush(&dev->vqs[i].poll);
> > >               }
> > > +             /* wait for all lower device DMAs done, then notify
> > guest */
> > > +             if (atomic_read(&dev->vqs[i].refcnt)) {
> > > +                     msleep(1000);
> > > +                     vhost_zerocopy_signal_used(&dev->vqs[i],
> > true);
> > > +             }
> > 
> > This needs to be fixed somehow. Use a completion object and wait
> > on it?
> 
> I am worried about the case where the driver has some DMA issue,
> which would prevent vhost from shutting down.

This needs to be fixed in the driver then.

> > >               if (dev->vqs[i].error_ctx)
> > >                       eventfd_ctx_put(dev->vqs[i].error_ctx);
> > >               if (dev->vqs[i].error)
> > > @@ -603,6 +643,10 @@ static long vhost_set_vring(struct vhost_dev
> > *d, int ioctl, void __user *argp)
> > >  
> > >       mutex_lock(&vq->mutex);
> > >  
> > > +     /* force all lower device DMAs done */
> > > +     if (atomic_read(&vq->refcnt)) 
> > > +             vhost_zerocopy_signal_used(vq, true);
> > > +
> > >       switch (ioctl) {
> > >       case VHOST_SET_VRING_NUM:
> > >               /* Resizing ring with an active backend?
> > > @@ -1416,3 +1460,12 @@ void vhost_disable_notify(struct
> > vhost_virtqueue *vq)
> > >               vq_err(vq, "Failed to enable notification at %p: %d
> > \n",
> > >                      &vq->used->flags, r);
> > >  }
> > > +
> > > +void vhost_zerocopy_callback(struct sk_buff *skb)
> > > +{
> > > +     int idx = skb_shinfo(skb)->ubuf.desc;
> > > +     struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
> > > +
> > > +     /* set len = 1 to mark this desc buffers done DMA */
> > 
> > this comment can now go.
> 
> Yes, it's gone already.
> 
> > > +     vq->heads[idx].len = VHOST_DMA_DONE_LEN;
> > > +}
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index b3363ae..8e3ecc7 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -13,6 +13,10 @@
> > >  #include <linux/virtio_ring.h>
> > >  #include <asm/atomic.h>
> > >  
> > > +/* This is for zerocopy, used buffer len is set to 1 when lower
> > device DMA
> > > + * done */
> > > +#define VHOST_DMA_DONE_LEN   1
> > > +
> > >  struct vhost_device;
> > >  
> > >  struct vhost_work;
> > > @@ -108,6 +112,12 @@ struct vhost_virtqueue {
> > >       /* Log write descriptors */
> > >       void __user *log_base;
> > >       struct vhost_log *log;
> > > +     /* vhost zerocopy support */
> > > +     atomic_t refcnt; /* num of outstanding zerocopy DMAs */
> > 
> > future enhancement idea: this is used apparently under vq lock
> > so no need for an atomic?
> 
> It is also used in the skb vhost zerocopy callback.
> 
> > > +     /* copy of avail idx to monitor outstanding DMA zerocopy
> > buffers */
> > 
> > looking at code upend_idx seems to be calculated independently
> > of guest avail idx - could you clarify pls?
> 
> Yes, you are right. Should I change it to: upend_idx is used to track
> vring ids for outstanding zero-copy DMA buffers?

Mention that these are producer-consumer indexes into a head array
used as a circular ring data structure.
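
E.g. something like:

	/* producer: next free slot in the heads ring for an
	 * outstanding zerocopy DMA */
	int upend_idx;
	/* consumer: first pending slot; entries in
	 * [done_idx, upend_idx) still await DMA completion */
	int done_idx;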

> > > +     int upend_idx;
> > > +     /* copy of used idx to monintor DMA done zerocopy buffers */
> > 
> > monitor
> 
> Ok.
> 
> > > +     int done_idx;
> > 
> > 
> > I think in reality these are just the producer and consumer indexes
> > into the heads structure, which for zero copy is used as a ring.
> Yes.
> 
> > 
> > >  };
> > >  
> > >  struct vhost_dev {
> > > @@ -154,6 +164,8 @@ bool vhost_enable_notify(struct vhost_virtqueue
> > *);
> > >  
> > >  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log
> > *log,
> > >                   unsigned int log_num, u64 len);
> > > +void vhost_zerocopy_callback(struct sk_buff *skb);
> > > +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool
> > shutdown);
> > >  
> > >  #define vq_err(vq, fmt, ...) do {
> > \
> > >               pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> > > 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V6 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-18  6:16                       ` [PATCH V6 " Shirley Ma
@ 2011-05-18  8:43                         ` Michael S. Tsirkin
  0 siblings, 0 replies; 23+ messages in thread
From: Michael S. Tsirkin @ 2011-05-18  8:43 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, May 17, 2011 at 11:16:23PM -0700, Shirley Ma wrote:
> Hello Michael,
> 
> Here is the updated patch based on all of your review comments, except
> the completion/wait for cleanup, since I am worried that outstanding
> DMAs would prevent vhost from shutting down.

I don't see what you are worried about. Doing this in a timely fashion
is just one of the things the driver commits to when it publishes the
zcopy flag. Otherwise guest networking will get stuck as entries in the
tx ring don't free up, which is also a problem. Let's just add a comment
documenting this.

> I am sending this out for your review, and will test it out later.

So let's use a completion to cleanly flush outstanding requests.
Any chance kref can be reused? It's not a must.
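
For instance, a rough sketch (the refcnt accounting would have to move
into the callback so it can observe the count reaching zero; dma_done
is a hypothetical new field, initialized with init_completion() in
vhost_vq_reset()):

	/* in struct vhost_virtqueue */
	struct completion dma_done;

	/* in vhost_zerocopy_callback() */
	vq->heads[idx].len = VHOST_DMA_DONE_LEN;
	if (atomic_dec_and_test(&vq->refcnt))
		complete(&vq->dma_done);

	/* in vhost_dev_cleanup(), instead of msleep(1000) */
	if (atomic_read(&dev->vqs[i].refcnt))
		wait_for_completion(&dev->vqs[i].dma_done);
	vhost_zerocopy_signal_used(&dev->vqs[i], true);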

One other piece that is currently missing is flushing
requests on memory table update. Maybe we can stick some
info in high bits of the desc field for that?
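
As a first cut (sketch only; this ignores the desc-bits idea),
something like the following in vhost_set_memory(), before installing
the new table, might do:

	for (i = 0; i < d->nvqs; ++i) {
		mutex_lock(&d->vqs[i].mutex);
		/* flush outstanding zerocopy DMAs that still
		 * reference the old memory table */
		if (atomic_read(&d->vqs[i].refcnt))
			vhost_zerocopy_signal_used(&d->vqs[i], true);
		mutex_unlock(&d->vqs[i].mutex);
	}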

> For error handling, I updated macvtap.c so we can discard the desc even
> in the zero-copy case.
> 
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
>  drivers/vhost/net.c   |   42 +++++++++++++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.c |   49 +++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |   13 +++++++++++++
>  3 files changed, 103 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 2f7c76a..e87a1f8 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -32,6 +32,10 @@
>   * Using this limit prevents one virtqueue from starving others. */
>  #define VHOST_NET_WEIGHT 0x80000
>  
> +/* MAX number of TX used buffers for outstanding zerocopy */
> +#define VHOST_MAX_PEND 128
> +#define VHOST_GOODCOPY_LEN PAGE_SIZE
> +
>  enum {
>  	VHOST_NET_VQ_RX = 0,
>  	VHOST_NET_VQ_TX = 1,
> @@ -129,6 +133,7 @@ static void handle_tx(struct vhost_net *net)
>  	int err, wmem;
>  	size_t hdr_size;
>  	struct socket *sock;
> +	struct skb_ubuf_info pend;
>  
>  	/* TODO: check that we are running from vhost_worker? */
>  	sock = rcu_dereference_check(vq->private_data, 1);
> @@ -151,6 +156,10 @@ static void handle_tx(struct vhost_net *net)
>  	hdr_size = vq->vhost_hlen;
>  
>  	for (;;) {
> +		/* Release DMAs done buffers first */
> +		if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND)
> +			vhost_zerocopy_signal_used(vq, false);
> +
>  		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>  					 ARRAY_SIZE(vq->iov),
>  					 &out, &in,
> @@ -166,6 +175,12 @@ static void handle_tx(struct vhost_net *net)
>  				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
>  				break;
>  			}
> +			/* If more outstanding DMAs, queue the work */
> +			if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND) {
> +				tx_poll_start(net, sock);
> +				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
> +				break;
> +			}
>  			if (unlikely(vhost_enable_notify(vq))) {
>  				vhost_disable_notify(vq);
>  				continue;
> @@ -188,6 +203,30 @@ static void handle_tx(struct vhost_net *net)
>  			       iov_length(vq->hdr, s), hdr_size);
>  			break;
>  		}
> +		/* use msg_control to pass vhost zerocopy ubuf info to skb */
> +		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
> +			vq->heads[vq->upend_idx].id = head;
> +			if (len < VHOST_GOODCOPY_LEN)
> +				/* a copy doesn't need to wait for DMA done */
> +				vq->heads[vq->upend_idx].len =
> +							VHOST_DMA_DONE_LEN;
> +			else {
> +				vq->heads[vq->upend_idx].len = len;
> +				pend.callback = vhost_zerocopy_callback;
> +				pend.arg = vq;
> +				pend.desc = vq->upend_idx;
> +				msg.msg_control = &pend;
> +				msg.msg_controllen = sizeof(pend);
> +			}
> +			atomic_inc(&vq->refcnt);
> +			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
> +			/* if the pending ring is full, wait for entries to be freed */
> +			if (vq->upend_idx == vq->done_idx) {
> +				tx_poll_start(net, sock);
> +				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
> +				break;
> +			}
> +		}
>  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
>  		err = sock->ops->sendmsg(NULL, sock, &msg, len);
>  		if (unlikely(err < 0)) {
> @@ -198,7 +237,8 @@ static void handle_tx(struct vhost_net *net)
>  		if (err != len)
>  			pr_debug("Truncated TX packet: "
>  				 " len %d != %zd\n", err, len);
> -		vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +		if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
> +			vhost_add_used_and_signal(&net->dev, vq, head, 0);
>  		total_len += len;
>  		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  			vhost_poll_queue(&vq->poll);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 2ab2912..f4c2730 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call_ctx = NULL;
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
> +	vq->upend_idx = 0;
> +	vq->done_idx = 0;
> +	atomic_set(&vq->refcnt, 0);
>  }
>  
>  static int vhost_worker(void *data)
> @@ -385,16 +388,49 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
>  	return 0;
>  }
>  
> +/* Lower device DMAs may complete out of order for some reason.
> + * upend_idx tracks the producer end of the used idx ring and done_idx
> + * tracks its head. Once lower device DMAs have completed contiguously,
> + * we signal the used idx to the KVM guest.
> + */
> +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown)
> +{
> +	int i, j = 0;
> +
> +	for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
> +		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) || shutdown) {
> +			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
> +			vhost_add_used_and_signal(vq->dev, vq,
> +						  vq->heads[i].id, 0);
> +			++j;
> +		} else
> +			break;
> +	}
> +	if (j) {
> +		vq->done_idx = i;
> +		atomic_sub(j, &vq->refcnt);
> +	}
> +}
> +
>  /* Caller should have device mutex */
>  void vhost_dev_cleanup(struct vhost_dev *dev)
>  {
>  	int i;
> +	unsigned long begin = jiffies;
>  
>  	for (i = 0; i < dev->nvqs; ++i) {
>  		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
>  			vhost_poll_stop(&dev->vqs[i].poll);
>  			vhost_poll_flush(&dev->vqs[i].poll);
>  		}
> +		/* Wait for all lower device DMAs to finish, then notify
> +		 * virtio_net. Or should we just notify it without waiting
> +		 * here, in case some DMAs never complete for some reason? */
> +		if (atomic_read(&dev->vqs[i].refcnt)) {
> +			/* how long should we wait ? */
> +			msleep(1000);
> +			vhost_zerocopy_signal_used(&dev->vqs[i], true);
> +		}
>  		if (dev->vqs[i].error_ctx)
>  			eventfd_ctx_put(dev->vqs[i].error_ctx);
>  		if (dev->vqs[i].error)
> @@ -603,6 +639,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
>  
>  	mutex_lock(&vq->mutex);
>  
> +	/* clean up lower device outstanding DMAs, before setting ring */
> +	if (atomic_read(&vq->refcnt))
> +		vhost_zerocopy_signal_used(vq, true);
> +
>  	switch (ioctl) {
>  	case VHOST_SET_VRING_NUM:
>  		/* Resizing ring with an active backend?
> @@ -1416,3 +1456,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
>  		vq_err(vq, "Failed to enable notification at %p: %d\n",
>  		       &vq->used->flags, r);
>  }
> +
> +void vhost_zerocopy_callback(struct sk_buff *skb)
> +{
> +	int idx = skb_shinfo(skb)->ubuf.desc;
> +	struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;


Let's do some sanity checking of idx here in case
there's a driver bug.
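
E.g. (sketch):

	void vhost_zerocopy_callback(struct sk_buff *skb)
	{
		int idx = skb_shinfo(skb)->ubuf.desc;
		struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;

		/* catch a corrupted or out-of-range desc early */
		if (WARN_ON(idx < 0 || idx >= UIO_MAXIOV))
			return;
		vq->heads[idx].len = VHOST_DMA_DONE_LEN;
	}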

> +
> +	/* set len = 1 to mark this desc buffers done DMA */
> +	vq->heads[idx].len = VHOST_DMA_DONE_LEN;
> +}
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index b3363ae..d0e7ac6 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -13,6 +13,11 @@
>  #include <linux/virtio_ring.h>
>  #include <asm/atomic.h>
>  
> +/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
> + * done */
> +#define VHOST_DMA_DONE_LEN	1
> +#define VHOST_DMA_CLEAR_LEN	0
> +
>  struct vhost_device;
>  
>  struct vhost_work;
> @@ -108,6 +113,12 @@ struct vhost_virtqueue {
>  	/* Log write descriptors */
>  	void __user *log_base;
>  	struct vhost_log *log;
> +	/* vhost zerocopy support */
> +	atomic_t refcnt; /* num of outstanding zerocopy DMAs */
> +	/* last used idx for outstanding DMA zerocopy buffers */
> +	int upend_idx;
> +	/* first used idx for DMA done zerocopy buffers */
> +	int done_idx;
>  };
>  
>  struct vhost_dev {
> @@ -154,6 +165,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
>  
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>  		    unsigned int log_num, u64 len);
> +void vhost_zerocopy_callback(struct sk_buff *skb);
> +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown);
>  
>  #define vq_err(vq, fmt, ...) do {                                  \
>  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> 
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
  2011-05-18  5:14                     ` Shirley Ma
  2011-05-18  6:16                       ` [PATCH V6 " Shirley Ma
  2011-05-18  8:32                       ` [PATCH V5 " Michael S. Tsirkin
@ 2011-05-18  8:45                       ` Michael S. Tsirkin
  2 siblings, 0 replies; 23+ messages in thread
From: Michael S. Tsirkin @ 2011-05-18  8:45 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel

On Tue, May 17, 2011 at 10:14:34PM -0700, Shirley Ma wrote:
> Yes, fixed it already. I might change it to PAGE_SIZE for now, given
> the small-message-size regression issue.

If that fixes it, let's do that for now.
-- 
MST

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2011-05-18  8:45 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-05-16 19:34 [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support Shirley Ma
2011-05-16 20:45 ` Michael S. Tsirkin
2011-05-16 20:56   ` Shirley Ma
2011-05-16 21:24     ` Michael S. Tsirkin
2011-05-16 21:30       ` Shirley Ma
2011-05-17  4:31       ` Shirley Ma
2011-05-17  5:55         ` Michael S. Tsirkin
2011-05-17 15:22           ` Shirley Ma
2011-05-17 15:28             ` Michael S. Tsirkin
2011-05-17 15:34               ` Shirley Ma
2011-05-17 20:46                 ` [TEST PATCH net-next] vhost: accumulate multiple used and sigal in vhost TX test Shirley Ma
2011-05-17 20:52                   ` Michael S. Tsirkin
2011-05-17 20:50                 ` [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support Shirley Ma
2011-05-17 20:58                   ` Michael S. Tsirkin
2011-05-17 21:01                     ` Shirley Ma
2011-05-17 21:28                   ` Michael S. Tsirkin
2011-05-17 22:21                     ` Shirley Ma
2011-05-18  5:14                     ` Shirley Ma
2011-05-18  6:16                       ` [PATCH V6 " Shirley Ma
2011-05-18  8:43                         ` Michael S. Tsirkin
2011-05-18  8:32                       ` [PATCH V5 " Michael S. Tsirkin
2011-05-18  8:45                       ` Michael S. Tsirkin
2011-05-16 21:27     ` Michael S. Tsirkin
