* [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop
@ 2011-01-17  8:10 Jason Wang
  2011-01-17  8:11 ` [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling Jason Wang
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Jason Wang @ 2011-01-17  8:10 UTC (permalink / raw)
  To: virtualization, netdev, kvm, mst; +Cc: linux-kernel

No need to check the support of mergeable buffers inside the receive
loop as the whole handle_rx_xx() is in the read critical region.  So
this patch moves the check ahead of the receiving loop.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 14fc189..95e49de 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -411,7 +411,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 	};
 
 	size_t total_len = 0;
-	int err, headcount;
+	int err, headcount, mergeable;
 	size_t vhost_hlen, sock_hlen;
 	size_t vhost_len, sock_len;
 	struct socket *sock = rcu_dereference(vq->private_data);
@@ -425,6 +425,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
+	mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);
 
 	while ((sock_len = peek_head_len(sock->sk))) {
 		sock_len += sock_hlen;
@@ -474,7 +475,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 			break;
 		}
 		/* TODO: Should check and handle checksum. */
-		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
+		if (likely(mergeable) &&
 		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
 				      offsetof(typeof(hdr), num_buffers),
 				      sizeof hdr.num_buffers)) {
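
As a reading aid, here is a minimal sketch of the loop shape this patch produces, with the two hunks joined; the per-packet work is elided and write_num_buffers_to_guest() is a hypothetical stand-in for the memcpy_toiovecend() call above, not a real helper:

	int mergeable;

	/* Read the feature bit once: the vhost worker acts as the RCU
	 * read-side critical section, so the negotiated features cannot
	 * change while this function runs. */
	mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);

	while ((sock_len = peek_head_len(sock->sk))) {
		/* ... obtain guest buffers and recvmsg() the packet ... */
		if (likely(mergeable))
			write_num_buffers_to_guest(vq, headcount); /* hypothetical helper */
		/* ... mark the buffers used and signal the guest ... */
	}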



* [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling
  2011-01-17  8:10 [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop Jason Wang
@ 2011-01-17  8:11 ` Jason Wang
  2011-01-17  8:36   ` Michael S. Tsirkin
  2011-01-17  8:11 ` [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len() Jason Wang
  2011-01-17  8:46 ` [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop Michael S. Tsirkin
  2 siblings, 1 reply; 20+ messages in thread
From: Jason Wang @ 2011-01-17  8:11 UTC (permalink / raw)
  To: virtualization, netdev, kvm, mst; +Cc: linux-kernel

Code duplication was found between the handling of mergeable and big
buffers, so this patch tries to unify them. This could be easily done
by adding a quota to get_rx_bufs(), which is used to limit the
number of buffers it returns (for mergeable buffers, the quota is
simply UIO_MAXIOV; for big buffers, the quota is just 1), and then the
previous handle_rx_mergeable() could be reused also for big buffers.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c |  128 +++------------------------------------------------
 1 files changed, 7 insertions(+), 121 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 95e49de..c32a2e4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -227,6 +227,7 @@ static int peek_head_len(struct sock *sk)
  * @iovcount	- returned count of io vectors we fill
  * @log		- vhost log
  * @log_num	- log offset
+ * @quota       - headcount quota, 1 for big buffer
  *	returns number of buffer heads allocated, negative on error
  */
 static int get_rx_bufs(struct vhost_virtqueue *vq,
@@ -234,7 +235,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 		       int datalen,
 		       unsigned *iovcount,
 		       struct vhost_log *log,
-		       unsigned *log_num)
+		       unsigned *log_num,
+		       unsigned int quota)
 {
 	unsigned int out, in;
 	int seg = 0;
@@ -242,7 +244,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 	unsigned d;
 	int r, nlogs = 0;
 
-	while (datalen > 0) {
+	while (datalen > 0 && headcount < quota) {
 		if (unlikely(seg >= UIO_MAXIOV)) {
 			r = -ENOBUFS;
 			goto err;
@@ -282,116 +284,7 @@ err:
 
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
-static void handle_rx_big(struct vhost_net *net)
-{
-	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
-	unsigned out, in, log, s;
-	int head;
-	struct vhost_log *vq_log;
-	struct msghdr msg = {
-		.msg_name = NULL,
-		.msg_namelen = 0,
-		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
-		.msg_controllen = 0,
-		.msg_iov = vq->iov,
-		.msg_flags = MSG_DONTWAIT,
-	};
-
-	struct virtio_net_hdr hdr = {
-		.flags = 0,
-		.gso_type = VIRTIO_NET_HDR_GSO_NONE
-	};
-
-	size_t len, total_len = 0;
-	int err;
-	size_t hdr_size;
-	struct socket *sock = rcu_dereference(vq->private_data);
-	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
-		return;
-
-	mutex_lock(&vq->mutex);
-	vhost_disable_notify(vq);
-	hdr_size = vq->vhost_hlen;
-
-	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
-		vq->log : NULL;
-
-	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 vq_log, &log);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
-		/* OK, now we need to know about added descriptors. */
-		if (head == vq->num) {
-			if (unlikely(vhost_enable_notify(vq))) {
-				/* They have slipped one in as we were
-				 * doing that: check again. */
-				vhost_disable_notify(vq);
-				continue;
-			}
-			/* Nothing new?  Wait for eventfd to tell us
-			 * they refilled. */
-			break;
-		}
-		/* We don't need to be notified again. */
-		if (out) {
-			vq_err(vq, "Unexpected descriptor format for RX: "
-			       "out %d, int %d\n",
-			       out, in);
-			break;
-		}
-		/* Skip header. TODO: support TSO/mergeable rx buffers. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
-		msg.msg_iovlen = in;
-		len = iov_length(vq->iov, in);
-		/* Sanity check */
-		if (!len) {
-			vq_err(vq, "Unexpected header len for RX: "
-			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
-			break;
-		}
-		err = sock->ops->recvmsg(NULL, sock, &msg,
-					 len, MSG_DONTWAIT | MSG_TRUNC);
-		/* TODO: Check specific error and bomb out unless EAGAIN? */
-		if (err < 0) {
-			vhost_discard_vq_desc(vq, 1);
-			break;
-		}
-		/* TODO: Should check and handle checksum. */
-		if (err > len) {
-			pr_debug("Discarded truncated rx packet: "
-				 " len %d > %zd\n", err, len);
-			vhost_discard_vq_desc(vq, 1);
-			continue;
-		}
-		len = err;
-		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
-		if (err) {
-			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
-			       vq->iov->iov_base, err);
-			break;
-		}
-		len += hdr_size;
-		vhost_add_used_and_signal(&net->dev, vq, head, len);
-		if (unlikely(vq_log))
-			vhost_log_write(vq, vq_log, log, len);
-		total_len += len;
-		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
-			vhost_poll_queue(&vq->poll);
-			break;
-		}
-	}
-
-	mutex_unlock(&vq->mutex);
-}
-
-/* Expects to be always run from workqueue - which acts as
- * read-size critical section for our kind of RCU. */
-static void handle_rx_mergeable(struct vhost_net *net)
+static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 	unsigned uninitialized_var(in), log;
@@ -431,7 +324,8 @@ static void handle_rx_mergeable(struct vhost_net *net)
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
 		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
-					&in, vq_log, &log);
+					&in, vq_log, &log,
+					likely(mergeable) ? UIO_MAXIOV : 1);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(headcount < 0))
 			break;
@@ -497,14 +391,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
 	mutex_unlock(&vq->mutex);
 }
 
-static void handle_rx(struct vhost_net *net)
-{
-	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
-		handle_rx_mergeable(net);
-	else
-		handle_rx_big(net);
-}
-
 static void handle_tx_kick(struct vhost_work *work)
 {
 	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
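
To make the quota mechanism concrete, the sketch below condenses what the hunks above do; the error handling, logging and descriptor bookkeeping of the real get_rx_bufs() are omitted, so treat it as an illustration rather than the function body:

	/* get_rx_bufs(): gather descriptor heads until the packet fits or
	 * the caller's quota is exhausted. */
	while (datalen > 0 && headcount < quota) {
		/* fetch the next head, append its iovecs to the array,
		 * subtract its capacity from datalen, then: */
		headcount++;
	}

	/* handle_rx(): the quota is the only difference between the modes.
	 * UIO_MAXIOV lets one packet span many heads (mergeable buffers);
	 * 1 limits it to a single head, reproducing handle_rx_big(). */
	headcount = get_rx_bufs(vq, vq->heads, vhost_len,
				&in, vq_log, &log,
				likely(mergeable) ? UIO_MAXIOV : 1);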



* [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len()
  2011-01-17  8:10 [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop Jason Wang
  2011-01-17  8:11 ` [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling Jason Wang
@ 2011-01-17  8:11 ` Jason Wang
  2011-01-17  9:33   ` Eric Dumazet
                     ` (2 more replies)
  2011-01-17  8:46 ` [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop Michael S. Tsirkin
  2 siblings, 3 replies; 20+ messages in thread
From: Jason Wang @ 2011-01-17  8:11 UTC (permalink / raw)
  To: virtualization, netdev, kvm, mst; +Cc: linux-kernel

We can use lock_sock_fast() instead of lock_sock() in order to get
speedup in peek_head_len().

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c32a2e4..50b622a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -211,12 +211,12 @@ static int peek_head_len(struct sock *sk)
 {
 	struct sk_buff *head;
 	int len = 0;
+	bool slow = lock_sock_fast(sk);
 
-	lock_sock(sk);
 	head = skb_peek(&sk->sk_receive_queue);
 	if (head)
 		len = head->len;
-	release_sock(sk);
+	unlock_sock_fast(sk, slow);
 	return len;
 }
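
The speedup comes from the fast path inside lock_sock_fast(). Roughly, and simplified from the stock kernel helpers of this era (this sketch is not part of the patch and drops the lockdep annotations), the pair behaves like this:

bool lock_sock_fast(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (!sk->sk_lock.owned)
		/* Fast path: nobody owns the socket, so a short peek can
		 * run under the BH spinlock alone. */
		return false;

	/* Slow path: the socket is owned, fall back to the full lock. */
	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	local_bh_enable();
	return true;
}

static inline void unlock_sock_fast(struct sock *sk, bool slow)
{
	if (slow)
		release_sock(sk);	/* pairs with the slow path above */
	else
		spin_unlock_bh(&sk->sk_lock.slock);
}

For a tiny non-sleeping section like peek_head_len(), the fast path avoids the full lock_sock()/release_sock() round trip (and its backlog processing) entirely.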
 



* Re: [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling
  2011-01-17  8:11 ` [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling Jason Wang
@ 2011-01-17  8:36   ` Michael S. Tsirkin
  2011-01-18  3:05     ` Jason Wang
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-01-17  8:36 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, kvm, linux-kernel

On Mon, Jan 17, 2011 at 04:11:08PM +0800, Jason Wang wrote:
> Code duplication was found between the handling of mergeable and big
> buffers, so this patch tries to unify them. This could be easily done
> by adding a quota to get_rx_bufs(), which is used to limit the
> number of buffers it returns (for mergeable buffers, the quota is
> simply UIO_MAXIOV; for big buffers, the quota is just 1), and then the
> previous handle_rx_mergeable() could be reused also for big buffers.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

We actually started this way. However the code turned out
to have measurable overhead when handle_rx_mergeable
is called with quota 1.
So before applying this I'd like to see some data
to show this is not the case anymore.

> ---
>  drivers/vhost/net.c |  128 +++------------------------------------------------
>  1 files changed, 7 insertions(+), 121 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 95e49de..c32a2e4 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -227,6 +227,7 @@ static int peek_head_len(struct sock *sk)
>   * @iovcount	- returned count of io vectors we fill
>   * @log		- vhost log
>   * @log_num	- log offset
> + * @quota       - headcount quota, 1 for big buffer
>   *	returns number of buffer heads allocated, negative on error
>   */
>  static int get_rx_bufs(struct vhost_virtqueue *vq,
> @@ -234,7 +235,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  		       int datalen,
>  		       unsigned *iovcount,
>  		       struct vhost_log *log,
> -		       unsigned *log_num)
> +		       unsigned *log_num,
> +		       unsigned int quota)
>  {
>  	unsigned int out, in;
>  	int seg = 0;
> @@ -242,7 +244,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  	unsigned d;
>  	int r, nlogs = 0;
>  
> -	while (datalen > 0) {
> +	while (datalen > 0 && headcount < quota) {
>  		if (unlikely(seg >= UIO_MAXIOV)) {
>  			r = -ENOBUFS;
>  			goto err;
> @@ -282,116 +284,7 @@ err:
>  
>  /* Expects to be always run from workqueue - which acts as
>   * read-size critical section for our kind of RCU. */
> -static void handle_rx_big(struct vhost_net *net)
> -{
> -	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> -	unsigned out, in, log, s;
> -	int head;
> -	struct vhost_log *vq_log;
> -	struct msghdr msg = {
> -		.msg_name = NULL,
> -		.msg_namelen = 0,
> -		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
> -		.msg_controllen = 0,
> -		.msg_iov = vq->iov,
> -		.msg_flags = MSG_DONTWAIT,
> -	};
> -
> -	struct virtio_net_hdr hdr = {
> -		.flags = 0,
> -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
> -	};
> -
> -	size_t len, total_len = 0;
> -	int err;
> -	size_t hdr_size;
> -	struct socket *sock = rcu_dereference(vq->private_data);
> -	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> -		return;
> -
> -	mutex_lock(&vq->mutex);
> -	vhost_disable_notify(vq);
> -	hdr_size = vq->vhost_hlen;
> -
> -	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> -		vq->log : NULL;
> -
> -	for (;;) {
> -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> -					 ARRAY_SIZE(vq->iov),
> -					 &out, &in,
> -					 vq_log, &log);
> -		/* On error, stop handling until the next kick. */
> -		if (unlikely(head < 0))
> -			break;
> -		/* OK, now we need to know about added descriptors. */
> -		if (head == vq->num) {
> -			if (unlikely(vhost_enable_notify(vq))) {
> -				/* They have slipped one in as we were
> -				 * doing that: check again. */
> -				vhost_disable_notify(vq);
> -				continue;
> -			}
> -			/* Nothing new?  Wait for eventfd to tell us
> -			 * they refilled. */
> -			break;
> -		}
> -		/* We don't need to be notified again. */
> -		if (out) {
> -			vq_err(vq, "Unexpected descriptor format for RX: "
> -			       "out %d, int %d\n",
> -			       out, in);
> -			break;
> -		}
> -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
> -		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> -		msg.msg_iovlen = in;
> -		len = iov_length(vq->iov, in);
> -		/* Sanity check */
> -		if (!len) {
> -			vq_err(vq, "Unexpected header len for RX: "
> -			       "%zd expected %zd\n",
> -			       iov_length(vq->hdr, s), hdr_size);
> -			break;
> -		}
> -		err = sock->ops->recvmsg(NULL, sock, &msg,
> -					 len, MSG_DONTWAIT | MSG_TRUNC);
> -		/* TODO: Check specific error and bomb out unless EAGAIN? */
> -		if (err < 0) {
> -			vhost_discard_vq_desc(vq, 1);
> -			break;
> -		}
> -		/* TODO: Should check and handle checksum. */
> -		if (err > len) {
> -			pr_debug("Discarded truncated rx packet: "
> -				 " len %d > %zd\n", err, len);
> -			vhost_discard_vq_desc(vq, 1);
> -			continue;
> -		}
> -		len = err;
> -		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
> -		if (err) {
> -			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
> -			       vq->iov->iov_base, err);
> -			break;
> -		}
> -		len += hdr_size;
> -		vhost_add_used_and_signal(&net->dev, vq, head, len);
> -		if (unlikely(vq_log))
> -			vhost_log_write(vq, vq_log, log, len);
> -		total_len += len;
> -		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> -			vhost_poll_queue(&vq->poll);
> -			break;
> -		}
> -	}
> -
> -	mutex_unlock(&vq->mutex);
> -}
> -
> -/* Expects to be always run from workqueue - which acts as
> - * read-size critical section for our kind of RCU. */
> -static void handle_rx_mergeable(struct vhost_net *net)
> +static void handle_rx(struct vhost_net *net)
>  {
>  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
>  	unsigned uninitialized_var(in), log;
> @@ -431,7 +324,8 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  		sock_len += sock_hlen;
>  		vhost_len = sock_len + vhost_hlen;
>  		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
> -					&in, vq_log, &log);
> +					&in, vq_log, &log,
> +					likely(mergeable) ? UIO_MAXIOV : 1);
>  		/* On error, stop handling until the next kick. */
>  		if (unlikely(headcount < 0))
>  			break;
> @@ -497,14 +391,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  	mutex_unlock(&vq->mutex);
>  }
>  
> -static void handle_rx(struct vhost_net *net)
> -{
> -	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
> -		handle_rx_mergeable(net);
> -	else
> -		handle_rx_big(net);
> -}
> -
>  static void handle_tx_kick(struct vhost_work *work)
>  {
>  	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,


* Re: [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop
  2011-01-17  8:10 [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop Jason Wang
  2011-01-17  8:11 ` [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling Jason Wang
  2011-01-17  8:11 ` [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len() Jason Wang
@ 2011-01-17  8:46 ` Michael S. Tsirkin
  2011-01-18  4:26   ` Jason Wang
  2 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-01-17  8:46 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, kvm, linux-kernel

On Mon, Jan 17, 2011 at 04:10:59PM +0800, Jason Wang wrote:
> No need to check the support of mergeable buffers inside the receive
> loop as the whole handle_rx_xx() is in the read critical region.  So
> this patch moves the check ahead of the receiving loop.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Well feature check is mostly just features & bit
so why would it be slower? Because of the rcu
adding memory barriers? Do you observe a
measurable speedup with this patch?

Apropos, I noticed that the check in vhost_has_feature
is wrong: it uses the same kind of RCU as the
private pointer. So we'll have to fix that properly
by adding more lockdep classes, but for now
we'll need to make
the check 1 || lockdep_is_held(&dev->mutex);
and add a TODO.
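
For readers following along, the interim change suggested here would look roughly like this in drivers/vhost/vhost.h; a sketch of the idea, assuming the rcu_dereference_index_check() accessor in use at the time, not the exact patch that was eventually merged:

static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
{
	unsigned acked_features;

	/* TODO: acked_features is not protected by the same RCU scheme as
	 * vq->private_data; add dedicated lockdep classes instead of
	 * accepting every caller unconditionally. */
	acked_features = rcu_dereference_index_check(dev->acked_features,
					1 || lockdep_is_held(&dev->mutex));
	return acked_features & (1 << bit);
}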

> ---
>  drivers/vhost/net.c |    5 +++--
>  1 files changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 14fc189..95e49de 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -411,7 +411,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  	};
>  
>  	size_t total_len = 0;
> -	int err, headcount;
> +	int err, headcount, mergeable;
>  	size_t vhost_hlen, sock_hlen;
>  	size_t vhost_len, sock_len;
>  	struct socket *sock = rcu_dereference(vq->private_data);
> @@ -425,6 +425,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  
>  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
>  		vq->log : NULL;
> +	mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);
>  
>  	while ((sock_len = peek_head_len(sock->sk))) {
>  		sock_len += sock_hlen;
> @@ -474,7 +475,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  			break;
>  		}
>  		/* TODO: Should check and handle checksum. */
> -		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
> +		if (likely(mergeable) &&
>  		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
>  				      offsetof(typeof(hdr), num_buffers),
>  				      sizeof hdr.num_buffers)) {


* Re: [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len()
  2011-01-17  8:11 ` [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len() Jason Wang
@ 2011-01-17  9:33   ` Eric Dumazet
  2011-01-17  9:57   ` Michael S. Tsirkin
  2011-03-13 15:06   ` Michael S. Tsirkin
  2 siblings, 0 replies; 20+ messages in thread
From: Eric Dumazet @ 2011-01-17  9:33 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, kvm, mst, linux-kernel

On Monday 17 January 2011 at 16:11 +0800, Jason Wang wrote:
> We can use lock_sock_fast() instead of lock_sock() in order to get
> speedup in peek_head_len().
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/net.c |    4 ++--
>  1 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c32a2e4..50b622a 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -211,12 +211,12 @@ static int peek_head_len(struct sock *sk)
>  {
>  	struct sk_buff *head;
>  	int len = 0;
> +	bool slow = lock_sock_fast(sk);
>  
> -	lock_sock(sk);
>  	head = skb_peek(&sk->sk_receive_queue);
>  	if (head)
>  		len = head->len;
> -	release_sock(sk);
> +	unlock_sock_fast(sk, slow);
>  	return len;
>  }
>  
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>




* Re: [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len()
  2011-01-17  8:11 ` [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len() Jason Wang
  2011-01-17  9:33   ` Eric Dumazet
@ 2011-01-17  9:57   ` Michael S. Tsirkin
  2011-03-13 15:06   ` Michael S. Tsirkin
  2 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-01-17  9:57 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, kvm, linux-kernel, eric.dumazet

On Mon, Jan 17, 2011 at 04:11:17PM +0800, Jason Wang wrote:
> We can use lock_sock_fast() instead of lock_sock() in order to get
> speedup in peek_head_len().
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Queued for 2.6.39, thanks everyone.

> ---
>  drivers/vhost/net.c |    4 ++--
>  1 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c32a2e4..50b622a 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -211,12 +211,12 @@ static int peek_head_len(struct sock *sk)
>  {
>  	struct sk_buff *head;
>  	int len = 0;
> +	bool slow = lock_sock_fast(sk);
>  
> -	lock_sock(sk);
>  	head = skb_peek(&sk->sk_receive_queue);
>  	if (head)
>  		len = head->len;
> -	release_sock(sk);
> +	unlock_sock_fast(sk, slow);
>  	return len;
>  }
>  


* Re: [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling
  2011-01-17  8:36   ` Michael S. Tsirkin
@ 2011-01-18  3:05     ` Jason Wang
  2011-01-18  4:37       ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Jason Wang @ 2011-01-18  3:05 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel

Michael S. Tsirkin writes:
 > On Mon, Jan 17, 2011 at 04:11:08PM +0800, Jason Wang wrote:
 > > Code duplication was found between the handling of mergeable and big
 > > buffers, so this patch tries to unify them. This could be easily done
 > > by adding a quota to get_rx_bufs(), which is used to limit the
 > > number of buffers it returns (for mergeable buffers, the quota is
 > > simply UIO_MAXIOV; for big buffers, the quota is just 1), and then the
 > > previous handle_rx_mergeable() could be reused also for big buffers.
 > > 
 > > Signed-off-by: Jason Wang <jasowang@redhat.com>
 > 
 > We actually started this way. However the code turned out
 > to have measurable overhead when handle_rx_mergeable
 > is called with quota 1.
 > So before applying this I'd like to see some data
 > to show this is not the case anymore.
 > 

I've run a round of tests (Host->Guest) for these three patches on my desktop:

Without these patches

mergeable buffers:
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.42 (10.66.91.42) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       575.87   69.20    26.36    39.375  7.499  
 87380  16384    256    60.01      1123.57   73.16    34.73    21.335  5.064  
 87380  16384    512    60.00      1351.12   75.26    35.80    18.251  4.341  
 87380  16384   1024    60.00      1955.31   74.73    37.67    12.523  3.156  
 87380  16384   2048    60.01      3411.92   74.82    39.49    7.186   1.896  

big buffers:
Netperf test results
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.109 (10.66.91.109) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       567.10   72.06    26.13    41.638  7.550  
 87380  16384    256    60.00      1143.69   74.66    32.58    21.392  4.667  
 87380  16384    512    60.00      1460.92   73.94    33.40    16.585  3.746  
 87380  16384   1024    60.00      3454.85   77.49    33.89    7.349   1.607  
 87380  16384   2048    60.00      3781.11   76.51    38.38    6.631   1.663  

With these patches:

mergeable buffers:
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.236 (10.66.91.236) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       657.53   71.27    26.42    35.517  6.583  
 87380  16384    256    60.00      1217.73   74.34    34.67    20.004  4.665  
 87380  16384    512    60.00      1575.25   75.27    37.12    15.658  3.861  
 87380  16384   1024    60.00      2416.07   74.77    37.20    10.140  2.522  
 87380  16384   2048    60.00      3702.29   77.31    36.29    6.842   1.606  

big buffers:
Netperf test results
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.202 (10.66.91.202) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       647.67   71.86    27.26    36.356  6.895  
 87380  16384    256    60.00      1265.82   76.19    36.54    19.724  4.729  
 87380  16384    512    60.00      1796.64   76.06    39.48    13.872  3.601  
 87380  16384   1024    60.00      4008.37   77.05    36.47    6.299   1.491  
 87380  16384   2048    60.00      4468.56   75.18    41.79    5.513   1.532 

Looks like the unification does not hurt the performance, and with those patches
we can get some improvement. BTW, the regression with mergeable buffers still
exists.

 > > ---
 > >  drivers/vhost/net.c |  128 +++------------------------------------------------
 > >  1 files changed, 7 insertions(+), 121 deletions(-)
 > > 
 > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 > > index 95e49de..c32a2e4 100644
 > > --- a/drivers/vhost/net.c
 > > +++ b/drivers/vhost/net.c
 > > @@ -227,6 +227,7 @@ static int peek_head_len(struct sock *sk)
 > >   * @iovcount	- returned count of io vectors we fill
 > >   * @log		- vhost log
 > >   * @log_num	- log offset
 > > + * @quota       - headcount quota, 1 for big buffer
 > >   *	returns number of buffer heads allocated, negative on error
 > >   */
 > >  static int get_rx_bufs(struct vhost_virtqueue *vq,
 > > @@ -234,7 +235,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 > >  		       int datalen,
 > >  		       unsigned *iovcount,
 > >  		       struct vhost_log *log,
 > > -		       unsigned *log_num)
 > > +		       unsigned *log_num,
 > > +		       unsigned int quota)
 > >  {
 > >  	unsigned int out, in;
 > >  	int seg = 0;
 > > @@ -242,7 +244,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 > >  	unsigned d;
 > >  	int r, nlogs = 0;
 > >  
 > > -	while (datalen > 0) {
 > > +	while (datalen > 0 && headcount < quota) {
 > >  		if (unlikely(seg >= UIO_MAXIOV)) {
 > >  			r = -ENOBUFS;
 > >  			goto err;
 > > @@ -282,116 +284,7 @@ err:
 > >  
 > >  /* Expects to be always run from workqueue - which acts as
 > >   * read-size critical section for our kind of RCU. */
 > > -static void handle_rx_big(struct vhost_net *net)
 > > -{
 > > -	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 > > -	unsigned out, in, log, s;
 > > -	int head;
 > > -	struct vhost_log *vq_log;
 > > -	struct msghdr msg = {
 > > -		.msg_name = NULL,
 > > -		.msg_namelen = 0,
 > > -		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
 > > -		.msg_controllen = 0,
 > > -		.msg_iov = vq->iov,
 > > -		.msg_flags = MSG_DONTWAIT,
 > > -	};
 > > -
 > > -	struct virtio_net_hdr hdr = {
 > > -		.flags = 0,
 > > -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
 > > -	};
 > > -
 > > -	size_t len, total_len = 0;
 > > -	int err;
 > > -	size_t hdr_size;
 > > -	struct socket *sock = rcu_dereference(vq->private_data);
 > > -	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
 > > -		return;
 > > -
 > > -	mutex_lock(&vq->mutex);
 > > -	vhost_disable_notify(vq);
 > > -	hdr_size = vq->vhost_hlen;
 > > -
 > > -	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 > > -		vq->log : NULL;
 > > -
 > > -	for (;;) {
 > > -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 > > -					 ARRAY_SIZE(vq->iov),
 > > -					 &out, &in,
 > > -					 vq_log, &log);
 > > -		/* On error, stop handling until the next kick. */
 > > -		if (unlikely(head < 0))
 > > -			break;
 > > -		/* OK, now we need to know about added descriptors. */
 > > -		if (head == vq->num) {
 > > -			if (unlikely(vhost_enable_notify(vq))) {
 > > -				/* They have slipped one in as we were
 > > -				 * doing that: check again. */
 > > -				vhost_disable_notify(vq);
 > > -				continue;
 > > -			}
 > > -			/* Nothing new?  Wait for eventfd to tell us
 > > -			 * they refilled. */
 > > -			break;
 > > -		}
 > > -		/* We don't need to be notified again. */
 > > -		if (out) {
 > > -			vq_err(vq, "Unexpected descriptor format for RX: "
 > > -			       "out %d, int %d\n",
 > > -			       out, in);
 > > -			break;
 > > -		}
 > > -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
 > > -		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
 > > -		msg.msg_iovlen = in;
 > > -		len = iov_length(vq->iov, in);
 > > -		/* Sanity check */
 > > -		if (!len) {
 > > -			vq_err(vq, "Unexpected header len for RX: "
 > > -			       "%zd expected %zd\n",
 > > -			       iov_length(vq->hdr, s), hdr_size);
 > > -			break;
 > > -		}
 > > -		err = sock->ops->recvmsg(NULL, sock, &msg,
 > > -					 len, MSG_DONTWAIT | MSG_TRUNC);
 > > -		/* TODO: Check specific error and bomb out unless EAGAIN? */
 > > -		if (err < 0) {
 > > -			vhost_discard_vq_desc(vq, 1);
 > > -			break;
 > > -		}
 > > -		/* TODO: Should check and handle checksum. */
 > > -		if (err > len) {
 > > -			pr_debug("Discarded truncated rx packet: "
 > > -				 " len %d > %zd\n", err, len);
 > > -			vhost_discard_vq_desc(vq, 1);
 > > -			continue;
 > > -		}
 > > -		len = err;
 > > -		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
 > > -		if (err) {
 > > -			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
 > > -			       vq->iov->iov_base, err);
 > > -			break;
 > > -		}
 > > -		len += hdr_size;
 > > -		vhost_add_used_and_signal(&net->dev, vq, head, len);
 > > -		if (unlikely(vq_log))
 > > -			vhost_log_write(vq, vq_log, log, len);
 > > -		total_len += len;
 > > -		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 > > -			vhost_poll_queue(&vq->poll);
 > > -			break;
 > > -		}
 > > -	}
 > > -
 > > -	mutex_unlock(&vq->mutex);
 > > -}
 > > -
 > > -/* Expects to be always run from workqueue - which acts as
 > > - * read-size critical section for our kind of RCU. */
 > > -static void handle_rx_mergeable(struct vhost_net *net)
 > > +static void handle_rx(struct vhost_net *net)
 > >  {
 > >  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 > >  	unsigned uninitialized_var(in), log;
 > > @@ -431,7 +324,8 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  		sock_len += sock_hlen;
 > >  		vhost_len = sock_len + vhost_hlen;
 > >  		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
 > > -					&in, vq_log, &log);
 > > +					&in, vq_log, &log,
 > > +					likely(mergeable) ? UIO_MAXIOV : 1);
 > >  		/* On error, stop handling until the next kick. */
 > >  		if (unlikely(headcount < 0))
 > >  			break;
 > > @@ -497,14 +391,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  	mutex_unlock(&vq->mutex);
 > >  }
 > >  
 > > -static void handle_rx(struct vhost_net *net)
 > > -{
 > > -	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
 > > -		handle_rx_mergeable(net);
 > > -	else
 > > -		handle_rx_big(net);
 > > -}
 > > -
 > >  static void handle_tx_kick(struct vhost_work *work)
 > >  {
 > >  	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,


* Re: [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop
  2011-01-17  8:46 ` [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop Michael S. Tsirkin
@ 2011-01-18  4:26   ` Jason Wang
  2011-01-18  4:36     ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Jason Wang @ 2011-01-18  4:26 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel

Michael S. Tsirkin writes:
 > On Mon, Jan 17, 2011 at 04:10:59PM +0800, Jason Wang wrote:
 > > No need to check the support of mergeable buffers inside the receive
 > > loop as the whole handle_rx_xx() is in the read critical region.  So
 > > this patch moves the check ahead of the receiving loop.
 > > 
 > > Signed-off-by: Jason Wang <jasowang@redhat.com>
 > 
 > Well feature check is mostly just features & bit
 > so why would it be slower? Because of the rcu
 > adding memory barriers? Do you observe a
 > measurable speedup with this patch?
 > 

I did not measure the performance of just this patch, so the gain may not be obvious. It
can also help with the code unification.

 > Apropos, I noticed that the check in vhost_has_feature
 > is wrong: it uses the same kind of RCU as the
 > private pointer. So we'll have to fix that properly
 > by adding more lockdep classes, but for now
 > we'll need to make
 > the check 1 || lockdep_is_held(&dev->mutex);
 > and add a TODO.
 > 

Not sure; lockdep_is_held(&dev->mutex) may not be accurate but should be sufficient, as it
was always held in the read critical region.

 > > ---
 > >  drivers/vhost/net.c |    5 +++--
 > >  1 files changed, 3 insertions(+), 2 deletions(-)
 > > 
 > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 > > index 14fc189..95e49de 100644
 > > --- a/drivers/vhost/net.c
 > > +++ b/drivers/vhost/net.c
 > > @@ -411,7 +411,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  	};
 > >  
 > >  	size_t total_len = 0;
 > > -	int err, headcount;
 > > +	int err, headcount, mergeable;
 > >  	size_t vhost_hlen, sock_hlen;
 > >  	size_t vhost_len, sock_len;
 > >  	struct socket *sock = rcu_dereference(vq->private_data);
 > > @@ -425,6 +425,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  
 > >  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 > >  		vq->log : NULL;
 > > +	mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);
 > >  
 > >  	while ((sock_len = peek_head_len(sock->sk))) {
 > >  		sock_len += sock_hlen;
 > > @@ -474,7 +475,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  			break;
 > >  		}
 > >  		/* TODO: Should check and handle checksum. */
 > > -		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
 > > +		if (likely(mergeable) &&
 > >  		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
 > >  				      offsetof(typeof(hdr), num_buffers),
 > >  				      sizeof hdr.num_buffers)) {


* Re: [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop
  2011-01-18  4:26   ` Jason Wang
@ 2011-01-18  4:36     ` Michael S. Tsirkin
  2011-01-18  9:15       ` Jason Wang
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-01-18  4:36 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, kvm, linux-kernel

On Tue, Jan 18, 2011 at 12:26:17PM +0800, Jason Wang wrote:
> Michael S. Tsirkin writes:
>  > On Mon, Jan 17, 2011 at 04:10:59PM +0800, Jason Wang wrote:
>  > > No need to check the support of mergeable buffers inside the receive
>  > > loop as the whole handle_rx_xx() is in the read critical region.  So
>  > > this patch moves the check ahead of the receiving loop.
>  > > 
>  > > Signed-off-by: Jason Wang <jasowang@redhat.com>
>  > 
>  > Well feature check is mostly just features & bit
>  > so why would it be slower? Because of the rcu
>  > adding memory barriers? Do you observe a
>  > measurable speedup with this patch?
>  > 
> 
> I did not measure the performance of just this patch, so the gain may not be obvious. It
> can also help with the code unification.
> 
>  > Apropos, I noticed that the check in vhost_has_feature
>  > is wrong: it uses the same kind of RCU as the
>  > private pointer. So we'll have to fix that properly
>  > by adding more lockdep classes, but for now
>  > we'll need to make
>  > the check 1 || lockdep_is_held(&dev->mutex);
>  > and add a TODO.
>  > 
> 
> Not sure; lockdep_is_held(&dev->mutex) may not be accurate but should be sufficient, as it
> was always held in the read critical region.

Not really, when we call vhost_has_feature from the vq handling thread
it's not.
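
That path is visible in drivers/vhost/net.c: the kick handler runs in the vhost worker and never takes dev->mutex, only vq->mutex inside handle_rx(), so a lockdep expression keyed solely on dev->mutex would warn spuriously. A sketch (the body mirrors the existing handle_rx_kick(); the comment is the point being made):

static void handle_rx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	/* Worker context: handle_rx() takes vq->mutex, not dev->mutex, yet
	 * it calls vhost_has_feature() -- hence the "1 ||" stopgap above. */
	handle_rx(net);
}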

>  > > ---
>  > >  drivers/vhost/net.c |    5 +++--
>  > >  1 files changed, 3 insertions(+), 2 deletions(-)
>  > > 
>  > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>  > > index 14fc189..95e49de 100644
>  > > --- a/drivers/vhost/net.c
>  > > +++ b/drivers/vhost/net.c
>  > > @@ -411,7 +411,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  > >  	};
>  > >  
>  > >  	size_t total_len = 0;
>  > > -	int err, headcount;
>  > > +	int err, headcount, mergeable;
>  > >  	size_t vhost_hlen, sock_hlen;
>  > >  	size_t vhost_len, sock_len;
>  > >  	struct socket *sock = rcu_dereference(vq->private_data);
>  > > @@ -425,6 +425,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  > >  
>  > >  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
>  > >  		vq->log : NULL;
>  > > +	mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);
>  > >  
>  > >  	while ((sock_len = peek_head_len(sock->sk))) {
>  > >  		sock_len += sock_hlen;
>  > > @@ -474,7 +475,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  > >  			break;
>  > >  		}
>  > >  		/* TODO: Should check and handle checksum. */
>  > > -		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
>  > > +		if (likely(mergeable) &&
>  > >  		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
>  > >  				      offsetof(typeof(hdr), num_buffers),
>  > >  				      sizeof hdr.num_buffers)) {


* Re: [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling
  2011-01-18  3:05     ` Jason Wang
@ 2011-01-18  4:37       ` Michael S. Tsirkin
  2011-01-18  7:41         ` Jason Wang
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-01-18  4:37 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, kvm, linux-kernel

On Tue, Jan 18, 2011 at 11:05:33AM +0800, Jason Wang wrote:
> Michael S. Tsirkin writes:
>  > On Mon, Jan 17, 2011 at 04:11:08PM +0800, Jason Wang wrote:
>  > > Code duplication was found between the handling of mergeable and big
>  > > buffers, so this patch tries to unify them. This could be easily done
>  > > by adding a quota to get_rx_bufs(), which is used to limit the
>  > > number of buffers it returns (for mergeable buffers, the quota is
>  > > simply UIO_MAXIOV; for big buffers, the quota is just 1), and then the
>  > > previous handle_rx_mergeable() could be reused also for big buffers.
>  > > 
>  > > Signed-off-by: Jason Wang <jasowang@redhat.com>
>  > 
>  > We actually started this way. However the code turned out
>  > to have measurable overhead when handle_rx_mergeable
>  > is called with quota 1.
>  > So before applying this I'd like to see some data
>  > to show this is not the case anymore.
>  > 
> 
> I've run a round of tests (Host->Guest) for these three patches on my desktop:

Yes but what if you only apply patch 3?

> Without these patches
> 
> mergeable buffers:
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.42 (10.66.91.42) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384     64    60.00       575.87   69.20    26.36    39.375  7.499  
>  87380  16384    256    60.01      1123.57   73.16    34.73    21.335  5.064  
>  87380  16384    512    60.00      1351.12   75.26    35.80    18.251  4.341  
>  87380  16384   1024    60.00      1955.31   74.73    37.67    12.523  3.156  
>  87380  16384   2048    60.01      3411.92   74.82    39.49    7.186   1.896  
> 
> big buffers:
> Netperf test results
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.109 (10.66.91.109) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384     64    60.00       567.10   72.06    26.13    41.638  7.550  
>  87380  16384    256    60.00      1143.69   74.66    32.58    21.392  4.667  
>  87380  16384    512    60.00      1460.92   73.94    33.40    16.585  3.746  
>  87380  16384   1024    60.00      3454.85   77.49    33.89    7.349   1.607  
>  87380  16384   2048    60.00      3781.11   76.51    38.38    6.631   1.663  
> 
> With these patches:
> 
> mergeable buffers:
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.236 (10.66.91.236) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384     64    60.00       657.53   71.27    26.42    35.517  6.583  
>  87380  16384    256    60.00      1217.73   74.34    34.67    20.004  4.665  
>  87380  16384    512    60.00      1575.25   75.27    37.12    15.658  3.861  
>  87380  16384   1024    60.00      2416.07   74.77    37.20    10.140  2.522  
>  87380  16384   2048    60.00      3702.29   77.31    36.29    6.842   1.606  
> 
> big buffers:
> Netperf test results
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.202 (10.66.91.202) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384     64    60.00       647.67   71.86    27.26    36.356  6.895  
>  87380  16384    256    60.00      1265.82   76.19    36.54    19.724  4.729  
>  87380  16384    512    60.00      1796.64   76.06    39.48    13.872  3.601  
>  87380  16384   1024    60.00      4008.37   77.05    36.47    6.299   1.491  
>  87380  16384   2048    60.00      4468.56   75.18    41.79    5.513   1.532 
> 
> Looks like the unification does not hurt the performance, and with those patches
> we can get some improvement. BTW, the regression with mergeable buffers still
> exists.
> 
>  > > ---
>  > >  drivers/vhost/net.c |  128 +++------------------------------------------------
>  > >  1 files changed, 7 insertions(+), 121 deletions(-)
>  > > 
>  > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>  > > index 95e49de..c32a2e4 100644
>  > > --- a/drivers/vhost/net.c
>  > > +++ b/drivers/vhost/net.c
>  > > @@ -227,6 +227,7 @@ static int peek_head_len(struct sock *sk)
>  > >   * @iovcount	- returned count of io vectors we fill
>  > >   * @log		- vhost log
>  > >   * @log_num	- log offset
>  > > + * @quota       - headcount quota, 1 for big buffer
>  > >   *	returns number of buffer heads allocated, negative on error
>  > >   */
>  > >  static int get_rx_bufs(struct vhost_virtqueue *vq,
>  > > @@ -234,7 +235,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  > >  		       int datalen,
>  > >  		       unsigned *iovcount,
>  > >  		       struct vhost_log *log,
>  > > -		       unsigned *log_num)
>  > > +		       unsigned *log_num,
>  > > +		       unsigned int quota)
>  > >  {
>  > >  	unsigned int out, in;
>  > >  	int seg = 0;
>  > > @@ -242,7 +244,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  > >  	unsigned d;
>  > >  	int r, nlogs = 0;
>  > >  
>  > > -	while (datalen > 0) {
>  > > +	while (datalen > 0 && headcount < quota) {
>  > >  		if (unlikely(seg >= UIO_MAXIOV)) {
>  > >  			r = -ENOBUFS;
>  > >  			goto err;
>  > > @@ -282,116 +284,7 @@ err:
>  > >  
>  > >  /* Expects to be always run from workqueue - which acts as
>  > >   * read-size critical section for our kind of RCU. */
>  > > -static void handle_rx_big(struct vhost_net *net)
>  > > -{
>  > > -	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
>  > > -	unsigned out, in, log, s;
>  > > -	int head;
>  > > -	struct vhost_log *vq_log;
>  > > -	struct msghdr msg = {
>  > > -		.msg_name = NULL,
>  > > -		.msg_namelen = 0,
>  > > -		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
>  > > -		.msg_controllen = 0,
>  > > -		.msg_iov = vq->iov,
>  > > -		.msg_flags = MSG_DONTWAIT,
>  > > -	};
>  > > -
>  > > -	struct virtio_net_hdr hdr = {
>  > > -		.flags = 0,
>  > > -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
>  > > -	};
>  > > -
>  > > -	size_t len, total_len = 0;
>  > > -	int err;
>  > > -	size_t hdr_size;
>  > > -	struct socket *sock = rcu_dereference(vq->private_data);
>  > > -	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
>  > > -		return;
>  > > -
>  > > -	mutex_lock(&vq->mutex);
>  > > -	vhost_disable_notify(vq);
>  > > -	hdr_size = vq->vhost_hlen;
>  > > -
>  > > -	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
>  > > -		vq->log : NULL;
>  > > -
>  > > -	for (;;) {
>  > > -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>  > > -					 ARRAY_SIZE(vq->iov),
>  > > -					 &out, &in,
>  > > -					 vq_log, &log);
>  > > -		/* On error, stop handling until the next kick. */
>  > > -		if (unlikely(head < 0))
>  > > -			break;
>  > > -		/* OK, now we need to know about added descriptors. */
>  > > -		if (head == vq->num) {
>  > > -			if (unlikely(vhost_enable_notify(vq))) {
>  > > -				/* They have slipped one in as we were
>  > > -				 * doing that: check again. */
>  > > -				vhost_disable_notify(vq);
>  > > -				continue;
>  > > -			}
>  > > -			/* Nothing new?  Wait for eventfd to tell us
>  > > -			 * they refilled. */
>  > > -			break;
>  > > -		}
>  > > -		/* We don't need to be notified again. */
>  > > -		if (out) {
>  > > -			vq_err(vq, "Unexpected descriptor format for RX: "
>  > > -			       "out %d, int %d\n",
>  > > -			       out, in);
>  > > -			break;
>  > > -		}
>  > > -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
>  > > -		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
>  > > -		msg.msg_iovlen = in;
>  > > -		len = iov_length(vq->iov, in);
>  > > -		/* Sanity check */
>  > > -		if (!len) {
>  > > -			vq_err(vq, "Unexpected header len for RX: "
>  > > -			       "%zd expected %zd\n",
>  > > -			       iov_length(vq->hdr, s), hdr_size);
>  > > -			break;
>  > > -		}
>  > > -		err = sock->ops->recvmsg(NULL, sock, &msg,
>  > > -					 len, MSG_DONTWAIT | MSG_TRUNC);
>  > > -		/* TODO: Check specific error and bomb out unless EAGAIN? */
>  > > -		if (err < 0) {
>  > > -			vhost_discard_vq_desc(vq, 1);
>  > > -			break;
>  > > -		}
>  > > -		/* TODO: Should check and handle checksum. */
>  > > -		if (err > len) {
>  > > -			pr_debug("Discarded truncated rx packet: "
>  > > -				 " len %d > %zd\n", err, len);
>  > > -			vhost_discard_vq_desc(vq, 1);
>  > > -			continue;
>  > > -		}
>  > > -		len = err;
>  > > -		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
>  > > -		if (err) {
>  > > -			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
>  > > -			       vq->iov->iov_base, err);
>  > > -			break;
>  > > -		}
>  > > -		len += hdr_size;
>  > > -		vhost_add_used_and_signal(&net->dev, vq, head, len);
>  > > -		if (unlikely(vq_log))
>  > > -			vhost_log_write(vq, vq_log, log, len);
>  > > -		total_len += len;
>  > > -		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  > > -			vhost_poll_queue(&vq->poll);
>  > > -			break;
>  > > -		}
>  > > -	}
>  > > -
>  > > -	mutex_unlock(&vq->mutex);
>  > > -}
>  > > -
>  > > -/* Expects to be always run from workqueue - which acts as
>  > > - * read-size critical section for our kind of RCU. */
>  > > -static void handle_rx_mergeable(struct vhost_net *net)
>  > > +static void handle_rx(struct vhost_net *net)
>  > >  {
>  > >  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
>  > >  	unsigned uninitialized_var(in), log;
>  > > @@ -431,7 +324,8 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  > >  		sock_len += sock_hlen;
>  > >  		vhost_len = sock_len + vhost_hlen;
>  > >  		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
>  > > -					&in, vq_log, &log);
>  > > +					&in, vq_log, &log,
>  > > +					likely(mergeable) ? UIO_MAXIOV : 1);
>  > >  		/* On error, stop handling until the next kick. */
>  > >  		if (unlikely(headcount < 0))
>  > >  			break;
>  > > @@ -497,14 +391,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  > >  	mutex_unlock(&vq->mutex);
>  > >  }
>  > >  
>  > > -static void handle_rx(struct vhost_net *net)
>  > > -{
>  > > -	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
>  > > -		handle_rx_mergeable(net);
>  > > -	else
>  > > -		handle_rx_big(net);
>  > > -}
>  > > -
>  > >  static void handle_tx_kick(struct vhost_work *work)
>  > >  {
>  > >  	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,


* Re: [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling
  2011-01-18  4:37       ` Michael S. Tsirkin
@ 2011-01-18  7:41         ` Jason Wang
  0 siblings, 0 replies; 20+ messages in thread
From: Jason Wang @ 2011-01-18  7:41 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel

Michael S. Tsirkin writes:
 > On Tue, Jan 18, 2011 at 11:05:33AM +0800, Jason Wang wrote:
 > > Michael S. Tsirkin writes:
 > >  > On Mon, Jan 17, 2011 at 04:11:08PM +0800, Jason Wang wrote:
 > >  > > Code duplication was found between the handling of mergeable and big
 > >  > > buffers, so this patch tries to unify them. This could be easily done
 > >  > > by adding a quota to get_rx_bufs(), which is used to limit the
 > >  > > number of buffers it returns (for mergeable buffers, the quota is
 > >  > > simply UIO_MAXIOV; for big buffers, the quota is just 1), and then the
 > >  > > previous handle_rx_mergeable() could be reused also for big buffers.
 > >  > > 
 > >  > > Signed-off-by: Jason Wang <jasowang@redhat.com>
 > >  > 
 > >  > We actually started this way. However the code turned out
 > >  > to have measurable overhead when handle_rx_mergeable
 > >  > is called with quota 1.
 > >  > So before applying this I'd like to see some data
 > >  > to show this is not the case anymore.
 > >  > 
 > > 
 > > I've run a round of tests (Host->Guest) for these three patches on my desktop:
 > 
 > Yes but what if you only apply patch 3?
 > 

Results are below: slightly better than without it, but worse than with all three
patches applied, except for big buffers at message size 2048 where the difference is negligible.

mergeable buffers:
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.161 (10.66.91.161) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       584.91   69.41    26.10    38.882  7.310  
 87380  16384    256    60.00      1194.05   72.81    34.82    19.980  4.778  
 87380  16384    512    60.00      1503.93   74.23    36.86    16.174  4.016  
 87380  16384   1024    60.00      2004.57   73.53    37.59    12.019  3.073  
 87380  16384   2048    60.00      3445.96   73.76    38.88    7.014   1.849  

big buffers:
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.129 (10.66.91.129) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       646.95   72.25    27.05    36.595  6.850  
 87380  16384    256    60.00      1258.61   74.88    33.01    19.495  4.297  
 87380  16384    512    60.00      1655.95   74.14    33.96    14.671  3.360  
 87380  16384   1024    60.00      3220.32   74.24    33.31    7.555   1.695  
 87380  16384   2048    60.00      4516.40   73.88    42.10    5.360   1.527  

 > > Without these patches
 > > 
 > > mergeable buffers:
 > > TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.42 (10.66.91.42) port 0 AF_INET
 > > Recv   Send    Send                          Utilization       Service Demand
 > > Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
 > > Size   Size    Size     Time     Throughput  local    remote   local   remote
 > > bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
 > > 
 > >  87380  16384     64    60.00       575.87   69.20    26.36    39.375  7.499  
 > >  87380  16384    256    60.01      1123.57   73.16    34.73    21.335  5.064  
 > >  87380  16384    512    60.00      1351.12   75.26    35.80    18.251  4.341  
 > >  87380  16384   1024    60.00      1955.31   74.73    37.67    12.523  3.156  
 > >  87380  16384   2048    60.01      3411.92   74.82    39.49    7.186   1.896  
 > > 
 > > big buffers:
 > > Netperf test results
 > > TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.109 (10.66.91.109) port 0 AF_INET
 > > Recv   Send    Send                          Utilization       Service Demand
 > > Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
 > > Size   Size    Size     Time     Throughput  local    remote   local   remote
 > > bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
 > > 
 > >  87380  16384     64    60.00       567.10   72.06    26.13    41.638  7.550  
 > >  87380  16384    256    60.00      1143.69   74.66    32.58    21.392  4.667  
 > >  87380  16384    512    60.00      1460.92   73.94    33.40    16.585  3.746  
 > >  87380  16384   1024    60.00      3454.85   77.49    33.89    7.349   1.607  
 > >  87380  16384   2048    60.00      3781.11   76.51    38.38    6.631   1.663  
 > > 
 > > With these patches:
 > > 
 > > mergeable buffers:
 > > TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.236 (10.66.91.236) port 0 AF_INET
 > > Recv   Send    Send                          Utilization       Service Demand
 > > Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
 > > Size   Size    Size     Time     Throughput  local    remote   local   remote
 > > bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
 > > 
 > >  87380  16384     64    60.00       657.53   71.27    26.42    35.517  6.583  
 > >  87380  16384    256    60.00      1217.73   74.34    34.67    20.004  4.665  
 > >  87380  16384    512    60.00      1575.25   75.27    37.12    15.658  3.861  
 > >  87380  16384   1024    60.00      2416.07   74.77    37.20    10.140  2.522  
 > >  87380  16384   2048    60.00      3702.29   77.31    36.29    6.842   1.606  
 > > 
 > > big buffers:
 > > Netperf test results
 > > TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.202 (10.66.91.202) port 0 AF_INET
 > > Recv   Send    Send                          Utilization       Service Demand
 > > Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
 > > Size   Size    Size     Time     Throughput  local    remote   local   remote
 > > bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
 > > 
 > >  87380  16384     64    60.00       647.67   71.86    27.26    36.356  6.895  
 > >  87380  16384    256    60.00      1265.82   76.19    36.54    19.724  4.729  
 > >  87380  16384    512    60.00      1796.64   76.06    39.48    13.872  3.601  
 > >  87380  16384   1024    60.00      4008.37   77.05    36.47    6.299   1.491  
 > >  87380  16384   2048    60.00      4468.56   75.18    41.79    5.513   1.532 
 > > 
 > > Looks like the unification does not hurt the performance, and with these patches
 > > we can get some improvement. BTW, the regression of mergeable buffers still
 > > exists.
 > > 
 > >  > > ---
 > >  > >  drivers/vhost/net.c |  128 +++------------------------------------------------
 > >  > >  1 files changed, 7 insertions(+), 121 deletions(-)
 > >  > > 
 > >  > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 > >  > > index 95e49de..c32a2e4 100644
 > >  > > --- a/drivers/vhost/net.c
 > >  > > +++ b/drivers/vhost/net.c
 > >  > > @@ -227,6 +227,7 @@ static int peek_head_len(struct sock *sk)
 > >  > >   * @iovcount	- returned count of io vectors we fill
 > >  > >   * @log		- vhost log
 > >  > >   * @log_num	- log offset
 > >  > > + * @quota       - headcount quota, 1 for big buffer
 > >  > >   *	returns number of buffer heads allocated, negative on error
 > >  > >   */
 > >  > >  static int get_rx_bufs(struct vhost_virtqueue *vq,
 > >  > > @@ -234,7 +235,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 > >  > >  		       int datalen,
 > >  > >  		       unsigned *iovcount,
 > >  > >  		       struct vhost_log *log,
 > >  > > -		       unsigned *log_num)
 > >  > > +		       unsigned *log_num,
 > >  > > +		       unsigned int quota)
 > >  > >  {
 > >  > >  	unsigned int out, in;
 > >  > >  	int seg = 0;
 > >  > > @@ -242,7 +244,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 > >  > >  	unsigned d;
 > >  > >  	int r, nlogs = 0;
 > >  > >  
 > >  > > -	while (datalen > 0) {
 > >  > > +	while (datalen > 0 && headcount < quota) {
 > >  > >  		if (unlikely(seg >= UIO_MAXIOV)) {
 > >  > >  			r = -ENOBUFS;
 > >  > >  			goto err;
 > >  > > @@ -282,116 +284,7 @@ err:
 > >  > >  
 > >  > >  /* Expects to be always run from workqueue - which acts as
 > >  > >   * read-size critical section for our kind of RCU. */
 > >  > > -static void handle_rx_big(struct vhost_net *net)
 > >  > > -{
 > >  > > -	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 > >  > > -	unsigned out, in, log, s;
 > >  > > -	int head;
 > >  > > -	struct vhost_log *vq_log;
 > >  > > -	struct msghdr msg = {
 > >  > > -		.msg_name = NULL,
 > >  > > -		.msg_namelen = 0,
 > >  > > -		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
 > >  > > -		.msg_controllen = 0,
 > >  > > -		.msg_iov = vq->iov,
 > >  > > -		.msg_flags = MSG_DONTWAIT,
 > >  > > -	};
 > >  > > -
 > >  > > -	struct virtio_net_hdr hdr = {
 > >  > > -		.flags = 0,
 > >  > > -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
 > >  > > -	};
 > >  > > -
 > >  > > -	size_t len, total_len = 0;
 > >  > > -	int err;
 > >  > > -	size_t hdr_size;
 > >  > > -	struct socket *sock = rcu_dereference(vq->private_data);
 > >  > > -	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
 > >  > > -		return;
 > >  > > -
 > >  > > -	mutex_lock(&vq->mutex);
 > >  > > -	vhost_disable_notify(vq);
 > >  > > -	hdr_size = vq->vhost_hlen;
 > >  > > -
 > >  > > -	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 > >  > > -		vq->log : NULL;
 > >  > > -
 > >  > > -	for (;;) {
 > >  > > -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 > >  > > -					 ARRAY_SIZE(vq->iov),
 > >  > > -					 &out, &in,
 > >  > > -					 vq_log, &log);
 > >  > > -		/* On error, stop handling until the next kick. */
 > >  > > -		if (unlikely(head < 0))
 > >  > > -			break;
 > >  > > -		/* OK, now we need to know about added descriptors. */
 > >  > > -		if (head == vq->num) {
 > >  > > -			if (unlikely(vhost_enable_notify(vq))) {
 > >  > > -				/* They have slipped one in as we were
 > >  > > -				 * doing that: check again. */
 > >  > > -				vhost_disable_notify(vq);
 > >  > > -				continue;
 > >  > > -			}
 > >  > > -			/* Nothing new?  Wait for eventfd to tell us
 > >  > > -			 * they refilled. */
 > >  > > -			break;
 > >  > > -		}
 > >  > > -		/* We don't need to be notified again. */
 > >  > > -		if (out) {
 > >  > > -			vq_err(vq, "Unexpected descriptor format for RX: "
 > >  > > -			       "out %d, int %d\n",
 > >  > > -			       out, in);
 > >  > > -			break;
 > >  > > -		}
 > >  > > -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
 > >  > > -		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
 > >  > > -		msg.msg_iovlen = in;
 > >  > > -		len = iov_length(vq->iov, in);
 > >  > > -		/* Sanity check */
 > >  > > -		if (!len) {
 > >  > > -			vq_err(vq, "Unexpected header len for RX: "
 > >  > > -			       "%zd expected %zd\n",
 > >  > > -			       iov_length(vq->hdr, s), hdr_size);
 > >  > > -			break;
 > >  > > -		}
 > >  > > -		err = sock->ops->recvmsg(NULL, sock, &msg,
 > >  > > -					 len, MSG_DONTWAIT | MSG_TRUNC);
 > >  > > -		/* TODO: Check specific error and bomb out unless EAGAIN? */
 > >  > > -		if (err < 0) {
 > >  > > -			vhost_discard_vq_desc(vq, 1);
 > >  > > -			break;
 > >  > > -		}
 > >  > > -		/* TODO: Should check and handle checksum. */
 > >  > > -		if (err > len) {
 > >  > > -			pr_debug("Discarded truncated rx packet: "
 > >  > > -				 " len %d > %zd\n", err, len);
 > >  > > -			vhost_discard_vq_desc(vq, 1);
 > >  > > -			continue;
 > >  > > -		}
 > >  > > -		len = err;
 > >  > > -		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
 > >  > > -		if (err) {
 > >  > > -			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
 > >  > > -			       vq->iov->iov_base, err);
 > >  > > -			break;
 > >  > > -		}
 > >  > > -		len += hdr_size;
 > >  > > -		vhost_add_used_and_signal(&net->dev, vq, head, len);
 > >  > > -		if (unlikely(vq_log))
 > >  > > -			vhost_log_write(vq, vq_log, log, len);
 > >  > > -		total_len += len;
 > >  > > -		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 > >  > > -			vhost_poll_queue(&vq->poll);
 > >  > > -			break;
 > >  > > -		}
 > >  > > -	}
 > >  > > -
 > >  > > -	mutex_unlock(&vq->mutex);
 > >  > > -}
 > >  > > -
 > >  > > -/* Expects to be always run from workqueue - which acts as
 > >  > > - * read-size critical section for our kind of RCU. */
 > >  > > -static void handle_rx_mergeable(struct vhost_net *net)
 > >  > > +static void handle_rx(struct vhost_net *net)
 > >  > >  {
 > >  > >  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 > >  > >  	unsigned uninitialized_var(in), log;
 > >  > > @@ -431,7 +324,8 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  > >  		sock_len += sock_hlen;
 > >  > >  		vhost_len = sock_len + vhost_hlen;
 > >  > >  		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
 > >  > > -					&in, vq_log, &log);
 > >  > > +					&in, vq_log, &log,
 > >  > > +					likely(mergeable) ? UIO_MAXIOV : 1);
 > >  > >  		/* On error, stop handling until the next kick. */
 > >  > >  		if (unlikely(headcount < 0))
 > >  > >  			break;
 > >  > > @@ -497,14 +391,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  > >  	mutex_unlock(&vq->mutex);
 > >  > >  }
 > >  > >  
 > >  > > -static void handle_rx(struct vhost_net *net)
 > >  > > -{
 > >  > > -	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
 > >  > > -		handle_rx_mergeable(net);
 > >  > > -	else
 > >  > > -		handle_rx_big(net);
 > >  > > -}
 > >  > > -
 > >  > >  static void handle_tx_kick(struct vhost_work *work)
 > >  > >  {
 > >  > >  	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop
  2011-01-18  4:36     ` Michael S. Tsirkin
@ 2011-01-18  9:15       ` Jason Wang
  0 siblings, 0 replies; 20+ messages in thread
From: Jason Wang @ 2011-01-18  9:15 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel

Michael S. Tsirkin writes:
 > On Tue, Jan 18, 2011 at 12:26:17PM +0800, Jason Wang wrote:
 > > Michael S. Tsirkin writes:
 > >  > On Mon, Jan 17, 2011 at 04:10:59PM +0800, Jason Wang wrote:
 > >  > > No need to check the support of mergeable buffers inside the receive
 > >  > > loop as the whole handle_rx_xx() is in the read critical region.  So
 > >  > > this patch moves it ahead of the receiving loop.
 > >  > > 
 > >  > > Signed-off-by: Jason Wang <jasowang@redhat.com>
 > >  > 
 > >  > Well feature check is mostly just features & bit
 > >  > so why would it be slower? Because of the rcu
 > >  > adding memory barriers? Do you observe a
 > >  > measureable speedup with this patch?
 > >  > 
 > > 
 > > I did not measure the performance for just this patch; the difference may not be
 > > obvious. And it also helps with the code unification.
 > > 
 > >  > Apropos, I noticed that the check in vhost_has_feature
 > >  > is wrong: it uses the same kind of RCU as the
 > >  > private pointer. So we'll have to fix that properly
 > >  > by adding more lockdep classes, but for now
 > >  > we'll need to make
 > >  > the check 1 || lockdep_is_held(&dev->mutex);
 > >  > and add a TODO.
 > >  > 
 > > 
 > > Not sure, lockdep_is_held(&dev->mutex) may not be accurate but should be sufficient,
 > > as it is always held in the read critical region.
 > 
 > Not really, when we call vhost_has_feature from the vq handling thread
 > it's not.
 > 

Seems not, see vhost_net_ioctl().
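
For reference, the interim check described above would look roughly like the
sketch below. This is only a sketch: the field and helper names are assumptions
based on this discussion and on drivers/vhost/vhost.h of that era, not a
verbatim copy of the driver.

static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
{
	unsigned acked_features;

	/* TODO: add separate lockdep classes so both the vq handler's RCU
	 * read side and callers holding dev->mutex are covered properly;
	 * for now accept any context via the "1 ||". */
	acked_features = rcu_dereference_index_check(dev->acked_features,
				1 || lockdep_is_held(&dev->mutex));
	return acked_features & (1 << bit);
}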

 > >  > > ---
 > >  > >  drivers/vhost/net.c |    5 +++--
 > >  > >  1 files changed, 3 insertions(+), 2 deletions(-)
 > >  > > 
 > >  > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 > >  > > index 14fc189..95e49de 100644
 > >  > > --- a/drivers/vhost/net.c
 > >  > > +++ b/drivers/vhost/net.c
 > >  > > @@ -411,7 +411,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  > >  	};
 > >  > >  
 > >  > >  	size_t total_len = 0;
 > >  > > -	int err, headcount;
 > >  > > +	int err, headcount, mergeable;
 > >  > >  	size_t vhost_hlen, sock_hlen;
 > >  > >  	size_t vhost_len, sock_len;
 > >  > >  	struct socket *sock = rcu_dereference(vq->private_data);
 > >  > > @@ -425,6 +425,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  > >  
 > >  > >  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 > >  > >  		vq->log : NULL;
 > >  > > +	mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);
 > >  > >  
 > >  > >  	while ((sock_len = peek_head_len(sock->sk))) {
 > >  > >  		sock_len += sock_hlen;
 > >  > > @@ -474,7 +475,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  > >  			break;
 > >  > >  		}
 > >  > >  		/* TODO: Should check and handle checksum. */
 > >  > > -		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
 > >  > > +		if (likely(mergeable) &&
 > >  > >  		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
 > >  > >  				      offsetof(typeof(hdr), num_buffers),
 > >  > >  				      sizeof hdr.num_buffers)) {

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len()
  2011-01-17  8:11 ` [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len() Jason Wang
  2011-01-17  9:33   ` Eric Dumazet
  2011-01-17  9:57   ` Michael S. Tsirkin
@ 2011-03-13 15:06   ` Michael S. Tsirkin
  2011-03-13 15:52     ` Eric Dumazet
  2 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-03-13 15:06 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, kvm, linux-kernel

On Mon, Jan 17, 2011 at 04:11:17PM +0800, Jason Wang wrote:
> We can use lock_sock_fast() instead of lock_sock() in order to get
> speedup in peek_head_len().
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/net.c |    4 ++--
>  1 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c32a2e4..50b622a 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -211,12 +211,12 @@ static int peek_head_len(struct sock *sk)
>  {
>  	struct sk_buff *head;
>  	int len = 0;
> +	bool slow = lock_sock_fast(sk);
>  
> -	lock_sock(sk);
>  	head = skb_peek(&sk->sk_receive_queue);
>  	if (head)
>  		len = head->len;
> -	release_sock(sk);
> +	unlock_sock_fast(sk, slow);
>  	return len;
>  }
>  

Wanted to apply this, but looking at the code I think the lock_sock here
is wrong. What we really need is to handle the case where the skb is
pulled from the receive queue after skb_peek.  However this is not the
right lock to use for that, sk_receive_queue.lock is.
So I expect the following is the right way to handle this.
Comments?

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 0329c41..5720301 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -213,12 +213,13 @@ static int peek_head_len(struct sock *sk)
 {
 	struct sk_buff *head;
 	int len = 0;
+	unsigned long flags;
 
-	lock_sock(sk);
+	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 	head = skb_peek(&sk->sk_receive_queue);
-	if (head)
+	if (likely(head))
 		len = head->len;
-	release_sock(sk);
+	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
 	return len;
 }
 

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len()
  2011-03-13 15:06   ` Michael S. Tsirkin
@ 2011-03-13 15:52     ` Eric Dumazet
  2011-03-13 16:19       ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Eric Dumazet @ 2011-03-13 15:52 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel

Le dimanche 13 mars 2011 à 17:06 +0200, Michael S. Tsirkin a écrit :
> On Mon, Jan 17, 2011 at 04:11:17PM +0800, Jason Wang wrote:
> > We can use lock_sock_fast() instead of lock_sock() in order to get
> > speedup in peek_head_len().
> > 
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > ---
> >  drivers/vhost/net.c |    4 ++--
> >  1 files changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index c32a2e4..50b622a 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -211,12 +211,12 @@ static int peek_head_len(struct sock *sk)
> >  {
> >  	struct sk_buff *head;
> >  	int len = 0;
> > +	bool slow = lock_sock_fast(sk);
> >  
> > -	lock_sock(sk);
> >  	head = skb_peek(&sk->sk_receive_queue);
> >  	if (head)
> >  		len = head->len;
> > -	release_sock(sk);
> > +	unlock_sock_fast(sk, slow);
> >  	return len;
> >  }
> >  
> 
> Wanted to apply this, but looking at the code I think the lock_sock here
> is wrong. What we really need is to handle the case where the skb is
> pulled from the receive queue after skb_peek.  However this is not the
> right lock to use for that, sk_receive_queue.lock is.
> So I expect the following is the right way to handle this.
> Comments?
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 0329c41..5720301 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -213,12 +213,13 @@ static int peek_head_len(struct sock *sk)
>  {
>  	struct sk_buff *head;
>  	int len = 0;
> +	unsigned long flags;
>  
> -	lock_sock(sk);
> +	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
>  	head = skb_peek(&sk->sk_receive_queue);
> -	if (head)
> +	if (likely(head))
>  		len = head->len;
> -	release_sock(sk);
> +	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
>  	return len;
>  }
>  

You may be right, only way to be sure is to check the other side.

If it uses skb_queue_tail(), then yes, your patch is fine.

If other side did not lock socket, then your patch is a bug fix.
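
For context, skb_queue_tail() serializes on the queue's own spinlock, roughly
as in the simplified sketch below (based on net/core/skbuff.c, shown only to
illustrate why sk_receive_queue.lock is the lock both sides can share):

void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
{
	unsigned long flags;

	/* The producer serializes on list->lock, i.e. on
	 * sk_receive_queue.lock for a socket receive queue, so a consumer
	 * peeking under the same spinlock sees a stable queue head. */
	spin_lock_irqsave(&list->lock, flags);
	__skb_queue_tail(list, newsk);
	spin_unlock_irqrestore(&list->lock, flags);
}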




^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len()
  2011-03-13 15:52     ` Eric Dumazet
@ 2011-03-13 16:19       ` Michael S. Tsirkin
  2011-03-13 16:32         ` Eric Dumazet
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-03-13 16:19 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel

On Sun, Mar 13, 2011 at 04:52:50PM +0100, Eric Dumazet wrote:
> Le dimanche 13 mars 2011 à 17:06 +0200, Michael S. Tsirkin a écrit :
> > On Mon, Jan 17, 2011 at 04:11:17PM +0800, Jason Wang wrote:
> > > We can use lock_sock_fast() instead of lock_sock() in order to get
> > > speedup in peek_head_len().
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > >  drivers/vhost/net.c |    4 ++--
> > >  1 files changed, 2 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index c32a2e4..50b622a 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -211,12 +211,12 @@ static int peek_head_len(struct sock *sk)
> > >  {
> > >  	struct sk_buff *head;
> > >  	int len = 0;
> > > +	bool slow = lock_sock_fast(sk);
> > >  
> > > -	lock_sock(sk);
> > >  	head = skb_peek(&sk->sk_receive_queue);
> > >  	if (head)
> > >  		len = head->len;
> > > -	release_sock(sk);
> > > +	unlock_sock_fast(sk, slow);
> > >  	return len;
> > >  }
> > >  
> > 
> > Wanted to apply this, but looking at the code I think the lock_sock here
> > is wrong. What we really need is to handle the case where the skb is
> > pulled from the receive queue after skb_peek.  However this is not the
> > right lock to use for that, sk_receive_queue.lock is.
> > So I expect the following is the right way to handle this.
> > Comments?
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 0329c41..5720301 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -213,12 +213,13 @@ static int peek_head_len(struct sock *sk)
> >  {
> >  	struct sk_buff *head;
> >  	int len = 0;
> > +	unsigned long flags;
> >  
> > -	lock_sock(sk);
> > +	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
> >  	head = skb_peek(&sk->sk_receive_queue);
> > -	if (head)
> > +	if (likely(head))
> >  		len = head->len;
> > -	release_sock(sk);
> > +	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
> >  	return len;
> >  }
> >  
> 
> You may be right, only way to be sure is to check the other side.
> 
> If it uses skb_queue_tail(), then yes, your patch is fine.
> 
> If other side did not lock socket, then your patch is a bug fix.
> 
> 

Other side is in drivers/net/tun.c and net/packet/af_packet.c
At least wrt tun it seems clear socket is not locked.
Besides queue, dequeue seems to be done without socket locked.
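
A rough, abridged sketch of that transmit path is below; the names follow
drivers/net/tun.c of this era but should be treated as approximate. The point
is that nothing on this path takes lock_sock(), only the queue spinlock taken
inside skb_queue_tail().

static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);

	/* ... drop/filter checks omitted ... */

	/* Enqueue the packet: no lock_sock() here, only the
	 * sk_receive_queue.lock taken inside skb_queue_tail(). */
	skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb);

	/* Notify and wake up the reader process. */
	if (tun->flags & TUN_FASYNC)
		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
	wake_up_interruptible_poll(&tun->wq.wait,
				   POLLIN | POLLRDNORM | POLLRDBAND);
	return NETDEV_TX_OK;
}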

-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len()
  2011-03-13 16:19       ` Michael S. Tsirkin
@ 2011-03-13 16:32         ` Eric Dumazet
  2011-03-13 16:43           ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Eric Dumazet @ 2011-03-13 16:32 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel

Le dimanche 13 mars 2011 à 18:19 +0200, Michael S. Tsirkin a écrit :

> Other side is in drivers/net/tun.c and net/packet/af_packet.c
> At least wrt tun it seems clear socket is not locked.

Yes (assuming you refer to tun_net_xmit())

> Besides queue, dequeue seems to be done without socket locked.
> 

It seems this code (assuming you speak of drivers/vhost/net.c ?) has
some races indeed.




^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len()
  2011-03-13 16:32         ` Eric Dumazet
@ 2011-03-13 16:43           ` Michael S. Tsirkin
  2011-03-13 17:41             ` Eric Dumazet
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-03-13 16:43 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel

On Sun, Mar 13, 2011 at 05:32:07PM +0100, Eric Dumazet wrote:
> Le dimanche 13 mars 2011 à 18:19 +0200, Michael S. Tsirkin a écrit :
> 
> > Other side is in drivers/net/tun.c and net/packet/af_packet.c
> > At least wrt tun it seems clear socket is not locked.
> 
> Yes (assuming you refer to tun_net_xmit())
> 
> > Besides queue, dequeue seems to be done without socket locked.
> > 
> 
> It seems this code (assuming you speak of drivers/vhost/net.c ?) has
> some races indeed.
> 

Hmm. Any more besides the one fixed here?

-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len()
  2011-03-13 16:43           ` Michael S. Tsirkin
@ 2011-03-13 17:41             ` Eric Dumazet
  2011-03-13 21:11               ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Eric Dumazet @ 2011-03-13 17:41 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel

Le dimanche 13 mars 2011 à 18:43 +0200, Michael S. Tsirkin a écrit :
> On Sun, Mar 13, 2011 at 05:32:07PM +0100, Eric Dumazet wrote:
> > Le dimanche 13 mars 2011 à 18:19 +0200, Michael S. Tsirkin a écrit :
> > 
> > > Other side is in drivers/net/tun.c and net/packet/af_packet.c
> > > At least wrt tun it seems clear socket is not locked.
> > 
> > Yes (assuming you refer to tun_net_xmit())
> > 
> > > Besides queue, dequeue seems to be done without socket locked.
> > > 
> > 
> > It seems this code (assuming you speak of drivers/vhost/net.c ?) has
> > some races indeed.
> > 
> 
> Hmm. Any more besides the one fixed here?
> 

 > If writers and readers don't share a common lock, how can they reliably
 > synchronize state?

For example, the check at line 420 seems unsafe or useless.

skb_queue_empty(&sock->sk->sk_receive_queue)




^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len()
  2011-03-13 17:41             ` Eric Dumazet
@ 2011-03-13 21:11               ` Michael S. Tsirkin
  0 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-03-13 21:11 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel

On Sun, Mar 13, 2011 at 06:41:32PM +0100, Eric Dumazet wrote:
> Le dimanche 13 mars 2011 à 18:43 +0200, Michael S. Tsirkin a écrit :
> > On Sun, Mar 13, 2011 at 05:32:07PM +0100, Eric Dumazet wrote:
> > > Le dimanche 13 mars 2011 à 18:19 +0200, Michael S. Tsirkin a écrit :
> > > 
> > > > Other side is in drivers/net/tun.c and net/packet/af_packet.c
> > > > At least wrt tun it seems clear socket is not locked.
> > > 
> > > Yes (assuming you refer to tun_net_xmit())
> > > 
> > > > Besides queue, dequeue seems to be done without socket locked.
> > > > 
> > > 
> > > It seems this code (assuming you speak of drivers/vhost/net.c ?) has
> > > some races indeed.
> > > 
> > 
> > Hmm. Any more besides the one fixed here?
> > 
> 
> If writers and readers dont share a common lock, how can they reliably
> synchronize states ?

They are all supposed to use sk_receive_queue.lock I think.

> For example, the check at line 420 seems unsafe or useless.
> 
> skb_queue_empty(&sock->sk->sk_receive_queue)
> 

It's mostly useless: the code that is called after this
does skb_peek and checks the result under the spinlock.
This was supposed to be an optimization: quickly check
that the queue is not empty before we bother disabling notifications
etc., but I don't remember at this point whether it actually gives any gain.
Thanks for pointing this out, I'll take it out I think (below).
Note: there are two places of this call in upstream: handle_rx_big and
handle_rx_mergeable, but they are merged into a single
handle_rx by a patch by Jason Wang.
The below patch is on top.
If you'd like to look at the latest code,
it's at master.kernel.org:/home/mst/pub/vhost.git;
the vhost-net-next branch has it all.

Eric, thanks very much for pointing these out.
Is there anything else that you see in this driver?


Thanks!


    vhost-net: remove unlocked use of receive_queue
    
    Use of skb_queue_empty(&sock->sk->sk_receive_queue)
    without taking the sk_receive_queue.lock is unsafe
    or useless. Take it out.
    
    Reported-by:  Eric Dumazet <eric.dumazet@gmail.com>
    Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 5720301..2f7c76a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -311,7 +311,7 @@ static void handle_rx(struct vhost_net *net)
 	/* TODO: check that we are running from vhost_worker? */
 	struct socket *sock = rcu_dereference_check(vq->private_data, 1);
 
-	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+	if (!sock)
 		return;
 
 	mutex_lock(&vq->mutex);

^ permalink raw reply related	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2011-03-13 21:11 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-17  8:10 [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop Jason Wang
2011-01-17  8:11 ` [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling Jason Wang
2011-01-17  8:36   ` Michael S. Tsirkin
2011-01-18  3:05     ` Jason Wang
2011-01-18  4:37       ` Michael S. Tsirkin
2011-01-18  7:41         ` Jason Wang
2011-01-17  8:11 ` [PATCH 3/3] vhost-net: use lock_sock_fast() in peek_head_len() Jason Wang
2011-01-17  9:33   ` Eric Dumazet
2011-01-17  9:57   ` Michael S. Tsirkin
2011-03-13 15:06   ` Michael S. Tsirkin
2011-03-13 15:52     ` Eric Dumazet
2011-03-13 16:19       ` Michael S. Tsirkin
2011-03-13 16:32         ` Eric Dumazet
2011-03-13 16:43           ` Michael S. Tsirkin
2011-03-13 17:41             ` Eric Dumazet
2011-03-13 21:11               ` Michael S. Tsirkin
2011-01-17  8:46 ` [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop Michael S. Tsirkin
2011-01-18  4:26   ` Jason Wang
2011-01-18  4:36     ` Michael S. Tsirkin
2011-01-18  9:15       ` Jason Wang
