linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] tuntap: rx batching
@ 2016-11-09  7:38 Jason Wang
  2016-11-09  7:38 ` [PATCH 2/3] vhost: better detection of available buffers Jason Wang
                   ` (2 more replies)
  0 siblings, 3 replies; 25+ messages in thread
From: Jason Wang @ 2016-11-09  7:38 UTC (permalink / raw)
  To: mst, netdev, linux-kernel; +Cc: Jason Wang

Backlog were used for tuntap rx, but it can only process 1 packet at
one time since it was scheduled during sendmsg() synchronously in
process context. This lead bad cache utilization so this patch tries
to do some batching before call rx NAPI. This is done through:

- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
  batch the packet temporarily in a linked list and submit them all
  once MSG_MORE were cleared.
- implement a tuntap specific NAPI handler for processing this kind of
  possible batching. (This could be done by extending backlog to
  support skb like, but using a tun specific one looks cleaner and
  easier for future extension).

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 65 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1588469..d40583b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -74,6 +74,7 @@
 #include <linux/skb_array.h>
 
 #include <asm/uaccess.h>
+#include <linux/interrupt.h>
 
 /* Uncomment to enable debugging */
 /* #define TUN_DEBUG 1 */
@@ -169,6 +170,8 @@ struct tun_file {
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
+	struct napi_struct napi;
+	struct sk_buff_head process_queue;
 };
 
 struct tun_flow_entry {
@@ -522,6 +525,8 @@ static void tun_queue_purge(struct tun_file *tfile)
 	while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
 		kfree_skb(skb);
 
+	skb_queue_purge(&tfile->sk.sk_write_queue);
+	skb_queue_purge(&tfile->process_queue);
 	skb_queue_purge(&tfile->sk.sk_error_queue);
 }
 
@@ -532,6 +537,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 
 	tun = rtnl_dereference(tfile->tun);
 
+	if (tun && clean) {
+		napi_disable(&tfile->napi);
+		netif_napi_del(&tfile->napi);
+	}
+
 	if (tun && !tfile->detached) {
 		u16 index = tfile->queue_index;
 		BUG_ON(index >= tun->numqueues);
@@ -587,6 +597,7 @@ static void tun_detach_all(struct net_device *dev)
 
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
+		napi_disable(&tfile->napi);
 		BUG_ON(!tfile);
 		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
 		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
@@ -603,6 +614,7 @@ static void tun_detach_all(struct net_device *dev)
 	synchronize_net();
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
+		netif_napi_del(&tfile->napi);
 		/* Drop read queue */
 		tun_queue_purge(tfile);
 		sock_put(&tfile->sk);
@@ -618,6 +630,41 @@ static void tun_detach_all(struct net_device *dev)
 		module_put(THIS_MODULE);
 }
 
+static int tun_poll(struct napi_struct *napi, int budget)
+{
+	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
+	struct sk_buff_head *input_queue =
+	       &tfile->socket.sk->sk_write_queue;
+	struct sk_buff *skb;
+	unsigned int received = 0;
+
+	while (1) {
+		while ((skb = __skb_dequeue(&tfile->process_queue))) {
+			netif_receive_skb(skb);
+			if (++received >= budget)
+				return received;
+		}
+
+		spin_lock(&input_queue->lock);
+		if (skb_queue_empty(input_queue)) {
+			spin_unlock(&input_queue->lock);
+			break;
+		}
+		skb_queue_splice_tail_init(input_queue, &tfile->process_queue);
+		spin_unlock(&input_queue->lock);
+	}
+
+	if (received < budget) {
+		napi_complete(napi);
+		if (skb_peek(&tfile->socket.sk->sk_write_queue) &&
+		    unlikely(napi_schedule_prep(napi))) {
+			__napi_schedule(napi);
+		}
+	}
+
+	return received;
+}
+
 static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
 {
 	struct tun_file *tfile = file->private_data;
@@ -666,9 +713,11 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
 
 	if (tfile->detached)
 		tun_enable_queue(tfile);
-	else
+	else {
 		sock_hold(&tfile->sk);
-
+		netif_napi_add(tun->dev, &tfile->napi, tun_poll, 64);
+		napi_enable(&tfile->napi);
+	}
 	tun_set_real_num_queues(tun);
 
 	/* device is allowed to go away first, so no need to hold extra
@@ -1150,7 +1199,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			    void *msg_control, struct iov_iter *from,
-			    int noblock)
+			    int noblock, bool more)
 {
 	struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
 	struct sk_buff *skb;
@@ -1296,7 +1345,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	skb_probe_transport_header(skb, 0);
 
 	rxhash = skb_get_hash(skb);
-	netif_rx_ni(skb);
+	skb_queue_tail(&tfile->socket.sk->sk_write_queue, skb);
+
+	if (!more) {
+		local_bh_disable();
+		napi_schedule(&tfile->napi);
+		local_bh_enable();
+	}
 
 	stats = get_cpu_ptr(tun->pcpu_stats);
 	u64_stats_update_begin(&stats->syncp);
@@ -1319,7 +1374,8 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (!tun)
 		return -EBADFD;
 
-	result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK);
+	result = tun_get_user(tun, tfile, NULL, from,
+			      file->f_flags & O_NONBLOCK, false);
 
 	tun_put(tun);
 	return result;
@@ -1579,7 +1635,8 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 		return -EBADFD;
 
 	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
-			   m->msg_flags & MSG_DONTWAIT);
+			   m->msg_flags & MSG_DONTWAIT,
+			   m->msg_flags & MSG_MORE);
 	tun_put(tun);
 	return ret;
 }
@@ -2336,6 +2393,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	file->private_data = tfile;
 	INIT_LIST_HEAD(&tfile->next);
 
+	skb_queue_head_init(&tfile->process_queue);
+
 	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
 
 	return 0;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 2/3] vhost: better detection of available buffers
  2016-11-09  7:38 [PATCH 1/3] tuntap: rx batching Jason Wang
@ 2016-11-09  7:38 ` Jason Wang
  2016-11-09 19:57   ` Michael S. Tsirkin
  2016-11-09  7:38 ` [PATCH 3/3] vhost_net: tx support batching Jason Wang
  2016-11-09 16:38 ` [PATCH 1/3] tuntap: rx batching Michael S. Tsirkin
  2 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2016-11-09  7:38 UTC (permalink / raw)
  To: mst, netdev, linux-kernel; +Cc: Jason Wang

We should use vq->last_avail_idx instead of vq->avail_idx in the
checking of vhost_vq_avail_empty() since latter is the cached avail
index from guest but we want to know if there's pending available
buffers in the virtqueue.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c6f2d89..fdf4cdf 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	if (r)
 		return false;
 
-	return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
+	return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
 }
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 3/3] vhost_net: tx support batching
  2016-11-09  7:38 [PATCH 1/3] tuntap: rx batching Jason Wang
  2016-11-09  7:38 ` [PATCH 2/3] vhost: better detection of available buffers Jason Wang
@ 2016-11-09  7:38 ` Jason Wang
  2016-11-09 20:05   ` Michael S. Tsirkin
  2016-11-09 16:38 ` [PATCH 1/3] tuntap: rx batching Michael S. Tsirkin
  2 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2016-11-09  7:38 UTC (permalink / raw)
  To: mst, netdev, linux-kernel; +Cc: Jason Wang

This patch tries to utilize tuntap rx batching by peeking the tx
virtqueue during transmission, if there's more available buffers in
the virtqueue, set MSG_MORE flag for a hint for tuntap to batch the
packets. The maximum number of batched tx packets were specified
through a module parameter: tx_bached.

When use 16 as tx_batched:

Pktgen test shows 16% on tx pps in guest.
Netperf test does not show obvious regression.

For safety, 1 were used as the default value for tx_batched.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 15 ++++++++++++++-
 drivers/vhost/vhost.c |  1 +
 drivers/vhost/vhost.h |  1 +
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 5dc128a..51c378e 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -35,6 +35,10 @@ module_param(experimental_zcopytx, int, 0444);
 MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
 		                       " 1 -Enable; 0 - Disable");
 
+static int tx_batched = 1;
+module_param(tx_batched, int, 0444);
+MODULE_PARM_DESC(tx_batched, "Number of patches batched in TX");
+
 /* Max number of bytes transferred before requeueing the job.
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
@@ -454,6 +458,16 @@ static void handle_tx(struct vhost_net *net)
 			msg.msg_control = NULL;
 			ubufs = NULL;
 		}
+		total_len += len;
+		if (vq->delayed < tx_batched &&
+		    total_len < VHOST_NET_WEIGHT &&
+		    !vhost_vq_avail_empty(&net->dev, vq)) {
+			vq->delayed++;
+			msg.msg_flags |= MSG_MORE;
+		} else {
+			vq->delayed = 0;
+			msg.msg_flags &= ~MSG_MORE;
+		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(sock, &msg, len);
 		if (unlikely(err < 0)) {
@@ -472,7 +486,6 @@ static void handle_tx(struct vhost_net *net)
 			vhost_add_used_and_signal(&net->dev, vq, head, 0);
 		else
 			vhost_zerocopy_signal_used(net, vq);
-		total_len += len;
 		vhost_net_tx_packet(net);
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index fdf4cdf..bc362c7 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -311,6 +311,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->busyloop_timeout = 0;
 	vq->umem = NULL;
 	vq->iotlb = NULL;
+	vq->delayed = 0;
 }
 
 static int vhost_worker(void *data)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 78f3c5f..9f81a94 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -141,6 +141,7 @@ struct vhost_virtqueue {
 	bool user_be;
 #endif
 	u32 busyloop_timeout;
+	int delayed;
 };
 
 struct vhost_msg_node {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] tuntap: rx batching
  2016-11-09  7:38 [PATCH 1/3] tuntap: rx batching Jason Wang
  2016-11-09  7:38 ` [PATCH 2/3] vhost: better detection of available buffers Jason Wang
  2016-11-09  7:38 ` [PATCH 3/3] vhost_net: tx support batching Jason Wang
@ 2016-11-09 16:38 ` Michael S. Tsirkin
  2016-11-11  2:07   ` Jason Wang
  2 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2016-11-09 16:38 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel

On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> Backlog were used for tuntap rx, but it can only process 1 packet at
> one time since it was scheduled during sendmsg() synchronously in
> process context. This lead bad cache utilization so this patch tries
> to do some batching before call rx NAPI. This is done through:
> 
> - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>   batch the packet temporarily in a linked list and submit them all
>   once MSG_MORE were cleared.
> - implement a tuntap specific NAPI handler for processing this kind of
>   possible batching. (This could be done by extending backlog to
>   support skb like, but using a tun specific one looks cleaner and
>   easier for future extension).
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

So why do we need an extra queue? This is not what hardware devices do.
How about adding the packet to queue unconditionally, deferring
signalling until we get sendmsg without MSG_MORE?


> ---
>  drivers/net/tun.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 65 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 1588469..d40583b 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -74,6 +74,7 @@
>  #include <linux/skb_array.h>
>  
>  #include <asm/uaccess.h>
> +#include <linux/interrupt.h>
>  
>  /* Uncomment to enable debugging */
>  /* #define TUN_DEBUG 1 */
> @@ -169,6 +170,8 @@ struct tun_file {
>  	struct list_head next;
>  	struct tun_struct *detached;
>  	struct skb_array tx_array;
> +	struct napi_struct napi;
> +	struct sk_buff_head process_queue;
>  };
>  
>  struct tun_flow_entry {
> @@ -522,6 +525,8 @@ static void tun_queue_purge(struct tun_file *tfile)
>  	while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
>  		kfree_skb(skb);
>  
> +	skb_queue_purge(&tfile->sk.sk_write_queue);
> +	skb_queue_purge(&tfile->process_queue);
>  	skb_queue_purge(&tfile->sk.sk_error_queue);
>  }
>  
> @@ -532,6 +537,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
>  
>  	tun = rtnl_dereference(tfile->tun);
>  
> +	if (tun && clean) {
> +		napi_disable(&tfile->napi);
> +		netif_napi_del(&tfile->napi);
> +	}
> +
>  	if (tun && !tfile->detached) {
>  		u16 index = tfile->queue_index;
>  		BUG_ON(index >= tun->numqueues);
> @@ -587,6 +597,7 @@ static void tun_detach_all(struct net_device *dev)
>  
>  	for (i = 0; i < n; i++) {
>  		tfile = rtnl_dereference(tun->tfiles[i]);
> +		napi_disable(&tfile->napi);
>  		BUG_ON(!tfile);
>  		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
>  		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
> @@ -603,6 +614,7 @@ static void tun_detach_all(struct net_device *dev)
>  	synchronize_net();
>  	for (i = 0; i < n; i++) {
>  		tfile = rtnl_dereference(tun->tfiles[i]);
> +		netif_napi_del(&tfile->napi);
>  		/* Drop read queue */
>  		tun_queue_purge(tfile);
>  		sock_put(&tfile->sk);
> @@ -618,6 +630,41 @@ static void tun_detach_all(struct net_device *dev)
>  		module_put(THIS_MODULE);
>  }
>  
> +static int tun_poll(struct napi_struct *napi, int budget)
> +{
> +	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
> +	struct sk_buff_head *input_queue =
> +	       &tfile->socket.sk->sk_write_queue;
> +	struct sk_buff *skb;
> +	unsigned int received = 0;
> +
> +	while (1) {
> +		while ((skb = __skb_dequeue(&tfile->process_queue))) {
> +			netif_receive_skb(skb);
> +			if (++received >= budget)
> +				return received;
> +		}
> +
> +		spin_lock(&input_queue->lock);
> +		if (skb_queue_empty(input_queue)) {
> +			spin_unlock(&input_queue->lock);
> +			break;
> +		}
> +		skb_queue_splice_tail_init(input_queue, &tfile->process_queue);
> +		spin_unlock(&input_queue->lock);
> +	}
> +
> +	if (received < budget) {
> +		napi_complete(napi);
> +		if (skb_peek(&tfile->socket.sk->sk_write_queue) &&
> +		    unlikely(napi_schedule_prep(napi))) {
> +			__napi_schedule(napi);
> +		}
> +	}
> +
> +	return received;
> +}
> +
>  static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
>  {
>  	struct tun_file *tfile = file->private_data;
> @@ -666,9 +713,11 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
>  
>  	if (tfile->detached)
>  		tun_enable_queue(tfile);
> -	else
> +	else {
>  		sock_hold(&tfile->sk);
> -
> +		netif_napi_add(tun->dev, &tfile->napi, tun_poll, 64);
> +		napi_enable(&tfile->napi);
> +	}
>  	tun_set_real_num_queues(tun);
>  
>  	/* device is allowed to go away first, so no need to hold extra
> @@ -1150,7 +1199,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
>  /* Get packet from user space buffer */
>  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>  			    void *msg_control, struct iov_iter *from,
> -			    int noblock)
> +			    int noblock, bool more)
>  {
>  	struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
>  	struct sk_buff *skb;
> @@ -1296,7 +1345,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>  	skb_probe_transport_header(skb, 0);
>  
>  	rxhash = skb_get_hash(skb);
> -	netif_rx_ni(skb);
> +	skb_queue_tail(&tfile->socket.sk->sk_write_queue, skb);
> +
> +	if (!more) {
> +		local_bh_disable();
> +		napi_schedule(&tfile->napi);
> +		local_bh_enable();

Why do we need to disable bh here? I thought napi_schedule can
be called from any context.

> +	}
>  
>  	stats = get_cpu_ptr(tun->pcpu_stats);
>  	u64_stats_update_begin(&stats->syncp);
> @@ -1319,7 +1374,8 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	if (!tun)
>  		return -EBADFD;
>  
> -	result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK);
> +	result = tun_get_user(tun, tfile, NULL, from,
> +			      file->f_flags & O_NONBLOCK, false);
>  
>  	tun_put(tun);
>  	return result;
> @@ -1579,7 +1635,8 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
>  		return -EBADFD;
>  
>  	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
> -			   m->msg_flags & MSG_DONTWAIT);
> +			   m->msg_flags & MSG_DONTWAIT,
> +			   m->msg_flags & MSG_MORE);
>  	tun_put(tun);
>  	return ret;
>  }
> @@ -2336,6 +2393,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>  	file->private_data = tfile;
>  	INIT_LIST_HEAD(&tfile->next);
>  
> +	skb_queue_head_init(&tfile->process_queue);
> +
>  	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
>  
>  	return 0;
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] vhost: better detection of available buffers
  2016-11-09  7:38 ` [PATCH 2/3] vhost: better detection of available buffers Jason Wang
@ 2016-11-09 19:57   ` Michael S. Tsirkin
  2016-11-11  2:18     ` Jason Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2016-11-09 19:57 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel

On Wed, Nov 09, 2016 at 03:38:32PM +0800, Jason Wang wrote:
> We should use vq->last_avail_idx instead of vq->avail_idx in the
> checking of vhost_vq_avail_empty() since latter is the cached avail
> index from guest but we want to know if there's pending available
> buffers in the virtqueue.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

I'm not sure why is this patch here. Is it related to
batching somehow?


> ---
>  drivers/vhost/vhost.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index c6f2d89..fdf4cdf 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
>  	if (r)
>  		return false;
>  
> -	return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
> +	return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
>  }
>  EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);

That might be OK for TX but it's probably wrong for RX
where the fact that used != avail does not mean
we have enough space to store the packet.

Maybe we should just rename this to vhost_vq_avail_unchanged
to clarify usage.


>  
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/3] vhost_net: tx support batching
  2016-11-09  7:38 ` [PATCH 3/3] vhost_net: tx support batching Jason Wang
@ 2016-11-09 20:05   ` Michael S. Tsirkin
  2016-11-11  2:27     ` Jason Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2016-11-09 20:05 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel

On Wed, Nov 09, 2016 at 03:38:33PM +0800, Jason Wang wrote:
> This patch tries to utilize tuntap rx batching by peeking the tx
> virtqueue during transmission, if there's more available buffers in
> the virtqueue, set MSG_MORE flag for a hint for tuntap to batch the
> packets. The maximum number of batched tx packets were specified
> through a module parameter: tx_bached.
> 
> When use 16 as tx_batched:

When using

> 
> Pktgen test shows 16% on tx pps in guest.
> Netperf test does not show obvious regression.

Why doesn't netperf benefit?

> For safety, 1 were used as the default value for tx_batched.

s/were used/is used/

> Signed-off-by: Jason Wang <jasowang@redhat.com>

These tests unfortunately only run a single flow.
The concern would be whether this increases latency when
NIC is busy with other flows, so I think this is what
you need to test.


> ---
>  drivers/vhost/net.c   | 15 ++++++++++++++-
>  drivers/vhost/vhost.c |  1 +
>  drivers/vhost/vhost.h |  1 +
>  3 files changed, 16 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 5dc128a..51c378e 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -35,6 +35,10 @@ module_param(experimental_zcopytx, int, 0444);
>  MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
>  		                       " 1 -Enable; 0 - Disable");
>  
> +static int tx_batched = 1;
> +module_param(tx_batched, int, 0444);
> +MODULE_PARM_DESC(tx_batched, "Number of patches batched in TX");
> +
>  /* Max number of bytes transferred before requeueing the job.
>   * Using this limit prevents one virtqueue from starving others. */
>  #define VHOST_NET_WEIGHT 0x80000

I think we should do some tests and find a good default.



> @@ -454,6 +458,16 @@ static void handle_tx(struct vhost_net *net)
>  			msg.msg_control = NULL;
>  			ubufs = NULL;
>  		}
> +		total_len += len;
> +		if (vq->delayed < tx_batched &&
> +		    total_len < VHOST_NET_WEIGHT &&
> +		    !vhost_vq_avail_empty(&net->dev, vq)) {
> +			vq->delayed++;
> +			msg.msg_flags |= MSG_MORE;
> +		} else {
> +			vq->delayed = 0;
> +			msg.msg_flags &= ~MSG_MORE;
> +		}
>  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
>  		err = sock->ops->sendmsg(sock, &msg, len);
>  		if (unlikely(err < 0)) {
> @@ -472,7 +486,6 @@ static void handle_tx(struct vhost_net *net)
>  			vhost_add_used_and_signal(&net->dev, vq, head, 0);
>  		else
>  			vhost_zerocopy_signal_used(net, vq);
> -		total_len += len;
>  		vhost_net_tx_packet(net);
>  		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  			vhost_poll_queue(&vq->poll);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index fdf4cdf..bc362c7 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -311,6 +311,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->busyloop_timeout = 0;
>  	vq->umem = NULL;
>  	vq->iotlb = NULL;
> +	vq->delayed = 0;
>  }
>  
>  static int vhost_worker(void *data)
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 78f3c5f..9f81a94 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -141,6 +141,7 @@ struct vhost_virtqueue {
>  	bool user_be;
>  #endif
>  	u32 busyloop_timeout;
> +	int delayed;
>  };
>  
>  struct vhost_msg_node {
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] tuntap: rx batching
  2016-11-09 16:38 ` [PATCH 1/3] tuntap: rx batching Michael S. Tsirkin
@ 2016-11-11  2:07   ` Jason Wang
  2016-11-11  3:31     ` Michael S. Tsirkin
  0 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2016-11-11  2:07 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel



On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
>> Backlog were used for tuntap rx, but it can only process 1 packet at
>> one time since it was scheduled during sendmsg() synchronously in
>> process context. This lead bad cache utilization so this patch tries
>> to do some batching before call rx NAPI. This is done through:
>>
>> - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>>    batch the packet temporarily in a linked list and submit them all
>>    once MSG_MORE were cleared.
>> - implement a tuntap specific NAPI handler for processing this kind of
>>    possible batching. (This could be done by extending backlog to
>>    support skb like, but using a tun specific one looks cleaner and
>>    easier for future extension).
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> So why do we need an extra queue?

The idea was borrowed from backlog to allow some kind of bulking and 
avoid spinlock on each dequeuing.

>   This is not what hardware devices do.
> How about adding the packet to queue unconditionally, deferring
> signalling until we get sendmsg without MSG_MORE?

Then you need touch spinlock when dequeuing each packet.

>
>
>> ---
>>   drivers/net/tun.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
>>   1 file changed, 65 insertions(+), 6 deletions(-)
>>

[...]

>>   	rxhash = skb_get_hash(skb);
>> -	netif_rx_ni(skb);
>> +	skb_queue_tail(&tfile->socket.sk->sk_write_queue, skb);
>> +
>> +	if (!more) {
>> +		local_bh_disable();
>> +		napi_schedule(&tfile->napi);
>> +		local_bh_enable();
> Why do we need to disable bh here? I thought napi_schedule can
> be called from any context.

Yes, it's unnecessary. Will remove.

Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] vhost: better detection of available buffers
  2016-11-09 19:57   ` Michael S. Tsirkin
@ 2016-11-11  2:18     ` Jason Wang
  2016-11-11  3:41       ` Michael S. Tsirkin
  0 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2016-11-11  2:18 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel



On 2016年11月10日 03:57, Michael S. Tsirkin wrote:
> On Wed, Nov 09, 2016 at 03:38:32PM +0800, Jason Wang wrote:
>> We should use vq->last_avail_idx instead of vq->avail_idx in the
>> checking of vhost_vq_avail_empty() since latter is the cached avail
>> index from guest but we want to know if there's pending available
>> buffers in the virtqueue.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> I'm not sure why is this patch here. Is it related to
> batching somehow?

Yes, we need to know whether or not there's still buffers left in the 
virtqueue, so need to check last_avail_idx. Otherwise, we're checking if 
guest has submitted new buffers.

>
>
>> ---
>>   drivers/vhost/vhost.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index c6f2d89..fdf4cdf 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
>>   	if (r)
>>   		return false;
>>   
>> -	return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
>> +	return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
> That might be OK for TX but it's probably wrong for RX
> where the fact that used != avail does not mean
> we have enough space to store the packet.

Right, but it's no harm since it was just a hint, handle_rx() can handle 
this situation.

>
> Maybe we should just rename this to vhost_vq_avail_unchanged
> to clarify usage.
>

Ok.

>>   
>> -- 
>> 2.7.4

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/3] vhost_net: tx support batching
  2016-11-09 20:05   ` Michael S. Tsirkin
@ 2016-11-11  2:27     ` Jason Wang
  0 siblings, 0 replies; 25+ messages in thread
From: Jason Wang @ 2016-11-11  2:27 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel



On 2016年11月10日 04:05, Michael S. Tsirkin wrote:
> On Wed, Nov 09, 2016 at 03:38:33PM +0800, Jason Wang wrote:
>> This patch tries to utilize tuntap rx batching by peeking the tx
>> virtqueue during transmission, if there's more available buffers in
>> the virtqueue, set MSG_MORE flag for a hint for tuntap to batch the
>> packets. The maximum number of batched tx packets were specified
>> through a module parameter: tx_bached.
>>
>> When use 16 as tx_batched:
> When using
>
>> Pktgen test shows 16% on tx pps in guest.
>> Netperf test does not show obvious regression.
> Why doesn't netperf benefit?

This is probably because the tests (4VCPU, 1queue, TCP, mlx4) does not 
produce 100% stress on vhost thread. In pktgen test, 100% stress on 
vhost thread is achieved easily.

>
>> For safety, 1 were used as the default value for tx_batched.
> s/were used/is used/
>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> These tests unfortunately only run a single flow.
> The concern would be whether this increases latency when
> NIC is busy with other flows, so I think this is what
> you need to test.

Multiple flows were tested too, no obvious improvement/regression were 
found.


>
>
>> ---
>>   drivers/vhost/net.c   | 15 ++++++++++++++-
>>   drivers/vhost/vhost.c |  1 +
>>   drivers/vhost/vhost.h |  1 +
>>   3 files changed, 16 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index 5dc128a..51c378e 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -35,6 +35,10 @@ module_param(experimental_zcopytx, int, 0444);
>>   MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
>>   		                       " 1 -Enable; 0 - Disable");
>>   
>> +static int tx_batched = 1;
>> +module_param(tx_batched, int, 0444);
>> +MODULE_PARM_DESC(tx_batched, "Number of patches batched in TX");
>> +
>>   /* Max number of bytes transferred before requeueing the job.
>>    * Using this limit prevents one virtqueue from starving others. */
>>   #define VHOST_NET_WEIGHT 0x80000
> I think we should do some tests and find a good default.

Ok, will test 4 and 32 to see if there's any difference. (Btw, 16 were 
chosed since dpdk tends to batch 16 packet during TX).

Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] tuntap: rx batching
  2016-11-11  2:07   ` Jason Wang
@ 2016-11-11  3:31     ` Michael S. Tsirkin
  2016-11-11  4:10       ` Jason Wang
  2016-11-11  4:17       ` John Fastabend
  0 siblings, 2 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2016-11-11  3:31 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel

On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> > On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> > > Backlog were used for tuntap rx, but it can only process 1 packet at
> > > one time since it was scheduled during sendmsg() synchronously in
> > > process context. This lead bad cache utilization so this patch tries
> > > to do some batching before call rx NAPI. This is done through:
> > > 
> > > - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
> > >    batch the packet temporarily in a linked list and submit them all
> > >    once MSG_MORE were cleared.
> > > - implement a tuntap specific NAPI handler for processing this kind of
> > >    possible batching. (This could be done by extending backlog to
> > >    support skb like, but using a tun specific one looks cleaner and
> > >    easier for future extension).
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > So why do we need an extra queue?
> 
> The idea was borrowed from backlog to allow some kind of bulking and avoid
> spinlock on each dequeuing.
> 
> >   This is not what hardware devices do.
> > How about adding the packet to queue unconditionally, deferring
> > signalling until we get sendmsg without MSG_MORE?
> 
> Then you need touch spinlock when dequeuing each packet.

It runs on the same CPU, right? Otherwise we should use skb_array...

> > 
> > 
> > > ---
> > >   drivers/net/tun.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
> > >   1 file changed, 65 insertions(+), 6 deletions(-)
> > > 
> 
> [...]
> 
> > >   	rxhash = skb_get_hash(skb);
> > > -	netif_rx_ni(skb);
> > > +	skb_queue_tail(&tfile->socket.sk->sk_write_queue, skb);
> > > +
> > > +	if (!more) {
> > > +		local_bh_disable();
> > > +		napi_schedule(&tfile->napi);
> > > +		local_bh_enable();
> > Why do we need to disable bh here? I thought napi_schedule can
> > be called from any context.
> 
> Yes, it's unnecessary. Will remove.
> 
> Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] vhost: better detection of available buffers
  2016-11-11  2:18     ` Jason Wang
@ 2016-11-11  3:41       ` Michael S. Tsirkin
  2016-11-11  4:18         ` Jason Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2016-11-11  3:41 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel

On Fri, Nov 11, 2016 at 10:18:37AM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月10日 03:57, Michael S. Tsirkin wrote:
> > On Wed, Nov 09, 2016 at 03:38:32PM +0800, Jason Wang wrote:
> > > We should use vq->last_avail_idx instead of vq->avail_idx in the
> > > checking of vhost_vq_avail_empty() since latter is the cached avail
> > > index from guest but we want to know if there's pending available
> > > buffers in the virtqueue.
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > I'm not sure why is this patch here. Is it related to
> > batching somehow?
> 
> Yes, we need to know whether or not there's still buffers left in the
> virtqueue, so need to check last_avail_idx. Otherwise, we're checking if
> guest has submitted new buffers.
> 
> > 
> > 
> > > ---
> > >   drivers/vhost/vhost.c | 2 +-
> > >   1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index c6f2d89..fdf4cdf 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> > >   	if (r)
> > >   		return false;
> > > -	return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
> > > +	return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
> > >   }
> > >   EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
> > That might be OK for TX but it's probably wrong for RX
> > where the fact that used != avail does not mean
> > we have enough space to store the packet.
> 
> Right, but it's no harm since it was just a hint, handle_rx() can handle
> this situation.

Means busy polling will cause useless load on the CPU though.

> > 
> > Maybe we should just rename this to vhost_vq_avail_unchanged
> > to clarify usage.
> > 
> 
> Ok.
> 
> > > -- 
> > > 2.7.4

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] tuntap: rx batching
  2016-11-11  3:31     ` Michael S. Tsirkin
@ 2016-11-11  4:10       ` Jason Wang
  2016-11-11  4:17       ` John Fastabend
  1 sibling, 0 replies; 25+ messages in thread
From: Jason Wang @ 2016-11-11  4:10 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel



On 2016年11月11日 11:31, Michael S. Tsirkin wrote:
> On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
>> >
>> >
>> >On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
>>> > >On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
>>>> > > >Backlog were used for tuntap rx, but it can only process 1 packet at
>>>> > > >one time since it was scheduled during sendmsg() synchronously in
>>>> > > >process context. This lead bad cache utilization so this patch tries
>>>> > > >to do some batching before call rx NAPI. This is done through:
>>>> > > >
>>>> > > >- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>>>> > > >    batch the packet temporarily in a linked list and submit them all
>>>> > > >    once MSG_MORE were cleared.
>>>> > > >- implement a tuntap specific NAPI handler for processing this kind of
>>>> > > >    possible batching. (This could be done by extending backlog to
>>>> > > >    support skb like, but using a tun specific one looks cleaner and
>>>> > > >    easier for future extension).
>>>> > > >
>>>> > > >Signed-off-by: Jason Wang<jasowang@redhat.com>
>>> > >So why do we need an extra queue?
>> >
>> >The idea was borrowed from backlog to allow some kind of bulking and avoid
>> >spinlock on each dequeuing.
>> >
>>> > >   This is not what hardware devices do.
>>> > >How about adding the packet to queue unconditionally, deferring
>>> > >signalling until we get sendmsg without MSG_MORE?
>> >
>> >Then you need touch spinlock when dequeuing each packet.
> It runs on the same CPU, right? Otherwise we should use skb_array...
>

There could be multiple senders technically. Will try skb_array and see 
if there's any difference.

Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] tuntap: rx batching
  2016-11-11  3:31     ` Michael S. Tsirkin
  2016-11-11  4:10       ` Jason Wang
@ 2016-11-11  4:17       ` John Fastabend
  2016-11-11  4:28         ` Jason Wang
  1 sibling, 1 reply; 25+ messages in thread
From: John Fastabend @ 2016-11-11  4:17 UTC (permalink / raw)
  To: Michael S. Tsirkin, Jason Wang; +Cc: netdev, linux-kernel

On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
> On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
>>
>>
>> On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
>>> On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
>>>> Backlog were used for tuntap rx, but it can only process 1 packet at
>>>> one time since it was scheduled during sendmsg() synchronously in
>>>> process context. This lead bad cache utilization so this patch tries
>>>> to do some batching before call rx NAPI. This is done through:
>>>>
>>>> - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>>>>    batch the packet temporarily in a linked list and submit them all
>>>>    once MSG_MORE were cleared.
>>>> - implement a tuntap specific NAPI handler for processing this kind of
>>>>    possible batching. (This could be done by extending backlog to
>>>>    support skb like, but using a tun specific one looks cleaner and
>>>>    easier for future extension).
>>>>
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>> So why do we need an extra queue?
>>
>> The idea was borrowed from backlog to allow some kind of bulking and avoid
>> spinlock on each dequeuing.
>>
>>>   This is not what hardware devices do.
>>> How about adding the packet to queue unconditionally, deferring
>>> signalling until we get sendmsg without MSG_MORE?
>>
>> Then you need touch spinlock when dequeuing each packet.
> 

Random thought, I have a cmpxchg ring I am using for the qdisc work that
could possibly replace the spinlock implementation. I haven't figured
out the resizing API yet because I did not need it but I assume it could
help here and let you dequeue multiple skbs in one operation.

I can post the latest version if useful or an older version is
somewhere on patchworks as well.

.John


> It runs on the same CPU, right? Otherwise we should use skb_array...
> 
>>>
>>>
>>>> ---
>>>>   drivers/net/tun.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
>>>>   1 file changed, 65 insertions(+), 6 deletions(-)
>>>>
>>
>> [...]
>>
>>>>   	rxhash = skb_get_hash(skb);
>>>> -	netif_rx_ni(skb);
>>>> +	skb_queue_tail(&tfile->socket.sk->sk_write_queue, skb);
>>>> +
>>>> +	if (!more) {
>>>> +		local_bh_disable();
>>>> +		napi_schedule(&tfile->napi);
>>>> +		local_bh_enable();
>>> Why do we need to disable bh here? I thought napi_schedule can
>>> be called from any context.
>>
>> Yes, it's unnecessary. Will remove.
>>
>> Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] vhost: better detection of available buffers
  2016-11-11  3:41       ` Michael S. Tsirkin
@ 2016-11-11  4:18         ` Jason Wang
  2016-11-11 16:20           ` Michael S. Tsirkin
  0 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2016-11-11  4:18 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel



On 2016年11月11日 11:41, Michael S. Tsirkin wrote:
> On Fri, Nov 11, 2016 at 10:18:37AM +0800, Jason Wang wrote:
>> >
>> >
>> >On 2016年11月10日 03:57, Michael S. Tsirkin wrote:
>>> > >On Wed, Nov 09, 2016 at 03:38:32PM +0800, Jason Wang wrote:
>>>> > > >We should use vq->last_avail_idx instead of vq->avail_idx in the
>>>> > > >checking of vhost_vq_avail_empty() since latter is the cached avail
>>>> > > >index from guest but we want to know if there's pending available
>>>> > > >buffers in the virtqueue.
>>>> > > >
>>>> > > >Signed-off-by: Jason Wang<jasowang@redhat.com>
>>> > >I'm not sure why is this patch here. Is it related to
>>> > >batching somehow?
>> >
>> >Yes, we need to know whether or not there's still buffers left in the
>> >virtqueue, so need to check last_avail_idx. Otherwise, we're checking if
>> >guest has submitted new buffers.
>> >
>>> > >
>>> > >
>>>> > > >---
>>>> > > >   drivers/vhost/vhost.c | 2 +-
>>>> > > >   1 file changed, 1 insertion(+), 1 deletion(-)
>>>> > > >
>>>> > > >diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>> > > >index c6f2d89..fdf4cdf 100644
>>>> > > >--- a/drivers/vhost/vhost.c
>>>> > > >+++ b/drivers/vhost/vhost.c
>>>> > > >@@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
>>>> > > >   	if (r)
>>>> > > >   		return false;
>>>> > > >-	return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
>>>> > > >+	return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
>>>> > > >   }
>>>> > > >   EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
>>> > >That might be OK for TX but it's probably wrong for RX
>>> > >where the fact that used != avail does not mean
>>> > >we have enough space to store the packet.
>> >
>> >Right, but it's no harm since it was just a hint, handle_rx() can handle
>> >this situation.
> Means busy polling will cause useless load on the CPU though.
>

Right, but,it's not easy to have 100% correct hint here. Needs more thought.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] tuntap: rx batching
  2016-11-11  4:17       ` John Fastabend
@ 2016-11-11  4:28         ` Jason Wang
  2016-11-11  4:45           ` John Fastabend
  2016-11-11 16:20           ` Michael S. Tsirkin
  0 siblings, 2 replies; 25+ messages in thread
From: Jason Wang @ 2016-11-11  4:28 UTC (permalink / raw)
  To: John Fastabend, Michael S. Tsirkin; +Cc: netdev, linux-kernel



On 2016年11月11日 12:17, John Fastabend wrote:
> On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
>> >On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
>>> >>
>>> >>
>>> >>On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
>>>> >>>On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
>>>>> >>>>Backlog were used for tuntap rx, but it can only process 1 packet at
>>>>> >>>>one time since it was scheduled during sendmsg() synchronously in
>>>>> >>>>process context. This lead bad cache utilization so this patch tries
>>>>> >>>>to do some batching before call rx NAPI. This is done through:
>>>>> >>>>
>>>>> >>>>- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>>>>> >>>>    batch the packet temporarily in a linked list and submit them all
>>>>> >>>>    once MSG_MORE were cleared.
>>>>> >>>>- implement a tuntap specific NAPI handler for processing this kind of
>>>>> >>>>    possible batching. (This could be done by extending backlog to
>>>>> >>>>    support skb like, but using a tun specific one looks cleaner and
>>>>> >>>>    easier for future extension).
>>>>> >>>>
>>>>> >>>>Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>> >>>So why do we need an extra queue?
>>> >>
>>> >>The idea was borrowed from backlog to allow some kind of bulking and avoid
>>> >>spinlock on each dequeuing.
>>> >>
>>>> >>>   This is not what hardware devices do.
>>>> >>>How about adding the packet to queue unconditionally, deferring
>>>> >>>signalling until we get sendmsg without MSG_MORE?
>>> >>
>>> >>Then you need touch spinlock when dequeuing each packet.
>> >
> Random thought, I have a cmpxchg ring I am using for the qdisc work that
> could possibly replace the spinlock implementation. I haven't figured
> out the resizing API yet because I did not need it but I assume it could
> help here and let you dequeue multiple skbs in one operation.
>
> I can post the latest version if useful or an older version is
> somewhere on patchworks as well.
>
> .John
>
>

Look useful here, and I can compare the performance if you post.

A question is can we extend the skb_array to support that?

Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] tuntap: rx batching
  2016-11-11  4:28         ` Jason Wang
@ 2016-11-11  4:45           ` John Fastabend
  2016-11-11 16:20           ` Michael S. Tsirkin
  1 sibling, 0 replies; 25+ messages in thread
From: John Fastabend @ 2016-11-11  4:45 UTC (permalink / raw)
  To: Jason Wang, Michael S. Tsirkin; +Cc: netdev, linux-kernel

On 16-11-10 08:28 PM, Jason Wang wrote:
> 
> 
> On 2016年11月11日 12:17, John Fastabend wrote:
>> On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
>>> >On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
>>>> >>
>>>> >>
>>>> >>On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
>>>>> >>>On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
>>>>>> >>>>Backlog were used for tuntap rx, but it can only process 1
>>>>>> packet at
>>>>>> >>>>one time since it was scheduled during sendmsg() synchronously in
>>>>>> >>>>process context. This lead bad cache utilization so this patch
>>>>>> tries
>>>>>> >>>>to do some batching before call rx NAPI. This is done through:
>>>>>> >>>>
>>>>>> >>>>- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>>>>>> >>>>    batch the packet temporarily in a linked list and submit
>>>>>> them all
>>>>>> >>>>    once MSG_MORE were cleared.
>>>>>> >>>>- implement a tuntap specific NAPI handler for processing this
>>>>>> kind of
>>>>>> >>>>    possible batching. (This could be done by extending
>>>>>> backlog to
>>>>>> >>>>    support skb like, but using a tun specific one looks
>>>>>> cleaner and
>>>>>> >>>>    easier for future extension).
>>>>>> >>>>
>>>>>> >>>>Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>> >>>So why do we need an extra queue?
>>>> >>
>>>> >>The idea was borrowed from backlog to allow some kind of bulking
>>>> and avoid
>>>> >>spinlock on each dequeuing.
>>>> >>
>>>>> >>>   This is not what hardware devices do.
>>>>> >>>How about adding the packet to queue unconditionally, deferring
>>>>> >>>signalling until we get sendmsg without MSG_MORE?
>>>> >>
>>>> >>Then you need touch spinlock when dequeuing each packet.
>>> >
>> Random thought, I have a cmpxchg ring I am using for the qdisc work that
>> could possibly replace the spinlock implementation. I haven't figured
>> out the resizing API yet because I did not need it but I assume it could
>> help here and let you dequeue multiple skbs in one operation.
>>
>> I can post the latest version if useful or an older version is
>> somewhere on patchworks as well.
>>
>> .John
>>
>>
> 
> Look useful here, and I can compare the performance if you post.
> 
> A question is can we extend the skb_array to support that?
> 
> Thanks
> 

Sent out two RFC patches with the implementation, the first has been
running on my system for some time the second for multiple packets is
only lightly tested and that was awhile back.

.John

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] tuntap: rx batching
  2016-11-11  4:28         ` Jason Wang
  2016-11-11  4:45           ` John Fastabend
@ 2016-11-11 16:20           ` Michael S. Tsirkin
  2016-11-15  3:14             ` Jason Wang
  1 sibling, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2016-11-11 16:20 UTC (permalink / raw)
  To: Jason Wang; +Cc: John Fastabend, netdev, linux-kernel

On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月11日 12:17, John Fastabend wrote:
> > On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
> > > >On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
> > > > >>
> > > > >>
> > > > >>On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> > > > > >>>On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> > > > > > >>>>Backlog were used for tuntap rx, but it can only process 1 packet at
> > > > > > >>>>one time since it was scheduled during sendmsg() synchronously in
> > > > > > >>>>process context. This lead bad cache utilization so this patch tries
> > > > > > >>>>to do some batching before call rx NAPI. This is done through:
> > > > > > >>>>
> > > > > > >>>>- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
> > > > > > >>>>    batch the packet temporarily in a linked list and submit them all
> > > > > > >>>>    once MSG_MORE were cleared.
> > > > > > >>>>- implement a tuntap specific NAPI handler for processing this kind of
> > > > > > >>>>    possible batching. (This could be done by extending backlog to
> > > > > > >>>>    support skb like, but using a tun specific one looks cleaner and
> > > > > > >>>>    easier for future extension).
> > > > > > >>>>
> > > > > > >>>>Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > > >>>So why do we need an extra queue?
> > > > >>
> > > > >>The idea was borrowed from backlog to allow some kind of bulking and avoid
> > > > >>spinlock on each dequeuing.
> > > > >>
> > > > > >>>   This is not what hardware devices do.
> > > > > >>>How about adding the packet to queue unconditionally, deferring
> > > > > >>>signalling until we get sendmsg without MSG_MORE?
> > > > >>
> > > > >>Then you need touch spinlock when dequeuing each packet.
> > > >
> > Random thought, I have a cmpxchg ring I am using for the qdisc work that
> > could possibly replace the spinlock implementation. I haven't figured
> > out the resizing API yet because I did not need it but I assume it could
> > help here and let you dequeue multiple skbs in one operation.
> > 
> > I can post the latest version if useful or an older version is
> > somewhere on patchworks as well.
> > 
> > .John
> > 
> > 
> 
> Look useful here, and I can compare the performance if you post.
> 
> A question is can we extend the skb_array to support that?
> 
> Thanks

I'd like to start with simple patch adding napi with one queue, then add
optimization patches on top.

One issue that comes to mind is that write queue limits
are byte based, they do not count packets unlike tun rx queue.



-- 
MST

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] vhost: better detection of available buffers
  2016-11-11  4:18         ` Jason Wang
@ 2016-11-11 16:20           ` Michael S. Tsirkin
  2016-11-15  3:16             ` Jason Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2016-11-11 16:20 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel

On Fri, Nov 11, 2016 at 12:18:50PM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月11日 11:41, Michael S. Tsirkin wrote:
> > On Fri, Nov 11, 2016 at 10:18:37AM +0800, Jason Wang wrote:
> > > >
> > > >
> > > >On 2016年11月10日 03:57, Michael S. Tsirkin wrote:
> > > > > >On Wed, Nov 09, 2016 at 03:38:32PM +0800, Jason Wang wrote:
> > > > > > > >We should use vq->last_avail_idx instead of vq->avail_idx in the
> > > > > > > >checking of vhost_vq_avail_empty() since latter is the cached avail
> > > > > > > >index from guest but we want to know if there's pending available
> > > > > > > >buffers in the virtqueue.
> > > > > > > >
> > > > > > > >Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > > >I'm not sure why is this patch here. Is it related to
> > > > > >batching somehow?
> > > >
> > > >Yes, we need to know whether or not there's still buffers left in the
> > > >virtqueue, so need to check last_avail_idx. Otherwise, we're checking if
> > > >guest has submitted new buffers.
> > > >
> > > > > >
> > > > > >
> > > > > > > >---
> > > > > > > >   drivers/vhost/vhost.c | 2 +-
> > > > > > > >   1 file changed, 1 insertion(+), 1 deletion(-)
> > > > > > > >
> > > > > > > >diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > >index c6f2d89..fdf4cdf 100644
> > > > > > > >--- a/drivers/vhost/vhost.c
> > > > > > > >+++ b/drivers/vhost/vhost.c
> > > > > > > >@@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> > > > > > > >   	if (r)
> > > > > > > >   		return false;
> > > > > > > >-	return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
> > > > > > > >+	return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
> > > > > > > >   }
> > > > > > > >   EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
> > > > > >That might be OK for TX but it's probably wrong for RX
> > > > > >where the fact that used != avail does not mean
> > > > > >we have enough space to store the packet.
> > > >
> > > >Right, but it's no harm since it was just a hint, handle_rx() can handle
> > > >this situation.
> > Means busy polling will cause useless load on the CPU though.
> > 
> 
> Right, but,it's not easy to have 100% correct hint here. Needs more thought.

What's wrong with what we have? It polls until value changes.

-- 
MST

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] tuntap: rx batching
  2016-11-11 16:20           ` Michael S. Tsirkin
@ 2016-11-15  3:14             ` Jason Wang
  2016-11-15  3:41               ` Michael S. Tsirkin
  0 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2016-11-15  3:14 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: John Fastabend, netdev, linux-kernel



On 2016年11月12日 00:20, Michael S. Tsirkin wrote:
> On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:
>>
>> On 2016年11月11日 12:17, John Fastabend wrote:
>>> On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
>>>>> On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
>>>>>>>
>>>>>>> On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
>>>>>>>>> On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
>>>>>>>>>>> Backlog were used for tuntap rx, but it can only process 1 packet at
>>>>>>>>>>> one time since it was scheduled during sendmsg() synchronously in
>>>>>>>>>>> process context. This lead bad cache utilization so this patch tries
>>>>>>>>>>> to do some batching before call rx NAPI. This is done through:
>>>>>>>>>>>
>>>>>>>>>>> - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>>>>>>>>>>>     batch the packet temporarily in a linked list and submit them all
>>>>>>>>>>>     once MSG_MORE were cleared.
>>>>>>>>>>> - implement a tuntap specific NAPI handler for processing this kind of
>>>>>>>>>>>     possible batching. (This could be done by extending backlog to
>>>>>>>>>>>     support skb like, but using a tun specific one looks cleaner and
>>>>>>>>>>>     easier for future extension).
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>>>>>> So why do we need an extra queue?
>>>>>>> The idea was borrowed from backlog to allow some kind of bulking and avoid
>>>>>>> spinlock on each dequeuing.
>>>>>>>
>>>>>>>>>    This is not what hardware devices do.
>>>>>>>>> How about adding the packet to queue unconditionally, deferring
>>>>>>>>> signalling until we get sendmsg without MSG_MORE?
>>>>>>> Then you need touch spinlock when dequeuing each packet.
>>> Random thought, I have a cmpxchg ring I am using for the qdisc work that
>>> could possibly replace the spinlock implementation. I haven't figured
>>> out the resizing API yet because I did not need it but I assume it could
>>> help here and let you dequeue multiple skbs in one operation.
>>>
>>> I can post the latest version if useful or an older version is
>>> somewhere on patchworks as well.
>>>
>>> .John
>>>
>>>
>> Look useful here, and I can compare the performance if you post.
>>
>> A question is can we extend the skb_array to support that?
>>
>> Thanks
> I'd like to start with simple patch adding napi with one queue, then add
> optimization patches on top.

The point is tun is using backlog who uses two queues (process_queue and 
input_pkt_queue).

How about something like:

1) NAPI support with skb_array
2) MSG_MORE support
3) other optimizations on top

?

>
> One issue that comes to mind is that write queue limits
> are byte based, they do not count packets unlike tun rx queue.

I'm not sure I get the issue, write queue is not exported and only used 
for batching. We probably need an internal limit in tun to avoid OOM 
attacker from guest.

Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] vhost: better detection of available buffers
  2016-11-11 16:20           ` Michael S. Tsirkin
@ 2016-11-15  3:16             ` Jason Wang
  2016-11-15  3:28               ` Michael S. Tsirkin
  0 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2016-11-15  3:16 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel



On 2016年11月12日 00:20, Michael S. Tsirkin wrote:
> On Fri, Nov 11, 2016 at 12:18:50PM +0800, Jason Wang wrote:
>>
>> On 2016年11月11日 11:41, Michael S. Tsirkin wrote:
>>> On Fri, Nov 11, 2016 at 10:18:37AM +0800, Jason Wang wrote:
>>>>>
>>>>> On 2016年11月10日 03:57, Michael S. Tsirkin wrote:
>>>>>>> On Wed, Nov 09, 2016 at 03:38:32PM +0800, Jason Wang wrote:
>>>>>>>>> We should use vq->last_avail_idx instead of vq->avail_idx in the
>>>>>>>>> checking of vhost_vq_avail_empty() since latter is the cached avail
>>>>>>>>> index from guest but we want to know if there's pending available
>>>>>>>>> buffers in the virtqueue.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>>>> I'm not sure why is this patch here. Is it related to
>>>>>>> batching somehow?
>>>>> Yes, we need to know whether or not there's still buffers left in the
>>>>> virtqueue, so need to check last_avail_idx. Otherwise, we're checking if
>>>>> guest has submitted new buffers.
>>>>>
>>>>>>>
>>>>>>>>> ---
>>>>>>>>>    drivers/vhost/vhost.c | 2 +-
>>>>>>>>>    1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>>>>>>> index c6f2d89..fdf4cdf 100644
>>>>>>>>> --- a/drivers/vhost/vhost.c
>>>>>>>>> +++ b/drivers/vhost/vhost.c
>>>>>>>>> @@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
>>>>>>>>>    	if (r)
>>>>>>>>>    		return false;
>>>>>>>>> -	return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
>>>>>>>>> +	return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
>>>>>>>>>    }
>>>>>>>>>    EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
>>>>>>> That might be OK for TX but it's probably wrong for RX
>>>>>>> where the fact that used != avail does not mean
>>>>>>> we have enough space to store the packet.
>>>>> Right, but it's no harm since it was just a hint, handle_rx() can handle
>>>>> this situation.
>>> Means busy polling will cause useless load on the CPU though.
>>>
>> Right, but,it's not easy to have 100% correct hint here. Needs more thought.
> What's wrong with what we have? It polls until value changes.
>

But as you said, this does not mean (in mergeable cases) we have enough 
space to store the packet.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] vhost: better detection of available buffers
  2016-11-15  3:16             ` Jason Wang
@ 2016-11-15  3:28               ` Michael S. Tsirkin
  2016-11-15  8:00                 ` Jason Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2016-11-15  3:28 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel

On Tue, Nov 15, 2016 at 11:16:59AM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月12日 00:20, Michael S. Tsirkin wrote:
> > On Fri, Nov 11, 2016 at 12:18:50PM +0800, Jason Wang wrote:
> > > 
> > > On 2016年11月11日 11:41, Michael S. Tsirkin wrote:
> > > > On Fri, Nov 11, 2016 at 10:18:37AM +0800, Jason Wang wrote:
> > > > > > 
> > > > > > On 2016年11月10日 03:57, Michael S. Tsirkin wrote:
> > > > > > > > On Wed, Nov 09, 2016 at 03:38:32PM +0800, Jason Wang wrote:
> > > > > > > > > > We should use vq->last_avail_idx instead of vq->avail_idx in the
> > > > > > > > > > checking of vhost_vq_avail_empty() since latter is the cached avail
> > > > > > > > > > index from guest but we want to know if there's pending available
> > > > > > > > > > buffers in the virtqueue.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > > > > > I'm not sure why is this patch here. Is it related to
> > > > > > > > batching somehow?
> > > > > > Yes, we need to know whether or not there's still buffers left in the
> > > > > > virtqueue, so need to check last_avail_idx. Otherwise, we're checking if
> > > > > > guest has submitted new buffers.
> > > > > > 
> > > > > > > > 
> > > > > > > > > > ---
> > > > > > > > > >    drivers/vhost/vhost.c | 2 +-
> > > > > > > > > >    1 file changed, 1 insertion(+), 1 deletion(-)
> > > > > > > > > > 
> > > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > > > > index c6f2d89..fdf4cdf 100644
> > > > > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > > > > @@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> > > > > > > > > >    	if (r)
> > > > > > > > > >    		return false;
> > > > > > > > > > -	return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
> > > > > > > > > > +	return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
> > > > > > > > > >    }
> > > > > > > > > >    EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
> > > > > > > > That might be OK for TX but it's probably wrong for RX
> > > > > > > > where the fact that used != avail does not mean
> > > > > > > > we have enough space to store the packet.
> > > > > > Right, but it's no harm since it was just a hint, handle_rx() can handle
> > > > > > this situation.
> > > > Means busy polling will cause useless load on the CPU though.
> > > > 
> > > Right, but,it's not easy to have 100% correct hint here. Needs more thought.
> > What's wrong with what we have? It polls until value changes.
> > 
> 
> But as you said, this does not mean (in mergeable cases) we have enough
> space to store the packet.

Absolutely but it checks once and then only re-checks after value
changes again.

-- 
MST

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] tuntap: rx batching
  2016-11-15  3:14             ` Jason Wang
@ 2016-11-15  3:41               ` Michael S. Tsirkin
  2016-11-15  8:08                 ` Jason Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2016-11-15  3:41 UTC (permalink / raw)
  To: Jason Wang; +Cc: John Fastabend, netdev, linux-kernel

On Tue, Nov 15, 2016 at 11:14:48AM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月12日 00:20, Michael S. Tsirkin wrote:
> > On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:
> > > 
> > > On 2016年11月11日 12:17, John Fastabend wrote:
> > > > On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
> > > > > > On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
> > > > > > > > 
> > > > > > > > On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> > > > > > > > > > On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> > > > > > > > > > > > Backlog were used for tuntap rx, but it can only process 1 packet at
> > > > > > > > > > > > one time since it was scheduled during sendmsg() synchronously in
> > > > > > > > > > > > process context. This lead bad cache utilization so this patch tries
> > > > > > > > > > > > to do some batching before call rx NAPI. This is done through:
> > > > > > > > > > > > 
> > > > > > > > > > > > - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
> > > > > > > > > > > >     batch the packet temporarily in a linked list and submit them all
> > > > > > > > > > > >     once MSG_MORE were cleared.
> > > > > > > > > > > > - implement a tuntap specific NAPI handler for processing this kind of
> > > > > > > > > > > >     possible batching. (This could be done by extending backlog to
> > > > > > > > > > > >     support skb like, but using a tun specific one looks cleaner and
> > > > > > > > > > > >     easier for future extension).
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > > > > > > > So why do we need an extra queue?
> > > > > > > > The idea was borrowed from backlog to allow some kind of bulking and avoid
> > > > > > > > spinlock on each dequeuing.
> > > > > > > > 
> > > > > > > > > >    This is not what hardware devices do.
> > > > > > > > > > How about adding the packet to queue unconditionally, deferring
> > > > > > > > > > signalling until we get sendmsg without MSG_MORE?
> > > > > > > > Then you need touch spinlock when dequeuing each packet.
> > > > Random thought, I have a cmpxchg ring I am using for the qdisc work that
> > > > could possibly replace the spinlock implementation. I haven't figured
> > > > out the resizing API yet because I did not need it but I assume it could
> > > > help here and let you dequeue multiple skbs in one operation.
> > > > 
> > > > I can post the latest version if useful or an older version is
> > > > somewhere on patchworks as well.
> > > > 
> > > > .John
> > > > 
> > > > 
> > > Look useful here, and I can compare the performance if you post.
> > > 
> > > A question is can we extend the skb_array to support that?
> > > 
> > > Thanks
> > I'd like to start with simple patch adding napi with one queue, then add
> > optimization patches on top.
> 
> The point is tun is using backlog who uses two queues (process_queue and
> input_pkt_queue).
> 
> How about something like:
> 
> 1) NAPI support with skb_array

I would start with just write queue linked list. It all runs on a single
CPU normally, so the nice reductions of cache line bounces due to skb
array should never materialize.

While we are at it, limiting the size of the queue might
be a good idea. Kind of like TUNSETSNDBUF but 1. actually
working where instead of tracking packets within net stack
we make sndbuf track the internal buffer


> 2) MSG_MORE support
> 3) other optimizations on top
> 
> ?
> 
> > 
> > One issue that comes to mind is that write queue limits
> > are byte based, they do not count packets unlike tun rx queue.
> 
> I'm not sure I get the issue, write queue is not exported and only used for
> batching. We probably need an internal limit in tun to avoid OOM attacker
> from guest.
> 
> Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] vhost: better detection of available buffers
  2016-11-15  3:28               ` Michael S. Tsirkin
@ 2016-11-15  8:00                 ` Jason Wang
  2016-11-15 14:46                   ` Michael S. Tsirkin
  0 siblings, 1 reply; 25+ messages in thread
From: Jason Wang @ 2016-11-15  8:00 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel



On 2016年11月15日 11:28, Michael S. Tsirkin wrote:
> On Tue, Nov 15, 2016 at 11:16:59AM +0800, Jason Wang wrote:
>>
>> On 2016年11月12日 00:20, Michael S. Tsirkin wrote:
>>> On Fri, Nov 11, 2016 at 12:18:50PM +0800, Jason Wang wrote:
>>>> On 2016年11月11日 11:41, Michael S. Tsirkin wrote:
>>>>> On Fri, Nov 11, 2016 at 10:18:37AM +0800, Jason Wang wrote:
>>>>>>> On 2016年11月10日 03:57, Michael S. Tsirkin wrote:
>>>>>>>>> On Wed, Nov 09, 2016 at 03:38:32PM +0800, Jason Wang wrote:
>>>>>>>>>>> We should use vq->last_avail_idx instead of vq->avail_idx in the
>>>>>>>>>>> checking of vhost_vq_avail_empty() since latter is the cached avail
>>>>>>>>>>> index from guest but we want to know if there's pending available
>>>>>>>>>>> buffers in the virtqueue.
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>>>>>> I'm not sure why is this patch here. Is it related to
>>>>>>>>> batching somehow?
>>>>>>> Yes, we need to know whether or not there's still buffers left in the
>>>>>>> virtqueue, so need to check last_avail_idx. Otherwise, we're checking if
>>>>>>> guest has submitted new buffers.
>>>>>>>
>>>>>>>>>>> ---
>>>>>>>>>>>     drivers/vhost/vhost.c | 2 +-
>>>>>>>>>>>     1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>>>>>>>>> index c6f2d89..fdf4cdf 100644
>>>>>>>>>>> --- a/drivers/vhost/vhost.c
>>>>>>>>>>> +++ b/drivers/vhost/vhost.c
>>>>>>>>>>> @@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
>>>>>>>>>>>     	if (r)
>>>>>>>>>>>     		return false;
>>>>>>>>>>> -	return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
>>>>>>>>>>> +	return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
>>>>>>>>>>>     }
>>>>>>>>>>>     EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
>>>>>>>>> That might be OK for TX but it's probably wrong for RX
>>>>>>>>> where the fact that used != avail does not mean
>>>>>>>>> we have enough space to store the packet.
>>>>>>> Right, but it's no harm since it was just a hint, handle_rx() can handle
>>>>>>> this situation.
>>>>> Means busy polling will cause useless load on the CPU though.
>>>>>
>>>> Right, but,it's not easy to have 100% correct hint here. Needs more thought.
>>> What's wrong with what we have? It polls until value changes.
>>>
>> But as you said, this does not mean (in mergeable cases) we have enough
>> space to store the packet.
> Absolutely but it checks once and then only re-checks after value
> changes again.
>

Since get_rx_bufs() does not get enough buffers, we will wait for the 
kick in this case. For busy polling, we probably want to stay in the 
busy loop here.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] tuntap: rx batching
  2016-11-15  3:41               ` Michael S. Tsirkin
@ 2016-11-15  8:08                 ` Jason Wang
  0 siblings, 0 replies; 25+ messages in thread
From: Jason Wang @ 2016-11-15  8:08 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: John Fastabend, netdev, linux-kernel



On 2016年11月15日 11:41, Michael S. Tsirkin wrote:
> On Tue, Nov 15, 2016 at 11:14:48AM +0800, Jason Wang wrote:
>> >
>> >
>> >On 2016年11月12日 00:20, Michael S. Tsirkin wrote:
>>> > >On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:
>>>> > > >
>>>> > > >On 2016年11月11日 12:17, John Fastabend wrote:
>>>>> > > > >On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
>>>>>>> > > > > > >On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
>>>>>>>>> > > > > > > > >
>>>>>>>>> > > > > > > > >On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
>>>>>>>>>>> > > > > > > > > > >On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
>>>>>>>>>>>>> > > > > > > > > > > > >Backlog were used for tuntap rx, but it can only process 1 packet at
>>>>>>>>>>>>> > > > > > > > > > > > >one time since it was scheduled during sendmsg() synchronously in
>>>>>>>>>>>>> > > > > > > > > > > > >process context. This lead bad cache utilization so this patch tries
>>>>>>>>>>>>> > > > > > > > > > > > >to do some batching before call rx NAPI. This is done through:
>>>>>>>>>>>>> > > > > > > > > > > > >
>>>>>>>>>>>>> > > > > > > > > > > > >- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>>>>>>>>>>>>> > > > > > > > > > > > >     batch the packet temporarily in a linked list and submit them all
>>>>>>>>>>>>> > > > > > > > > > > > >     once MSG_MORE were cleared.
>>>>>>>>>>>>> > > > > > > > > > > > >- implement a tuntap specific NAPI handler for processing this kind of
>>>>>>>>>>>>> > > > > > > > > > > > >     possible batching. (This could be done by extending backlog to
>>>>>>>>>>>>> > > > > > > > > > > > >     support skb like, but using a tun specific one looks cleaner and
>>>>>>>>>>>>> > > > > > > > > > > > >     easier for future extension).
>>>>>>>>>>>>> > > > > > > > > > > > >
>>>>>>>>>>>>> > > > > > > > > > > > >Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>>>>>>>> > > > > > > > > > >So why do we need an extra queue?
>>>>>>>>> > > > > > > > >The idea was borrowed from backlog to allow some kind of bulking and avoid
>>>>>>>>> > > > > > > > >spinlock on each dequeuing.
>>>>>>>>> > > > > > > > >
>>>>>>>>>>> > > > > > > > > > >    This is not what hardware devices do.
>>>>>>>>>>> > > > > > > > > > >How about adding the packet to queue unconditionally, deferring
>>>>>>>>>>> > > > > > > > > > >signalling until we get sendmsg without MSG_MORE?
>>>>>>>>> > > > > > > > >Then you need touch spinlock when dequeuing each packet.
>>>>> > > > >Random thought, I have a cmpxchg ring I am using for the qdisc work that
>>>>> > > > >could possibly replace the spinlock implementation. I haven't figured
>>>>> > > > >out the resizing API yet because I did not need it but I assume it could
>>>>> > > > >help here and let you dequeue multiple skbs in one operation.
>>>>> > > > >
>>>>> > > > >I can post the latest version if useful or an older version is
>>>>> > > > >somewhere on patchworks as well.
>>>>> > > > >
>>>>> > > > >.John
>>>>> > > > >
>>>>> > > > >
>>>> > > >Look useful here, and I can compare the performance if you post.
>>>> > > >
>>>> > > >A question is can we extend the skb_array to support that?
>>>> > > >
>>>> > > >Thanks
>>> > >I'd like to start with simple patch adding napi with one queue, then add
>>> > >optimization patches on top.
>> >
>> >The point is tun is using backlog who uses two queues (process_queue and
>> >input_pkt_queue).
>> >
>> >How about something like:
>> >
>> >1) NAPI support with skb_array
> I would start with just write queue linked list. It all runs on a single
> CPU normally,

True for virt but I'm not sure the others. If we have multiple senders 
at the same time, current code scales very well.

>   so the nice reductions of cache line bounces due to skb
> array should never materialize.
>
> While we are at it, limiting the size of the queue might
> be a good idea. Kind of like TUNSETSNDBUF but 1. actually
> working where instead of tracking packets within net stack
> we make sndbuf track the internal buffer

Get your point, will start from simple skb list.

Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] vhost: better detection of available buffers
  2016-11-15  8:00                 ` Jason Wang
@ 2016-11-15 14:46                   ` Michael S. Tsirkin
  0 siblings, 0 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2016-11-15 14:46 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel

On Tue, Nov 15, 2016 at 04:00:21PM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月15日 11:28, Michael S. Tsirkin wrote:
> > On Tue, Nov 15, 2016 at 11:16:59AM +0800, Jason Wang wrote:
> > > 
> > > On 2016年11月12日 00:20, Michael S. Tsirkin wrote:
> > > > On Fri, Nov 11, 2016 at 12:18:50PM +0800, Jason Wang wrote:
> > > > > On 2016年11月11日 11:41, Michael S. Tsirkin wrote:
> > > > > > On Fri, Nov 11, 2016 at 10:18:37AM +0800, Jason Wang wrote:
> > > > > > > > On 2016年11月10日 03:57, Michael S. Tsirkin wrote:
> > > > > > > > > > On Wed, Nov 09, 2016 at 03:38:32PM +0800, Jason Wang wrote:
> > > > > > > > > > > > We should use vq->last_avail_idx instead of vq->avail_idx in the
> > > > > > > > > > > > checking of vhost_vq_avail_empty() since latter is the cached avail
> > > > > > > > > > > > index from guest but we want to know if there's pending available
> > > > > > > > > > > > buffers in the virtqueue.
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > > > > > > > I'm not sure why is this patch here. Is it related to
> > > > > > > > > > batching somehow?
> > > > > > > > Yes, we need to know whether or not there's still buffers left in the
> > > > > > > > virtqueue, so need to check last_avail_idx. Otherwise, we're checking if
> > > > > > > > guest has submitted new buffers.
> > > > > > > > 
> > > > > > > > > > > > ---
> > > > > > > > > > > >     drivers/vhost/vhost.c | 2 +-
> > > > > > > > > > > >     1 file changed, 1 insertion(+), 1 deletion(-)
> > > > > > > > > > > > 
> > > > > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > > > > > > index c6f2d89..fdf4cdf 100644
> > > > > > > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > > > > > > @@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> > > > > > > > > > > >     	if (r)
> > > > > > > > > > > >     		return false;
> > > > > > > > > > > > -	return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
> > > > > > > > > > > > +	return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
> > > > > > > > > > > >     }
> > > > > > > > > > > >     EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
> > > > > > > > > > That might be OK for TX but it's probably wrong for RX
> > > > > > > > > > where the fact that used != avail does not mean
> > > > > > > > > > we have enough space to store the packet.
> > > > > > > > Right, but it's no harm since it was just a hint, handle_rx() can handle
> > > > > > > > this situation.
> > > > > > Means busy polling will cause useless load on the CPU though.
> > > > > > 
> > > > > Right, but,it's not easy to have 100% correct hint here. Needs more thought.
> > > > What's wrong with what we have? It polls until value changes.
> > > > 
> > > But as you said, this does not mean (in mergeable cases) we have enough
> > > space to store the packet.
> > Absolutely but it checks once and then only re-checks after value
> > changes again.
> > 
> 
> Since get_rx_bufs() does not get enough buffers, we will wait for the kick
> in this case. For busy polling, we probably want to stay in the busy loop
> here.

That's what I'm saying. You don't want to re-poll the queue
if available idx was unchanged.

-- 
MST

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2016-11-15 14:47 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-09  7:38 [PATCH 1/3] tuntap: rx batching Jason Wang
2016-11-09  7:38 ` [PATCH 2/3] vhost: better detection of available buffers Jason Wang
2016-11-09 19:57   ` Michael S. Tsirkin
2016-11-11  2:18     ` Jason Wang
2016-11-11  3:41       ` Michael S. Tsirkin
2016-11-11  4:18         ` Jason Wang
2016-11-11 16:20           ` Michael S. Tsirkin
2016-11-15  3:16             ` Jason Wang
2016-11-15  3:28               ` Michael S. Tsirkin
2016-11-15  8:00                 ` Jason Wang
2016-11-15 14:46                   ` Michael S. Tsirkin
2016-11-09  7:38 ` [PATCH 3/3] vhost_net: tx support batching Jason Wang
2016-11-09 20:05   ` Michael S. Tsirkin
2016-11-11  2:27     ` Jason Wang
2016-11-09 16:38 ` [PATCH 1/3] tuntap: rx batching Michael S. Tsirkin
2016-11-11  2:07   ` Jason Wang
2016-11-11  3:31     ` Michael S. Tsirkin
2016-11-11  4:10       ` Jason Wang
2016-11-11  4:17       ` John Fastabend
2016-11-11  4:28         ` Jason Wang
2016-11-11  4:45           ` John Fastabend
2016-11-11 16:20           ` Michael S. Tsirkin
2016-11-15  3:14             ` Jason Wang
2016-11-15  3:41               ` Michael S. Tsirkin
2016-11-15  8:08                 ` Jason Wang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).