[net-next,RFC,5/5] vhost_net: basic tx virtqueue batched processing
diff mbox series

Message ID 1506067355-5771-6-git-send-email-jasowang@redhat.com
State New, archived
Headers show
Series
  • batched tx processing in vhost_net
Related show

Commit Message

Jason Wang Sept. 22, 2017, 8:02 a.m. UTC
This patch implements basic batched processing of tx virtqueue by
prefetching desc indices and updating used ring in a batch. For
non-zerocopy case, vq->heads were used for storing the prefetched
indices and updating used ring. It is also a requirement for doing
more batching on top. For zerocopy case and for simplicity, batched
processing were simply disabled by only fetching and processing one
descriptor at a time, this could be optimized in the future.

XDP_DROP (without touching skb) on tun (with Moongen in guest) with
zercopy disabled:

Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
Before: 3.20Mpps
After:  3.90Mpps (+22%)

No differences were seen with zerocopy enabled.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 215 ++++++++++++++++++++++++++++----------------------
 drivers/vhost/vhost.c |   2 +-
 2 files changed, 121 insertions(+), 96 deletions(-)

Comments

Michael S. Tsirkin Sept. 26, 2017, 7:25 p.m. UTC | #1
On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
> This patch implements basic batched processing of tx virtqueue by
> prefetching desc indices and updating used ring in a batch. For
> non-zerocopy case, vq->heads were used for storing the prefetched
> indices and updating used ring. It is also a requirement for doing
> more batching on top. For zerocopy case and for simplicity, batched
> processing were simply disabled by only fetching and processing one
> descriptor at a time, this could be optimized in the future.
> 
> XDP_DROP (without touching skb) on tun (with Moongen in guest) with
> zercopy disabled:
> 
> Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
> Before: 3.20Mpps
> After:  3.90Mpps (+22%)
> 
> No differences were seen with zerocopy enabled.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

So where is the speedup coming from? I'd guess the ring is
hot in cache, it's faster to access it in one go, then
pass many packets to net stack. Is that right?

Another possibility is better code cache locality.

So how about this patchset is refactored:

1. use existing APIs just first get packets then
   transmit them all then use them all
2. add new APIs and move the loop into vhost core
   for more speedups



> ---
>  drivers/vhost/net.c   | 215 ++++++++++++++++++++++++++++----------------------
>  drivers/vhost/vhost.c |   2 +-
>  2 files changed, 121 insertions(+), 96 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c89640e..c439892 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -408,27 +408,25 @@ static int vhost_net_enable_vq(struct vhost_net *n,
>  	return vhost_poll_start(poll, sock->file);
>  }
>  
> -static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> -				    struct vhost_virtqueue *vq,
> -				    struct iovec iov[], unsigned int iov_size,
> -				    unsigned int *out_num, unsigned int *in_num)
> +static bool vhost_net_tx_avail(struct vhost_net *net,
> +			       struct vhost_virtqueue *vq)
>  {
>  	unsigned long uninitialized_var(endtime);
> -	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> -				  out_num, in_num, NULL, NULL);
>  
> -	if (r == vq->num && vq->busyloop_timeout) {
> -		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> -		while (vhost_can_busy_poll(vq->dev, endtime) &&
> -		       vhost_vq_avail_empty(vq->dev, vq))
> -			cpu_relax();
> -		preempt_enable();
> -		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> -				      out_num, in_num, NULL, NULL);
> -	}
> +	if (!vq->busyloop_timeout)
> +		return false;
>  
> -	return r;
> +	if (!vhost_vq_avail_empty(vq->dev, vq))
> +		return true;
> +
> +	preempt_disable();
> +	endtime = busy_clock() + vq->busyloop_timeout;
> +	while (vhost_can_busy_poll(vq->dev, endtime) &&
> +		vhost_vq_avail_empty(vq->dev, vq))
> +		cpu_relax();
> +	preempt_enable();
> +
> +	return !vhost_vq_avail_empty(vq->dev, vq);
>  }
>  
>  static bool vhost_exceeds_maxpend(struct vhost_net *net)
> @@ -446,8 +444,9 @@ static void handle_tx(struct vhost_net *net)
>  {
>  	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>  	struct vhost_virtqueue *vq = &nvq->vq;
> +	struct vring_used_elem used, *heads = vq->heads;
>  	unsigned out, in;
> -	int head;
> +	int avails, head;
>  	struct msghdr msg = {
>  		.msg_name = NULL,
>  		.msg_namelen = 0,
> @@ -461,6 +460,7 @@ static void handle_tx(struct vhost_net *net)
>  	struct socket *sock;
>  	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
>  	bool zcopy, zcopy_used;
> +	int i, batched = VHOST_NET_BATCH;
>  
>  	mutex_lock(&vq->mutex);
>  	sock = vq->private_data;
> @@ -475,6 +475,12 @@ static void handle_tx(struct vhost_net *net)
>  	hdr_size = nvq->vhost_hlen;
>  	zcopy = nvq->ubufs;
>  
> +	/* Disable zerocopy batched fetching for simplicity */
> +	if (zcopy) {
> +		heads = &used;
> +		batched = 1;
> +	}
> +
>  	for (;;) {
>  		/* Release DMAs done buffers first */
>  		if (zcopy)
> @@ -486,95 +492,114 @@ static void handle_tx(struct vhost_net *net)
>  		if (unlikely(vhost_exceeds_maxpend(net)))
>  			break;
>  
> -		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
> -						ARRAY_SIZE(vq->iov),
> -						&out, &in);
> +		avails = vhost_prefetch_desc_indices(vq, heads, batched, !zcopy);
>  		/* On error, stop handling until the next kick. */
> -		if (unlikely(head < 0))
> +		if (unlikely(avails < 0))
>  			break;
> -		/* Nothing new?  Wait for eventfd to tell us they refilled. */
> -		if (head == vq->num) {
> +		/* Nothing new?  Busy poll for a while or wait for
> +		 * eventfd to tell us they refilled. */
> +		if (!avails) {
> +			if (vhost_net_tx_avail(net, vq))
> +				continue;
>  			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>  				vhost_disable_notify(&net->dev, vq);
>  				continue;
>  			}
>  			break;
>  		}
> -		if (in) {
> -			vq_err(vq, "Unexpected descriptor format for TX: "
> -			       "out %d, int %d\n", out, in);
> -			break;
> -		}
> -		/* Skip header. TODO: support TSO. */
> -		len = iov_length(vq->iov, out);
> -		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
> -		iov_iter_advance(&msg.msg_iter, hdr_size);
> -		/* Sanity check */
> -		if (!msg_data_left(&msg)) {
> -			vq_err(vq, "Unexpected header len for TX: "
> -			       "%zd expected %zd\n",
> -			       len, hdr_size);
> -			break;
> -		}
> -		len = msg_data_left(&msg);
> -
> -		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
> -				   && (nvq->upend_idx + 1) % UIO_MAXIOV !=
> -				      nvq->done_idx
> -				   && vhost_net_tx_select_zcopy(net);
> -
> -		/* use msg_control to pass vhost zerocopy ubuf info to skb */
> -		if (zcopy_used) {
> -			struct ubuf_info *ubuf;
> -			ubuf = nvq->ubuf_info + nvq->upend_idx;
> -
> -			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
> -			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
> -			ubuf->callback = vhost_zerocopy_callback;
> -			ubuf->ctx = nvq->ubufs;
> -			ubuf->desc = nvq->upend_idx;
> -			refcount_set(&ubuf->refcnt, 1);
> -			msg.msg_control = ubuf;
> -			msg.msg_controllen = sizeof(ubuf);
> -			ubufs = nvq->ubufs;
> -			atomic_inc(&ubufs->refcount);
> -			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
> -		} else {
> -			msg.msg_control = NULL;
> -			ubufs = NULL;
> -		}
> +		for (i = 0; i < avails; i++) {
> +			head = __vhost_get_vq_desc(vq, vq->iov,
> +						   ARRAY_SIZE(vq->iov),
> +						   &out, &in, NULL, NULL,
> +					       vhost16_to_cpu(vq, heads[i].id));
> +			if (in) {
> +				vq_err(vq, "Unexpected descriptor format for "
> +					   "TX: out %d, int %d\n", out, in);
> +				goto out;
> +			}
>  
> -		total_len += len;
> -		if (total_len < VHOST_NET_WEIGHT &&
> -		    !vhost_vq_avail_empty(&net->dev, vq) &&
> -		    likely(!vhost_exceeds_maxpend(net))) {
> -			msg.msg_flags |= MSG_MORE;
> -		} else {
> -			msg.msg_flags &= ~MSG_MORE;
> -		}
> +			/* Skip header. TODO: support TSO. */
> +			len = iov_length(vq->iov, out);
> +			iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
> +			iov_iter_advance(&msg.msg_iter, hdr_size);
> +			/* Sanity check */
> +			if (!msg_data_left(&msg)) {
> +				vq_err(vq, "Unexpected header len for TX: "
> +					"%zd expected %zd\n",
> +					len, hdr_size);
> +				goto out;
> +			}
> +			len = msg_data_left(&msg);
>  
> -		/* TODO: Check specific error and bomb out unless ENOBUFS? */
> -		err = sock->ops->sendmsg(sock, &msg, len);
> -		if (unlikely(err < 0)) {
> +			zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
> +				     && (nvq->upend_idx + 1) % UIO_MAXIOV !=
> +					nvq->done_idx
> +				     && vhost_net_tx_select_zcopy(net);
> +
> +			/* use msg_control to pass vhost zerocopy ubuf
> +			 * info to skb
> +			 */
>  			if (zcopy_used) {
> -				vhost_net_ubuf_put(ubufs);
> -				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
> -					% UIO_MAXIOV;
> +				struct ubuf_info *ubuf;
> +				ubuf = nvq->ubuf_info + nvq->upend_idx;
> +
> +				vq->heads[nvq->upend_idx].id =
> +					cpu_to_vhost32(vq, head);
> +				vq->heads[nvq->upend_idx].len =
> +					VHOST_DMA_IN_PROGRESS;
> +				ubuf->callback = vhost_zerocopy_callback;
> +				ubuf->ctx = nvq->ubufs;
> +				ubuf->desc = nvq->upend_idx;
> +				refcount_set(&ubuf->refcnt, 1);
> +				msg.msg_control = ubuf;
> +				msg.msg_controllen = sizeof(ubuf);
> +				ubufs = nvq->ubufs;
> +				atomic_inc(&ubufs->refcount);
> +				nvq->upend_idx =
> +					(nvq->upend_idx + 1) % UIO_MAXIOV;
> +			} else {
> +				msg.msg_control = NULL;
> +				ubufs = NULL;
> +			}
> +
> +			total_len += len;
> +			if (total_len < VHOST_NET_WEIGHT &&
> +				!vhost_vq_avail_empty(&net->dev, vq) &&
> +				likely(!vhost_exceeds_maxpend(net))) {
> +				msg.msg_flags |= MSG_MORE;
> +			} else {
> +				msg.msg_flags &= ~MSG_MORE;
> +			}
> +
> +			/* TODO: Check specific error and bomb out
> +			 * unless ENOBUFS?
> +			 */
> +			err = sock->ops->sendmsg(sock, &msg, len);
> +			if (unlikely(err < 0)) {
> +				if (zcopy_used) {
> +					vhost_net_ubuf_put(ubufs);
> +					nvq->upend_idx =
> +				   ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV;
> +				}
> +				vhost_discard_vq_desc(vq, 1);
> +				goto out;
> +			}
> +			if (err != len)
> +				pr_debug("Truncated TX packet: "
> +					" len %d != %zd\n", err, len);
> +			if (!zcopy) {
> +				vhost_add_used_idx(vq, 1);
> +				vhost_signal(&net->dev, vq);
> +			} else if (!zcopy_used) {
> +				vhost_add_used_and_signal(&net->dev,
> +							  vq, head, 0);
> +			} else
> +				vhost_zerocopy_signal_used(net, vq);
> +			vhost_net_tx_packet(net);
> +			if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> +				vhost_poll_queue(&vq->poll);
> +				goto out;
>  			}
> -			vhost_discard_vq_desc(vq, 1);
> -			break;
> -		}
> -		if (err != len)
> -			pr_debug("Truncated TX packet: "
> -				 " len %d != %zd\n", err, len);
> -		if (!zcopy_used)
> -			vhost_add_used_and_signal(&net->dev, vq, head, 0);
> -		else
> -			vhost_zerocopy_signal_used(net, vq);
> -		vhost_net_tx_packet(net);
> -		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> -			vhost_poll_queue(&vq->poll);
> -			break;
>  		}
>  	}
>  out:
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 6532cda..8764df5 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -392,7 +392,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
>  		vq->indirect = kmalloc(sizeof *vq->indirect * UIO_MAXIOV,
>  				       GFP_KERNEL);
>  		vq->log = kmalloc(sizeof *vq->log * UIO_MAXIOV, GFP_KERNEL);
> -		vq->heads = kmalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
> +		vq->heads = kzalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
>  		if (!vq->indirect || !vq->log || !vq->heads)
>  			goto err_nomem;
>  	}
> -- 
> 2.7.4
Jason Wang Sept. 27, 2017, 2:04 a.m. UTC | #2
On 2017年09月27日 03:25, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
>> This patch implements basic batched processing of tx virtqueue by
>> prefetching desc indices and updating used ring in a batch. For
>> non-zerocopy case, vq->heads were used for storing the prefetched
>> indices and updating used ring. It is also a requirement for doing
>> more batching on top. For zerocopy case and for simplicity, batched
>> processing were simply disabled by only fetching and processing one
>> descriptor at a time, this could be optimized in the future.
>>
>> XDP_DROP (without touching skb) on tun (with Moongen in guest) with
>> zercopy disabled:
>>
>> Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
>> Before: 3.20Mpps
>> After:  3.90Mpps (+22%)
>>
>> No differences were seen with zerocopy enabled.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> So where is the speedup coming from? I'd guess the ring is
> hot in cache, it's faster to access it in one go, then
> pass many packets to net stack. Is that right?
>
> Another possibility is better code cache locality.

Yes, I think the speed up comes from:

- less cache misses
- less cache line bounce when virtqueue is about to be full (guest is 
faster than host which is the case of MoonGen)
- less memory barriers
- possible faster copy speed by using copy_to_user() on modern CPUs

>
> So how about this patchset is refactored:
>
> 1. use existing APIs just first get packets then
>     transmit them all then use them all

Looks like current API can not get packets first, it only support get 
packet one by one (if you mean vhost_get_vq_desc()). And used ring 
updating may get more misses in this case.

> 2. add new APIs and move the loop into vhost core
>     for more speedups

I don't see any advantages, looks like just need some e.g callbacks in 
this case.

Thanks
Michael S. Tsirkin Sept. 27, 2017, 10:19 p.m. UTC | #3
On Wed, Sep 27, 2017 at 10:04:18AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月27日 03:25, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
> > > This patch implements basic batched processing of tx virtqueue by
> > > prefetching desc indices and updating used ring in a batch. For
> > > non-zerocopy case, vq->heads were used for storing the prefetched
> > > indices and updating used ring. It is also a requirement for doing
> > > more batching on top. For zerocopy case and for simplicity, batched
> > > processing were simply disabled by only fetching and processing one
> > > descriptor at a time, this could be optimized in the future.
> > > 
> > > XDP_DROP (without touching skb) on tun (with Moongen in guest) with
> > > zercopy disabled:
> > > 
> > > Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
> > > Before: 3.20Mpps
> > > After:  3.90Mpps (+22%)
> > > 
> > > No differences were seen with zerocopy enabled.
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > So where is the speedup coming from? I'd guess the ring is
> > hot in cache, it's faster to access it in one go, then
> > pass many packets to net stack. Is that right?
> > 
> > Another possibility is better code cache locality.
> 
> Yes, I think the speed up comes from:
> 
> - less cache misses
> - less cache line bounce when virtqueue is about to be full (guest is faster
> than host which is the case of MoonGen)
> - less memory barriers
> - possible faster copy speed by using copy_to_user() on modern CPUs
> 
> > 
> > So how about this patchset is refactored:
> > 
> > 1. use existing APIs just first get packets then
> >     transmit them all then use them all
> 
> Looks like current API can not get packets first, it only support get packet
> one by one (if you mean vhost_get_vq_desc()). And used ring updating may get
> more misses in this case.

Right. So if you do

for (...)
	vhost_get_vq_desc


then later

for (...)
	vhost_add_used


then you get most of benefits except maybe code cache misses
and copy_to_user.







> > 2. add new APIs and move the loop into vhost core
> >     for more speedups
> 
> I don't see any advantages, looks like just need some e.g callbacks in this
> case.
> 
> Thanks

IUC callbacks pretty much destroy the code cache locality advantages,
IP is jumping around too much.
Willem de Bruijn Sept. 28, 2017, 12:55 a.m. UTC | #4
> @@ -461,6 +460,7 @@ static void handle_tx(struct vhost_net *net)
>         struct socket *sock;
>         struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
>         bool zcopy, zcopy_used;
> +       int i, batched = VHOST_NET_BATCH;
>
>         mutex_lock(&vq->mutex);
>         sock = vq->private_data;
> @@ -475,6 +475,12 @@ static void handle_tx(struct vhost_net *net)
>         hdr_size = nvq->vhost_hlen;
>         zcopy = nvq->ubufs;
>
> +       /* Disable zerocopy batched fetching for simplicity */

This special case can perhaps be avoided if we no longer block
on vhost_exceeds_maxpend, but revert to copying.

> +       if (zcopy) {
> +               heads = &used;

Can this special case of batchsize 1 not use vq->heads?

> +               batched = 1;
> +       }
> +
>         for (;;) {
>                 /* Release DMAs done buffers first */
>                 if (zcopy)
> @@ -486,95 +492,114 @@ static void handle_tx(struct vhost_net *net)
>                 if (unlikely(vhost_exceeds_maxpend(net)))
>                         break;

> +                       /* TODO: Check specific error and bomb out
> +                        * unless ENOBUFS?
> +                        */
> +                       err = sock->ops->sendmsg(sock, &msg, len);
> +                       if (unlikely(err < 0)) {
> +                               if (zcopy_used) {
> +                                       vhost_net_ubuf_put(ubufs);
> +                                       nvq->upend_idx =
> +                                  ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV;
> +                               }
> +                               vhost_discard_vq_desc(vq, 1);
> +                               goto out;
> +                       }
> +                       if (err != len)
> +                               pr_debug("Truncated TX packet: "
> +                                       " len %d != %zd\n", err, len);
> +                       if (!zcopy) {
> +                               vhost_add_used_idx(vq, 1);
> +                               vhost_signal(&net->dev, vq);
> +                       } else if (!zcopy_used) {
> +                               vhost_add_used_and_signal(&net->dev,
> +                                                         vq, head, 0);

While batching, perhaps can also move this producer index update
out of the loop and using vhost_add_used_and_signal_n.

> +                       } else
> +                               vhost_zerocopy_signal_used(net, vq);
> +                       vhost_net_tx_packet(net);
> +                       if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> +                               vhost_poll_queue(&vq->poll);
> +                               goto out;
>                         }
> -                       vhost_discard_vq_desc(vq, 1);
> -                       break;
> -               }
> -               if (err != len)
> -                       pr_debug("Truncated TX packet: "
> -                                " len %d != %zd\n", err, len);
> -               if (!zcopy_used)
> -                       vhost_add_used_and_signal(&net->dev, vq, head, 0);
> -               else
> -                       vhost_zerocopy_signal_used(net, vq);
> -               vhost_net_tx_packet(net);
> -               if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> -                       vhost_poll_queue(&vq->poll);
> -                       break;

This patch touches many lines just for indentation. If having to touch
these lines anyway (dirtying git blame), it may be a good time to move
the processing of a single descriptor code into a separate helper function.
And while breaking up, perhaps another helper for setting up ubuf_info.
If you agree, preferably in a separate noop refactor patch that precedes
the functional changes.
Jason Wang Sept. 28, 2017, 7:02 a.m. UTC | #5
On 2017年09月28日 06:19, Michael S. Tsirkin wrote:
> On Wed, Sep 27, 2017 at 10:04:18AM +0800, Jason Wang wrote:
>>
>> On 2017年09月27日 03:25, Michael S. Tsirkin wrote:
>>> On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
>>>> This patch implements basic batched processing of tx virtqueue by
>>>> prefetching desc indices and updating used ring in a batch. For
>>>> non-zerocopy case, vq->heads were used for storing the prefetched
>>>> indices and updating used ring. It is also a requirement for doing
>>>> more batching on top. For zerocopy case and for simplicity, batched
>>>> processing were simply disabled by only fetching and processing one
>>>> descriptor at a time, this could be optimized in the future.
>>>>
>>>> XDP_DROP (without touching skb) on tun (with Moongen in guest) with
>>>> zercopy disabled:
>>>>
>>>> Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
>>>> Before: 3.20Mpps
>>>> After:  3.90Mpps (+22%)
>>>>
>>>> No differences were seen with zerocopy enabled.
>>>>
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>> So where is the speedup coming from? I'd guess the ring is
>>> hot in cache, it's faster to access it in one go, then
>>> pass many packets to net stack. Is that right?
>>>
>>> Another possibility is better code cache locality.
>> Yes, I think the speed up comes from:
>>
>> - less cache misses
>> - less cache line bounce when virtqueue is about to be full (guest is faster
>> than host which is the case of MoonGen)
>> - less memory barriers
>> - possible faster copy speed by using copy_to_user() on modern CPUs
>>
>>> So how about this patchset is refactored:
>>>
>>> 1. use existing APIs just first get packets then
>>>      transmit them all then use them all
>> Looks like current API can not get packets first, it only support get packet
>> one by one (if you mean vhost_get_vq_desc()). And used ring updating may get
>> more misses in this case.
> Right. So if you do
>
> for (...)
> 	vhost_get_vq_desc
>
>
> then later
>
> for (...)
> 	vhost_add_used
>
>
> then you get most of benefits except maybe code cache misses
> and copy_to_user.
>

If you mean vhost_add_used_n(), then more barriers and userspace memory 
access as well. And you still need to cache vring_used_elem in vq->heads 
or elsewhere. Since you do not batch reading avail ring indexes, more 
write miss will happen if the ring is about to be full. So it looks 
slower than what is done in this patch?

And if we want to indo more batching on top (do we want this?), we still 
need new APIs anyway.

Thanks

>
>
>
>
>
>>> 2. add new APIs and move the loop into vhost core
>>>      for more speedups
>> I don't see any advantages, looks like just need some e.g callbacks in this
>> case.
>>
>> Thanks
> IUC callbacks pretty much destroy the code cache locality advantages,
> IP is jumping around too much.
>
>
Jason Wang Sept. 28, 2017, 7:50 a.m. UTC | #6
On 2017年09月28日 08:55, Willem de Bruijn wrote:
>> @@ -461,6 +460,7 @@ static void handle_tx(struct vhost_net *net)
>>          struct socket *sock;
>>          struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
>>          bool zcopy, zcopy_used;
>> +       int i, batched = VHOST_NET_BATCH;
>>
>>          mutex_lock(&vq->mutex);
>>          sock = vq->private_data;
>> @@ -475,6 +475,12 @@ static void handle_tx(struct vhost_net *net)
>>          hdr_size = nvq->vhost_hlen;
>>          zcopy = nvq->ubufs;
>>
>> +       /* Disable zerocopy batched fetching for simplicity */
> This special case can perhaps be avoided if we no longer block
> on vhost_exceeds_maxpend, but revert to copying.

Yes, I think so. For simplicity, I do it for data copy first. If the 
idea is convinced, I will try to do zerocopy on top.

>
>> +       if (zcopy) {
>> +               heads = &used;
> Can this special case of batchsize 1 not use vq->heads?

It doesn't in fact?

>
>> +               batched = 1;
>> +       }
>> +
>>          for (;;) {
>>                  /* Release DMAs done buffers first */
>>                  if (zcopy)
>> @@ -486,95 +492,114 @@ static void handle_tx(struct vhost_net *net)
>>                  if (unlikely(vhost_exceeds_maxpend(net)))
>>                          break;
>> +                       /* TODO: Check specific error and bomb out
>> +                        * unless ENOBUFS?
>> +                        */
>> +                       err = sock->ops->sendmsg(sock, &msg, len);
>> +                       if (unlikely(err < 0)) {
>> +                               if (zcopy_used) {
>> +                                       vhost_net_ubuf_put(ubufs);
>> +                                       nvq->upend_idx =
>> +                                  ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV;
>> +                               }
>> +                               vhost_discard_vq_desc(vq, 1);
>> +                               goto out;
>> +                       }
>> +                       if (err != len)
>> +                               pr_debug("Truncated TX packet: "
>> +                                       " len %d != %zd\n", err, len);
>> +                       if (!zcopy) {
>> +                               vhost_add_used_idx(vq, 1);
>> +                               vhost_signal(&net->dev, vq);
>> +                       } else if (!zcopy_used) {
>> +                               vhost_add_used_and_signal(&net->dev,
>> +                                                         vq, head, 0);
> While batching, perhaps can also move this producer index update
> out of the loop and using vhost_add_used_and_signal_n.

Yes.

>
>> +                       } else
>> +                               vhost_zerocopy_signal_used(net, vq);
>> +                       vhost_net_tx_packet(net);
>> +                       if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>> +                               vhost_poll_queue(&vq->poll);
>> +                               goto out;
>>                          }
>> -                       vhost_discard_vq_desc(vq, 1);
>> -                       break;
>> -               }
>> -               if (err != len)
>> -                       pr_debug("Truncated TX packet: "
>> -                                " len %d != %zd\n", err, len);
>> -               if (!zcopy_used)
>> -                       vhost_add_used_and_signal(&net->dev, vq, head, 0);
>> -               else
>> -                       vhost_zerocopy_signal_used(net, vq);
>> -               vhost_net_tx_packet(net);
>> -               if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>> -                       vhost_poll_queue(&vq->poll);
>> -                       break;
> This patch touches many lines just for indentation. If having to touch
> these lines anyway (dirtying git blame), it may be a good time to move
> the processing of a single descriptor code into a separate helper function.
> And while breaking up, perhaps another helper for setting up ubuf_info.
> If you agree, preferably in a separate noop refactor patch that precedes
> the functional changes.

Right and it looks better, will try to do this.

Thanks
Jason Wang Sept. 28, 2017, 7:52 a.m. UTC | #7
On 2017年09月28日 06:19, Michael S. Tsirkin wrote:
>
>>> 2. add new APIs and move the loop into vhost core
>>>      for more speedups
>> I don't see any advantages, looks like just need some e.g callbacks in this
>> case.
>>
>> Thanks
> IUC callbacks pretty much destroy the code cache locality advantages,
> IP is jumping around too much.

I may miss something, but we need net specific code anyway in this case?

Thanks

Patch
diff mbox series

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c89640e..c439892 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -408,27 +408,25 @@  static int vhost_net_enable_vq(struct vhost_net *n,
 	return vhost_poll_start(poll, sock->file);
 }
 
-static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
-				    struct vhost_virtqueue *vq,
-				    struct iovec iov[], unsigned int iov_size,
-				    unsigned int *out_num, unsigned int *in_num)
+static bool vhost_net_tx_avail(struct vhost_net *net,
+			       struct vhost_virtqueue *vq)
 {
 	unsigned long uninitialized_var(endtime);
-	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-				  out_num, in_num, NULL, NULL);
 
-	if (r == vq->num && vq->busyloop_timeout) {
-		preempt_disable();
-		endtime = busy_clock() + vq->busyloop_timeout;
-		while (vhost_can_busy_poll(vq->dev, endtime) &&
-		       vhost_vq_avail_empty(vq->dev, vq))
-			cpu_relax();
-		preempt_enable();
-		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-				      out_num, in_num, NULL, NULL);
-	}
+	if (!vq->busyloop_timeout)
+		return false;
 
-	return r;
+	if (!vhost_vq_avail_empty(vq->dev, vq))
+		return true;
+
+	preempt_disable();
+	endtime = busy_clock() + vq->busyloop_timeout;
+	while (vhost_can_busy_poll(vq->dev, endtime) &&
+		vhost_vq_avail_empty(vq->dev, vq))
+		cpu_relax();
+	preempt_enable();
+
+	return !vhost_vq_avail_empty(vq->dev, vq);
 }
 
 static bool vhost_exceeds_maxpend(struct vhost_net *net)
@@ -446,8 +444,9 @@  static void handle_tx(struct vhost_net *net)
 {
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
+	struct vring_used_elem used, *heads = vq->heads;
 	unsigned out, in;
-	int head;
+	int avails, head;
 	struct msghdr msg = {
 		.msg_name = NULL,
 		.msg_namelen = 0,
@@ -461,6 +460,7 @@  static void handle_tx(struct vhost_net *net)
 	struct socket *sock;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
 	bool zcopy, zcopy_used;
+	int i, batched = VHOST_NET_BATCH;
 
 	mutex_lock(&vq->mutex);
 	sock = vq->private_data;
@@ -475,6 +475,12 @@  static void handle_tx(struct vhost_net *net)
 	hdr_size = nvq->vhost_hlen;
 	zcopy = nvq->ubufs;
 
+	/* Disable zerocopy batched fetching for simplicity */
+	if (zcopy) {
+		heads = &used;
+		batched = 1;
+	}
+
 	for (;;) {
 		/* Release DMAs done buffers first */
 		if (zcopy)
@@ -486,95 +492,114 @@  static void handle_tx(struct vhost_net *net)
 		if (unlikely(vhost_exceeds_maxpend(net)))
 			break;
 
-		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
-						ARRAY_SIZE(vq->iov),
-						&out, &in);
+		avails = vhost_prefetch_desc_indices(vq, heads, batched, !zcopy);
 		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
+		if (unlikely(avails < 0))
 			break;
-		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		/* Nothing new?  Busy poll for a while or wait for
+		 * eventfd to tell us they refilled. */
+		if (!avails) {
+			if (vhost_net_tx_avail(net, vq))
+				continue;
 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				vhost_disable_notify(&net->dev, vq);
 				continue;
 			}
 			break;
 		}
-		if (in) {
-			vq_err(vq, "Unexpected descriptor format for TX: "
-			       "out %d, int %d\n", out, in);
-			break;
-		}
-		/* Skip header. TODO: support TSO. */
-		len = iov_length(vq->iov, out);
-		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
-		iov_iter_advance(&msg.msg_iter, hdr_size);
-		/* Sanity check */
-		if (!msg_data_left(&msg)) {
-			vq_err(vq, "Unexpected header len for TX: "
-			       "%zd expected %zd\n",
-			       len, hdr_size);
-			break;
-		}
-		len = msg_data_left(&msg);
-
-		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
-				   && (nvq->upend_idx + 1) % UIO_MAXIOV !=
-				      nvq->done_idx
-				   && vhost_net_tx_select_zcopy(net);
-
-		/* use msg_control to pass vhost zerocopy ubuf info to skb */
-		if (zcopy_used) {
-			struct ubuf_info *ubuf;
-			ubuf = nvq->ubuf_info + nvq->upend_idx;
-
-			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
-			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
-			ubuf->callback = vhost_zerocopy_callback;
-			ubuf->ctx = nvq->ubufs;
-			ubuf->desc = nvq->upend_idx;
-			refcount_set(&ubuf->refcnt, 1);
-			msg.msg_control = ubuf;
-			msg.msg_controllen = sizeof(ubuf);
-			ubufs = nvq->ubufs;
-			atomic_inc(&ubufs->refcount);
-			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
-		} else {
-			msg.msg_control = NULL;
-			ubufs = NULL;
-		}
+		for (i = 0; i < avails; i++) {
+			head = __vhost_get_vq_desc(vq, vq->iov,
+						   ARRAY_SIZE(vq->iov),
+						   &out, &in, NULL, NULL,
+					       vhost16_to_cpu(vq, heads[i].id));
+			if (in) {
+				vq_err(vq, "Unexpected descriptor format for "
+					   "TX: out %d, int %d\n", out, in);
+				goto out;
+			}
 
-		total_len += len;
-		if (total_len < VHOST_NET_WEIGHT &&
-		    !vhost_vq_avail_empty(&net->dev, vq) &&
-		    likely(!vhost_exceeds_maxpend(net))) {
-			msg.msg_flags |= MSG_MORE;
-		} else {
-			msg.msg_flags &= ~MSG_MORE;
-		}
+			/* Skip header. TODO: support TSO. */
+			len = iov_length(vq->iov, out);
+			iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
+			iov_iter_advance(&msg.msg_iter, hdr_size);
+			/* Sanity check */
+			if (!msg_data_left(&msg)) {
+				vq_err(vq, "Unexpected header len for TX: "
+					"%zd expected %zd\n",
+					len, hdr_size);
+				goto out;
+			}
+			len = msg_data_left(&msg);
 
-		/* TODO: Check specific error and bomb out unless ENOBUFS? */
-		err = sock->ops->sendmsg(sock, &msg, len);
-		if (unlikely(err < 0)) {
+			zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
+				     && (nvq->upend_idx + 1) % UIO_MAXIOV !=
+					nvq->done_idx
+				     && vhost_net_tx_select_zcopy(net);
+
+			/* use msg_control to pass vhost zerocopy ubuf
+			 * info to skb
+			 */
 			if (zcopy_used) {
-				vhost_net_ubuf_put(ubufs);
-				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
-					% UIO_MAXIOV;
+				struct ubuf_info *ubuf;
+				ubuf = nvq->ubuf_info + nvq->upend_idx;
+
+				vq->heads[nvq->upend_idx].id =
+					cpu_to_vhost32(vq, head);
+				vq->heads[nvq->upend_idx].len =
+					VHOST_DMA_IN_PROGRESS;
+				ubuf->callback = vhost_zerocopy_callback;
+				ubuf->ctx = nvq->ubufs;
+				ubuf->desc = nvq->upend_idx;
+				refcount_set(&ubuf->refcnt, 1);
+				msg.msg_control = ubuf;
+				msg.msg_controllen = sizeof(ubuf);
+				ubufs = nvq->ubufs;
+				atomic_inc(&ubufs->refcount);
+				nvq->upend_idx =
+					(nvq->upend_idx + 1) % UIO_MAXIOV;
+			} else {
+				msg.msg_control = NULL;
+				ubufs = NULL;
+			}
+
+			total_len += len;
+			if (total_len < VHOST_NET_WEIGHT &&
+				!vhost_vq_avail_empty(&net->dev, vq) &&
+				likely(!vhost_exceeds_maxpend(net))) {
+				msg.msg_flags |= MSG_MORE;
+			} else {
+				msg.msg_flags &= ~MSG_MORE;
+			}
+
+			/* TODO: Check specific error and bomb out
+			 * unless ENOBUFS?
+			 */
+			err = sock->ops->sendmsg(sock, &msg, len);
+			if (unlikely(err < 0)) {
+				if (zcopy_used) {
+					vhost_net_ubuf_put(ubufs);
+					nvq->upend_idx =
+				   ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV;
+				}
+				vhost_discard_vq_desc(vq, 1);
+				goto out;
+			}
+			if (err != len)
+				pr_debug("Truncated TX packet: "
+					" len %d != %zd\n", err, len);
+			if (!zcopy) {
+				vhost_add_used_idx(vq, 1);
+				vhost_signal(&net->dev, vq);
+			} else if (!zcopy_used) {
+				vhost_add_used_and_signal(&net->dev,
+							  vq, head, 0);
+			} else
+				vhost_zerocopy_signal_used(net, vq);
+			vhost_net_tx_packet(net);
+			if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
+				vhost_poll_queue(&vq->poll);
+				goto out;
 			}
-			vhost_discard_vq_desc(vq, 1);
-			break;
-		}
-		if (err != len)
-			pr_debug("Truncated TX packet: "
-				 " len %d != %zd\n", err, len);
-		if (!zcopy_used)
-			vhost_add_used_and_signal(&net->dev, vq, head, 0);
-		else
-			vhost_zerocopy_signal_used(net, vq);
-		vhost_net_tx_packet(net);
-		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
-			vhost_poll_queue(&vq->poll);
-			break;
 		}
 	}
 out:
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6532cda..8764df5 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -392,7 +392,7 @@  static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 		vq->indirect = kmalloc(sizeof *vq->indirect * UIO_MAXIOV,
 				       GFP_KERNEL);
 		vq->log = kmalloc(sizeof *vq->log * UIO_MAXIOV, GFP_KERNEL);
-		vq->heads = kmalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
+		vq->heads = kzalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
 		if (!vq->indirect || !vq->log || !vq->heads)
 			goto err_nomem;
 	}