[net-next,RFC,2/5] vhost: introduce helper to prefetch desc index
diff mbox series

Message ID 1506067355-5771-3-git-send-email-jasowang@redhat.com
State New, archived
Headers show
Series
  • batched tx processing in vhost_net
Related show

Commit Message

Jason Wang Sept. 22, 2017, 8:02 a.m. UTC
This patch introduces vhost_prefetch_desc_indices() which could batch
descriptor indices fetching and used ring updating. This intends to
reduce the cache misses of indices fetching and updating and reduce
cache line bounce when virtqueue is almost full. copy_to_user() was
used in order to benefit from modern cpus that support fast string
copy. Batched virtqueue processing will be the first user.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |  3 +++
 2 files changed, 58 insertions(+)

Comments

Stefan Hajnoczi Sept. 22, 2017, 9:02 a.m. UTC | #1
On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f87ec75..8424166d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>  }
>  EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>  
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +				struct vring_used_elem *heads,
> +				u16 num, bool used_update)

Missing doc comment.

> +{
> +	int ret, ret2;
> +	u16 last_avail_idx, last_used_idx, total, copied;
> +	__virtio16 avail_idx;
> +	struct vring_used_elem __user *used;
> +	int i;

The following variable names are a little confusing:

last_avail_idx vs vq->last_avail_idx.  last_avail_idx is a wrapped
avail->ring[] index, vq->last_avail_idx is a free-running counter.  The
same for last_used_idx vs vq->last_used_idx.

num argument vs vq->num.  The argument could be called nheads instead to
make it clear that this is heads[] and not the virtqueue size.

Not a bug but it took me a while to figure out what was going on.
Jason Wang Sept. 25, 2017, 2:04 a.m. UTC | #2
On 2017年09月22日 17:02, Stefan Hajnoczi wrote:
> On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index f87ec75..8424166d 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>   
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update)
> Missing doc comment.

Will fix this.

>
>> +{
>> +	int ret, ret2;
>> +	u16 last_avail_idx, last_used_idx, total, copied;
>> +	__virtio16 avail_idx;
>> +	struct vring_used_elem __user *used;
>> +	int i;
> The following variable names are a little confusing:
>
> last_avail_idx vs vq->last_avail_idx.  last_avail_idx is a wrapped
> avail->ring[] index, vq->last_avail_idx is a free-running counter.  The
> same for last_used_idx vs vq->last_used_idx.
>
> num argument vs vq->num.  The argument could be called nheads instead to
> make it clear that this is heads[] and not the virtqueue size.
>
> Not a bug but it took me a while to figure out what was going on.

I admit the name is confusing. Let me try better ones in V2.

Thanks
Michael S. Tsirkin Sept. 26, 2017, 7:19 p.m. UTC | #3
On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
> This patch introduces vhost_prefetch_desc_indices() which could batch
> descriptor indices fetching and used ring updating. This intends to
> reduce the cache misses of indices fetching and updating and reduce
> cache line bounce when virtqueue is almost full. copy_to_user() was
> used in order to benefit from modern cpus that support fast string
> copy. Batched virtqueue processing will be the first user.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  3 +++
>  2 files changed, 58 insertions(+)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f87ec75..8424166d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>  }
>  EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>  
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +				struct vring_used_elem *heads,
> +				u16 num, bool used_update)

why do you need to combine used update with prefetch?

> +{
> +	int ret, ret2;
> +	u16 last_avail_idx, last_used_idx, total, copied;
> +	__virtio16 avail_idx;
> +	struct vring_used_elem __user *used;
> +	int i;
> +
> +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
> +		vq_err(vq, "Failed to access avail idx at %p\n",
> +		       &vq->avail->idx);
> +		return -EFAULT;
> +	}
> +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
> +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> +	total = vq->avail_idx - vq->last_avail_idx;
> +	ret = total = min(total, num);
> +
> +	for (i = 0; i < ret; i++) {
> +		ret2 = vhost_get_avail(vq, heads[i].id,
> +				      &vq->avail->ring[last_avail_idx]);
> +		if (unlikely(ret2)) {
> +			vq_err(vq, "Failed to get descriptors\n");
> +			return -EFAULT;
> +		}
> +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
> +	}
> +
> +	if (!used_update)
> +		return ret;
> +
> +	last_used_idx = vq->last_used_idx & (vq->num - 1);
> +	while (total) {
> +		copied = min((u16)(vq->num - last_used_idx), total);
> +		ret2 = vhost_copy_to_user(vq,
> +					  &vq->used->ring[last_used_idx],
> +					  &heads[ret - total],
> +					  copied * sizeof(*used));
> +
> +		if (unlikely(ret2)) {
> +			vq_err(vq, "Failed to update used ring!\n");
> +			return -EFAULT;
> +		}
> +
> +		last_used_idx = 0;
> +		total -= copied;
> +	}
> +
> +	/* Only get avail ring entries after they have been exposed by guest. */
> +	smp_rmb();

Barrier before return is a very confusing API. I guess it's designed to
be used in a specific way to make it necessary - but what is it?


> +	return ret;
> +}
> +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
>  
>  static int __init vhost_init(void)
>  {
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 39ff897..16c2cb6 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
>  ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
>  			     struct iov_iter *from);
>  int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +				struct vring_used_elem *heads,
> +				u16 num, bool used_update);
>  
>  #define vq_err(vq, fmt, ...) do {                                  \
>  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> -- 
> 2.7.4
Jason Wang Sept. 27, 2017, 12:35 a.m. UTC | #4
On 2017年09月27日 03:19, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
>> This patch introduces vhost_prefetch_desc_indices() which could batch
>> descriptor indices fetching and used ring updating. This intends to
>> reduce the cache misses of indices fetching and updating and reduce
>> cache line bounce when virtqueue is almost full. copy_to_user() was
>> used in order to benefit from modern cpus that support fast string
>> copy. Batched virtqueue processing will be the first user.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>   drivers/vhost/vhost.h |  3 +++
>>   2 files changed, 58 insertions(+)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index f87ec75..8424166d 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>   
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update)
> why do you need to combine used update with prefetch?

For better performance, and I believe we don't care about the overhead 
when we hit errors in tx.

>
>> +{
>> +	int ret, ret2;
>> +	u16 last_avail_idx, last_used_idx, total, copied;
>> +	__virtio16 avail_idx;
>> +	struct vring_used_elem __user *used;
>> +	int i;
>> +
>> +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
>> +		vq_err(vq, "Failed to access avail idx at %p\n",
>> +		       &vq->avail->idx);
>> +		return -EFAULT;
>> +	}
>> +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
>> +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
>> +	total = vq->avail_idx - vq->last_avail_idx;
>> +	ret = total = min(total, num);
>> +
>> +	for (i = 0; i < ret; i++) {
>> +		ret2 = vhost_get_avail(vq, heads[i].id,
>> +				      &vq->avail->ring[last_avail_idx]);
>> +		if (unlikely(ret2)) {
>> +			vq_err(vq, "Failed to get descriptors\n");
>> +			return -EFAULT;
>> +		}
>> +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
>> +	}
>> +
>> +	if (!used_update)
>> +		return ret;
>> +
>> +	last_used_idx = vq->last_used_idx & (vq->num - 1);
>> +	while (total) {
>> +		copied = min((u16)(vq->num - last_used_idx), total);
>> +		ret2 = vhost_copy_to_user(vq,
>> +					  &vq->used->ring[last_used_idx],
>> +					  &heads[ret - total],
>> +					  copied * sizeof(*used));
>> +
>> +		if (unlikely(ret2)) {
>> +			vq_err(vq, "Failed to update used ring!\n");
>> +			return -EFAULT;
>> +		}
>> +
>> +		last_used_idx = 0;
>> +		total -= copied;
>> +	}
>> +
>> +	/* Only get avail ring entries after they have been exposed by guest. */
>> +	smp_rmb();
> Barrier before return is a very confusing API. I guess it's designed to
> be used in a specific way to make it necessary - but what is it?

Looks like a mistake — we need to do this after reading avail_idx.

Thanks

>
>
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
>>   
>>   static int __init vhost_init(void)
>>   {
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index 39ff897..16c2cb6 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
>>   ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
>>   			     struct iov_iter *from);
>>   int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update);
>>   
>>   #define vq_err(vq, fmt, ...) do {                                  \
>>   		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
>> -- 
>> 2.7.4
Michael S. Tsirkin Sept. 27, 2017, 10:57 p.m. UTC | #5
On Wed, Sep 27, 2017 at 08:35:47AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月27日 03:19, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
> > > This patch introduces vhost_prefetch_desc_indices() which could batch
> > > descriptor indices fetching and used ring updating. This intends to
> > > reduce the cache misses of indices fetching and updating and reduce
> > > cache line bounce when virtqueue is almost full. copy_to_user() was
> > > used in order to benefit from modern cpus that support fast string
> > > copy. Batched virtqueue processing will be the first user.
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > >   drivers/vhost/vhost.h |  3 +++
> > >   2 files changed, 58 insertions(+)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index f87ec75..8424166d 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
> > >   }
> > >   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
> > > +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> > > +				struct vring_used_elem *heads,
> > > +				u16 num, bool used_update)
> > why do you need to combine used update with prefetch?
> 
> For better performance


Why is sticking a branch in there better than requesting the update
conditionally from the caller?



> and I believe we don't care about the overhead when
> we meet errors in tx.

That's a separate question, I do not really understand how
you can fetch a descriptor and update the used ring at the same
time. This allows the guest to overwrite the buffer.
I might be misunderstanding what is going on here though.


> > 
> > > +{
> > > +	int ret, ret2;
> > > +	u16 last_avail_idx, last_used_idx, total, copied;
> > > +	__virtio16 avail_idx;
> > > +	struct vring_used_elem __user *used;
> > > +	int i;
> > > +
> > > +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
> > > +		vq_err(vq, "Failed to access avail idx at %p\n",
> > > +		       &vq->avail->idx);
> > > +		return -EFAULT;
> > > +	}
> > > +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
> > > +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> > > +	total = vq->avail_idx - vq->last_avail_idx;
> > > +	ret = total = min(total, num);
> > > +
> > > +	for (i = 0; i < ret; i++) {
> > > +		ret2 = vhost_get_avail(vq, heads[i].id,
> > > +				      &vq->avail->ring[last_avail_idx]);
> > > +		if (unlikely(ret2)) {
> > > +			vq_err(vq, "Failed to get descriptors\n");
> > > +			return -EFAULT;
> > > +		}
> > > +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
> > > +	}
> > > +
> > > +	if (!used_update)
> > > +		return ret;
> > > +
> > > +	last_used_idx = vq->last_used_idx & (vq->num - 1);
> > > +	while (total) {
> > > +		copied = min((u16)(vq->num - last_used_idx), total);
> > > +		ret2 = vhost_copy_to_user(vq,
> > > +					  &vq->used->ring[last_used_idx],
> > > +					  &heads[ret - total],
> > > +					  copied * sizeof(*used));
> > > +
> > > +		if (unlikely(ret2)) {
> > > +			vq_err(vq, "Failed to update used ring!\n");
> > > +			return -EFAULT;
> > > +		}
> > > +
> > > +		last_used_idx = 0;
> > > +		total -= copied;
> > > +	}
> > > +
> > > +	/* Only get avail ring entries after they have been exposed by guest. */
> > > +	smp_rmb();
> > Barrier before return is a very confusing API. I guess it's designed to
> > be used in a specific way to make it necessary - but what is it?
> 
> Looks like a mistake — we need to do this after reading avail_idx.
> 
> Thanks
> 
> > 
> > 
> > > +	return ret;
> > > +}
> > > +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
> > >   static int __init vhost_init(void)
> > >   {
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index 39ff897..16c2cb6 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
> > >   ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
> > >   			     struct iov_iter *from);
> > >   int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
> > > +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> > > +				struct vring_used_elem *heads,
> > > +				u16 num, bool used_update);
> > >   #define vq_err(vq, fmt, ...) do {                                  \
> > >   		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> > > -- 
> > > 2.7.4
Willem de Bruijn Sept. 28, 2017, 12:47 a.m. UTC | #6
On Fri, Sep 22, 2017 at 4:02 AM, Jason Wang <jasowang@redhat.com> wrote:
> This patch introduces vhost_prefetch_desc_indices() which could batch
> descriptor indices fetching and used ring updating. This intends to
> reduce the cache misses of indices fetching and updating and reduce
> cache line bounce when virtqueue is almost full. copy_to_user() was
> used in order to benefit from modern cpus that support fast string
> copy. Batched virtqueue processing will be the first user.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  3 +++
>  2 files changed, 58 insertions(+)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f87ec75..8424166d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>  }
>  EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +                               struct vring_used_elem *heads,
> +                               u16 num, bool used_update)
> +{
> +       int ret, ret2;
> +       u16 last_avail_idx, last_used_idx, total, copied;
> +       __virtio16 avail_idx;
> +       struct vring_used_elem __user *used;
> +       int i;
> +
> +       if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
> +               vq_err(vq, "Failed to access avail idx at %p\n",
> +                      &vq->avail->idx);
> +               return -EFAULT;
> +       }
> +       last_avail_idx = vq->last_avail_idx & (vq->num - 1);
> +       vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> +       total = vq->avail_idx - vq->last_avail_idx;
> +       ret = total = min(total, num);
> +
> +       for (i = 0; i < ret; i++) {
> +               ret2 = vhost_get_avail(vq, heads[i].id,
> +                                     &vq->avail->ring[last_avail_idx]);
> +               if (unlikely(ret2)) {
> +                       vq_err(vq, "Failed to get descriptors\n");
> +                       return -EFAULT;
> +               }
> +               last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
> +       }

This is understandably very similar to the existing logic in vhost_get_vq_desc.
Can that be extracted to a helper to avoid code duplication?

Perhaps one helper to update vq->avail_idx and return num, and
another to call vhost_get_avail one or more times.

> +
> +       if (!used_update)
> +               return ret;
> +
> +       last_used_idx = vq->last_used_idx & (vq->num - 1);
> +       while (total) {
> +               copied = min((u16)(vq->num - last_used_idx), total);
> +               ret2 = vhost_copy_to_user(vq,
> +                                         &vq->used->ring[last_used_idx],
> +                                         &heads[ret - total],
> +                                         copied * sizeof(*used));
> +
> +               if (unlikely(ret2)) {
> +                       vq_err(vq, "Failed to update used ring!\n");
> +                       return -EFAULT;
> +               }
> +
> +               last_used_idx = 0;
> +               total -= copied;
> +       }

This second part seems unrelated and could be a separate function?

Also, no need for ret2 and double assignment "ret = total =" if not
modifying total
in the the second loop:

  for (i = 0; i < total; ) {
    ...
    i += copied;
  }
Jason Wang Sept. 28, 2017, 7:18 a.m. UTC | #7
On 2017年09月28日 06:57, Michael S. Tsirkin wrote:
> On Wed, Sep 27, 2017 at 08:35:47AM +0800, Jason Wang wrote:
>>
>> On 2017年09月27日 03:19, Michael S. Tsirkin wrote:
>>> On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
>>>> This patch introduces vhost_prefetch_desc_indices() which could batch
>>>> descriptor indices fetching and used ring updating. This intends to
>>>> reduce the cache misses of indices fetching and updating and reduce
>>>> cache line bounce when virtqueue is almost full. copy_to_user() was
>>>> used in order to benefit from modern cpus that support fast string
>>>> copy. Batched virtqueue processing will be the first user.
>>>>
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>>> ---
>>>>    drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    drivers/vhost/vhost.h |  3 +++
>>>>    2 files changed, 58 insertions(+)
>>>>
>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>> index f87ec75..8424166d 100644
>>>> --- a/drivers/vhost/vhost.c
>>>> +++ b/drivers/vhost/vhost.c
>>>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>>>> +				struct vring_used_elem *heads,
>>>> +				u16 num, bool used_update)
>>> why do you need to combine used update with prefetch?
>> For better performance
>
> Why is sticking a branch in there better than requesting the update
> conditionally from the caller?

Ok, I get your point, I can split the two functions.

>
>
>
>> and I believe we don't care about the overhead when
>> we meet errors in tx.
> That's a separate question, I do not really understand how
> you can fetch a descriptor and update the used ring at the same
> time. This allows the guest to overwrite the buffer.
> I might be misunderstanding what is going on here though.

We don't update used idx, so guest can't overwrite the buffer I think?

Thanks
Jason Wang Sept. 28, 2017, 7:44 a.m. UTC | #8
On 2017年09月28日 08:47, Willem de Bruijn wrote:
> On Fri, Sep 22, 2017 at 4:02 AM, Jason Wang <jasowang@redhat.com> wrote:
>> This patch introduces vhost_prefetch_desc_indices() which could batch
>> descriptor indices fetching and used ring updating. This intends to
>> reduce the cache misses of indices fetching and updating and reduce
>> cache line bounce when virtqueue is almost full. copy_to_user() was
>> used in order to benefit from modern cpus that support fast string
>> copy. Batched virtqueue processing will be the first user.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>   drivers/vhost/vhost.h |  3 +++
>>   2 files changed, 58 insertions(+)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index f87ec75..8424166d 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +                               struct vring_used_elem *heads,
>> +                               u16 num, bool used_update)
>> +{
>> +       int ret, ret2;
>> +       u16 last_avail_idx, last_used_idx, total, copied;
>> +       __virtio16 avail_idx;
>> +       struct vring_used_elem __user *used;
>> +       int i;
>> +
>> +       if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
>> +               vq_err(vq, "Failed to access avail idx at %p\n",
>> +                      &vq->avail->idx);
>> +               return -EFAULT;
>> +       }
>> +       last_avail_idx = vq->last_avail_idx & (vq->num - 1);
>> +       vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
>> +       total = vq->avail_idx - vq->last_avail_idx;
>> +       ret = total = min(total, num);
>> +
>> +       for (i = 0; i < ret; i++) {
>> +               ret2 = vhost_get_avail(vq, heads[i].id,
>> +                                     &vq->avail->ring[last_avail_idx]);
>> +               if (unlikely(ret2)) {
>> +                       vq_err(vq, "Failed to get descriptors\n");
>> +                       return -EFAULT;
>> +               }
>> +               last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
>> +       }
> This is understandably very similar to the existing logic in vhost_get_vq_desc.
> Can that be extracted to a helper to avoid code duplication?
>
> Perhaps one helper to update vq->avail_idx and return num, and
> another to call vhost_get_avail one or more times.

Yes it can.

>
>> +
>> +       if (!used_update)
>> +               return ret;
>> +
>> +       last_used_idx = vq->last_used_idx & (vq->num - 1);
>> +       while (total) {
>> +               copied = min((u16)(vq->num - last_used_idx), total);
>> +               ret2 = vhost_copy_to_user(vq,
>> +                                         &vq->used->ring[last_used_idx],
>> +                                         &heads[ret - total],
>> +                                         copied * sizeof(*used));
>> +
>> +               if (unlikely(ret2)) {
>> +                       vq_err(vq, "Failed to update used ring!\n");
>> +                       return -EFAULT;
>> +               }
>> +
>> +               last_used_idx = 0;
>> +               total -= copied;
>> +       }
> This second part seems unrelated and could be a separate function?

Yes.

>
> Also, no need for ret2 and double assignment "ret = total =" if not
> modifying total
> in the the second loop:
>
>    for (i = 0; i < total; ) {
>      ...
>      i += copied;
>    }

Right, will do this in V2.

Thanks

Patch
diff mbox series

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f87ec75..8424166d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2437,6 +2437,61 @@  struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
 }
 EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
 
+int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
+				struct vring_used_elem *heads,
+				u16 num, bool used_update)
+{
+	int ret, ret2;
+	u16 last_avail_idx, last_used_idx, total, copied;
+	__virtio16 avail_idx;
+	struct vring_used_elem __user *used;
+	int i;
+
+	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
+		vq_err(vq, "Failed to access avail idx at %p\n",
+		       &vq->avail->idx);
+		return -EFAULT;
+	}
+	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
+	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+	total = vq->avail_idx - vq->last_avail_idx;
+	ret = total = min(total, num);
+
+	for (i = 0; i < ret; i++) {
+		ret2 = vhost_get_avail(vq, heads[i].id,
+				      &vq->avail->ring[last_avail_idx]);
+		if (unlikely(ret2)) {
+			vq_err(vq, "Failed to get descriptors\n");
+			return -EFAULT;
+		}
+		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
+	}
+
+	if (!used_update)
+		return ret;
+
+	last_used_idx = vq->last_used_idx & (vq->num - 1);
+	while (total) {
+		copied = min((u16)(vq->num - last_used_idx), total);
+		ret2 = vhost_copy_to_user(vq,
+					  &vq->used->ring[last_used_idx],
+					  &heads[ret - total],
+					  copied * sizeof(*used));
+
+		if (unlikely(ret2)) {
+			vq_err(vq, "Failed to update used ring!\n");
+			return -EFAULT;
+		}
+
+		last_used_idx = 0;
+		total -= copied;
+	}
+
+	/* Only get avail ring entries after they have been exposed by guest. */
+	smp_rmb();
+	return ret;
+}
+EXPORT_SYMBOL(vhost_prefetch_desc_indices);
 
 static int __init vhost_init(void)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 39ff897..16c2cb6 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -228,6 +228,9 @@  ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
 ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
 			     struct iov_iter *from);
 int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
+int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
+				struct vring_used_elem *heads,
+				u16 num, bool used_update);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \