[net-next,3/3] vhost: access vq metadata through kernel virtual address
diff mbox series

Message ID 20181213101022.12475-4-jasowang@redhat.com
State Superseded
Headers show
Series
  • vhost: accelerate metadata access through vmap()
Related show

Commit Message

Jason Wang Dec. 13, 2018, 10:10 a.m. UTC
It was noticed that the copy_user() friends that was used to access
virtqueue metdata tends to be very expensive for dataplane
implementation like vhost since it involves lots of software check,
speculation barrier, hardware feature toggling (e.g SMAP). The
extra cost will be more obvious when transferring small packets.

This patch tries to eliminate those overhead by pin vq metadata pages
and access them through vmap(). During SET_VRING_ADDR, we will setup
those mappings and memory accessors are modified to use pointers to
access the metadata directly.

Note, this was only done when device IOTLB is not enabled. We could
use similar method to optimize it in the future.

Tests shows about ~24% improvement on TX PPS when using virtio-user +
vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):

Before: ~5.0Mpps
After:  ~6.1Mpps

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |  11 +++
 2 files changed, 189 insertions(+)

Comments

Michael S. Tsirkin Dec. 13, 2018, 3:44 p.m. UTC | #1
On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
> It was noticed that the copy_user() friends that was used to access
> virtqueue metdata tends to be very expensive for dataplane
> implementation like vhost since it involves lots of software check,
> speculation barrier, hardware feature toggling (e.g SMAP). The
> extra cost will be more obvious when transferring small packets.
> 
> This patch tries to eliminate those overhead by pin vq metadata pages
> and access them through vmap(). During SET_VRING_ADDR, we will setup
> those mappings and memory accessors are modified to use pointers to
> access the metadata directly.
> 
> Note, this was only done when device IOTLB is not enabled. We could
> use similar method to optimize it in the future.
> 
> Tests shows about ~24% improvement on TX PPS when using virtio-user +
> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
> 
> Before: ~5.0Mpps
> After:  ~6.1Mpps
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  11 +++
>  2 files changed, 189 insertions(+)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index bafe39d2e637..1bd24203afb6 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
>  		vq->indirect = NULL;
>  		vq->heads = NULL;
>  		vq->dev = dev;
> +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
> +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
> +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
>  		mutex_init(&vq->mutex);
>  		vhost_vq_reset(dev, vq);
>  		if (vq->handle_kick)
> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>  	spin_unlock(&dev->iotlb_lock);
>  }
>  
> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
> +			   size_t size, int write)
> +{
> +	struct page **pages;
> +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
> +	int npinned;
> +	void *vaddr;
> +
> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
> +	if (!pages)
> +		return -ENOMEM;
> +
> +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
> +	if (npinned != npages)
> +		goto err;
> +

As I said I have doubts about the whole approach, but this
implementation in particular isn't a good idea
as it keeps the page around forever.
So no THP, no NUMA rebalancing, userspace-controlled
amount of memory locked up and not accounted for.

Don't get me wrong it's a great patch in an ideal world.
But then in an ideal world no barriers smap etc are necessary at all.


> +	vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
> +	if (!vaddr)
> +		goto err;
> +
> +	map->pages = pages;
> +	map->addr = vaddr + (uaddr & (PAGE_SIZE - 1));
> +	map->npages = npages;
> +
> +	return 0;
> +
> +err:
> +	if (npinned > 0)
> +		release_pages(pages, npinned);
> +	kfree(pages);
> +	return -EFAULT;
> +}
> +
> +static void vhost_uninit_vmap(struct vhost_vmap *map)
> +{
> +	if (!map->addr)
> +		return;
> +
> +	vunmap(map->addr);
> +	release_pages(map->pages, map->npages);
> +	kfree(map->pages);
> +
> +	map->addr = NULL;
> +	map->pages = NULL;
> +	map->npages = 0;
> +}
> +
> +static void vhost_clean_vmaps(struct vhost_virtqueue *vq)
> +{
> +	vhost_uninit_vmap(&vq->avail_ring);
> +	vhost_uninit_vmap(&vq->desc_ring);
> +	vhost_uninit_vmap(&vq->used_ring);
> +}
> +
> +static int vhost_setup_vmaps(struct vhost_virtqueue *vq, unsigned long avail,
> +			     unsigned long desc, unsigned long used)
> +{
> +	size_t event = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
> +	size_t avail_size, desc_size, used_size;
> +	int ret;
> +
> +	vhost_clean_vmaps(vq);
> +
> +	avail_size = sizeof(*vq->avail) +
> +		     sizeof(*vq->avail->ring) * vq->num + event;
> +	ret = vhost_init_vmap(&vq->avail_ring, avail, avail_size, false);
> +	if (ret) {
> +		vq_err(vq, "Fail to setup vmap for avail ring!\n");
> +		goto err_avail;
> +	}
> +
> +	desc_size = sizeof(*vq->desc) * vq->num;
> +	ret = vhost_init_vmap(&vq->desc_ring, desc, desc_size, false);
> +	if (ret) {
> +		vq_err(vq, "Fail to setup vmap for desc ring!\n");
> +		goto err_desc;
> +	}
> +
> +	used_size = sizeof(*vq->used) +
> +		    sizeof(*vq->used->ring) * vq->num + event;
> +	ret = vhost_init_vmap(&vq->used_ring, used, used_size, true);
> +	if (ret) {
> +		vq_err(vq, "Fail to setup vmap for used ring!\n");
> +		goto err_used;
> +	}
> +
> +	return 0;
> +
> +err_used:
> +	vhost_uninit_vmap(&vq->used_ring);
> +err_desc:
> +	vhost_uninit_vmap(&vq->avail_ring);
> +err_avail:
> +	return -EFAULT;
> +}
> +
>  void vhost_dev_cleanup(struct vhost_dev *dev)
>  {
>  	int i;
> @@ -626,6 +725,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  		if (dev->vqs[i]->call_ctx)
>  			eventfd_ctx_put(dev->vqs[i]->call_ctx);
>  		vhost_vq_reset(dev, dev->vqs[i]);
> +		vhost_clean_vmaps(dev->vqs[i]);
>  	}
>  	vhost_dev_free_iovecs(dev);
>  	if (dev->log_ctx)
> @@ -873,6 +973,14 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
>  
>  static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>  {
> +	if (!vq->iotlb) {
> +		struct vring_used *used = vq->used_ring.addr;
> +
> +		*((__virtio16 *)&used->ring[vq->num]) =
> +			cpu_to_vhost16(vq, vq->avail_idx);
> +		return 0;
> +	}
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
>  			      vhost_avail_event(vq));
>  }
> @@ -881,6 +989,13 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>  				 struct vring_used_elem *head, int idx,
>  				 int count)
>  {
> +	if (!vq->iotlb) {
> +		struct vring_used *used = vq->used_ring.addr;
> +
> +		memcpy(used->ring + idx, head, count * sizeof(*head));
> +		return 0;
> +	}
> +
>  	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
>  				  count * sizeof(*head));
>  }
> @@ -888,6 +1003,13 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>  static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>  
>  {
> +	if (!vq->iotlb) {
> +		struct vring_used *used = vq->used_ring.addr;
> +
> +		used->flags = cpu_to_vhost16(vq, vq->used_flags);
> +		return 0;
> +	}
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
>  			      &vq->used->flags);
>  }
> @@ -895,6 +1017,13 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>  static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>  
>  {
> +	if (!vq->iotlb) {
> +		struct vring_used *used = vq->used_ring.addr;
> +
> +		used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
> +		return 0;
> +	}
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>  			      &vq->used->idx);
>  }
> @@ -926,12 +1055,26 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>  static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>  				      __virtio16 *idx)
>  {
> +	if (!vq->iotlb) {
> +		struct vring_avail *avail = vq->avail_ring.addr;
> +
> +		*idx = avail->idx;
> +		return 0;
> +	}
> +
>  	return vhost_get_avail(vq, *idx, &vq->avail->idx);
>  }
>  
>  static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>  				       __virtio16 *head, int idx)
>  {
> +	if (!vq->iotlb) {
> +		struct vring_avail *avail = vq->avail_ring.addr;
> +
> +		*head = avail->ring[idx & (vq->num - 1)];
> +		return 0;
> +	}
> +
>  	return vhost_get_avail(vq, *head,
>  			       &vq->avail->ring[idx & (vq->num - 1)]);
>  }
> @@ -939,24 +1082,52 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>  static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>  					__virtio16 *flags)
>  {
> +	if (!vq->iotlb) {
> +		struct vring_avail *avail = vq->avail_ring.addr;
> +
> +		*flags = avail->flags;
> +		return 0;
> +	}
> +
>  	return vhost_get_avail(vq, *flags, &vq->avail->flags);
>  }
>  
>  static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>  				       __virtio16 *event)
>  {
> +	if (!vq->iotlb) {
> +		struct vring_avail *avail = vq->avail_ring.addr;
> +
> +		*event = (__virtio16)avail->ring[vq->num];
> +		return 0;
> +	}
> +
>  	return vhost_get_avail(vq, *event, vhost_used_event(vq));
>  }
>  
>  static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>  				     __virtio16 *idx)
>  {
> +	if (!vq->iotlb) {
> +		struct vring_used *used = vq->used_ring.addr;
> +
> +		*idx = used->idx;
> +		return 0;
> +	}
> +
>  	return vhost_get_used(vq, *idx, &vq->used->idx);
>  }
>  
>  static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>  				 struct vring_desc *desc, int idx)
>  {
> +	if (!vq->iotlb) {
> +		struct vring_desc *d = vq->desc_ring.addr;
> +
> +		*desc = *(d + idx);
> +		return 0;
> +	}
> +
>  	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
>  }
>  
> @@ -1551,6 +1722,13 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
>  			}
>  		}
>  
> +		if (!vq->iotlb && vhost_setup_vmaps(vq, a.avail_user_addr,
> +							a.desc_user_addr,
> +							a.used_user_addr)) {
> +			r = -EINVAL;
> +			break;
> +		}
> +
>  		vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
>  		vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
>  		vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 466ef7542291..89dc0ad3d055 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -80,6 +80,12 @@ enum vhost_uaddr_type {
>  	VHOST_NUM_ADDRS = 3,
>  };
>  
> +struct vhost_vmap {
> +	struct page **pages;
> +	void *addr;
> +	int npages;
> +};
> +
>  /* The virtqueue structure describes a queue attached to a device. */
>  struct vhost_virtqueue {
>  	struct vhost_dev *dev;
> @@ -90,6 +96,11 @@ struct vhost_virtqueue {
>  	struct vring_desc __user *desc;
>  	struct vring_avail __user *avail;
>  	struct vring_used __user *used;
> +
> +	struct vhost_vmap avail_ring;
> +	struct vhost_vmap desc_ring;
> +	struct vhost_vmap used_ring;
> +
>  	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
>  	struct file *kick;
>  	struct eventfd_ctx *call_ctx;
> -- 
> 2.17.1
Konrad Rzeszutek Wilk Dec. 13, 2018, 9:18 p.m. UTC | #2
.giant snip..
> > +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
> > +	if (npinned != npages)
> > +		goto err;
> > +
> 
> As I said I have doubts about the whole approach, but this
> implementation in particular isn't a good idea
> as it keeps the page around forever.
> So no THP, no NUMA rebalancing, userspace-controlled
> amount of memory locked up and not accounted for.
> 
> Don't get me wrong it's a great patch in an ideal world.
> But then in an ideal world no barriers smap etc are necessary at all.

So .. suggestions on how this could be accepted? As in other ways
where we still get vmap and the issues you mentioned are not troubling you?

Thanks!
Michael S. Tsirkin Dec. 13, 2018, 9:58 p.m. UTC | #3
On Thu, Dec 13, 2018 at 04:18:40PM -0500, Konrad Rzeszutek Wilk wrote:
> .giant snip..
> > > +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
> > > +	if (npinned != npages)
> > > +		goto err;
> > > +
> > 
> > As I said I have doubts about the whole approach, but this
> > implementation in particular isn't a good idea
> > as it keeps the page around forever.
> > So no THP, no NUMA rebalancing, userspace-controlled
> > amount of memory locked up and not accounted for.
> > 
> > Don't get me wrong it's a great patch in an ideal world.
> > But then in an ideal world no barriers smap etc are necessary at all.
> 
> So .. suggestions on how this could be accepted? As in other ways
> where we still get vmap and the issues you mentioned are not troubling you?
> 
> Thanks!

I'd suggest leave vmap alone and find ways to speed up accesses
that can fault.
Jason Wang Dec. 14, 2018, 3:57 a.m. UTC | #4
On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
> On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
>> It was noticed that the copy_user() friends that was used to access
>> virtqueue metdata tends to be very expensive for dataplane
>> implementation like vhost since it involves lots of software check,
>> speculation barrier, hardware feature toggling (e.g SMAP). The
>> extra cost will be more obvious when transferring small packets.
>>
>> This patch tries to eliminate those overhead by pin vq metadata pages
>> and access them through vmap(). During SET_VRING_ADDR, we will setup
>> those mappings and memory accessors are modified to use pointers to
>> access the metadata directly.
>>
>> Note, this was only done when device IOTLB is not enabled. We could
>> use similar method to optimize it in the future.
>>
>> Tests shows about ~24% improvement on TX PPS when using virtio-user +
>> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
>>
>> Before: ~5.0Mpps
>> After:  ~6.1Mpps
>>
>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
>>   drivers/vhost/vhost.h |  11 +++
>>   2 files changed, 189 insertions(+)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index bafe39d2e637..1bd24203afb6 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
>>   		vq->indirect = NULL;
>>   		vq->heads = NULL;
>>   		vq->dev = dev;
>> +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
>> +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
>> +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
>>   		mutex_init(&vq->mutex);
>>   		vhost_vq_reset(dev, vq);
>>   		if (vq->handle_kick)
>> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>>   	spin_unlock(&dev->iotlb_lock);
>>   }
>>   
>> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
>> +			   size_t size, int write)
>> +{
>> +	struct page **pages;
>> +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
>> +	int npinned;
>> +	void *vaddr;
>> +
>> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
>> +	if (!pages)
>> +		return -ENOMEM;
>> +
>> +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
>> +	if (npinned != npages)
>> +		goto err;
>> +
> As I said I have doubts about the whole approach, but this
> implementation in particular isn't a good idea
> as it keeps the page around forever.
> So no THP, no NUMA rebalancing,


This is the price of all GUP users not only vhost itself. What's more 
important, the goal is not to be left too much behind for other backends 
like DPDK or AF_XDP (all of which are using GUP).


> userspace-controlled
> amount of memory locked up and not accounted for.


It's pretty easy to add this since the slow path was still kept. If we 
exceeds the limitation, we can switch back to slow path.


>
> Don't get me wrong it's a great patch in an ideal world.
> But then in an ideal world no barriers smap etc are necessary at all.


Again, this is only for metadata accessing not the data which has been 
used for years for real use cases.

For SMAP, it makes senses for the address that kernel can not forcast. 
But it's not the case for the vhost metadata since we know the address 
will be accessed very frequently. For speculation barrier, it helps 
nothing for the data path of vhost which is a kthread. Packet or AF_XDP 
benefit from accessing metadata directly, we should do it as well.

Thanks
Michael S. Tsirkin Dec. 14, 2018, 12:36 p.m. UTC | #5
On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote:
> 
> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
> > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
> > > It was noticed that the copy_user() friends that was used to access
> > > virtqueue metdata tends to be very expensive for dataplane
> > > implementation like vhost since it involves lots of software check,
> > > speculation barrier, hardware feature toggling (e.g SMAP). The
> > > extra cost will be more obvious when transferring small packets.
> > > 
> > > This patch tries to eliminate those overhead by pin vq metadata pages
> > > and access them through vmap(). During SET_VRING_ADDR, we will setup
> > > those mappings and memory accessors are modified to use pointers to
> > > access the metadata directly.
> > > 
> > > Note, this was only done when device IOTLB is not enabled. We could
> > > use similar method to optimize it in the future.
> > > 
> > > Tests shows about ~24% improvement on TX PPS when using virtio-user +
> > > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
> > > 
> > > Before: ~5.0Mpps
> > > After:  ~6.1Mpps
> > > 
> > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
> > >   drivers/vhost/vhost.h |  11 +++
> > >   2 files changed, 189 insertions(+)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index bafe39d2e637..1bd24203afb6 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
> > >   		vq->indirect = NULL;
> > >   		vq->heads = NULL;
> > >   		vq->dev = dev;
> > > +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
> > > +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
> > > +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
> > >   		mutex_init(&vq->mutex);
> > >   		vhost_vq_reset(dev, vq);
> > >   		if (vq->handle_kick)
> > > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
> > >   	spin_unlock(&dev->iotlb_lock);
> > >   }
> > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
> > > +			   size_t size, int write)
> > > +{
> > > +	struct page **pages;
> > > +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
> > > +	int npinned;
> > > +	void *vaddr;
> > > +
> > > +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
> > > +	if (!pages)
> > > +		return -ENOMEM;
> > > +
> > > +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
> > > +	if (npinned != npages)
> > > +		goto err;
> > > +
> > As I said I have doubts about the whole approach, but this
> > implementation in particular isn't a good idea
> > as it keeps the page around forever.
> > So no THP, no NUMA rebalancing,
> 
> 
> This is the price of all GUP users not only vhost itself.

Yes. GUP is just not a great interface for vhost to use.

> What's more
> important, the goal is not to be left too much behind for other backends
> like DPDK or AF_XDP (all of which are using GUP).


So these guys assume userspace knows what it's doing.
We can't assume that.

> 
> > userspace-controlled
> > amount of memory locked up and not accounted for.
> 
> 
> It's pretty easy to add this since the slow path was still kept. If we
> exceeds the limitation, we can switch back to slow path.
> 
> > 
> > Don't get me wrong it's a great patch in an ideal world.
> > But then in an ideal world no barriers smap etc are necessary at all.
> 
> 
> Again, this is only for metadata accessing not the data which has been used
> for years for real use cases.
> 
> For SMAP, it makes senses for the address that kernel can not forcast. But
> it's not the case for the vhost metadata since we know the address will be
> accessed very frequently. For speculation barrier, it helps nothing for the
> data path of vhost which is a kthread.

I don't see how a kthread makes any difference. We do have a validation
step which makes some difference.

> Packet or AF_XDP benefit from
> accessing metadata directly, we should do it as well.
> 
> Thanks
kbuild test robot Dec. 14, 2018, 2:48 p.m. UTC | #6
Hi Jason,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Jason-Wang/vhost-accelerate-metadata-access-through-vmap/20181214-200417
config: mips-malta_kvm_defconfig (attached as .config)
compiler: mipsel-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=mips 

All errors (new ones prefixed by >>):

   drivers//vhost/vhost.c: In function 'vhost_init_vmap':
>> drivers//vhost/vhost.c:648:3: error: implicit declaration of function 'release_pages'; did you mean 'release_task'? [-Werror=implicit-function-declaration]
      release_pages(pages, npinned);
      ^~~~~~~~~~~~~
      release_task
   cc1: some warnings being treated as errors

vim +648 drivers//vhost/vhost.c

   619	
   620	static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
   621				   size_t size, int write)
   622	{
   623		struct page **pages;
   624		int npages = DIV_ROUND_UP(size, PAGE_SIZE);
   625		int npinned;
   626		void *vaddr;
   627	
   628		pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
   629		if (!pages)
   630			return -ENOMEM;
   631	
   632		npinned = get_user_pages_fast(uaddr, npages, write, pages);
   633		if (npinned != npages)
   634			goto err;
   635	
   636		vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
   637		if (!vaddr)
   638			goto err;
   639	
   640		map->pages = pages;
   641		map->addr = vaddr + (uaddr & (PAGE_SIZE - 1));
   642		map->npages = npages;
   643	
   644		return 0;
   645	
   646	err:
   647		if (npinned > 0)
 > 648			release_pages(pages, npinned);
   649		kfree(pages);
   650		return -EFAULT;
   651	}
   652	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
David Miller Dec. 15, 2018, 9:15 p.m. UTC | #7
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 14 Dec 2018 11:57:35 +0800

> This is the price of all GUP users not only vhost itself. What's more
> important, the goal is not to be left too much behind for other
> backends like DPDK or AF_XDP (all of which are using GUP).

+1
Jason Wang Dec. 24, 2018, 7:53 a.m. UTC | #8
On 2018/12/14 下午8:36, Michael S. Tsirkin wrote:
> On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote:
>> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
>>> On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
>>>> It was noticed that the copy_user() friends that was used to access
>>>> virtqueue metdata tends to be very expensive for dataplane
>>>> implementation like vhost since it involves lots of software check,
>>>> speculation barrier, hardware feature toggling (e.g SMAP). The
>>>> extra cost will be more obvious when transferring small packets.
>>>>
>>>> This patch tries to eliminate those overhead by pin vq metadata pages
>>>> and access them through vmap(). During SET_VRING_ADDR, we will setup
>>>> those mappings and memory accessors are modified to use pointers to
>>>> access the metadata directly.
>>>>
>>>> Note, this was only done when device IOTLB is not enabled. We could
>>>> use similar method to optimize it in the future.
>>>>
>>>> Tests shows about ~24% improvement on TX PPS when using virtio-user +
>>>> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
>>>>
>>>> Before: ~5.0Mpps
>>>> After:  ~6.1Mpps
>>>>
>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>> ---
>>>>    drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
>>>>    drivers/vhost/vhost.h |  11 +++
>>>>    2 files changed, 189 insertions(+)
>>>>
>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>> index bafe39d2e637..1bd24203afb6 100644
>>>> --- a/drivers/vhost/vhost.c
>>>> +++ b/drivers/vhost/vhost.c
>>>> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
>>>>    		vq->indirect = NULL;
>>>>    		vq->heads = NULL;
>>>>    		vq->dev = dev;
>>>> +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
>>>> +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
>>>> +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
>>>>    		mutex_init(&vq->mutex);
>>>>    		vhost_vq_reset(dev, vq);
>>>>    		if (vq->handle_kick)
>>>> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>>>>    	spin_unlock(&dev->iotlb_lock);
>>>>    }
>>>> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
>>>> +			   size_t size, int write)
>>>> +{
>>>> +	struct page **pages;
>>>> +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
>>>> +	int npinned;
>>>> +	void *vaddr;
>>>> +
>>>> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
>>>> +	if (!pages)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
>>>> +	if (npinned != npages)
>>>> +		goto err;
>>>> +
>>> As I said I have doubts about the whole approach, but this
>>> implementation in particular isn't a good idea
>>> as it keeps the page around forever.


The pages wil be released during set features.


>>> So no THP, no NUMA rebalancing,


For THP, we will probably miss 2 or 4 pages, but does this really matter 
consider the gain we have? For NUMA rebalancing, I'm even not quite sure 
if it can helps for the case of IPC (vhost). It looks to me the worst 
case it may cause page to be thrash between nodes if vhost and userspace 
are running in two nodes.


>>
>> This is the price of all GUP users not only vhost itself.
> Yes. GUP is just not a great interface for vhost to use.


Zerocopy codes (enabled by defualt) use them for years.


>
>> What's more
>> important, the goal is not to be left too much behind for other backends
>> like DPDK or AF_XDP (all of which are using GUP).
>
> So these guys assume userspace knows what it's doing.
> We can't assume that.


What kind of assumption do you they have?


>
>>> userspace-controlled
>>> amount of memory locked up and not accounted for.
>>
>> It's pretty easy to add this since the slow path was still kept. If we
>> exceeds the limitation, we can switch back to slow path.
>>
>>> Don't get me wrong it's a great patch in an ideal world.
>>> But then in an ideal world no barriers smap etc are necessary at all.
>>
>> Again, this is only for metadata accessing not the data which has been used
>> for years for real use cases.
>>
>> For SMAP, it makes senses for the address that kernel can not forcast. But
>> it's not the case for the vhost metadata since we know the address will be
>> accessed very frequently. For speculation barrier, it helps nothing for the
>> data path of vhost which is a kthread.
> I don't see how a kthread makes any difference. We do have a validation
> step which makes some difference.


The problem is not kthread but the address of userspace address. The 
addresses of vq metadata tends to be consistent for a while, and vhost 
knows they will be frequently. SMAP doesn't help too much in this case.

Thanks.


>
>> Packet or AF_XDP benefit from
>> accessing metadata directly, we should do it as well.
>>
>> Thanks
Michael S. Tsirkin Dec. 24, 2018, 6:10 p.m. UTC | #9
On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote:
> 
> On 2018/12/14 下午8:36, Michael S. Tsirkin wrote:
> > On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote:
> > > On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
> > > > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
> > > > > It was noticed that the copy_user() friends that was used to access
> > > > > virtqueue metdata tends to be very expensive for dataplane
> > > > > implementation like vhost since it involves lots of software check,
> > > > > speculation barrier, hardware feature toggling (e.g SMAP). The
> > > > > extra cost will be more obvious when transferring small packets.
> > > > > 
> > > > > This patch tries to eliminate those overhead by pin vq metadata pages
> > > > > and access them through vmap(). During SET_VRING_ADDR, we will setup
> > > > > those mappings and memory accessors are modified to use pointers to
> > > > > access the metadata directly.
> > > > > 
> > > > > Note, this was only done when device IOTLB is not enabled. We could
> > > > > use similar method to optimize it in the future.
> > > > > 
> > > > > Tests shows about ~24% improvement on TX PPS when using virtio-user +
> > > > > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
> > > > > 
> > > > > Before: ~5.0Mpps
> > > > > After:  ~6.1Mpps
> > > > > 
> > > > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > > ---
> > > > >    drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
> > > > >    drivers/vhost/vhost.h |  11 +++
> > > > >    2 files changed, 189 insertions(+)
> > > > > 
> > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > index bafe39d2e637..1bd24203afb6 100644
> > > > > --- a/drivers/vhost/vhost.c
> > > > > +++ b/drivers/vhost/vhost.c
> > > > > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
> > > > >    		vq->indirect = NULL;
> > > > >    		vq->heads = NULL;
> > > > >    		vq->dev = dev;
> > > > > +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
> > > > > +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
> > > > > +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
> > > > >    		mutex_init(&vq->mutex);
> > > > >    		vhost_vq_reset(dev, vq);
> > > > >    		if (vq->handle_kick)
> > > > > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
> > > > >    	spin_unlock(&dev->iotlb_lock);
> > > > >    }
> > > > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
> > > > > +			   size_t size, int write)
> > > > > +{
> > > > > +	struct page **pages;
> > > > > +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
> > > > > +	int npinned;
> > > > > +	void *vaddr;
> > > > > +
> > > > > +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
> > > > > +	if (!pages)
> > > > > +		return -ENOMEM;
> > > > > +
> > > > > +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
> > > > > +	if (npinned != npages)
> > > > > +		goto err;
> > > > > +
> > > > As I said I have doubts about the whole approach, but this
> > > > implementation in particular isn't a good idea
> > > > as it keeps the page around forever.
> 
> 
> The pages wil be released during set features.
> 
> 
> > > > So no THP, no NUMA rebalancing,
> 
> 
> For THP, we will probably miss 2 or 4 pages, but does this really matter
> consider the gain we have?

We as in vhost? networking isn't the only thing guest does.
We don't even know if this guest does a lot of networking.
You don't
know what else is in this huge page. Can be something very important
that guest touches all the time.

> For NUMA rebalancing, I'm even not quite sure if
> it can helps for the case of IPC (vhost). It looks to me the worst case it
> may cause page to be thrash between nodes if vhost and userspace are running
> in two nodes.


So again it's a gain for vhost but has a completely unpredictable effect on
other functionality of the guest.

That's what bothers me with this approach.




> 
> > > 
> > > This is the price of all GUP users not only vhost itself.
> > Yes. GUP is just not a great interface for vhost to use.
> 
> 
> Zerocopy codes (enabled by defualt) use them for years.

But only for TX and temporarily. We pin, read, unpin.

Your patch is different

- it writes into memory and GUP has known issues with file
  backed memory
- it keeps pages pinned forever



> 
> > 
> > > What's more
> > > important, the goal is not to be left too much behind for other backends
> > > like DPDK or AF_XDP (all of which are using GUP).
> > 
> > So these guys assume userspace knows what it's doing.
> > We can't assume that.
> 
> 
> What kind of assumption do you they have?
> 
> 
> > 
> > > > userspace-controlled
> > > > amount of memory locked up and not accounted for.
> > > 
> > > It's pretty easy to add this since the slow path was still kept. If we
> > > exceeds the limitation, we can switch back to slow path.
> > > 
> > > > Don't get me wrong it's a great patch in an ideal world.
> > > > But then in an ideal world no barriers smap etc are necessary at all.
> > > 
> > > Again, this is only for metadata accessing not the data which has been used
> > > for years for real use cases.
> > > 
> > > For SMAP, it makes senses for the address that kernel can not forcast. But
> > > it's not the case for the vhost metadata since we know the address will be
> > > accessed very frequently. For speculation barrier, it helps nothing for the
> > > data path of vhost which is a kthread.
> > I don't see how a kthread makes any difference. We do have a validation
> > step which makes some difference.
> 
> 
> The problem is not kthread but the address of userspace address. The
> addresses of vq metadata tends to be consistent for a while, and vhost knows
> they will be frequently. SMAP doesn't help too much in this case.
> 
> Thanks.

It's true for a real life applications but a malicious one
can call the setup ioctls any number of times. And SMAP is
all about malcious applications.

> 
> > 
> > > Packet or AF_XDP benefit from
> > > accessing metadata directly, we should do it as well.
> > > 
> > > Thanks
Jason Wang Dec. 25, 2018, 10:05 a.m. UTC | #10
On 2018/12/25 上午2:10, Michael S. Tsirkin wrote:
> On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote:
>> On 2018/12/14 下午8:36, Michael S. Tsirkin wrote:
>>> On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote:
>>>> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
>>>>> On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
>>>>>> It was noticed that the copy_user() friends that was used to access
>>>>>> virtqueue metdata tends to be very expensive for dataplane
>>>>>> implementation like vhost since it involves lots of software check,
>>>>>> speculation barrier, hardware feature toggling (e.g SMAP). The
>>>>>> extra cost will be more obvious when transferring small packets.
>>>>>>
>>>>>> This patch tries to eliminate those overhead by pin vq metadata pages
>>>>>> and access them through vmap(). During SET_VRING_ADDR, we will setup
>>>>>> those mappings and memory accessors are modified to use pointers to
>>>>>> access the metadata directly.
>>>>>>
>>>>>> Note, this was only done when device IOTLB is not enabled. We could
>>>>>> use similar method to optimize it in the future.
>>>>>>
>>>>>> Tests shows about ~24% improvement on TX PPS when using virtio-user +
>>>>>> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
>>>>>>
>>>>>> Before: ~5.0Mpps
>>>>>> After:  ~6.1Mpps
>>>>>>
>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>>> ---
>>>>>>     drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
>>>>>>     drivers/vhost/vhost.h |  11 +++
>>>>>>     2 files changed, 189 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>>>> index bafe39d2e637..1bd24203afb6 100644
>>>>>> --- a/drivers/vhost/vhost.c
>>>>>> +++ b/drivers/vhost/vhost.c
>>>>>> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
>>>>>>     		vq->indirect = NULL;
>>>>>>     		vq->heads = NULL;
>>>>>>     		vq->dev = dev;
>>>>>> +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
>>>>>> +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
>>>>>> +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
>>>>>>     		mutex_init(&vq->mutex);
>>>>>>     		vhost_vq_reset(dev, vq);
>>>>>>     		if (vq->handle_kick)
>>>>>> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>>>>>>     	spin_unlock(&dev->iotlb_lock);
>>>>>>     }
>>>>>> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
>>>>>> +			   size_t size, int write)
>>>>>> +{
>>>>>> +	struct page **pages;
>>>>>> +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
>>>>>> +	int npinned;
>>>>>> +	void *vaddr;
>>>>>> +
>>>>>> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
>>>>>> +	if (!pages)
>>>>>> +		return -ENOMEM;
>>>>>> +
>>>>>> +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
>>>>>> +	if (npinned != npages)
>>>>>> +		goto err;
>>>>>> +
>>>>> As I said I have doubts about the whole approach, but this
>>>>> implementation in particular isn't a good idea
>>>>> as it keeps the page around forever.
>>
>> The pages wil be released during set features.
>>
>>
>>>>> So no THP, no NUMA rebalancing,
>>
>> For THP, we will probably miss 2 or 4 pages, but does this really matter
>> consider the gain we have?
> We as in vhost? networking isn't the only thing guest does.
> We don't even know if this guest does a lot of networking.
> You don't
> know what else is in this huge page. Can be something very important
> that guest touches all the time.


Well, the probability should be very small consider we usually give 
several gigabytes to guest. The rest of the pages that doesn't sit in 
the same hugepage with metadata can still be merged by THP.  Anyway, I 
can test the differences.


>
>> For NUMA rebalancing, I'm even not quite sure if
>> it can helps for the case of IPC (vhost). It looks to me the worst case it
>> may cause page to be thrash between nodes if vhost and userspace are running
>> in two nodes.
>
> So again it's a gain for vhost but has a completely unpredictable effect on
> other functionality of the guest.
>
> That's what bothers me with this approach.


So:

- The rest of the pages could still be balanced to other nodes, no?

- try to balance metadata pages (belongs to co-operate processes) itself 
is still questionable


>
>
>
>
>>>> This is the price of all GUP users not only vhost itself.
>>> Yes. GUP is just not a great interface for vhost to use.
>>
>> Zerocopy codes (enabled by defualt) use them for years.
> But only for TX and temporarily. We pin, read, unpin.


Probably not. For several reasons that the page will be not be released 
soon or held for a very long period of time or even forever.


>
> Your patch is different
>
> - it writes into memory and GUP has known issues with file
>    backed memory


The ordinary user for vhost is anonymous pages I think?


> - it keeps pages pinned forever
>
>
>
>>>> What's more
>>>> important, the goal is not to be left too much behind for other backends
>>>> like DPDK or AF_XDP (all of which are using GUP).
>>> So these guys assume userspace knows what it's doing.
>>> We can't assume that.
>>
>> What kind of assumption do you they have?
>>
>>
>>>>> userspace-controlled
>>>>> amount of memory locked up and not accounted for.
>>>> It's pretty easy to add this since the slow path was still kept. If we
>>>> exceeds the limitation, we can switch back to slow path.
>>>>
>>>>> Don't get me wrong it's a great patch in an ideal world.
>>>>> But then in an ideal world no barriers smap etc are necessary at all.
>>>> Again, this is only for metadata accessing not the data which has been used
>>>> for years for real use cases.
>>>>
>>>> For SMAP, it makes senses for the address that kernel can not forcast. But
>>>> it's not the case for the vhost metadata since we know the address will be
>>>> accessed very frequently. For speculation barrier, it helps nothing for the
>>>> data path of vhost which is a kthread.
>>> I don't see how a kthread makes any difference. We do have a validation
>>> step which makes some difference.
>>
>> The problem is not kthread but the address of userspace address. The
>> addresses of vq metadata tends to be consistent for a while, and vhost knows
>> they will be frequently. SMAP doesn't help too much in this case.
>>
>> Thanks.
> It's true for a real life applications but a malicious one
> can call the setup ioctls any number of times. And SMAP is
> all about malcious applications.


We don't do this in the path of ioctl, there's no context switch between 
userspace and kernel in the worker thread. SMAP is used to prevent 
kernel from accessing userspace pages unexpectedly which is not the case 
for metadata access.

Thanks


>
>>>> Packet or AF_XDP benefit from
>>>> accessing metadata directly, we should do it as well.
>>>>
>>>> Thanks
Michael S. Tsirkin Dec. 25, 2018, 12:50 p.m. UTC | #11
On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote:
> 
> On 2018/12/25 上午2:10, Michael S. Tsirkin wrote:
> > On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote:
> > > On 2018/12/14 下午8:36, Michael S. Tsirkin wrote:
> > > > On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote:
> > > > > On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
> > > > > > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
> > > > > > > It was noticed that the copy_user() friends that was used to access
> > > > > > > virtqueue metdata tends to be very expensive for dataplane
> > > > > > > implementation like vhost since it involves lots of software check,
> > > > > > > speculation barrier, hardware feature toggling (e.g SMAP). The
> > > > > > > extra cost will be more obvious when transferring small packets.
> > > > > > > 
> > > > > > > This patch tries to eliminate those overhead by pin vq metadata pages
> > > > > > > and access them through vmap(). During SET_VRING_ADDR, we will setup
> > > > > > > those mappings and memory accessors are modified to use pointers to
> > > > > > > access the metadata directly.
> > > > > > > 
> > > > > > > Note, this was only done when device IOTLB is not enabled. We could
> > > > > > > use similar method to optimize it in the future.
> > > > > > > 
> > > > > > > Tests shows about ~24% improvement on TX PPS when using virtio-user +
> > > > > > > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
> > > > > > > 
> > > > > > > Before: ~5.0Mpps
> > > > > > > After:  ~6.1Mpps
> > > > > > > 
> > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > > > > ---
> > > > > > >     drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
> > > > > > >     drivers/vhost/vhost.h |  11 +++
> > > > > > >     2 files changed, 189 insertions(+)
> > > > > > > 
> > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > index bafe39d2e637..1bd24203afb6 100644
> > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
> > > > > > >     		vq->indirect = NULL;
> > > > > > >     		vq->heads = NULL;
> > > > > > >     		vq->dev = dev;
> > > > > > > +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
> > > > > > > +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
> > > > > > > +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
> > > > > > >     		mutex_init(&vq->mutex);
> > > > > > >     		vhost_vq_reset(dev, vq);
> > > > > > >     		if (vq->handle_kick)
> > > > > > > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
> > > > > > >     	spin_unlock(&dev->iotlb_lock);
> > > > > > >     }
> > > > > > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
> > > > > > > +			   size_t size, int write)
> > > > > > > +{
> > > > > > > +	struct page **pages;
> > > > > > > +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
> > > > > > > +	int npinned;
> > > > > > > +	void *vaddr;
> > > > > > > +
> > > > > > > +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
> > > > > > > +	if (!pages)
> > > > > > > +		return -ENOMEM;
> > > > > > > +
> > > > > > > +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
> > > > > > > +	if (npinned != npages)
> > > > > > > +		goto err;
> > > > > > > +
> > > > > > As I said I have doubts about the whole approach, but this
> > > > > > implementation in particular isn't a good idea
> > > > > > as it keeps the page around forever.
> > > 
> > > The pages wil be released during set features.
> > > 
> > > 
> > > > > > So no THP, no NUMA rebalancing,
> > > 
> > > For THP, we will probably miss 2 or 4 pages, but does this really matter
> > > consider the gain we have?
> > We as in vhost? networking isn't the only thing guest does.
> > We don't even know if this guest does a lot of networking.
> > You don't
> > know what else is in this huge page. Can be something very important
> > that guest touches all the time.
> 
> 
> Well, the probability should be very small consider we usually give several
> gigabytes to guest. The rest of the pages that doesn't sit in the same
> hugepage with metadata can still be merged by THP.  Anyway, I can test the
> differences.

Thanks!

> 
> > 
> > > For NUMA rebalancing, I'm even not quite sure if
> > > it can helps for the case of IPC (vhost). It looks to me the worst case it
> > > may cause page to be thrash between nodes if vhost and userspace are running
> > > in two nodes.
> > 
> > So again it's a gain for vhost but has a completely unpredictable effect on
> > other functionality of the guest.
> > 
> > That's what bothers me with this approach.
> 
> 
> So:
> 
> - The rest of the pages could still be balanced to other nodes, no?
> 
> - try to balance metadata pages (belongs to co-operate processes) itself is
> still questionable

I am not sure why. It should be easy enough to force the VCPU and vhost
to move (e.g. start them pinned to 1 cpu, then pin them to another one).
Clearly sometimes this would be necessary for load balancing reasons.
With autonuma after a while (could take seconds but it will happen) the
memory will migrate.




> 
> > 
> > 
> > 
> > 
> > > > > This is the price of all GUP users not only vhost itself.
> > > > Yes. GUP is just not a great interface for vhost to use.
> > > 
> > > Zerocopy codes (enabled by defualt) use them for years.
> > But only for TX and temporarily. We pin, read, unpin.
> 
> 
> Probably not. For several reasons that the page will be not be released soon
> or held for a very long period of time or even forever.


With zero copy? Well it's pinned until transmit. Takes a while
but could be enough for autocopy to work esp since
its the packet memory so not reused immediately.

> 
> > 
> > Your patch is different
> > 
> > - it writes into memory and GUP has known issues with file
> >    backed memory
> 
> 
> The ordinary user for vhost is anonymous pages I think?


It's not the most common scenario and not the fastest one
(e.g. THP does not work) but file backed is useful sometimes.
It would not be nice at all to corrupt guest memory in that case.


> 
> > - it keeps pages pinned forever
> > 
> > 
> > 
> > > > > What's more
> > > > > important, the goal is not to be left too much behind for other backends
> > > > > like DPDK or AF_XDP (all of which are using GUP).
> > > > So these guys assume userspace knows what it's doing.
> > > > We can't assume that.
> > > 
> > > What kind of assumption do you they have?
> > > 
> > > 
> > > > > > userspace-controlled
> > > > > > amount of memory locked up and not accounted for.
> > > > > It's pretty easy to add this since the slow path was still kept. If we
> > > > > exceeds the limitation, we can switch back to slow path.
> > > > > 
> > > > > > Don't get me wrong it's a great patch in an ideal world.
> > > > > > But then in an ideal world no barriers smap etc are necessary at all.
> > > > > Again, this is only for metadata accessing not the data which has been used
> > > > > for years for real use cases.
> > > > > 
> > > > > For SMAP, it makes senses for the address that kernel can not forcast. But
> > > > > it's not the case for the vhost metadata since we know the address will be
> > > > > accessed very frequently. For speculation barrier, it helps nothing for the
> > > > > data path of vhost which is a kthread.
> > > > I don't see how a kthread makes any difference. We do have a validation
> > > > step which makes some difference.
> > > 
> > > The problem is not kthread but the address of userspace address. The
> > > addresses of vq metadata tends to be consistent for a while, and vhost knows
> > > they will be frequently. SMAP doesn't help too much in this case.
> > > 
> > > Thanks.
> > It's true for a real life applications but a malicious one
> > can call the setup ioctls any number of times. And SMAP is
> > all about malcious applications.
> 
> 
> We don't do this in the path of ioctl, there's no context switch between
> userspace and kernel in the worker thread. SMAP is used to prevent kernel
> from accessing userspace pages unexpectedly which is not the case for
> metadata access.
> 
> Thanks

OK let's forget smap for now.

> 
> > 
> > > > > Packet or AF_XDP benefit from
> > > > > accessing metadata directly, we should do it as well.
> > > > > 
> > > > > Thanks
Jason Wang Dec. 26, 2018, 3:57 a.m. UTC | #12
On 2018/12/25 下午8:50, Michael S. Tsirkin wrote:
> On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote:
>> On 2018/12/25 上午2:10, Michael S. Tsirkin wrote:
>>> On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote:
>>>> On 2018/12/14 下午8:36, Michael S. Tsirkin wrote:
>>>>> On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote:
>>>>>> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
>>>>>>> On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
>>>>>>>> It was noticed that the copy_user() friends that was used to access
>>>>>>>> virtqueue metdata tends to be very expensive for dataplane
>>>>>>>> implementation like vhost since it involves lots of software check,
>>>>>>>> speculation barrier, hardware feature toggling (e.g SMAP). The
>>>>>>>> extra cost will be more obvious when transferring small packets.
>>>>>>>>
>>>>>>>> This patch tries to eliminate those overhead by pin vq metadata pages
>>>>>>>> and access them through vmap(). During SET_VRING_ADDR, we will setup
>>>>>>>> those mappings and memory accessors are modified to use pointers to
>>>>>>>> access the metadata directly.
>>>>>>>>
>>>>>>>> Note, this was only done when device IOTLB is not enabled. We could
>>>>>>>> use similar method to optimize it in the future.
>>>>>>>>
>>>>>>>> Tests shows about ~24% improvement on TX PPS when using virtio-user +
>>>>>>>> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
>>>>>>>>
>>>>>>>> Before: ~5.0Mpps
>>>>>>>> After:  ~6.1Mpps
>>>>>>>>
>>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>>>>> ---
>>>>>>>>      drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>      drivers/vhost/vhost.h |  11 +++
>>>>>>>>      2 files changed, 189 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>>>>>> index bafe39d2e637..1bd24203afb6 100644
>>>>>>>> --- a/drivers/vhost/vhost.c
>>>>>>>> +++ b/drivers/vhost/vhost.c
>>>>>>>> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
>>>>>>>>      		vq->indirect = NULL;
>>>>>>>>      		vq->heads = NULL;
>>>>>>>>      		vq->dev = dev;
>>>>>>>> +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
>>>>>>>> +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
>>>>>>>> +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
>>>>>>>>      		mutex_init(&vq->mutex);
>>>>>>>>      		vhost_vq_reset(dev, vq);
>>>>>>>>      		if (vq->handle_kick)
>>>>>>>> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>>>>>>>>      	spin_unlock(&dev->iotlb_lock);
>>>>>>>>      }
>>>>>>>> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
>>>>>>>> +			   size_t size, int write)
>>>>>>>> +{
>>>>>>>> +	struct page **pages;
>>>>>>>> +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
>>>>>>>> +	int npinned;
>>>>>>>> +	void *vaddr;
>>>>>>>> +
>>>>>>>> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
>>>>>>>> +	if (!pages)
>>>>>>>> +		return -ENOMEM;
>>>>>>>> +
>>>>>>>> +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
>>>>>>>> +	if (npinned != npages)
>>>>>>>> +		goto err;
>>>>>>>> +
>>>>>>> As I said I have doubts about the whole approach, but this
>>>>>>> implementation in particular isn't a good idea
>>>>>>> as it keeps the page around forever.
>>>> The pages wil be released during set features.
>>>>
>>>>
>>>>>>> So no THP, no NUMA rebalancing,
>>>> For THP, we will probably miss 2 or 4 pages, but does this really matter
>>>> consider the gain we have?
>>> We as in vhost? networking isn't the only thing guest does.
>>> We don't even know if this guest does a lot of networking.
>>> You don't
>>> know what else is in this huge page. Can be something very important
>>> that guest touches all the time.
>>
>> Well, the probability should be very small consider we usually give several
>> gigabytes to guest. The rest of the pages that doesn't sit in the same
>> hugepage with metadata can still be merged by THP.  Anyway, I can test the
>> differences.
> Thanks!
>
>>>> For NUMA rebalancing, I'm even not quite sure if
>>>> it can helps for the case of IPC (vhost). It looks to me the worst case it
>>>> may cause page to be thrash between nodes if vhost and userspace are running
>>>> in two nodes.
>>> So again it's a gain for vhost but has a completely unpredictable effect on
>>> other functionality of the guest.
>>>
>>> That's what bothers me with this approach.
>>
>> So:
>>
>> - The rest of the pages could still be balanced to other nodes, no?
>>
>> - try to balance metadata pages (belongs to co-operate processes) itself is
>> still questionable
> I am not sure why. It should be easy enough to force the VCPU and vhost
> to move (e.g. start them pinned to 1 cpu, then pin them to another one).
> Clearly sometimes this would be necessary for load balancing reasons.


Yes, but it looks to me the part of motivation of auto NUMA is to avoid 
manual pinning.


> With autonuma after a while (could take seconds but it will happen) the
> memory will migrate.
>

Yes. As you mentioned during the discuss, I wonder we could do it 
similarly through mmu notifier like APIC access page in commit 
c24ae0dcd3e ("kvm: x86: Unpin and remove kvm_arch->apic_access_page")


>
>
>>>
>>>
>>>
>>>>>> This is the price of all GUP users not only vhost itself.
>>>>> Yes. GUP is just not a great interface for vhost to use.
>>>> Zerocopy codes (enabled by defualt) use them for years.
>>> But only for TX and temporarily. We pin, read, unpin.
>>
>> Probably not. For several reasons that the page will be not be released soon
>> or held for a very long period of time or even forever.
>
> With zero copy? Well it's pinned until transmit. Takes a while
> but could be enough for autocopy to work esp since
> its the packet memory so not reused immediately.
>
>>> Your patch is different
>>>
>>> - it writes into memory and GUP has known issues with file
>>>     backed memory
>>
>> The ordinary user for vhost is anonymous pages I think?
>
> It's not the most common scenario and not the fastest one
> (e.g. THP does not work) but file backed is useful sometimes.
> It would not be nice at all to corrupt guest memory in that case.


Ok.


>
>>> - it keeps pages pinned forever
>>>
>>>
>>>
>>>>>> What's more
>>>>>> important, the goal is not to be left too much behind for other backends
>>>>>> like DPDK or AF_XDP (all of which are using GUP).
>>>>> So these guys assume userspace knows what it's doing.
>>>>> We can't assume that.
>>>> What kind of assumption do you they have?
>>>>
>>>>
>>>>>>> userspace-controlled
>>>>>>> amount of memory locked up and not accounted for.
>>>>>> It's pretty easy to add this since the slow path was still kept. If we
>>>>>> exceeds the limitation, we can switch back to slow path.
>>>>>>
>>>>>>> Don't get me wrong it's a great patch in an ideal world.
>>>>>>> But then in an ideal world no barriers smap etc are necessary at all.
>>>>>> Again, this is only for metadata accessing not the data which has been used
>>>>>> for years for real use cases.
>>>>>>
>>>>>> For SMAP, it makes senses for the address that kernel can not forcast. But
>>>>>> it's not the case for the vhost metadata since we know the address will be
>>>>>> accessed very frequently. For speculation barrier, it helps nothing for the
>>>>>> data path of vhost which is a kthread.
>>>>> I don't see how a kthread makes any difference. We do have a validation
>>>>> step which makes some difference.
>>>> The problem is not kthread but the address of userspace address. The
>>>> addresses of vq metadata tends to be consistent for a while, and vhost knows
>>>> they will be frequently. SMAP doesn't help too much in this case.
>>>>
>>>> Thanks.
>>> It's true for a real life applications but a malicious one
>>> can call the setup ioctls any number of times. And SMAP is
>>> all about malcious applications.
>>
>> We don't do this in the path of ioctl, there's no context switch between
>> userspace and kernel in the worker thread. SMAP is used to prevent kernel
>> from accessing userspace pages unexpectedly which is not the case for
>> metadata access.
>>
>> Thanks
> OK let's forget smap for now.


Some numbers I measured:

On an old Sandy bridge machine without SMAP support. Remove speculation 
barrier boost the performance from 4.6Mpps to 5.1Mpps

On a newer Broadwell machine with SMAP support. Remove speculation 
barrier only gives 2%-5% improvement, disable SMAP completely through 
Kconfig boost 57% performance from 4.8Mpps to 7.5Mpps. (Vmap gives 6Mpps 
- 6.1Mpps, it only bypass SMAP for metadata).

So it looks like for recent machine, SMAP becomes pain point when the 
copy is short (e.g 64B) for high PPS.

Thanks


>
>>>>>> Packet or AF_XDP benefit from
>>>>>> accessing metadata directly, we should do it as well.
>>>>>>
>>>>>> Thanks
Michael S. Tsirkin Dec. 26, 2018, 3:02 p.m. UTC | #13
On Wed, Dec 26, 2018 at 11:57:32AM +0800, Jason Wang wrote:
> 
> On 2018/12/25 下午8:50, Michael S. Tsirkin wrote:
> > On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote:
> > > On 2018/12/25 上午2:10, Michael S. Tsirkin wrote:
> > > > On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote:
> > > > > On 2018/12/14 下午8:36, Michael S. Tsirkin wrote:
> > > > > > On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote:
> > > > > > > On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
> > > > > > > > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
> > > > > > > > > It was noticed that the copy_user() friends that was used to access
> > > > > > > > > virtqueue metdata tends to be very expensive for dataplane
> > > > > > > > > implementation like vhost since it involves lots of software check,
> > > > > > > > > speculation barrier, hardware feature toggling (e.g SMAP). The
> > > > > > > > > extra cost will be more obvious when transferring small packets.
> > > > > > > > > 
> > > > > > > > > This patch tries to eliminate those overhead by pin vq metadata pages
> > > > > > > > > and access them through vmap(). During SET_VRING_ADDR, we will setup
> > > > > > > > > those mappings and memory accessors are modified to use pointers to
> > > > > > > > > access the metadata directly.
> > > > > > > > > 
> > > > > > > > > Note, this was only done when device IOTLB is not enabled. We could
> > > > > > > > > use similar method to optimize it in the future.
> > > > > > > > > 
> > > > > > > > > Tests shows about ~24% improvement on TX PPS when using virtio-user +
> > > > > > > > > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
> > > > > > > > > 
> > > > > > > > > Before: ~5.0Mpps
> > > > > > > > > After:  ~6.1Mpps
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > > > > > > ---
> > > > > > > > >      drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > >      drivers/vhost/vhost.h |  11 +++
> > > > > > > > >      2 files changed, 189 insertions(+)
> > > > > > > > > 
> > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > > > index bafe39d2e637..1bd24203afb6 100644
> > > > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > > > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
> > > > > > > > >      		vq->indirect = NULL;
> > > > > > > > >      		vq->heads = NULL;
> > > > > > > > >      		vq->dev = dev;
> > > > > > > > > +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
> > > > > > > > > +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
> > > > > > > > > +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
> > > > > > > > >      		mutex_init(&vq->mutex);
> > > > > > > > >      		vhost_vq_reset(dev, vq);
> > > > > > > > >      		if (vq->handle_kick)
> > > > > > > > > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
> > > > > > > > >      	spin_unlock(&dev->iotlb_lock);
> > > > > > > > >      }
> > > > > > > > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
> > > > > > > > > +			   size_t size, int write)
> > > > > > > > > +{
> > > > > > > > > +	struct page **pages;
> > > > > > > > > +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
> > > > > > > > > +	int npinned;
> > > > > > > > > +	void *vaddr;
> > > > > > > > > +
> > > > > > > > > +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
> > > > > > > > > +	if (!pages)
> > > > > > > > > +		return -ENOMEM;
> > > > > > > > > +
> > > > > > > > > +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
> > > > > > > > > +	if (npinned != npages)
> > > > > > > > > +		goto err;
> > > > > > > > > +
> > > > > > > > As I said I have doubts about the whole approach, but this
> > > > > > > > implementation in particular isn't a good idea
> > > > > > > > as it keeps the page around forever.
> > > > > The pages wil be released during set features.
> > > > > 
> > > > > 
> > > > > > > > So no THP, no NUMA rebalancing,
> > > > > For THP, we will probably miss 2 or 4 pages, but does this really matter
> > > > > consider the gain we have?
> > > > We as in vhost? networking isn't the only thing guest does.
> > > > We don't even know if this guest does a lot of networking.
> > > > You don't
> > > > know what else is in this huge page. Can be something very important
> > > > that guest touches all the time.
> > > 
> > > Well, the probability should be very small consider we usually give several
> > > gigabytes to guest. The rest of the pages that doesn't sit in the same
> > > hugepage with metadata can still be merged by THP.  Anyway, I can test the
> > > differences.
> > Thanks!
> > 
> > > > > For NUMA rebalancing, I'm even not quite sure if
> > > > > it can helps for the case of IPC (vhost). It looks to me the worst case it
> > > > > may cause page to be thrash between nodes if vhost and userspace are running
> > > > > in two nodes.
> > > > So again it's a gain for vhost but has a completely unpredictable effect on
> > > > other functionality of the guest.
> > > > 
> > > > That's what bothers me with this approach.
> > > 
> > > So:
> > > 
> > > - The rest of the pages could still be balanced to other nodes, no?
> > > 
> > > - try to balance metadata pages (belongs to co-operate processes) itself is
> > > still questionable
> > I am not sure why. It should be easy enough to force the VCPU and vhost
> > to move (e.g. start them pinned to 1 cpu, then pin them to another one).
> > Clearly sometimes this would be necessary for load balancing reasons.
> 
> 
> Yes, but it looks to me the part of motivation of auto NUMA is to avoid
> manual pinning.

... of memory. Yes.


> 
> > With autonuma after a while (could take seconds but it will happen) the
> > memory will migrate.
> > 
> 
> Yes. As you mentioned during the discuss, I wonder we could do it similarly
> through mmu notifier like APIC access page in commit c24ae0dcd3e ("kvm: x86:
> Unpin and remove kvm_arch->apic_access_page")

That would be a possible approach.

> 
> > 
> > 
> > > > 
> > > > 
> > > > 
> > > > > > > This is the price of all GUP users not only vhost itself.
> > > > > > Yes. GUP is just not a great interface for vhost to use.
> > > > > Zerocopy codes (enabled by defualt) use them for years.
> > > > But only for TX and temporarily. We pin, read, unpin.
> > > 
> > > Probably not. For several reasons that the page will be not be released soon
> > > or held for a very long period of time or even forever.
> > 
> > With zero copy? Well it's pinned until transmit. Takes a while
> > but could be enough for autocopy to work esp since
> > its the packet memory so not reused immediately.
> > 
> > > > Your patch is different
> > > > 
> > > > - it writes into memory and GUP has known issues with file
> > > >     backed memory
> > > 
> > > The ordinary user for vhost is anonymous pages I think?
> > 
> > It's not the most common scenario and not the fastest one
> > (e.g. THP does not work) but file backed is useful sometimes.
> > It would not be nice at all to corrupt guest memory in that case.
> 
> 
> Ok.
> 
> 
> > 
> > > > - it keeps pages pinned forever
> > > > 
> > > > 
> > > > 
> > > > > > > What's more
> > > > > > > important, the goal is not to be left too much behind for other backends
> > > > > > > like DPDK or AF_XDP (all of which are using GUP).
> > > > > > So these guys assume userspace knows what it's doing.
> > > > > > We can't assume that.
> > > > > What kind of assumption do you they have?
> > > > > 
> > > > > 
> > > > > > > > userspace-controlled
> > > > > > > > amount of memory locked up and not accounted for.
> > > > > > > It's pretty easy to add this since the slow path was still kept. If we
> > > > > > > exceeds the limitation, we can switch back to slow path.
> > > > > > > 
> > > > > > > > Don't get me wrong it's a great patch in an ideal world.
> > > > > > > > But then in an ideal world no barriers smap etc are necessary at all.
> > > > > > > Again, this is only for metadata accessing not the data which has been used
> > > > > > > for years for real use cases.
> > > > > > > 
> > > > > > > For SMAP, it makes senses for the address that kernel can not forcast. But
> > > > > > > it's not the case for the vhost metadata since we know the address will be
> > > > > > > accessed very frequently. For speculation barrier, it helps nothing for the
> > > > > > > data path of vhost which is a kthread.
> > > > > > I don't see how a kthread makes any difference. We do have a validation
> > > > > > step which makes some difference.
> > > > > The problem is not kthread but the address of userspace address. The
> > > > > addresses of vq metadata tends to be consistent for a while, and vhost knows
> > > > > they will be frequently. SMAP doesn't help too much in this case.
> > > > > 
> > > > > Thanks.
> > > > It's true for a real life applications but a malicious one
> > > > can call the setup ioctls any number of times. And SMAP is
> > > > all about malcious applications.
> > > 
> > > We don't do this in the path of ioctl, there's no context switch between
> > > userspace and kernel in the worker thread. SMAP is used to prevent kernel
> > > from accessing userspace pages unexpectedly which is not the case for
> > > metadata access.
> > > 
> > > Thanks
> > OK let's forget smap for now.
> 
> 
> Some numbers I measured:
> 
> On an old Sandy bridge machine without SMAP support. Remove speculation
> barrier boost the performance from 4.6Mpps to 5.1Mpps
> 
> On a newer Broadwell machine with SMAP support. Remove speculation barrier
> only gives 2%-5% improvement, disable SMAP completely through Kconfig boost
> 57% performance from 4.8Mpps to 7.5Mpps. (Vmap gives 6Mpps - 6.1Mpps, it
> only bypass SMAP for metadata).
> 
> So it looks like for recent machine, SMAP becomes pain point when the copy
> is short (e.g 64B) for high PPS.
> 
> Thanks

Thanks a lot for looking into this!

So first of all users can just boot with nosmap, right?
What's wrong with that? Yes it's not fine-grained but OTOH
it's easy to understand.

And I guess this confirms that if we are going to worry
about smap enabled, we need to look into packet copies
too, not just meta-data.

Vaguely could see a module option (off by default)
where vhost basically does user_access_begin
when it starts running, then uses unsafe accesses
in vhost and tun and then user_access_end.


> 
> > 
> > > > > > > Packet or AF_XDP benefit from
> > > > > > > accessing metadata directly, we should do it as well.
> > > > > > > 
> > > > > > > Thanks
Jason Wang Dec. 27, 2018, 9:39 a.m. UTC | #14
On 2018/12/26 下午11:02, Michael S. Tsirkin wrote:
> On Wed, Dec 26, 2018 at 11:57:32AM +0800, Jason Wang wrote:
>> On 2018/12/25 下午8:50, Michael S. Tsirkin wrote:
>>> On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote:
>>>> On 2018/12/25 上午2:10, Michael S. Tsirkin wrote:
>>>>> On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote:
>>>>>> On 2018/12/14 下午8:36, Michael S. Tsirkin wrote:
>>>>>>> On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote:
>>>>>>>> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
>>>>>>>>> On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
>>>>>>>>>> It was noticed that the copy_user() friends that was used to access
>>>>>>>>>> virtqueue metdata tends to be very expensive for dataplane
>>>>>>>>>> implementation like vhost since it involves lots of software check,
>>>>>>>>>> speculation barrier, hardware feature toggling (e.g SMAP). The
>>>>>>>>>> extra cost will be more obvious when transferring small packets.
>>>>>>>>>>
>>>>>>>>>> This patch tries to eliminate those overhead by pin vq metadata pages
>>>>>>>>>> and access them through vmap(). During SET_VRING_ADDR, we will setup
>>>>>>>>>> those mappings and memory accessors are modified to use pointers to
>>>>>>>>>> access the metadata directly.
>>>>>>>>>>
>>>>>>>>>> Note, this was only done when device IOTLB is not enabled. We could
>>>>>>>>>> use similar method to optimize it in the future.
>>>>>>>>>>
>>>>>>>>>> Tests shows about ~24% improvement on TX PPS when using virtio-user +
>>>>>>>>>> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
>>>>>>>>>>
>>>>>>>>>> Before: ~5.0Mpps
>>>>>>>>>> After:  ~6.1Mpps
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>>>>>>> ---
>>>>>>>>>>       drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>>       drivers/vhost/vhost.h |  11 +++
>>>>>>>>>>       2 files changed, 189 insertions(+)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>>>>>>>> index bafe39d2e637..1bd24203afb6 100644
>>>>>>>>>> --- a/drivers/vhost/vhost.c
>>>>>>>>>> +++ b/drivers/vhost/vhost.c
>>>>>>>>>> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
>>>>>>>>>>       		vq->indirect = NULL;
>>>>>>>>>>       		vq->heads = NULL;
>>>>>>>>>>       		vq->dev = dev;
>>>>>>>>>> +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
>>>>>>>>>> +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
>>>>>>>>>> +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
>>>>>>>>>>       		mutex_init(&vq->mutex);
>>>>>>>>>>       		vhost_vq_reset(dev, vq);
>>>>>>>>>>       		if (vq->handle_kick)
>>>>>>>>>> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>>>>>>>>>>       	spin_unlock(&dev->iotlb_lock);
>>>>>>>>>>       }
>>>>>>>>>> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
>>>>>>>>>> +			   size_t size, int write)
>>>>>>>>>> +{
>>>>>>>>>> +	struct page **pages;
>>>>>>>>>> +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
>>>>>>>>>> +	int npinned;
>>>>>>>>>> +	void *vaddr;
>>>>>>>>>> +
>>>>>>>>>> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
>>>>>>>>>> +	if (!pages)
>>>>>>>>>> +		return -ENOMEM;
>>>>>>>>>> +
>>>>>>>>>> +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
>>>>>>>>>> +	if (npinned != npages)
>>>>>>>>>> +		goto err;
>>>>>>>>>> +
>>>>>>>>> As I said I have doubts about the whole approach, but this
>>>>>>>>> implementation in particular isn't a good idea
>>>>>>>>> as it keeps the page around forever.
>>>>>> The pages wil be released during set features.
>>>>>>
>>>>>>
>>>>>>>>> So no THP, no NUMA rebalancing,
>>>>>> For THP, we will probably miss 2 or 4 pages, but does this really matter
>>>>>> consider the gain we have?
>>>>> We as in vhost? networking isn't the only thing guest does.
>>>>> We don't even know if this guest does a lot of networking.
>>>>> You don't
>>>>> know what else is in this huge page. Can be something very important
>>>>> that guest touches all the time.
>>>> Well, the probability should be very small consider we usually give several
>>>> gigabytes to guest. The rest of the pages that doesn't sit in the same
>>>> hugepage with metadata can still be merged by THP.  Anyway, I can test the
>>>> differences.
>>> Thanks!
>>>
>>>>>> For NUMA rebalancing, I'm even not quite sure if
>>>>>> it can helps for the case of IPC (vhost). It looks to me the worst case it
>>>>>> may cause page to be thrash between nodes if vhost and userspace are running
>>>>>> in two nodes.
>>>>> So again it's a gain for vhost but has a completely unpredictable effect on
>>>>> other functionality of the guest.
>>>>>
>>>>> That's what bothers me with this approach.
>>>> So:
>>>>
>>>> - The rest of the pages could still be balanced to other nodes, no?
>>>>
>>>> - try to balance metadata pages (belongs to co-operate processes) itself is
>>>> still questionable
>>> I am not sure why. It should be easy enough to force the VCPU and vhost
>>> to move (e.g. start them pinned to 1 cpu, then pin them to another one).
>>> Clearly sometimes this would be necessary for load balancing reasons.
>>
>> Yes, but it looks to me the part of motivation of auto NUMA is to avoid
>> manual pinning.
> ... of memory. Yes.
>
>
>>> With autonuma after a while (could take seconds but it will happen) the
>>> memory will migrate.
>>>
>> Yes. As you mentioned during the discuss, I wonder we could do it similarly
>> through mmu notifier like APIC access page in commit c24ae0dcd3e ("kvm: x86:
>> Unpin and remove kvm_arch->apic_access_page")
> That would be a possible approach.


Yes, this looks possible, and the conversion seems not hard. Let me have 
a try with this.


[...]


>>>>>>> I don't see how a kthread makes any difference. We do have a validation
>>>>>>> step which makes some difference.
>>>>>> The problem is not kthread but the address of userspace address. The
>>>>>> addresses of vq metadata tends to be consistent for a while, and vhost knows
>>>>>> they will be frequently. SMAP doesn't help too much in this case.
>>>>>>
>>>>>> Thanks.
>>>>> It's true for a real life applications but a malicious one
>>>>> can call the setup ioctls any number of times. And SMAP is
>>>>> all about malcious applications.
>>>> We don't do this in the path of ioctl, there's no context switch between
>>>> userspace and kernel in the worker thread. SMAP is used to prevent kernel
>>>> from accessing userspace pages unexpectedly which is not the case for
>>>> metadata access.
>>>>
>>>> Thanks
>>> OK let's forget smap for now.
>>
>> Some numbers I measured:
>>
>> On an old Sandy bridge machine without SMAP support. Remove speculation
>> barrier boost the performance from 4.6Mpps to 5.1Mpps
>>
>> On a newer Broadwell machine with SMAP support. Remove speculation barrier
>> only gives 2%-5% improvement, disable SMAP completely through Kconfig boost
>> 57% performance from 4.8Mpps to 7.5Mpps. (Vmap gives 6Mpps - 6.1Mpps, it
>> only bypass SMAP for metadata).
>>
>> So it looks like for recent machine, SMAP becomes pain point when the copy
>> is short (e.g 64B) for high PPS.
>>
>> Thanks
> Thanks a lot for looking into this!
>
> So first of all users can just boot with nosmap, right?
> What's wrong with that?


Nothing wrong, just realize we had this kernel parameter.


>   Yes it's not fine-grained but OTOH
> it's easy to understand.
>
> And I guess this confirms that if we are going to worry
> about smap enabled, we need to look into packet copies
> too, not just meta-data.


For packet copies, we can do batch copy which is pretty simple for the 
case of XDP. I've already had patches for this.


>
> Vaguely could see a module option (off by default)
> where vhost basically does user_access_begin
> when it starts running, then uses unsafe accesses
> in vhost and tun and then user_access_end.


Using user_access_begin() is more tricky than imaged. E.g it requires:

- userspace address to be validated before through access_ok() [1]

- It doesn't support calling a function that does explicit schedule 
since SMAP/PAN state is not maintained through schedule() [2]

[1] https://lwn.net/Articles/736348/

[2] https://lkml.org/lkml/2018/11/23/430

So calling user_access_begin() all the time when vhost is running seems 
pretty dangerous.

For a better batched datacopy, I tend to build not only XDP but also skb 
in vhost in the future.

Thanks


>
>
>>>>>>>> Packet or AF_XDP benefit from
>>>>>>>> accessing metadata directly, we should do it as well.
>>>>>>>>
>>>>>>>> Thanks
Michael S. Tsirkin Dec. 30, 2018, 6:30 p.m. UTC | #15
On Thu, Dec 27, 2018 at 05:39:21PM +0800, Jason Wang wrote:
> 
> On 2018/12/26 下午11:02, Michael S. Tsirkin wrote:
> > On Wed, Dec 26, 2018 at 11:57:32AM +0800, Jason Wang wrote:
> > > On 2018/12/25 下午8:50, Michael S. Tsirkin wrote:
> > > > On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote:
> > > > > On 2018/12/25 上午2:10, Michael S. Tsirkin wrote:
> > > > > > On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote:
> > > > > > > On 2018/12/14 下午8:36, Michael S. Tsirkin wrote:
> > > > > > > > On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote:
> > > > > > > > > On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
> > > > > > > > > > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
> > > > > > > > > > > It was noticed that the copy_user() friends that was used to access
> > > > > > > > > > > virtqueue metdata tends to be very expensive for dataplane
> > > > > > > > > > > implementation like vhost since it involves lots of software check,
> > > > > > > > > > > speculation barrier, hardware feature toggling (e.g SMAP). The
> > > > > > > > > > > extra cost will be more obvious when transferring small packets.
> > > > > > > > > > > 
> > > > > > > > > > > This patch tries to eliminate those overhead by pin vq metadata pages
> > > > > > > > > > > and access them through vmap(). During SET_VRING_ADDR, we will setup
> > > > > > > > > > > those mappings and memory accessors are modified to use pointers to
> > > > > > > > > > > access the metadata directly.
> > > > > > > > > > > 
> > > > > > > > > > > Note, this was only done when device IOTLB is not enabled. We could
> > > > > > > > > > > use similar method to optimize it in the future.
> > > > > > > > > > > 
> > > > > > > > > > > Tests shows about ~24% improvement on TX PPS when using virtio-user +
> > > > > > > > > > > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
> > > > > > > > > > > 
> > > > > > > > > > > Before: ~5.0Mpps
> > > > > > > > > > > After:  ~6.1Mpps
> > > > > > > > > > > 
> > > > > > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > > > > > > > > ---
> > > > > > > > > > >       drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > > >       drivers/vhost/vhost.h |  11 +++
> > > > > > > > > > >       2 files changed, 189 insertions(+)
> > > > > > > > > > > 
> > > > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > > > > > index bafe39d2e637..1bd24203afb6 100644
> > > > > > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > > > > > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
> > > > > > > > > > >       		vq->indirect = NULL;
> > > > > > > > > > >       		vq->heads = NULL;
> > > > > > > > > > >       		vq->dev = dev;
> > > > > > > > > > > +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
> > > > > > > > > > > +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
> > > > > > > > > > > +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
> > > > > > > > > > >       		mutex_init(&vq->mutex);
> > > > > > > > > > >       		vhost_vq_reset(dev, vq);
> > > > > > > > > > >       		if (vq->handle_kick)
> > > > > > > > > > > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
> > > > > > > > > > >       	spin_unlock(&dev->iotlb_lock);
> > > > > > > > > > >       }
> > > > > > > > > > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
> > > > > > > > > > > +			   size_t size, int write)
> > > > > > > > > > > +{
> > > > > > > > > > > +	struct page **pages;
> > > > > > > > > > > +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
> > > > > > > > > > > +	int npinned;
> > > > > > > > > > > +	void *vaddr;
> > > > > > > > > > > +
> > > > > > > > > > > +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
> > > > > > > > > > > +	if (!pages)
> > > > > > > > > > > +		return -ENOMEM;
> > > > > > > > > > > +
> > > > > > > > > > > +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
> > > > > > > > > > > +	if (npinned != npages)
> > > > > > > > > > > +		goto err;
> > > > > > > > > > > +
> > > > > > > > > > As I said I have doubts about the whole approach, but this
> > > > > > > > > > implementation in particular isn't a good idea
> > > > > > > > > > as it keeps the page around forever.
> > > > > > > The pages wil be released during set features.
> > > > > > > 
> > > > > > > 
> > > > > > > > > > So no THP, no NUMA rebalancing,
> > > > > > > For THP, we will probably miss 2 or 4 pages, but does this really matter
> > > > > > > consider the gain we have?
> > > > > > We as in vhost? networking isn't the only thing guest does.
> > > > > > We don't even know if this guest does a lot of networking.
> > > > > > You don't
> > > > > > know what else is in this huge page. Can be something very important
> > > > > > that guest touches all the time.
> > > > > Well, the probability should be very small consider we usually give several
> > > > > gigabytes to guest. The rest of the pages that doesn't sit in the same
> > > > > hugepage with metadata can still be merged by THP.  Anyway, I can test the
> > > > > differences.
> > > > Thanks!
> > > > 
> > > > > > > For NUMA rebalancing, I'm even not quite sure if
> > > > > > > it can helps for the case of IPC (vhost). It looks to me the worst case it
> > > > > > > may cause page to be thrash between nodes if vhost and userspace are running
> > > > > > > in two nodes.
> > > > > > So again it's a gain for vhost but has a completely unpredictable effect on
> > > > > > other functionality of the guest.
> > > > > > 
> > > > > > That's what bothers me with this approach.
> > > > > So:
> > > > > 
> > > > > - The rest of the pages could still be balanced to other nodes, no?
> > > > > 
> > > > > - try to balance metadata pages (belongs to co-operate processes) itself is
> > > > > still questionable
> > > > I am not sure why. It should be easy enough to force the VCPU and vhost
> > > > to move (e.g. start them pinned to 1 cpu, then pin them to another one).
> > > > Clearly sometimes this would be necessary for load balancing reasons.
> > > 
> > > Yes, but it looks to me the part of motivation of auto NUMA is to avoid
> > > manual pinning.
> > ... of memory. Yes.
> > 
> > 
> > > > With autonuma after a while (could take seconds but it will happen) the
> > > > memory will migrate.
> > > > 
> > > Yes. As you mentioned during the discuss, I wonder we could do it similarly
> > > through mmu notifier like APIC access page in commit c24ae0dcd3e ("kvm: x86:
> > > Unpin and remove kvm_arch->apic_access_page")
> > That would be a possible approach.
> 
> 
> Yes, this looks possible, and the conversion seems not hard. Let me have a
> try with this.
> 
> 
> [...]
> 
> 
> > > > > > > > I don't see how a kthread makes any difference. We do have a validation
> > > > > > > > step which makes some difference.
> > > > > > > The problem is not kthread but the address of userspace address. The
> > > > > > > addresses of vq metadata tends to be consistent for a while, and vhost knows
> > > > > > > they will be frequently. SMAP doesn't help too much in this case.
> > > > > > > 
> > > > > > > Thanks.
> > > > > > It's true for a real life applications but a malicious one
> > > > > > can call the setup ioctls any number of times. And SMAP is
> > > > > > all about malcious applications.
> > > > > We don't do this in the path of ioctl, there's no context switch between
> > > > > userspace and kernel in the worker thread. SMAP is used to prevent kernel
> > > > > from accessing userspace pages unexpectedly which is not the case for
> > > > > metadata access.
> > > > > 
> > > > > Thanks
> > > > OK let's forget smap for now.
> > > 
> > > Some numbers I measured:
> > > 
> > > On an old Sandy bridge machine without SMAP support. Remove speculation
> > > barrier boost the performance from 4.6Mpps to 5.1Mpps
> > > 
> > > On a newer Broadwell machine with SMAP support. Remove speculation barrier
> > > only gives 2%-5% improvement, disable SMAP completely through Kconfig boost
> > > 57% performance from 4.8Mpps to 7.5Mpps. (Vmap gives 6Mpps - 6.1Mpps, it
> > > only bypass SMAP for metadata).
> > > 
> > > So it looks like for recent machine, SMAP becomes pain point when the copy
> > > is short (e.g 64B) for high PPS.
> > > 
> > > Thanks
> > Thanks a lot for looking into this!
> > 
> > So first of all users can just boot with nosmap, right?
> > What's wrong with that?
> 
> 
> Nothing wrong, just realize we had this kernel parameter.
> 
> 
> >   Yes it's not fine-grained but OTOH
> > it's easy to understand.
> > 
> > And I guess this confirms that if we are going to worry
> > about smap enabled, we need to look into packet copies
> > too, not just meta-data.
> 
> 
> For packet copies, we can do batch copy which is pretty simple for the case
> of XDP. I've already had patches for this.
> 
> 
> > 
> > Vaguely could see a module option (off by default)
> > where vhost basically does user_access_begin
> > when it starts running, then uses unsafe accesses
> > in vhost and tun and then user_access_end.
> 
> 
> Using user_access_begin() is more tricky than imaged. E.g it requires:
> 
> - userspace address to be validated before through access_ok() [1]

This part is fine I think - addresses come from the memory
map and when userspace supplies the memory map
we validate everything with access_ok.
Well do we validate with the iotlb too? Don't see it right now
so maybe not but it's easy to add.

> - It doesn't support calling a function that does explicit schedule since
> SMAP/PAN state is not maintained through schedule() [2]
> 
> [1] https://lwn.net/Articles/736348/
> 
> [2] https://lkml.org/lkml/2018/11/23/430
> 
> So calling user_access_begin() all the time when vhost is running seems
> pretty dangerous.

Yes it requires some rework e.g. to try getting memory with
GFP_ATOMIC. We could then do a slow path with GFP_KERNEL
if that fails.

> For a better batched datacopy, I tend to build not only XDP but also skb in
> vhost in the future.
> 
> Thanks

Sure, why not.

> 
> > 
> > 
> > > > > > > > > Packet or AF_XDP benefit from
> > > > > > > > > accessing metadata directly, we should do it as well.
> > > > > > > > > 
> > > > > > > > > Thanks
Jason Wang Jan. 2, 2019, 11:38 a.m. UTC | #16
On 2018/12/31 上午2:30, Michael S. Tsirkin wrote:
> On Thu, Dec 27, 2018 at 05:39:21PM +0800, Jason Wang wrote:
>> On 2018/12/26 下午11:02, Michael S. Tsirkin wrote:
>>> On Wed, Dec 26, 2018 at 11:57:32AM +0800, Jason Wang wrote:
>>>> On 2018/12/25 下午8:50, Michael S. Tsirkin wrote:
>>>>> On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote:
>>>>>> On 2018/12/25 上午2:10, Michael S. Tsirkin wrote:
>>>>>>> On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote:
>>>>>>>> On 2018/12/14 下午8:36, Michael S. Tsirkin wrote:
>>>>>>>>> On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote:
>>>>>>>>>> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
>>>>>>>>>>> On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
>>>>>>>>>>>> It was noticed that the copy_user() friends that was used to access
>>>>>>>>>>>> virtqueue metdata tends to be very expensive for dataplane
>>>>>>>>>>>> implementation like vhost since it involves lots of software check,
>>>>>>>>>>>> speculation barrier, hardware feature toggling (e.g SMAP). The
>>>>>>>>>>>> extra cost will be more obvious when transferring small packets.
>>>>>>>>>>>>
>>>>>>>>>>>> This patch tries to eliminate those overhead by pin vq metadata pages
>>>>>>>>>>>> and access them through vmap(). During SET_VRING_ADDR, we will setup
>>>>>>>>>>>> those mappings and memory accessors are modified to use pointers to
>>>>>>>>>>>> access the metadata directly.
>>>>>>>>>>>>
>>>>>>>>>>>> Note, this was only done when device IOTLB is not enabled. We could
>>>>>>>>>>>> use similar method to optimize it in the future.
>>>>>>>>>>>>
>>>>>>>>>>>> Tests shows about ~24% improvement on TX PPS when using virtio-user +
>>>>>>>>>>>> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
>>>>>>>>>>>>
>>>>>>>>>>>> Before: ~5.0Mpps
>>>>>>>>>>>> After:  ~6.1Mpps
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>        drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>>>>        drivers/vhost/vhost.h |  11 +++
>>>>>>>>>>>>        2 files changed, 189 insertions(+)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>>>>>>>>>> index bafe39d2e637..1bd24203afb6 100644
>>>>>>>>>>>> --- a/drivers/vhost/vhost.c
>>>>>>>>>>>> +++ b/drivers/vhost/vhost.c
>>>>>>>>>>>> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
>>>>>>>>>>>>        		vq->indirect = NULL;
>>>>>>>>>>>>        		vq->heads = NULL;
>>>>>>>>>>>>        		vq->dev = dev;
>>>>>>>>>>>> +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
>>>>>>>>>>>> +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
>>>>>>>>>>>> +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
>>>>>>>>>>>>        		mutex_init(&vq->mutex);
>>>>>>>>>>>>        		vhost_vq_reset(dev, vq);
>>>>>>>>>>>>        		if (vq->handle_kick)
>>>>>>>>>>>> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>>>>>>>>>>>>        	spin_unlock(&dev->iotlb_lock);
>>>>>>>>>>>>        }
>>>>>>>>>>>> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
>>>>>>>>>>>> +			   size_t size, int write)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +	struct page **pages;
>>>>>>>>>>>> +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
>>>>>>>>>>>> +	int npinned;
>>>>>>>>>>>> +	void *vaddr;
>>>>>>>>>>>> +
>>>>>>>>>>>> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
>>>>>>>>>>>> +	if (!pages)
>>>>>>>>>>>> +		return -ENOMEM;
>>>>>>>>>>>> +
>>>>>>>>>>>> +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
>>>>>>>>>>>> +	if (npinned != npages)
>>>>>>>>>>>> +		goto err;
>>>>>>>>>>>> +
>>>>>>>>>>> As I said I have doubts about the whole approach, but this
>>>>>>>>>>> implementation in particular isn't a good idea
>>>>>>>>>>> as it keeps the page around forever.
>>>>>>>> The pages wil be released during set features.
>>>>>>>>
>>>>>>>>
>>>>>>>>>>> So no THP, no NUMA rebalancing,
>>>>>>>> For THP, we will probably miss 2 or 4 pages, but does this really matter
>>>>>>>> consider the gain we have?
>>>>>>> We as in vhost? networking isn't the only thing guest does.
>>>>>>> We don't even know if this guest does a lot of networking.
>>>>>>> You don't
>>>>>>> know what else is in this huge page. Can be something very important
>>>>>>> that guest touches all the time.
>>>>>> Well, the probability should be very small consider we usually give several
>>>>>> gigabytes to guest. The rest of the pages that doesn't sit in the same
>>>>>> hugepage with metadata can still be merged by THP.  Anyway, I can test the
>>>>>> differences.
>>>>> Thanks!
>>>>>
>>>>>>>> For NUMA rebalancing, I'm even not quite sure if
>>>>>>>> it can helps for the case of IPC (vhost). It looks to me the worst case it
>>>>>>>> may cause page to be thrash between nodes if vhost and userspace are running
>>>>>>>> in two nodes.
>>>>>>> So again it's a gain for vhost but has a completely unpredictable effect on
>>>>>>> other functionality of the guest.
>>>>>>>
>>>>>>> That's what bothers me with this approach.
>>>>>> So:
>>>>>>
>>>>>> - The rest of the pages could still be balanced to other nodes, no?
>>>>>>
>>>>>> - try to balance metadata pages (belongs to co-operate processes) itself is
>>>>>> still questionable
>>>>> I am not sure why. It should be easy enough to force the VCPU and vhost
>>>>> to move (e.g. start them pinned to 1 cpu, then pin them to another one).
>>>>> Clearly sometimes this would be necessary for load balancing reasons.
>>>> Yes, but it looks to me the part of motivation of auto NUMA is to avoid
>>>> manual pinning.
>>> ... of memory. Yes.
>>>
>>>
>>>>> With autonuma after a while (could take seconds but it will happen) the
>>>>> memory will migrate.
>>>>>
>>>> Yes. As you mentioned during the discuss, I wonder we could do it similarly
>>>> through mmu notifier like APIC access page in commit c24ae0dcd3e ("kvm: x86:
>>>> Unpin and remove kvm_arch->apic_access_page")
>>> That would be a possible approach.
>>
>> Yes, this looks possible, and the conversion seems not hard. Let me have a
>> try with this.
>>
>>
>> [...]
>>
>>
>>>>>>>>> I don't see how a kthread makes any difference. We do have a validation
>>>>>>>>> step which makes some difference.
>>>>>>>> The problem is not kthread but the address of userspace address. The
>>>>>>>> addresses of vq metadata tends to be consistent for a while, and vhost knows
>>>>>>>> they will be frequently. SMAP doesn't help too much in this case.
>>>>>>>>
>>>>>>>> Thanks.
>>>>>>> It's true for a real life applications but a malicious one
>>>>>>> can call the setup ioctls any number of times. And SMAP is
>>>>>>> all about malcious applications.
>>>>>> We don't do this in the path of ioctl, there's no context switch between
>>>>>> userspace and kernel in the worker thread. SMAP is used to prevent kernel
>>>>>> from accessing userspace pages unexpectedly which is not the case for
>>>>>> metadata access.
>>>>>>
>>>>>> Thanks
>>>>> OK let's forget smap for now.
>>>> Some numbers I measured:
>>>>
>>>> On an old Sandy bridge machine without SMAP support. Remove speculation
>>>> barrier boost the performance from 4.6Mpps to 5.1Mpps
>>>>
>>>> On a newer Broadwell machine with SMAP support. Remove speculation barrier
>>>> only gives 2%-5% improvement, disable SMAP completely through Kconfig boost
>>>> 57% performance from 4.8Mpps to 7.5Mpps. (Vmap gives 6Mpps - 6.1Mpps, it
>>>> only bypass SMAP for metadata).
>>>>
>>>> So it looks like for recent machine, SMAP becomes pain point when the copy
>>>> is short (e.g 64B) for high PPS.
>>>>
>>>> Thanks
>>> Thanks a lot for looking into this!
>>>
>>> So first of all users can just boot with nosmap, right?
>>> What's wrong with that?
>>
>> Nothing wrong, just realize we had this kernel parameter.
>>
>>
>>>    Yes it's not fine-grained but OTOH
>>> it's easy to understand.
>>>
>>> And I guess this confirms that if we are going to worry
>>> about smap enabled, we need to look into packet copies
>>> too, not just meta-data.
>>
>> For packet copies, we can do batch copy which is pretty simple for the case
>> of XDP. I've already had patches for this.
>>
>>
>>> Vaguely could see a module option (off by default)
>>> where vhost basically does user_access_begin
>>> when it starts running, then uses unsafe accesses
>>> in vhost and tun and then user_access_end.
>>
>> Using user_access_begin() is more tricky than imaged. E.g it requires:
>>
>> - userspace address to be validated before through access_ok() [1]
> This part is fine I think - addresses come from the memory
> map and when userspace supplies the memory map
> we validate everything with access_ok.
> Well do we validate with the iotlb too? Don't see it right now
> so maybe not but it's easy to add.


Yes, it's not hard.


>
>> - It doesn't support calling a function that does explicit schedule since
>> SMAP/PAN state is not maintained through schedule() [2]
>>
>> [1] https://lwn.net/Articles/736348/
>>
>> [2] https://lkml.org/lkml/2018/11/23/430
>>
>> So calling user_access_begin() all the time when vhost is running seems
>> pretty dangerous.
> Yes it requires some rework e.g. to try getting memory with
> GFP_ATOMIC. We could then do a slow path with GFP_KERNEL
> if that fails.


I'm not sure this is the only part that needs care. Consider all the 
under layer network or block codes assumes a process context, it's not 
easy to figure out all I'm afraid. And even if we could, it's hard to 
prevent it from being added in the future.

Thanks


>
>> For a better batched datacopy, I tend to build not only XDP but also skb in
>> vhost in the future.
>>
>> Thanks
> Sure, why not.
>
>>>
>>>>>>>>>> Packet or AF_XDP benefit from
>>>>>>>>>> accessing metadata directly, we should do it as well.
>>>>>>>>>>
>>>>>>>>>> Thanks

Patch
diff mbox series

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index bafe39d2e637..1bd24203afb6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -443,6 +443,9 @@  void vhost_dev_init(struct vhost_dev *dev,
 		vq->indirect = NULL;
 		vq->heads = NULL;
 		vq->dev = dev;
+		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
+		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
+		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
 		mutex_init(&vq->mutex);
 		vhost_vq_reset(dev, vq);
 		if (vq->handle_kick)
@@ -614,6 +617,102 @@  static void vhost_clear_msg(struct vhost_dev *dev)
 	spin_unlock(&dev->iotlb_lock);
 }
 
+static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
+			   size_t size, int write)
+{
+	struct page **pages;
+	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
+	int npinned;
+	void *vaddr;
+
+	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	npinned = get_user_pages_fast(uaddr, npages, write, pages);
+	if (npinned != npages)
+		goto err;
+
+	vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
+	if (!vaddr)
+		goto err;
+
+	map->pages = pages;
+	map->addr = vaddr + (uaddr & (PAGE_SIZE - 1));
+	map->npages = npages;
+
+	return 0;
+
+err:
+	if (npinned > 0)
+		release_pages(pages, npinned);
+	kfree(pages);
+	return -EFAULT;
+}
+
+static void vhost_uninit_vmap(struct vhost_vmap *map)
+{
+	if (!map->addr)
+		return;
+
+	vunmap(map->addr);
+	release_pages(map->pages, map->npages);
+	kfree(map->pages);
+
+	map->addr = NULL;
+	map->pages = NULL;
+	map->npages = 0;
+}
+
+static void vhost_clean_vmaps(struct vhost_virtqueue *vq)
+{
+	vhost_uninit_vmap(&vq->avail_ring);
+	vhost_uninit_vmap(&vq->desc_ring);
+	vhost_uninit_vmap(&vq->used_ring);
+}
+
+static int vhost_setup_vmaps(struct vhost_virtqueue *vq, unsigned long avail,
+			     unsigned long desc, unsigned long used)
+{
+	size_t event = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+	size_t avail_size, desc_size, used_size;
+	int ret;
+
+	vhost_clean_vmaps(vq);
+
+	avail_size = sizeof(*vq->avail) +
+		     sizeof(*vq->avail->ring) * vq->num + event;
+	ret = vhost_init_vmap(&vq->avail_ring, avail, avail_size, false);
+	if (ret) {
+		vq_err(vq, "Fail to setup vmap for avail ring!\n");
+		goto err_avail;
+	}
+
+	desc_size = sizeof(*vq->desc) * vq->num;
+	ret = vhost_init_vmap(&vq->desc_ring, desc, desc_size, false);
+	if (ret) {
+		vq_err(vq, "Fail to setup vmap for desc ring!\n");
+		goto err_desc;
+	}
+
+	used_size = sizeof(*vq->used) +
+		    sizeof(*vq->used->ring) * vq->num + event;
+	ret = vhost_init_vmap(&vq->used_ring, used, used_size, true);
+	if (ret) {
+		vq_err(vq, "Fail to setup vmap for used ring!\n");
+		goto err_used;
+	}
+
+	return 0;
+
+err_used:
+	vhost_uninit_vmap(&vq->used_ring);
+err_desc:
+	vhost_uninit_vmap(&vq->avail_ring);
+err_avail:
+	return -EFAULT;
+}
+
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
@@ -626,6 +725,7 @@  void vhost_dev_cleanup(struct vhost_dev *dev)
 		if (dev->vqs[i]->call_ctx)
 			eventfd_ctx_put(dev->vqs[i]->call_ctx);
 		vhost_vq_reset(dev, dev->vqs[i]);
+		vhost_clean_vmaps(dev->vqs[i]);
 	}
 	vhost_dev_free_iovecs(dev);
 	if (dev->log_ctx)
@@ -873,6 +973,14 @@  static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 
 static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
 {
+	if (!vq->iotlb) {
+		struct vring_used *used = vq->used_ring.addr;
+
+		*((__virtio16 *)&used->ring[vq->num]) =
+			cpu_to_vhost16(vq, vq->avail_idx);
+		return 0;
+	}
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
 			      vhost_avail_event(vq));
 }
@@ -881,6 +989,13 @@  static inline int vhost_put_used(struct vhost_virtqueue *vq,
 				 struct vring_used_elem *head, int idx,
 				 int count)
 {
+	if (!vq->iotlb) {
+		struct vring_used *used = vq->used_ring.addr;
+
+		memcpy(used->ring + idx, head, count * sizeof(*head));
+		return 0;
+	}
+
 	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
 				  count * sizeof(*head));
 }
@@ -888,6 +1003,13 @@  static inline int vhost_put_used(struct vhost_virtqueue *vq,
 static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 
 {
+	if (!vq->iotlb) {
+		struct vring_used *used = vq->used_ring.addr;
+
+		used->flags = cpu_to_vhost16(vq, vq->used_flags);
+		return 0;
+	}
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
 			      &vq->used->flags);
 }
@@ -895,6 +1017,13 @@  static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 
 {
+	if (!vq->iotlb) {
+		struct vring_used *used = vq->used_ring.addr;
+
+		used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
+		return 0;
+	}
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
 			      &vq->used->idx);
 }
@@ -926,12 +1055,26 @@  static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
 				      __virtio16 *idx)
 {
+	if (!vq->iotlb) {
+		struct vring_avail *avail = vq->avail_ring.addr;
+
+		*idx = avail->idx;
+		return 0;
+	}
+
 	return vhost_get_avail(vq, *idx, &vq->avail->idx);
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 				       __virtio16 *head, int idx)
 {
+	if (!vq->iotlb) {
+		struct vring_avail *avail = vq->avail_ring.addr;
+
+		*head = avail->ring[idx & (vq->num - 1)];
+		return 0;
+	}
+
 	return vhost_get_avail(vq, *head,
 			       &vq->avail->ring[idx & (vq->num - 1)]);
 }
@@ -939,24 +1082,52 @@  static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
 					__virtio16 *flags)
 {
+	if (!vq->iotlb) {
+		struct vring_avail *avail = vq->avail_ring.addr;
+
+		*flags = avail->flags;
+		return 0;
+	}
+
 	return vhost_get_avail(vq, *flags, &vq->avail->flags);
 }
 
 static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
 				       __virtio16 *event)
 {
+	if (!vq->iotlb) {
+		struct vring_avail *avail = vq->avail_ring.addr;
+
+		*event = (__virtio16)avail->ring[vq->num];
+		return 0;
+	}
+
 	return vhost_get_avail(vq, *event, vhost_used_event(vq));
 }
 
 static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
 				     __virtio16 *idx)
 {
+	if (!vq->iotlb) {
+		struct vring_used *used = vq->used_ring.addr;
+
+		*idx = used->idx;
+		return 0;
+	}
+
 	return vhost_get_used(vq, *idx, &vq->used->idx);
 }
 
 static inline int vhost_get_desc(struct vhost_virtqueue *vq,
 				 struct vring_desc *desc, int idx)
 {
+	if (!vq->iotlb) {
+		struct vring_desc *d = vq->desc_ring.addr;
+
+		*desc = *(d + idx);
+		return 0;
+	}
+
 	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
 }
 
@@ -1551,6 +1722,13 @@  long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
 			}
 		}
 
+		if (!vq->iotlb && vhost_setup_vmaps(vq, a.avail_user_addr,
+							a.desc_user_addr,
+							a.used_user_addr)) {
+			r = -EINVAL;
+			break;
+		}
+
 		vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
 		vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
 		vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 466ef7542291..89dc0ad3d055 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -80,6 +80,12 @@  enum vhost_uaddr_type {
 	VHOST_NUM_ADDRS = 3,
 };
 
+struct vhost_vmap {
+	struct page **pages;
+	void *addr;
+	int npages;
+};
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -90,6 +96,11 @@  struct vhost_virtqueue {
 	struct vring_desc __user *desc;
 	struct vring_avail __user *avail;
 	struct vring_used __user *used;
+
+	struct vhost_vmap avail_ring;
+	struct vhost_vmap desc_ring;
+	struct vhost_vmap used_ring;
+
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
 	struct file *kick;
 	struct eventfd_ctx *call_ctx;