* [PATCH 0/2] Revert and rework on the metadata acceleration
@ 2019-09-05 12:27 Jason Wang
  2019-09-05 12:27 ` [PATCH 1/2] Revert "vhost: access vq metadata through kernel virtual address" Jason Wang
                   ` (7 more replies)
  0 siblings, 8 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-05 12:27 UTC (permalink / raw)
  To: mst, jasowang, kvm, virtualization
  Cc: netdev, linux-kernel, jgg, aarcange, jglisse, linux-mm

Hi:

Per the request from Michael and Jason, the metadata acceleration is
reverted in this version and will be reworked in the next version.

Please review.

Thanks

Jason Wang (2):
  Revert "vhost: access vq metadata through kernel virtual address"
  vhost: re-introducing metadata acceleration through kernel virtual
    address

 drivers/vhost/vhost.c | 202 +++++++++++++++++++++++++-----------------
 drivers/vhost/vhost.h |   8 +-
 2 files changed, 123 insertions(+), 87 deletions(-)

-- 
2.19.1


* [PATCH 1/2] Revert "vhost: access vq metadata through kernel virtual address"
  2019-09-05 12:27 [PATCH 0/2] Revert and rework on the metadata acceleration Jason Wang
  2019-09-05 12:27 ` [PATCH 1/2] Revert "vhost: access vq metadata through kernel virtual address" Jason Wang
@ 2019-09-05 12:27 ` Jason Wang
  2019-09-06 13:46   ` Michael S. Tsirkin
  2019-09-06 13:46   ` Michael S. Tsirkin
  2019-09-05 12:27   ` Jason Wang
                   ` (5 subsequent siblings)
  7 siblings, 2 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-05 12:27 UTC (permalink / raw)
  To: mst, jasowang, kvm, virtualization
  Cc: netdev, linux-kernel, jgg, aarcange, jglisse, linux-mm

It was reported that metadata acceleration introduces several issues,
so this patch reverts commits 7f466032dc9e5a61217f22ea34b2df932786bbfc,
73f628ec9e6bcc45b77c53fe6d0c0ec55eaf82af and
0b4a7092ffe568a55bf8f3cefdf79ff666586d91.

We will rework it in the next version.

Cc: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 515 +-----------------------------------------
 drivers/vhost/vhost.h |  41 ----
 2 files changed, 3 insertions(+), 553 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0536f8526359..791562e03fe0 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -298,160 +298,6 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 		__vhost_vq_meta_reset(d->vqs[i]);
 }
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-static void vhost_map_unprefetch(struct vhost_map *map)
-{
-	kfree(map->pages);
-	map->pages = NULL;
-	map->npages = 0;
-	map->addr = NULL;
-}
-
-static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
-{
-	struct vhost_map *map[VHOST_NUM_ADDRS];
-	int i;
-
-	spin_lock(&vq->mmu_lock);
-	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
-		map[i] = rcu_dereference_protected(vq->maps[i],
-				  lockdep_is_held(&vq->mmu_lock));
-		if (map[i])
-			rcu_assign_pointer(vq->maps[i], NULL);
-	}
-	spin_unlock(&vq->mmu_lock);
-
-	synchronize_rcu();
-
-	for (i = 0; i < VHOST_NUM_ADDRS; i++)
-		if (map[i])
-			vhost_map_unprefetch(map[i]);
-
-}
-
-static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
-{
-	int i;
-
-	vhost_uninit_vq_maps(vq);
-	for (i = 0; i < VHOST_NUM_ADDRS; i++)
-		vq->uaddrs[i].size = 0;
-}
-
-static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
-				     unsigned long start,
-				     unsigned long end)
-{
-	if (unlikely(!uaddr->size))
-		return false;
-
-	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
-}
-
-static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
-				      int index,
-				      unsigned long start,
-				      unsigned long end)
-{
-	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
-	struct vhost_map *map;
-	int i;
-
-	if (!vhost_map_range_overlap(uaddr, start, end))
-		return;
-
-	spin_lock(&vq->mmu_lock);
-	++vq->invalidate_count;
-
-	map = rcu_dereference_protected(vq->maps[index],
-					lockdep_is_held(&vq->mmu_lock));
-	if (map) {
-		if (uaddr->write) {
-			for (i = 0; i < map->npages; i++)
-				set_page_dirty(map->pages[i]);
-		}
-		rcu_assign_pointer(vq->maps[index], NULL);
-	}
-	spin_unlock(&vq->mmu_lock);
-
-	if (map) {
-		synchronize_rcu();
-		vhost_map_unprefetch(map);
-	}
-}
-
-static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
-				    int index,
-				    unsigned long start,
-				    unsigned long end)
-{
-	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
-		return;
-
-	spin_lock(&vq->mmu_lock);
-	--vq->invalidate_count;
-	spin_unlock(&vq->mmu_lock);
-}
-
-static int vhost_invalidate_range_start(struct mmu_notifier *mn,
-					const struct mmu_notifier_range *range)
-{
-	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
-					     mmu_notifier);
-	int i, j;
-
-	if (!mmu_notifier_range_blockable(range))
-		return -EAGAIN;
-
-	for (i = 0; i < dev->nvqs; i++) {
-		struct vhost_virtqueue *vq = dev->vqs[i];
-
-		for (j = 0; j < VHOST_NUM_ADDRS; j++)
-			vhost_invalidate_vq_start(vq, j,
-						  range->start,
-						  range->end);
-	}
-
-	return 0;
-}
-
-static void vhost_invalidate_range_end(struct mmu_notifier *mn,
-				       const struct mmu_notifier_range *range)
-{
-	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
-					     mmu_notifier);
-	int i, j;
-
-	for (i = 0; i < dev->nvqs; i++) {
-		struct vhost_virtqueue *vq = dev->vqs[i];
-
-		for (j = 0; j < VHOST_NUM_ADDRS; j++)
-			vhost_invalidate_vq_end(vq, j,
-						range->start,
-						range->end);
-	}
-}
-
-static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
-	.invalidate_range_start = vhost_invalidate_range_start,
-	.invalidate_range_end = vhost_invalidate_range_end,
-};
-
-static void vhost_init_maps(struct vhost_dev *dev)
-{
-	struct vhost_virtqueue *vq;
-	int i, j;
-
-	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
-
-	for (i = 0; i < dev->nvqs; ++i) {
-		vq = dev->vqs[i];
-		for (j = 0; j < VHOST_NUM_ADDRS; j++)
-			RCU_INIT_POINTER(vq->maps[j], NULL);
-	}
-}
-#endif
-
 static void vhost_vq_reset(struct vhost_dev *dev,
 			   struct vhost_virtqueue *vq)
 {
@@ -480,11 +326,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->busyloop_timeout = 0;
 	vq->umem = NULL;
 	vq->iotlb = NULL;
-	vq->invalidate_count = 0;
 	__vhost_vq_meta_reset(vq);
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	vhost_reset_vq_maps(vq);
-#endif
 }
 
 static int vhost_worker(void *data)
@@ -634,9 +476,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
 	spin_lock_init(&dev->iotlb_lock);
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	vhost_init_maps(dev);
-#endif
+
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		vq = dev->vqs[i];
@@ -645,7 +485,6 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vq->heads = NULL;
 		vq->dev = dev;
 		mutex_init(&vq->mutex);
-		spin_lock_init(&vq->mmu_lock);
 		vhost_vq_reset(dev, vq);
 		if (vq->handle_kick)
 			vhost_poll_init(&vq->poll, vq->handle_kick,
@@ -725,18 +564,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 	if (err)
 		goto err_cgroup;
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
-	if (err)
-		goto err_mmu_notifier;
-#endif
-
 	return 0;
-
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-err_mmu_notifier:
-	vhost_dev_free_iovecs(dev);
-#endif
 err_cgroup:
 	kthread_stop(worker);
 	dev->worker = NULL;
@@ -827,107 +655,6 @@ static void vhost_clear_msg(struct vhost_dev *dev)
 	spin_unlock(&dev->iotlb_lock);
 }
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
-			      int index, unsigned long uaddr,
-			      size_t size, bool write)
-{
-	struct vhost_uaddr *addr = &vq->uaddrs[index];
-
-	addr->uaddr = uaddr;
-	addr->size = size;
-	addr->write = write;
-}
-
-static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
-{
-	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
-			  (unsigned long)vq->desc,
-			  vhost_get_desc_size(vq, vq->num),
-			  false);
-	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
-			  (unsigned long)vq->avail,
-			  vhost_get_avail_size(vq, vq->num),
-			  false);
-	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
-			  (unsigned long)vq->used,
-			  vhost_get_used_size(vq, vq->num),
-			  true);
-}
-
-static int vhost_map_prefetch(struct vhost_virtqueue *vq,
-			       int index)
-{
-	struct vhost_map *map;
-	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
-	struct page **pages;
-	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
-	int npinned;
-	void *vaddr, *v;
-	int err;
-	int i;
-
-	spin_lock(&vq->mmu_lock);
-
-	err = -EFAULT;
-	if (vq->invalidate_count)
-		goto err;
-
-	err = -ENOMEM;
-	map = kmalloc(sizeof(*map), GFP_ATOMIC);
-	if (!map)
-		goto err;
-
-	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
-	if (!pages)
-		goto err_pages;
-
-	err = EFAULT;
-	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
-					uaddr->write, pages);
-	if (npinned > 0)
-		release_pages(pages, npinned);
-	if (npinned != npages)
-		goto err_gup;
-
-	for (i = 0; i < npinned; i++)
-		if (PageHighMem(pages[i]))
-			goto err_gup;
-
-	vaddr = v = page_address(pages[0]);
-
-	/* For simplicity, fallback to userspace address if VA is not
-	 * contigious.
-	 */
-	for (i = 1; i < npinned; i++) {
-		v += PAGE_SIZE;
-		if (v != page_address(pages[i]))
-			goto err_gup;
-	}
-
-	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
-	map->npages = npages;
-	map->pages = pages;
-
-	rcu_assign_pointer(vq->maps[index], map);
-	/* No need for a synchronize_rcu(). This function should be
-	 * called by dev->worker so we are serialized with all
-	 * readers.
-	 */
-	spin_unlock(&vq->mmu_lock);
-
-	return 0;
-
-err_gup:
-	kfree(pages);
-err_pages:
-	kfree(map);
-err:
-	spin_unlock(&vq->mmu_lock);
-	return err;
-}
-#endif
-
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
@@ -957,16 +684,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 		kthread_stop(dev->worker);
 		dev->worker = NULL;
 	}
-	if (dev->mm) {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-		mmu_notifier_unregister(&dev->mmu_notifier, dev->mm);
-#endif
+	if (dev->mm)
 		mmput(dev->mm);
-	}
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	for (i = 0; i < dev->nvqs; i++)
-		vhost_uninit_vq_maps(dev->vqs[i]);
-#endif
 	dev->mm = NULL;
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
@@ -1195,26 +914,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 
 static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_used *used;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
-		if (likely(map)) {
-			used = map->addr;
-			*((__virtio16 *)&used->ring[vq->num]) =
-				cpu_to_vhost16(vq, vq->avail_idx);
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
 			      vhost_avail_event(vq));
 }
@@ -1223,27 +922,6 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
 				 struct vring_used_elem *head, int idx,
 				 int count)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_used *used;
-	size_t size;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
-		if (likely(map)) {
-			used = map->addr;
-			size = count * sizeof(*head);
-			memcpy(used->ring + idx, head, size);
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
 				  count * sizeof(*head));
 }
@@ -1251,25 +929,6 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
 static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_used *used;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
-		if (likely(map)) {
-			used = map->addr;
-			used->flags = cpu_to_vhost16(vq, vq->used_flags);
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
 			      &vq->used->flags);
 }
@@ -1277,25 +936,6 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_used *used;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
-		if (likely(map)) {
-			used = map->addr;
-			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
 			      &vq->used->idx);
 }
@@ -1341,50 +981,12 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
 static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
 				      __virtio16 *idx)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_avail *avail;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
-		if (likely(map)) {
-			avail = map->addr;
-			*idx = avail->idx;
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_get_avail(vq, *idx, &vq->avail->idx);
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 				       __virtio16 *head, int idx)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_avail *avail;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
-		if (likely(map)) {
-			avail = map->addr;
-			*head = avail->ring[idx & (vq->num - 1)];
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_get_avail(vq, *head,
 			       &vq->avail->ring[idx & (vq->num - 1)]);
 }
@@ -1392,98 +994,24 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
 					__virtio16 *flags)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_avail *avail;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
-		if (likely(map)) {
-			avail = map->addr;
-			*flags = avail->flags;
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_get_avail(vq, *flags, &vq->avail->flags);
 }
 
 static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
 				       __virtio16 *event)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_avail *avail;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
-		if (likely(map)) {
-			avail = map->addr;
-			*event = (__virtio16)avail->ring[vq->num];
-			rcu_read_unlock();
-			return 0;
-		}
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_get_avail(vq, *event, vhost_used_event(vq));
 }
 
 static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
 				     __virtio16 *idx)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_used *used;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
-		if (likely(map)) {
-			used = map->addr;
-			*idx = used->idx;
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_get_used(vq, *idx, &vq->used->idx);
 }
 
 static inline int vhost_get_desc(struct vhost_virtqueue *vq,
 				 struct vring_desc *desc, int idx)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_desc *d;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
-		if (likely(map)) {
-			d = map->addr;
-			*desc = *(d + idx);
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
 }
 
@@ -1824,32 +1352,12 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
 	return true;
 }
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
-{
-	struct vhost_map __rcu *map;
-	int i;
-
-	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
-		rcu_read_lock();
-		map = rcu_dereference(vq->maps[i]);
-		rcu_read_unlock();
-		if (unlikely(!map))
-			vhost_map_prefetch(vq, i);
-	}
-}
-#endif
-
 int vq_meta_prefetch(struct vhost_virtqueue *vq)
 {
 	unsigned int num = vq->num;
 
-	if (!vq->iotlb) {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-		vhost_vq_map_prefetch(vq);
-#endif
+	if (!vq->iotlb)
 		return 1;
-	}
 
 	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
 			       vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
@@ -2060,16 +1568,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 
 	mutex_lock(&vq->mutex);
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	/* Unregister MMU notifer to allow invalidation callback
-	 * can access vq->uaddrs[] without holding a lock.
-	 */
-	if (d->mm)
-		mmu_notifier_unregister(&d->mmu_notifier, d->mm);
-
-	vhost_uninit_vq_maps(vq);
-#endif
-
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		r = vhost_vring_set_num(d, vq, argp);
@@ -2081,13 +1579,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 		BUG();
 	}
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	vhost_setup_vq_uaddr(vq);
-
-	if (d->mm)
-		mmu_notifier_register(&d->mmu_notifier, d->mm);
-#endif
-
 	mutex_unlock(&vq->mutex);
 
 	return r;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 42a8c2a13ab1..e9ed2722b633 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -12,9 +12,6 @@
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
 #include <linux/atomic.h>
-#include <linux/pagemap.h>
-#include <linux/mmu_notifier.h>
-#include <asm/cacheflush.h>
 
 struct vhost_work;
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -83,24 +80,6 @@ enum vhost_uaddr_type {
 	VHOST_NUM_ADDRS = 3,
 };
 
-struct vhost_map {
-	int npages;
-	void *addr;
-	struct page **pages;
-};
-
-struct vhost_uaddr {
-	unsigned long uaddr;
-	size_t size;
-	bool write;
-};
-
-#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
-#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
-#else
-#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
-#endif
-
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -111,22 +90,7 @@ struct vhost_virtqueue {
 	struct vring_desc __user *desc;
 	struct vring_avail __user *avail;
 	struct vring_used __user *used;
-
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	/* Read by memory accessors, modified by meta data
-	 * prefetching, MMU notifier and vring ioctl().
-	 * Synchonrized through mmu_lock (writers) and RCU (writers
-	 * and readers).
-	 */
-	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
-	/* Read by MMU notifier, modified by vring ioctl(),
-	 * synchronized through MMU notifier
-	 * registering/unregistering.
-	 */
-	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
-#endif
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
-
 	struct file *kick;
 	struct eventfd_ctx *call_ctx;
 	struct eventfd_ctx *error_ctx;
@@ -181,8 +145,6 @@ struct vhost_virtqueue {
 	bool user_be;
 #endif
 	u32 busyloop_timeout;
-	spinlock_t mmu_lock;
-	int invalidate_count;
 };
 
 struct vhost_msg_node {
@@ -196,9 +158,6 @@ struct vhost_msg_node {
 
 struct vhost_dev {
 	struct mm_struct *mm;
-#ifdef CONFIG_MMU_NOTIFIER
-	struct mmu_notifier mmu_notifier;
-#endif
 	struct mutex mutex;
 	struct vhost_virtqueue **vqs;
 	int nvqs;
-- 
2.19.1


* [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
  2019-09-05 12:27 [PATCH 0/2] Revert and rework on the metadata acceleration Jason Wang
@ 2019-09-05 12:27   ` Jason Wang
  2019-09-05 12:27 ` Jason Wang
                     ` (6 subsequent siblings)
  7 siblings, 0 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-05 12:27 UTC (permalink / raw)
  To: mst, jasowang, kvm, virtualization
  Cc: netdev, linux-kernel, jgg, aarcange, jglisse, linux-mm,
	James Bottomley, Christoph Hellwig, David Miller,
	linux-arm-kernel, linux-parisc

This is a rework of commit 7f466032dc9e ("vhost: access vq
metadata through kernel virtual address").

It was noticed that the copy_to/from_user() friends that were used to
access virtqueue metadata tend to be very expensive for dataplane
implementations like vhost, since they involve lots of software checks,
speculation barriers and hardware feature toggling (e.g. SMAP). The
extra cost becomes more obvious when transferring small packets, since
the time spent on metadata access becomes more significant.

This patch tries to eliminate those overheads by accessing the
metadata through a direct kernel mapping of those pages. Invalidation
callbacks are implemented for co-operation with general VM management
(swap, KSM, THP or NUMA balancing). We try to get the direct mapping
of the vq metadata before each round of packet processing if it
doesn't exist yet. If that fails, we simply fall back to the
copy_to/from_user() friends.
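
As a rough illustration, the fast path that the hunks below add to the
metadata accessors (e.g. vhost_get_avail_idx()) boils down to the
following simplified sketch, with the vhost_vq_access_map_begin()/end()
helpers open-coded as plain spin_lock()/spin_unlock():

static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
				      __virtio16 *idx)
{
	struct vring_avail *avail;
	struct vhost_map *map;

	if (!vq->iotlb) {
		/* Serialize against map teardown by the MMU notifier. */
		spin_lock(&vq->mmu_lock);
		map = vq->maps[VHOST_ADDR_AVAIL];
		if (likely(map)) {
			/* Fast path: read through the kernel mapping. */
			avail = map->addr;
			*idx = avail->idx;
			spin_unlock(&vq->mmu_lock);
			return 0;
		}
		spin_unlock(&vq->mmu_lock);
	}

	/* Slow path: go through the usual uaccess helpers. */
	return vhost_get_avail(vq, *idx, &vq->avail->idx);
}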

Invalidation, direct mapping access and setup are synchronized
through a spinlock. This takes a step back from the original commit
7f466032dc9e ("vhost: access vq metadata through kernel virtual
address"), which used RCU and was suspicious and hard to review. This
won't perform as well as RCU because of the atomics; that could be
addressed by future optimizations.
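
For context, the invalidation side introduced below grabs the same
mmu_lock before dropping the map. A simplified sketch of
vhost_invalidate_vq_start() (range checks and the non-blockable
-EAGAIN case omitted):

	spin_lock(&vq->mmu_lock);
	++vq->invalidate_count;
	map = vq->maps[index];
	if (map)
		vq->maps[index] = NULL;
	spin_unlock(&vq->mmu_lock);

	if (map) {
		/* Write back dirty state of the pinned pages, then free. */
		vhost_set_map_dirty(vq, map, index);
		vhost_map_unprefetch(map);
	}

Once an accessor observes vq->maps[index] == NULL it falls back to the
uaccess path, which is why dropping the map under the spinlock is
sufficient.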

This method does not work for highmem pages, which require a
temporary mapping, so we simply fall back to the normal
copy_to/from_user() path for them. It also may not work on
architectures with virtually tagged caches, since extra cache
flushing would be needed to eliminate aliases, resulting in complex
logic and bad performance. For those architectures, this patch simply
goes for the copy_to/from_user() friends; this is done by ruling out
the kernel mapping code through ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
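
The gate itself, as added to vhost.h below, is simply:

#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
#define VHOST_ARCH_CAN_ACCEL_UACCESS 1
#else
#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
#endif

so architectures that implement flush_dcache_page() (virtually tagged
caches) and kernels without CONFIG_MMU_NOTIFIER compile the
acceleration code out entirely.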

Note that this is only done when the device IOTLB is not enabled. We
could use a similar method to optimize the IOTLB case in the future.

Tests show at most about a 22% improvement in TX PPS when using
virtio-user + vhost_net + xdp1 + TAP on a 4.0GHz Kaby Lake.

        SMAP on | SMAP off
Before: 4.9Mpps | 6.9Mpps
After:  6.0Mpps | 7.5Mpps

On an older Sandy Bridge CPU without SMAP support, TX PPS doesn't see
any difference.

Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: linux-mm@kvack.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-parisc@vger.kernel.org
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vhost.c | 551 +++++++++++++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.h |  41 ++++
 2 files changed, 589 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 791562e03fe0..f98155f28f02 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -298,6 +298,182 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 		__vhost_vq_meta_reset(d->vqs[i]);
 }
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_map_unprefetch(struct vhost_map *map)
+{
+	kfree(map->pages);
+	kfree(map);
+}
+
+static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
+				struct vhost_map *map, int index)
+{
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	int i;
+
+	if (uaddr->write) {
+		for (i = 0; i < map->npages; i++)
+			set_page_dirty(map->pages[i]);
+	}
+}
+
+static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
+{
+	struct vhost_map *map[VHOST_NUM_ADDRS];
+	int i;
+
+	spin_lock(&vq->mmu_lock);
+	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
+		map[i] = vq->maps[i];
+		if (map[i]) {
+			vhost_set_map_dirty(vq, map[i], i);
+			vq->maps[i] = NULL;
+		}
+	}
+	spin_unlock(&vq->mmu_lock);
+
+	/* No need for synchronization since we are serialized with
+	 * memory accessors (e.g vq mutex held).
+	 */
+
+	for (i = 0; i < VHOST_NUM_ADDRS; i++)
+		if (map[i])
+			vhost_map_unprefetch(map[i]);
+
+}
+
+static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
+{
+	int i;
+
+	vhost_uninit_vq_maps(vq);
+	for (i = 0; i < VHOST_NUM_ADDRS; i++)
+		vq->uaddrs[i].size = 0;
+}
+
+static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
+				     unsigned long start,
+				     unsigned long end)
+{
+	if (unlikely(!uaddr->size))
+		return false;
+
+	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
+}
+
+static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
+{
+	spin_lock(&vq->mmu_lock);
+}
+
+static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
+{
+	spin_unlock(&vq->mmu_lock);
+}
+
+static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
+				     int index,
+				     unsigned long start,
+				     unsigned long end,
+				     bool blockable)
+{
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	struct vhost_map *map;
+
+	if (!vhost_map_range_overlap(uaddr, start, end))
+		return 0;
+	else if (!blockable)
+		return -EAGAIN;
+
+	spin_lock(&vq->mmu_lock);
+	++vq->invalidate_count;
+
+	map = vq->maps[index];
+	if (map)
+		vq->maps[index] = NULL;
+	spin_unlock(&vq->mmu_lock);
+
+	if (map) {
+		vhost_set_map_dirty(vq, map, index);
+		vhost_map_unprefetch(map);
+	}
+
+	return 0;
+}
+
+static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
+				    int index,
+				    unsigned long start,
+				    unsigned long end)
+{
+	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
+		return;
+
+	spin_lock(&vq->mmu_lock);
+	--vq->invalidate_count;
+	spin_unlock(&vq->mmu_lock);
+}
+
+static int vhost_invalidate_range_start(struct mmu_notifier *mn,
+					const struct mmu_notifier_range *range)
+{
+	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+					     mmu_notifier);
+	bool blockable = mmu_notifier_range_blockable(range);
+	int i, j, ret;
+
+	for (i = 0; i < dev->nvqs; i++) {
+		struct vhost_virtqueue *vq = dev->vqs[i];
+
+		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
+			ret = vhost_invalidate_vq_start(vq, j,
+							range->start,
+							range->end, blockable);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void vhost_invalidate_range_end(struct mmu_notifier *mn,
+				       const struct mmu_notifier_range *range)
+{
+	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+					     mmu_notifier);
+	int i, j;
+
+	for (i = 0; i < dev->nvqs; i++) {
+		struct vhost_virtqueue *vq = dev->vqs[i];
+
+		for (j = 0; j < VHOST_NUM_ADDRS; j++)
+			vhost_invalidate_vq_end(vq, j,
+						range->start,
+						range->end);
+	}
+}
+
+static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
+	.invalidate_range_start = vhost_invalidate_range_start,
+	.invalidate_range_end = vhost_invalidate_range_end,
+};
+
+static void vhost_init_maps(struct vhost_dev *dev)
+{
+	struct vhost_virtqueue *vq;
+	int i, j;
+
+	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
+
+	for (i = 0; i < dev->nvqs; ++i) {
+		vq = dev->vqs[i];
+		for (j = 0; j < VHOST_NUM_ADDRS; j++)
+			vq->maps[j] = NULL;
+	}
+}
+#endif
+
 static void vhost_vq_reset(struct vhost_dev *dev,
 			   struct vhost_virtqueue *vq)
 {
@@ -326,7 +502,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->busyloop_timeout = 0;
 	vq->umem = NULL;
 	vq->iotlb = NULL;
+	vq->invalidate_count = 0;
 	__vhost_vq_meta_reset(vq);
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	vhost_reset_vq_maps(vq);
+#endif
 }
 
 static int vhost_worker(void *data)
@@ -471,12 +651,15 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->iov_limit = iov_limit;
 	dev->weight = weight;
 	dev->byte_weight = byte_weight;
+	dev->has_notifier = false;
 	init_llist_head(&dev->work_list);
 	init_waitqueue_head(&dev->wait);
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
 	spin_lock_init(&dev->iotlb_lock);
-
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	vhost_init_maps(dev);
+#endif
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		vq = dev->vqs[i];
@@ -485,6 +668,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vq->heads = NULL;
 		vq->dev = dev;
 		mutex_init(&vq->mutex);
+		spin_lock_init(&vq->mmu_lock);
 		vhost_vq_reset(dev, vq);
 		if (vq->handle_kick)
 			vhost_poll_init(&vq->poll, vq->handle_kick,
@@ -564,7 +748,19 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 	if (err)
 		goto err_cgroup;
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
+	if (err)
+		goto err_mmu_notifier;
+#endif
+	dev->has_notifier = true;
+
 	return 0;
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+err_mmu_notifier:
+	vhost_dev_free_iovecs(dev);
+#endif
 err_cgroup:
 	kthread_stop(worker);
 	dev->worker = NULL;
@@ -655,6 +851,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
 	spin_unlock(&dev->iotlb_lock);
 }
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
+			      int index, unsigned long uaddr,
+			      size_t size, bool write)
+{
+	struct vhost_uaddr *addr = &vq->uaddrs[index];
+
+	addr->uaddr = uaddr;
+	addr->size = size;
+	addr->write = write;
+}
+
+static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
+{
+	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
+			  (unsigned long)vq->desc,
+			  vhost_get_desc_size(vq, vq->num),
+			  false);
+	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
+			  (unsigned long)vq->avail,
+			  vhost_get_avail_size(vq, vq->num),
+			  false);
+	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
+			  (unsigned long)vq->used,
+			  vhost_get_used_size(vq, vq->num),
+			  true);
+}
+
+static int vhost_map_prefetch(struct vhost_virtqueue *vq,
+			       int index)
+{
+	struct vhost_map *map;
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	struct page **pages;
+	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
+	int npinned;
+	void *vaddr, *v;
+	int err;
+	int i;
+
+	spin_lock(&vq->mmu_lock);
+
+	err = -EFAULT;
+	if (vq->invalidate_count)
+		goto err;
+
+	err = -ENOMEM;
+	map = kmalloc(sizeof(*map), GFP_ATOMIC);
+	if (!map)
+		goto err;
+
+	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
+	if (!pages)
+		goto err_pages;
+
+	err = EFAULT;
+	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
+					uaddr->write, pages);
+	if (npinned > 0)
+		release_pages(pages, npinned);
+	if (npinned != npages)
+		goto err_gup;
+
+	for (i = 0; i < npinned; i++)
+		if (PageHighMem(pages[i]))
+			goto err_gup;
+
+	vaddr = v = page_address(pages[0]);
+
+	/* For simplicity, fallback to userspace address if VA is not
+	 * contigious.
+	 */
+	for (i = 1; i < npinned; i++) {
+		v += PAGE_SIZE;
+		if (v != page_address(pages[i]))
+			goto err_gup;
+	}
+
+	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
+	map->npages = npages;
+	map->pages = pages;
+
+	vq->maps[index] = map;
+	/* No need for a synchronize_rcu(). This function should be
+	 * called by dev->worker so we are serialized with all
+	 * readers.
+	 */
+	spin_unlock(&vq->mmu_lock);
+
+	return 0;
+
+err_gup:
+	kfree(pages);
+err_pages:
+	kfree(map);
+err:
+	spin_unlock(&vq->mmu_lock);
+	return err;
+}
+#endif
+
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
@@ -684,8 +981,20 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 		kthread_stop(dev->worker);
 		dev->worker = NULL;
 	}
-	if (dev->mm)
+	if (dev->mm) {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+		if (dev->has_notifier) {
+			mmu_notifier_unregister(&dev->mmu_notifier,
+						dev->mm);
+			dev->has_notifier = false;
+		}
+#endif
 		mmput(dev->mm);
+	}
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	for (i = 0; i < dev->nvqs; i++)
+		vhost_uninit_vq_maps(dev->vqs[i]);
+#endif
 	dev->mm = NULL;
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
@@ -914,6 +1223,26 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 
 static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			*((__virtio16 *)&used->ring[vq->num]) =
+				cpu_to_vhost16(vq, vq->avail_idx);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
 			      vhost_avail_event(vq));
 }
@@ -922,6 +1251,27 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
 				 struct vring_used_elem *head, int idx,
 				 int count)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+	size_t size;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			size = count * sizeof(*head);
+			memcpy(used->ring + idx, head, size);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
 				  count * sizeof(*head));
 }
@@ -929,6 +1279,25 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
 static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			used->flags = cpu_to_vhost16(vq, vq->used_flags);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
 			      &vq->used->flags);
 }
@@ -936,6 +1305,25 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
 			      &vq->used->idx);
 }
@@ -981,12 +1369,50 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
 static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
 				      __virtio16 *idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*idx = avail->idx;
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *idx, &vq->avail->idx);
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 				       __virtio16 *head, int idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*head = avail->ring[idx & (vq->num - 1)];
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *head,
 			       &vq->avail->ring[idx & (vq->num - 1)]);
 }
@@ -994,24 +1420,98 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
 					__virtio16 *flags)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*flags = avail->flags;
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *flags, &vq->avail->flags);
 }
 
 static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
 				       __virtio16 *event)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*event = (__virtio16)avail->ring[vq->num];
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *event, vhost_used_event(vq));
 }
 
 static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
 				     __virtio16 *idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			*idx = used->idx;
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_used(vq, *idx, &vq->used->idx);
 }
 
 static inline int vhost_get_desc(struct vhost_virtqueue *vq,
 				 struct vring_desc *desc, int idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_desc *d;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_DESC];
+		if (likely(map)) {
+			d = map->addr;
+			*desc = *(d + idx);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
 }
 
@@ -1352,12 +1852,30 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
 	return true;
 }
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
+{
+	struct vhost_map *map;
+	int i;
+
+	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
+		map = vq->maps[i];
+		if (unlikely(!map))
+			vhost_map_prefetch(vq, i);
+	}
+}
+#endif
+
 int vq_meta_prefetch(struct vhost_virtqueue *vq)
 {
 	unsigned int num = vq->num;
 
-	if (!vq->iotlb)
+	if (!vq->iotlb) {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+		vhost_vq_map_prefetch(vq);
+#endif
 		return 1;
+	}
 
 	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
 			       vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
@@ -1568,6 +2086,22 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 
 	mutex_lock(&vq->mutex);
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	/* Unregister the MMU notifier so that the invalidation callback
+	 * can access vq->uaddrs[] without holding a lock.
+	 */
+	if (d->has_notifier) {
+		mmu_notifier_unregister(&d->mmu_notifier, d->mm);
+		d->has_notifier = false;
+	}
+
+	/* reset invalidate_count in case we are in the middle of
+	 * invalidate_start() and invalidate_end().
+	 */
+	vq->invalidate_count = 0;
+	vhost_uninit_vq_maps(vq);
+#endif
+
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		r = vhost_vring_set_num(d, vq, argp);
@@ -1579,6 +2113,17 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 		BUG();
 	}
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	if (r == 0)
+		vhost_setup_vq_uaddr(vq);
+
+	if (d->mm) {
+		r = mmu_notifier_register(&d->mmu_notifier, d->mm);
+		if (!r)
+			d->has_notifier = true;
+	}
+#endif
+
 	mutex_unlock(&vq->mutex);
 
 	return r;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index e9ed2722b633..85e97e0f77f5 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -12,6 +12,9 @@
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
 #include <linux/atomic.h>
+#include <linux/pagemap.h>
+#include <linux/mmu_notifier.h>
+#include <asm/cacheflush.h>
 
 struct vhost_work;
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -80,6 +83,24 @@ enum vhost_uaddr_type {
 	VHOST_NUM_ADDRS = 3,
 };
 
+struct vhost_map {
+	int npages;
+	void *addr;
+	struct page **pages;
+};
+
+struct vhost_uaddr {
+	unsigned long uaddr;
+	size_t size;
+	bool write;
+};
+
+#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
+#define VHOST_ARCH_CAN_ACCEL_UACCESS 1
+#else
+#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
+#endif
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -90,7 +111,21 @@ struct vhost_virtqueue {
 	struct vring_desc __user *desc;
 	struct vring_avail __user *avail;
 	struct vring_used __user *used;
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	/* Read by memory accessors, modified by metadata
+	 * prefetching, the MMU notifier and vring ioctl().
+	 * Synchronized through mmu_lock.
+	 */
+	struct vhost_map *maps[VHOST_NUM_ADDRS];
+	/* Read by MMU notifier, modified by vring ioctl(),
+	 * synchronized through MMU notifier
+	 * registering/unregistering.
+	 */
+	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
+#endif
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
+
 	struct file *kick;
 	struct eventfd_ctx *call_ctx;
 	struct eventfd_ctx *error_ctx;
@@ -145,6 +180,8 @@ struct vhost_virtqueue {
 	bool user_be;
 #endif
 	u32 busyloop_timeout;
+	spinlock_t mmu_lock;
+	int invalidate_count;
 };
 
 struct vhost_msg_node {
@@ -158,6 +195,9 @@ struct vhost_msg_node {
 
 struct vhost_dev {
 	struct mm_struct *mm;
+#ifdef CONFIG_MMU_NOTIFIER
+	struct mmu_notifier mmu_notifier;
+#endif
 	struct mutex mutex;
 	struct vhost_virtqueue **vqs;
 	int nvqs;
@@ -173,6 +213,7 @@ struct vhost_dev {
 	int iov_limit;
 	int weight;
 	int byte_weight;
+	bool has_notifier;
 };
 
 bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
  2019-09-05 12:27 [PATCH 0/2] Revert and rework on the metadata accelreation Jason Wang
                   ` (2 preceding siblings ...)
  2019-09-05 12:27   ` Jason Wang
@ 2019-09-05 12:27 ` Jason Wang
  2019-09-05 13:59 ` [PATCH 0/2] Revert and rework on the metadata accelreation Jason Gunthorpe
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-05 12:27 UTC (permalink / raw)
  To: mst, jasowang, kvm, virtualization
  Cc: aarcange, Christoph Hellwig, linux-parisc, netdev, linux-kernel,
	James Bottomley, linux-mm, jglisse, jgg, David Miller,
	linux-arm-kernel

This is a rework of commit 7f466032dc9e ("vhost: access vq
metadata through kernel virtual address").

It was noticed that the copy_to/from_user() friends that are used to
access virtqueue metadata tend to be very expensive for a dataplane
implementation like vhost, since they involve lots of software checks,
speculation barriers and hardware feature toggling (e.g. SMAP). The
extra cost is more obvious when transferring small packets, since the
time spent on metadata access becomes more significant.

This patch tries to eliminate those overheads by accessing the
metadata through a direct kernel mapping of the pages that back it.
Invalidation callbacks are implemented for co-operation with general
VM management (swap, KSM, THP or NUMA balancing). We try to establish
the direct mapping of the vq metadata before each round of packet
processing if it does not exist yet. If that fails, we simply fall
back to the copy_to/from_user() friends.
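
Each metadata accessor below follows the same shape: try the kernel
mapping under vq->mmu_lock first, and fall back to uaccess when the
map is not there. A simplified sketch of the avail index read, with
the vhost_vq_access_map_begin()/end() helpers inlined:

static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
				      __virtio16 *idx)
{
	struct vhost_map *map;
	struct vring_avail *avail;

	if (!vq->iotlb) {
		spin_lock(&vq->mmu_lock);
		map = vq->maps[VHOST_ADDR_AVAIL];
		if (likely(map)) {
			/* Fast path: read through the kernel mapping. */
			avail = map->addr;
			*idx = avail->idx;
			spin_unlock(&vq->mmu_lock);
			return 0;
		}
		spin_unlock(&vq->mmu_lock);
	}

	/* Slow path: ordinary userspace access with checks/barriers. */
	return vhost_get_avail(vq, *idx, &vq->avail->idx);
}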

Invalidation, direct mapping access and map setup are synchronized
through a spinlock. This takes a step back from the original commit
7f466032dc9e ("vhost: access vq metadata through kernel virtual
address"), which tried to use RCU in a way that was suspicious and
hard to review. This won't perform as well as RCU because of the
atomic, but that could be addressed by future optimization.
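
In sketch form, the notifier's start/end callbacks and the prefetch
path coordinate through vq->mmu_lock and vq->invalidate_count
(condensed from vhost_invalidate_vq_start()/_end() and
vhost_map_prefetch() below):

	/* invalidate_range_start(): tear down the map, block prefetching */
	spin_lock(&vq->mmu_lock);
	++vq->invalidate_count;
	map = vq->maps[index];
	vq->maps[index] = NULL;		/* accessors fall back to uaccess */
	spin_unlock(&vq->mmu_lock);
	if (map) {
		vhost_set_map_dirty(vq, map, index);
		vhost_map_unprefetch(map);
	}

	/* invalidate_range_end(): allow prefetching again */
	spin_lock(&vq->mmu_lock);
	--vq->invalidate_count;
	spin_unlock(&vq->mmu_lock);

	/* vhost_map_prefetch(): refuse to re-establish a mapping while
	 * an invalidation is still in flight.
	 */
	spin_lock(&vq->mmu_lock);
	if (vq->invalidate_count) {
		spin_unlock(&vq->mmu_lock);
		return -EFAULT;
	}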

This method does not work for highmem pages, which require a
temporary mapping, so we simply fall back to the normal
copy_to/from_user() there. It is also unsuitable for architectures
with virtually tagged caches, since extra cache flushing would be
needed to eliminate the alias, which would result in complex logic
and bad performance. For those architectures, this patch simply keeps
using the copy_to/from_user() friends. This is done by ruling out the
kernel mapping code through ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
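
Both exclusions are explicit in the patch: a compile-time guard in
vhost.h plus a run-time PageHighMem() check in vhost_map_prefetch():

#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
#define VHOST_ARCH_CAN_ACCEL_UACCESS 1
#else
#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
#endif

	/* in vhost_map_prefetch(): highmem pages keep the uaccess path */
	for (i = 0; i < npinned; i++)
		if (PageHighMem(pages[i]))
			goto err_gup;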

Note that this is only done when the device IOTLB is not enabled. We
could use a similar method to optimize the IOTLB path in the future.

Tests show at most about a 22% improvement in TX PPS when using
virtio-user + vhost_net + xdp1 + TAP on a 4.0GHz Kaby Lake.

        SMAP on | SMAP off
Before: 4.9Mpps | 6.9Mpps
After:  6.0Mpps | 7.5Mpps

On an older Sandy Bridge CPU without SMAP support, TX PPS doesn't see
any difference.

Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: linux-mm@kvack.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-parisc@vger.kernel.org
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vhost.c | 551 +++++++++++++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.h |  41 ++++
 2 files changed, 589 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 791562e03fe0..f98155f28f02 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -298,6 +298,182 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 		__vhost_vq_meta_reset(d->vqs[i]);
 }
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_map_unprefetch(struct vhost_map *map)
+{
+	kfree(map->pages);
+	kfree(map);
+}
+
+static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
+				struct vhost_map *map, int index)
+{
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	int i;
+
+	if (uaddr->write) {
+		for (i = 0; i < map->npages; i++)
+			set_page_dirty(map->pages[i]);
+	}
+}
+
+static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
+{
+	struct vhost_map *map[VHOST_NUM_ADDRS];
+	int i;
+
+	spin_lock(&vq->mmu_lock);
+	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
+		map[i] = vq->maps[i];
+		if (map[i]) {
+			vhost_set_map_dirty(vq, map[i], i);
+			vq->maps[i] = NULL;
+		}
+	}
+	spin_unlock(&vq->mmu_lock);
+
+	/* No need for synchronization since we are serialized with
+	 * memory accessors (e.g vq mutex held).
+	 */
+
+	for (i = 0; i < VHOST_NUM_ADDRS; i++)
+		if (map[i])
+			vhost_map_unprefetch(map[i]);
+
+}
+
+static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
+{
+	int i;
+
+	vhost_uninit_vq_maps(vq);
+	for (i = 0; i < VHOST_NUM_ADDRS; i++)
+		vq->uaddrs[i].size = 0;
+}
+
+static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
+				     unsigned long start,
+				     unsigned long end)
+{
+	if (unlikely(!uaddr->size))
+		return false;
+
+	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
+}
+
+static inline void vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
+{
+	spin_lock(&vq->mmu_lock);
+}
+
+static inline void vhost_vq_access_map_end(struct vhost_virtqueue *vq)
+{
+	spin_unlock(&vq->mmu_lock);
+}
+
+static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
+				     int index,
+				     unsigned long start,
+				     unsigned long end,
+				     bool blockable)
+{
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	struct vhost_map *map;
+
+	if (!vhost_map_range_overlap(uaddr, start, end))
+		return 0;
+	else if (!blockable)
+		return -EAGAIN;
+
+	spin_lock(&vq->mmu_lock);
+	++vq->invalidate_count;
+
+	map = vq->maps[index];
+	if (map)
+		vq->maps[index] = NULL;
+	spin_unlock(&vq->mmu_lock);
+
+	if (map) {
+		vhost_set_map_dirty(vq, map, index);
+		vhost_map_unprefetch(map);
+	}
+
+	return 0;
+}
+
+static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
+				    int index,
+				    unsigned long start,
+				    unsigned long end)
+{
+	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
+		return;
+
+	spin_lock(&vq->mmu_lock);
+	--vq->invalidate_count;
+	spin_unlock(&vq->mmu_lock);
+}
+
+static int vhost_invalidate_range_start(struct mmu_notifier *mn,
+					const struct mmu_notifier_range *range)
+{
+	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+					     mmu_notifier);
+	bool blockable = mmu_notifier_range_blockable(range);
+	int i, j, ret;
+
+	for (i = 0; i < dev->nvqs; i++) {
+		struct vhost_virtqueue *vq = dev->vqs[i];
+
+		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
+			ret = vhost_invalidate_vq_start(vq, j,
+							range->start,
+							range->end, blockable);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void vhost_invalidate_range_end(struct mmu_notifier *mn,
+				       const struct mmu_notifier_range *range)
+{
+	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+					     mmu_notifier);
+	int i, j;
+
+	for (i = 0; i < dev->nvqs; i++) {
+		struct vhost_virtqueue *vq = dev->vqs[i];
+
+		for (j = 0; j < VHOST_NUM_ADDRS; j++)
+			vhost_invalidate_vq_end(vq, j,
+						range->start,
+						range->end);
+	}
+}
+
+static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
+	.invalidate_range_start = vhost_invalidate_range_start,
+	.invalidate_range_end = vhost_invalidate_range_end,
+};
+
+static void vhost_init_maps(struct vhost_dev *dev)
+{
+	struct vhost_virtqueue *vq;
+	int i, j;
+
+	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
+
+	for (i = 0; i < dev->nvqs; ++i) {
+		vq = dev->vqs[i];
+		for (j = 0; j < VHOST_NUM_ADDRS; j++)
+			vq->maps[j] = NULL;
+	}
+}
+#endif
+
 static void vhost_vq_reset(struct vhost_dev *dev,
 			   struct vhost_virtqueue *vq)
 {
@@ -326,7 +502,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->busyloop_timeout = 0;
 	vq->umem = NULL;
 	vq->iotlb = NULL;
+	vq->invalidate_count = 0;
 	__vhost_vq_meta_reset(vq);
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	vhost_reset_vq_maps(vq);
+#endif
 }
 
 static int vhost_worker(void *data)
@@ -471,12 +651,15 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->iov_limit = iov_limit;
 	dev->weight = weight;
 	dev->byte_weight = byte_weight;
+	dev->has_notifier = false;
 	init_llist_head(&dev->work_list);
 	init_waitqueue_head(&dev->wait);
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
 	spin_lock_init(&dev->iotlb_lock);
-
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	vhost_init_maps(dev);
+#endif
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		vq = dev->vqs[i];
@@ -485,6 +668,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vq->heads = NULL;
 		vq->dev = dev;
 		mutex_init(&vq->mutex);
+		spin_lock_init(&vq->mmu_lock);
 		vhost_vq_reset(dev, vq);
 		if (vq->handle_kick)
 			vhost_poll_init(&vq->poll, vq->handle_kick,
@@ -564,7 +748,19 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 	if (err)
 		goto err_cgroup;
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
+	if (err)
+		goto err_mmu_notifier;
+#endif
+	dev->has_notifier = true;
+
 	return 0;
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+err_mmu_notifier:
+	vhost_dev_free_iovecs(dev);
+#endif
 err_cgroup:
 	kthread_stop(worker);
 	dev->worker = NULL;
@@ -655,6 +851,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
 	spin_unlock(&dev->iotlb_lock);
 }
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
+			      int index, unsigned long uaddr,
+			      size_t size, bool write)
+{
+	struct vhost_uaddr *addr = &vq->uaddrs[index];
+
+	addr->uaddr = uaddr;
+	addr->size = size;
+	addr->write = write;
+}
+
+static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
+{
+	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
+			  (unsigned long)vq->desc,
+			  vhost_get_desc_size(vq, vq->num),
+			  false);
+	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
+			  (unsigned long)vq->avail,
+			  vhost_get_avail_size(vq, vq->num),
+			  false);
+	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
+			  (unsigned long)vq->used,
+			  vhost_get_used_size(vq, vq->num),
+			  true);
+}
+
+static int vhost_map_prefetch(struct vhost_virtqueue *vq,
+			       int index)
+{
+	struct vhost_map *map;
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	struct page **pages;
+	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
+	int npinned;
+	void *vaddr, *v;
+	int err;
+	int i;
+
+	spin_lock(&vq->mmu_lock);
+
+	err = -EFAULT;
+	if (vq->invalidate_count)
+		goto err;
+
+	err = -ENOMEM;
+	map = kmalloc(sizeof(*map), GFP_ATOMIC);
+	if (!map)
+		goto err;
+
+	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
+	if (!pages)
+		goto err_pages;
+
+	err = -EFAULT;
+	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
+					uaddr->write, pages);
+	if (npinned > 0)
+		release_pages(pages, npinned);
+	if (npinned != npages)
+		goto err_gup;
+
+	for (i = 0; i < npinned; i++)
+		if (PageHighMem(pages[i]))
+			goto err_gup;
+
+	vaddr = v = page_address(pages[0]);
+
+	/* For simplicity, fall back to the userspace address if the VA is
+	 * not contiguous.
+	 */
+	for (i = 1; i < npinned; i++) {
+		v += PAGE_SIZE;
+		if (v != page_address(pages[i]))
+			goto err_gup;
+	}
+
+	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
+	map->npages = npages;
+	map->pages = pages;
+
+	vq->maps[index] = map;
+	/* No need for a synchronize_rcu(). This function should be
+	 * called by dev->worker so we are serialized with all
+	 * readers.
+	 */
+	spin_unlock(&vq->mmu_lock);
+
+	return 0;
+
+err_gup:
+	kfree(pages);
+err_pages:
+	kfree(map);
+err:
+	spin_unlock(&vq->mmu_lock);
+	return err;
+}
+#endif
+
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
@@ -684,8 +981,20 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 		kthread_stop(dev->worker);
 		dev->worker = NULL;
 	}
-	if (dev->mm)
+	if (dev->mm) {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+		if (dev->has_notifier) {
+			mmu_notifier_unregister(&dev->mmu_notifier,
+						dev->mm);
+			dev->has_notifier = false;
+		}
+#endif
 		mmput(dev->mm);
+	}
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	for (i = 0; i < dev->nvqs; i++)
+		vhost_uninit_vq_maps(dev->vqs[i]);
+#endif
 	dev->mm = NULL;
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
@@ -914,6 +1223,26 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 
 static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			*((__virtio16 *)&used->ring[vq->num]) =
+				cpu_to_vhost16(vq, vq->avail_idx);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
 			      vhost_avail_event(vq));
 }
@@ -922,6 +1251,27 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
 				 struct vring_used_elem *head, int idx,
 				 int count)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+	size_t size;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			size = count * sizeof(*head);
+			memcpy(used->ring + idx, head, size);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
 				  count * sizeof(*head));
 }
@@ -929,6 +1279,25 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
 static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			used->flags = cpu_to_vhost16(vq, vq->used_flags);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
 			      &vq->used->flags);
 }
@@ -936,6 +1305,25 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
 			      &vq->used->idx);
 }
@@ -981,12 +1369,50 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
 static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
 				      __virtio16 *idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*idx = avail->idx;
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *idx, &vq->avail->idx);
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 				       __virtio16 *head, int idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*head = avail->ring[idx & (vq->num - 1)];
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *head,
 			       &vq->avail->ring[idx & (vq->num - 1)]);
 }
@@ -994,24 +1420,98 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
 					__virtio16 *flags)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*flags = avail->flags;
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *flags, &vq->avail->flags);
 }
 
 static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
 				       __virtio16 *event)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*event = (__virtio16)avail->ring[vq->num];
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *event, vhost_used_event(vq));
 }
 
 static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
 				     __virtio16 *idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			*idx = used->idx;
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_used(vq, *idx, &vq->used->idx);
 }
 
 static inline int vhost_get_desc(struct vhost_virtqueue *vq,
 				 struct vring_desc *desc, int idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_desc *d;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_DESC];
+		if (likely(map)) {
+			d = map->addr;
+			*desc = *(d + idx);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
 }
 
@@ -1352,12 +1852,30 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
 	return true;
 }
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
+{
+	struct vhost_map *map;
+	int i;
+
+	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
+		map = vq->maps[i];
+		if (unlikely(!map))
+			vhost_map_prefetch(vq, i);
+	}
+}
+#endif
+
 int vq_meta_prefetch(struct vhost_virtqueue *vq)
 {
 	unsigned int num = vq->num;
 
-	if (!vq->iotlb)
+	if (!vq->iotlb) {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+		vhost_vq_map_prefetch(vq);
+#endif
 		return 1;
+	}
 
 	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
 			       vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
@@ -1568,6 +2086,22 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 
 	mutex_lock(&vq->mutex);
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	/* Unregister the MMU notifier so that the invalidation callback
+	 * can access vq->uaddrs[] without holding a lock.
+	 */
+	if (d->has_notifier) {
+		mmu_notifier_unregister(&d->mmu_notifier, d->mm);
+		d->has_notifier = false;
+	}
+
+	/* reset invalidate_count in case we are in the middle of
+	 * invalidate_start() and invalidate_end().
+	 */
+	vq->invalidate_count = 0;
+	vhost_uninit_vq_maps(vq);
+#endif
+
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		r = vhost_vring_set_num(d, vq, argp);
@@ -1579,6 +2113,17 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 		BUG();
 	}
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	if (r == 0)
+		vhost_setup_vq_uaddr(vq);
+
+	if (d->mm) {
+		r = mmu_notifier_register(&d->mmu_notifier, d->mm);
+		if (!r)
+			d->has_notifier = true;
+	}
+#endif
+
 	mutex_unlock(&vq->mutex);
 
 	return r;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index e9ed2722b633..85e97e0f77f5 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -12,6 +12,9 @@
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
 #include <linux/atomic.h>
+#include <linux/pagemap.h>
+#include <linux/mmu_notifier.h>
+#include <asm/cacheflush.h>
 
 struct vhost_work;
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -80,6 +83,24 @@ enum vhost_uaddr_type {
 	VHOST_NUM_ADDRS = 3,
 };
 
+struct vhost_map {
+	int npages;
+	void *addr;
+	struct page **pages;
+};
+
+struct vhost_uaddr {
+	unsigned long uaddr;
+	size_t size;
+	bool write;
+};
+
+#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
+#define VHOST_ARCH_CAN_ACCEL_UACCESS 1
+#else
+#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
+#endif
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -90,7 +111,21 @@ struct vhost_virtqueue {
 	struct vring_desc __user *desc;
 	struct vring_avail __user *avail;
 	struct vring_used __user *used;
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	/* Read by memory accessors, modified by metadata
+	 * prefetching, the MMU notifier and vring ioctl().
+	 * Synchronized through mmu_lock.
+	 */
+	 */
+	struct vhost_map *maps[VHOST_NUM_ADDRS];
+	/* Read by MMU notifier, modified by vring ioctl(),
+	 * synchronized through MMU notifier
+	 * registering/unregistering.
+	 */
+	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
+#endif
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
+
 	struct file *kick;
 	struct eventfd_ctx *call_ctx;
 	struct eventfd_ctx *error_ctx;
@@ -145,6 +180,8 @@ struct vhost_virtqueue {
 	bool user_be;
 #endif
 	u32 busyloop_timeout;
+	spinlock_t mmu_lock;
+	int invalidate_count;
 };
 
 struct vhost_msg_node {
@@ -158,6 +195,9 @@ struct vhost_msg_node {
 
 struct vhost_dev {
 	struct mm_struct *mm;
+#ifdef CONFIG_MMU_NOTIFIER
+	struct mmu_notifier mmu_notifier;
+#endif
 	struct mutex mutex;
 	struct vhost_virtqueue **vqs;
 	int nvqs;
@@ -173,6 +213,7 @@ struct vhost_dev {
 	int iov_limit;
 	int weight;
 	int byte_weight;
+	bool has_notifier;
 };
 
 bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
-- 
2.19.1

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [PATCH 0/2] Revert and rework on the metadata accelreation
  2019-09-05 12:27 [PATCH 0/2] Revert and rework on the metadata accelreation Jason Wang
@ 2019-09-05 13:59   ` Jason Gunthorpe
  2019-09-05 12:27 ` Jason Wang
                     ` (6 subsequent siblings)
  7 siblings, 0 replies; 50+ messages in thread
From: Jason Gunthorpe @ 2019-09-05 13:59 UTC (permalink / raw)
  To: Jason Wang
  Cc: mst, kvm, virtualization, netdev, linux-kernel, aarcange,
	jglisse, linux-mm

On Thu, Sep 05, 2019 at 08:27:34PM +0800, Jason Wang wrote:
> Hi:
> 
> Per request from Michael and Jason, the metadata accelreation is
> reverted in this version and rework in next version.
> 
> Please review.
> 
> Thanks
> 
> Jason Wang (2):
>   Revert "vhost: access vq metadata through kernel virtual address"
>   vhost: re-introducing metadata acceleration through kernel virtual
>     address

There are a bunch of patches in the queue already that will help
vhost, and I am working on one for next cycle that will help a lot
more too.

I think you should apply the revert this cycle and rebase the other
patch for next..

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
  2019-09-05 12:27 [PATCH 0/2] Revert and rework on the metadata accelreation Jason Wang
@ 2019-09-06  3:21   ` Hillf Danton
  2019-09-05 12:27 ` Jason Wang
                     ` (6 subsequent siblings)
  7 siblings, 0 replies; 50+ messages in thread
From: Hillf Danton @ 2019-09-06  3:21 UTC (permalink / raw)
  To: Jason Wang
  Cc: mst, kvm, virtualization, netdev, linux-kernel, jgg, aarcange,
	jglisse, linux-mm, James Bottomley, Christoph Hellwig,
	David Miller, linux-arm-kernel, linux-parisc


On Thu,  5 Sep 2019 20:27:36 +0800 From:   Jason Wang <jasowang@redhat.com>
> 
> +static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
> +				struct vhost_map *map, int index)
> +{
> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> +	int i;
> +
> +	if (uaddr->write) {
> +		for (i = 0; i < map->npages; i++)
> +			set_page_dirty(map->pages[i]);
> +	}

Not sure need to set page dirty under page lock.
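
If the concern is that set_page_dirty() is called here without the
page lock held, one possible direction (just a sketch, not what this
patch does) would be the locked variant:

	if (uaddr->write) {
		for (i = 0; i < map->npages; i++)
			/* set_page_dirty_lock() takes and drops the page
			 * lock around the set_page_dirty() call itself.
			 */
			set_page_dirty_lock(map->pages[i]);
	}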




^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 0/2] Revert and rework on the metadata accelreation
  2019-09-05 13:59   ` Jason Gunthorpe
@ 2019-09-06 10:02     ` Jason Wang
  -1 siblings, 0 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-06 10:02 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: mst, kvm, virtualization, netdev, linux-kernel, aarcange,
	jglisse, linux-mm


On 2019/9/5 下午9:59, Jason Gunthorpe wrote:
> On Thu, Sep 05, 2019 at 08:27:34PM +0800, Jason Wang wrote:
>> Hi:
>>
>> Per request from Michael and Jason, the metadata accelreation is
>> reverted in this version and rework in next version.
>>
>> Please review.
>>
>> Thanks
>>
>> Jason Wang (2):
>>    Revert "vhost: access vq metadata through kernel virtual address"
>>    vhost: re-introducing metadata acceleration through kernel virtual
>>      address
> There are a bunch of patches in the queue already that will help
> vhost, and I am working on one for next cycle that will help a lot
> more too.


I will check those patches, but if you can give me some pointers or 
keywords it would be much appreciated.


>
> I think you should apply the revert this cycle and rebase the other
> patch for next..
>
> Jason


Yes, the plan is to revert in this release cycle.

Thanks


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
  2019-09-06  3:21   ` Hillf Danton
@ 2019-09-06 12:51     ` Jason Wang
  -1 siblings, 0 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-06 12:51 UTC (permalink / raw)
  To: Hillf Danton
  Cc: mst, kvm, virtualization, netdev, linux-kernel, jgg, aarcange,
	jglisse, linux-mm, James Bottomley, Christoph Hellwig,
	David Miller, linux-arm-kernel, linux-parisc


On 2019/9/6 11:21 AM, Hillf Danton wrote:
> On Thu,  5 Sep 2019 20:27:36 +0800 From:   Jason Wang <jasowang@redhat.com>
>> +static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
>> +				struct vhost_map *map, int index)
>> +{
>> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
>> +	int i;
>> +
>> +	if (uaddr->write) {
>> +		for (i = 0; i < map->npages; i++)
>> +			set_page_dirty(map->pages[i]);
>> +	}
> Not sure need to set page dirty under page lock.


Just to make sure I understand the issue: do you mean there's no need 
for set_page_dirty() here? If so, is there another function that 
already does this?

Thanks
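
For reference, a minimal sketch of the distinction being asked about:
set_page_dirty() generally assumes the caller already serializes against
truncation (typically by holding the page lock), while set_page_dirty_lock()
takes the page lock itself and is the helper commonly used after
get_user_pages()-style pinning. A hypothetical variant of the function
quoted above might therefore look like this (the function name is made up
for illustration; the vhost_map/vhost_uaddr types are taken from the patch):

	/* Sketch only: mark the pinned pages dirty without requiring the
	 * caller to hold any page lock, by using set_page_dirty_lock().
	 */
	static void vhost_set_map_dirty_locked(struct vhost_virtqueue *vq,
					       struct vhost_map *map, int index)
	{
		struct vhost_uaddr *uaddr = &vq->uaddrs[index];
		int i;

		/* Only ranges that were pinned writable need dirtying. */
		if (uaddr->write) {
			for (i = 0; i < map->npages; i++)
				set_page_dirty_lock(map->pages[i]);
		}
	}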


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 0/2] Revert and rework on the metadata accelreation
  2019-09-06 10:02     ` Jason Wang
  (?)
@ 2019-09-06 13:15     ` David Miller
  2019-09-09  7:18       ` Jason Wang
  2019-09-09  7:18       ` Jason Wang
  -1 siblings, 2 replies; 50+ messages in thread
From: David Miller @ 2019-09-06 13:15 UTC (permalink / raw)
  To: jasowang
  Cc: jgg, mst, kvm, virtualization, netdev, linux-kernel, aarcange,
	jglisse, linux-mm

From: Jason Wang <jasowang@redhat.com>
Date: Fri, 6 Sep 2019 18:02:35 +0800

> On 2019/9/5 9:59 PM, Jason Gunthorpe wrote:
>> I think you should apply the revert this cycle and rebase the other
>> patch for next..
>>
>> Jason
> 
> Yes, the plan is to revert in this release cycle.

Then you should resend patch #1 all by itself, targeting 'net'.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/2] Revert "vhost: access vq metadata through kernel virtual address"
  2019-09-05 12:27 ` Jason Wang
@ 2019-09-06 13:46   ` Michael S. Tsirkin
  2019-09-09  7:16     ` Jason Wang
  2019-09-09  7:16     ` Jason Wang
  2019-09-06 13:46   ` Michael S. Tsirkin
  1 sibling, 2 replies; 50+ messages in thread
From: Michael S. Tsirkin @ 2019-09-06 13:46 UTC (permalink / raw)
  To: Jason Wang
  Cc: kvm, virtualization, netdev, linux-kernel, jgg, aarcange,
	jglisse, linux-mm

On Thu, Sep 05, 2019 at 08:27:35PM +0800, Jason Wang wrote:
> It was reported that metadata acceleration introduces several issues,
> so this patch reverts commit ff466032dc9e5a61217f22ea34b2df932786bbfc,
> 73f628ec9e6bcc45b77c53fe6d0c0ec55eaf82af and
> 0b4a7092ffe568a55bf8f3cefdf79ff666586d91.
> 
> We will rework it on the next version.
> 
> Cc: Jason Gunthorpe <jgg@mellanox.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>


I am confused by the above.
What I see upstream is 7f466032dc.

commit 7f466032dc9e5a61217f22ea34b2df932786bbfc
Author: Jason Wang <jasowang@redhat.com>
Date:   Fri May 24 04:12:18 2019 -0400

    vhost: access vq metadata through kernel virtual address

so this is what I reverted.

Pls take a look, and let me know if you see issues.

Thanks!

> ---
>  drivers/vhost/vhost.c | 515 +-----------------------------------------
>  drivers/vhost/vhost.h |  41 ----
>  2 files changed, 3 insertions(+), 553 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 0536f8526359..791562e03fe0 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -298,160 +298,6 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
>  		__vhost_vq_meta_reset(d->vqs[i]);
>  }
>  
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -static void vhost_map_unprefetch(struct vhost_map *map)
> -{
> -	kfree(map->pages);
> -	map->pages = NULL;
> -	map->npages = 0;
> -	map->addr = NULL;
> -}
> -
> -static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
> -{
> -	struct vhost_map *map[VHOST_NUM_ADDRS];
> -	int i;
> -
> -	spin_lock(&vq->mmu_lock);
> -	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> -		map[i] = rcu_dereference_protected(vq->maps[i],
> -				  lockdep_is_held(&vq->mmu_lock));
> -		if (map[i])
> -			rcu_assign_pointer(vq->maps[i], NULL);
> -	}
> -	spin_unlock(&vq->mmu_lock);
> -
> -	synchronize_rcu();
> -
> -	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> -		if (map[i])
> -			vhost_map_unprefetch(map[i]);
> -
> -}
> -
> -static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
> -{
> -	int i;
> -
> -	vhost_uninit_vq_maps(vq);
> -	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> -		vq->uaddrs[i].size = 0;
> -}
> -
> -static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
> -				     unsigned long start,
> -				     unsigned long end)
> -{
> -	if (unlikely(!uaddr->size))
> -		return false;
> -
> -	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
> -}
> -
> -static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
> -				      int index,
> -				      unsigned long start,
> -				      unsigned long end)
> -{
> -	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> -	struct vhost_map *map;
> -	int i;
> -
> -	if (!vhost_map_range_overlap(uaddr, start, end))
> -		return;
> -
> -	spin_lock(&vq->mmu_lock);
> -	++vq->invalidate_count;
> -
> -	map = rcu_dereference_protected(vq->maps[index],
> -					lockdep_is_held(&vq->mmu_lock));
> -	if (map) {
> -		if (uaddr->write) {
> -			for (i = 0; i < map->npages; i++)
> -				set_page_dirty(map->pages[i]);
> -		}
> -		rcu_assign_pointer(vq->maps[index], NULL);
> -	}
> -	spin_unlock(&vq->mmu_lock);
> -
> -	if (map) {
> -		synchronize_rcu();
> -		vhost_map_unprefetch(map);
> -	}
> -}
> -
> -static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
> -				    int index,
> -				    unsigned long start,
> -				    unsigned long end)
> -{
> -	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
> -		return;
> -
> -	spin_lock(&vq->mmu_lock);
> -	--vq->invalidate_count;
> -	spin_unlock(&vq->mmu_lock);
> -}
> -
> -static int vhost_invalidate_range_start(struct mmu_notifier *mn,
> -					const struct mmu_notifier_range *range)
> -{
> -	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> -					     mmu_notifier);
> -	int i, j;
> -
> -	if (!mmu_notifier_range_blockable(range))
> -		return -EAGAIN;
> -
> -	for (i = 0; i < dev->nvqs; i++) {
> -		struct vhost_virtqueue *vq = dev->vqs[i];
> -
> -		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> -			vhost_invalidate_vq_start(vq, j,
> -						  range->start,
> -						  range->end);
> -	}
> -
> -	return 0;
> -}
> -
> -static void vhost_invalidate_range_end(struct mmu_notifier *mn,
> -				       const struct mmu_notifier_range *range)
> -{
> -	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> -					     mmu_notifier);
> -	int i, j;
> -
> -	for (i = 0; i < dev->nvqs; i++) {
> -		struct vhost_virtqueue *vq = dev->vqs[i];
> -
> -		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> -			vhost_invalidate_vq_end(vq, j,
> -						range->start,
> -						range->end);
> -	}
> -}
> -
> -static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
> -	.invalidate_range_start = vhost_invalidate_range_start,
> -	.invalidate_range_end = vhost_invalidate_range_end,
> -};
> -
> -static void vhost_init_maps(struct vhost_dev *dev)
> -{
> -	struct vhost_virtqueue *vq;
> -	int i, j;
> -
> -	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
> -
> -	for (i = 0; i < dev->nvqs; ++i) {
> -		vq = dev->vqs[i];
> -		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> -			RCU_INIT_POINTER(vq->maps[j], NULL);
> -	}
> -}
> -#endif
> -
>  static void vhost_vq_reset(struct vhost_dev *dev,
>  			   struct vhost_virtqueue *vq)
>  {
> @@ -480,11 +326,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->busyloop_timeout = 0;
>  	vq->umem = NULL;
>  	vq->iotlb = NULL;
> -	vq->invalidate_count = 0;
>  	__vhost_vq_meta_reset(vq);
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	vhost_reset_vq_maps(vq);
> -#endif
>  }
>  
>  static int vhost_worker(void *data)
> @@ -634,9 +476,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>  	INIT_LIST_HEAD(&dev->read_list);
>  	INIT_LIST_HEAD(&dev->pending_list);
>  	spin_lock_init(&dev->iotlb_lock);
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	vhost_init_maps(dev);
> -#endif
> +
>  
>  	for (i = 0; i < dev->nvqs; ++i) {
>  		vq = dev->vqs[i];
> @@ -645,7 +485,6 @@ void vhost_dev_init(struct vhost_dev *dev,
>  		vq->heads = NULL;
>  		vq->dev = dev;
>  		mutex_init(&vq->mutex);
> -		spin_lock_init(&vq->mmu_lock);
>  		vhost_vq_reset(dev, vq);
>  		if (vq->handle_kick)
>  			vhost_poll_init(&vq->poll, vq->handle_kick,
> @@ -725,18 +564,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>  	if (err)
>  		goto err_cgroup;
>  
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
> -	if (err)
> -		goto err_mmu_notifier;
> -#endif
> -
>  	return 0;
> -
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -err_mmu_notifier:
> -	vhost_dev_free_iovecs(dev);
> -#endif
>  err_cgroup:
>  	kthread_stop(worker);
>  	dev->worker = NULL;
> @@ -827,107 +655,6 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>  	spin_unlock(&dev->iotlb_lock);
>  }
>  
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
> -			      int index, unsigned long uaddr,
> -			      size_t size, bool write)
> -{
> -	struct vhost_uaddr *addr = &vq->uaddrs[index];
> -
> -	addr->uaddr = uaddr;
> -	addr->size = size;
> -	addr->write = write;
> -}
> -
> -static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
> -{
> -	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
> -			  (unsigned long)vq->desc,
> -			  vhost_get_desc_size(vq, vq->num),
> -			  false);
> -	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
> -			  (unsigned long)vq->avail,
> -			  vhost_get_avail_size(vq, vq->num),
> -			  false);
> -	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
> -			  (unsigned long)vq->used,
> -			  vhost_get_used_size(vq, vq->num),
> -			  true);
> -}
> -
> -static int vhost_map_prefetch(struct vhost_virtqueue *vq,
> -			       int index)
> -{
> -	struct vhost_map *map;
> -	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> -	struct page **pages;
> -	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
> -	int npinned;
> -	void *vaddr, *v;
> -	int err;
> -	int i;
> -
> -	spin_lock(&vq->mmu_lock);
> -
> -	err = -EFAULT;
> -	if (vq->invalidate_count)
> -		goto err;
> -
> -	err = -ENOMEM;
> -	map = kmalloc(sizeof(*map), GFP_ATOMIC);
> -	if (!map)
> -		goto err;
> -
> -	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
> -	if (!pages)
> -		goto err_pages;
> -
> -	err = EFAULT;
> -	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
> -					uaddr->write, pages);
> -	if (npinned > 0)
> -		release_pages(pages, npinned);
> -	if (npinned != npages)
> -		goto err_gup;
> -
> -	for (i = 0; i < npinned; i++)
> -		if (PageHighMem(pages[i]))
> -			goto err_gup;
> -
> -	vaddr = v = page_address(pages[0]);
> -
> -	/* For simplicity, fallback to userspace address if VA is not
> -	 * contigious.
> -	 */
> -	for (i = 1; i < npinned; i++) {
> -		v += PAGE_SIZE;
> -		if (v != page_address(pages[i]))
> -			goto err_gup;
> -	}
> -
> -	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
> -	map->npages = npages;
> -	map->pages = pages;
> -
> -	rcu_assign_pointer(vq->maps[index], map);
> -	/* No need for a synchronize_rcu(). This function should be
> -	 * called by dev->worker so we are serialized with all
> -	 * readers.
> -	 */
> -	spin_unlock(&vq->mmu_lock);
> -
> -	return 0;
> -
> -err_gup:
> -	kfree(pages);
> -err_pages:
> -	kfree(map);
> -err:
> -	spin_unlock(&vq->mmu_lock);
> -	return err;
> -}
> -#endif
> -
>  void vhost_dev_cleanup(struct vhost_dev *dev)
>  {
>  	int i;
> @@ -957,16 +684,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  		kthread_stop(dev->worker);
>  		dev->worker = NULL;
>  	}
> -	if (dev->mm) {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -		mmu_notifier_unregister(&dev->mmu_notifier, dev->mm);
> -#endif
> +	if (dev->mm)
>  		mmput(dev->mm);
> -	}
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	for (i = 0; i < dev->nvqs; i++)
> -		vhost_uninit_vq_maps(dev->vqs[i]);
> -#endif
>  	dev->mm = NULL;
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> @@ -1195,26 +914,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
>  
>  static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>  {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	struct vhost_map *map;
> -	struct vring_used *used;
> -
> -	if (!vq->iotlb) {
> -		rcu_read_lock();
> -
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> -		if (likely(map)) {
> -			used = map->addr;
> -			*((__virtio16 *)&used->ring[vq->num]) =
> -				cpu_to_vhost16(vq, vq->avail_idx);
> -			rcu_read_unlock();
> -			return 0;
> -		}
> -
> -		rcu_read_unlock();
> -	}
> -#endif
> -
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
>  			      vhost_avail_event(vq));
>  }
> @@ -1223,27 +922,6 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>  				 struct vring_used_elem *head, int idx,
>  				 int count)
>  {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	struct vhost_map *map;
> -	struct vring_used *used;
> -	size_t size;
> -
> -	if (!vq->iotlb) {
> -		rcu_read_lock();
> -
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> -		if (likely(map)) {
> -			used = map->addr;
> -			size = count * sizeof(*head);
> -			memcpy(used->ring + idx, head, size);
> -			rcu_read_unlock();
> -			return 0;
> -		}
> -
> -		rcu_read_unlock();
> -	}
> -#endif
> -
>  	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
>  				  count * sizeof(*head));
>  }
> @@ -1251,25 +929,6 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>  static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>  
>  {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	struct vhost_map *map;
> -	struct vring_used *used;
> -
> -	if (!vq->iotlb) {
> -		rcu_read_lock();
> -
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> -		if (likely(map)) {
> -			used = map->addr;
> -			used->flags = cpu_to_vhost16(vq, vq->used_flags);
> -			rcu_read_unlock();
> -			return 0;
> -		}
> -
> -		rcu_read_unlock();
> -	}
> -#endif
> -
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
>  			      &vq->used->flags);
>  }
> @@ -1277,25 +936,6 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>  static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>  
>  {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	struct vhost_map *map;
> -	struct vring_used *used;
> -
> -	if (!vq->iotlb) {
> -		rcu_read_lock();
> -
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> -		if (likely(map)) {
> -			used = map->addr;
> -			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
> -			rcu_read_unlock();
> -			return 0;
> -		}
> -
> -		rcu_read_unlock();
> -	}
> -#endif
> -
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>  			      &vq->used->idx);
>  }
> @@ -1341,50 +981,12 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
>  static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>  				      __virtio16 *idx)
>  {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	struct vhost_map *map;
> -	struct vring_avail *avail;
> -
> -	if (!vq->iotlb) {
> -		rcu_read_lock();
> -
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> -		if (likely(map)) {
> -			avail = map->addr;
> -			*idx = avail->idx;
> -			rcu_read_unlock();
> -			return 0;
> -		}
> -
> -		rcu_read_unlock();
> -	}
> -#endif
> -
>  	return vhost_get_avail(vq, *idx, &vq->avail->idx);
>  }
>  
>  static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>  				       __virtio16 *head, int idx)
>  {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	struct vhost_map *map;
> -	struct vring_avail *avail;
> -
> -	if (!vq->iotlb) {
> -		rcu_read_lock();
> -
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> -		if (likely(map)) {
> -			avail = map->addr;
> -			*head = avail->ring[idx & (vq->num - 1)];
> -			rcu_read_unlock();
> -			return 0;
> -		}
> -
> -		rcu_read_unlock();
> -	}
> -#endif
> -
>  	return vhost_get_avail(vq, *head,
>  			       &vq->avail->ring[idx & (vq->num - 1)]);
>  }
> @@ -1392,98 +994,24 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>  static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>  					__virtio16 *flags)
>  {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	struct vhost_map *map;
> -	struct vring_avail *avail;
> -
> -	if (!vq->iotlb) {
> -		rcu_read_lock();
> -
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> -		if (likely(map)) {
> -			avail = map->addr;
> -			*flags = avail->flags;
> -			rcu_read_unlock();
> -			return 0;
> -		}
> -
> -		rcu_read_unlock();
> -	}
> -#endif
> -
>  	return vhost_get_avail(vq, *flags, &vq->avail->flags);
>  }
>  
>  static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>  				       __virtio16 *event)
>  {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	struct vhost_map *map;
> -	struct vring_avail *avail;
> -
> -	if (!vq->iotlb) {
> -		rcu_read_lock();
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> -		if (likely(map)) {
> -			avail = map->addr;
> -			*event = (__virtio16)avail->ring[vq->num];
> -			rcu_read_unlock();
> -			return 0;
> -		}
> -		rcu_read_unlock();
> -	}
> -#endif
> -
>  	return vhost_get_avail(vq, *event, vhost_used_event(vq));
>  }
>  
>  static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>  				     __virtio16 *idx)
>  {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	struct vhost_map *map;
> -	struct vring_used *used;
> -
> -	if (!vq->iotlb) {
> -		rcu_read_lock();
> -
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> -		if (likely(map)) {
> -			used = map->addr;
> -			*idx = used->idx;
> -			rcu_read_unlock();
> -			return 0;
> -		}
> -
> -		rcu_read_unlock();
> -	}
> -#endif
> -
>  	return vhost_get_used(vq, *idx, &vq->used->idx);
>  }
>  
>  static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>  				 struct vring_desc *desc, int idx)
>  {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	struct vhost_map *map;
> -	struct vring_desc *d;
> -
> -	if (!vq->iotlb) {
> -		rcu_read_lock();
> -
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
> -		if (likely(map)) {
> -			d = map->addr;
> -			*desc = *(d + idx);
> -			rcu_read_unlock();
> -			return 0;
> -		}
> -
> -		rcu_read_unlock();
> -	}
> -#endif
> -
>  	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
>  }
>  
> @@ -1824,32 +1352,12 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
>  	return true;
>  }
>  
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
> -{
> -	struct vhost_map __rcu *map;
> -	int i;
> -
> -	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> -		rcu_read_lock();
> -		map = rcu_dereference(vq->maps[i]);
> -		rcu_read_unlock();
> -		if (unlikely(!map))
> -			vhost_map_prefetch(vq, i);
> -	}
> -}
> -#endif
> -
>  int vq_meta_prefetch(struct vhost_virtqueue *vq)
>  {
>  	unsigned int num = vq->num;
>  
> -	if (!vq->iotlb) {
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -		vhost_vq_map_prefetch(vq);
> -#endif
> +	if (!vq->iotlb)
>  		return 1;
> -	}
>  
>  	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
>  			       vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
> @@ -2060,16 +1568,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
>  
>  	mutex_lock(&vq->mutex);
>  
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	/* Unregister MMU notifer to allow invalidation callback
> -	 * can access vq->uaddrs[] without holding a lock.
> -	 */
> -	if (d->mm)
> -		mmu_notifier_unregister(&d->mmu_notifier, d->mm);
> -
> -	vhost_uninit_vq_maps(vq);
> -#endif
> -
>  	switch (ioctl) {
>  	case VHOST_SET_VRING_NUM:
>  		r = vhost_vring_set_num(d, vq, argp);
> @@ -2081,13 +1579,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
>  		BUG();
>  	}
>  
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	vhost_setup_vq_uaddr(vq);
> -
> -	if (d->mm)
> -		mmu_notifier_register(&d->mmu_notifier, d->mm);
> -#endif
> -
>  	mutex_unlock(&vq->mutex);
>  
>  	return r;
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 42a8c2a13ab1..e9ed2722b633 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -12,9 +12,6 @@
>  #include <linux/virtio_config.h>
>  #include <linux/virtio_ring.h>
>  #include <linux/atomic.h>
> -#include <linux/pagemap.h>
> -#include <linux/mmu_notifier.h>
> -#include <asm/cacheflush.h>
>  
>  struct vhost_work;
>  typedef void (*vhost_work_fn_t)(struct vhost_work *work);
> @@ -83,24 +80,6 @@ enum vhost_uaddr_type {
>  	VHOST_NUM_ADDRS = 3,
>  };
>  
> -struct vhost_map {
> -	int npages;
> -	void *addr;
> -	struct page **pages;
> -};
> -
> -struct vhost_uaddr {
> -	unsigned long uaddr;
> -	size_t size;
> -	bool write;
> -};
> -
> -#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
> -#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
> -#else
> -#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
> -#endif
> -
>  /* The virtqueue structure describes a queue attached to a device. */
>  struct vhost_virtqueue {
>  	struct vhost_dev *dev;
> @@ -111,22 +90,7 @@ struct vhost_virtqueue {
>  	struct vring_desc __user *desc;
>  	struct vring_avail __user *avail;
>  	struct vring_used __user *used;
> -
> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
> -	/* Read by memory accessors, modified by meta data
> -	 * prefetching, MMU notifier and vring ioctl().
> -	 * Synchonrized through mmu_lock (writers) and RCU (writers
> -	 * and readers).
> -	 */
> -	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
> -	/* Read by MMU notifier, modified by vring ioctl(),
> -	 * synchronized through MMU notifier
> -	 * registering/unregistering.
> -	 */
> -	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
> -#endif
>  	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
> -
>  	struct file *kick;
>  	struct eventfd_ctx *call_ctx;
>  	struct eventfd_ctx *error_ctx;
> @@ -181,8 +145,6 @@ struct vhost_virtqueue {
>  	bool user_be;
>  #endif
>  	u32 busyloop_timeout;
> -	spinlock_t mmu_lock;
> -	int invalidate_count;
>  };
>  
>  struct vhost_msg_node {
> @@ -196,9 +158,6 @@ struct vhost_msg_node {
>  
>  struct vhost_dev {
>  	struct mm_struct *mm;
> -#ifdef CONFIG_MMU_NOTIFIER
> -	struct mmu_notifier mmu_notifier;
> -#endif
>  	struct mutex mutex;
>  	struct vhost_virtqueue **vqs;
>  	int nvqs;
> -- 
> 2.19.1

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 0/2] Revert and rework on the metadata accelreation
  2019-09-06 10:02     ` Jason Wang
  (?)
@ 2019-09-07 15:03       ` Jason Gunthorpe
  -1 siblings, 0 replies; 50+ messages in thread
From: Jason Gunthorpe @ 2019-09-07 15:03 UTC (permalink / raw)
  To: Jason Wang
  Cc: mst, kvm, virtualization, netdev, linux-kernel, aarcange,
	jglisse, linux-mm

On Fri, Sep 06, 2019 at 06:02:35PM +0800, Jason Wang wrote:
> 
On 2019/9/5 9:59 PM, Jason Gunthorpe wrote:
> > On Thu, Sep 05, 2019 at 08:27:34PM +0800, Jason Wang wrote:
> > > Hi:
> > > 
> > > Per request from Michael and Jason, the metadata accelreation is
> > > reverted in this version and rework in next version.
> > > 
> > > Please review.
> > > 
> > > Thanks
> > > 
> > > Jason Wang (2):
> > >    Revert "vhost: access vq metadata through kernel virtual address"
> > >    vhost: re-introducing metadata acceleration through kernel virtual
> > >      address
> > There are a bunch of patches in the queue already that will help
> > vhost, and I am working on one for next cycle that will help a lot
> > more too.
> 
> 
> I will check those patches, but if you can give me some pointers or keywords
> it would be much appreciated.

You can look here:

https://github.com/jgunthorpe/linux/commits/mmu_notifier

The first parts, the get/put, are in the hmm tree, and the last part,
the interval tree in the last commit, is still a WIP, but it would
remove a lot of that code from vhost as well.

Jason
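
For readers chasing the pointer above: the interval-tree work referenced
here eventually became the mmu_interval_notifier API. A rough sketch of how
a vhost-style user could hook into it is below; only the mmu_interval_*
and mmu_notifier_range_blockable() symbols are real kernel interfaces (and
they postdate this thread), while the vhost_map_* names are invented for
illustration, so treat the exact shape as an assumption rather than code
from this series.

	#include <linux/mmu_notifier.h>

	struct vhost_map_notifier {
		struct mmu_interval_notifier mni;	/* covers one uaddr range */
	};

	static bool vhost_map_invalidate(struct mmu_interval_notifier *mni,
					 const struct mmu_notifier_range *range,
					 unsigned long cur_seq)
	{
		if (!mmu_notifier_range_blockable(range))
			return false;

		/* Bump the sequence so readers pairing mmu_interval_read_begin()
		 * with mmu_interval_read_retry() notice the invalidation, then
		 * drop the cached kernel mapping covered by this range.
		 */
		mmu_interval_set_seq(mni, cur_seq);
		return true;
	}

	static const struct mmu_interval_notifier_ops vhost_map_notifier_ops = {
		.invalidate = vhost_map_invalidate,
	};

	/* Registration when the vring addresses are set would be roughly:
	 *	mmu_interval_notifier_insert(&n->mni, dev->mm, uaddr->uaddr,
	 *				     uaddr->size, &vhost_map_notifier_ops);
	 */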

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
  2019-09-05 12:27   ` Jason Wang
@ 2019-09-08 11:05     ` Michael S. Tsirkin
  -1 siblings, 0 replies; 50+ messages in thread
From: Michael S. Tsirkin @ 2019-09-08 11:05 UTC (permalink / raw)
  To: Jason Wang
  Cc: kvm, virtualization, netdev, linux-kernel, jgg, aarcange,
	jglisse, linux-mm, James Bottomley, Christoph Hellwig,
	David Miller, linux-arm-kernel, linux-parisc

On Thu, Sep 05, 2019 at 08:27:36PM +0800, Jason Wang wrote:
> This is a rework on the commit 7f466032dc9e ("vhost: access vq
> metadata through kernel virtual address").
> 
> It was noticed that the copy_to/from_user() friends that were used to
> access virtqueue metadata tend to be very expensive for dataplane
> implementations like vhost since they involve lots of software checks,
> speculation barriers,

So if we drop the speculation barrier, there's a problem here in that
the access will now be speculated.
This effectively disables the defence in depth effect of
b3bbfb3fb5d25776b8e3f361d2eedaabb0b496cd
    x86: Introduce __uaccess_begin_nospec() and uaccess_try_nospec


So now we need to sprinkle array_index_nospec or barrier_nospec over the
code whenever we use an index we got from userspace.
See below for some examples.
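
As a generic illustration of that pattern (not one of the examples referred
to below), a guest-supplied ring index would be clamped with
array_index_nospec() from <linux/nospec.h> before being used to address the
mapped metadata. The function and field names here are assumptions based on
the patch, and the sketch assumes the caller already holds a valid map:

	#include <linux/nospec.h>

	/* Sketch: bound and de-speculate a guest-controlled index before it
	 * is used to read the descriptor table through the kernel mapping.
	 */
	static int vhost_get_desc_kva(struct vhost_virtqueue *vq,
				      struct vring_desc *desc, unsigned int idx)
	{
		struct vring_desc *d = vq->maps[VHOST_ADDR_DESC]->addr;

		if (idx >= vq->num)
			return -EINVAL;
		idx = array_index_nospec(idx, vq->num);

		*desc = d[idx];
		return 0;
	}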


> hardware feature toggling (e.g. SMAP). The
> extra cost will be more obvious when transferring small packets, since
> the time spent on metadata access becomes more significant.
> 
> This patch tries to eliminate those overheads by accessing them
> through a direct mapping of those pages. Invalidation callbacks are
> implemented for co-operation with general VM management (swap, KSM,
> THP or NUMA balancing). We will try to get the direct mapping of vq
> metadata before each round of packet processing if it doesn't
> exist. If we fail, we simply fall back to the copy_to/from_user()
> friends.
> 
> The invalidation, direct mapping access and setup are synchronized
> through a spinlock. This takes a step back from the original commit
> 7f466032dc9e ("vhost: access vq metadata through kernel virtual
> address"), which tried to use RCU, an approach that is suspicious and
> hard to review. This won't perform as well as RCU because of the
> atomic; that could be addressed by future optimization.
> 
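
Concretely, the shape described in the two paragraphs above is roughly the
following for a used-ring update. vhost_vq_access_map_begin()/end() are the
spinlock helpers added further down in this patch; the function name and
the exact body here are a simplified sketch rather than the patch itself:

	static inline int vhost_put_used_idx_sketch(struct vhost_virtqueue *vq)
	{
		struct vhost_map *map;
		struct vring_used *used;

		if (!vq->iotlb) {
			vhost_vq_access_map_begin(vq);	/* spin_lock(&vq->mmu_lock) */
			map = vq->maps[VHOST_ADDR_USED];
			if (likely(map)) {
				used = map->addr;
				used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
				vhost_vq_access_map_end(vq);
				return 0;
			}
			vhost_vq_access_map_end(vq);
			/* Map was invalidated: fall back to uaccess below. */
		}

		return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
				      &vq->used->idx);
	}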
> This method does not work for highmem pages, which require a
> temporary mapping, so we just fall back to the normal
> copy_to/from_user() there. It also may not work for archs with
> virtually tagged caches, since extra cache flushing would be needed to
> eliminate the alias, resulting in complex logic and bad performance.
> For those archs, this patch simply goes with the copy_to/from_user()
> friends. This is done by ruling out the kernel mapping code through
> ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
> 
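
The ruling-out mentioned above presumably comes down to a compile-time
guard along these lines (a sketch based on the description and on the macro
names visible in the diffs; note the reverted code in patch 1/2 had both
branches hard-wired to 0):

	/* Only enable the kernel-VA fast path when MMU notifiers are
	 * available and the architecture needs no explicit dcache flushing
	 * to deal with aliases (i.e. no virtually tagged caches).
	 */
	#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
	#define VHOST_ARCH_CAN_ACCEL_UACCESS 1
	#else
	#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
	#endif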
> Note that this is only done when device IOTLB is not enabled. We
> could use a similar method to optimize IOTLB in the future.
> 
> Tests show at most about a 22% improvement in TX PPS when using
> virtio-user + vhost_net + xdp1 + TAP on a 4.0GHz Kaby Lake.
> 
>         SMAP on | SMAP off
> Before: 4.9Mpps | 6.9Mpps
> After:  6.0Mpps | 7.5Mpps
> 
> On an older Sandy Bridge CPU without SMAP support, TX PPS doesn't see
> any difference.

Why isn't Kaby Lake with SMAP off the same as Sandy Bridge?


> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: David Miller <davem@davemloft.net>
> Cc: Jerome Glisse <jglisse@redhat.com>
> Cc: Jason Gunthorpe <jgg@mellanox.com>
> Cc: linux-mm@kvack.org
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-parisc@vger.kernel.org
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  drivers/vhost/vhost.c | 551 +++++++++++++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.h |  41 ++++
>  2 files changed, 589 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 791562e03fe0..f98155f28f02 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -298,6 +298,182 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
>  		__vhost_vq_meta_reset(d->vqs[i]);
>  }
>  
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +static void vhost_map_unprefetch(struct vhost_map *map)
> +{
> +	kfree(map->pages);
> +	kfree(map);
> +}
> +
> +static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
> +				struct vhost_map *map, int index)
> +{
> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> +	int i;
> +
> +	if (uaddr->write) {
> +		for (i = 0; i < map->npages; i++)
> +			set_page_dirty(map->pages[i]);
> +	}
> +}
> +
> +static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
> +{
> +	struct vhost_map *map[VHOST_NUM_ADDRS];
> +	int i;
> +
> +	spin_lock(&vq->mmu_lock);
> +	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> +		map[i] = vq->maps[i];
> +		if (map[i]) {
> +			vhost_set_map_dirty(vq, map[i], i);
> +			vq->maps[i] = NULL;
> +		}
> +	}
> +	spin_unlock(&vq->mmu_lock);
> +
> +	/* No need for synchronization since we are serialized with
> +	 * memory accessors (e.g vq mutex held).
> +	 */
> +
> +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> +		if (map[i])
> +			vhost_map_unprefetch(map[i]);
> +
> +}
> +
> +static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
> +{
> +	int i;
> +
> +	vhost_uninit_vq_maps(vq);
> +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> +		vq->uaddrs[i].size = 0;
> +}
> +
> +static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
> +				     unsigned long start,
> +				     unsigned long end)
> +{
> +	if (unlikely(!uaddr->size))
> +		return false;
> +
> +	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
> +}
> +
> +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
> +{
> +	spin_lock(&vq->mmu_lock);
> +}
> +
> +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
> +{
> +	spin_unlock(&vq->mmu_lock);
> +}
> +
> +static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
> +				     int index,
> +				     unsigned long start,
> +				     unsigned long end,
> +				     bool blockable)
> +{
> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> +	struct vhost_map *map;
> +
> +	if (!vhost_map_range_overlap(uaddr, start, end))
> +		return 0;
> +	else if (!blockable)
> +		return -EAGAIN;
> +
> +	spin_lock(&vq->mmu_lock);
> +	++vq->invalidate_count;
> +
> +	map = vq->maps[index];
> +	if (map)
> +		vq->maps[index] = NULL;
> +	spin_unlock(&vq->mmu_lock);
> +
> +	if (map) {
> +		vhost_set_map_dirty(vq, map, index);
> +		vhost_map_unprefetch(map);
> +	}
> +
> +	return 0;
> +}
> +
> +static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
> +				    int index,
> +				    unsigned long start,
> +				    unsigned long end)
> +{
> +	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
> +		return;
> +
> +	spin_lock(&vq->mmu_lock);
> +	--vq->invalidate_count;
> +	spin_unlock(&vq->mmu_lock);
> +}
> +
> +static int vhost_invalidate_range_start(struct mmu_notifier *mn,
> +					const struct mmu_notifier_range *range)
> +{
> +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> +					     mmu_notifier);
> +	bool blockable = mmu_notifier_range_blockable(range);
> +	int i, j, ret;
> +
> +	for (i = 0; i < dev->nvqs; i++) {
> +		struct vhost_virtqueue *vq = dev->vqs[i];
> +
> +		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
> +			ret = vhost_invalidate_vq_start(vq, j,
> +							range->start,
> +							range->end, blockable);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void vhost_invalidate_range_end(struct mmu_notifier *mn,
> +				       const struct mmu_notifier_range *range)
> +{
> +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> +					     mmu_notifier);
> +	int i, j;
> +
> +	for (i = 0; i < dev->nvqs; i++) {
> +		struct vhost_virtqueue *vq = dev->vqs[i];
> +
> +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> +			vhost_invalidate_vq_end(vq, j,
> +						range->start,
> +						range->end);
> +	}
> +}
> +
> +static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
> +	.invalidate_range_start = vhost_invalidate_range_start,
> +	.invalidate_range_end = vhost_invalidate_range_end,
> +};
> +
> +static void vhost_init_maps(struct vhost_dev *dev)
> +{
> +	struct vhost_virtqueue *vq;
> +	int i, j;
> +
> +	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
> +
> +	for (i = 0; i < dev->nvqs; ++i) {
> +		vq = dev->vqs[i];
> +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> +			vq->maps[j] = NULL;
> +	}
> +}
> +#endif
> +
>  static void vhost_vq_reset(struct vhost_dev *dev,
>  			   struct vhost_virtqueue *vq)
>  {
> @@ -326,7 +502,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->busyloop_timeout = 0;
>  	vq->umem = NULL;
>  	vq->iotlb = NULL;
> +	vq->invalidate_count = 0;
>  	__vhost_vq_meta_reset(vq);
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	vhost_reset_vq_maps(vq);
> +#endif
>  }
>  
>  static int vhost_worker(void *data)
> @@ -471,12 +651,15 @@ void vhost_dev_init(struct vhost_dev *dev,
>  	dev->iov_limit = iov_limit;
>  	dev->weight = weight;
>  	dev->byte_weight = byte_weight;
> +	dev->has_notifier = false;
>  	init_llist_head(&dev->work_list);
>  	init_waitqueue_head(&dev->wait);
>  	INIT_LIST_HEAD(&dev->read_list);
>  	INIT_LIST_HEAD(&dev->pending_list);
>  	spin_lock_init(&dev->iotlb_lock);
> -
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	vhost_init_maps(dev);
> +#endif
>  
>  	for (i = 0; i < dev->nvqs; ++i) {
>  		vq = dev->vqs[i];
> @@ -485,6 +668,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>  		vq->heads = NULL;
>  		vq->dev = dev;
>  		mutex_init(&vq->mutex);
> +		spin_lock_init(&vq->mmu_lock);
>  		vhost_vq_reset(dev, vq);
>  		if (vq->handle_kick)
>  			vhost_poll_init(&vq->poll, vq->handle_kick,
> @@ -564,7 +748,19 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>  	if (err)
>  		goto err_cgroup;
>  
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
> +	if (err)
> +		goto err_mmu_notifier;
> +#endif
> +	dev->has_notifier = true;
> +
>  	return 0;
> +
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +err_mmu_notifier:
> +	vhost_dev_free_iovecs(dev);
> +#endif
>  err_cgroup:
>  	kthread_stop(worker);
>  	dev->worker = NULL;
> @@ -655,6 +851,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>  	spin_unlock(&dev->iotlb_lock);
>  }
>  
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
> +			      int index, unsigned long uaddr,
> +			      size_t size, bool write)
> +{
> +	struct vhost_uaddr *addr = &vq->uaddrs[index];
> +
> +	addr->uaddr = uaddr;
> +	addr->size = size;
> +	addr->write = write;
> +}
> +
> +static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
> +{
> +	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
> +			  (unsigned long)vq->desc,
> +			  vhost_get_desc_size(vq, vq->num),
> +			  false);
> +	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
> +			  (unsigned long)vq->avail,
> +			  vhost_get_avail_size(vq, vq->num),
> +			  false);
> +	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
> +			  (unsigned long)vq->used,
> +			  vhost_get_used_size(vq, vq->num),
> +			  true);
> +}
> +
> +static int vhost_map_prefetch(struct vhost_virtqueue *vq,
> +			       int index)
> +{
> +	struct vhost_map *map;
> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> +	struct page **pages;
> +	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
> +	int npinned;
> +	void *vaddr, *v;
> +	int err;
> +	int i;
> +
> +	spin_lock(&vq->mmu_lock);
> +
> +	err = -EFAULT;
> +	if (vq->invalidate_count)
> +		goto err;
> +
> +	err = -ENOMEM;
> +	map = kmalloc(sizeof(*map), GFP_ATOMIC);
> +	if (!map)
> +		goto err;
> +
> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
> +	if (!pages)
> +		goto err_pages;
> +
> +	err = -EFAULT;
> +	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
> +					uaddr->write, pages);
> +	if (npinned > 0)
> +		release_pages(pages, npinned);
> +	if (npinned != npages)
> +		goto err_gup;
> +
> +	for (i = 0; i < npinned; i++)
> +		if (PageHighMem(pages[i]))
> +			goto err_gup;
> +
> +	vaddr = v = page_address(pages[0]);
> +
> +	/* For simplicity, fall back to the userspace address if the VA
> +	 * is not contiguous.
> +	 */
> +	for (i = 1; i < npinned; i++) {
> +		v += PAGE_SIZE;
> +		if (v != page_address(pages[i]))
> +			goto err_gup;
> +	}
> +
> +	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
> +	map->npages = npages;
> +	map->pages = pages;
> +
> +	vq->maps[index] = map;
> +	/* No need for a synchronize_rcu(). This function should be
> +	 * called by dev->worker so we are serialized with all
> +	 * readers.
> +	 */
> +	spin_unlock(&vq->mmu_lock);
> +
> +	return 0;
> +
> +err_gup:
> +	kfree(pages);
> +err_pages:
> +	kfree(map);
> +err:
> +	spin_unlock(&vq->mmu_lock);
> +	return err;
> +}
> +#endif
> +
>  void vhost_dev_cleanup(struct vhost_dev *dev)
>  {
>  	int i;
> @@ -684,8 +981,20 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  		kthread_stop(dev->worker);
>  		dev->worker = NULL;
>  	}
> -	if (dev->mm)
> +	if (dev->mm) {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +		if (dev->has_notifier) {
> +			mmu_notifier_unregister(&dev->mmu_notifier,
> +						dev->mm);
> +			dev->has_notifier = false;
> +		}
> +#endif
>  		mmput(dev->mm);
> +	}
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	for (i = 0; i < dev->nvqs; i++)
> +		vhost_uninit_vq_maps(dev->vqs[i]);
> +#endif
>  	dev->mm = NULL;
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> @@ -914,6 +1223,26 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
>  
>  static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			*((__virtio16 *)&used->ring[vq->num]) =
> +				cpu_to_vhost16(vq, vq->avail_idx);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
>  			      vhost_avail_event(vq));
>  }
> @@ -922,6 +1251,27 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>  				 struct vring_used_elem *head, int idx,
>  				 int count)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +	size_t size;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			size = count * sizeof(*head);
> +			memcpy(used->ring + idx, head, size);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
>  				  count * sizeof(*head));
>  }
> @@ -929,6 +1279,25 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>  static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>  
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			used->flags = cpu_to_vhost16(vq, vq->used_flags);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
>  			      &vq->used->flags);
>  }
> @@ -936,6 +1305,25 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>  static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>  
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>  			      &vq->used->idx);
>  }
> @@ -981,12 +1369,50 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
>  static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>  				      __virtio16 *idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*idx = avail->idx;

index can now be speculated.

> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *idx, &vq->avail->idx);
>  }
>  
>  static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>  				       __virtio16 *head, int idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*head = avail->ring[idx & (vq->num - 1)];


Since idx can be speculated, I guess we need array_index_nospec here?
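Something along these lines, maybe (untested sketch only, keeping the
existing power-of-2 masking and clamping the result against vq->num):

	*head = avail->ring[array_index_nospec(idx & (vq->num - 1),
					       vq->num)];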


> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *head,
>  			       &vq->avail->ring[idx & (vq->num - 1)]);
>  }
> @@ -994,24 +1420,98 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>  static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>  					__virtio16 *flags)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*flags = avail->flags;
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *flags, &vq->avail->flags);
>  }
>  
>  static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>  				       __virtio16 *event)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*event = (__virtio16)avail->ring[vq->num];
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *event, vhost_used_event(vq));
>  }
>  
>  static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>  				     __virtio16 *idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			*idx = used->idx;
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_used(vq, *idx, &vq->used->idx);
>  }


This seems to be used during init. Why do we bother
accelerating this?


>  
>  static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>  				 struct vring_desc *desc, int idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_desc *d;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_DESC];
> +		if (likely(map)) {
> +			d = map->addr;
> +			*desc = *(d + idx);


Since idx can be speculated, I guess we need array_index_nospec here?
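E.g. something like this (again just an untested sketch, assuming idx
has already been bounds-checked against vq->num by the caller):

	*desc = d[array_index_nospec(idx, vq->num)];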


> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
>  }
>  

I also wonder about the userspace address we get eventually.
It would seem that we need to prevent that from being speculated -
and that seems like a good idea even if this patch isn't
applied. As you are playing with micro-benchmarks, maybe
you could try the patch below?
It's unfortunately untested.
Thanks a lot in advance!

===>
vhost: block speculation of translated descriptors

iovec addresses coming from vhost are assumed to be
pre-validated, but in fact can be speculated to a value
out of range.

Userspace addresses are later validated with array_index_nospec so we can
be sure kernel info does not leak through these addresses, but vhost
must also not leak userspace info outside the allowed memory table to
guests.

Following the defence-in-depth principle, make sure
the address cannot be speculated out of the node range.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---


diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5dc174ac8cac..863e25011ef6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2072,7 +2076,9 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 		size = node->size - addr + node->start;
 		_iov->iov_len = min((u64)len - s, size);
 		_iov->iov_base = (void __user *)(unsigned long)
-			(node->userspace_addr + addr - node->start);
+			(node->userspace_addr +
+			 array_index_nospec(addr - node->start,
+					    node->size));
 		s += size;
 		addr += size;
 		++ret;

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
@ 2019-09-08 11:05     ` Michael S. Tsirkin
  0 siblings, 0 replies; 50+ messages in thread
From: Michael S. Tsirkin @ 2019-09-08 11:05 UTC (permalink / raw)
  To: Jason Wang
  Cc: aarcange, Christoph Hellwig, linux-parisc, kvm, netdev,
	linux-kernel, virtualization, James Bottomley, linux-mm, jglisse,
	jgg, David Miller, linux-arm-kernel

On Thu, Sep 05, 2019 at 08:27:36PM +0800, Jason Wang wrote:
> This is a rework on the commit 7f466032dc9e ("vhost: access vq
> metadata through kernel virtual address").
> 
> It was noticed that the copy_to/from_user() friends that was used to
> access virtqueue metdata tends to be very expensive for dataplane
> implementation like vhost since it involves lots of software checks,
> speculation barriers,

So if we drop speculation barrier,
there's a problem here in access will now be speculated.
This effectively disables the defence in depth effect of
b3bbfb3fb5d25776b8e3f361d2eedaabb0b496cd
    x86: Introduce __uaccess_begin_nospec() and uaccess_try_nospec


So now we need to sprinkle array_index_nospec or barrier_nospec over the
code whenever we use an index we got from userspace.
See below for some examples.


> hardware feature toggling (e.g SMAP). The
> extra cost will be more obvious when transferring small packets since
> the time spent on metadata accessing become more significant.
> 
> This patch tries to eliminate those overheads by accessing them
> through direct mapping of those pages. Invalidation callbacks is
> implemented for co-operation with general VM management (swap, KSM,
> THP or NUMA balancing). We will try to get the direct mapping of vq
> metadata before each round of packet processing if it doesn't
> exist. If we fail, we will simplely fallback to copy_to/from_user()
> friends.
> 
> This invalidation, direct mapping access and set are synchronized
> through spinlock. This takes a step back from the original commit
> 7f466032dc9e ("vhost: access vq metadata through kernel virtual
> address") which tries to RCU which is suspicious and hard to be
> reviewed. This won't perform as well as RCU because of the atomic,
> this could be addressed by the future optimization.
> 
> This method might does not work for high mem page which requires
> temporary mapping so we just fallback to normal
> copy_to/from_user() and may not for arch that has virtual tagged cache
> since extra cache flushing is needed to eliminate the alias. This will
> result complex logic and bad performance. For those archs, this patch
> simply go for copy_to/from_user() friends. This is done by ruling out
> kernel mapping codes through ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
> 
> Note that this is only done when device IOTLB is not enabled. We
> could use similar method to optimize IOTLB in the future.
> 
> Tests shows at most about 22% improvement on TX PPS when using
> virtio-user + vhost_net + xdp1 + TAP on 4.0GHz Kaby Lake.
> 
>         SMAP on | SMAP off
> Before: 4.9Mpps | 6.9Mpps
> After:  6.0Mpps | 7.5Mpps
> 
> On a elder CPU Sandy Bridge without SMAP support. TX PPS doesn't see
> any difference.

Why is not Kaby Lake with SMAP off the same as Sandy Bridge?


> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: David Miller <davem@davemloft.net>
> Cc: Jerome Glisse <jglisse@redhat.com>
> Cc: Jason Gunthorpe <jgg@mellanox.com>
> Cc: linux-mm@kvack.org
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-parisc@vger.kernel.org
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  drivers/vhost/vhost.c | 551 +++++++++++++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.h |  41 ++++
>  2 files changed, 589 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 791562e03fe0..f98155f28f02 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -298,6 +298,182 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
>  		__vhost_vq_meta_reset(d->vqs[i]);
>  }
>  
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +static void vhost_map_unprefetch(struct vhost_map *map)
> +{
> +	kfree(map->pages);
> +	kfree(map);
> +}
> +
> +static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
> +				struct vhost_map *map, int index)
> +{
> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> +	int i;
> +
> +	if (uaddr->write) {
> +		for (i = 0; i < map->npages; i++)
> +			set_page_dirty(map->pages[i]);
> +	}
> +}
> +
> +static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
> +{
> +	struct vhost_map *map[VHOST_NUM_ADDRS];
> +	int i;
> +
> +	spin_lock(&vq->mmu_lock);
> +	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> +		map[i] = vq->maps[i];
> +		if (map[i]) {
> +			vhost_set_map_dirty(vq, map[i], i);
> +			vq->maps[i] = NULL;
> +		}
> +	}
> +	spin_unlock(&vq->mmu_lock);
> +
> +	/* No need for synchronization since we are serialized with
> +	 * memory accessors (e.g vq mutex held).
> +	 */
> +
> +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> +		if (map[i])
> +			vhost_map_unprefetch(map[i]);
> +
> +}
> +
> +static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
> +{
> +	int i;
> +
> +	vhost_uninit_vq_maps(vq);
> +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> +		vq->uaddrs[i].size = 0;
> +}
> +
> +static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
> +				     unsigned long start,
> +				     unsigned long end)
> +{
> +	if (unlikely(!uaddr->size))
> +		return false;
> +
> +	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
> +}
> +
> +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
> +{
> +	spin_lock(&vq->mmu_lock);
> +}
> +
> +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
> +{
> +	spin_unlock(&vq->mmu_lock);
> +}
> +
> +static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
> +				     int index,
> +				     unsigned long start,
> +				     unsigned long end,
> +				     bool blockable)
> +{
> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> +	struct vhost_map *map;
> +
> +	if (!vhost_map_range_overlap(uaddr, start, end))
> +		return 0;
> +	else if (!blockable)
> +		return -EAGAIN;
> +
> +	spin_lock(&vq->mmu_lock);
> +	++vq->invalidate_count;
> +
> +	map = vq->maps[index];
> +	if (map)
> +		vq->maps[index] = NULL;
> +	spin_unlock(&vq->mmu_lock);
> +
> +	if (map) {
> +		vhost_set_map_dirty(vq, map, index);
> +		vhost_map_unprefetch(map);
> +	}
> +
> +	return 0;
> +}
> +
> +static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
> +				    int index,
> +				    unsigned long start,
> +				    unsigned long end)
> +{
> +	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
> +		return;
> +
> +	spin_lock(&vq->mmu_lock);
> +	--vq->invalidate_count;
> +	spin_unlock(&vq->mmu_lock);
> +}
> +
> +static int vhost_invalidate_range_start(struct mmu_notifier *mn,
> +					const struct mmu_notifier_range *range)
> +{
> +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> +					     mmu_notifier);
> +	bool blockable = mmu_notifier_range_blockable(range);
> +	int i, j, ret;
> +
> +	for (i = 0; i < dev->nvqs; i++) {
> +		struct vhost_virtqueue *vq = dev->vqs[i];
> +
> +		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
> +			ret = vhost_invalidate_vq_start(vq, j,
> +							range->start,
> +							range->end, blockable);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void vhost_invalidate_range_end(struct mmu_notifier *mn,
> +				       const struct mmu_notifier_range *range)
> +{
> +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> +					     mmu_notifier);
> +	int i, j;
> +
> +	for (i = 0; i < dev->nvqs; i++) {
> +		struct vhost_virtqueue *vq = dev->vqs[i];
> +
> +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> +			vhost_invalidate_vq_end(vq, j,
> +						range->start,
> +						range->end);
> +	}
> +}
> +
> +static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
> +	.invalidate_range_start = vhost_invalidate_range_start,
> +	.invalidate_range_end = vhost_invalidate_range_end,
> +};
> +
> +static void vhost_init_maps(struct vhost_dev *dev)
> +{
> +	struct vhost_virtqueue *vq;
> +	int i, j;
> +
> +	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
> +
> +	for (i = 0; i < dev->nvqs; ++i) {
> +		vq = dev->vqs[i];
> +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> +			vq->maps[j] = NULL;
> +	}
> +}
> +#endif
> +
>  static void vhost_vq_reset(struct vhost_dev *dev,
>  			   struct vhost_virtqueue *vq)
>  {
> @@ -326,7 +502,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->busyloop_timeout = 0;
>  	vq->umem = NULL;
>  	vq->iotlb = NULL;
> +	vq->invalidate_count = 0;
>  	__vhost_vq_meta_reset(vq);
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	vhost_reset_vq_maps(vq);
> +#endif
>  }
>  
>  static int vhost_worker(void *data)
> @@ -471,12 +651,15 @@ void vhost_dev_init(struct vhost_dev *dev,
>  	dev->iov_limit = iov_limit;
>  	dev->weight = weight;
>  	dev->byte_weight = byte_weight;
> +	dev->has_notifier = false;
>  	init_llist_head(&dev->work_list);
>  	init_waitqueue_head(&dev->wait);
>  	INIT_LIST_HEAD(&dev->read_list);
>  	INIT_LIST_HEAD(&dev->pending_list);
>  	spin_lock_init(&dev->iotlb_lock);
> -
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	vhost_init_maps(dev);
> +#endif
>  
>  	for (i = 0; i < dev->nvqs; ++i) {
>  		vq = dev->vqs[i];
> @@ -485,6 +668,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>  		vq->heads = NULL;
>  		vq->dev = dev;
>  		mutex_init(&vq->mutex);
> +		spin_lock_init(&vq->mmu_lock);
>  		vhost_vq_reset(dev, vq);
>  		if (vq->handle_kick)
>  			vhost_poll_init(&vq->poll, vq->handle_kick,
> @@ -564,7 +748,19 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>  	if (err)
>  		goto err_cgroup;
>  
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
> +	if (err)
> +		goto err_mmu_notifier;
> +#endif
> +	dev->has_notifier = true;
> +
>  	return 0;
> +
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +err_mmu_notifier:
> +	vhost_dev_free_iovecs(dev);
> +#endif
>  err_cgroup:
>  	kthread_stop(worker);
>  	dev->worker = NULL;
> @@ -655,6 +851,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>  	spin_unlock(&dev->iotlb_lock);
>  }
>  
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
> +			      int index, unsigned long uaddr,
> +			      size_t size, bool write)
> +{
> +	struct vhost_uaddr *addr = &vq->uaddrs[index];
> +
> +	addr->uaddr = uaddr;
> +	addr->size = size;
> +	addr->write = write;
> +}
> +
> +static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
> +{
> +	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
> +			  (unsigned long)vq->desc,
> +			  vhost_get_desc_size(vq, vq->num),
> +			  false);
> +	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
> +			  (unsigned long)vq->avail,
> +			  vhost_get_avail_size(vq, vq->num),
> +			  false);
> +	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
> +			  (unsigned long)vq->used,
> +			  vhost_get_used_size(vq, vq->num),
> +			  true);
> +}
> +
> +static int vhost_map_prefetch(struct vhost_virtqueue *vq,
> +			       int index)
> +{
> +	struct vhost_map *map;
> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> +	struct page **pages;
> +	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
> +	int npinned;
> +	void *vaddr, *v;
> +	int err;
> +	int i;
> +
> +	spin_lock(&vq->mmu_lock);
> +
> +	err = -EFAULT;
> +	if (vq->invalidate_count)
> +		goto err;
> +
> +	err = -ENOMEM;
> +	map = kmalloc(sizeof(*map), GFP_ATOMIC);
> +	if (!map)
> +		goto err;
> +
> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
> +	if (!pages)
> +		goto err_pages;
> +
> +	err = EFAULT;
> +	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
> +					uaddr->write, pages);
> +	if (npinned > 0)
> +		release_pages(pages, npinned);
> +	if (npinned != npages)
> +		goto err_gup;
> +
> +	for (i = 0; i < npinned; i++)
> +		if (PageHighMem(pages[i]))
> +			goto err_gup;
> +
> +	vaddr = v = page_address(pages[0]);
> +
> +	/* For simplicity, fallback to userspace address if VA is not
> +	 * contigious.
> +	 */
> +	for (i = 1; i < npinned; i++) {
> +		v += PAGE_SIZE;
> +		if (v != page_address(pages[i]))
> +			goto err_gup;
> +	}
> +
> +	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
> +	map->npages = npages;
> +	map->pages = pages;
> +
> +	vq->maps[index] = map;
> +	/* No need for a synchronize_rcu(). This function should be
> +	 * called by dev->worker so we are serialized with all
> +	 * readers.
> +	 */
> +	spin_unlock(&vq->mmu_lock);
> +
> +	return 0;
> +
> +err_gup:
> +	kfree(pages);
> +err_pages:
> +	kfree(map);
> +err:
> +	spin_unlock(&vq->mmu_lock);
> +	return err;
> +}
> +#endif
> +
>  void vhost_dev_cleanup(struct vhost_dev *dev)
>  {
>  	int i;
> @@ -684,8 +981,20 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  		kthread_stop(dev->worker);
>  		dev->worker = NULL;
>  	}
> -	if (dev->mm)
> +	if (dev->mm) {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +		if (dev->has_notifier) {
> +			mmu_notifier_unregister(&dev->mmu_notifier,
> +						dev->mm);
> +			dev->has_notifier = false;
> +		}
> +#endif
>  		mmput(dev->mm);
> +	}
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	for (i = 0; i < dev->nvqs; i++)
> +		vhost_uninit_vq_maps(dev->vqs[i]);
> +#endif
>  	dev->mm = NULL;
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> @@ -914,6 +1223,26 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
>  
>  static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			*((__virtio16 *)&used->ring[vq->num]) =
> +				cpu_to_vhost16(vq, vq->avail_idx);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
>  			      vhost_avail_event(vq));
>  }
> @@ -922,6 +1251,27 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>  				 struct vring_used_elem *head, int idx,
>  				 int count)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +	size_t size;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			size = count * sizeof(*head);
> +			memcpy(used->ring + idx, head, size);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
>  				  count * sizeof(*head));
>  }
> @@ -929,6 +1279,25 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>  static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>  
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			used->flags = cpu_to_vhost16(vq, vq->used_flags);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
>  			      &vq->used->flags);
>  }
> @@ -936,6 +1305,25 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>  static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>  
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>  			      &vq->used->idx);
>  }
> @@ -981,12 +1369,50 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
>  static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>  				      __virtio16 *idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*idx = avail->idx;

index can now be speculated.

> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *idx, &vq->avail->idx);
>  }
>  
>  static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>  				       __virtio16 *head, int idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*head = avail->ring[idx & (vq->num - 1)];


Since idx can be speculated, I guess we need array_index_nospec here?


> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *head,
>  			       &vq->avail->ring[idx & (vq->num - 1)]);
>  }
> @@ -994,24 +1420,98 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>  static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>  					__virtio16 *flags)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*flags = avail->flags;
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *flags, &vq->avail->flags);
>  }
>  
>  static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>  				       __virtio16 *event)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*event = (__virtio16)avail->ring[vq->num];
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *event, vhost_used_event(vq));
>  }
>  
>  static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>  				     __virtio16 *idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			*idx = used->idx;
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_used(vq, *idx, &vq->used->idx);
>  }


This seems to be used during init. Why do we bother
accelerating this?


>  
>  static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>  				 struct vring_desc *desc, int idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_desc *d;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_DESC];
> +		if (likely(map)) {
> +			d = map->addr;
> +			*desc = *(d + idx);


Since idx can be speculated, I guess we need array_index_nospec here?


> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
>  }
>  
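
To make that concrete, here is a minimal, untested sketch of what I
have in mind for the descriptor read above (the avail ring read flagged
earlier would get the same treatment). It assumes pulling
linux/nospec.h into this fast path is acceptable:

	#include <linux/nospec.h>	/* at the top of vhost.c */

	if (!vq->iotlb) {
		vhost_vq_access_map_begin(vq);

		map = vq->maps[VHOST_ADDR_DESC];
		if (likely(map)) {
			d = map->addr;
			/* idx comes from guest-visible memory (the avail
			 * ring head or desc->next) and is range-checked by
			 * the caller, but that check can be speculated
			 * past; clamp it before indexing the direct-mapped
			 * descriptor table.
			 */
			*desc = *(d + array_index_nospec(idx, vq->num));
			vhost_vq_access_map_end(vq);
			return 0;
		}

		vhost_vq_access_map_end(vq);
	}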

I also wonder about the userspace address we get eventually.
It would seem that we need to prevent that from being speculated -
and that seems like a good idea even if this patch isn't
applied. As you are playing with micro-benchmarks, maybe
you could try the below patch?
It's unfortunately untested.
Thanks a lot in advance!

===>
vhost: block speculation of translated descriptors

iovec addresses coming from vhost are assumed to be
pre-validated, but in fact can be speculated to a value
out of range.

Userspace addresses are later validated with array_index_nospec so we can
be sure kernel info does not leak through these addresses, but vhost
must also not leak userspace info outside the allowed memory table to
guests.

Following the defence in depth principle, make sure
the address is not speculated out of the node range.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---


diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5dc174ac8cac..863e25011ef6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2072,7 +2076,9 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 		size = node->size - addr + node->start;
 		_iov->iov_len = min((u64)len - s, size);
 		_iov->iov_base = (void __user *)(unsigned long)
-			(node->userspace_addr + addr - node->start);
+			(node->userspace_addr +
+			 array_index_nospec(addr - node->start,
+					    node->size));
 		s += size;
 		addr += size;
 		++ret;
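
In case it helps review: array_index_nospec() clamps with a data
dependency rather than a conditional branch, so the bound also holds on
speculative paths. Schematically (this is not the real macro body, and
"off" is just a local name used for illustration):

	/* off <  node->size  ->  off is used unchanged
	 * off >= node->size  ->  off is forced to 0, even while the
	 *                        CPU is speculating past a check
	 */
	u64 off = array_index_nospec(addr - node->start, node->size);

	_iov->iov_base = (void __user *)(unsigned long)
			 (node->userspace_addr + off);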

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
  2019-09-08 11:05     ` Michael S. Tsirkin
@ 2019-09-09  2:18       ` Jason Wang
  -1 siblings, 0 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-09  2:18 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, virtualization, netdev, linux-kernel, jgg, aarcange,
	jglisse, linux-mm, James Bottomley, Christoph Hellwig,
	David Miller, linux-arm-kernel, linux-parisc


On 2019/9/8 7:05 PM, Michael S. Tsirkin wrote:
> On Thu, Sep 05, 2019 at 08:27:36PM +0800, Jason Wang wrote:
>> This is a rework of commit 7f466032dc9e ("vhost: access vq
>> metadata through kernel virtual address").
>>
>> It was noticed that the copy_to/from_user() friends that were used to
>> access virtqueue metadata tend to be very expensive for dataplane
>> implementations like vhost since they involve lots of software checks,
>> speculation barriers,
> So if we drop the speculation barrier,
> there's a problem here in that the access will now be speculated.
> This effectively disables the defence in depth effect of
> b3bbfb3fb5d25776b8e3f361d2eedaabb0b496cd
>      x86: Introduce __uaccess_begin_nospec() and uaccess_try_nospec
>
>
> So now we need to sprinkle array_index_nospec or barrier_nospec over the
> code whenever we use an index we got from userspace.
> See below for some examples.
>
>
>> hardware feature toggling (e.g. SMAP). The
>> extra cost will be more obvious when transferring small packets since
>> the time spent on metadata accesses becomes more significant.
>>
>> This patch tries to eliminate those overheads by accessing them
>> through a direct mapping of those pages. Invalidation callbacks are
>> implemented for co-operation with general VM management (swap, KSM,
>> THP or NUMA balancing). We will try to get the direct mapping of vq
>> metadata before each round of packet processing if it doesn't
>> exist. If we fail, we simply fall back to the copy_to/from_user()
>> friends.
>>
>> The invalidation, direct mapping access and setup are synchronized
>> through a spinlock. This takes a step back from the original commit
>> 7f466032dc9e ("vhost: access vq metadata through kernel virtual
>> address"), which tried to use RCU, which is suspicious and hard to
>> review. This won't perform as well as RCU because of the atomic;
>> that could be addressed by a future optimization.
>>
>> This method does not work for highmem pages, which require a
>> temporary mapping, so we just fall back to normal
>> copy_to/from_user() there, and may not work for archs with virtually
>> tagged caches, since extra cache flushing is needed to eliminate the
>> aliases. That would result in complex logic and bad performance, so
>> for those archs this patch simply goes for the copy_to/from_user()
>> friends. This is done by ruling out the kernel mapping code through
>> ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
>>
>> Note that this is only done when device IOTLB is not enabled. We
>> could use similar method to optimize IOTLB in the future.
>>
>> Tests show at most about a 22% improvement in TX PPS when using
>> virtio-user + vhost_net + xdp1 + TAP on 4.0GHz Kaby Lake.
>>
>>          SMAP on | SMAP off
>> Before: 4.9Mpps | 6.9Mpps
>> After:  6.0Mpps | 7.5Mpps
>>
>> On an older Sandy Bridge CPU without SMAP support, TX PPS doesn't
>> show any difference.
> Why is not Kaby Lake with SMAP off the same as Sandy Bridge?


I don't know, I guess it was because of the atomic.


>
>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
>> Cc: Christoph Hellwig <hch@infradead.org>
>> Cc: David Miller <davem@davemloft.net>
>> Cc: Jerome Glisse <jglisse@redhat.com>
>> Cc: Jason Gunthorpe <jgg@mellanox.com>
>> Cc: linux-mm@kvack.org
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linux-parisc@vger.kernel.org
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 551 +++++++++++++++++++++++++++++++++++++++++-
>>   drivers/vhost/vhost.h |  41 ++++
>>   2 files changed, 589 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 791562e03fe0..f98155f28f02 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -298,6 +298,182 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
>>   		__vhost_vq_meta_reset(d->vqs[i]);
>>   }
>>   
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +static void vhost_map_unprefetch(struct vhost_map *map)
>> +{
>> +	kfree(map->pages);
>> +	kfree(map);
>> +}
>> +
>> +static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
>> +				struct vhost_map *map, int index)
>> +{
>> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
>> +	int i;
>> +
>> +	if (uaddr->write) {
>> +		for (i = 0; i < map->npages; i++)
>> +			set_page_dirty(map->pages[i]);
>> +	}
>> +}
>> +
>> +static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
>> +{
>> +	struct vhost_map *map[VHOST_NUM_ADDRS];
>> +	int i;
>> +
>> +	spin_lock(&vq->mmu_lock);
>> +	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
>> +		map[i] = vq->maps[i];
>> +		if (map[i]) {
>> +			vhost_set_map_dirty(vq, map[i], i);
>> +			vq->maps[i] = NULL;
>> +		}
>> +	}
>> +	spin_unlock(&vq->mmu_lock);
>> +
>> +	/* No need for synchronization since we are serialized with
>> +	 * memory accessors (e.g vq mutex held).
>> +	 */
>> +
>> +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
>> +		if (map[i])
>> +			vhost_map_unprefetch(map[i]);
>> +
>> +}
>> +
>> +static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
>> +{
>> +	int i;
>> +
>> +	vhost_uninit_vq_maps(vq);
>> +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
>> +		vq->uaddrs[i].size = 0;
>> +}
>> +
>> +static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
>> +				     unsigned long start,
>> +				     unsigned long end)
>> +{
>> +	if (unlikely(!uaddr->size))
>> +		return false;
>> +
>> +	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
>> +}
>> +
>> +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
>> +{
>> +	spin_lock(&vq->mmu_lock);
>> +}
>> +
>> +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
>> +{
>> +	spin_unlock(&vq->mmu_lock);
>> +}
>> +
>> +static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>> +				     int index,
>> +				     unsigned long start,
>> +				     unsigned long end,
>> +				     bool blockable)
>> +{
>> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
>> +	struct vhost_map *map;
>> +
>> +	if (!vhost_map_range_overlap(uaddr, start, end))
>> +		return 0;
>> +	else if (!blockable)
>> +		return -EAGAIN;
>> +
>> +	spin_lock(&vq->mmu_lock);
>> +	++vq->invalidate_count;
>> +
>> +	map = vq->maps[index];
>> +	if (map)
>> +		vq->maps[index] = NULL;
>> +	spin_unlock(&vq->mmu_lock);
>> +
>> +	if (map) {
>> +		vhost_set_map_dirty(vq, map, index);
>> +		vhost_map_unprefetch(map);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
>> +				    int index,
>> +				    unsigned long start,
>> +				    unsigned long end)
>> +{
>> +	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
>> +		return;
>> +
>> +	spin_lock(&vq->mmu_lock);
>> +	--vq->invalidate_count;
>> +	spin_unlock(&vq->mmu_lock);
>> +}
>> +
>> +static int vhost_invalidate_range_start(struct mmu_notifier *mn,
>> +					const struct mmu_notifier_range *range)
>> +{
>> +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
>> +					     mmu_notifier);
>> +	bool blockable = mmu_notifier_range_blockable(range);
>> +	int i, j, ret;
>> +
>> +	for (i = 0; i < dev->nvqs; i++) {
>> +		struct vhost_virtqueue *vq = dev->vqs[i];
>> +
>> +		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
>> +			ret = vhost_invalidate_vq_start(vq, j,
>> +							range->start,
>> +							range->end, blockable);
>> +			if (ret)
>> +				return ret;
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void vhost_invalidate_range_end(struct mmu_notifier *mn,
>> +				       const struct mmu_notifier_range *range)
>> +{
>> +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
>> +					     mmu_notifier);
>> +	int i, j;
>> +
>> +	for (i = 0; i < dev->nvqs; i++) {
>> +		struct vhost_virtqueue *vq = dev->vqs[i];
>> +
>> +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
>> +			vhost_invalidate_vq_end(vq, j,
>> +						range->start,
>> +						range->end);
>> +	}
>> +}
>> +
>> +static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
>> +	.invalidate_range_start = vhost_invalidate_range_start,
>> +	.invalidate_range_end = vhost_invalidate_range_end,
>> +};
>> +
>> +static void vhost_init_maps(struct vhost_dev *dev)
>> +{
>> +	struct vhost_virtqueue *vq;
>> +	int i, j;
>> +
>> +	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
>> +
>> +	for (i = 0; i < dev->nvqs; ++i) {
>> +		vq = dev->vqs[i];
>> +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
>> +			vq->maps[j] = NULL;
>> +	}
>> +}
>> +#endif
>> +
>>   static void vhost_vq_reset(struct vhost_dev *dev,
>>   			   struct vhost_virtqueue *vq)
>>   {
>> @@ -326,7 +502,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>>   	vq->busyloop_timeout = 0;
>>   	vq->umem = NULL;
>>   	vq->iotlb = NULL;
>> +	vq->invalidate_count = 0;
>>   	__vhost_vq_meta_reset(vq);
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	vhost_reset_vq_maps(vq);
>> +#endif
>>   }
>>   
>>   static int vhost_worker(void *data)
>> @@ -471,12 +651,15 @@ void vhost_dev_init(struct vhost_dev *dev,
>>   	dev->iov_limit = iov_limit;
>>   	dev->weight = weight;
>>   	dev->byte_weight = byte_weight;
>> +	dev->has_notifier = false;
>>   	init_llist_head(&dev->work_list);
>>   	init_waitqueue_head(&dev->wait);
>>   	INIT_LIST_HEAD(&dev->read_list);
>>   	INIT_LIST_HEAD(&dev->pending_list);
>>   	spin_lock_init(&dev->iotlb_lock);
>> -
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	vhost_init_maps(dev);
>> +#endif
>>   
>>   	for (i = 0; i < dev->nvqs; ++i) {
>>   		vq = dev->vqs[i];
>> @@ -485,6 +668,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>>   		vq->heads = NULL;
>>   		vq->dev = dev;
>>   		mutex_init(&vq->mutex);
>> +		spin_lock_init(&vq->mmu_lock);
>>   		vhost_vq_reset(dev, vq);
>>   		if (vq->handle_kick)
>>   			vhost_poll_init(&vq->poll, vq->handle_kick,
>> @@ -564,7 +748,19 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>>   	if (err)
>>   		goto err_cgroup;
>>   
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
>> +	if (err)
>> +		goto err_mmu_notifier;
>> +#endif
>> +	dev->has_notifier = true;
>> +
>>   	return 0;
>> +
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +err_mmu_notifier:
>> +	vhost_dev_free_iovecs(dev);
>> +#endif
>>   err_cgroup:
>>   	kthread_stop(worker);
>>   	dev->worker = NULL;
>> @@ -655,6 +851,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>>   	spin_unlock(&dev->iotlb_lock);
>>   }
>>   
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
>> +			      int index, unsigned long uaddr,
>> +			      size_t size, bool write)
>> +{
>> +	struct vhost_uaddr *addr = &vq->uaddrs[index];
>> +
>> +	addr->uaddr = uaddr;
>> +	addr->size = size;
>> +	addr->write = write;
>> +}
>> +
>> +static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
>> +{
>> +	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
>> +			  (unsigned long)vq->desc,
>> +			  vhost_get_desc_size(vq, vq->num),
>> +			  false);
>> +	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
>> +			  (unsigned long)vq->avail,
>> +			  vhost_get_avail_size(vq, vq->num),
>> +			  false);
>> +	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
>> +			  (unsigned long)vq->used,
>> +			  vhost_get_used_size(vq, vq->num),
>> +			  true);
>> +}
>> +
>> +static int vhost_map_prefetch(struct vhost_virtqueue *vq,
>> +			       int index)
>> +{
>> +	struct vhost_map *map;
>> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
>> +	struct page **pages;
>> +	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
>> +	int npinned;
>> +	void *vaddr, *v;
>> +	int err;
>> +	int i;
>> +
>> +	spin_lock(&vq->mmu_lock);
>> +
>> +	err = -EFAULT;
>> +	if (vq->invalidate_count)
>> +		goto err;
>> +
>> +	err = -ENOMEM;
>> +	map = kmalloc(sizeof(*map), GFP_ATOMIC);
>> +	if (!map)
>> +		goto err;
>> +
>> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
>> +	if (!pages)
>> +		goto err_pages;
>> +
>> +	err = EFAULT;
>> +	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
>> +					uaddr->write, pages);
>> +	if (npinned > 0)
>> +		release_pages(pages, npinned);
>> +	if (npinned != npages)
>> +		goto err_gup;
>> +
>> +	for (i = 0; i < npinned; i++)
>> +		if (PageHighMem(pages[i]))
>> +			goto err_gup;
>> +
>> +	vaddr = v = page_address(pages[0]);
>> +
>> +	/* For simplicity, fallback to userspace address if VA is not
>> +	 * contigious.
>> +	 */
>> +	for (i = 1; i < npinned; i++) {
>> +		v += PAGE_SIZE;
>> +		if (v != page_address(pages[i]))
>> +			goto err_gup;
>> +	}
>> +
>> +	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
>> +	map->npages = npages;
>> +	map->pages = pages;
>> +
>> +	vq->maps[index] = map;
>> +	/* No need for a synchronize_rcu(). This function should be
>> +	 * called by dev->worker so we are serialized with all
>> +	 * readers.
>> +	 */
>> +	spin_unlock(&vq->mmu_lock);
>> +
>> +	return 0;
>> +
>> +err_gup:
>> +	kfree(pages);
>> +err_pages:
>> +	kfree(map);
>> +err:
>> +	spin_unlock(&vq->mmu_lock);
>> +	return err;
>> +}
>> +#endif
>> +
>>   void vhost_dev_cleanup(struct vhost_dev *dev)
>>   {
>>   	int i;
>> @@ -684,8 +981,20 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>>   		kthread_stop(dev->worker);
>>   		dev->worker = NULL;
>>   	}
>> -	if (dev->mm)
>> +	if (dev->mm) {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +		if (dev->has_notifier) {
>> +			mmu_notifier_unregister(&dev->mmu_notifier,
>> +						dev->mm);
>> +			dev->has_notifier = false;
>> +		}
>> +#endif
>>   		mmput(dev->mm);
>> +	}
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	for (i = 0; i < dev->nvqs; i++)
>> +		vhost_uninit_vq_maps(dev->vqs[i]);
>> +#endif
>>   	dev->mm = NULL;
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
>> @@ -914,6 +1223,26 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
>>   
>>   static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_used *used;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_USED];
>> +		if (likely(map)) {
>> +			used = map->addr;
>> +			*((__virtio16 *)&used->ring[vq->num]) =
>> +				cpu_to_vhost16(vq, vq->avail_idx);
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
>>   			      vhost_avail_event(vq));
>>   }
>> @@ -922,6 +1251,27 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>>   				 struct vring_used_elem *head, int idx,
>>   				 int count)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_used *used;
>> +	size_t size;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_USED];
>> +		if (likely(map)) {
>> +			used = map->addr;
>> +			size = count * sizeof(*head);
>> +			memcpy(used->ring + idx, head, size);
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
>>   				  count * sizeof(*head));
>>   }
>> @@ -929,6 +1279,25 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>>   static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>>   
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_used *used;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_USED];
>> +		if (likely(map)) {
>> +			used = map->addr;
>> +			used->flags = cpu_to_vhost16(vq, vq->used_flags);
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
>>   			      &vq->used->flags);
>>   }
>> @@ -936,6 +1305,25 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>>   static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>>   
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_used *used;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_USED];
>> +		if (likely(map)) {
>> +			used = map->addr;
>> +			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>>   			      &vq->used->idx);
>>   }
>> @@ -981,12 +1369,50 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
>>   static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>>   				      __virtio16 *idx)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_avail *avail;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_AVAIL];
>> +		if (likely(map)) {
>> +			avail = map->addr;
>> +			*idx = avail->idx;
> index can now be speculated.

[...]


> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*head = avail->ring[idx & (vq->num - 1)];
>
> Since idx can be speculated, I guess we need array_index_nospec here?


So we have

ACQUIRE(mmu_lock)

get idx

RELEASE(mmu_lock)

ACQUIRE(mmu_lock)

read array[idx]

RELEASE(mmu_lock)

Then I think idx can't be speculated, considering we've passed RELEASE +
ACQUIRE?
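
Roughly, mapped onto the accessors in this patch (a schematic sketch,
not the exact call sequence in vhost_get_vq_desc()):

	/* critical section 1: fetch the guest-visible index */
	vhost_vq_access_map_begin(vq);		/* spin_lock(&vq->mmu_lock) */
	idx = avail->idx;			/* vhost_get_avail_idx() */
	vhost_vq_access_map_end(vq);		/* spin_unlock(&vq->mmu_lock) */

	/* ... the caller validates idx in between ... */

	/* critical section 2: use the index to read the ring */
	vhost_vq_access_map_begin(vq);
	head = avail->ring[idx & (vq->num - 1)];	/* vhost_get_avail_head() */
	vhost_vq_access_map_end(vq);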


>
>
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_get_avail(vq, *head,
>>   			       &vq->avail->ring[idx & (vq->num - 1)]);
>>   }
>> @@ -994,24 +1420,98 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>>   static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>>   					__virtio16 *flags)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_avail *avail;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_AVAIL];
>> +		if (likely(map)) {
>> +			avail = map->addr;
>> +			*flags = avail->flags;
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_get_avail(vq, *flags, &vq->avail->flags);
>>   }
>>   
>>   static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>>   				       __virtio16 *event)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_avail *avail;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +		map = vq->maps[VHOST_ADDR_AVAIL];
>> +		if (likely(map)) {
>> +			avail = map->addr;
>> +			*event = (__virtio16)avail->ring[vq->num];
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_get_avail(vq, *event, vhost_used_event(vq));
>>   }
>>   
>>   static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>>   				     __virtio16 *idx)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_used *used;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_USED];
>> +		if (likely(map)) {
>> +			used = map->addr;
>> +			*idx = used->idx;
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_get_used(vq, *idx, &vq->used->idx);
>>   }
>
> This seems to be used during init. Why do we bother
> accelerating this?


Ok, I can remove this part in the next version.


>
>
>>   
>>   static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>>   				 struct vring_desc *desc, int idx)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_desc *d;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_DESC];
>> +		if (likely(map)) {
>> +			d = map->addr;
>> +			*desc = *(d + idx);
>
> Since idx can be speculated, I guess we need array_index_nospec here?


This is similar to the above avail idx case.


>
>
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
>>   }
>>   
> I also wonder about the userspace address we get eventually.
> It would seem that we need to prevent that from being speculated -
> and that seems like a good idea even if this patch isn't
> applied. As you are playing with micro-benchmarks, maybe
> you could try the below patch?


Let me test it.

Thanks


> It's unfortunately untested.
> Thanks a lot in advance!
>
> ===>
> vhost: block speculation of translated descriptors
>
> iovec addresses coming from vhost are assumed to be
> pre-validated, but in fact can be speculated to a value
> out of range.
>
> Userspace addresses are later validated with array_index_nospec so we can
> be sure kernel info does not leak through these addresses, but vhost
> must also not leak userspace info outside the allowed memory table to
> guests.
>
> Following the defence in depth principle, make sure
> the address is not speculated out of the node range.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>
> ---
>
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 5dc174ac8cac..863e25011ef6 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2072,7 +2076,9 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
>   		size = node->size - addr + node->start;
>   		_iov->iov_len = min((u64)len - s, size);
>   		_iov->iov_base = (void __user *)(unsigned long)
> -			(node->userspace_addr + addr - node->start);
> +			(node->userspace_addr +
> +			 array_index_nospec(addr - node->start,
> +					    node->size));
>   		s += size;
>   		addr += size;
>   		++ret;

^ permalink raw reply	[flat|nested] 50+ messages in thread


* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
@ 2019-09-09  2:18       ` Jason Wang
  0 siblings, 0 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-09  2:18 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: aarcange, Christoph Hellwig, linux-parisc, kvm, netdev,
	linux-kernel, virtualization, James Bottomley, linux-mm, jglisse,
	jgg, David Miller, linux-arm-kernel


On 2019/9/8 7:05 PM, Michael S. Tsirkin wrote:
> On Thu, Sep 05, 2019 at 08:27:36PM +0800, Jason Wang wrote:
>> This is a rework of commit 7f466032dc9e ("vhost: access vq
>> metadata through kernel virtual address").
>>
>> It was noticed that the copy_to/from_user() friends that were used to
>> access virtqueue metadata tend to be very expensive for dataplane
>> implementations like vhost since they involve lots of software checks,
>> speculation barriers,
> So if we drop the speculation barrier,
> there's a problem here in that the access will now be speculated.
> This effectively disables the defence in depth effect of
> b3bbfb3fb5d25776b8e3f361d2eedaabb0b496cd
>      x86: Introduce __uaccess_begin_nospec() and uaccess_try_nospec
>
>
> So now we need to sprinkle array_index_nospec or barrier_nospec over the
> code whenever we use an index we got from userspace.
> See below for some examples.
>
>
>> hardware feature toggling (e.g. SMAP). The
>> extra cost will be more obvious when transferring small packets since
>> the time spent on metadata accesses becomes more significant.
>>
>> This patch tries to eliminate those overheads by accessing them
>> through a direct mapping of those pages. Invalidation callbacks are
>> implemented for co-operation with general VM management (swap, KSM,
>> THP or NUMA balancing). We will try to get the direct mapping of vq
>> metadata before each round of packet processing if it doesn't
>> exist. If we fail, we simply fall back to the copy_to/from_user()
>> friends.
>>
>> The invalidation, direct mapping access and setup are synchronized
>> through a spinlock. This takes a step back from the original commit
>> 7f466032dc9e ("vhost: access vq metadata through kernel virtual
>> address"), which tried to use RCU, which is suspicious and hard to
>> review. This won't perform as well as RCU because of the atomic;
>> that could be addressed by a future optimization.
>>
>> This method does not work for highmem pages, which require a
>> temporary mapping, so we just fall back to normal
>> copy_to/from_user() there, and may not work for archs with virtually
>> tagged caches, since extra cache flushing is needed to eliminate the
>> aliases. That would result in complex logic and bad performance, so
>> for those archs this patch simply goes for the copy_to/from_user()
>> friends. This is done by ruling out the kernel mapping code through
>> ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
>>
>> Note that this is only done when device IOTLB is not enabled. We
>> could use similar method to optimize IOTLB in the future.
>>
>> Tests show at most about a 22% improvement in TX PPS when using
>> virtio-user + vhost_net + xdp1 + TAP on 4.0GHz Kaby Lake.
>>
>>          SMAP on | SMAP off
>> Before: 4.9Mpps | 6.9Mpps
>> After:  6.0Mpps | 7.5Mpps
>>
>> On an older Sandy Bridge CPU without SMAP support, TX PPS doesn't
>> show any difference.
> Why is not Kaby Lake with SMAP off the same as Sandy Bridge?


I don't know, I guess it was because of the atomic.


>
>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
>> Cc: Christoph Hellwig <hch@infradead.org>
>> Cc: David Miller <davem@davemloft.net>
>> Cc: Jerome Glisse <jglisse@redhat.com>
>> Cc: Jason Gunthorpe <jgg@mellanox.com>
>> Cc: linux-mm@kvack.org
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linux-parisc@vger.kernel.org
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 551 +++++++++++++++++++++++++++++++++++++++++-
>>   drivers/vhost/vhost.h |  41 ++++
>>   2 files changed, 589 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 791562e03fe0..f98155f28f02 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -298,6 +298,182 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
>>   		__vhost_vq_meta_reset(d->vqs[i]);
>>   }
>>   
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +static void vhost_map_unprefetch(struct vhost_map *map)
>> +{
>> +	kfree(map->pages);
>> +	kfree(map);
>> +}
>> +
>> +static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
>> +				struct vhost_map *map, int index)
>> +{
>> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
>> +	int i;
>> +
>> +	if (uaddr->write) {
>> +		for (i = 0; i < map->npages; i++)
>> +			set_page_dirty(map->pages[i]);
>> +	}
>> +}
>> +
>> +static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
>> +{
>> +	struct vhost_map *map[VHOST_NUM_ADDRS];
>> +	int i;
>> +
>> +	spin_lock(&vq->mmu_lock);
>> +	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
>> +		map[i] = vq->maps[i];
>> +		if (map[i]) {
>> +			vhost_set_map_dirty(vq, map[i], i);
>> +			vq->maps[i] = NULL;
>> +		}
>> +	}
>> +	spin_unlock(&vq->mmu_lock);
>> +
>> +	/* No need for synchronization since we are serialized with
>> +	 * memory accessors (e.g vq mutex held).
>> +	 */
>> +
>> +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
>> +		if (map[i])
>> +			vhost_map_unprefetch(map[i]);
>> +
>> +}
>> +
>> +static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
>> +{
>> +	int i;
>> +
>> +	vhost_uninit_vq_maps(vq);
>> +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
>> +		vq->uaddrs[i].size = 0;
>> +}
>> +
>> +static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
>> +				     unsigned long start,
>> +				     unsigned long end)
>> +{
>> +	if (unlikely(!uaddr->size))
>> +		return false;
>> +
>> +	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
>> +}
>> +
>> +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
>> +{
>> +	spin_lock(&vq->mmu_lock);
>> +}
>> +
>> +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
>> +{
>> +	spin_unlock(&vq->mmu_lock);
>> +}
>> +
>> +static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>> +				     int index,
>> +				     unsigned long start,
>> +				     unsigned long end,
>> +				     bool blockable)
>> +{
>> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
>> +	struct vhost_map *map;
>> +
>> +	if (!vhost_map_range_overlap(uaddr, start, end))
>> +		return 0;
>> +	else if (!blockable)
>> +		return -EAGAIN;
>> +
>> +	spin_lock(&vq->mmu_lock);
>> +	++vq->invalidate_count;
>> +
>> +	map = vq->maps[index];
>> +	if (map)
>> +		vq->maps[index] = NULL;
>> +	spin_unlock(&vq->mmu_lock);
>> +
>> +	if (map) {
>> +		vhost_set_map_dirty(vq, map, index);
>> +		vhost_map_unprefetch(map);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
>> +				    int index,
>> +				    unsigned long start,
>> +				    unsigned long end)
>> +{
>> +	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
>> +		return;
>> +
>> +	spin_lock(&vq->mmu_lock);
>> +	--vq->invalidate_count;
>> +	spin_unlock(&vq->mmu_lock);
>> +}
>> +
>> +static int vhost_invalidate_range_start(struct mmu_notifier *mn,
>> +					const struct mmu_notifier_range *range)
>> +{
>> +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
>> +					     mmu_notifier);
>> +	bool blockable = mmu_notifier_range_blockable(range);
>> +	int i, j, ret;
>> +
>> +	for (i = 0; i < dev->nvqs; i++) {
>> +		struct vhost_virtqueue *vq = dev->vqs[i];
>> +
>> +		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
>> +			ret = vhost_invalidate_vq_start(vq, j,
>> +							range->start,
>> +							range->end, blockable);
>> +			if (ret)
>> +				return ret;
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void vhost_invalidate_range_end(struct mmu_notifier *mn,
>> +				       const struct mmu_notifier_range *range)
>> +{
>> +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
>> +					     mmu_notifier);
>> +	int i, j;
>> +
>> +	for (i = 0; i < dev->nvqs; i++) {
>> +		struct vhost_virtqueue *vq = dev->vqs[i];
>> +
>> +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
>> +			vhost_invalidate_vq_end(vq, j,
>> +						range->start,
>> +						range->end);
>> +	}
>> +}
>> +
>> +static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
>> +	.invalidate_range_start = vhost_invalidate_range_start,
>> +	.invalidate_range_end = vhost_invalidate_range_end,
>> +};
>> +
>> +static void vhost_init_maps(struct vhost_dev *dev)
>> +{
>> +	struct vhost_virtqueue *vq;
>> +	int i, j;
>> +
>> +	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
>> +
>> +	for (i = 0; i < dev->nvqs; ++i) {
>> +		vq = dev->vqs[i];
>> +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
>> +			vq->maps[j] = NULL;
>> +	}
>> +}
>> +#endif
>> +
>>   static void vhost_vq_reset(struct vhost_dev *dev,
>>   			   struct vhost_virtqueue *vq)
>>   {
>> @@ -326,7 +502,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>>   	vq->busyloop_timeout = 0;
>>   	vq->umem = NULL;
>>   	vq->iotlb = NULL;
>> +	vq->invalidate_count = 0;
>>   	__vhost_vq_meta_reset(vq);
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	vhost_reset_vq_maps(vq);
>> +#endif
>>   }
>>   
>>   static int vhost_worker(void *data)
>> @@ -471,12 +651,15 @@ void vhost_dev_init(struct vhost_dev *dev,
>>   	dev->iov_limit = iov_limit;
>>   	dev->weight = weight;
>>   	dev->byte_weight = byte_weight;
>> +	dev->has_notifier = false;
>>   	init_llist_head(&dev->work_list);
>>   	init_waitqueue_head(&dev->wait);
>>   	INIT_LIST_HEAD(&dev->read_list);
>>   	INIT_LIST_HEAD(&dev->pending_list);
>>   	spin_lock_init(&dev->iotlb_lock);
>> -
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	vhost_init_maps(dev);
>> +#endif
>>   
>>   	for (i = 0; i < dev->nvqs; ++i) {
>>   		vq = dev->vqs[i];
>> @@ -485,6 +668,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>>   		vq->heads = NULL;
>>   		vq->dev = dev;
>>   		mutex_init(&vq->mutex);
>> +		spin_lock_init(&vq->mmu_lock);
>>   		vhost_vq_reset(dev, vq);
>>   		if (vq->handle_kick)
>>   			vhost_poll_init(&vq->poll, vq->handle_kick,
>> @@ -564,7 +748,19 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>>   	if (err)
>>   		goto err_cgroup;
>>   
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
>> +	if (err)
>> +		goto err_mmu_notifier;
>> +#endif
>> +	dev->has_notifier = true;
>> +
>>   	return 0;
>> +
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +err_mmu_notifier:
>> +	vhost_dev_free_iovecs(dev);
>> +#endif
>>   err_cgroup:
>>   	kthread_stop(worker);
>>   	dev->worker = NULL;
>> @@ -655,6 +851,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>>   	spin_unlock(&dev->iotlb_lock);
>>   }
>>   
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
>> +			      int index, unsigned long uaddr,
>> +			      size_t size, bool write)
>> +{
>> +	struct vhost_uaddr *addr = &vq->uaddrs[index];
>> +
>> +	addr->uaddr = uaddr;
>> +	addr->size = size;
>> +	addr->write = write;
>> +}
>> +
>> +static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
>> +{
>> +	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
>> +			  (unsigned long)vq->desc,
>> +			  vhost_get_desc_size(vq, vq->num),
>> +			  false);
>> +	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
>> +			  (unsigned long)vq->avail,
>> +			  vhost_get_avail_size(vq, vq->num),
>> +			  false);
>> +	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
>> +			  (unsigned long)vq->used,
>> +			  vhost_get_used_size(vq, vq->num),
>> +			  true);
>> +}
>> +
>> +static int vhost_map_prefetch(struct vhost_virtqueue *vq,
>> +			       int index)
>> +{
>> +	struct vhost_map *map;
>> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
>> +	struct page **pages;
>> +	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
>> +	int npinned;
>> +	void *vaddr, *v;
>> +	int err;
>> +	int i;
>> +
>> +	spin_lock(&vq->mmu_lock);
>> +
>> +	err = -EFAULT;
>> +	if (vq->invalidate_count)
>> +		goto err;
>> +
>> +	err = -ENOMEM;
>> +	map = kmalloc(sizeof(*map), GFP_ATOMIC);
>> +	if (!map)
>> +		goto err;
>> +
>> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
>> +	if (!pages)
>> +		goto err_pages;
>> +
>> +	err = EFAULT;
>> +	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
>> +					uaddr->write, pages);
>> +	if (npinned > 0)
>> +		release_pages(pages, npinned);
>> +	if (npinned != npages)
>> +		goto err_gup;
>> +
>> +	for (i = 0; i < npinned; i++)
>> +		if (PageHighMem(pages[i]))
>> +			goto err_gup;
>> +
>> +	vaddr = v = page_address(pages[0]);
>> +
>> +	/* For simplicity, fallback to userspace address if VA is not
>> +	 * contiguous.
>> +	 */
>> +	for (i = 1; i < npinned; i++) {
>> +		v += PAGE_SIZE;
>> +		if (v != page_address(pages[i]))
>> +			goto err_gup;
>> +	}
>> +
>> +	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
>> +	map->npages = npages;
>> +	map->pages = pages;
>> +
>> +	vq->maps[index] = map;
>> +	/* No need for a synchronize_rcu(). This function should be
>> +	 * called by dev->worker so we are serialized with all
>> +	 * readers.
>> +	 */
>> +	spin_unlock(&vq->mmu_lock);
>> +
>> +	return 0;
>> +
>> +err_gup:
>> +	kfree(pages);
>> +err_pages:
>> +	kfree(map);
>> +err:
>> +	spin_unlock(&vq->mmu_lock);
>> +	return err;
>> +}
>> +#endif
>> +
>>   void vhost_dev_cleanup(struct vhost_dev *dev)
>>   {
>>   	int i;
>> @@ -684,8 +981,20 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>>   		kthread_stop(dev->worker);
>>   		dev->worker = NULL;
>>   	}
>> -	if (dev->mm)
>> +	if (dev->mm) {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +		if (dev->has_notifier) {
>> +			mmu_notifier_unregister(&dev->mmu_notifier,
>> +						dev->mm);
>> +			dev->has_notifier = false;
>> +		}
>> +#endif
>>   		mmput(dev->mm);
>> +	}
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	for (i = 0; i < dev->nvqs; i++)
>> +		vhost_uninit_vq_maps(dev->vqs[i]);
>> +#endif
>>   	dev->mm = NULL;
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
>> @@ -914,6 +1223,26 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
>>   
>>   static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_used *used;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_USED];
>> +		if (likely(map)) {
>> +			used = map->addr;
>> +			*((__virtio16 *)&used->ring[vq->num]) =
>> +				cpu_to_vhost16(vq, vq->avail_idx);
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
>>   			      vhost_avail_event(vq));
>>   }
>> @@ -922,6 +1251,27 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>>   				 struct vring_used_elem *head, int idx,
>>   				 int count)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_used *used;
>> +	size_t size;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_USED];
>> +		if (likely(map)) {
>> +			used = map->addr;
>> +			size = count * sizeof(*head);
>> +			memcpy(used->ring + idx, head, size);
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
>>   				  count * sizeof(*head));
>>   }
>> @@ -929,6 +1279,25 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>>   static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>>   
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_used *used;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_USED];
>> +		if (likely(map)) {
>> +			used = map->addr;
>> +			used->flags = cpu_to_vhost16(vq, vq->used_flags);
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
>>   			      &vq->used->flags);
>>   }
>> @@ -936,6 +1305,25 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>>   static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>>   
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_used *used;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_USED];
>> +		if (likely(map)) {
>> +			used = map->addr;
>> +			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>>   			      &vq->used->idx);
>>   }
>> @@ -981,12 +1369,50 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
>>   static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>>   				      __virtio16 *idx)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_avail *avail;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_AVAIL];
>> +		if (likely(map)) {
>> +			avail = map->addr;
>> +			*idx = avail->idx;
> index can now be speculated.

[...]


> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*head = avail->ring[idx & (vq->num - 1)];
>
> Since idx can be speculated, I guess we need array_index_nospec here?


So we have

ACQUIRE(mmu_lock)

get idx

RELEASE(mmu_lock)

ACQUIRE(mmu_lock)

read array[idx]

RELEASE(mmu_lock)

Then I think idx can't be speculated, considering we've passed a RELEASE +
ACQUIRE?
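
That said, if the clamp does turn out to be needed, it should just be a
one-liner at the point of use on the map path, something like this
(untested sketch, assuming <linux/nospec.h> is already pulled in by
vhost.c):

		map = vq->maps[VHOST_ADDR_AVAIL];
		if (likely(map)) {
			avail = map->addr;
			/* Clamp the guest supplied index before the
			 * dependent load so it can't be used out of
			 * range even speculatively.
			 */
			*head = avail->ring[array_index_nospec(idx & (vq->num - 1),
							       vq->num)];
			vhost_vq_access_map_end(vq);
			return 0;
		}

The mask already bounds the value architecturally, so this would be pure
defence in depth for the speculative path.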


>
>
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_get_avail(vq, *head,
>>   			       &vq->avail->ring[idx & (vq->num - 1)]);
>>   }
>> @@ -994,24 +1420,98 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>>   static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>>   					__virtio16 *flags)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_avail *avail;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_AVAIL];
>> +		if (likely(map)) {
>> +			avail = map->addr;
>> +			*flags = avail->flags;
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_get_avail(vq, *flags, &vq->avail->flags);
>>   }
>>   
>>   static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>>   				       __virtio16 *event)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_avail *avail;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +		map = vq->maps[VHOST_ADDR_AVAIL];
>> +		if (likely(map)) {
>> +			avail = map->addr;
>> +			*event = (__virtio16)avail->ring[vq->num];
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_get_avail(vq, *event, vhost_used_event(vq));
>>   }
>>   
>>   static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>>   				     __virtio16 *idx)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_used *used;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_USED];
>> +		if (likely(map)) {
>> +			used = map->addr;
>> +			*idx = used->idx;
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_get_used(vq, *idx, &vq->used->idx);
>>   }
>
> This seems to be used during init. Why do we bother
> accelerating this?


Ok, I can remove this part in next version.


>
>
>>   
>>   static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>>   				 struct vring_desc *desc, int idx)
>>   {
>> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> +	struct vhost_map *map;
>> +	struct vring_desc *d;
>> +
>> +	if (!vq->iotlb) {
>> +		vhost_vq_access_map_begin(vq);
>> +
>> +		map = vq->maps[VHOST_ADDR_DESC];
>> +		if (likely(map)) {
>> +			d = map->addr;
>> +			*desc = *(d + idx);
>
> Since idx can be speculated, I guess we need array_index_nospec here?


This is similar to the above avail idx case.
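
(And if we do decide to add it, it would again be a one-liner on the map
path, along the lines of

		*desc = *(d + array_index_nospec(idx, vq->num));

untested, and assuming idx has already been checked against vq->num by
the caller, as it is on the uaccess path.)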


>
>
>> +			vhost_vq_access_map_end(vq);
>> +			return 0;
>> +		}
>> +
>> +		vhost_vq_access_map_end(vq);
>> +	}
>> +#endif
>> +
>>   	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
>>   }
>>   
> I also wonder about the userspace address we get eventually.
> It would seem that we need to prevent that from speculating -
> and that seems like a good idea even if this patch isn't
> applied. As you are playing with micro-benchmarks, maybe
> you could try the below patch?


Let me test it.

Thanks


> It's unfortunately untested.
> Thanks a lot in advance!
>
> ===>
> vhost: block speculation of translated descriptors
>
> iovec addresses coming from vhost are assumed to be
> pre-validated, but in fact can be speculated to a value
> out of range.
>
> Userspace addresses are later validated with array_index_nospec so we can
> be sure kernel info does not leak through these addresses, but vhost
> must also not leak userspace info outside the allowed memory table to
> guests.
>
> Following the defence in depth principle, make sure
> the address is not validated out of node range.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>
> ---
>
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 5dc174ac8cac..863e25011ef6 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2072,7 +2076,9 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
>   		size = node->size - addr + node->start;
>   		_iov->iov_len = min((u64)len - s, size);
>   		_iov->iov_base = (void __user *)(unsigned long)
> -			(node->userspace_addr + addr - node->start);
> +			(node->userspace_addr +
> +			 array_index_nospec(addr - node->start,
> +					    node->size));
>   		s += size;
>   		addr += size;
>   		++ret;

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 0/2] Revert and rework on the metadata acceleration
  2019-09-07 15:03       ` Jason Gunthorpe
@ 2019-09-09  2:29         ` Jason Wang
  -1 siblings, 0 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-09  2:29 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: mst, kvm, virtualization, netdev, linux-kernel, aarcange,
	jglisse, linux-mm


On 2019/9/7 11:03 PM, Jason Gunthorpe wrote:
> On Fri, Sep 06, 2019 at 06:02:35PM +0800, Jason Wang wrote:
>> On 2019/9/5 9:59 PM, Jason Gunthorpe wrote:
>>> On Thu, Sep 05, 2019 at 08:27:34PM +0800, Jason Wang wrote:
>>>> Hi:
>>>>
>>>> Per request from Michael and Jason, the metadata acceleration is
>>>> reverted in this version and reworked in the next version.
>>>>
>>>> Please review.
>>>>
>>>> Thanks
>>>>
>>>> Jason Wang (2):
>>>>     Revert "vhost: access vq metadata through kernel virtual address"
>>>>     vhost: re-introducing metadata acceleration through kernel virtual
>>>>       address
>>> There are a bunch of patches in the queue already that will help
>>> vhost, and I am working on one for the next cycle that will help a lot
>>> more too.
>>
>> I will check those patches, but if you can give me some pointers or keywords
>> it would be much appreciated.
> You can look here:
>
> https://github.com/jgunthorpe/linux/commits/mmu_notifier
>
> The first parts, the get/put, are in the hmm tree, and the last part,
> the interval tree in the last commit, is still a WIP, but it would
> remove a lot of that code from vhost as well.
>
> Jason


Thanks a lot, will have a look at these and come back if I run into any issues.


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
  2019-09-09  2:18       ` Jason Wang
@ 2019-09-09  2:30         ` Jason Wang
  -1 siblings, 0 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-09  2:30 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, virtualization, netdev, linux-kernel, jgg, aarcange,
	jglisse, linux-mm, James Bottomley, Christoph Hellwig,
	David Miller, linux-arm-kernel, linux-parisc


On 2019/9/9 10:18 AM, Jason Wang wrote:
>>>
>>> On an older CPU (Sandy Bridge) without SMAP support, TX PPS doesn't
>>> see any difference.
>> Why is not Kaby Lake with SMAP off the same as Sandy Bridge?
>
>
> I don't know, I guess it was because the atomic is l 


Sorry, I meant atomic costs less for Kaby Lake.

Thanks



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
  2019-09-09  2:18       ` Jason Wang
  (?)
@ 2019-09-09  4:45         ` Michael S. Tsirkin
  -1 siblings, 0 replies; 50+ messages in thread
From: Michael S. Tsirkin @ 2019-09-09  4:45 UTC (permalink / raw)
  To: Jason Wang
  Cc: kvm, virtualization, netdev, linux-kernel, jgg, aarcange,
	jglisse, linux-mm, James Bottomley, Christoph Hellwig,
	David Miller, linux-arm-kernel, linux-parisc

On Mon, Sep 09, 2019 at 10:18:57AM +0800, Jason Wang wrote:
> 
> On 2019/9/8 7:05 PM, Michael S. Tsirkin wrote:
> > On Thu, Sep 05, 2019 at 08:27:36PM +0800, Jason Wang wrote:
> > > This is a rework on the commit 7f466032dc9e ("vhost: access vq
> > > metadata through kernel virtual address").
> > > 
> > > It was noticed that the copy_to/from_user() friends that were used to
> > > access virtqueue metadata tend to be very expensive for dataplane
> > > implementations like vhost since they involve lots of software checks,
> > > speculation barriers,
> > So if we drop the speculation barrier,
> > there's a problem here in that the access will now be speculated.
> > This effectively disables the defence in depth effect of
> > b3bbfb3fb5d25776b8e3f361d2eedaabb0b496cd
> >      x86: Introduce __uaccess_begin_nospec() and uaccess_try_nospec
> > 
> > 
> > So now we need to sprinkle array_index_nospec or barrier_nospec over the
> > code whenever we use an index we got from userspace.
> > See below for some examples.
> > 
> > 
> > > hardware feature toggling (e.g. SMAP). The
> > > extra cost will be more obvious when transferring small packets since
> > > the time spent on metadata accesses becomes more significant.
> > > 
> > > This patch tries to eliminate those overheads by accessing them
> > > through direct mapping of those pages. Invalidation callbacks are
> > > implemented for co-operation with general VM management (swap, KSM,
> > > THP or NUMA balancing). We will try to get the direct mapping of vq
> > > metadata before each round of packet processing if it doesn't
> > > exist. If we fail, we will simply fall back to the copy_to/from_user()
> > > friends.
> > > 
> > > The invalidation, direct mapping access and setup are synchronized
> > > through a spinlock. This takes a step back from the original commit
> > > 7f466032dc9e ("vhost: access vq metadata through kernel virtual
> > > address"), which tried to use RCU, which is suspicious and hard to
> > > review. This won't perform as well as RCU because of the atomic;
> > > that could be addressed by future optimization.
> > > 
> > > This method does not work for highmem pages, which require a
> > > temporary mapping, so we just fall back to normal
> > > copy_to/from_user() there, and it may not work for arches that have
> > > virtually tagged caches, since extra cache flushing is needed to
> > > eliminate the alias. That would result in complex logic and bad
> > > performance. For those arches, this patch simply goes for the
> > > copy_to/from_user() friends. This is done by ruling out the
> > > kernel mapping code through ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
> > > 
> > > Note that this is only done when device IOTLB is not enabled. We
> > > could use a similar method to optimize IOTLB in the future.
> > > 
> > > Tests show at most about a 22% improvement in TX PPS when using
> > > virtio-user + vhost_net + xdp1 + TAP on 4.0GHz Kaby Lake.
> > > 
> > >          SMAP on | SMAP off
> > > Before: 4.9Mpps | 6.9Mpps
> > > After:  6.0Mpps | 7.5Mpps
> > > 
> > > On an older CPU (Sandy Bridge) without SMAP support, TX PPS doesn't
> > > see any difference.
> > Why is not Kaby Lake with SMAP off the same as Sandy Bridge?
> 
> 
> I don't know, I guess it was because the atomic is l
> 
> 
> > 
> > 
> > > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
> > > Cc: Christoph Hellwig <hch@infradead.org>
> > > Cc: David Miller <davem@davemloft.net>
> > > Cc: Jerome Glisse <jglisse@redhat.com>
> > > Cc: Jason Gunthorpe <jgg@mellanox.com>
> > > Cc: linux-mm@kvack.org
> > > Cc: linux-arm-kernel@lists.infradead.org
> > > Cc: linux-parisc@vger.kernel.org
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 551 +++++++++++++++++++++++++++++++++++++++++-
> > >   drivers/vhost/vhost.h |  41 ++++
> > >   2 files changed, 589 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index 791562e03fe0..f98155f28f02 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -298,6 +298,182 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
> > >   		__vhost_vq_meta_reset(d->vqs[i]);
> > >   }
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +static void vhost_map_unprefetch(struct vhost_map *map)
> > > +{
> > > +	kfree(map->pages);
> > > +	kfree(map);
> > > +}
> > > +
> > > +static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
> > > +				struct vhost_map *map, int index)
> > > +{
> > > +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> > > +	int i;
> > > +
> > > +	if (uaddr->write) {
> > > +		for (i = 0; i < map->npages; i++)
> > > +			set_page_dirty(map->pages[i]);
> > > +	}
> > > +}
> > > +
> > > +static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
> > > +{
> > > +	struct vhost_map *map[VHOST_NUM_ADDRS];
> > > +	int i;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> > > +		map[i] = vq->maps[i];
> > > +		if (map[i]) {
> > > +			vhost_set_map_dirty(vq, map[i], i);
> > > +			vq->maps[i] = NULL;
> > > +		}
> > > +	}
> > > +	spin_unlock(&vq->mmu_lock);
> > > +
> > > +	/* No need for synchronization since we are serialized with
> > > +	 * memory accessors (e.g vq mutex held).
> > > +	 */
> > > +
> > > +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> > > +		if (map[i])
> > > +			vhost_map_unprefetch(map[i]);
> > > +
> > > +}
> > > +
> > > +static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
> > > +{
> > > +	int i;
> > > +
> > > +	vhost_uninit_vq_maps(vq);
> > > +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> > > +		vq->uaddrs[i].size = 0;
> > > +}
> > > +
> > > +static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
> > > +				     unsigned long start,
> > > +				     unsigned long end)
> > > +{
> > > +	if (unlikely(!uaddr->size))
> > > +		return false;
> > > +
> > > +	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
> > > +}
> > > +
> > > +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
> > > +{
> > > +	spin_lock(&vq->mmu_lock);
> > > +}
> > > +
> > > +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
> > > +{
> > > +	spin_unlock(&vq->mmu_lock);
> > > +}
> > > +
> > > +static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
> > > +				     int index,
> > > +				     unsigned long start,
> > > +				     unsigned long end,
> > > +				     bool blockable)
> > > +{
> > > +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> > > +	struct vhost_map *map;
> > > +
> > > +	if (!vhost_map_range_overlap(uaddr, start, end))
> > > +		return 0;
> > > +	else if (!blockable)
> > > +		return -EAGAIN;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +	++vq->invalidate_count;
> > > +
> > > +	map = vq->maps[index];
> > > +	if (map)
> > > +		vq->maps[index] = NULL;
> > > +	spin_unlock(&vq->mmu_lock);
> > > +
> > > +	if (map) {
> > > +		vhost_set_map_dirty(vq, map, index);
> > > +		vhost_map_unprefetch(map);
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
> > > +				    int index,
> > > +				    unsigned long start,
> > > +				    unsigned long end)
> > > +{
> > > +	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
> > > +		return;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +	--vq->invalidate_count;
> > > +	spin_unlock(&vq->mmu_lock);
> > > +}
> > > +
> > > +static int vhost_invalidate_range_start(struct mmu_notifier *mn,
> > > +					const struct mmu_notifier_range *range)
> > > +{
> > > +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> > > +					     mmu_notifier);
> > > +	bool blockable = mmu_notifier_range_blockable(range);
> > > +	int i, j, ret;
> > > +
> > > +	for (i = 0; i < dev->nvqs; i++) {
> > > +		struct vhost_virtqueue *vq = dev->vqs[i];
> > > +
> > > +		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
> > > +			ret = vhost_invalidate_vq_start(vq, j,
> > > +							range->start,
> > > +							range->end, blockable);
> > > +			if (ret)
> > > +				return ret;
> > > +		}
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void vhost_invalidate_range_end(struct mmu_notifier *mn,
> > > +				       const struct mmu_notifier_range *range)
> > > +{
> > > +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> > > +					     mmu_notifier);
> > > +	int i, j;
> > > +
> > > +	for (i = 0; i < dev->nvqs; i++) {
> > > +		struct vhost_virtqueue *vq = dev->vqs[i];
> > > +
> > > +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> > > +			vhost_invalidate_vq_end(vq, j,
> > > +						range->start,
> > > +						range->end);
> > > +	}
> > > +}
> > > +
> > > +static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
> > > +	.invalidate_range_start = vhost_invalidate_range_start,
> > > +	.invalidate_range_end = vhost_invalidate_range_end,
> > > +};
> > > +
> > > +static void vhost_init_maps(struct vhost_dev *dev)
> > > +{
> > > +	struct vhost_virtqueue *vq;
> > > +	int i, j;
> > > +
> > > +	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
> > > +
> > > +	for (i = 0; i < dev->nvqs; ++i) {
> > > +		vq = dev->vqs[i];
> > > +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> > > +			vq->maps[j] = NULL;
> > > +	}
> > > +}
> > > +#endif
> > > +
> > >   static void vhost_vq_reset(struct vhost_dev *dev,
> > >   			   struct vhost_virtqueue *vq)
> > >   {
> > > @@ -326,7 +502,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> > >   	vq->busyloop_timeout = 0;
> > >   	vq->umem = NULL;
> > >   	vq->iotlb = NULL;
> > > +	vq->invalidate_count = 0;
> > >   	__vhost_vq_meta_reset(vq);
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	vhost_reset_vq_maps(vq);
> > > +#endif
> > >   }
> > >   static int vhost_worker(void *data)
> > > @@ -471,12 +651,15 @@ void vhost_dev_init(struct vhost_dev *dev,
> > >   	dev->iov_limit = iov_limit;
> > >   	dev->weight = weight;
> > >   	dev->byte_weight = byte_weight;
> > > +	dev->has_notifier = false;
> > >   	init_llist_head(&dev->work_list);
> > >   	init_waitqueue_head(&dev->wait);
> > >   	INIT_LIST_HEAD(&dev->read_list);
> > >   	INIT_LIST_HEAD(&dev->pending_list);
> > >   	spin_lock_init(&dev->iotlb_lock);
> > > -
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	vhost_init_maps(dev);
> > > +#endif
> > >   	for (i = 0; i < dev->nvqs; ++i) {
> > >   		vq = dev->vqs[i];
> > > @@ -485,6 +668,7 @@ void vhost_dev_init(struct vhost_dev *dev,
> > >   		vq->heads = NULL;
> > >   		vq->dev = dev;
> > >   		mutex_init(&vq->mutex);
> > > +		spin_lock_init(&vq->mmu_lock);
> > >   		vhost_vq_reset(dev, vq);
> > >   		if (vq->handle_kick)
> > >   			vhost_poll_init(&vq->poll, vq->handle_kick,
> > > @@ -564,7 +748,19 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > >   	if (err)
> > >   		goto err_cgroup;
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
> > > +	if (err)
> > > +		goto err_mmu_notifier;
> > > +#endif
> > > +	dev->has_notifier = true;
> > > +
> > >   	return 0;
> > > +
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +err_mmu_notifier:
> > > +	vhost_dev_free_iovecs(dev);
> > > +#endif
> > >   err_cgroup:
> > >   	kthread_stop(worker);
> > >   	dev->worker = NULL;
> > > @@ -655,6 +851,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
> > >   	spin_unlock(&dev->iotlb_lock);
> > >   }
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
> > > +			      int index, unsigned long uaddr,
> > > +			      size_t size, bool write)
> > > +{
> > > +	struct vhost_uaddr *addr = &vq->uaddrs[index];
> > > +
> > > +	addr->uaddr = uaddr;
> > > +	addr->size = size;
> > > +	addr->write = write;
> > > +}
> > > +
> > > +static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
> > > +{
> > > +	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
> > > +			  (unsigned long)vq->desc,
> > > +			  vhost_get_desc_size(vq, vq->num),
> > > +			  false);
> > > +	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
> > > +			  (unsigned long)vq->avail,
> > > +			  vhost_get_avail_size(vq, vq->num),
> > > +			  false);
> > > +	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
> > > +			  (unsigned long)vq->used,
> > > +			  vhost_get_used_size(vq, vq->num),
> > > +			  true);
> > > +}
> > > +
> > > +static int vhost_map_prefetch(struct vhost_virtqueue *vq,
> > > +			       int index)
> > > +{
> > > +	struct vhost_map *map;
> > > +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> > > +	struct page **pages;
> > > +	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
> > > +	int npinned;
> > > +	void *vaddr, *v;
> > > +	int err;
> > > +	int i;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +
> > > +	err = -EFAULT;
> > > +	if (vq->invalidate_count)
> > > +		goto err;
> > > +
> > > +	err = -ENOMEM;
> > > +	map = kmalloc(sizeof(*map), GFP_ATOMIC);
> > > +	if (!map)
> > > +		goto err;
> > > +
> > > +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
> > > +	if (!pages)
> > > +		goto err_pages;
> > > +
> > > +	err = EFAULT;
> > > +	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
> > > +					uaddr->write, pages);
> > > +	if (npinned > 0)
> > > +		release_pages(pages, npinned);
> > > +	if (npinned != npages)
> > > +		goto err_gup;
> > > +
> > > +	for (i = 0; i < npinned; i++)
> > > +		if (PageHighMem(pages[i]))
> > > +			goto err_gup;
> > > +
> > > +	vaddr = v = page_address(pages[0]);
> > > +
> > > +	/* For simplicity, fallback to userspace address if VA is not
> > > +	 * contiguous.
> > > +	 */
> > > +	for (i = 1; i < npinned; i++) {
> > > +		v += PAGE_SIZE;
> > > +		if (v != page_address(pages[i]))
> > > +			goto err_gup;
> > > +	}
> > > +
> > > +	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
> > > +	map->npages = npages;
> > > +	map->pages = pages;
> > > +
> > > +	vq->maps[index] = map;
> > > +	/* No need for a synchronize_rcu(). This function should be
> > > +	 * called by dev->worker so we are serialized with all
> > > +	 * readers.
> > > +	 */
> > > +	spin_unlock(&vq->mmu_lock);
> > > +
> > > +	return 0;
> > > +
> > > +err_gup:
> > > +	kfree(pages);
> > > +err_pages:
> > > +	kfree(map);
> > > +err:
> > > +	spin_unlock(&vq->mmu_lock);
> > > +	return err;
> > > +}
> > > +#endif
> > > +
> > >   void vhost_dev_cleanup(struct vhost_dev *dev)
> > >   {
> > >   	int i;
> > > @@ -684,8 +981,20 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > >   		kthread_stop(dev->worker);
> > >   		dev->worker = NULL;
> > >   	}
> > > -	if (dev->mm)
> > > +	if (dev->mm) {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +		if (dev->has_notifier) {
> > > +			mmu_notifier_unregister(&dev->mmu_notifier,
> > > +						dev->mm);
> > > +			dev->has_notifier = false;
> > > +		}
> > > +#endif
> > >   		mmput(dev->mm);
> > > +	}
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	for (i = 0; i < dev->nvqs; i++)
> > > +		vhost_uninit_vq_maps(dev->vqs[i]);
> > > +#endif
> > >   	dev->mm = NULL;
> > >   }
> > >   EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > @@ -914,6 +1223,26 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
> > >   static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			*((__virtio16 *)&used->ring[vq->num]) =
> > > +				cpu_to_vhost16(vq, vq->avail_idx);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
> > >   			      vhost_avail_event(vq));
> > >   }
> > > @@ -922,6 +1251,27 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
> > >   				 struct vring_used_elem *head, int idx,
> > >   				 int count)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +	size_t size;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			size = count * sizeof(*head);
> > > +			memcpy(used->ring + idx, head, size);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
> > >   				  count * sizeof(*head));
> > >   }
> > > @@ -929,6 +1279,25 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
> > >   static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			used->flags = cpu_to_vhost16(vq, vq->used_flags);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
> > >   			      &vq->used->flags);
> > >   }
> > > @@ -936,6 +1305,25 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
> > >   static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
> > >   			      &vq->used->idx);
> > >   }
> > > @@ -981,12 +1369,50 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
> > >   static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
> > >   				      __virtio16 *idx)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_avail *avail;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > > +		if (likely(map)) {
> > > +			avail = map->addr;
> > > +			*idx = avail->idx;
> > index can now be speculated.
> 
> [...]
> 
> 
> > +		vhost_vq_access_map_begin(vq);
> > +
> > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > +		if (likely(map)) {
> > +			avail = map->addr;
> > +			*head = avail->ring[idx & (vq->num - 1)];
> > 
> > Since idx can be speculated, I guess we need array_index_nospec here?
> 
> 
> So we have
> 
> ACQUIRE(mmu_lock)
> 
> get idx
> 
> RELEASE(mmu_lock)
> 
> ACQUIRE(mmu_lock)
> 
> read array[idx]
> 
> RELEASE(mmu_lock)
> 
> Then I think idx can't be speculated, considering we've passed a RELEASE +
> ACQUIRE?

I don't think memory barriers have anything to do with speculation,
they are architectural.
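
The unlock/lock pair only constrains the ordering that becomes
architecturally visible; it does not stop the CPU from issuing the
dependent load speculatively, e.g. past a mispredicted check. If we want
a guarantee it has to be a data dependency at the point of use, roughly
(untested):

	*head = avail->ring[array_index_nospec(idx & (vq->num - 1), vq->num)];

or a barrier_nospec() after the check if you prefer the heavier hammer.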

> 
> > 
> > 
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_avail(vq, *head,
> > >   			       &vq->avail->ring[idx & (vq->num - 1)]);
> > >   }
> > > @@ -994,24 +1420,98 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
> > >   static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
> > >   					__virtio16 *flags)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_avail *avail;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > > +		if (likely(map)) {
> > > +			avail = map->addr;
> > > +			*flags = avail->flags;
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_avail(vq, *flags, &vq->avail->flags);
> > >   }
> > >   static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
> > >   				       __virtio16 *event)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_avail *avail;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > > +		if (likely(map)) {
> > > +			avail = map->addr;
> > > +			*event = (__virtio16)avail->ring[vq->num];
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_avail(vq, *event, vhost_used_event(vq));
> > >   }
> > >   static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
> > >   				     __virtio16 *idx)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			*idx = used->idx;
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_used(vq, *idx, &vq->used->idx);
> > >   }
> > 
> > This seems to be used during init. Why do we bother
> > accelerating this?
> 
> 
> Ok, I can remove this part in next version.
> 
> 
> > 
> > 
> > >   static inline int vhost_get_desc(struct vhost_virtqueue *vq,
> > >   				 struct vring_desc *desc, int idx)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_desc *d;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_DESC];
> > > +		if (likely(map)) {
> > > +			d = map->addr;
> > > +			*desc = *(d + idx);
> > 
> > Since idx can be speculated, I guess we need array_index_nospec here?
> 
> 
> This is similar to the above avail idx case.
> 
> 
> > 
> > 
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
> > >   }
> > I also wonder about the userspace address we get eventually.
> > It would seem that we need to prevent that from speculating -
> > and that seems like a good idea even if this patch isn't
> > applied. As you are playing with micro-benchmarks, maybe
> > you could try the below patch?
> 
> 
> Let me test it.
> 
> Thanks
> 
> 
> > It's unfortunately untested.
> > Thanks a lot in advance!
> > 
> > ===>
> > vhost: block speculation of translated descriptors
> > 
> > iovec addresses coming from vhost are assumed to be
> > pre-validated, but in fact can be speculated to a value
> > out of range.
> > 
> > Userspace addresses are later validated with array_index_nospec so we can
> > be sure kernel info does not leak through these addresses, but vhost
> > must also not leak userspace info outside the allowed memory table to
> > guests.
> > 
> > Following the defence in depth principle, make sure
> > the address is not validated out of node range.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > 
> > ---
> > 
> > 
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index 5dc174ac8cac..863e25011ef6 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -2072,7 +2076,9 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
> >   		size = node->size - addr + node->start;
> >   		_iov->iov_len = min((u64)len - s, size);
> >   		_iov->iov_base = (void __user *)(unsigned long)
> > -			(node->userspace_addr + addr - node->start);
> > +			(node->userspace_addr +
> > +			 array_index_nospec(addr - node->start,
> > +					    node->size));
> >   		s += size;
> >   		addr += size;
> >   		++ret;
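
For reference, array_index_nospec() clamps with a mask rather than a
branch; conceptually it is

	/* index if index < size, 0 otherwise; no branch to mispredict */
	index & array_index_mask_nospec(index, size)

(the real definition is in include/linux/nospec.h), so the bound holds
on the speculative path too, which is why it is much cheaper here than a
full speculation barrier.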

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
@ 2019-09-09  4:45         ` Michael S. Tsirkin
  0 siblings, 0 replies; 50+ messages in thread
From: Michael S. Tsirkin @ 2019-09-09  4:45 UTC (permalink / raw)
  To: Jason Wang
  Cc: aarcange, Christoph Hellwig, linux-parisc, kvm, netdev,
	linux-kernel, virtualization, James Bottomley, linux-mm, jglisse,
	jgg, David Miller, linux-arm-kernel

On Mon, Sep 09, 2019 at 10:18:57AM +0800, Jason Wang wrote:
> 
> On 2019/9/8 7:05 PM, Michael S. Tsirkin wrote:
> > On Thu, Sep 05, 2019 at 08:27:36PM +0800, Jason Wang wrote:
> > > This is a rework on the commit 7f466032dc9e ("vhost: access vq
> > > metadata through kernel virtual address").
> > > 
> > > It was noticed that the copy_to/from_user() friends that were used to
> > > access virtqueue metadata tend to be very expensive for dataplane
> > > implementations like vhost since they involve lots of software checks,
> > > speculation barriers,
> > So if we drop the speculation barrier,
> > there's a problem here in that the access will now be speculated.
> > This effectively disables the defence in depth effect of
> > b3bbfb3fb5d25776b8e3f361d2eedaabb0b496cd
> >      x86: Introduce __uaccess_begin_nospec() and uaccess_try_nospec
> > 
> > 
> > So now we need to sprinkle array_index_nospec or barrier_nospec over the
> > code whenever we use an index we got from userspace.
> > See below for some examples.
> > 
> > 
> > > hardware feature toggling (e.g. SMAP). The
> > > extra cost will be more obvious when transferring small packets since
> > > the time spent on metadata accesses becomes more significant.
> > > 
> > > This patch tries to eliminate those overheads by accessing them
> > > through direct mapping of those pages. Invalidation callbacks are
> > > implemented for co-operation with general VM management (swap, KSM,
> > > THP or NUMA balancing). We will try to get the direct mapping of vq
> > > metadata before each round of packet processing if it doesn't
> > > exist. If we fail, we will simply fall back to the copy_to/from_user()
> > > friends.
> > > 
> > > The invalidation, direct mapping access and setup are synchronized
> > > through a spinlock. This takes a step back from the original commit
> > > 7f466032dc9e ("vhost: access vq metadata through kernel virtual
> > > address"), which tried to use RCU, which is suspicious and hard to
> > > review. This won't perform as well as RCU because of the atomic;
> > > that could be addressed by future optimization.
> > > 
> > > This method does not work for highmem pages, which require a
> > > temporary mapping, so we just fall back to normal
> > > copy_to/from_user() there, and it may not work for arches that have
> > > virtually tagged caches, since extra cache flushing is needed to
> > > eliminate the alias. That would result in complex logic and bad
> > > performance. For those arches, this patch simply goes for the
> > > copy_to/from_user() friends. This is done by ruling out the
> > > kernel mapping code through ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
> > > 
> > > Note that this is only done when device IOTLB is not enabled. We
> > > could use a similar method to optimize IOTLB in the future.
> > > 
> > > Tests show at most about a 22% improvement in TX PPS when using
> > > virtio-user + vhost_net + xdp1 + TAP on 4.0GHz Kaby Lake.
> > > 
> > >          SMAP on | SMAP off
> > > Before: 4.9Mpps | 6.9Mpps
> > > After:  6.0Mpps | 7.5Mpps
> > > 
> > > On an older CPU (Sandy Bridge) without SMAP support, TX PPS doesn't
> > > see any difference.
> > Why is not Kaby Lake with SMAP off the same as Sandy Bridge?
> 
> 
> I don't know, I guess it was because the atomic is l
> 
> 
> > 
> > 
> > > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
> > > Cc: Christoph Hellwig <hch@infradead.org>
> > > Cc: David Miller <davem@davemloft.net>
> > > Cc: Jerome Glisse <jglisse@redhat.com>
> > > Cc: Jason Gunthorpe <jgg@mellanox.com>
> > > Cc: linux-mm@kvack.org
> > > Cc: linux-arm-kernel@lists.infradead.org
> > > Cc: linux-parisc@vger.kernel.org
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 551 +++++++++++++++++++++++++++++++++++++++++-
> > >   drivers/vhost/vhost.h |  41 ++++
> > >   2 files changed, 589 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index 791562e03fe0..f98155f28f02 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -298,6 +298,182 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
> > >   		__vhost_vq_meta_reset(d->vqs[i]);
> > >   }
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +static void vhost_map_unprefetch(struct vhost_map *map)
> > > +{
> > > +	kfree(map->pages);
> > > +	kfree(map);
> > > +}
> > > +
> > > +static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
> > > +				struct vhost_map *map, int index)
> > > +{
> > > +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> > > +	int i;
> > > +
> > > +	if (uaddr->write) {
> > > +		for (i = 0; i < map->npages; i++)
> > > +			set_page_dirty(map->pages[i]);
> > > +	}
> > > +}
> > > +
> > > +static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
> > > +{
> > > +	struct vhost_map *map[VHOST_NUM_ADDRS];
> > > +	int i;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> > > +		map[i] = vq->maps[i];
> > > +		if (map[i]) {
> > > +			vhost_set_map_dirty(vq, map[i], i);
> > > +			vq->maps[i] = NULL;
> > > +		}
> > > +	}
> > > +	spin_unlock(&vq->mmu_lock);
> > > +
> > > +	/* No need for synchronization since we are serialized with
> > > +	 * memory accessors (e.g vq mutex held).
> > > +	 */
> > > +
> > > +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> > > +		if (map[i])
> > > +			vhost_map_unprefetch(map[i]);
> > > +
> > > +}
> > > +
> > > +static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
> > > +{
> > > +	int i;
> > > +
> > > +	vhost_uninit_vq_maps(vq);
> > > +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> > > +		vq->uaddrs[i].size = 0;
> > > +}
> > > +
> > > +static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
> > > +				     unsigned long start,
> > > +				     unsigned long end)
> > > +{
> > > +	if (unlikely(!uaddr->size))
> > > +		return false;
> > > +
> > > +	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
> > > +}
> > > +
> > > +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
> > > +{
> > > +	spin_lock(&vq->mmu_lock);
> > > +}
> > > +
> > > +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
> > > +{
> > > +	spin_unlock(&vq->mmu_lock);
> > > +}
> > > +
> > > +static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
> > > +				     int index,
> > > +				     unsigned long start,
> > > +				     unsigned long end,
> > > +				     bool blockable)
> > > +{
> > > +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> > > +	struct vhost_map *map;
> > > +
> > > +	if (!vhost_map_range_overlap(uaddr, start, end))
> > > +		return 0;
> > > +	else if (!blockable)
> > > +		return -EAGAIN;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +	++vq->invalidate_count;
> > > +
> > > +	map = vq->maps[index];
> > > +	if (map)
> > > +		vq->maps[index] = NULL;
> > > +	spin_unlock(&vq->mmu_lock);
> > > +
> > > +	if (map) {
> > > +		vhost_set_map_dirty(vq, map, index);
> > > +		vhost_map_unprefetch(map);
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
> > > +				    int index,
> > > +				    unsigned long start,
> > > +				    unsigned long end)
> > > +{
> > > +	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
> > > +		return;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +	--vq->invalidate_count;
> > > +	spin_unlock(&vq->mmu_lock);
> > > +}
> > > +
> > > +static int vhost_invalidate_range_start(struct mmu_notifier *mn,
> > > +					const struct mmu_notifier_range *range)
> > > +{
> > > +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> > > +					     mmu_notifier);
> > > +	bool blockable = mmu_notifier_range_blockable(range);
> > > +	int i, j, ret;
> > > +
> > > +	for (i = 0; i < dev->nvqs; i++) {
> > > +		struct vhost_virtqueue *vq = dev->vqs[i];
> > > +
> > > +		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
> > > +			ret = vhost_invalidate_vq_start(vq, j,
> > > +							range->start,
> > > +							range->end, blockable);
> > > +			if (ret)
> > > +				return ret;
> > > +		}
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void vhost_invalidate_range_end(struct mmu_notifier *mn,
> > > +				       const struct mmu_notifier_range *range)
> > > +{
> > > +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> > > +					     mmu_notifier);
> > > +	int i, j;
> > > +
> > > +	for (i = 0; i < dev->nvqs; i++) {
> > > +		struct vhost_virtqueue *vq = dev->vqs[i];
> > > +
> > > +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> > > +			vhost_invalidate_vq_end(vq, j,
> > > +						range->start,
> > > +						range->end);
> > > +	}
> > > +}
> > > +
> > > +static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
> > > +	.invalidate_range_start = vhost_invalidate_range_start,
> > > +	.invalidate_range_end = vhost_invalidate_range_end,
> > > +};
> > > +
> > > +static void vhost_init_maps(struct vhost_dev *dev)
> > > +{
> > > +	struct vhost_virtqueue *vq;
> > > +	int i, j;
> > > +
> > > +	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
> > > +
> > > +	for (i = 0; i < dev->nvqs; ++i) {
> > > +		vq = dev->vqs[i];
> > > +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> > > +			vq->maps[j] = NULL;
> > > +	}
> > > +}
> > > +#endif
> > > +
> > >   static void vhost_vq_reset(struct vhost_dev *dev,
> > >   			   struct vhost_virtqueue *vq)
> > >   {
> > > @@ -326,7 +502,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> > >   	vq->busyloop_timeout = 0;
> > >   	vq->umem = NULL;
> > >   	vq->iotlb = NULL;
> > > +	vq->invalidate_count = 0;
> > >   	__vhost_vq_meta_reset(vq);
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	vhost_reset_vq_maps(vq);
> > > +#endif
> > >   }
> > >   static int vhost_worker(void *data)
> > > @@ -471,12 +651,15 @@ void vhost_dev_init(struct vhost_dev *dev,
> > >   	dev->iov_limit = iov_limit;
> > >   	dev->weight = weight;
> > >   	dev->byte_weight = byte_weight;
> > > +	dev->has_notifier = false;
> > >   	init_llist_head(&dev->work_list);
> > >   	init_waitqueue_head(&dev->wait);
> > >   	INIT_LIST_HEAD(&dev->read_list);
> > >   	INIT_LIST_HEAD(&dev->pending_list);
> > >   	spin_lock_init(&dev->iotlb_lock);
> > > -
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	vhost_init_maps(dev);
> > > +#endif
> > >   	for (i = 0; i < dev->nvqs; ++i) {
> > >   		vq = dev->vqs[i];
> > > @@ -485,6 +668,7 @@ void vhost_dev_init(struct vhost_dev *dev,
> > >   		vq->heads = NULL;
> > >   		vq->dev = dev;
> > >   		mutex_init(&vq->mutex);
> > > +		spin_lock_init(&vq->mmu_lock);
> > >   		vhost_vq_reset(dev, vq);
> > >   		if (vq->handle_kick)
> > >   			vhost_poll_init(&vq->poll, vq->handle_kick,
> > > @@ -564,7 +748,19 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > >   	if (err)
> > >   		goto err_cgroup;
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
> > > +	if (err)
> > > +		goto err_mmu_notifier;
> > > +#endif
> > > +	dev->has_notifier = true;
> > > +
> > >   	return 0;
> > > +
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +err_mmu_notifier:
> > > +	vhost_dev_free_iovecs(dev);
> > > +#endif
> > >   err_cgroup:
> > >   	kthread_stop(worker);
> > >   	dev->worker = NULL;
> > > @@ -655,6 +851,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
> > >   	spin_unlock(&dev->iotlb_lock);
> > >   }
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
> > > +			      int index, unsigned long uaddr,
> > > +			      size_t size, bool write)
> > > +{
> > > +	struct vhost_uaddr *addr = &vq->uaddrs[index];
> > > +
> > > +	addr->uaddr = uaddr;
> > > +	addr->size = size;
> > > +	addr->write = write;
> > > +}
> > > +
> > > +static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
> > > +{
> > > +	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
> > > +			  (unsigned long)vq->desc,
> > > +			  vhost_get_desc_size(vq, vq->num),
> > > +			  false);
> > > +	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
> > > +			  (unsigned long)vq->avail,
> > > +			  vhost_get_avail_size(vq, vq->num),
> > > +			  false);
> > > +	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
> > > +			  (unsigned long)vq->used,
> > > +			  vhost_get_used_size(vq, vq->num),
> > > +			  true);
> > > +}
> > > +
> > > +static int vhost_map_prefetch(struct vhost_virtqueue *vq,
> > > +			       int index)
> > > +{
> > > +	struct vhost_map *map;
> > > +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> > > +	struct page **pages;
> > > +	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
> > > +	int npinned;
> > > +	void *vaddr, *v;
> > > +	int err;
> > > +	int i;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +
> > > +	err = -EFAULT;
> > > +	if (vq->invalidate_count)
> > > +		goto err;
> > > +
> > > +	err = -ENOMEM;
> > > +	map = kmalloc(sizeof(*map), GFP_ATOMIC);
> > > +	if (!map)
> > > +		goto err;
> > > +
> > > +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
> > > +	if (!pages)
> > > +		goto err_pages;
> > > +
> > > +	err = EFAULT;
> > > +	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
> > > +					uaddr->write, pages);
> > > +	if (npinned > 0)
> > > +		release_pages(pages, npinned);
> > > +	if (npinned != npages)
> > > +		goto err_gup;
> > > +
> > > +	for (i = 0; i < npinned; i++)
> > > +		if (PageHighMem(pages[i]))
> > > +			goto err_gup;
> > > +
> > > +	vaddr = v = page_address(pages[0]);
> > > +
> > > +	/* For simplicity, fallback to userspace address if VA is not
> > > +	 * contigious.
> > > +	 */
> > > +	for (i = 1; i < npinned; i++) {
> > > +		v += PAGE_SIZE;
> > > +		if (v != page_address(pages[i]))
> > > +			goto err_gup;
> > > +	}
> > > +
> > > +	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
> > > +	map->npages = npages;
> > > +	map->pages = pages;
> > > +
> > > +	vq->maps[index] = map;
> > > +	/* No need for a synchronize_rcu(). This function should be
> > > +	 * called by dev->worker so we are serialized with all
> > > +	 * readers.
> > > +	 */
> > > +	spin_unlock(&vq->mmu_lock);
> > > +
> > > +	return 0;
> > > +
> > > +err_gup:
> > > +	kfree(pages);
> > > +err_pages:
> > > +	kfree(map);
> > > +err:
> > > +	spin_unlock(&vq->mmu_lock);
> > > +	return err;
> > > +}
> > > +#endif
> > > +
> > >   void vhost_dev_cleanup(struct vhost_dev *dev)
> > >   {
> > >   	int i;
> > > @@ -684,8 +981,20 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > >   		kthread_stop(dev->worker);
> > >   		dev->worker = NULL;
> > >   	}
> > > -	if (dev->mm)
> > > +	if (dev->mm) {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +		if (dev->has_notifier) {
> > > +			mmu_notifier_unregister(&dev->mmu_notifier,
> > > +						dev->mm);
> > > +			dev->has_notifier = false;
> > > +		}
> > > +#endif
> > >   		mmput(dev->mm);
> > > +	}
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	for (i = 0; i < dev->nvqs; i++)
> > > +		vhost_uninit_vq_maps(dev->vqs[i]);
> > > +#endif
> > >   	dev->mm = NULL;
> > >   }
> > >   EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > @@ -914,6 +1223,26 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
> > >   static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			*((__virtio16 *)&used->ring[vq->num]) =
> > > +				cpu_to_vhost16(vq, vq->avail_idx);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
> > >   			      vhost_avail_event(vq));
> > >   }
> > > @@ -922,6 +1251,27 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
> > >   				 struct vring_used_elem *head, int idx,
> > >   				 int count)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +	size_t size;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			size = count * sizeof(*head);
> > > +			memcpy(used->ring + idx, head, size);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
> > >   				  count * sizeof(*head));
> > >   }
> > > @@ -929,6 +1279,25 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
> > >   static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			used->flags = cpu_to_vhost16(vq, vq->used_flags);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
> > >   			      &vq->used->flags);
> > >   }
> > > @@ -936,6 +1305,25 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
> > >   static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
> > >   			      &vq->used->idx);
> > >   }
> > > @@ -981,12 +1369,50 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
> > >   static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
> > >   				      __virtio16 *idx)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_avail *avail;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > > +		if (likely(map)) {
> > > +			avail = map->addr;
> > > +			*idx = avail->idx;
> > index can now be speculated.
> 
> [...]
> 
> 
> > +		vhost_vq_access_map_begin(vq);
> > +
> > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > +		if (likely(map)) {
> > +			avail = map->addr;
> > +			*head = avail->ring[idx & (vq->num - 1)];
> > 
> > Since idx can be speculated, I guess we need array_index_nospec here?
> 
> 
> So we have
> 
> ACQUIRE(mmu_lock)
> 
> get idx
> 
> RELEASE(mmu_lock)
> 
> ACQUIRE(mmu_lock)
> 
> read array[idx]
> 
> RELEASE(mmu_lock)
> 
> Then I think idx can't be speculated considering we've passed RELEASE +
> ACQUIRE?

I don't think memory barriers have anything to do with speculation,
they are architectural.

> 
> > 
> > 
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_avail(vq, *head,
> > >   			       &vq->avail->ring[idx & (vq->num - 1)]);
> > >   }
> > > @@ -994,24 +1420,98 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
> > >   static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
> > >   					__virtio16 *flags)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_avail *avail;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > > +		if (likely(map)) {
> > > +			avail = map->addr;
> > > +			*flags = avail->flags;
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_avail(vq, *flags, &vq->avail->flags);
> > >   }
> > >   static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
> > >   				       __virtio16 *event)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_avail *avail;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > > +		if (likely(map)) {
> > > +			avail = map->addr;
> > > +			*event = (__virtio16)avail->ring[vq->num];
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_avail(vq, *event, vhost_used_event(vq));
> > >   }
> > >   static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
> > >   				     __virtio16 *idx)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			*idx = used->idx;
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_used(vq, *idx, &vq->used->idx);
> > >   }
> > 
> > This seems to be used during init. Why do we bother
> > accelerating this?
> 
> 
> Ok, I can remove this part in the next version.
> 
> 
> > 
> > 
> > >   static inline int vhost_get_desc(struct vhost_virtqueue *vq,
> > >   				 struct vring_desc *desc, int idx)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_desc *d;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_DESC];
> > > +		if (likely(map)) {
> > > +			d = map->addr;
> > > +			*desc = *(d + idx);
> > 
> > Since idx can be speculated, I guess we need array_index_nospec here?
> 
> 
> This is similar to the above avail idx case.
> 
> 
> > 
> > 
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
> > >   }
> > I also wonder about the userspace address we get eventually.
> > It would seem that we need to prevent that from speculating -
> > and that seems like a good idea even if this patch isn't
> > applied. As you are playing with micro-benchmarks, maybe
> > you could try the below patch?
> 
> 
> Let me test it.
> 
> Thanks
> 
> 
> > It's unfortunately untested.
> > Thanks a lot in advance!
> > 
> > ===>
> > vhost: block speculation of translated descriptors
> > 
> > iovec addresses coming from vhost are assumed to be
> > pre-validated, but in fact can be speculated to a value
> > out of range.
> > 
> > Userspace addresses are later validated with array_index_nospec so we can
> > be sure kernel info does not leak through these addresses, but vhost
> > must also not leak userspace info outside the allowed memory table to
> > guests.
> > 
> > Following the defence in depth principle, make sure
> > the address is not validated out of node range.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > 
> > ---
> > 
> > 
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index 5dc174ac8cac..863e25011ef6 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -2072,7 +2076,9 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
> >   		size = node->size - addr + node->start;
> >   		_iov->iov_len = min((u64)len - s, size);
> >   		_iov->iov_base = (void __user *)(unsigned long)
> > -			(node->userspace_addr + addr - node->start);
> > +			(node->userspace_addr +
> > +			 array_index_nospec(addr - node->start,
> > +					    node->size));
> >   		s += size;
> >   		addr += size;
> >   		++ret;
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
@ 2019-09-09  4:45         ` Michael S. Tsirkin
  0 siblings, 0 replies; 50+ messages in thread
From: Michael S. Tsirkin @ 2019-09-09  4:45 UTC (permalink / raw)
  To: Jason Wang
  Cc: aarcange, Christoph Hellwig, linux-parisc, kvm, netdev,
	linux-kernel, virtualization, James Bottomley, linux-mm, jglisse,
	jgg, David Miller, linux-arm-kernel

On Mon, Sep 09, 2019 at 10:18:57AM +0800, Jason Wang wrote:
> 
> On 2019/9/8 7:05 PM, Michael S. Tsirkin wrote:
> > On Thu, Sep 05, 2019 at 08:27:36PM +0800, Jason Wang wrote:
> > > This is a rework of commit 7f466032dc9e ("vhost: access vq
> > > metadata through kernel virtual address").
> > > 
> > > It was noticed that the copy_to/from_user() friends that were used to
> > > access virtqueue metadata tend to be very expensive for dataplane
> > > implementations like vhost since they involve lots of software checks,
> > > speculation barriers,
> > So if we drop the speculation barrier,
> > there's a problem here in that the access will now be speculated.
> > This effectively disables the defence-in-depth effect of
> > b3bbfb3fb5d25776b8e3f361d2eedaabb0b496cd
> >      x86: Introduce __uaccess_begin_nospec() and uaccess_try_nospec
> > 
> > 
> > So now we need to sprinkle array_index_nospec or barrier_nospec over the
> > code whenever we use an index we got from userspace.
> > See below for some examples.
> > 
> > 
> > > hardware feature toggling (e.g. SMAP). The
> > > extra cost will be more obvious when transferring small packets since
> > > the time spent on metadata accesses becomes more significant.
> > > 
> > > This patch tries to eliminate those overheads by accessing them
> > > through direct mapping of those pages. Invalidation callbacks are
> > > implemented for co-operation with general VM management (swap, KSM,
> > > THP or NUMA balancing). We will try to get the direct mapping of vq
> > > metadata before each round of packet processing if it doesn't
> > > exist. If we fail, we will simply fall back to the copy_to/from_user()
> > > friends.
> > > 
> > > This invalidation, direct mapping access and map setup are synchronized
> > > through a spinlock. This takes a step back from the original commit
> > > 7f466032dc9e ("vhost: access vq metadata through kernel virtual
> > > address"), which used RCU, an approach that is suspicious and hard to
> > > review. This won't perform as well as RCU because of the atomics;
> > > this could be addressed by a future optimization.
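
[ As an illustration of the flow just described -- not part of the posted
  patch -- here is a minimal, self-contained userspace model of the
  prefetch/access/invalidate protocol.  A pthread mutex stands in for
  vq->mmu_lock, a plain array for the pinned mapping, and a direct read
  for the copy_from_user() fallback; all names below are made up. ]

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct vq_model {
	pthread_mutex_t mmu_lock;	/* stands in for vq->mmu_lock          */
	int invalidate_count;		/* > 0 while an invalidation is active */
	uint16_t *map;			/* NULL once the mapping is torn down  */
	uint16_t backing[8];		/* stands in for the guest memory      */
};

/* mmu_notifier .invalidate_range_start: take the map away under the lock. */
static void invalidate_start(struct vq_model *vq)
{
	pthread_mutex_lock(&vq->mmu_lock);
	vq->invalidate_count++;
	vq->map = NULL;
	pthread_mutex_unlock(&vq->mmu_lock);
}

/* mmu_notifier .invalidate_range_end: allow prefetch to succeed again. */
static void invalidate_end(struct vq_model *vq)
{
	pthread_mutex_lock(&vq->mmu_lock);
	vq->invalidate_count--;
	pthread_mutex_unlock(&vq->mmu_lock);
}

/* Worker-side prefetch, tried once per processing round. */
static void prefetch(struct vq_model *vq)
{
	pthread_mutex_lock(&vq->mmu_lock);
	if (!vq->invalidate_count && !vq->map)
		vq->map = vq->backing;		/* "pinning" succeeded */
	pthread_mutex_unlock(&vq->mmu_lock);
}

/* Accessor: fast path under the lock, otherwise fall back. */
static uint16_t get_avail_idx(struct vq_model *vq)
{
	uint16_t idx;

	pthread_mutex_lock(&vq->mmu_lock);
	if (vq->map) {
		idx = vq->map[0];
		pthread_mutex_unlock(&vq->mmu_lock);
		return idx;
	}
	pthread_mutex_unlock(&vq->mmu_lock);

	return vq->backing[0];			/* the copy_from_user() path */
}

int main(void)
{
	struct vq_model vq = { .invalidate_count = 0, .map = NULL };

	pthread_mutex_init(&vq.mmu_lock, NULL);
	vq.backing[0] = 42;

	prefetch(&vq);
	printf("fast path:  %u\n", get_avail_idx(&vq));

	invalidate_start(&vq);
	printf("fallback:   %u\n", get_avail_idx(&vq));
	invalidate_end(&vq);

	prefetch(&vq);
	printf("fast again: %u\n", get_avail_idx(&vq));
	return 0;
}

[ Build with "cc -pthread".  The real code additionally pins the pages,
  marks them dirty on invalidation, and handles three address types
  (desc/avail/used) per vq, but the locking shape is the same. ]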
> > > 
> > > This method might not work for highmem pages, which require
> > > temporary mappings, so we just fall back to normal
> > > copy_to/from_user() there. It may also not work for archs with
> > > virtually tagged caches, since extra cache flushing is needed to
> > > eliminate the alias; this would result in complex logic and bad
> > > performance. For those archs, this patch simply goes for the
> > > copy_to/from_user() friends. This is done by ruling out the kernel
> > > mapping code through ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
> > > 
> > > Note that this is only done when device IOTLB is not enabled. We
> > > could use a similar method to optimize the IOTLB case in the future.
> > > 
> > > Tests show at most about a 22% improvement in TX PPS when using
> > > virtio-user + vhost_net + xdp1 + TAP on 4.0GHz Kaby Lake.
> > > 
> > >          SMAP on | SMAP off
> > > Before: 4.9Mpps | 6.9Mpps
> > > After:  6.0Mpps | 7.5Mpps
> > > 
> > > On an older Sandy Bridge CPU without SMAP support, TX PPS doesn't see
> > > any difference.
> > Why is Kaby Lake with SMAP off not the same as Sandy Bridge?
> 
> 
> I don't know, I guess it was because the atomic is l
> 
> 
> > 
> > 
> > > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
> > > Cc: Christoph Hellwig <hch@infradead.org>
> > > Cc: David Miller <davem@davemloft.net>
> > > Cc: Jerome Glisse <jglisse@redhat.com>
> > > Cc: Jason Gunthorpe <jgg@mellanox.com>
> > > Cc: linux-mm@kvack.org
> > > Cc: linux-arm-kernel@lists.infradead.org
> > > Cc: linux-parisc@vger.kernel.org
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 551 +++++++++++++++++++++++++++++++++++++++++-
> > >   drivers/vhost/vhost.h |  41 ++++
> > >   2 files changed, 589 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index 791562e03fe0..f98155f28f02 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -298,6 +298,182 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
> > >   		__vhost_vq_meta_reset(d->vqs[i]);
> > >   }
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +static void vhost_map_unprefetch(struct vhost_map *map)
> > > +{
> > > +	kfree(map->pages);
> > > +	kfree(map);
> > > +}
> > > +
> > > +static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
> > > +				struct vhost_map *map, int index)
> > > +{
> > > +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> > > +	int i;
> > > +
> > > +	if (uaddr->write) {
> > > +		for (i = 0; i < map->npages; i++)
> > > +			set_page_dirty(map->pages[i]);
> > > +	}
> > > +}
> > > +
> > > +static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
> > > +{
> > > +	struct vhost_map *map[VHOST_NUM_ADDRS];
> > > +	int i;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> > > +		map[i] = vq->maps[i];
> > > +		if (map[i]) {
> > > +			vhost_set_map_dirty(vq, map[i], i);
> > > +			vq->maps[i] = NULL;
> > > +		}
> > > +	}
> > > +	spin_unlock(&vq->mmu_lock);
> > > +
> > > +	/* No need for synchronization since we are serialized with
> > > +	 * memory accessors (e.g vq mutex held).
> > > +	 */
> > > +
> > > +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> > > +		if (map[i])
> > > +			vhost_map_unprefetch(map[i]);
> > > +
> > > +}
> > > +
> > > +static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
> > > +{
> > > +	int i;
> > > +
> > > +	vhost_uninit_vq_maps(vq);
> > > +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> > > +		vq->uaddrs[i].size = 0;
> > > +}
> > > +
> > > +static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
> > > +				     unsigned long start,
> > > +				     unsigned long end)
> > > +{
> > > +	if (unlikely(!uaddr->size))
> > > +		return false;
> > > +
> > > +	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
> > > +}
> > > +
> > > +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
> > > +{
> > > +	spin_lock(&vq->mmu_lock);
> > > +}
> > > +
> > > +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
> > > +{
> > > +	spin_unlock(&vq->mmu_lock);
> > > +}
> > > +
> > > +static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
> > > +				     int index,
> > > +				     unsigned long start,
> > > +				     unsigned long end,
> > > +				     bool blockable)
> > > +{
> > > +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> > > +	struct vhost_map *map;
> > > +
> > > +	if (!vhost_map_range_overlap(uaddr, start, end))
> > > +		return 0;
> > > +	else if (!blockable)
> > > +		return -EAGAIN;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +	++vq->invalidate_count;
> > > +
> > > +	map = vq->maps[index];
> > > +	if (map)
> > > +		vq->maps[index] = NULL;
> > > +	spin_unlock(&vq->mmu_lock);
> > > +
> > > +	if (map) {
> > > +		vhost_set_map_dirty(vq, map, index);
> > > +		vhost_map_unprefetch(map);
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
> > > +				    int index,
> > > +				    unsigned long start,
> > > +				    unsigned long end)
> > > +{
> > > +	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
> > > +		return;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +	--vq->invalidate_count;
> > > +	spin_unlock(&vq->mmu_lock);
> > > +}
> > > +
> > > +static int vhost_invalidate_range_start(struct mmu_notifier *mn,
> > > +					const struct mmu_notifier_range *range)
> > > +{
> > > +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> > > +					     mmu_notifier);
> > > +	bool blockable = mmu_notifier_range_blockable(range);
> > > +	int i, j, ret;
> > > +
> > > +	for (i = 0; i < dev->nvqs; i++) {
> > > +		struct vhost_virtqueue *vq = dev->vqs[i];
> > > +
> > > +		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
> > > +			ret = vhost_invalidate_vq_start(vq, j,
> > > +							range->start,
> > > +							range->end, blockable);
> > > +			if (ret)
> > > +				return ret;
> > > +		}
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void vhost_invalidate_range_end(struct mmu_notifier *mn,
> > > +				       const struct mmu_notifier_range *range)
> > > +{
> > > +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> > > +					     mmu_notifier);
> > > +	int i, j;
> > > +
> > > +	for (i = 0; i < dev->nvqs; i++) {
> > > +		struct vhost_virtqueue *vq = dev->vqs[i];
> > > +
> > > +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> > > +			vhost_invalidate_vq_end(vq, j,
> > > +						range->start,
> > > +						range->end);
> > > +	}
> > > +}
> > > +
> > > +static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
> > > +	.invalidate_range_start = vhost_invalidate_range_start,
> > > +	.invalidate_range_end = vhost_invalidate_range_end,
> > > +};
> > > +
> > > +static void vhost_init_maps(struct vhost_dev *dev)
> > > +{
> > > +	struct vhost_virtqueue *vq;
> > > +	int i, j;
> > > +
> > > +	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
> > > +
> > > +	for (i = 0; i < dev->nvqs; ++i) {
> > > +		vq = dev->vqs[i];
> > > +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> > > +			vq->maps[j] = NULL;
> > > +	}
> > > +}
> > > +#endif
> > > +
> > >   static void vhost_vq_reset(struct vhost_dev *dev,
> > >   			   struct vhost_virtqueue *vq)
> > >   {
> > > @@ -326,7 +502,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> > >   	vq->busyloop_timeout = 0;
> > >   	vq->umem = NULL;
> > >   	vq->iotlb = NULL;
> > > +	vq->invalidate_count = 0;
> > >   	__vhost_vq_meta_reset(vq);
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	vhost_reset_vq_maps(vq);
> > > +#endif
> > >   }
> > >   static int vhost_worker(void *data)
> > > @@ -471,12 +651,15 @@ void vhost_dev_init(struct vhost_dev *dev,
> > >   	dev->iov_limit = iov_limit;
> > >   	dev->weight = weight;
> > >   	dev->byte_weight = byte_weight;
> > > +	dev->has_notifier = false;
> > >   	init_llist_head(&dev->work_list);
> > >   	init_waitqueue_head(&dev->wait);
> > >   	INIT_LIST_HEAD(&dev->read_list);
> > >   	INIT_LIST_HEAD(&dev->pending_list);
> > >   	spin_lock_init(&dev->iotlb_lock);
> > > -
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	vhost_init_maps(dev);
> > > +#endif
> > >   	for (i = 0; i < dev->nvqs; ++i) {
> > >   		vq = dev->vqs[i];
> > > @@ -485,6 +668,7 @@ void vhost_dev_init(struct vhost_dev *dev,
> > >   		vq->heads = NULL;
> > >   		vq->dev = dev;
> > >   		mutex_init(&vq->mutex);
> > > +		spin_lock_init(&vq->mmu_lock);
> > >   		vhost_vq_reset(dev, vq);
> > >   		if (vq->handle_kick)
> > >   			vhost_poll_init(&vq->poll, vq->handle_kick,
> > > @@ -564,7 +748,19 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > >   	if (err)
> > >   		goto err_cgroup;
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
> > > +	if (err)
> > > +		goto err_mmu_notifier;
> > > +#endif
> > > +	dev->has_notifier = true;
> > > +
> > >   	return 0;
> > > +
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +err_mmu_notifier:
> > > +	vhost_dev_free_iovecs(dev);
> > > +#endif
> > >   err_cgroup:
> > >   	kthread_stop(worker);
> > >   	dev->worker = NULL;
> > > @@ -655,6 +851,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
> > >   	spin_unlock(&dev->iotlb_lock);
> > >   }
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
> > > +			      int index, unsigned long uaddr,
> > > +			      size_t size, bool write)
> > > +{
> > > +	struct vhost_uaddr *addr = &vq->uaddrs[index];
> > > +
> > > +	addr->uaddr = uaddr;
> > > +	addr->size = size;
> > > +	addr->write = write;
> > > +}
> > > +
> > > +static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
> > > +{
> > > +	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
> > > +			  (unsigned long)vq->desc,
> > > +			  vhost_get_desc_size(vq, vq->num),
> > > +			  false);
> > > +	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
> > > +			  (unsigned long)vq->avail,
> > > +			  vhost_get_avail_size(vq, vq->num),
> > > +			  false);
> > > +	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
> > > +			  (unsigned long)vq->used,
> > > +			  vhost_get_used_size(vq, vq->num),
> > > +			  true);
> > > +}
> > > +
> > > +static int vhost_map_prefetch(struct vhost_virtqueue *vq,
> > > +			       int index)
> > > +{
> > > +	struct vhost_map *map;
> > > +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> > > +	struct page **pages;
> > > +	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
> > > +	int npinned;
> > > +	void *vaddr, *v;
> > > +	int err;
> > > +	int i;
> > > +
> > > +	spin_lock(&vq->mmu_lock);
> > > +
> > > +	err = -EFAULT;
> > > +	if (vq->invalidate_count)
> > > +		goto err;
> > > +
> > > +	err = -ENOMEM;
> > > +	map = kmalloc(sizeof(*map), GFP_ATOMIC);
> > > +	if (!map)
> > > +		goto err;
> > > +
> > > +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
> > > +	if (!pages)
> > > +		goto err_pages;
> > > +
> > > +	err = EFAULT;
> > > +	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
> > > +					uaddr->write, pages);
> > > +	if (npinned > 0)
> > > +		release_pages(pages, npinned);
> > > +	if (npinned != npages)
> > > +		goto err_gup;
> > > +
> > > +	for (i = 0; i < npinned; i++)
> > > +		if (PageHighMem(pages[i]))
> > > +			goto err_gup;
> > > +
> > > +	vaddr = v = page_address(pages[0]);
> > > +
> > > +	/* For simplicity, fallback to userspace address if VA is not
> > > +	 * contigious.
> > > +	 */
> > > +	for (i = 1; i < npinned; i++) {
> > > +		v += PAGE_SIZE;
> > > +		if (v != page_address(pages[i]))
> > > +			goto err_gup;
> > > +	}
> > > +
> > > +	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
> > > +	map->npages = npages;
> > > +	map->pages = pages;
> > > +
> > > +	vq->maps[index] = map;
> > > +	/* No need for a synchronize_rcu(). This function should be
> > > +	 * called by dev->worker so we are serialized with all
> > > +	 * readers.
> > > +	 */
> > > +	spin_unlock(&vq->mmu_lock);
> > > +
> > > +	return 0;
> > > +
> > > +err_gup:
> > > +	kfree(pages);
> > > +err_pages:
> > > +	kfree(map);
> > > +err:
> > > +	spin_unlock(&vq->mmu_lock);
> > > +	return err;
> > > +}
> > > +#endif
> > > +
> > >   void vhost_dev_cleanup(struct vhost_dev *dev)
> > >   {
> > >   	int i;
> > > @@ -684,8 +981,20 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > >   		kthread_stop(dev->worker);
> > >   		dev->worker = NULL;
> > >   	}
> > > -	if (dev->mm)
> > > +	if (dev->mm) {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +		if (dev->has_notifier) {
> > > +			mmu_notifier_unregister(&dev->mmu_notifier,
> > > +						dev->mm);
> > > +			dev->has_notifier = false;
> > > +		}
> > > +#endif
> > >   		mmput(dev->mm);
> > > +	}
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	for (i = 0; i < dev->nvqs; i++)
> > > +		vhost_uninit_vq_maps(dev->vqs[i]);
> > > +#endif
> > >   	dev->mm = NULL;
> > >   }
> > >   EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > @@ -914,6 +1223,26 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
> > >   static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			*((__virtio16 *)&used->ring[vq->num]) =
> > > +				cpu_to_vhost16(vq, vq->avail_idx);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
> > >   			      vhost_avail_event(vq));
> > >   }
> > > @@ -922,6 +1251,27 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
> > >   				 struct vring_used_elem *head, int idx,
> > >   				 int count)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +	size_t size;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			size = count * sizeof(*head);
> > > +			memcpy(used->ring + idx, head, size);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
> > >   				  count * sizeof(*head));
> > >   }
> > > @@ -929,6 +1279,25 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
> > >   static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			used->flags = cpu_to_vhost16(vq, vq->used_flags);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
> > >   			      &vq->used->flags);
> > >   }
> > > @@ -936,6 +1305,25 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
> > >   static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
> > >   			      &vq->used->idx);
> > >   }
> > > @@ -981,12 +1369,50 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
> > >   static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
> > >   				      __virtio16 *idx)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_avail *avail;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > > +		if (likely(map)) {
> > > +			avail = map->addr;
> > > +			*idx = avail->idx;
> > index can now be speculated.
> 
> [...]
> 
> 
> > +		vhost_vq_access_map_begin(vq);
> > +
> > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > +		if (likely(map)) {
> > +			avail = map->addr;
> > +			*head = avail->ring[idx & (vq->num - 1)];
> > 
> > Since idx can be speculated, I guess we need array_index_nospec here?
> 
> 
> So we have
> 
> ACQUIRE(mmu_lock)
> 
> get idx
> 
> RELEASE(mmu_lock)
> 
> ACQUIRE(mmu_lock)
> 
> read array[idx]
> 
> RELEASE(mmu_lock)
> 
> Then I think idx can't be speculated considering we've passed RELEASE +
> ACQUIRE?

I don't think memory barriers have anything to do with speculation,
they are architectural.
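
[ For readers following this exchange, a small self-contained userspace
  model of the clamp being discussed.  array_index_mask_nospec() below is
  modeled on the generic branchless fallback in <linux/nospec.h>; the
  vq->num stand-in and the placement shown in the last comment are
  illustrative assumptions, not part of the posted patch. ]

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Branchless mask: all ones when index < size, zero otherwise, so a
 * mis-speculated out-of-range index is forced to 0 rather than being
 * used to address memory.  Like the kernel helper, this relies on an
 * arithmetic right shift of a negative value.
 */
static unsigned long array_index_mask_nospec(unsigned long index,
					     unsigned long size)
{
	return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
}

#define array_index_nospec(index, size) \
	((index) & array_index_mask_nospec((index), (size)))

int main(void)
{
	unsigned long num = 256;	/* stand-in for vq->num */

	/* An in-range index passes through unchanged ... */
	printf("%lu\n", array_index_nospec(44UL, num));		/* prints 44 */
	/* ... while an out-of-range one collapses to 0. */
	printf("%lu\n", array_index_nospec(300UL, num));	/* prints 0  */

	/* Where the suggested clamp could sit in vhost_get_avail_head():
	 *
	 *	*head = avail->ring[array_index_nospec(idx & (vq->num - 1),
	 *					       vq->num)];
	 */
	return 0;
}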

> 
> > 
> > 
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_avail(vq, *head,
> > >   			       &vq->avail->ring[idx & (vq->num - 1)]);
> > >   }
> > > @@ -994,24 +1420,98 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
> > >   static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
> > >   					__virtio16 *flags)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_avail *avail;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > > +		if (likely(map)) {
> > > +			avail = map->addr;
> > > +			*flags = avail->flags;
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_avail(vq, *flags, &vq->avail->flags);
> > >   }
> > >   static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
> > >   				       __virtio16 *event)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_avail *avail;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +		map = vq->maps[VHOST_ADDR_AVAIL];
> > > +		if (likely(map)) {
> > > +			avail = map->addr;
> > > +			*event = (__virtio16)avail->ring[vq->num];
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_avail(vq, *event, vhost_used_event(vq));
> > >   }
> > >   static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
> > >   				     __virtio16 *idx)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_used *used;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_USED];
> > > +		if (likely(map)) {
> > > +			used = map->addr;
> > > +			*idx = used->idx;
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_get_used(vq, *idx, &vq->used->idx);
> > >   }
> > 
> > This seems to be used during init. Why do we bother
> > accelerating this?
> 
> 
> Ok, I can remove this part in the next version.
> 
> 
> > 
> > 
> > >   static inline int vhost_get_desc(struct vhost_virtqueue *vq,
> > >   				 struct vring_desc *desc, int idx)
> > >   {
> > > +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> > > +	struct vhost_map *map;
> > > +	struct vring_desc *d;
> > > +
> > > +	if (!vq->iotlb) {
> > > +		vhost_vq_access_map_begin(vq);
> > > +
> > > +		map = vq->maps[VHOST_ADDR_DESC];
> > > +		if (likely(map)) {
> > > +			d = map->addr;
> > > +			*desc = *(d + idx);
> > 
> > Since idx can be speculated, I guess we need array_index_nospec here?
> 
> 
> This is similar to the above avail idx case.
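
[ Purely as a hypothetical sketch mirroring the avail case above -- not
  something the posted patch does -- the descriptor read would then look
  roughly like:

	*desc = *(d + array_index_nospec(idx, vq->num));

  with vq->num as the bound. ]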
> 
> 
> > 
> > 
> > > +			vhost_vq_access_map_end(vq);
> > > +			return 0;
> > > +		}
> > > +
> > > +		vhost_vq_access_map_end(vq);
> > > +	}
> > > +#endif
> > > +
> > >   	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
> > >   }
> > I also wonder about the userspace address we get eventually.
> > It would seem that we need to prevent that from speculating -
> > and that seems like a good idea even if this patch isn't
> > applied. As you are playing with micro-benchmarks, maybe
> > you could try the below patch?
> 
> 
> Let me test it.
> 
> Thanks
> 
> 
> > It's unfortunately untested.
> > Thanks a lot in advance!
> > 
> > ===>
> > vhost: block speculation of translated descriptors
> > 
> > iovec addresses coming from vhost are assumed to be
> > pre-validated, but in fact can be speculated to a value
> > out of range.
> > 
> > Userspace addresses are later validated with array_index_nospec so we can
> > be sure kernel info does not leak through these addresses, but vhost
> > must also not leak userspace info outside the allowed memory table to
> > guests.
> > 
> > Following the defence in depth principle, make sure
> > the address is not validated out of node range.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > 
> > ---
> > 
> > 
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index 5dc174ac8cac..863e25011ef6 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -2072,7 +2076,9 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
> >   		size = node->size - addr + node->start;
> >   		_iov->iov_len = min((u64)len - s, size);
> >   		_iov->iov_base = (void __user *)(unsigned long)
> > -			(node->userspace_addr + addr - node->start);
> > +			(node->userspace_addr +
> > +			 array_index_nospec(addr - node->start,
> > +					    node->size));
> >   		s += size;
> >   		addr += size;
> >   		++ret;
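
[ A worked example of the clamp above, with made-up numbers:

	node->start = 0x1000, node->size = 0x2000
	addr = 0x1800  ->  addr - node->start = 0x0800  (< size, unchanged)
	addr = 0x4000  ->  addr - node->start = 0x3000  (>= size, forced to 0)

  so even a mis-speculated addr cannot push iov_base outside the node. ]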

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/2] Revert "vhost: access vq metadata through kernel virtual address"
  2019-09-06 13:46   ` Michael S. Tsirkin
@ 2019-09-09  7:16     ` Jason Wang
  2019-09-09  7:16     ` Jason Wang
  1 sibling, 0 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-09  7:16 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, virtualization, netdev, linux-kernel, jgg, aarcange,
	jglisse, linux-mm


On 2019/9/6 9:46 PM, Michael S. Tsirkin wrote:
> On Thu, Sep 05, 2019 at 08:27:35PM +0800, Jason Wang wrote:
>> It was reported that metadata acceleration introduces several issues,
>> so this patch reverts commit ff466032dc9e5a61217f22ea34b2df932786bbfc,
>> 73f628ec9e6bcc45b77c53fe6d0c0ec55eaf82af and
>> 0b4a7092ffe568a55bf8f3cefdf79ff666586d91.
>>
>> We will rework it in the next version.
>>
>> Cc: Jason Gunthorpe <jgg@mellanox.com>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>
> I am confused by the above.
> What I see upstream is 7f466032dc.
>
> commit 7f466032dc9e5a61217f22ea34b2df932786bbfc
> Author: Jason Wang <jasowang@redhat.com>
> Date:   Fri May 24 04:12:18 2019 -0400
>
>      vhost: access vq metadata through kernel virtual address
>
> so this is what I reverted.
>
> Pls take a look, and let me know if you see issues.
>
> Thanks!


Yes, my fault.

Thanks
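
[ For reference, one way to sanity-check which commit a hash names before
  writing a revert message -- plain git usage; the failing second command
  simply assumes the ff46... hash was a typo:

	$ git log --oneline -1 7f466032dc9e
	7f466032dc9e vhost: access vq metadata through kernel virtual address

	$ git rev-parse --verify ff466032dc9e5a61217f22ea34b2df932786bbfc
	fatal: Needed a single revision
]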


>> ---
>>   drivers/vhost/vhost.c | 515 +-----------------------------------------
>>   drivers/vhost/vhost.h |  41 ----
>>   2 files changed, 3 insertions(+), 553 deletions(-)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 0536f8526359..791562e03fe0 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -298,160 +298,6 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
>>   		__vhost_vq_meta_reset(d->vqs[i]);
>>   }
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -static void vhost_map_unprefetch(struct vhost_map *map)
>> -{
>> -	kfree(map->pages);
>> -	map->pages = NULL;
>> -	map->npages = 0;
>> -	map->addr = NULL;
>> -}
>> -
>> -static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
>> -{
>> -	struct vhost_map *map[VHOST_NUM_ADDRS];
>> -	int i;
>> -
>> -	spin_lock(&vq->mmu_lock);
>> -	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
>> -		map[i] = rcu_dereference_protected(vq->maps[i],
>> -				  lockdep_is_held(&vq->mmu_lock));
>> -		if (map[i])
>> -			rcu_assign_pointer(vq->maps[i], NULL);
>> -	}
>> -	spin_unlock(&vq->mmu_lock);
>> -
>> -	synchronize_rcu();
>> -
>> -	for (i = 0; i < VHOST_NUM_ADDRS; i++)
>> -		if (map[i])
>> -			vhost_map_unprefetch(map[i]);
>> -
>> -}
>> -
>> -static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
>> -{
>> -	int i;
>> -
>> -	vhost_uninit_vq_maps(vq);
>> -	for (i = 0; i < VHOST_NUM_ADDRS; i++)
>> -		vq->uaddrs[i].size = 0;
>> -}
>> -
>> -static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
>> -				     unsigned long start,
>> -				     unsigned long end)
>> -{
>> -	if (unlikely(!uaddr->size))
>> -		return false;
>> -
>> -	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
>> -}
>> -
>> -static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>> -				      int index,
>> -				      unsigned long start,
>> -				      unsigned long end)
>> -{
>> -	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
>> -	struct vhost_map *map;
>> -	int i;
>> -
>> -	if (!vhost_map_range_overlap(uaddr, start, end))
>> -		return;
>> -
>> -	spin_lock(&vq->mmu_lock);
>> -	++vq->invalidate_count;
>> -
>> -	map = rcu_dereference_protected(vq->maps[index],
>> -					lockdep_is_held(&vq->mmu_lock));
>> -	if (map) {
>> -		if (uaddr->write) {
>> -			for (i = 0; i < map->npages; i++)
>> -				set_page_dirty(map->pages[i]);
>> -		}
>> -		rcu_assign_pointer(vq->maps[index], NULL);
>> -	}
>> -	spin_unlock(&vq->mmu_lock);
>> -
>> -	if (map) {
>> -		synchronize_rcu();
>> -		vhost_map_unprefetch(map);
>> -	}
>> -}
>> -
>> -static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
>> -				    int index,
>> -				    unsigned long start,
>> -				    unsigned long end)
>> -{
>> -	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
>> -		return;
>> -
>> -	spin_lock(&vq->mmu_lock);
>> -	--vq->invalidate_count;
>> -	spin_unlock(&vq->mmu_lock);
>> -}
>> -
>> -static int vhost_invalidate_range_start(struct mmu_notifier *mn,
>> -					const struct mmu_notifier_range *range)
>> -{
>> -	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
>> -					     mmu_notifier);
>> -	int i, j;
>> -
>> -	if (!mmu_notifier_range_blockable(range))
>> -		return -EAGAIN;
>> -
>> -	for (i = 0; i < dev->nvqs; i++) {
>> -		struct vhost_virtqueue *vq = dev->vqs[i];
>> -
>> -		for (j = 0; j < VHOST_NUM_ADDRS; j++)
>> -			vhost_invalidate_vq_start(vq, j,
>> -						  range->start,
>> -						  range->end);
>> -	}
>> -
>> -	return 0;
>> -}
>> -
>> -static void vhost_invalidate_range_end(struct mmu_notifier *mn,
>> -				       const struct mmu_notifier_range *range)
>> -{
>> -	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
>> -					     mmu_notifier);
>> -	int i, j;
>> -
>> -	for (i = 0; i < dev->nvqs; i++) {
>> -		struct vhost_virtqueue *vq = dev->vqs[i];
>> -
>> -		for (j = 0; j < VHOST_NUM_ADDRS; j++)
>> -			vhost_invalidate_vq_end(vq, j,
>> -						range->start,
>> -						range->end);
>> -	}
>> -}
>> -
>> -static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
>> -	.invalidate_range_start = vhost_invalidate_range_start,
>> -	.invalidate_range_end = vhost_invalidate_range_end,
>> -};
>> -
>> -static void vhost_init_maps(struct vhost_dev *dev)
>> -{
>> -	struct vhost_virtqueue *vq;
>> -	int i, j;
>> -
>> -	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
>> -
>> -	for (i = 0; i < dev->nvqs; ++i) {
>> -		vq = dev->vqs[i];
>> -		for (j = 0; j < VHOST_NUM_ADDRS; j++)
>> -			RCU_INIT_POINTER(vq->maps[j], NULL);
>> -	}
>> -}
>> -#endif
>> -
>>   static void vhost_vq_reset(struct vhost_dev *dev,
>>   			   struct vhost_virtqueue *vq)
>>   {
>> @@ -480,11 +326,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>>   	vq->busyloop_timeout = 0;
>>   	vq->umem = NULL;
>>   	vq->iotlb = NULL;
>> -	vq->invalidate_count = 0;
>>   	__vhost_vq_meta_reset(vq);
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	vhost_reset_vq_maps(vq);
>> -#endif
>>   }
>>   
>>   static int vhost_worker(void *data)
>> @@ -634,9 +476,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>>   	INIT_LIST_HEAD(&dev->read_list);
>>   	INIT_LIST_HEAD(&dev->pending_list);
>>   	spin_lock_init(&dev->iotlb_lock);
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	vhost_init_maps(dev);
>> -#endif
>> +
>>   
>>   	for (i = 0; i < dev->nvqs; ++i) {
>>   		vq = dev->vqs[i];
>> @@ -645,7 +485,6 @@ void vhost_dev_init(struct vhost_dev *dev,
>>   		vq->heads = NULL;
>>   		vq->dev = dev;
>>   		mutex_init(&vq->mutex);
>> -		spin_lock_init(&vq->mmu_lock);
>>   		vhost_vq_reset(dev, vq);
>>   		if (vq->handle_kick)
>>   			vhost_poll_init(&vq->poll, vq->handle_kick,
>> @@ -725,18 +564,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>>   	if (err)
>>   		goto err_cgroup;
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
>> -	if (err)
>> -		goto err_mmu_notifier;
>> -#endif
>> -
>>   	return 0;
>> -
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -err_mmu_notifier:
>> -	vhost_dev_free_iovecs(dev);
>> -#endif
>>   err_cgroup:
>>   	kthread_stop(worker);
>>   	dev->worker = NULL;
>> @@ -827,107 +655,6 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>>   	spin_unlock(&dev->iotlb_lock);
>>   }
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
>> -			      int index, unsigned long uaddr,
>> -			      size_t size, bool write)
>> -{
>> -	struct vhost_uaddr *addr = &vq->uaddrs[index];
>> -
>> -	addr->uaddr = uaddr;
>> -	addr->size = size;
>> -	addr->write = write;
>> -}
>> -
>> -static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
>> -{
>> -	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
>> -			  (unsigned long)vq->desc,
>> -			  vhost_get_desc_size(vq, vq->num),
>> -			  false);
>> -	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
>> -			  (unsigned long)vq->avail,
>> -			  vhost_get_avail_size(vq, vq->num),
>> -			  false);
>> -	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
>> -			  (unsigned long)vq->used,
>> -			  vhost_get_used_size(vq, vq->num),
>> -			  true);
>> -}
>> -
>> -static int vhost_map_prefetch(struct vhost_virtqueue *vq,
>> -			       int index)
>> -{
>> -	struct vhost_map *map;
>> -	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
>> -	struct page **pages;
>> -	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
>> -	int npinned;
>> -	void *vaddr, *v;
>> -	int err;
>> -	int i;
>> -
>> -	spin_lock(&vq->mmu_lock);
>> -
>> -	err = -EFAULT;
>> -	if (vq->invalidate_count)
>> -		goto err;
>> -
>> -	err = -ENOMEM;
>> -	map = kmalloc(sizeof(*map), GFP_ATOMIC);
>> -	if (!map)
>> -		goto err;
>> -
>> -	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
>> -	if (!pages)
>> -		goto err_pages;
>> -
>> -	err = EFAULT;
>> -	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
>> -					uaddr->write, pages);
>> -	if (npinned > 0)
>> -		release_pages(pages, npinned);
>> -	if (npinned != npages)
>> -		goto err_gup;
>> -
>> -	for (i = 0; i < npinned; i++)
>> -		if (PageHighMem(pages[i]))
>> -			goto err_gup;
>> -
>> -	vaddr = v = page_address(pages[0]);
>> -
>> -	/* For simplicity, fallback to userspace address if VA is not
>> -	 * contigious.
>> -	 */
>> -	for (i = 1; i < npinned; i++) {
>> -		v += PAGE_SIZE;
>> -		if (v != page_address(pages[i]))
>> -			goto err_gup;
>> -	}
>> -
>> -	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
>> -	map->npages = npages;
>> -	map->pages = pages;
>> -
>> -	rcu_assign_pointer(vq->maps[index], map);
>> -	/* No need for a synchronize_rcu(). This function should be
>> -	 * called by dev->worker so we are serialized with all
>> -	 * readers.
>> -	 */
>> -	spin_unlock(&vq->mmu_lock);
>> -
>> -	return 0;
>> -
>> -err_gup:
>> -	kfree(pages);
>> -err_pages:
>> -	kfree(map);
>> -err:
>> -	spin_unlock(&vq->mmu_lock);
>> -	return err;
>> -}
>> -#endif
>> -
>>   void vhost_dev_cleanup(struct vhost_dev *dev)
>>   {
>>   	int i;
>> @@ -957,16 +684,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>>   		kthread_stop(dev->worker);
>>   		dev->worker = NULL;
>>   	}
>> -	if (dev->mm) {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -		mmu_notifier_unregister(&dev->mmu_notifier, dev->mm);
>> -#endif
>> +	if (dev->mm)
>>   		mmput(dev->mm);
>> -	}
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	for (i = 0; i < dev->nvqs; i++)
>> -		vhost_uninit_vq_maps(dev->vqs[i]);
>> -#endif
>>   	dev->mm = NULL;
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
>> @@ -1195,26 +914,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
>>   
>>   static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_used *used;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>> -		if (likely(map)) {
>> -			used = map->addr;
>> -			*((__virtio16 *)&used->ring[vq->num]) =
>> -				cpu_to_vhost16(vq, vq->avail_idx);
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
>>   			      vhost_avail_event(vq));
>>   }
>> @@ -1223,27 +922,6 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>>   				 struct vring_used_elem *head, int idx,
>>   				 int count)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_used *used;
>> -	size_t size;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>> -		if (likely(map)) {
>> -			used = map->addr;
>> -			size = count * sizeof(*head);
>> -			memcpy(used->ring + idx, head, size);
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
>>   				  count * sizeof(*head));
>>   }
>> @@ -1251,25 +929,6 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>>   static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>>   
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_used *used;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>> -		if (likely(map)) {
>> -			used = map->addr;
>> -			used->flags = cpu_to_vhost16(vq, vq->used_flags);
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
>>   			      &vq->used->flags);
>>   }
>> @@ -1277,25 +936,6 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>>   static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>>   
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_used *used;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>> -		if (likely(map)) {
>> -			used = map->addr;
>> -			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>>   			      &vq->used->idx);
>>   }
>> @@ -1341,50 +981,12 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
>>   static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>>   				      __virtio16 *idx)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_avail *avail;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>> -		if (likely(map)) {
>> -			avail = map->addr;
>> -			*idx = avail->idx;
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_get_avail(vq, *idx, &vq->avail->idx);
>>   }
>>   
>>   static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>>   				       __virtio16 *head, int idx)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_avail *avail;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>> -		if (likely(map)) {
>> -			avail = map->addr;
>> -			*head = avail->ring[idx & (vq->num - 1)];
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_get_avail(vq, *head,
>>   			       &vq->avail->ring[idx & (vq->num - 1)]);
>>   }
>> @@ -1392,98 +994,24 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>>   static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>>   					__virtio16 *flags)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_avail *avail;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>> -		if (likely(map)) {
>> -			avail = map->addr;
>> -			*flags = avail->flags;
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_get_avail(vq, *flags, &vq->avail->flags);
>>   }
>>   
>>   static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>>   				       __virtio16 *event)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_avail *avail;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>> -		if (likely(map)) {
>> -			avail = map->addr;
>> -			*event = (__virtio16)avail->ring[vq->num];
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_get_avail(vq, *event, vhost_used_event(vq));
>>   }
>>   
>>   static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>>   				     __virtio16 *idx)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_used *used;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>> -		if (likely(map)) {
>> -			used = map->addr;
>> -			*idx = used->idx;
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_get_used(vq, *idx, &vq->used->idx);
>>   }
>>   
>>   static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>>   				 struct vring_desc *desc, int idx)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_desc *d;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
>> -		if (likely(map)) {
>> -			d = map->addr;
>> -			*desc = *(d + idx);
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
>>   }
>>   
>> @@ -1824,32 +1352,12 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
>>   	return true;
>>   }
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
>> -{
>> -	struct vhost_map __rcu *map;
>> -	int i;
>> -
>> -	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
>> -		rcu_read_lock();
>> -		map = rcu_dereference(vq->maps[i]);
>> -		rcu_read_unlock();
>> -		if (unlikely(!map))
>> -			vhost_map_prefetch(vq, i);
>> -	}
>> -}
>> -#endif
>> -
>>   int vq_meta_prefetch(struct vhost_virtqueue *vq)
>>   {
>>   	unsigned int num = vq->num;
>>   
>> -	if (!vq->iotlb) {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -		vhost_vq_map_prefetch(vq);
>> -#endif
>> +	if (!vq->iotlb)
>>   		return 1;
>> -	}
>>   
>>   	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
>>   			       vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
>> @@ -2060,16 +1568,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
>>   
>>   	mutex_lock(&vq->mutex);
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	/* Unregister MMU notifer to allow invalidation callback
>> -	 * can access vq->uaddrs[] without holding a lock.
>> -	 */
>> -	if (d->mm)
>> -		mmu_notifier_unregister(&d->mmu_notifier, d->mm);
>> -
>> -	vhost_uninit_vq_maps(vq);
>> -#endif
>> -
>>   	switch (ioctl) {
>>   	case VHOST_SET_VRING_NUM:
>>   		r = vhost_vring_set_num(d, vq, argp);
>> @@ -2081,13 +1579,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
>>   		BUG();
>>   	}
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	vhost_setup_vq_uaddr(vq);
>> -
>> -	if (d->mm)
>> -		mmu_notifier_register(&d->mmu_notifier, d->mm);
>> -#endif
>> -
>>   	mutex_unlock(&vq->mutex);
>>   
>>   	return r;
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index 42a8c2a13ab1..e9ed2722b633 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -12,9 +12,6 @@
>>   #include <linux/virtio_config.h>
>>   #include <linux/virtio_ring.h>
>>   #include <linux/atomic.h>
>> -#include <linux/pagemap.h>
>> -#include <linux/mmu_notifier.h>
>> -#include <asm/cacheflush.h>
>>   
>>   struct vhost_work;
>>   typedef void (*vhost_work_fn_t)(struct vhost_work *work);
>> @@ -83,24 +80,6 @@ enum vhost_uaddr_type {
>>   	VHOST_NUM_ADDRS = 3,
>>   };
>>   
>> -struct vhost_map {
>> -	int npages;
>> -	void *addr;
>> -	struct page **pages;
>> -};
>> -
>> -struct vhost_uaddr {
>> -	unsigned long uaddr;
>> -	size_t size;
>> -	bool write;
>> -};
>> -
>> -#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
>> -#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
>> -#else
>> -#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
>> -#endif
>> -
>>   /* The virtqueue structure describes a queue attached to a device. */
>>   struct vhost_virtqueue {
>>   	struct vhost_dev *dev;
>> @@ -111,22 +90,7 @@ struct vhost_virtqueue {
>>   	struct vring_desc __user *desc;
>>   	struct vring_avail __user *avail;
>>   	struct vring_used __user *used;
>> -
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	/* Read by memory accessors, modified by meta data
>> -	 * prefetching, MMU notifier and vring ioctl().
>> -	 * Synchonrized through mmu_lock (writers) and RCU (writers
>> -	 * and readers).
>> -	 */
>> -	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
>> -	/* Read by MMU notifier, modified by vring ioctl(),
>> -	 * synchronized through MMU notifier
>> -	 * registering/unregistering.
>> -	 */
>> -	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
>> -#endif
>>   	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
>> -
>>   	struct file *kick;
>>   	struct eventfd_ctx *call_ctx;
>>   	struct eventfd_ctx *error_ctx;
>> @@ -181,8 +145,6 @@ struct vhost_virtqueue {
>>   	bool user_be;
>>   #endif
>>   	u32 busyloop_timeout;
>> -	spinlock_t mmu_lock;
>> -	int invalidate_count;
>>   };
>>   
>>   struct vhost_msg_node {
>> @@ -196,9 +158,6 @@ struct vhost_msg_node {
>>   
>>   struct vhost_dev {
>>   	struct mm_struct *mm;
>> -#ifdef CONFIG_MMU_NOTIFIER
>> -	struct mmu_notifier mmu_notifier;
>> -#endif
>>   	struct mutex mutex;
>>   	struct vhost_virtqueue **vqs;
>>   	int nvqs;
>> -- 
>> 2.19.1

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/2] Revert "vhost: access vq metadata through kernel virtual address"
  2019-09-06 13:46   ` Michael S. Tsirkin
  2019-09-09  7:16     ` Jason Wang
@ 2019-09-09  7:16     ` Jason Wang
  1 sibling, 0 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-09  7:16 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: aarcange, kvm, netdev, linux-kernel, virtualization, linux-mm,
	jglisse, jgg


On 2019/9/6 9:46 PM, Michael S. Tsirkin wrote:
> On Thu, Sep 05, 2019 at 08:27:35PM +0800, Jason Wang wrote:
>> It was reported that metadata acceleration introduces several issues,
>> so this patch reverts commit ff466032dc9e5a61217f22ea34b2df932786bbfc,
>> 73f628ec9e6bcc45b77c53fe6d0c0ec55eaf82af and
>> 0b4a7092ffe568a55bf8f3cefdf79ff666586d91.
>>
>> We will rework it on the next version.
>>
>> Cc: Jason Gunthorpe <jgg@mellanox.com>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>
> I am confused by the above.
> What I see upstream is 7f466032dc.
>
> commit 7f466032dc9e5a61217f22ea34b2df932786bbfc
> Author: Jason Wang <jasowang@redhat.com>
> Date:   Fri May 24 04:12:18 2019 -0400
>
>      vhost: access vq metadata through kernel virtual address
>
> so this is what I reverted.
>
> Pls take a look, and let me know if you see issues.
>
> Thanks!


Yes, my fault.

Thanks


>> ---
>>   drivers/vhost/vhost.c | 515 +-----------------------------------------
>>   drivers/vhost/vhost.h |  41 ----
>>   2 files changed, 3 insertions(+), 553 deletions(-)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 0536f8526359..791562e03fe0 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -298,160 +298,6 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
>>   		__vhost_vq_meta_reset(d->vqs[i]);
>>   }
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -static void vhost_map_unprefetch(struct vhost_map *map)
>> -{
>> -	kfree(map->pages);
>> -	map->pages = NULL;
>> -	map->npages = 0;
>> -	map->addr = NULL;
>> -}
>> -
>> -static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
>> -{
>> -	struct vhost_map *map[VHOST_NUM_ADDRS];
>> -	int i;
>> -
>> -	spin_lock(&vq->mmu_lock);
>> -	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
>> -		map[i] = rcu_dereference_protected(vq->maps[i],
>> -				  lockdep_is_held(&vq->mmu_lock));
>> -		if (map[i])
>> -			rcu_assign_pointer(vq->maps[i], NULL);
>> -	}
>> -	spin_unlock(&vq->mmu_lock);
>> -
>> -	synchronize_rcu();
>> -
>> -	for (i = 0; i < VHOST_NUM_ADDRS; i++)
>> -		if (map[i])
>> -			vhost_map_unprefetch(map[i]);
>> -
>> -}
>> -
>> -static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
>> -{
>> -	int i;
>> -
>> -	vhost_uninit_vq_maps(vq);
>> -	for (i = 0; i < VHOST_NUM_ADDRS; i++)
>> -		vq->uaddrs[i].size = 0;
>> -}
>> -
>> -static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
>> -				     unsigned long start,
>> -				     unsigned long end)
>> -{
>> -	if (unlikely(!uaddr->size))
>> -		return false;
>> -
>> -	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
>> -}
>> -
>> -static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>> -				      int index,
>> -				      unsigned long start,
>> -				      unsigned long end)
>> -{
>> -	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
>> -	struct vhost_map *map;
>> -	int i;
>> -
>> -	if (!vhost_map_range_overlap(uaddr, start, end))
>> -		return;
>> -
>> -	spin_lock(&vq->mmu_lock);
>> -	++vq->invalidate_count;
>> -
>> -	map = rcu_dereference_protected(vq->maps[index],
>> -					lockdep_is_held(&vq->mmu_lock));
>> -	if (map) {
>> -		if (uaddr->write) {
>> -			for (i = 0; i < map->npages; i++)
>> -				set_page_dirty(map->pages[i]);
>> -		}
>> -		rcu_assign_pointer(vq->maps[index], NULL);
>> -	}
>> -	spin_unlock(&vq->mmu_lock);
>> -
>> -	if (map) {
>> -		synchronize_rcu();
>> -		vhost_map_unprefetch(map);
>> -	}
>> -}
>> -
>> -static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
>> -				    int index,
>> -				    unsigned long start,
>> -				    unsigned long end)
>> -{
>> -	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
>> -		return;
>> -
>> -	spin_lock(&vq->mmu_lock);
>> -	--vq->invalidate_count;
>> -	spin_unlock(&vq->mmu_lock);
>> -}
>> -
>> -static int vhost_invalidate_range_start(struct mmu_notifier *mn,
>> -					const struct mmu_notifier_range *range)
>> -{
>> -	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
>> -					     mmu_notifier);
>> -	int i, j;
>> -
>> -	if (!mmu_notifier_range_blockable(range))
>> -		return -EAGAIN;
>> -
>> -	for (i = 0; i < dev->nvqs; i++) {
>> -		struct vhost_virtqueue *vq = dev->vqs[i];
>> -
>> -		for (j = 0; j < VHOST_NUM_ADDRS; j++)
>> -			vhost_invalidate_vq_start(vq, j,
>> -						  range->start,
>> -						  range->end);
>> -	}
>> -
>> -	return 0;
>> -}
>> -
>> -static void vhost_invalidate_range_end(struct mmu_notifier *mn,
>> -				       const struct mmu_notifier_range *range)
>> -{
>> -	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
>> -					     mmu_notifier);
>> -	int i, j;
>> -
>> -	for (i = 0; i < dev->nvqs; i++) {
>> -		struct vhost_virtqueue *vq = dev->vqs[i];
>> -
>> -		for (j = 0; j < VHOST_NUM_ADDRS; j++)
>> -			vhost_invalidate_vq_end(vq, j,
>> -						range->start,
>> -						range->end);
>> -	}
>> -}
>> -
>> -static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
>> -	.invalidate_range_start = vhost_invalidate_range_start,
>> -	.invalidate_range_end = vhost_invalidate_range_end,
>> -};
>> -
>> -static void vhost_init_maps(struct vhost_dev *dev)
>> -{
>> -	struct vhost_virtqueue *vq;
>> -	int i, j;
>> -
>> -	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
>> -
>> -	for (i = 0; i < dev->nvqs; ++i) {
>> -		vq = dev->vqs[i];
>> -		for (j = 0; j < VHOST_NUM_ADDRS; j++)
>> -			RCU_INIT_POINTER(vq->maps[j], NULL);
>> -	}
>> -}
>> -#endif
>> -
>>   static void vhost_vq_reset(struct vhost_dev *dev,
>>   			   struct vhost_virtqueue *vq)
>>   {
>> @@ -480,11 +326,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>>   	vq->busyloop_timeout = 0;
>>   	vq->umem = NULL;
>>   	vq->iotlb = NULL;
>> -	vq->invalidate_count = 0;
>>   	__vhost_vq_meta_reset(vq);
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	vhost_reset_vq_maps(vq);
>> -#endif
>>   }
>>   
>>   static int vhost_worker(void *data)
>> @@ -634,9 +476,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>>   	INIT_LIST_HEAD(&dev->read_list);
>>   	INIT_LIST_HEAD(&dev->pending_list);
>>   	spin_lock_init(&dev->iotlb_lock);
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	vhost_init_maps(dev);
>> -#endif
>> +
>>   
>>   	for (i = 0; i < dev->nvqs; ++i) {
>>   		vq = dev->vqs[i];
>> @@ -645,7 +485,6 @@ void vhost_dev_init(struct vhost_dev *dev,
>>   		vq->heads = NULL;
>>   		vq->dev = dev;
>>   		mutex_init(&vq->mutex);
>> -		spin_lock_init(&vq->mmu_lock);
>>   		vhost_vq_reset(dev, vq);
>>   		if (vq->handle_kick)
>>   			vhost_poll_init(&vq->poll, vq->handle_kick,
>> @@ -725,18 +564,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>>   	if (err)
>>   		goto err_cgroup;
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
>> -	if (err)
>> -		goto err_mmu_notifier;
>> -#endif
>> -
>>   	return 0;
>> -
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -err_mmu_notifier:
>> -	vhost_dev_free_iovecs(dev);
>> -#endif
>>   err_cgroup:
>>   	kthread_stop(worker);
>>   	dev->worker = NULL;
>> @@ -827,107 +655,6 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>>   	spin_unlock(&dev->iotlb_lock);
>>   }
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
>> -			      int index, unsigned long uaddr,
>> -			      size_t size, bool write)
>> -{
>> -	struct vhost_uaddr *addr = &vq->uaddrs[index];
>> -
>> -	addr->uaddr = uaddr;
>> -	addr->size = size;
>> -	addr->write = write;
>> -}
>> -
>> -static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
>> -{
>> -	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
>> -			  (unsigned long)vq->desc,
>> -			  vhost_get_desc_size(vq, vq->num),
>> -			  false);
>> -	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
>> -			  (unsigned long)vq->avail,
>> -			  vhost_get_avail_size(vq, vq->num),
>> -			  false);
>> -	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
>> -			  (unsigned long)vq->used,
>> -			  vhost_get_used_size(vq, vq->num),
>> -			  true);
>> -}
>> -
>> -static int vhost_map_prefetch(struct vhost_virtqueue *vq,
>> -			       int index)
>> -{
>> -	struct vhost_map *map;
>> -	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
>> -	struct page **pages;
>> -	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
>> -	int npinned;
>> -	void *vaddr, *v;
>> -	int err;
>> -	int i;
>> -
>> -	spin_lock(&vq->mmu_lock);
>> -
>> -	err = -EFAULT;
>> -	if (vq->invalidate_count)
>> -		goto err;
>> -
>> -	err = -ENOMEM;
>> -	map = kmalloc(sizeof(*map), GFP_ATOMIC);
>> -	if (!map)
>> -		goto err;
>> -
>> -	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
>> -	if (!pages)
>> -		goto err_pages;
>> -
>> -	err = EFAULT;
>> -	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
>> -					uaddr->write, pages);
>> -	if (npinned > 0)
>> -		release_pages(pages, npinned);
>> -	if (npinned != npages)
>> -		goto err_gup;
>> -
>> -	for (i = 0; i < npinned; i++)
>> -		if (PageHighMem(pages[i]))
>> -			goto err_gup;
>> -
>> -	vaddr = v = page_address(pages[0]);
>> -
>> -	/* For simplicity, fallback to userspace address if VA is not
>> -	 * contigious.
>> -	 */
>> -	for (i = 1; i < npinned; i++) {
>> -		v += PAGE_SIZE;
>> -		if (v != page_address(pages[i]))
>> -			goto err_gup;
>> -	}
>> -
>> -	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
>> -	map->npages = npages;
>> -	map->pages = pages;
>> -
>> -	rcu_assign_pointer(vq->maps[index], map);
>> -	/* No need for a synchronize_rcu(). This function should be
>> -	 * called by dev->worker so we are serialized with all
>> -	 * readers.
>> -	 */
>> -	spin_unlock(&vq->mmu_lock);
>> -
>> -	return 0;
>> -
>> -err_gup:
>> -	kfree(pages);
>> -err_pages:
>> -	kfree(map);
>> -err:
>> -	spin_unlock(&vq->mmu_lock);
>> -	return err;
>> -}
>> -#endif
>> -
>>   void vhost_dev_cleanup(struct vhost_dev *dev)
>>   {
>>   	int i;
>> @@ -957,16 +684,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>>   		kthread_stop(dev->worker);
>>   		dev->worker = NULL;
>>   	}
>> -	if (dev->mm) {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -		mmu_notifier_unregister(&dev->mmu_notifier, dev->mm);
>> -#endif
>> +	if (dev->mm)
>>   		mmput(dev->mm);
>> -	}
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	for (i = 0; i < dev->nvqs; i++)
>> -		vhost_uninit_vq_maps(dev->vqs[i]);
>> -#endif
>>   	dev->mm = NULL;
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
>> @@ -1195,26 +914,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
>>   
>>   static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_used *used;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>> -		if (likely(map)) {
>> -			used = map->addr;
>> -			*((__virtio16 *)&used->ring[vq->num]) =
>> -				cpu_to_vhost16(vq, vq->avail_idx);
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
>>   			      vhost_avail_event(vq));
>>   }
>> @@ -1223,27 +922,6 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>>   				 struct vring_used_elem *head, int idx,
>>   				 int count)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_used *used;
>> -	size_t size;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>> -		if (likely(map)) {
>> -			used = map->addr;
>> -			size = count * sizeof(*head);
>> -			memcpy(used->ring + idx, head, size);
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
>>   				  count * sizeof(*head));
>>   }
>> @@ -1251,25 +929,6 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>>   static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>>   
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_used *used;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>> -		if (likely(map)) {
>> -			used = map->addr;
>> -			used->flags = cpu_to_vhost16(vq, vq->used_flags);
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
>>   			      &vq->used->flags);
>>   }
>> @@ -1277,25 +936,6 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>>   static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>>   
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_used *used;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>> -		if (likely(map)) {
>> -			used = map->addr;
>> -			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>>   			      &vq->used->idx);
>>   }
>> @@ -1341,50 +981,12 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
>>   static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>>   				      __virtio16 *idx)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_avail *avail;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>> -		if (likely(map)) {
>> -			avail = map->addr;
>> -			*idx = avail->idx;
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_get_avail(vq, *idx, &vq->avail->idx);
>>   }
>>   
>>   static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>>   				       __virtio16 *head, int idx)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_avail *avail;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>> -		if (likely(map)) {
>> -			avail = map->addr;
>> -			*head = avail->ring[idx & (vq->num - 1)];
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_get_avail(vq, *head,
>>   			       &vq->avail->ring[idx & (vq->num - 1)]);
>>   }
>> @@ -1392,98 +994,24 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>>   static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>>   					__virtio16 *flags)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_avail *avail;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>> -		if (likely(map)) {
>> -			avail = map->addr;
>> -			*flags = avail->flags;
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_get_avail(vq, *flags, &vq->avail->flags);
>>   }
>>   
>>   static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>>   				       __virtio16 *event)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_avail *avail;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>> -		if (likely(map)) {
>> -			avail = map->addr;
>> -			*event = (__virtio16)avail->ring[vq->num];
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_get_avail(vq, *event, vhost_used_event(vq));
>>   }
>>   
>>   static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>>   				     __virtio16 *idx)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_used *used;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>> -		if (likely(map)) {
>> -			used = map->addr;
>> -			*idx = used->idx;
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_get_used(vq, *idx, &vq->used->idx);
>>   }
>>   
>>   static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>>   				 struct vring_desc *desc, int idx)
>>   {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	struct vhost_map *map;
>> -	struct vring_desc *d;
>> -
>> -	if (!vq->iotlb) {
>> -		rcu_read_lock();
>> -
>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
>> -		if (likely(map)) {
>> -			d = map->addr;
>> -			*desc = *(d + idx);
>> -			rcu_read_unlock();
>> -			return 0;
>> -		}
>> -
>> -		rcu_read_unlock();
>> -	}
>> -#endif
>> -
>>   	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
>>   }
>>   
>> @@ -1824,32 +1352,12 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
>>   	return true;
>>   }
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
>> -{
>> -	struct vhost_map __rcu *map;
>> -	int i;
>> -
>> -	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
>> -		rcu_read_lock();
>> -		map = rcu_dereference(vq->maps[i]);
>> -		rcu_read_unlock();
>> -		if (unlikely(!map))
>> -			vhost_map_prefetch(vq, i);
>> -	}
>> -}
>> -#endif
>> -
>>   int vq_meta_prefetch(struct vhost_virtqueue *vq)
>>   {
>>   	unsigned int num = vq->num;
>>   
>> -	if (!vq->iotlb) {
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -		vhost_vq_map_prefetch(vq);
>> -#endif
>> +	if (!vq->iotlb)
>>   		return 1;
>> -	}
>>   
>>   	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
>>   			       vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
>> @@ -2060,16 +1568,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
>>   
>>   	mutex_lock(&vq->mutex);
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	/* Unregister MMU notifer to allow invalidation callback
>> -	 * can access vq->uaddrs[] without holding a lock.
>> -	 */
>> -	if (d->mm)
>> -		mmu_notifier_unregister(&d->mmu_notifier, d->mm);
>> -
>> -	vhost_uninit_vq_maps(vq);
>> -#endif
>> -
>>   	switch (ioctl) {
>>   	case VHOST_SET_VRING_NUM:
>>   		r = vhost_vring_set_num(d, vq, argp);
>> @@ -2081,13 +1579,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
>>   		BUG();
>>   	}
>>   
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	vhost_setup_vq_uaddr(vq);
>> -
>> -	if (d->mm)
>> -		mmu_notifier_register(&d->mmu_notifier, d->mm);
>> -#endif
>> -
>>   	mutex_unlock(&vq->mutex);
>>   
>>   	return r;
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index 42a8c2a13ab1..e9ed2722b633 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -12,9 +12,6 @@
>>   #include <linux/virtio_config.h>
>>   #include <linux/virtio_ring.h>
>>   #include <linux/atomic.h>
>> -#include <linux/pagemap.h>
>> -#include <linux/mmu_notifier.h>
>> -#include <asm/cacheflush.h>
>>   
>>   struct vhost_work;
>>   typedef void (*vhost_work_fn_t)(struct vhost_work *work);
>> @@ -83,24 +80,6 @@ enum vhost_uaddr_type {
>>   	VHOST_NUM_ADDRS = 3,
>>   };
>>   
>> -struct vhost_map {
>> -	int npages;
>> -	void *addr;
>> -	struct page **pages;
>> -};
>> -
>> -struct vhost_uaddr {
>> -	unsigned long uaddr;
>> -	size_t size;
>> -	bool write;
>> -};
>> -
>> -#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
>> -#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
>> -#else
>> -#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
>> -#endif
>> -
>>   /* The virtqueue structure describes a queue attached to a device. */
>>   struct vhost_virtqueue {
>>   	struct vhost_dev *dev;
>> @@ -111,22 +90,7 @@ struct vhost_virtqueue {
>>   	struct vring_desc __user *desc;
>>   	struct vring_avail __user *avail;
>>   	struct vring_used __user *used;
>> -
>> -#if VHOST_ARCH_CAN_ACCEL_UACCESS
>> -	/* Read by memory accessors, modified by meta data
>> -	 * prefetching, MMU notifier and vring ioctl().
>> -	 * Synchonrized through mmu_lock (writers) and RCU (writers
>> -	 * and readers).
>> -	 */
>> -	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
>> -	/* Read by MMU notifier, modified by vring ioctl(),
>> -	 * synchronized through MMU notifier
>> -	 * registering/unregistering.
>> -	 */
>> -	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
>> -#endif
>>   	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
>> -
>>   	struct file *kick;
>>   	struct eventfd_ctx *call_ctx;
>>   	struct eventfd_ctx *error_ctx;
>> @@ -181,8 +145,6 @@ struct vhost_virtqueue {
>>   	bool user_be;
>>   #endif
>>   	u32 busyloop_timeout;
>> -	spinlock_t mmu_lock;
>> -	int invalidate_count;
>>   };
>>   
>>   struct vhost_msg_node {
>> @@ -196,9 +158,6 @@ struct vhost_msg_node {
>>   
>>   struct vhost_dev {
>>   	struct mm_struct *mm;
>> -#ifdef CONFIG_MMU_NOTIFIER
>> -	struct mmu_notifier mmu_notifier;
>> -#endif
>>   	struct mutex mutex;
>>   	struct vhost_virtqueue **vqs;
>>   	int nvqs;
>> -- 
>> 2.19.1

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 0/2] Revert and rework on the metadata acceleration
  2019-09-06 13:15     ` David Miller
  2019-09-09  7:18       ` Jason Wang
@ 2019-09-09  7:18       ` Jason Wang
  2019-09-09 12:15         ` Michael S. Tsirkin
  2019-09-09 12:15         ` Michael S. Tsirkin
  1 sibling, 2 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-09  7:18 UTC (permalink / raw)
  To: David Miller
  Cc: jgg, mst, kvm, virtualization, netdev, linux-kernel, aarcange,
	jglisse, linux-mm


On 2019/9/6 9:15 PM, David Miller wrote:
> From: Jason Wang <jasowang@redhat.com>
> Date: Fri, 6 Sep 2019 18:02:35 +0800
>
>> On 2019/9/5 9:59 PM, Jason Gunthorpe wrote:
>>> I think you should apply the revert this cycle and rebase the other
>>> patch for next..
>>>
>>> Jason
>> Yes, the plan is to revert in this release cycle.
> Then you should resend patch #1 all by itself, targeting 'net'.


Thanks for the reminder. I want the patch to go through Michael's vhost
tree, which is why I didn't use the 'net' prefix. Next time, maybe I can
use "vhost" as a prefix for classification?


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
  2019-09-09  4:45         ` Michael S. Tsirkin
@ 2019-09-09  7:23           ` Jason Wang
  -1 siblings, 0 replies; 50+ messages in thread
From: Jason Wang @ 2019-09-09  7:23 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, virtualization, netdev, linux-kernel, jgg, aarcange,
	jglisse, linux-mm, James Bottomley, Christoph Hellwig,
	David Miller, linux-arm-kernel, linux-parisc


On 2019/9/9 12:45 PM, Michael S. Tsirkin wrote:
>>> Since idx can be speculated, I guess we need array_index_nospec here?
>> So we have
>>
>> ACQUIRE(mmu_lock)
>>
>> get idx
>>
>> RELEASE(mmu_lock)
>>
>> ACQUIRE(mmu_lock)
>>
>> read array[idx]
>>
>> RELEASE(mmu_lock)
>>
>> Then I think idx can't be speculated, considering we've passed RELEASE +
>> ACQUIRE?
> I don't think memory barriers have anything to do with speculation,
> they are architectural.
>

Oh right. Let me add array_index_nospec() in next version.

Thanks
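
For reference, a minimal sketch, assuming the pre-revert data structures
quoted earlier in this thread (struct vhost_map, vq->maps[] and
VHOST_NUM_ADDRS), of how array_index_nospec() from <linux/nospec.h> could
be used to clamp the index before the array access; the helper name
vhost_vq_get_map() is hypothetical and not part of the posted patches:

#include <linux/nospec.h>

/* Clamp idx before it is used to index vq->maps[]: acquire/release
 * ordering on mmu_lock does not stop the CPU from speculatively loading
 * vq->maps[idx] with an out-of-bounds idx after a mispredicted bounds
 * check.
 */
static struct vhost_map *vhost_vq_get_map(struct vhost_virtqueue *vq,
					  int idx)
{
	if (idx < 0 || idx >= VHOST_NUM_ADDRS)
		return NULL;

	idx = array_index_nospec(idx, VHOST_NUM_ADDRS);

	/* The caller is assumed to hold rcu_read_lock(), as in the
	 * accessors quoted above.
	 */
	return rcu_dereference(vq->maps[idx]);
}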


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 0/2] Revert and rework on the metadata acceleration
  2019-09-09  7:18       ` Jason Wang
  2019-09-09 12:15         ` Michael S. Tsirkin
@ 2019-09-09 12:15         ` Michael S. Tsirkin
  1 sibling, 0 replies; 50+ messages in thread
From: Michael S. Tsirkin @ 2019-09-09 12:15 UTC (permalink / raw)
  To: Jason Wang
  Cc: David Miller, jgg, kvm, virtualization, netdev, linux-kernel,
	aarcange, jglisse, linux-mm

On Mon, Sep 09, 2019 at 03:18:01PM +0800, Jason Wang wrote:
> 
> On 2019/9/6 9:15 PM, David Miller wrote:
> > From: Jason Wang <jasowang@redhat.com>
> > Date: Fri, 6 Sep 2019 18:02:35 +0800
> > 
> > > On 2019/9/5 9:59 PM, Jason Gunthorpe wrote:
> > > > I think you should apply the revert this cycle and rebase the other
> > > > patch for next..
> > > > 
> > > > Jason
> > > Yes, the plan is to revert in this release cycle.
> > Then you should resend patch #1 all by itself, targeting 'net'.
> 
> 
> Thanks for the reminder. I want the patch to go through Michael's vhost
> tree, which is why I didn't use the 'net' prefix. Next time, maybe I can
> use "vhost" as a prefix for classification?

That's fine by me.

-- 
MST

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2019-09-09 12:16 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-05 12:27 [PATCH 0/2] Revert and rework on the metadata acceleration Jason Wang
2019-09-05 12:27 ` [PATCH 1/2] Revert "vhost: access vq metadata through kernel virtual address" Jason Wang
2019-09-05 12:27 ` Jason Wang
2019-09-06 13:46   ` Michael S. Tsirkin
2019-09-09  7:16     ` Jason Wang
2019-09-09  7:16     ` Jason Wang
2019-09-06 13:46   ` Michael S. Tsirkin
2019-09-05 12:27 ` [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address Jason Wang
2019-09-05 12:27   ` Jason Wang
2019-09-08 11:05   ` Michael S. Tsirkin
2019-09-08 11:05     ` Michael S. Tsirkin
2019-09-09  2:18     ` Jason Wang
2019-09-09  2:18     ` Jason Wang
2019-09-09  2:18       ` Jason Wang
2019-09-09  2:30       ` Jason Wang
2019-09-09  2:30         ` Jason Wang
2019-09-09  2:30       ` Jason Wang
2019-09-09  4:45       ` Michael S. Tsirkin
2019-09-09  4:45         ` Michael S. Tsirkin
2019-09-09  4:45         ` Michael S. Tsirkin
2019-09-09  7:23         ` Jason Wang
2019-09-09  7:23         ` Jason Wang
2019-09-09  7:23           ` Jason Wang
2019-09-08 11:05   ` Michael S. Tsirkin
2019-09-05 12:27 ` Jason Wang
2019-09-05 13:59 ` [PATCH 0/2] Revert and rework on the metadata acceleration Jason Gunthorpe
2019-09-05 13:59 ` Jason Gunthorpe
2019-09-05 13:59   ` Jason Gunthorpe
2019-09-06 10:02   ` Jason Wang
2019-09-06 10:02   ` Jason Wang
2019-09-06 10:02     ` Jason Wang
2019-09-06 13:15     ` David Miller
2019-09-09  7:18       ` Jason Wang
2019-09-09  7:18       ` Jason Wang
2019-09-09 12:15         ` Michael S. Tsirkin
2019-09-09 12:15         ` Michael S. Tsirkin
2019-09-06 13:15     ` David Miller
2019-09-07 15:03     ` Jason Gunthorpe
2019-09-07 15:03       ` Jason Gunthorpe
2019-09-07 15:03       ` Jason Gunthorpe
2019-09-09  2:29       ` Jason Wang
2019-09-09  2:29         ` Jason Wang
2019-09-09  2:29       ` Jason Wang
2019-09-06  3:21 ` [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address Hillf Danton
2019-09-06  3:21   ` Hillf Danton
2019-09-06 12:51   ` Jason Wang
2019-09-06 12:51     ` Jason Wang
2019-09-06 12:51   ` Jason Wang
2019-09-06  3:21 ` Hillf Danton
