* [PATCH net-next RFC 0/5] batched tx processing in vhost_net
@ 2017-09-22  8:02 Jason Wang
  2017-09-22  8:02 ` [PATCH net-next RFC 1/5] vhost: split out ring head fetching logic Jason Wang
                   ` (11 more replies)
  0 siblings, 12 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-22  8:02 UTC (permalink / raw)
  To: mst, jasowang, virtualization, netdev, linux-kernel; +Cc: kvm

Hi:

This series tries to implement basic batched tx processing. This is
done by prefetching descriptor indices and updating the used ring in a
batch, which is intended to speed up used ring updates and improve
cache utilization. Tests show about a 22% improvement in tx pps.
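
At a high level, the tx path after this series looks roughly like the
sketch below (illustrative only; error handling, busy polling and the
zerocopy path are omitted, and the helpers are introduced in patches
1-3 and wired up in patch 5):

	for (;;) {
		/* Prefetch up to 'batched' avail ring entries into heads[]
		 * and pre-write the corresponding used ring entries. */
		avails = vhost_prefetch_desc_indices(vq, heads, batched, true);
		if (avails <= 0)
			break;

		for (i = 0; i < avails; i++) {
			/* Translate one prefetched head into an iovec. */
			head = __vhost_get_vq_desc(vq, vq->iov,
						   ARRAY_SIZE(vq->iov),
						   &out, &in, NULL, NULL,
						   vhost16_to_cpu(vq, heads[i].id));

			/* ... build msg from vq->iov and sendmsg() it ... */

			/* Used entries are already in place, so completing
			 * a buffer only needs to bump used->idx. */
			vhost_add_used_idx(vq, 1);
			vhost_signal(&net->dev, vq);
		}
	}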

Please review.

Jason Wang (5):
  vhost: split out ring head fetching logic
  vhost: introduce helper to prefetch desc index
  vhost: introduce vhost_add_used_idx()
  vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
  vhost_net: basic tx virtqueue batched processing

 drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
 drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
 drivers/vhost/vhost.h |   9 ++
 3 files changed, 270 insertions(+), 125 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* [PATCH net-next RFC 1/5] vhost: split out ring head fetching logic
  2017-09-22  8:02 [PATCH net-next RFC 0/5] batched tx processing in vhost_net Jason Wang
  2017-09-22  8:02 ` [PATCH net-next RFC 1/5] vhost: split out ring head fetching logic Jason Wang
@ 2017-09-22  8:02 ` Jason Wang
  2017-09-22  8:31   ` Stefan Hajnoczi
  2017-09-22  8:31   ` Stefan Hajnoczi
  2017-09-22  8:02 ` [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index Jason Wang
                   ` (9 subsequent siblings)
  11 siblings, 2 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-22  8:02 UTC (permalink / raw)
  To: mst, jasowang, virtualization, netdev, linux-kernel; +Cc: kvm

This patch splits out the ring head fetching logic, leaving the
descriptor fetching and translation logic behind. This makes it
possible to batch the fetching of descriptor indices.
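
After this change, descriptor lookup becomes a two-step process; here
is a minimal sketch of the resulting flow (condensed from the diff
below):

	int err = 0;

	head = vhost_get_vq_head(vq, &err);	/* step 1: fetch index from avail ring */
	if (err)
		return err;			/* -EFAULT on guest memory access failure */
	if (head == vq->num)
		return head;			/* nothing new */
	return __vhost_get_vq_desc(vq, iov, iov_size, out_num, in_num,
				   log, log_num, head); /* step 2: translate chain */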

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 75 +++++++++++++++++++++++++++++++++------------------
 drivers/vhost/vhost.h |  5 ++++
 2 files changed, 54 insertions(+), 26 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index d6dbb28..f87ec75 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1984,41 +1984,23 @@ static int get_indirect(struct vhost_virtqueue *vq,
 	return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found.  A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
-		      struct iovec iov[], unsigned int iov_size,
-		      unsigned int *out_num, unsigned int *in_num,
-		      struct vhost_log *log, unsigned int *log_num)
+static unsigned int vhost_get_vq_head(struct vhost_virtqueue *vq, int *err)
 {
-	struct vring_desc desc;
-	unsigned int i, head, found = 0;
-	u16 last_avail_idx;
-	__virtio16 avail_idx;
-	__virtio16 ring_head;
-	int ret, access;
-
-	/* Check it isn't doing very strange things with descriptor numbers. */
-	last_avail_idx = vq->last_avail_idx;
+	u16 last_avail_idx = vq->last_avail_idx;
+	__virtio16 avail_idx, ring_head;
 
 	if (vq->avail_idx == vq->last_avail_idx) {
 		if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
 			vq_err(vq, "Failed to access avail idx at %p\n",
 				&vq->avail->idx);
-			return -EFAULT;
+			goto err;
 		}
 		vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
 
 		if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
 			vq_err(vq, "Guest moved used index from %u to %u",
 				last_avail_idx, vq->avail_idx);
-			return -EFAULT;
+			goto err;
 		}
 
 		/* If there's nothing new since last we looked, return
@@ -2040,13 +2022,35 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		vq_err(vq, "Failed to read head: idx %d address %p\n",
 		       last_avail_idx,
 		       &vq->avail->ring[last_avail_idx % vq->num]);
-		return -EFAULT;
+		goto err;
 	}
 
-	head = vhost16_to_cpu(vq, ring_head);
+	return vhost16_to_cpu(vq, ring_head);
+err:
+	*err = -EFAULT;
+	return 0;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found.  A negative code is
+ * returned on error. */
+int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
+			struct iovec iov[], unsigned int iov_size,
+			unsigned int *out_num, unsigned int *in_num,
+			struct vhost_log *log, unsigned int *log_num,
+			__virtio16 head)
+{
+	struct vring_desc desc;
+	unsigned int i, found = 0;
+	int ret = 0, access;
 
 	/* If their number is silly, that's an error. */
-	if (unlikely(head >= vq->num)) {
+	if (unlikely(head > vq->num)) {
 		vq_err(vq, "Guest says index %u > %u is available",
 		       head, vq->num);
 		return -EINVAL;
@@ -2133,6 +2137,25 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
 	return head;
 }
+EXPORT_SYMBOL_GPL(__vhost_get_vq_desc);
+
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+		      struct iovec iov[], unsigned int iov_size,
+		      unsigned int *out_num, unsigned int *in_num,
+		      struct vhost_log *log, unsigned int *log_num)
+{
+	int ret = 0;
+	unsigned int head = vhost_get_vq_head(vq, &ret);
+
+	if (ret)
+		return ret;
+
+	if (head == vq->num)
+		return head;
+
+	return __vhost_get_vq_desc(vq, iov, iov_size, out_num, in_num,
+				   log, log_num, head);
+}
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d59a9cc..39ff897 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -191,6 +191,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
+int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
+			struct iovec iov[], unsigned int iov_count,
+			unsigned int *out_num, unsigned int *in_num,
+			struct vhost_log *log, unsigned int *log_num,
+			__virtio16 ring_head);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 69+ messages in thread

* [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-22  8:02 [PATCH net-next RFC 0/5] batched tx processing in vhost_net Jason Wang
  2017-09-22  8:02 ` [PATCH net-next RFC 1/5] vhost: split out ring head fetching logic Jason Wang
  2017-09-22  8:02 ` Jason Wang
@ 2017-09-22  8:02 ` Jason Wang
  2017-09-22  9:02     ` Stefan Hajnoczi
                     ` (4 more replies)
  2017-09-22  8:02 ` Jason Wang
                   ` (8 subsequent siblings)
  11 siblings, 5 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-22  8:02 UTC (permalink / raw)
  To: mst, jasowang, virtualization, netdev, linux-kernel; +Cc: kvm

This patch introduces vhost_prefetch_desc_indices(), which can batch
descriptor index fetching and used ring updating. This is intended to
reduce the cache misses caused by index fetching and updating, and to
reduce cache line bouncing when the virtqueue is almost full.
copy_to_user() is used in order to benefit from modern CPUs that
support fast string copy. Batched virtqueue processing will be the
first user.
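
For illustration, a caller is expected to use it roughly as follows
(a sketch only; handle_packet() is just a placeholder, VHOST_NET_BATCH
is introduced in patch 4, and the batched tx path in patch 5 is the
real user):

	int i, n;

	/* Read up to VHOST_NET_BATCH indices from the avail ring into
	 * vq->heads; with used_update == true the used ring entries are
	 * pre-written too, so later only used->idx has to be bumped. */
	n = vhost_prefetch_desc_indices(vq, vq->heads, VHOST_NET_BATCH, true);
	if (n < 0)
		return n;	/* guest memory access failed */
	for (i = 0; i < n; i++)
		handle_packet(vq, vhost16_to_cpu(vq, vq->heads[i].id));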

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |  3 +++
 2 files changed, 58 insertions(+)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f87ec75..8424166d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
 }
 EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
 
+int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
+				struct vring_used_elem *heads,
+				u16 num, bool used_update)
+{
+	int ret, ret2;
+	u16 last_avail_idx, last_used_idx, total, copied;
+	__virtio16 avail_idx;
+	struct vring_used_elem __user *used;
+	int i;
+
+	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
+		vq_err(vq, "Failed to access avail idx at %p\n",
+		       &vq->avail->idx);
+		return -EFAULT;
+	}
+	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
+	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+	total = vq->avail_idx - vq->last_avail_idx;
+	ret = total = min(total, num);
+
+	for (i = 0; i < ret; i++) {
+		ret2 = vhost_get_avail(vq, heads[i].id,
+				      &vq->avail->ring[last_avail_idx]);
+		if (unlikely(ret2)) {
+			vq_err(vq, "Failed to get descriptors\n");
+			return -EFAULT;
+		}
+		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
+	}
+
+	if (!used_update)
+		return ret;
+
+	last_used_idx = vq->last_used_idx & (vq->num - 1);
+	while (total) {
+		copied = min((u16)(vq->num - last_used_idx), total);
+		ret2 = vhost_copy_to_user(vq,
+					  &vq->used->ring[last_used_idx],
+					  &heads[ret - total],
+					  copied * sizeof(*used));
+
+		if (unlikely(ret2)) {
+			vq_err(vq, "Failed to update used ring!\n");
+			return -EFAULT;
+		}
+
+		last_used_idx = 0;
+		total -= copied;
+	}
+
+	/* Only get avail ring entries after they have been exposed by guest. */
+	smp_rmb();
+	return ret;
+}
+EXPORT_SYMBOL(vhost_prefetch_desc_indices);
 
 static int __init vhost_init(void)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 39ff897..16c2cb6 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
 ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
 			     struct iov_iter *from);
 int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
+int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
+				struct vring_used_elem *heads,
+				u16 num, bool used_update);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 69+ messages in thread

* [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
  2017-09-22  8:02 [PATCH net-next RFC 0/5] batched tx processing in vhost_net Jason Wang
                   ` (3 preceding siblings ...)
  2017-09-22  8:02 ` Jason Wang
@ 2017-09-22  8:02 ` Jason Wang
  2017-09-22  9:07   ` Stefan Hajnoczi
                     ` (3 more replies)
  2017-09-22  8:02 ` [PATCH net-next RFC 4/5] vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH Jason Wang
                   ` (6 subsequent siblings)
  11 siblings, 4 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-22  8:02 UTC (permalink / raw)
  To: mst, jasowang, virtualization, netdev, linux-kernel; +Cc: kvm

This patch introduces a helper which just increases the used idx. It
will be used in pair with vhost_prefetch_desc_indices() by the
batching code.
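
Unlike vhost_add_used(), which also writes the used ring entry, this
helper only advances used->idx; the batching code is expected to pair
it with vhost_prefetch_desc_indices(), which has already written the
entries. An illustrative sketch:

	/* entries were already written by
	 * vhost_prefetch_desc_indices(..., used_update == true) */
	if (vhost_add_used_idx(vq, 1))
		return;			/* -EFAULT while writing used->idx */
	vhost_signal(&net->dev, vq);	/* kick the guest if needed */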

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |  1 +
 2 files changed, 34 insertions(+)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8424166d..6532cda 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2178,6 +2178,39 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
 }
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
+int vhost_add_used_idx(struct vhost_virtqueue *vq, int n)
+{
+	u16 old, new;
+
+	old = vq->last_used_idx;
+	new = (vq->last_used_idx += n);
+	/* If the driver never bothers to signal in a very long while,
+	 * used index might wrap around. If that happens, invalidate
+	 * signalled_used index we stored. TODO: make sure driver
+	 * signals at least once in 2^16 and remove this.
+	 */
+	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
+		vq->signalled_used_valid = false;
+
+	/* Make sure buffer is written before we update index. */
+	smp_wmb();
+	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
+			   &vq->used->idx)) {
+		vq_err(vq, "Failed to increment used idx");
+		return -EFAULT;
+	}
+	if (unlikely(vq->log_used)) {
+		/* Log used index update. */
+		log_write(vq->log_base,
+			  vq->log_addr + offsetof(struct vring_used, idx),
+			  sizeof(vq->used->idx));
+		if (vq->log_ctx)
+			eventfd_signal(vq->log_ctx, 1);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vhost_add_used_idx);
+
 static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 			    struct vring_used_elem *heads,
 			    unsigned count)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 16c2cb6..5dd6c05 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
+int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
 int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
 		     unsigned count);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 69+ messages in thread

* [PATCH net-next RFC 4/5] vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
  2017-09-22  8:02 [PATCH net-next RFC 0/5] batched tx processing in vhost_net Jason Wang
                   ` (4 preceding siblings ...)
  2017-09-22  8:02 ` [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx() Jason Wang
@ 2017-09-22  8:02 ` Jason Wang
  2017-09-22  8:02 ` Jason Wang
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-22  8:02 UTC (permalink / raw)
  To: mst, jasowang, virtualization, netdev, linux-kernel; +Cc: kvm

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 58585ec..c89640e 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -87,7 +87,7 @@ struct vhost_net_ubuf_ref {
 	struct vhost_virtqueue *vq;
 };
 
-#define VHOST_RX_BATCH 64
+#define VHOST_NET_BATCH 64
 struct vhost_net_buf {
 	struct sk_buff **queue;
 	int tail;
@@ -159,7 +159,7 @@ static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
 
 	rxq->head = 0;
 	rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue,
-					      VHOST_RX_BATCH);
+					      VHOST_NET_BATCH);
 	return rxq->tail;
 }
 
@@ -912,7 +912,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 		return -ENOMEM;
 	}
 
-	queue = kmalloc_array(VHOST_RX_BATCH, sizeof(struct sk_buff *),
+	queue = kmalloc_array(VHOST_NET_BATCH, sizeof(struct sk_buff *),
 			      GFP_KERNEL);
 	if (!queue) {
 		kfree(vqs);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 69+ messages in thread

* [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
  2017-09-22  8:02 [PATCH net-next RFC 0/5] batched tx processing in vhost_net Jason Wang
                   ` (7 preceding siblings ...)
  2017-09-22  8:02 ` [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing Jason Wang
@ 2017-09-22  8:02 ` Jason Wang
  2017-09-26 19:25   ` Michael S. Tsirkin
                     ` (2 more replies)
  2017-09-26 13:45   ` Michael S. Tsirkin
                   ` (2 subsequent siblings)
  11 siblings, 3 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-22  8:02 UTC (permalink / raw)
  To: mst, jasowang, virtualization, netdev, linux-kernel; +Cc: kvm

This patch implements basic batched processing of the tx virtqueue by
prefetching descriptor indices and updating the used ring in a batch.
For the non-zerocopy case, vq->heads is used for storing the
prefetched indices and updating the used ring; this is also a
requirement for doing more batching on top. For the zerocopy case,
and for simplicity, batched processing is simply disabled by fetching
and processing only one descriptor at a time; this could be optimized
in the future.
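
The zerocopy fallback mentioned above boils down to pointing the
prefetch at a single local element instead of vq->heads (a condensed
sketch of the relevant setup from the diff below):

	/* Zerocopy completion still owns vq->heads, so in that case fall
	 * back to fetching one descriptor per iteration into a local
	 * element and skip the batched used ring update (!zcopy below). */
	struct vring_used_elem used, *heads = vq->heads;
	int batched = VHOST_NET_BATCH;

	if (zcopy) {
		heads = &used;
		batched = 1;
	}

	/* later, in the main tx loop: */
	avails = vhost_prefetch_desc_indices(vq, heads, batched, !zcopy);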

XDP_DROP (without touching skb) on tun (with Moongen in guest) with
zerocopy disabled:

Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
Before: 3.20Mpps
After:  3.90Mpps (+22%)

No differences were seen with zerocopy enabled.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 215 ++++++++++++++++++++++++++++----------------------
 drivers/vhost/vhost.c |   2 +-
 2 files changed, 121 insertions(+), 96 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c89640e..c439892 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -408,27 +408,25 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 	return vhost_poll_start(poll, sock->file);
 }
 
-static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
-				    struct vhost_virtqueue *vq,
-				    struct iovec iov[], unsigned int iov_size,
-				    unsigned int *out_num, unsigned int *in_num)
+static bool vhost_net_tx_avail(struct vhost_net *net,
+			       struct vhost_virtqueue *vq)
 {
 	unsigned long uninitialized_var(endtime);
-	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-				  out_num, in_num, NULL, NULL);
 
-	if (r == vq->num && vq->busyloop_timeout) {
-		preempt_disable();
-		endtime = busy_clock() + vq->busyloop_timeout;
-		while (vhost_can_busy_poll(vq->dev, endtime) &&
-		       vhost_vq_avail_empty(vq->dev, vq))
-			cpu_relax();
-		preempt_enable();
-		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-				      out_num, in_num, NULL, NULL);
-	}
+	if (!vq->busyloop_timeout)
+		return false;
 
-	return r;
+	if (!vhost_vq_avail_empty(vq->dev, vq))
+		return true;
+
+	preempt_disable();
+	endtime = busy_clock() + vq->busyloop_timeout;
+	while (vhost_can_busy_poll(vq->dev, endtime) &&
+		vhost_vq_avail_empty(vq->dev, vq))
+		cpu_relax();
+	preempt_enable();
+
+	return !vhost_vq_avail_empty(vq->dev, vq);
 }
 
 static bool vhost_exceeds_maxpend(struct vhost_net *net)
@@ -446,8 +444,9 @@ static void handle_tx(struct vhost_net *net)
 {
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
+	struct vring_used_elem used, *heads = vq->heads;
 	unsigned out, in;
-	int head;
+	int avails, head;
 	struct msghdr msg = {
 		.msg_name = NULL,
 		.msg_namelen = 0,
@@ -461,6 +460,7 @@ static void handle_tx(struct vhost_net *net)
 	struct socket *sock;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
 	bool zcopy, zcopy_used;
+	int i, batched = VHOST_NET_BATCH;
 
 	mutex_lock(&vq->mutex);
 	sock = vq->private_data;
@@ -475,6 +475,12 @@ static void handle_tx(struct vhost_net *net)
 	hdr_size = nvq->vhost_hlen;
 	zcopy = nvq->ubufs;
 
+	/* Disable zerocopy batched fetching for simplicity */
+	if (zcopy) {
+		heads = &used;
+		batched = 1;
+	}
+
 	for (;;) {
 		/* Release DMAs done buffers first */
 		if (zcopy)
@@ -486,95 +492,114 @@ static void handle_tx(struct vhost_net *net)
 		if (unlikely(vhost_exceeds_maxpend(net)))
 			break;
 
-		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
-						ARRAY_SIZE(vq->iov),
-						&out, &in);
+		avails = vhost_prefetch_desc_indices(vq, heads, batched, !zcopy);
 		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
+		if (unlikely(avails < 0))
 			break;
-		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		/* Nothing new?  Busy poll for a while or wait for
+		 * eventfd to tell us they refilled. */
+		if (!avails) {
+			if (vhost_net_tx_avail(net, vq))
+				continue;
 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				vhost_disable_notify(&net->dev, vq);
 				continue;
 			}
 			break;
 		}
-		if (in) {
-			vq_err(vq, "Unexpected descriptor format for TX: "
-			       "out %d, int %d\n", out, in);
-			break;
-		}
-		/* Skip header. TODO: support TSO. */
-		len = iov_length(vq->iov, out);
-		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
-		iov_iter_advance(&msg.msg_iter, hdr_size);
-		/* Sanity check */
-		if (!msg_data_left(&msg)) {
-			vq_err(vq, "Unexpected header len for TX: "
-			       "%zd expected %zd\n",
-			       len, hdr_size);
-			break;
-		}
-		len = msg_data_left(&msg);
-
-		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
-				   && (nvq->upend_idx + 1) % UIO_MAXIOV !=
-				      nvq->done_idx
-				   && vhost_net_tx_select_zcopy(net);
-
-		/* use msg_control to pass vhost zerocopy ubuf info to skb */
-		if (zcopy_used) {
-			struct ubuf_info *ubuf;
-			ubuf = nvq->ubuf_info + nvq->upend_idx;
-
-			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
-			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
-			ubuf->callback = vhost_zerocopy_callback;
-			ubuf->ctx = nvq->ubufs;
-			ubuf->desc = nvq->upend_idx;
-			refcount_set(&ubuf->refcnt, 1);
-			msg.msg_control = ubuf;
-			msg.msg_controllen = sizeof(ubuf);
-			ubufs = nvq->ubufs;
-			atomic_inc(&ubufs->refcount);
-			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
-		} else {
-			msg.msg_control = NULL;
-			ubufs = NULL;
-		}
+		for (i = 0; i < avails; i++) {
+			head = __vhost_get_vq_desc(vq, vq->iov,
+						   ARRAY_SIZE(vq->iov),
+						   &out, &in, NULL, NULL,
+					       vhost16_to_cpu(vq, heads[i].id));
+			if (in) {
+				vq_err(vq, "Unexpected descriptor format for "
+					   "TX: out %d, int %d\n", out, in);
+				goto out;
+			}
 
-		total_len += len;
-		if (total_len < VHOST_NET_WEIGHT &&
-		    !vhost_vq_avail_empty(&net->dev, vq) &&
-		    likely(!vhost_exceeds_maxpend(net))) {
-			msg.msg_flags |= MSG_MORE;
-		} else {
-			msg.msg_flags &= ~MSG_MORE;
-		}
+			/* Skip header. TODO: support TSO. */
+			len = iov_length(vq->iov, out);
+			iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
+			iov_iter_advance(&msg.msg_iter, hdr_size);
+			/* Sanity check */
+			if (!msg_data_left(&msg)) {
+				vq_err(vq, "Unexpected header len for TX: "
+					"%zd expected %zd\n",
+					len, hdr_size);
+				goto out;
+			}
+			len = msg_data_left(&msg);
 
-		/* TODO: Check specific error and bomb out unless ENOBUFS? */
-		err = sock->ops->sendmsg(sock, &msg, len);
-		if (unlikely(err < 0)) {
+			zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
+				     && (nvq->upend_idx + 1) % UIO_MAXIOV !=
+					nvq->done_idx
+				     && vhost_net_tx_select_zcopy(net);
+
+			/* use msg_control to pass vhost zerocopy ubuf
+			 * info to skb
+			 */
 			if (zcopy_used) {
-				vhost_net_ubuf_put(ubufs);
-				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
-					% UIO_MAXIOV;
+				struct ubuf_info *ubuf;
+				ubuf = nvq->ubuf_info + nvq->upend_idx;
+
+				vq->heads[nvq->upend_idx].id =
+					cpu_to_vhost32(vq, head);
+				vq->heads[nvq->upend_idx].len =
+					VHOST_DMA_IN_PROGRESS;
+				ubuf->callback = vhost_zerocopy_callback;
+				ubuf->ctx = nvq->ubufs;
+				ubuf->desc = nvq->upend_idx;
+				refcount_set(&ubuf->refcnt, 1);
+				msg.msg_control = ubuf;
+				msg.msg_controllen = sizeof(ubuf);
+				ubufs = nvq->ubufs;
+				atomic_inc(&ubufs->refcount);
+				nvq->upend_idx =
+					(nvq->upend_idx + 1) % UIO_MAXIOV;
+			} else {
+				msg.msg_control = NULL;
+				ubufs = NULL;
+			}
+
+			total_len += len;
+			if (total_len < VHOST_NET_WEIGHT &&
+				!vhost_vq_avail_empty(&net->dev, vq) &&
+				likely(!vhost_exceeds_maxpend(net))) {
+				msg.msg_flags |= MSG_MORE;
+			} else {
+				msg.msg_flags &= ~MSG_MORE;
+			}
+
+			/* TODO: Check specific error and bomb out
+			 * unless ENOBUFS?
+			 */
+			err = sock->ops->sendmsg(sock, &msg, len);
+			if (unlikely(err < 0)) {
+				if (zcopy_used) {
+					vhost_net_ubuf_put(ubufs);
+					nvq->upend_idx =
+				   ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV;
+				}
+				vhost_discard_vq_desc(vq, 1);
+				goto out;
+			}
+			if (err != len)
+				pr_debug("Truncated TX packet: "
+					" len %d != %zd\n", err, len);
+			if (!zcopy) {
+				vhost_add_used_idx(vq, 1);
+				vhost_signal(&net->dev, vq);
+			} else if (!zcopy_used) {
+				vhost_add_used_and_signal(&net->dev,
+							  vq, head, 0);
+			} else
+				vhost_zerocopy_signal_used(net, vq);
+			vhost_net_tx_packet(net);
+			if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
+				vhost_poll_queue(&vq->poll);
+				goto out;
 			}
-			vhost_discard_vq_desc(vq, 1);
-			break;
-		}
-		if (err != len)
-			pr_debug("Truncated TX packet: "
-				 " len %d != %zd\n", err, len);
-		if (!zcopy_used)
-			vhost_add_used_and_signal(&net->dev, vq, head, 0);
-		else
-			vhost_zerocopy_signal_used(net, vq);
-		vhost_net_tx_packet(net);
-		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
-			vhost_poll_queue(&vq->poll);
-			break;
 		}
 	}
 out:
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6532cda..8764df5 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -392,7 +392,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 		vq->indirect = kmalloc(sizeof *vq->indirect * UIO_MAXIOV,
 				       GFP_KERNEL);
 		vq->log = kmalloc(sizeof *vq->log * UIO_MAXIOV, GFP_KERNEL);
-		vq->heads = kmalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
+		vq->heads = kzalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
 		if (!vq->indirect || !vq->log || !vq->heads)
 			goto err_nomem;
 	}
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 1/5] vhost: split out ring head fetching logic
  2017-09-22  8:02 ` Jason Wang
  2017-09-22  8:31   ` Stefan Hajnoczi
@ 2017-09-22  8:31   ` Stefan Hajnoczi
  2017-09-25  2:03     ` Jason Wang
  2017-09-25  2:03     ` Jason Wang
  1 sibling, 2 replies; 69+ messages in thread
From: Stefan Hajnoczi @ 2017-09-22  8:31 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, virtualization, netdev, linux-kernel, kvm

On Fri, Sep 22, 2017 at 04:02:31PM +0800, Jason Wang wrote:
> +/* This looks in the virtqueue and for the first available buffer, and converts
> + * it to an iovec for convenient access.  Since descriptors consist of some
> + * number of output then some number of input descriptors, it's actually two
> + * iovecs, but we pack them into one and note how many of each there were.
> + *
> + * This function returns the descriptor number found, or vq->num (which is
> + * never a valid descriptor number) if none was found.  A negative code is
> + * returned on error. */
> +int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
> +			struct iovec iov[], unsigned int iov_size,
> +			unsigned int *out_num, unsigned int *in_num,
> +			struct vhost_log *log, unsigned int *log_num,
> +			__virtio16 head)
[...]
> +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> +		      struct iovec iov[], unsigned int iov_size,
> +		      unsigned int *out_num, unsigned int *in_num,
> +		      struct vhost_log *log, unsigned int *log_num)

Please document vhost_get_vq_desc().

Please also explain the difference between __vhost_get_vq_desc() and
vhost_get_vq_desc() in the documentation.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-22  8:02 ` [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index Jason Wang
@ 2017-09-22  9:02     ` Stefan Hajnoczi
  2017-09-26 19:19   ` Michael S. Tsirkin
                       ` (3 subsequent siblings)
  4 siblings, 0 replies; 69+ messages in thread
From: Stefan Hajnoczi @ 2017-09-22  9:02 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, virtualization, netdev, linux-kernel, kvm

On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f87ec75..8424166d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>  }
>  EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>  
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +				struct vring_used_elem *heads,
> +				u16 num, bool used_update)

Missing doc comment.

> +{
> +	int ret, ret2;
> +	u16 last_avail_idx, last_used_idx, total, copied;
> +	__virtio16 avail_idx;
> +	struct vring_used_elem __user *used;
> +	int i;

The following variable names are a little confusing:

last_avail_idx vs vq->last_avail_idx.  last_avail_idx is a wrapped
avail->ring[] index, vq->last_avail_idx is a free-running counter.  The
same for last_used_idx vs vq->last_used_idx.

num argument vs vq->num.  The argument could be called nheads instead to
make it clear that this is heads[] and not the virtqueue size.

Not a bug but it took me a while to figure out what was going on.
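
For example (just to illustrate, not from the patch):

	u16 counter = vq->last_avail_idx;	/* free-running 16-bit counter */
	u16 slot = counter & (vq->num - 1);	/* wrapped avail->ring[] index;
						 * vq->num is a power of two */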

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
  2017-09-22  8:02 ` [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx() Jason Wang
@ 2017-09-22  9:07   ` Stefan Hajnoczi
  2017-09-22  9:07   ` Stefan Hajnoczi
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 69+ messages in thread
From: Stefan Hajnoczi @ 2017-09-22  9:07 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, virtualization, netdev, linux-kernel, kvm

On Fri, Sep 22, 2017 at 04:02:33PM +0800, Jason Wang wrote:
> This patch introduces a helper which just increase the used idx. This
> will be used in pair with vhost_prefetch_desc_indices() by batching
> code.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  1 +
>  2 files changed, 34 insertions(+)

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 1/5] vhost: split out ring head fetching logic
  2017-09-22  8:31   ` Stefan Hajnoczi
@ 2017-09-25  2:03     ` Jason Wang
  2017-09-25  2:03     ` Jason Wang
  1 sibling, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-25  2:03 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: mst, virtualization, netdev, linux-kernel, kvm



On 2017-09-22 16:31, Stefan Hajnoczi wrote:
> On Fri, Sep 22, 2017 at 04:02:31PM +0800, Jason Wang wrote:
>> +/* This looks in the virtqueue and for the first available buffer, and converts
>> + * it to an iovec for convenient access.  Since descriptors consist of some
>> + * number of output then some number of input descriptors, it's actually two
>> + * iovecs, but we pack them into one and note how many of each there were.
>> + *
>> + * This function returns the descriptor number found, or vq->num (which is
>> + * never a valid descriptor number) if none was found.  A negative code is
>> + * returned on error. */
>> +int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>> +			struct iovec iov[], unsigned int iov_size,
>> +			unsigned int *out_num, unsigned int *in_num,
>> +			struct vhost_log *log, unsigned int *log_num,
>> +			__virtio16 head)
> [...]
>> +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>> +		      struct iovec iov[], unsigned int iov_size,
>> +		      unsigned int *out_num, unsigned int *in_num,
>> +		      struct vhost_log *log, unsigned int *log_num)
> Please document vhost_get_vq_desc().
>
> Please also explain the difference between __vhost_get_vq_desc() and
> vhost_get_vq_desc() in the documentation.

Right, will document this in next version.
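
Something along these lines would probably do (wording is only a
suggestion, based on how patch 1 splits the function):

	/* vhost_get_vq_desc() fetches the next available head from the
	 * avail ring and then calls __vhost_get_vq_desc() to translate it
	 * into iovecs; __vhost_get_vq_desc() expects a head that has
	 * already been fetched, e.g. by vhost_prefetch_desc_indices().
	 */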

Thanks

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 1/5] vhost: split out ring head fetching logic
  2017-09-22  8:31   ` Stefan Hajnoczi
  2017-09-25  2:03     ` Jason Wang
@ 2017-09-25  2:03     ` Jason Wang
  1 sibling, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-25  2:03 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: netdev, virtualization, linux-kernel, kvm, mst



On 2017年09月22日 16:31, Stefan Hajnoczi wrote:
> On Fri, Sep 22, 2017 at 04:02:31PM +0800, Jason Wang wrote:
>> +/* This looks in the virtqueue and for the first available buffer, and converts
>> + * it to an iovec for convenient access.  Since descriptors consist of some
>> + * number of output then some number of input descriptors, it's actually two
>> + * iovecs, but we pack them into one and note how many of each there were.
>> + *
>> + * This function returns the descriptor number found, or vq->num (which is
>> + * never a valid descriptor number) if none was found.  A negative code is
>> + * returned on error. */
>> +int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>> +			struct iovec iov[], unsigned int iov_size,
>> +			unsigned int *out_num, unsigned int *in_num,
>> +			struct vhost_log *log, unsigned int *log_num,
>> +			__virtio16 head)
> [...]
>> +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>> +		      struct iovec iov[], unsigned int iov_size,
>> +		      unsigned int *out_num, unsigned int *in_num,
>> +		      struct vhost_log *log, unsigned int *log_num)
> Please document vhost_get_vq_desc().
>
> Please also explain the difference between __vhost_get_vq_desc() and
> vhost_get_vq_desc() in the documentation.

Right, will document this in next version.

Thanks


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-22  9:02     ` Stefan Hajnoczi
  (?)
  (?)
@ 2017-09-25  2:04     ` Jason Wang
  -1 siblings, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-25  2:04 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: mst, virtualization, netdev, linux-kernel, kvm



On 2017年09月22日 17:02, Stefan Hajnoczi wrote:
> On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index f87ec75..8424166d 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>   
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update)
> Missing doc comment.

Will fix this.

>
>> +{
>> +	int ret, ret2;
>> +	u16 last_avail_idx, last_used_idx, total, copied;
>> +	__virtio16 avail_idx;
>> +	struct vring_used_elem __user *used;
>> +	int i;
> The following variable names are a little confusing:
>
> last_avail_idx vs vq->last_avail_idx.  last_avail_idx is a wrapped
> avail->ring[] index, vq->last_avail_idx is a free-running counter.  The
> same for last_used_idx vs vq->last_used_idx.
>
> num argument vs vq->num.  The argument could be called nheads instead to
> make it clear that this is heads[] and not the virtqueue size.
>
> Not a bug but it took me a while to figure out what was going on.

I admit the name is confusing. Let me try better ones in V2.

Thanks

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-22  9:02     ` Stefan Hajnoczi
  (?)
@ 2017-09-25  2:04     ` Jason Wang
  -1 siblings, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-25  2:04 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: netdev, virtualization, linux-kernel, kvm, mst



On 2017年09月22日 17:02, Stefan Hajnoczi wrote:
> On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index f87ec75..8424166d 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>   
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update)
> Missing doc comment.

Will fix this.

>
>> +{
>> +	int ret, ret2;
>> +	u16 last_avail_idx, last_used_idx, total, copied;
>> +	__virtio16 avail_idx;
>> +	struct vring_used_elem __user *used;
>> +	int i;
> The following variable names are a little confusing:
>
> last_avail_idx vs vq->last_avail_idx.  last_avail_idx is a wrapped
> avail->ring[] index, vq->last_avail_idx is a free-running counter.  The
> same for last_used_idx vs vq->last_used_idx.
>
> num argument vs vq->num.  The argument could be called nheads instead to
> make it clear that this is heads[] and not the virtqueue size.
>
> Not a bug but it took me a while to figure out what was going on.

I admit the name is confusing. Let me try better ones in V2.

Thanks

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
  2017-09-22  8:02 [PATCH net-next RFC 0/5] batched tx processing in vhost_net Jason Wang
@ 2017-09-26 13:45   ` Michael S. Tsirkin
  2017-09-22  8:02 ` Jason Wang
                     ` (10 subsequent siblings)
  11 siblings, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-26 13:45 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm

On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
> Hi:
> 
> This series tries to implement basic tx batched processing. This is
> done by prefetching descriptor indices and update used ring in a
> batch. This intends to speed up used ring updating and improve the
> cache utilization.

Interesting, thanks for the patches. So IIUC most of the gain is really
overcoming some of the shortcomings of virtio 1.0 wrt cache utilization?

Which is fair enough (1.0 is already deployed) but I would like to avoid
making 1.1 support harder, and this patchset does this unfortunately,
see comments on individual patches. I'm sure it can be addressed though.

> Test shows about ~22% improvement in tx pps.

Is this with or without tx napi in guest?

> Please review.
> 
> Jason Wang (5):
>   vhost: split out ring head fetching logic
>   vhost: introduce helper to prefetch desc index
>   vhost: introduce vhost_add_used_idx()
>   vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
>   vhost_net: basic tx virtqueue batched processing
> 
>  drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
>  drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
>  drivers/vhost/vhost.h |   9 ++
>  3 files changed, 270 insertions(+), 125 deletions(-)
> 
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
@ 2017-09-26 13:45   ` Michael S. Tsirkin
  0 siblings, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-26 13:45 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization

On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
> Hi:
> 
> This series tries to implement basic tx batched processing. This is
> done by prefetching descriptor indices and update used ring in a
> batch. This intends to speed up used ring updating and improve the
> cache utilization.

Interesting, thanks for the patches. So IIUC most of the gain is really
overcoming some of the shortcomings of virtio 1.0 wrt cache utilization?

Which is fair enough (1.0 is already deployed) but I would like to avoid
making 1.1 support harder, and this patchset does this unfortunately,
see comments on individual patches. I'm sure it can be addressed though.

> Test shows about ~22% improvement in tx pps.

Is this with or without tx napi in guest?

> Please review.
> 
> Jason Wang (5):
>   vhost: split out ring head fetching logic
>   vhost: introduce helper to prefetch desc index
>   vhost: introduce vhost_add_used_idx()
>   vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
>   vhost_net: basic tx virtqueue batched processing
> 
>  drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
>  drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
>  drivers/vhost/vhost.h |   9 ++
>  3 files changed, 270 insertions(+), 125 deletions(-)
> 
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
  2017-09-22  8:02 ` [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx() Jason Wang
                     ` (2 preceding siblings ...)
  2017-09-26 19:13   ` Michael S. Tsirkin
@ 2017-09-26 19:13   ` Michael S. Tsirkin
  2017-09-27  0:38     ` Jason Wang
  2017-09-27  0:38     ` Jason Wang
  3 siblings, 2 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-26 19:13 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm

On Fri, Sep 22, 2017 at 04:02:33PM +0800, Jason Wang wrote:
> This patch introduces a helper which just increases the used idx. This
> will be used together with vhost_prefetch_desc_indices() by the batching
> code.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  1 +
>  2 files changed, 34 insertions(+)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 8424166d..6532cda 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2178,6 +2178,39 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
>  }
>  EXPORT_SYMBOL_GPL(vhost_add_used);
>  
> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n)
> +{
> +	u16 old, new;
> +
> +	old = vq->last_used_idx;
> +	new = (vq->last_used_idx += n);
> +	/* If the driver never bothers to signal in a very long while,
> +	 * used index might wrap around. If that happens, invalidate
> +	 * signalled_used index we stored. TODO: make sure driver
> +	 * signals at least once in 2^16 and remove this.
> +	 */
> +	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
> +		vq->signalled_used_valid = false;
> +
> +	/* Make sure buffer is written before we update index. */
> +	smp_wmb();
> +	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
> +			   &vq->used->idx)) {
> +		vq_err(vq, "Failed to increment used idx");
> +		return -EFAULT;
> +	}
> +	if (unlikely(vq->log_used)) {
> +		/* Log used index update. */
> +		log_write(vq->log_base,
> +			  vq->log_addr + offsetof(struct vring_used, idx),
> +			  sizeof(vq->used->idx));
> +		if (vq->log_ctx)
> +			eventfd_signal(vq->log_ctx, 1);
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(vhost_add_used_idx);
> +
>  static int __vhost_add_used_n(struct vhost_virtqueue *vq,
>  			    struct vring_used_elem *heads,
>  			    unsigned count)
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 16c2cb6..5dd6c05 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>  void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>  
>  int vhost_vq_init_access(struct vhost_virtqueue *);
> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
>  int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
>  int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
>  		     unsigned count);

Please change the API to hide the fact that there's an index that needs
to be updated.

> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
  2017-09-22  8:02 ` [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx() Jason Wang
  2017-09-22  9:07   ` Stefan Hajnoczi
  2017-09-22  9:07   ` Stefan Hajnoczi
@ 2017-09-26 19:13   ` Michael S. Tsirkin
  2017-09-26 19:13   ` Michael S. Tsirkin
  3 siblings, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-26 19:13 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization

On Fri, Sep 22, 2017 at 04:02:33PM +0800, Jason Wang wrote:
> This patch introduces a helper which just increases the used idx. This
> will be used together with vhost_prefetch_desc_indices() by the batching
> code.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  1 +
>  2 files changed, 34 insertions(+)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 8424166d..6532cda 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2178,6 +2178,39 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
>  }
>  EXPORT_SYMBOL_GPL(vhost_add_used);
>  
> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n)
> +{
> +	u16 old, new;
> +
> +	old = vq->last_used_idx;
> +	new = (vq->last_used_idx += n);
> +	/* If the driver never bothers to signal in a very long while,
> +	 * used index might wrap around. If that happens, invalidate
> +	 * signalled_used index we stored. TODO: make sure driver
> +	 * signals at least once in 2^16 and remove this.
> +	 */
> +	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
> +		vq->signalled_used_valid = false;
> +
> +	/* Make sure buffer is written before we update index. */
> +	smp_wmb();
> +	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
> +			   &vq->used->idx)) {
> +		vq_err(vq, "Failed to increment used idx");
> +		return -EFAULT;
> +	}
> +	if (unlikely(vq->log_used)) {
> +		/* Log used index update. */
> +		log_write(vq->log_base,
> +			  vq->log_addr + offsetof(struct vring_used, idx),
> +			  sizeof(vq->used->idx));
> +		if (vq->log_ctx)
> +			eventfd_signal(vq->log_ctx, 1);
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(vhost_add_used_idx);
> +
>  static int __vhost_add_used_n(struct vhost_virtqueue *vq,
>  			    struct vring_used_elem *heads,
>  			    unsigned count)
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 16c2cb6..5dd6c05 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>  void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>  
>  int vhost_vq_init_access(struct vhost_virtqueue *);
> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
>  int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
>  int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
>  		     unsigned count);

Please change the API to hide the fact that there's an index that needs
to be updated.

> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-22  8:02 ` [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index Jason Wang
  2017-09-22  9:02     ` Stefan Hajnoczi
  2017-09-26 19:19   ` Michael S. Tsirkin
@ 2017-09-26 19:19   ` Michael S. Tsirkin
  2017-09-27  0:35     ` Jason Wang
  2017-09-27  0:35     ` Jason Wang
  2017-09-28  0:47   ` Willem de Bruijn
  2017-09-28  0:47   ` Willem de Bruijn
  4 siblings, 2 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-26 19:19 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm

On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
> This patch introduces vhost_prefetch_desc_indices() which could batch
> descriptor indices fetching and used ring updating. This intends to
> reduce the cache misses of indices fetching and updating and reduce
> cache line bounce when virtqueue is almost full. copy_to_user() was
> used in order to benefit from modern cpus that support fast string
> copy. Batched virtqueue processing will be the first user.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  3 +++
>  2 files changed, 58 insertions(+)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f87ec75..8424166d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>  }
>  EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>  
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +				struct vring_used_elem *heads,
> +				u16 num, bool used_update)

why do you need to combine used update with prefetch?

> +{
> +	int ret, ret2;
> +	u16 last_avail_idx, last_used_idx, total, copied;
> +	__virtio16 avail_idx;
> +	struct vring_used_elem __user *used;
> +	int i;
> +
> +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
> +		vq_err(vq, "Failed to access avail idx at %p\n",
> +		       &vq->avail->idx);
> +		return -EFAULT;
> +	}
> +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
> +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> +	total = vq->avail_idx - vq->last_avail_idx;
> +	ret = total = min(total, num);
> +
> +	for (i = 0; i < ret; i++) {
> +		ret2 = vhost_get_avail(vq, heads[i].id,
> +				      &vq->avail->ring[last_avail_idx]);
> +		if (unlikely(ret2)) {
> +			vq_err(vq, "Failed to get descriptors\n");
> +			return -EFAULT;
> +		}
> +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
> +	}
> +
> +	if (!used_update)
> +		return ret;
> +
> +	last_used_idx = vq->last_used_idx & (vq->num - 1);
> +	while (total) {
> +		copied = min((u16)(vq->num - last_used_idx), total);
> +		ret2 = vhost_copy_to_user(vq,
> +					  &vq->used->ring[last_used_idx],
> +					  &heads[ret - total],
> +					  copied * sizeof(*used));
> +
> +		if (unlikely(ret2)) {
> +			vq_err(vq, "Failed to update used ring!\n");
> +			return -EFAULT;
> +		}
> +
> +		last_used_idx = 0;
> +		total -= copied;
> +	}
> +
> +	/* Only get avail ring entries after they have been exposed by guest. */
> +	smp_rmb();

Barrier before return is a very confusing API. I guess it's designed to
be used in a specific way to make it necessary - but what is it?


> +	return ret;
> +}
> +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
>  
>  static int __init vhost_init(void)
>  {
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 39ff897..16c2cb6 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
>  ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
>  			     struct iov_iter *from);
>  int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +				struct vring_used_elem *heads,
> +				u16 num, bool used_update);
>  
>  #define vq_err(vq, fmt, ...) do {                                  \
>  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-22  8:02 ` [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index Jason Wang
  2017-09-22  9:02     ` Stefan Hajnoczi
@ 2017-09-26 19:19   ` Michael S. Tsirkin
  2017-09-26 19:19   ` Michael S. Tsirkin
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-26 19:19 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization

On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
> This patch introduces vhost_prefetch_desc_indices() which could batch
> descriptor indices fetching and used ring updating. This intends to
> reduce the cache misses of indices fetching and updating and reduce
> cache line bounce when virtqueue is almost full. copy_to_user() was
> used in order to benefit from modern cpus that support fast string
> copy. Batched virtqueue processing will be the first user.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  3 +++
>  2 files changed, 58 insertions(+)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f87ec75..8424166d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>  }
>  EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>  
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +				struct vring_used_elem *heads,
> +				u16 num, bool used_update)

why do you need to combine used update with prefetch?

> +{
> +	int ret, ret2;
> +	u16 last_avail_idx, last_used_idx, total, copied;
> +	__virtio16 avail_idx;
> +	struct vring_used_elem __user *used;
> +	int i;
> +
> +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
> +		vq_err(vq, "Failed to access avail idx at %p\n",
> +		       &vq->avail->idx);
> +		return -EFAULT;
> +	}
> +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
> +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> +	total = vq->avail_idx - vq->last_avail_idx;
> +	ret = total = min(total, num);
> +
> +	for (i = 0; i < ret; i++) {
> +		ret2 = vhost_get_avail(vq, heads[i].id,
> +				      &vq->avail->ring[last_avail_idx]);
> +		if (unlikely(ret2)) {
> +			vq_err(vq, "Failed to get descriptors\n");
> +			return -EFAULT;
> +		}
> +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
> +	}
> +
> +	if (!used_update)
> +		return ret;
> +
> +	last_used_idx = vq->last_used_idx & (vq->num - 1);
> +	while (total) {
> +		copied = min((u16)(vq->num - last_used_idx), total);
> +		ret2 = vhost_copy_to_user(vq,
> +					  &vq->used->ring[last_used_idx],
> +					  &heads[ret - total],
> +					  copied * sizeof(*used));
> +
> +		if (unlikely(ret2)) {
> +			vq_err(vq, "Failed to update used ring!\n");
> +			return -EFAULT;
> +		}
> +
> +		last_used_idx = 0;
> +		total -= copied;
> +	}
> +
> +	/* Only get avail ring entries after they have been exposed by guest. */
> +	smp_rmb();

Barrier before return is a very confusing API. I guess it's designed to
be used in a specific way to make it necessary - but what is it?


> +	return ret;
> +}
> +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
>  
>  static int __init vhost_init(void)
>  {
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 39ff897..16c2cb6 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
>  ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
>  			     struct iov_iter *from);
>  int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +				struct vring_used_elem *heads,
> +				u16 num, bool used_update);
>  
>  #define vq_err(vq, fmt, ...) do {                                  \
>  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
  2017-09-22  8:02 ` Jason Wang
@ 2017-09-26 19:25   ` Michael S. Tsirkin
  2017-09-27  2:04     ` Jason Wang
  2017-09-27  2:04     ` Jason Wang
  2017-09-26 19:25   ` Michael S. Tsirkin
  2017-09-28  0:55     ` Willem de Bruijn
  2 siblings, 2 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-26 19:25 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm

On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
> This patch implements basic batched processing of tx virtqueue by
> prefetching desc indices and updating used ring in a batch. For
> non-zerocopy case, vq->heads were used for storing the prefetched
> indices and updating used ring. It is also a requirement for doing
> more batching on top. For zerocopy case and for simplicity, batched
> processing were simply disabled by only fetching and processing one
> descriptor at a time, this could be optimized in the future.
> 
> XDP_DROP (without touching skb) on tun (with Moongen in guest) with
> zerocopy disabled:
> 
> Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
> Before: 3.20Mpps
> After:  3.90Mpps (+22%)
> 
> No differences were seen with zerocopy enabled.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

So where is the speedup coming from? I'd guess the ring is
hot in cache, so it's faster to access it in one go and then
pass many packets to the net stack. Is that right?

Another possibility is better code cache locality.

So how about refactoring this patchset as follows:

1. use existing APIs: first get the packets, then transmit them
   all, then mark them all used (a rough sketch follows below)
2. add new APIs and move the loop into vhost core
   for more speedups
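
To make step 1 concrete, a minimal sketch using only the existing
helpers could look like the following (names and flow are illustrative,
not code from this series; error handling elided):

	unsigned int out, in;
	int head, n = 0;

	/* fetch and transmit up to VHOST_NET_BATCH packets */
	while (n < VHOST_NET_BATCH) {
		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);
		if (head < 0 || head == vq->num)
			break;
		vq->heads[n].id = cpu_to_vhost32(vq, head);
		vq->heads[n].len = 0;
		n++;
		/* ... build the msghdr from vq->iov and sendmsg() here ... */
	}

	/* publish all used entries and signal the guest once */
	if (n)
		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, n);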



> ---
>  drivers/vhost/net.c   | 215 ++++++++++++++++++++++++++++----------------------
>  drivers/vhost/vhost.c |   2 +-
>  2 files changed, 121 insertions(+), 96 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c89640e..c439892 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -408,27 +408,25 @@ static int vhost_net_enable_vq(struct vhost_net *n,
>  	return vhost_poll_start(poll, sock->file);
>  }
>  
> -static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> -				    struct vhost_virtqueue *vq,
> -				    struct iovec iov[], unsigned int iov_size,
> -				    unsigned int *out_num, unsigned int *in_num)
> +static bool vhost_net_tx_avail(struct vhost_net *net,
> +			       struct vhost_virtqueue *vq)
>  {
>  	unsigned long uninitialized_var(endtime);
> -	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> -				  out_num, in_num, NULL, NULL);
>  
> -	if (r == vq->num && vq->busyloop_timeout) {
> -		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> -		while (vhost_can_busy_poll(vq->dev, endtime) &&
> -		       vhost_vq_avail_empty(vq->dev, vq))
> -			cpu_relax();
> -		preempt_enable();
> -		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> -				      out_num, in_num, NULL, NULL);
> -	}
> +	if (!vq->busyloop_timeout)
> +		return false;
>  
> -	return r;
> +	if (!vhost_vq_avail_empty(vq->dev, vq))
> +		return true;
> +
> +	preempt_disable();
> +	endtime = busy_clock() + vq->busyloop_timeout;
> +	while (vhost_can_busy_poll(vq->dev, endtime) &&
> +		vhost_vq_avail_empty(vq->dev, vq))
> +		cpu_relax();
> +	preempt_enable();
> +
> +	return !vhost_vq_avail_empty(vq->dev, vq);
>  }
>  
>  static bool vhost_exceeds_maxpend(struct vhost_net *net)
> @@ -446,8 +444,9 @@ static void handle_tx(struct vhost_net *net)
>  {
>  	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>  	struct vhost_virtqueue *vq = &nvq->vq;
> +	struct vring_used_elem used, *heads = vq->heads;
>  	unsigned out, in;
> -	int head;
> +	int avails, head;
>  	struct msghdr msg = {
>  		.msg_name = NULL,
>  		.msg_namelen = 0,
> @@ -461,6 +460,7 @@ static void handle_tx(struct vhost_net *net)
>  	struct socket *sock;
>  	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
>  	bool zcopy, zcopy_used;
> +	int i, batched = VHOST_NET_BATCH;
>  
>  	mutex_lock(&vq->mutex);
>  	sock = vq->private_data;
> @@ -475,6 +475,12 @@ static void handle_tx(struct vhost_net *net)
>  	hdr_size = nvq->vhost_hlen;
>  	zcopy = nvq->ubufs;
>  
> +	/* Disable zerocopy batched fetching for simplicity */
> +	if (zcopy) {
> +		heads = &used;
> +		batched = 1;
> +	}
> +
>  	for (;;) {
>  		/* Release DMAs done buffers first */
>  		if (zcopy)
> @@ -486,95 +492,114 @@ static void handle_tx(struct vhost_net *net)
>  		if (unlikely(vhost_exceeds_maxpend(net)))
>  			break;
>  
> -		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
> -						ARRAY_SIZE(vq->iov),
> -						&out, &in);
> +		avails = vhost_prefetch_desc_indices(vq, heads, batched, !zcopy);
>  		/* On error, stop handling until the next kick. */
> -		if (unlikely(head < 0))
> +		if (unlikely(avails < 0))
>  			break;
> -		/* Nothing new?  Wait for eventfd to tell us they refilled. */
> -		if (head == vq->num) {
> +		/* Nothing new?  Busy poll for a while or wait for
> +		 * eventfd to tell us they refilled. */
> +		if (!avails) {
> +			if (vhost_net_tx_avail(net, vq))
> +				continue;
>  			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>  				vhost_disable_notify(&net->dev, vq);
>  				continue;
>  			}
>  			break;
>  		}
> -		if (in) {
> -			vq_err(vq, "Unexpected descriptor format for TX: "
> -			       "out %d, int %d\n", out, in);
> -			break;
> -		}
> -		/* Skip header. TODO: support TSO. */
> -		len = iov_length(vq->iov, out);
> -		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
> -		iov_iter_advance(&msg.msg_iter, hdr_size);
> -		/* Sanity check */
> -		if (!msg_data_left(&msg)) {
> -			vq_err(vq, "Unexpected header len for TX: "
> -			       "%zd expected %zd\n",
> -			       len, hdr_size);
> -			break;
> -		}
> -		len = msg_data_left(&msg);
> -
> -		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
> -				   && (nvq->upend_idx + 1) % UIO_MAXIOV !=
> -				      nvq->done_idx
> -				   && vhost_net_tx_select_zcopy(net);
> -
> -		/* use msg_control to pass vhost zerocopy ubuf info to skb */
> -		if (zcopy_used) {
> -			struct ubuf_info *ubuf;
> -			ubuf = nvq->ubuf_info + nvq->upend_idx;
> -
> -			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
> -			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
> -			ubuf->callback = vhost_zerocopy_callback;
> -			ubuf->ctx = nvq->ubufs;
> -			ubuf->desc = nvq->upend_idx;
> -			refcount_set(&ubuf->refcnt, 1);
> -			msg.msg_control = ubuf;
> -			msg.msg_controllen = sizeof(ubuf);
> -			ubufs = nvq->ubufs;
> -			atomic_inc(&ubufs->refcount);
> -			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
> -		} else {
> -			msg.msg_control = NULL;
> -			ubufs = NULL;
> -		}
> +		for (i = 0; i < avails; i++) {
> +			head = __vhost_get_vq_desc(vq, vq->iov,
> +						   ARRAY_SIZE(vq->iov),
> +						   &out, &in, NULL, NULL,
> +					       vhost16_to_cpu(vq, heads[i].id));
> +			if (in) {
> +				vq_err(vq, "Unexpected descriptor format for "
> +					   "TX: out %d, int %d\n", out, in);
> +				goto out;
> +			}
>  
> -		total_len += len;
> -		if (total_len < VHOST_NET_WEIGHT &&
> -		    !vhost_vq_avail_empty(&net->dev, vq) &&
> -		    likely(!vhost_exceeds_maxpend(net))) {
> -			msg.msg_flags |= MSG_MORE;
> -		} else {
> -			msg.msg_flags &= ~MSG_MORE;
> -		}
> +			/* Skip header. TODO: support TSO. */
> +			len = iov_length(vq->iov, out);
> +			iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
> +			iov_iter_advance(&msg.msg_iter, hdr_size);
> +			/* Sanity check */
> +			if (!msg_data_left(&msg)) {
> +				vq_err(vq, "Unexpected header len for TX: "
> +					"%zd expected %zd\n",
> +					len, hdr_size);
> +				goto out;
> +			}
> +			len = msg_data_left(&msg);
>  
> -		/* TODO: Check specific error and bomb out unless ENOBUFS? */
> -		err = sock->ops->sendmsg(sock, &msg, len);
> -		if (unlikely(err < 0)) {
> +			zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
> +				     && (nvq->upend_idx + 1) % UIO_MAXIOV !=
> +					nvq->done_idx
> +				     && vhost_net_tx_select_zcopy(net);
> +
> +			/* use msg_control to pass vhost zerocopy ubuf
> +			 * info to skb
> +			 */
>  			if (zcopy_used) {
> -				vhost_net_ubuf_put(ubufs);
> -				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
> -					% UIO_MAXIOV;
> +				struct ubuf_info *ubuf;
> +				ubuf = nvq->ubuf_info + nvq->upend_idx;
> +
> +				vq->heads[nvq->upend_idx].id =
> +					cpu_to_vhost32(vq, head);
> +				vq->heads[nvq->upend_idx].len =
> +					VHOST_DMA_IN_PROGRESS;
> +				ubuf->callback = vhost_zerocopy_callback;
> +				ubuf->ctx = nvq->ubufs;
> +				ubuf->desc = nvq->upend_idx;
> +				refcount_set(&ubuf->refcnt, 1);
> +				msg.msg_control = ubuf;
> +				msg.msg_controllen = sizeof(ubuf);
> +				ubufs = nvq->ubufs;
> +				atomic_inc(&ubufs->refcount);
> +				nvq->upend_idx =
> +					(nvq->upend_idx + 1) % UIO_MAXIOV;
> +			} else {
> +				msg.msg_control = NULL;
> +				ubufs = NULL;
> +			}
> +
> +			total_len += len;
> +			if (total_len < VHOST_NET_WEIGHT &&
> +				!vhost_vq_avail_empty(&net->dev, vq) &&
> +				likely(!vhost_exceeds_maxpend(net))) {
> +				msg.msg_flags |= MSG_MORE;
> +			} else {
> +				msg.msg_flags &= ~MSG_MORE;
> +			}
> +
> +			/* TODO: Check specific error and bomb out
> +			 * unless ENOBUFS?
> +			 */
> +			err = sock->ops->sendmsg(sock, &msg, len);
> +			if (unlikely(err < 0)) {
> +				if (zcopy_used) {
> +					vhost_net_ubuf_put(ubufs);
> +					nvq->upend_idx =
> +				   ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV;
> +				}
> +				vhost_discard_vq_desc(vq, 1);
> +				goto out;
> +			}
> +			if (err != len)
> +				pr_debug("Truncated TX packet: "
> +					" len %d != %zd\n", err, len);
> +			if (!zcopy) {
> +				vhost_add_used_idx(vq, 1);
> +				vhost_signal(&net->dev, vq);
> +			} else if (!zcopy_used) {
> +				vhost_add_used_and_signal(&net->dev,
> +							  vq, head, 0);
> +			} else
> +				vhost_zerocopy_signal_used(net, vq);
> +			vhost_net_tx_packet(net);
> +			if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> +				vhost_poll_queue(&vq->poll);
> +				goto out;
>  			}
> -			vhost_discard_vq_desc(vq, 1);
> -			break;
> -		}
> -		if (err != len)
> -			pr_debug("Truncated TX packet: "
> -				 " len %d != %zd\n", err, len);
> -		if (!zcopy_used)
> -			vhost_add_used_and_signal(&net->dev, vq, head, 0);
> -		else
> -			vhost_zerocopy_signal_used(net, vq);
> -		vhost_net_tx_packet(net);
> -		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> -			vhost_poll_queue(&vq->poll);
> -			break;
>  		}
>  	}
>  out:
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 6532cda..8764df5 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -392,7 +392,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
>  		vq->indirect = kmalloc(sizeof *vq->indirect * UIO_MAXIOV,
>  				       GFP_KERNEL);
>  		vq->log = kmalloc(sizeof *vq->log * UIO_MAXIOV, GFP_KERNEL);
> -		vq->heads = kmalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
> +		vq->heads = kzalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
>  		if (!vq->indirect || !vq->log || !vq->heads)
>  			goto err_nomem;
>  	}
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
  2017-09-22  8:02 ` Jason Wang
  2017-09-26 19:25   ` Michael S. Tsirkin
@ 2017-09-26 19:25   ` Michael S. Tsirkin
  2017-09-28  0:55     ` Willem de Bruijn
  2 siblings, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-26 19:25 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization

On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
> This patch implements basic batched processing of tx virtqueue by
> prefetching desc indices and updating used ring in a batch. For
> non-zerocopy case, vq->heads were used for storing the prefetched
> indices and updating used ring. It is also a requirement for doing
> more batching on top. For zerocopy case and for simplicity, batched
> processing were simply disabled by only fetching and processing one
> descriptor at a time, this could be optimized in the future.
> 
> XDP_DROP (without touching skb) on tun (with Moongen in guest) with
> zerocopy disabled:
> 
> Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
> Before: 3.20Mpps
> After:  3.90Mpps (+22%)
> 
> No differences were seen with zerocopy enabled.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

So where is the speedup coming from? I'd guess the ring is
hot in cache, so it's faster to access it in one go and then
pass many packets to the net stack. Is that right?

Another possibility is better code cache locality.

So how about refactoring this patchset as follows:

1. use existing APIs: first get the packets, then transmit them
   all, then mark them all used
2. add new APIs and move the loop into vhost core
   for more speedups



> ---
>  drivers/vhost/net.c   | 215 ++++++++++++++++++++++++++++----------------------
>  drivers/vhost/vhost.c |   2 +-
>  2 files changed, 121 insertions(+), 96 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c89640e..c439892 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -408,27 +408,25 @@ static int vhost_net_enable_vq(struct vhost_net *n,
>  	return vhost_poll_start(poll, sock->file);
>  }
>  
> -static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> -				    struct vhost_virtqueue *vq,
> -				    struct iovec iov[], unsigned int iov_size,
> -				    unsigned int *out_num, unsigned int *in_num)
> +static bool vhost_net_tx_avail(struct vhost_net *net,
> +			       struct vhost_virtqueue *vq)
>  {
>  	unsigned long uninitialized_var(endtime);
> -	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> -				  out_num, in_num, NULL, NULL);
>  
> -	if (r == vq->num && vq->busyloop_timeout) {
> -		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> -		while (vhost_can_busy_poll(vq->dev, endtime) &&
> -		       vhost_vq_avail_empty(vq->dev, vq))
> -			cpu_relax();
> -		preempt_enable();
> -		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> -				      out_num, in_num, NULL, NULL);
> -	}
> +	if (!vq->busyloop_timeout)
> +		return false;
>  
> -	return r;
> +	if (!vhost_vq_avail_empty(vq->dev, vq))
> +		return true;
> +
> +	preempt_disable();
> +	endtime = busy_clock() + vq->busyloop_timeout;
> +	while (vhost_can_busy_poll(vq->dev, endtime) &&
> +		vhost_vq_avail_empty(vq->dev, vq))
> +		cpu_relax();
> +	preempt_enable();
> +
> +	return !vhost_vq_avail_empty(vq->dev, vq);
>  }
>  
>  static bool vhost_exceeds_maxpend(struct vhost_net *net)
> @@ -446,8 +444,9 @@ static void handle_tx(struct vhost_net *net)
>  {
>  	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>  	struct vhost_virtqueue *vq = &nvq->vq;
> +	struct vring_used_elem used, *heads = vq->heads;
>  	unsigned out, in;
> -	int head;
> +	int avails, head;
>  	struct msghdr msg = {
>  		.msg_name = NULL,
>  		.msg_namelen = 0,
> @@ -461,6 +460,7 @@ static void handle_tx(struct vhost_net *net)
>  	struct socket *sock;
>  	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
>  	bool zcopy, zcopy_used;
> +	int i, batched = VHOST_NET_BATCH;
>  
>  	mutex_lock(&vq->mutex);
>  	sock = vq->private_data;
> @@ -475,6 +475,12 @@ static void handle_tx(struct vhost_net *net)
>  	hdr_size = nvq->vhost_hlen;
>  	zcopy = nvq->ubufs;
>  
> +	/* Disable zerocopy batched fetching for simplicity */
> +	if (zcopy) {
> +		heads = &used;
> +		batched = 1;
> +	}
> +
>  	for (;;) {
>  		/* Release DMAs done buffers first */
>  		if (zcopy)
> @@ -486,95 +492,114 @@ static void handle_tx(struct vhost_net *net)
>  		if (unlikely(vhost_exceeds_maxpend(net)))
>  			break;
>  
> -		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
> -						ARRAY_SIZE(vq->iov),
> -						&out, &in);
> +		avails = vhost_prefetch_desc_indices(vq, heads, batched, !zcopy);
>  		/* On error, stop handling until the next kick. */
> -		if (unlikely(head < 0))
> +		if (unlikely(avails < 0))
>  			break;
> -		/* Nothing new?  Wait for eventfd to tell us they refilled. */
> -		if (head == vq->num) {
> +		/* Nothing new?  Busy poll for a while or wait for
> +		 * eventfd to tell us they refilled. */
> +		if (!avails) {
> +			if (vhost_net_tx_avail(net, vq))
> +				continue;
>  			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>  				vhost_disable_notify(&net->dev, vq);
>  				continue;
>  			}
>  			break;
>  		}
> -		if (in) {
> -			vq_err(vq, "Unexpected descriptor format for TX: "
> -			       "out %d, int %d\n", out, in);
> -			break;
> -		}
> -		/* Skip header. TODO: support TSO. */
> -		len = iov_length(vq->iov, out);
> -		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
> -		iov_iter_advance(&msg.msg_iter, hdr_size);
> -		/* Sanity check */
> -		if (!msg_data_left(&msg)) {
> -			vq_err(vq, "Unexpected header len for TX: "
> -			       "%zd expected %zd\n",
> -			       len, hdr_size);
> -			break;
> -		}
> -		len = msg_data_left(&msg);
> -
> -		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
> -				   && (nvq->upend_idx + 1) % UIO_MAXIOV !=
> -				      nvq->done_idx
> -				   && vhost_net_tx_select_zcopy(net);
> -
> -		/* use msg_control to pass vhost zerocopy ubuf info to skb */
> -		if (zcopy_used) {
> -			struct ubuf_info *ubuf;
> -			ubuf = nvq->ubuf_info + nvq->upend_idx;
> -
> -			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
> -			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
> -			ubuf->callback = vhost_zerocopy_callback;
> -			ubuf->ctx = nvq->ubufs;
> -			ubuf->desc = nvq->upend_idx;
> -			refcount_set(&ubuf->refcnt, 1);
> -			msg.msg_control = ubuf;
> -			msg.msg_controllen = sizeof(ubuf);
> -			ubufs = nvq->ubufs;
> -			atomic_inc(&ubufs->refcount);
> -			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
> -		} else {
> -			msg.msg_control = NULL;
> -			ubufs = NULL;
> -		}
> +		for (i = 0; i < avails; i++) {
> +			head = __vhost_get_vq_desc(vq, vq->iov,
> +						   ARRAY_SIZE(vq->iov),
> +						   &out, &in, NULL, NULL,
> +					       vhost16_to_cpu(vq, heads[i].id));
> +			if (in) {
> +				vq_err(vq, "Unexpected descriptor format for "
> +					   "TX: out %d, int %d\n", out, in);
> +				goto out;
> +			}
>  
> -		total_len += len;
> -		if (total_len < VHOST_NET_WEIGHT &&
> -		    !vhost_vq_avail_empty(&net->dev, vq) &&
> -		    likely(!vhost_exceeds_maxpend(net))) {
> -			msg.msg_flags |= MSG_MORE;
> -		} else {
> -			msg.msg_flags &= ~MSG_MORE;
> -		}
> +			/* Skip header. TODO: support TSO. */
> +			len = iov_length(vq->iov, out);
> +			iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
> +			iov_iter_advance(&msg.msg_iter, hdr_size);
> +			/* Sanity check */
> +			if (!msg_data_left(&msg)) {
> +				vq_err(vq, "Unexpected header len for TX: "
> +					"%zd expected %zd\n",
> +					len, hdr_size);
> +				goto out;
> +			}
> +			len = msg_data_left(&msg);
>  
> -		/* TODO: Check specific error and bomb out unless ENOBUFS? */
> -		err = sock->ops->sendmsg(sock, &msg, len);
> -		if (unlikely(err < 0)) {
> +			zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
> +				     && (nvq->upend_idx + 1) % UIO_MAXIOV !=
> +					nvq->done_idx
> +				     && vhost_net_tx_select_zcopy(net);
> +
> +			/* use msg_control to pass vhost zerocopy ubuf
> +			 * info to skb
> +			 */
>  			if (zcopy_used) {
> -				vhost_net_ubuf_put(ubufs);
> -				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
> -					% UIO_MAXIOV;
> +				struct ubuf_info *ubuf;
> +				ubuf = nvq->ubuf_info + nvq->upend_idx;
> +
> +				vq->heads[nvq->upend_idx].id =
> +					cpu_to_vhost32(vq, head);
> +				vq->heads[nvq->upend_idx].len =
> +					VHOST_DMA_IN_PROGRESS;
> +				ubuf->callback = vhost_zerocopy_callback;
> +				ubuf->ctx = nvq->ubufs;
> +				ubuf->desc = nvq->upend_idx;
> +				refcount_set(&ubuf->refcnt, 1);
> +				msg.msg_control = ubuf;
> +				msg.msg_controllen = sizeof(ubuf);
> +				ubufs = nvq->ubufs;
> +				atomic_inc(&ubufs->refcount);
> +				nvq->upend_idx =
> +					(nvq->upend_idx + 1) % UIO_MAXIOV;
> +			} else {
> +				msg.msg_control = NULL;
> +				ubufs = NULL;
> +			}
> +
> +			total_len += len;
> +			if (total_len < VHOST_NET_WEIGHT &&
> +				!vhost_vq_avail_empty(&net->dev, vq) &&
> +				likely(!vhost_exceeds_maxpend(net))) {
> +				msg.msg_flags |= MSG_MORE;
> +			} else {
> +				msg.msg_flags &= ~MSG_MORE;
> +			}
> +
> +			/* TODO: Check specific error and bomb out
> +			 * unless ENOBUFS?
> +			 */
> +			err = sock->ops->sendmsg(sock, &msg, len);
> +			if (unlikely(err < 0)) {
> +				if (zcopy_used) {
> +					vhost_net_ubuf_put(ubufs);
> +					nvq->upend_idx =
> +				   ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV;
> +				}
> +				vhost_discard_vq_desc(vq, 1);
> +				goto out;
> +			}
> +			if (err != len)
> +				pr_debug("Truncated TX packet: "
> +					" len %d != %zd\n", err, len);
> +			if (!zcopy) {
> +				vhost_add_used_idx(vq, 1);
> +				vhost_signal(&net->dev, vq);
> +			} else if (!zcopy_used) {
> +				vhost_add_used_and_signal(&net->dev,
> +							  vq, head, 0);
> +			} else
> +				vhost_zerocopy_signal_used(net, vq);
> +			vhost_net_tx_packet(net);
> +			if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> +				vhost_poll_queue(&vq->poll);
> +				goto out;
>  			}
> -			vhost_discard_vq_desc(vq, 1);
> -			break;
> -		}
> -		if (err != len)
> -			pr_debug("Truncated TX packet: "
> -				 " len %d != %zd\n", err, len);
> -		if (!zcopy_used)
> -			vhost_add_used_and_signal(&net->dev, vq, head, 0);
> -		else
> -			vhost_zerocopy_signal_used(net, vq);
> -		vhost_net_tx_packet(net);
> -		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> -			vhost_poll_queue(&vq->poll);
> -			break;
>  		}
>  	}
>  out:
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 6532cda..8764df5 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -392,7 +392,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
>  		vq->indirect = kmalloc(sizeof *vq->indirect * UIO_MAXIOV,
>  				       GFP_KERNEL);
>  		vq->log = kmalloc(sizeof *vq->log * UIO_MAXIOV, GFP_KERNEL);
> -		vq->heads = kmalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
> +		vq->heads = kzalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
>  		if (!vq->indirect || !vq->log || !vq->heads)
>  			goto err_nomem;
>  	}
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
  2017-09-22  8:02 [PATCH net-next RFC 0/5] batched tx processing in vhost_net Jason Wang
                   ` (10 preceding siblings ...)
  2017-09-26 19:26 ` Michael S. Tsirkin
@ 2017-09-26 19:26 ` Michael S. Tsirkin
  2017-09-27  2:06   ` Jason Wang
  2017-09-27  2:06   ` Jason Wang
  11 siblings, 2 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-26 19:26 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm

On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
> Hi:
> 
> This series tries to implement basic tx batched processing. This is
> done by prefetching descriptor indices and update used ring in a
> batch. This intends to speed up used ring updating and improve the
> cache utilization. Test shows about ~22% improvement in tx pps.




> Please review.
> 
> Jason Wang (5):
>   vhost: split out ring head fetching logic
>   vhost: introduce helper to prefetch desc index
>   vhost: introduce vhost_add_used_idx()

Please squash these new APIs into where they are used.
This split-up just makes review harder for me as
I can't figure out how the new APIs are used.


>   vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH

This is ok as a separate patch.

>   vhost_net: basic tx virtqueue batched processing
> 
>  drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
>  drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
>  drivers/vhost/vhost.h |   9 ++
>  3 files changed, 270 insertions(+), 125 deletions(-)
> 
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
  2017-09-22  8:02 [PATCH net-next RFC 0/5] batched tx processing in vhost_net Jason Wang
                   ` (9 preceding siblings ...)
  2017-09-26 13:45   ` Michael S. Tsirkin
@ 2017-09-26 19:26 ` Michael S. Tsirkin
  2017-09-26 19:26 ` Michael S. Tsirkin
  11 siblings, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-26 19:26 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization

On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
> Hi:
> 
> This series tries to implement basic tx batched processing. This is
> done by prefetching descriptor indices and update used ring in a
> batch. This intends to speed up used ring updating and improve the
> cache utilization. Test shows about ~22% improvement in tx pps.




> Please review.
> 
> Jason Wang (5):
>   vhost: split out ring head fetching logic
>   vhost: introduce helper to prefetch desc index
>   vhost: introduce vhost_add_used_idx()

Please squash these new APIs into where they are used.
This split-up just makes review harder for me as
I can't figure out how the new APIs are used.


>   vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH

This is ok as a separate patch.

>   vhost_net: basic tx virtqueue batched processing
> 
>  drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
>  drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
>  drivers/vhost/vhost.h |   9 ++
>  3 files changed, 270 insertions(+), 125 deletions(-)
> 
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
  2017-09-26 13:45   ` Michael S. Tsirkin
  (?)
@ 2017-09-27  0:27   ` Jason Wang
  2017-09-27 22:28       ` Michael S. Tsirkin
  -1 siblings, 1 reply; 69+ messages in thread
From: Jason Wang @ 2017-09-27  0:27 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm



On 2017年09月26日 21:45, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
>> Hi:
>>
>> This series tries to implement basic tx batched processing. This is
>> done by prefetching descriptor indices and update used ring in a
>> batch. This intends to speed up used ring updating and improve the
>> cache utilization.
> Interesting, thanks for the patches. So IIUC most of the gain is really
> overcoming some of the shortcomings of virtio 1.0 wrt cache utilization?

Yes.

Actually, looks like batching in 1.1 is not as easy as in 1.0.

In 1.0, we could do something like:

batch update used ring by user copy_to_user()
smp_wmb()
update used_idx

In 1.1, we need more memory barriers, so we probably can't benefit
from the fast copy helpers:

for () {
     update desc.addr
     smp_wmb()
     update desc.flag
}
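
In C terms, the 1.0 path above maps onto something like this (a rough
sketch only; ring wrap handling and error checks omitted):

	/* virtio 1.0: one bulk copy, one barrier, one index store */
	vhost_copy_to_user(vq, &vq->used->ring[last_used_idx],
			   heads, n * sizeof(*heads));
	smp_wmb();	/* entries must be visible before the index */
	vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx + n),
		       &vq->used->idx);

while on the 1.1 side (speculative, since the packed ring layout is
still being specified) each descriptor needs its own barrier before its
flags are flipped, so a single bulk copy no longer applies:

	for (i = 0; i < n; i++) {
		desc[i].addr  = bufs[i].addr;
		desc[i].len   = bufs[i].len;
		smp_wmb();	/* payload before flags */
		desc[i].flags = flags;
	}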

>
> Which is fair enough (1.0 is already deployed) but I would like to avoid
> making 1.1 support harder, and this patchset does this unfortunately,

I think the new APIs do not expose more of virtio's internal data
structures than before (vq->heads has already been used by vhost_net
for years). Considering the layout is re-designed completely, I don't
see an easy way to reuse the current 1.0 API for 1.1.

> see comments on individual patches. I'm sure it can be addressed though.
>
>> Test shows about ~22% improvement in tx pps.
> Is this with or without tx napi in guest?

MoonGen is used in guest for better numbers.

Thanks

>
>> Please review.
>>
>> Jason Wang (5):
>>    vhost: split out ring head fetching logic
>>    vhost: introduce helper to prefetch desc index
>>    vhost: introduce vhost_add_used_idx()
>>    vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
>>    vhost_net: basic tx virtqueue batched processing
>>
>>   drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
>>   drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
>>   drivers/vhost/vhost.h |   9 ++
>>   3 files changed, 270 insertions(+), 125 deletions(-)
>>
>> -- 
>> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
  2017-09-26 13:45   ` Michael S. Tsirkin
  (?)
  (?)
@ 2017-09-27  0:27   ` Jason Wang
  -1 siblings, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-27  0:27 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization



On 2017年09月26日 21:45, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
>> Hi:
>>
>> This series tries to implement basic tx batched processing. This is
>> done by prefetching descriptor indices and update used ring in a
>> batch. This intends to speed up used ring updating and improve the
>> cache utilization.
> Interesting, thanks for the patches. So IIUC most of the gain is really
> overcoming some of the shortcomings of virtio 1.0 wrt cache utilization?

Yes.

Actually, looks like batching in 1.1 is not as easy as in 1.0.

In 1.0, we could do something like:

batch update used ring by user copy_to_user()
smp_wmb()
update used_idx

In 1.1, we need more memory barriers, so we probably can't benefit
from the fast copy helpers:

for () {
     update desc.addr
     smp_wmb()
     update desc.flag
}

>
> Which is fair enough (1.0 is already deployed) but I would like to avoid
> making 1.1 support harder, and this patchset does this unfortunately,

I think the new APIs do not expose more of virtio's internal data
structures than before (vq->heads has already been used by vhost_net
for years). Considering the layout is re-designed completely, I don't
see an easy way to reuse the current 1.0 API for 1.1.

> see comments on individual patches. I'm sure it can be addressed though.
>
>> Test shows about ~22% improvement in tx pps.
> Is this with or without tx napi in guest?

MoonGen is used in guest for better numbers.

Thanks

>
>> Please review.
>>
>> Jason Wang (5):
>>    vhost: split out ring head fetching logic
>>    vhost: introduce helper to prefetch desc index
>>    vhost: introduce vhost_add_used_idx()
>>    vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
>>    vhost_net: basic tx virtqueue batched processing
>>
>>   drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
>>   drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
>>   drivers/vhost/vhost.h |   9 ++
>>   3 files changed, 270 insertions(+), 125 deletions(-)
>>
>> -- 
>> 2.7.4


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-26 19:19   ` Michael S. Tsirkin
@ 2017-09-27  0:35     ` Jason Wang
  2017-09-27 22:57         ` Michael S. Tsirkin
  2017-09-27  0:35     ` Jason Wang
  1 sibling, 1 reply; 69+ messages in thread
From: Jason Wang @ 2017-09-27  0:35 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm



On 2017年09月27日 03:19, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
>> This patch introduces vhost_prefetch_desc_indices() which could batch
>> descriptor indices fetching and used ring updating. This intends to
>> reduce the cache misses of indices fetching and updating and reduce
>> cache line bounce when virtqueue is almost full. copy_to_user() was
>> used in order to benefit from modern cpus that support fast string
>> copy. Batched virtqueue processing will be the first user.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>   drivers/vhost/vhost.h |  3 +++
>>   2 files changed, 58 insertions(+)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index f87ec75..8424166d 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>   
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update)
> why do you need to combine used update with prefetch?

For better performance, and I believe we don't care about the overhead 
when we hit errors in tx.

>
>> +{
>> +	int ret, ret2;
>> +	u16 last_avail_idx, last_used_idx, total, copied;
>> +	__virtio16 avail_idx;
>> +	struct vring_used_elem __user *used;
>> +	int i;
>> +
>> +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
>> +		vq_err(vq, "Failed to access avail idx at %p\n",
>> +		       &vq->avail->idx);
>> +		return -EFAULT;
>> +	}
>> +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
>> +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
>> +	total = vq->avail_idx - vq->last_avail_idx;
>> +	ret = total = min(total, num);
>> +
>> +	for (i = 0; i < ret; i++) {
>> +		ret2 = vhost_get_avail(vq, heads[i].id,
>> +				      &vq->avail->ring[last_avail_idx]);
>> +		if (unlikely(ret2)) {
>> +			vq_err(vq, "Failed to get descriptors\n");
>> +			return -EFAULT;
>> +		}
>> +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
>> +	}
>> +
>> +	if (!used_update)
>> +		return ret;
>> +
>> +	last_used_idx = vq->last_used_idx & (vq->num - 1);
>> +	while (total) {
>> +		copied = min((u16)(vq->num - last_used_idx), total);
>> +		ret2 = vhost_copy_to_user(vq,
>> +					  &vq->used->ring[last_used_idx],
>> +					  &heads[ret - total],
>> +					  copied * sizeof(*used));
>> +
>> +		if (unlikely(ret2)) {
>> +			vq_err(vq, "Failed to update used ring!\n");
>> +			return -EFAULT;
>> +		}
>> +
>> +		last_used_idx = 0;
>> +		total -= copied;
>> +	}
>> +
>> +	/* Only get avail ring entries after they have been exposed by guest. */
>> +	smp_rmb();
> Barrier before return is a very confusing API. I guess it's designed to
> be used in a specific way to make it necessary - but what is it?

Looks like a mistake, and we need to do this after reading avail_idx.

Thanks

>
>
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
>>   
>>   static int __init vhost_init(void)
>>   {
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index 39ff897..16c2cb6 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
>>   ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
>>   			     struct iov_iter *from);
>>   int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update);
>>   
>>   #define vq_err(vq, fmt, ...) do {                                  \
>>   		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
>> -- 
>> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-26 19:19   ` Michael S. Tsirkin
  2017-09-27  0:35     ` Jason Wang
@ 2017-09-27  0:35     ` Jason Wang
  1 sibling, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-27  0:35 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization



On 2017年09月27日 03:19, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
>> This patch introduces vhost_prefetch_desc_indices() which could batch
>> descriptor indices fetching and used ring updating. This intends to
>> reduce the cache misses of indices fetching and updating and reduce
>> cache line bounce when virtqueue is almost full. copy_to_user() was
>> used in order to benefit from modern cpus that support fast string
>> copy. Batched virtqueue processing will be the first user.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>   drivers/vhost/vhost.h |  3 +++
>>   2 files changed, 58 insertions(+)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index f87ec75..8424166d 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>   
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update)
> why do you need to combine used update with prefetch?

For better performance, and I believe we don't care about the overhead 
when we hit errors in tx.

>
>> +{
>> +	int ret, ret2;
>> +	u16 last_avail_idx, last_used_idx, total, copied;
>> +	__virtio16 avail_idx;
>> +	struct vring_used_elem __user *used;
>> +	int i;
>> +
>> +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
>> +		vq_err(vq, "Failed to access avail idx at %p\n",
>> +		       &vq->avail->idx);
>> +		return -EFAULT;
>> +	}
>> +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
>> +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
>> +	total = vq->avail_idx - vq->last_avail_idx;
>> +	ret = total = min(total, num);
>> +
>> +	for (i = 0; i < ret; i++) {
>> +		ret2 = vhost_get_avail(vq, heads[i].id,
>> +				      &vq->avail->ring[last_avail_idx]);
>> +		if (unlikely(ret2)) {
>> +			vq_err(vq, "Failed to get descriptors\n");
>> +			return -EFAULT;
>> +		}
>> +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
>> +	}
>> +
>> +	if (!used_update)
>> +		return ret;
>> +
>> +	last_used_idx = vq->last_used_idx & (vq->num - 1);
>> +	while (total) {
>> +		copied = min((u16)(vq->num - last_used_idx), total);
>> +		ret2 = vhost_copy_to_user(vq,
>> +					  &vq->used->ring[last_used_idx],
>> +					  &heads[ret - total],
>> +					  copied * sizeof(*used));
>> +
>> +		if (unlikely(ret2)) {
>> +			vq_err(vq, "Failed to update used ring!\n");
>> +			return -EFAULT;
>> +		}
>> +
>> +		last_used_idx = 0;
>> +		total -= copied;
>> +	}
>> +
>> +	/* Only get avail ring entries after they have been exposed by guest. */
>> +	smp_rmb();
> Barrier before return is a very confusing API. I guess it's designed to
> be used in a specific way to make it necessary - but what is it?

Looks like a mistake, and we need to do this after reading avail_idx.

Thanks

>
>
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
>>   
>>   static int __init vhost_init(void)
>>   {
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index 39ff897..16c2cb6 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
>>   ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
>>   			     struct iov_iter *from);
>>   int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update);
>>   
>>   #define vq_err(vq, fmt, ...) do {                                  \
>>   		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
>> -- 
>> 2.7.4

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
  2017-09-26 19:13   ` Michael S. Tsirkin
@ 2017-09-27  0:38     ` Jason Wang
  2017-09-27 22:58       ` Michael S. Tsirkin
  2017-09-27 22:58       ` Michael S. Tsirkin
  2017-09-27  0:38     ` Jason Wang
  1 sibling, 2 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-27  0:38 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm



On 2017年09月27日 03:13, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:33PM +0800, Jason Wang wrote:
>> This patch introduces a helper which just increase the used idx. This
>> will be used in pair with vhost_prefetch_desc_indices() by batching
>> code.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
>>   drivers/vhost/vhost.h |  1 +
>>   2 files changed, 34 insertions(+)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 8424166d..6532cda 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2178,6 +2178,39 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_add_used);
>>   
>> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n)
>> +{
>> +	u16 old, new;
>> +
>> +	old = vq->last_used_idx;
>> +	new = (vq->last_used_idx += n);
>> +	/* If the driver never bothers to signal in a very long while,
>> +	 * used index might wrap around. If that happens, invalidate
>> +	 * signalled_used index we stored. TODO: make sure driver
>> +	 * signals at least once in 2^16 and remove this.
>> +	 */
>> +	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
>> +		vq->signalled_used_valid = false;
>> +
>> +	/* Make sure buffer is written before we update index. */
>> +	smp_wmb();
>> +	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>> +			   &vq->used->idx)) {
>> +		vq_err(vq, "Failed to increment used idx");
>> +		return -EFAULT;
>> +	}
>> +	if (unlikely(vq->log_used)) {
>> +		/* Log used index update. */
>> +		log_write(vq->log_base,
>> +			  vq->log_addr + offsetof(struct vring_used, idx),
>> +			  sizeof(vq->used->idx));
>> +		if (vq->log_ctx)
>> +			eventfd_signal(vq->log_ctx, 1);
>> +	}
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(vhost_add_used_idx);
>> +
>>   static int __vhost_add_used_n(struct vhost_virtqueue *vq,
>>   			    struct vring_used_elem *heads,
>>   			    unsigned count)
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index 16c2cb6..5dd6c05 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>>   void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>>   
>>   int vhost_vq_init_access(struct vhost_virtqueue *);
>> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
>>   int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
>>   int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
>>   		     unsigned count);
> Please change the API to hide the fact that there's an index that needs
> to be updated.

In fact, an interesting optimization on top is to just call 
vhost_add_used_idx(vq, n) instead of calling vhost_add_used_idx(vq, 1) n 
times. That's the reason I left n in the API.
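
As a rough usage sketch on the caller side (done is however many used 
entries the caller has already flushed to the used ring):

	/* instead of publishing the index once per packet ... */
	for (i = 0; i < done; i++)
		vhost_add_used_idx(vq, 1);

	/* ... publish the whole batch with one write and one barrier */
	vhost_add_used_idx(vq, done);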

Thanks

>
>> -- 
>> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
  2017-09-26 19:13   ` Michael S. Tsirkin
  2017-09-27  0:38     ` Jason Wang
@ 2017-09-27  0:38     ` Jason Wang
  1 sibling, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-27  0:38 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization



On 2017年09月27日 03:13, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:33PM +0800, Jason Wang wrote:
>> This patch introduces a helper which just increase the used idx. This
>> will be used in pair with vhost_prefetch_desc_indices() by batching
>> code.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
>>   drivers/vhost/vhost.h |  1 +
>>   2 files changed, 34 insertions(+)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 8424166d..6532cda 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2178,6 +2178,39 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_add_used);
>>   
>> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n)
>> +{
>> +	u16 old, new;
>> +
>> +	old = vq->last_used_idx;
>> +	new = (vq->last_used_idx += n);
>> +	/* If the driver never bothers to signal in a very long while,
>> +	 * used index might wrap around. If that happens, invalidate
>> +	 * signalled_used index we stored. TODO: make sure driver
>> +	 * signals at least once in 2^16 and remove this.
>> +	 */
>> +	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
>> +		vq->signalled_used_valid = false;
>> +
>> +	/* Make sure buffer is written before we update index. */
>> +	smp_wmb();
>> +	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>> +			   &vq->used->idx)) {
>> +		vq_err(vq, "Failed to increment used idx");
>> +		return -EFAULT;
>> +	}
>> +	if (unlikely(vq->log_used)) {
>> +		/* Log used index update. */
>> +		log_write(vq->log_base,
>> +			  vq->log_addr + offsetof(struct vring_used, idx),
>> +			  sizeof(vq->used->idx));
>> +		if (vq->log_ctx)
>> +			eventfd_signal(vq->log_ctx, 1);
>> +	}
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(vhost_add_used_idx);
>> +
>>   static int __vhost_add_used_n(struct vhost_virtqueue *vq,
>>   			    struct vring_used_elem *heads,
>>   			    unsigned count)
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index 16c2cb6..5dd6c05 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>>   void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>>   
>>   int vhost_vq_init_access(struct vhost_virtqueue *);
>> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
>>   int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
>>   int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
>>   		     unsigned count);
> Please change the API to hide the fact that there's an index that needs
> to be updated.

In fact, an interesting optimization on top is to just call 
vhost_add_used_idx(vq, n) instead of calling vhost_add_used_idx(vq, 1) n 
times. That's the reason I left n in the API.

Thanks

>
>> -- 
>> 2.7.4

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
  2017-09-26 19:25   ` Michael S. Tsirkin
@ 2017-09-27  2:04     ` Jason Wang
  2017-09-27 22:19       ` Michael S. Tsirkin
  2017-09-27 22:19       ` Michael S. Tsirkin
  2017-09-27  2:04     ` Jason Wang
  1 sibling, 2 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-27  2:04 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm



On 2017年09月27日 03:25, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
>> This patch implements basic batched processing of tx virtqueue by
>> prefetching desc indices and updating used ring in a batch. For
>> non-zerocopy case, vq->heads were used for storing the prefetched
>> indices and updating used ring. It is also a requirement for doing
>> more batching on top. For zerocopy case and for simplicity, batched
>> processing were simply disabled by only fetching and processing one
>> descriptor at a time, this could be optimized in the future.
>>
>> XDP_DROP (without touching skb) on tun (with Moongen in guest) with
>> zercopy disabled:
>>
>> Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
>> Before: 3.20Mpps
>> After:  3.90Mpps (+22%)
>>
>> No differences were seen with zerocopy enabled.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> So where is the speedup coming from? I'd guess the ring is
> hot in cache, it's faster to access it in one go, then
> pass many packets to net stack. Is that right?
>
> Another possibility is better code cache locality.

Yes, I think the speedup comes from:

- fewer cache misses
- less cache line bounce when the virtqueue is about to be full (the guest 
is faster than the host, which is the case with MoonGen)
- fewer memory barriers
- possibly faster copy speed by using copy_to_user() on modern CPUs

>
> So how about this patchset is refactored:
>
> 1. use existing APIs just first get packets then
>     transmit them all then use them all

Looks like the current API cannot get the packets first; it only supports 
getting packets one by one (if you mean vhost_get_vq_desc()). And used ring 
updating may see more cache misses in this case.

> 2. add new APIs and move the loop into vhost core
>     for more speedups

I don't see any advantages; it looks like we would just need some 
callbacks (for example) in this case.

Thanks

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
  2017-09-26 19:25   ` Michael S. Tsirkin
  2017-09-27  2:04     ` Jason Wang
@ 2017-09-27  2:04     ` Jason Wang
  1 sibling, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-27  2:04 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization



On 2017年09月27日 03:25, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
>> This patch implements basic batched processing of tx virtqueue by
>> prefetching desc indices and updating used ring in a batch. For
>> non-zerocopy case, vq->heads were used for storing the prefetched
>> indices and updating used ring. It is also a requirement for doing
>> more batching on top. For zerocopy case and for simplicity, batched
>> processing were simply disabled by only fetching and processing one
>> descriptor at a time, this could be optimized in the future.
>>
>> XDP_DROP (without touching skb) on tun (with Moongen in guest) with
>> zercopy disabled:
>>
>> Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
>> Before: 3.20Mpps
>> After:  3.90Mpps (+22%)
>>
>> No differences were seen with zerocopy enabled.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> So where is the speedup coming from? I'd guess the ring is
> hot in cache, it's faster to access it in one go, then
> pass many packets to net stack. Is that right?
>
> Another possibility is better code cache locality.

Yes, I think the speedup comes from:

- fewer cache misses
- less cache line bounce when the virtqueue is about to be full (the guest 
is faster than the host, which is the case with MoonGen)
- fewer memory barriers
- possibly faster copy speed by using copy_to_user() on modern CPUs

>
> So how about this patchset is refactored:
>
> 1. use existing APIs just first get packets then
>     transmit them all then use them all

Looks like the current API cannot get the packets first; it only supports 
getting packets one by one (if you mean vhost_get_vq_desc()). And used ring 
updating may see more cache misses in this case.

> 2. add new APIs and move the loop into vhost core
>     for more speedups

I don't see any advantages; it looks like we would just need some 
callbacks (for example) in this case.

Thanks

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
  2017-09-26 19:26 ` Michael S. Tsirkin
@ 2017-09-27  2:06   ` Jason Wang
  2017-09-27  2:06   ` Jason Wang
  1 sibling, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-27  2:06 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm



On 2017年09月27日 03:26, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
>> Hi:
>>
>> This series tries to implement basic tx batched processing. This is
>> done by prefetching descriptor indices and update used ring in a
>> batch. This intends to speed up used ring updating and improve the
>> cache utilization. Test shows about ~22% improvement in tx pss.
>
>
>
>> Please review.
>>
>> Jason Wang (5):
>>    vhost: split out ring head fetching logic
>>    vhost: introduce helper to prefetch desc index
>>    vhost: introduce vhost_add_used_idx()
> Please squash these new APIs into where they are used.
> This split-up just makes review harder for me as
> I can't figure out how the new APIs are used.

Ok.

Thanks

>
>
>>    vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
> This is ok as a separate patch.
>
>>    vhost_net: basic tx virtqueue batched processing
>>
>>   drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
>>   drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
>>   drivers/vhost/vhost.h |   9 ++
>>   3 files changed, 270 insertions(+), 125 deletions(-)
>>
>> -- 
>> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
  2017-09-26 19:26 ` Michael S. Tsirkin
  2017-09-27  2:06   ` Jason Wang
@ 2017-09-27  2:06   ` Jason Wang
  1 sibling, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-27  2:06 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization



On 2017年09月27日 03:26, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
>> Hi:
>>
>> This series tries to implement basic tx batched processing. This is
>> done by prefetching descriptor indices and update used ring in a
>> batch. This intends to speed up used ring updating and improve the
>> cache utilization. Test shows about ~22% improvement in tx pss.
>
>
>
>> Please review.
>>
>> Jason Wang (5):
>>    vhost: split out ring head fetching logic
>>    vhost: introduce helper to prefetch desc index
>>    vhost: introduce vhost_add_used_idx()
> Please squash these new APIs into where they are used.
> This split-up just makes review harder for me as
> I can't figure out how the new APIs are used.

Ok.

Thanks

>
>
>>    vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
> This is ok as a separate patch.
>
>>    vhost_net: basic tx virtqueue batched processing
>>
>>   drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
>>   drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
>>   drivers/vhost/vhost.h |   9 ++
>>   3 files changed, 270 insertions(+), 125 deletions(-)
>>
>> -- 
>> 2.7.4

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
  2017-09-27  2:04     ` Jason Wang
@ 2017-09-27 22:19       ` Michael S. Tsirkin
  2017-09-28  7:02         ` Jason Wang
                           ` (3 more replies)
  2017-09-27 22:19       ` Michael S. Tsirkin
  1 sibling, 4 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-27 22:19 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm

On Wed, Sep 27, 2017 at 10:04:18AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月27日 03:25, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
> > > This patch implements basic batched processing of tx virtqueue by
> > > prefetching desc indices and updating used ring in a batch. For
> > > non-zerocopy case, vq->heads were used for storing the prefetched
> > > indices and updating used ring. It is also a requirement for doing
> > > more batching on top. For zerocopy case and for simplicity, batched
> > > processing were simply disabled by only fetching and processing one
> > > descriptor at a time, this could be optimized in the future.
> > > 
> > > XDP_DROP (without touching skb) on tun (with Moongen in guest) with
> > > zercopy disabled:
> > > 
> > > Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
> > > Before: 3.20Mpps
> > > After:  3.90Mpps (+22%)
> > > 
> > > No differences were seen with zerocopy enabled.
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > So where is the speedup coming from? I'd guess the ring is
> > hot in cache, it's faster to access it in one go, then
> > pass many packets to net stack. Is that right?
> > 
> > Another possibility is better code cache locality.
> 
> Yes, I think the speed up comes from:
> 
> - less cache misses
> - less cache line bounce when virtqueue is about to be full (guest is faster
> than host which is the case of MoonGen)
> - less memory barriers
> - possible faster copy speed by using copy_to_user() on modern CPUs
> 
> > 
> > So how about this patchset is refactored:
> > 
> > 1. use existing APIs just first get packets then
> >     transmit them all then use them all
> 
> Looks like current API can not get packets first, it only support get packet
> one by one (if you mean vhost_get_vq_desc()). And used ring updating may get
> more misses in this case.

Right. So if you do

for (...)
	vhost_get_vq_desc


then later

for (...)
	vhost_add_used


then you get most of the benefits except maybe code cache misses
and copy_to_user().
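
Roughly like this, as a sketch using only the existing APIs (the transmit
step, iov/out/in bookkeeping and error handling are elided; heads[] is a
local array and VHOST_NET_BATCH is the batch size from patch 4):

	/* phase 1: fetch a batch of descriptors one by one */
	for (i = 0; i < VHOST_NET_BATCH; i++) {
		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);
		if (head < 0 || head == vq->num)
			break;
		heads[i].id = cpu_to_vhost32(vq, head);
		heads[i].len = 0;
		/* build and send the i-th packet here, while iov is valid */
	}

	/* phase 2: complete the whole batch with one used ring update */
	vhost_add_used_n(vq, heads, i);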







> > 2. add new APIs and move the loop into vhost core
> >     for more speedups
> 
> I don't see any advantages, looks like just need some e.g callbacks in this
> case.
> 
> Thanks

IIUC callbacks pretty much destroy the code cache locality advantages;
the IP is jumping around too much.


-- 
MST

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
  2017-09-27  2:04     ` Jason Wang
  2017-09-27 22:19       ` Michael S. Tsirkin
@ 2017-09-27 22:19       ` Michael S. Tsirkin
  1 sibling, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-27 22:19 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization

On Wed, Sep 27, 2017 at 10:04:18AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月27日 03:25, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
> > > This patch implements basic batched processing of tx virtqueue by
> > > prefetching desc indices and updating used ring in a batch. For
> > > non-zerocopy case, vq->heads were used for storing the prefetched
> > > indices and updating used ring. It is also a requirement for doing
> > > more batching on top. For zerocopy case and for simplicity, batched
> > > processing were simply disabled by only fetching and processing one
> > > descriptor at a time, this could be optimized in the future.
> > > 
> > > XDP_DROP (without touching skb) on tun (with Moongen in guest) with
> > > zercopy disabled:
> > > 
> > > Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
> > > Before: 3.20Mpps
> > > After:  3.90Mpps (+22%)
> > > 
> > > No differences were seen with zerocopy enabled.
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > So where is the speedup coming from? I'd guess the ring is
> > hot in cache, it's faster to access it in one go, then
> > pass many packets to net stack. Is that right?
> > 
> > Another possibility is better code cache locality.
> 
> Yes, I think the speed up comes from:
> 
> - less cache misses
> - less cache line bounce when virtqueue is about to be full (guest is faster
> than host which is the case of MoonGen)
> - less memory barriers
> - possible faster copy speed by using copy_to_user() on modern CPUs
> 
> > 
> > So how about this patchset is refactored:
> > 
> > 1. use existing APIs just first get packets then
> >     transmit them all then use them all
> 
> Looks like current API can not get packets first, it only support get packet
> one by one (if you mean vhost_get_vq_desc()). And used ring updating may get
> more misses in this case.

Right. So if you do

for (...)
	vhost_get_vq_desc


then later

for (...)
	vhost_add_used


then you get most of the benefits except maybe code cache misses
and copy_to_user().







> > 2. add new APIs and move the loop into vhost core
> >     for more speedups
> 
> I don't see any advantages, looks like just need some e.g callbacks in this
> case.
> 
> Thanks

IIUC callbacks pretty much destroy the code cache locality advantages;
the IP is jumping around too much.


-- 
MST
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
  2017-09-27  0:27   ` Jason Wang
@ 2017-09-27 22:28       ` Michael S. Tsirkin
  0 siblings, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-27 22:28 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm

On Wed, Sep 27, 2017 at 08:27:37AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月26日 21:45, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
> > > Hi:
> > > 
> > > This series tries to implement basic tx batched processing. This is
> > > done by prefetching descriptor indices and update used ring in a
> > > batch. This intends to speed up used ring updating and improve the
> > > cache utilization.
> > Interesting, thanks for the patches. So IIUC most of the gain is really
> > overcoming some of the shortcomings of virtio 1.0 wrt cache utilization?
> 
> Yes.
> 
> Actually, looks like batching in 1.1 is not as easy as in 1.0.
> 
> In 1.0, we could do something like:
> 
> batch update used ring by user copy_to_user()
> smp_wmb()
> update used_idx
> In 1.1, we need more memory barriers, can't benefit from fast copy helpers?
> 
> for () {
>     update desc.addr
>     smp_wmb()
>     update desc.flag
> }

Yes, but smp_wmb() is a NOP on e.g. x86. We can switch to other types of
barriers as well. We do need to do the updates in order, so we might
need new APIs for that to avoid re-doing the translation all the time.

In 1.0 the last update is always a cache miss; you need batching to get
fewer misses. In 1.1 you don't have it, so fundamentally there is less
need for batching. But batching does not always work anyway. The DPDK guys
(who batch things aggressively) already tried 1.1 and saw performance gains,
so we do not need to argue theoretically.



> > 
> > Which is fair enough (1.0 is already deployed) but I would like to avoid
> > making 1.1 support harder, and this patchset does this unfortunately,
> 
> I think the new APIs do not expose more internal data structure of virtio
> than before? (vq->heads has already been used by vhost_net for years).

For sure we might need to change vring_used_elem.

> Consider the layout is re-designed completely, I don't see an easy method to
> reuse current 1.0 API for 1.1.

The current API just says you get buffers and then you use them. It is not
tied to an actual separate used ring.


> > see comments on individual patches. I'm sure it can be addressed though.
> > 
> > > Test shows about ~22% improvement in tx pss.
> > Is this with or without tx napi in guest?
> 
> MoonGen is used in guest for better numbers.
> 
> Thanks

Not sure I understand. Did you set napi_tx to true or false?

> > 
> > > Please review.
> > > 
> > > Jason Wang (5):
> > >    vhost: split out ring head fetching logic
> > >    vhost: introduce helper to prefetch desc index
> > >    vhost: introduce vhost_add_used_idx()
> > >    vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
> > >    vhost_net: basic tx virtqueue batched processing
> > > 
> > >   drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
> > >   drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
> > >   drivers/vhost/vhost.h |   9 ++
> > >   3 files changed, 270 insertions(+), 125 deletions(-)
> > > 
> > > -- 
> > > 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
@ 2017-09-27 22:28       ` Michael S. Tsirkin
  0 siblings, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-27 22:28 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization

On Wed, Sep 27, 2017 at 08:27:37AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月26日 21:45, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
> > > Hi:
> > > 
> > > This series tries to implement basic tx batched processing. This is
> > > done by prefetching descriptor indices and update used ring in a
> > > batch. This intends to speed up used ring updating and improve the
> > > cache utilization.
> > Interesting, thanks for the patches. So IIUC most of the gain is really
> > overcoming some of the shortcomings of virtio 1.0 wrt cache utilization?
> 
> Yes.
> 
> Actually, looks like batching in 1.1 is not as easy as in 1.0.
> 
> In 1.0, we could do something like:
> 
> batch update used ring by user copy_to_user()
> smp_wmb()
> update used_idx
> In 1.1, we need more memory barriers, can't benefit from fast copy helpers?
> 
> for () {
>     update desc.addr
>     smp_wmb()
>     update desc.flag
> }

Yes, but smp_wmb() is a NOP on e.g. x86. We can switch to other types of
barriers as well. We do need to do the updates in order, so we might
need new APIs for that to avoid re-doing the translation all the time.

In 1.0 the last update is always a cache miss; you need batching to get
fewer misses. In 1.1 you don't have it, so fundamentally there is less
need for batching. But batching does not always work anyway. The DPDK guys
(who batch things aggressively) already tried 1.1 and saw performance gains,
so we do not need to argue theoretically.



> > 
> > Which is fair enough (1.0 is already deployed) but I would like to avoid
> > making 1.1 support harder, and this patchset does this unfortunately,
> 
> I think the new APIs do not expose more internal data structure of virtio
> than before? (vq->heads has already been used by vhost_net for years).

For sure we might need to change vring_used_elem.

> Consider the layout is re-designed completely, I don't see an easy method to
> reuse current 1.0 API for 1.1.

The current API just says you get buffers and then you use them. It is not
tied to an actual separate used ring.


> > see comments on individual patches. I'm sure it can be addressed though.
> > 
> > > Test shows about ~22% improvement in tx pss.
> > Is this with or without tx napi in guest?
> 
> MoonGen is used in guest for better numbers.
> 
> Thanks

Not sure I understand. Did you set napi_tx to true or false?

> > 
> > > Please review.
> > > 
> > > Jason Wang (5):
> > >    vhost: split out ring head fetching logic
> > >    vhost: introduce helper to prefetch desc index
> > >    vhost: introduce vhost_add_used_idx()
> > >    vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
> > >    vhost_net: basic tx virtqueue batched processing
> > > 
> > >   drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
> > >   drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
> > >   drivers/vhost/vhost.h |   9 ++
> > >   3 files changed, 270 insertions(+), 125 deletions(-)
> > > 
> > > -- 
> > > 2.7.4
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-27  0:35     ` Jason Wang
@ 2017-09-27 22:57         ` Michael S. Tsirkin
  0 siblings, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-27 22:57 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm

On Wed, Sep 27, 2017 at 08:35:47AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月27日 03:19, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
> > > This patch introduces vhost_prefetch_desc_indices() which could batch
> > > descriptor indices fetching and used ring updating. This intends to
> > > reduce the cache misses of indices fetching and updating and reduce
> > > cache line bounce when virtqueue is almost full. copy_to_user() was
> > > used in order to benefit from modern cpus that support fast string
> > > copy. Batched virtqueue processing will be the first user.
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > >   drivers/vhost/vhost.h |  3 +++
> > >   2 files changed, 58 insertions(+)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index f87ec75..8424166d 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
> > >   }
> > >   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
> > > +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> > > +				struct vring_used_elem *heads,
> > > +				u16 num, bool used_update)
> > why do you need to combine used update with prefetch?
> 
> For better performance


Why is sticking a branch in there better than requesting the update
conditionally from the caller?



> and I believe we don't care about the overhead when
> we meet errors in tx.

That's a separate question. I do not really understand how
you can fetch a descriptor and update the used ring at the same
time; this allows the guest to overwrite the buffer.
I might be misunderstanding what is going on here, though.


> > 
> > > +{
> > > +	int ret, ret2;
> > > +	u16 last_avail_idx, last_used_idx, total, copied;
> > > +	__virtio16 avail_idx;
> > > +	struct vring_used_elem __user *used;
> > > +	int i;
> > > +
> > > +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
> > > +		vq_err(vq, "Failed to access avail idx at %p\n",
> > > +		       &vq->avail->idx);
> > > +		return -EFAULT;
> > > +	}
> > > +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
> > > +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> > > +	total = vq->avail_idx - vq->last_avail_idx;
> > > +	ret = total = min(total, num);
> > > +
> > > +	for (i = 0; i < ret; i++) {
> > > +		ret2 = vhost_get_avail(vq, heads[i].id,
> > > +				      &vq->avail->ring[last_avail_idx]);
> > > +		if (unlikely(ret2)) {
> > > +			vq_err(vq, "Failed to get descriptors\n");
> > > +			return -EFAULT;
> > > +		}
> > > +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
> > > +	}
> > > +
> > > +	if (!used_update)
> > > +		return ret;
> > > +
> > > +	last_used_idx = vq->last_used_idx & (vq->num - 1);
> > > +	while (total) {
> > > +		copied = min((u16)(vq->num - last_used_idx), total);
> > > +		ret2 = vhost_copy_to_user(vq,
> > > +					  &vq->used->ring[last_used_idx],
> > > +					  &heads[ret - total],
> > > +					  copied * sizeof(*used));
> > > +
> > > +		if (unlikely(ret2)) {
> > > +			vq_err(vq, "Failed to update used ring!\n");
> > > +			return -EFAULT;
> > > +		}
> > > +
> > > +		last_used_idx = 0;
> > > +		total -= copied;
> > > +	}
> > > +
> > > +	/* Only get avail ring entries after they have been exposed by guest. */
> > > +	smp_rmb();
> > Barrier before return is a very confusing API. I guess it's designed to
> > be used in a specific way to make it necessary - but what is it?
> 
> Looks like a and we need do this after reading avail_idx.
> 
> Thanks
> 
> > 
> > 
> > > +	return ret;
> > > +}
> > > +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
> > >   static int __init vhost_init(void)
> > >   {
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index 39ff897..16c2cb6 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
> > >   ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
> > >   			     struct iov_iter *from);
> > >   int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
> > > +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> > > +				struct vring_used_elem *heads,
> > > +				u16 num, bool used_update);
> > >   #define vq_err(vq, fmt, ...) do {                                  \
> > >   		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> > > -- 
> > > 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
@ 2017-09-27 22:57         ` Michael S. Tsirkin
  0 siblings, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-27 22:57 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization

On Wed, Sep 27, 2017 at 08:35:47AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月27日 03:19, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
> > > This patch introduces vhost_prefetch_desc_indices() which could batch
> > > descriptor indices fetching and used ring updating. This intends to
> > > reduce the cache misses of indices fetching and updating and reduce
> > > cache line bounce when virtqueue is almost full. copy_to_user() was
> > > used in order to benefit from modern cpus that support fast string
> > > copy. Batched virtqueue processing will be the first user.
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > >   drivers/vhost/vhost.h |  3 +++
> > >   2 files changed, 58 insertions(+)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index f87ec75..8424166d 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
> > >   }
> > >   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
> > > +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> > > +				struct vring_used_elem *heads,
> > > +				u16 num, bool used_update)
> > why do you need to combine used update with prefetch?
> 
> For better performance


Why is sticking a branch in there better than requesting the update
conditionally from the caller?



> and I believe we don't care about the overhead when
> we meet errors in tx.

That's a separate question. I do not really understand how
you can fetch a descriptor and update the used ring at the same
time; this allows the guest to overwrite the buffer.
I might be misunderstanding what is going on here, though.


> > 
> > > +{
> > > +	int ret, ret2;
> > > +	u16 last_avail_idx, last_used_idx, total, copied;
> > > +	__virtio16 avail_idx;
> > > +	struct vring_used_elem __user *used;
> > > +	int i;
> > > +
> > > +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
> > > +		vq_err(vq, "Failed to access avail idx at %p\n",
> > > +		       &vq->avail->idx);
> > > +		return -EFAULT;
> > > +	}
> > > +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
> > > +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> > > +	total = vq->avail_idx - vq->last_avail_idx;
> > > +	ret = total = min(total, num);
> > > +
> > > +	for (i = 0; i < ret; i++) {
> > > +		ret2 = vhost_get_avail(vq, heads[i].id,
> > > +				      &vq->avail->ring[last_avail_idx]);
> > > +		if (unlikely(ret2)) {
> > > +			vq_err(vq, "Failed to get descriptors\n");
> > > +			return -EFAULT;
> > > +		}
> > > +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
> > > +	}
> > > +
> > > +	if (!used_update)
> > > +		return ret;
> > > +
> > > +	last_used_idx = vq->last_used_idx & (vq->num - 1);
> > > +	while (total) {
> > > +		copied = min((u16)(vq->num - last_used_idx), total);
> > > +		ret2 = vhost_copy_to_user(vq,
> > > +					  &vq->used->ring[last_used_idx],
> > > +					  &heads[ret - total],
> > > +					  copied * sizeof(*used));
> > > +
> > > +		if (unlikely(ret2)) {
> > > +			vq_err(vq, "Failed to update used ring!\n");
> > > +			return -EFAULT;
> > > +		}
> > > +
> > > +		last_used_idx = 0;
> > > +		total -= copied;
> > > +	}
> > > +
> > > +	/* Only get avail ring entries after they have been exposed by guest. */
> > > +	smp_rmb();
> > Barrier before return is a very confusing API. I guess it's designed to
> > be used in a specific way to make it necessary - but what is it?
> 
> Looks like a and we need do this after reading avail_idx.
> 
> Thanks
> 
> > 
> > 
> > > +	return ret;
> > > +}
> > > +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
> > >   static int __init vhost_init(void)
> > >   {
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index 39ff897..16c2cb6 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
> > >   ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
> > >   			     struct iov_iter *from);
> > >   int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
> > > +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> > > +				struct vring_used_elem *heads,
> > > +				u16 num, bool used_update);
> > >   #define vq_err(vq, fmt, ...) do {                                  \
> > >   		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> > > -- 
> > > 2.7.4
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
  2017-09-27  0:38     ` Jason Wang
  2017-09-27 22:58       ` Michael S. Tsirkin
@ 2017-09-27 22:58       ` Michael S. Tsirkin
  2017-09-28  0:59         ` Willem de Bruijn
                           ` (3 more replies)
  1 sibling, 4 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-27 22:58 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm

On Wed, Sep 27, 2017 at 08:38:24AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月27日 03:13, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:33PM +0800, Jason Wang wrote:
> > > This patch introduces a helper which just increase the used idx. This
> > > will be used in pair with vhost_prefetch_desc_indices() by batching
> > > code.
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
> > >   drivers/vhost/vhost.h |  1 +
> > >   2 files changed, 34 insertions(+)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index 8424166d..6532cda 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -2178,6 +2178,39 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
> > >   }
> > >   EXPORT_SYMBOL_GPL(vhost_add_used);
> > > +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n)
> > > +{
> > > +	u16 old, new;
> > > +
> > > +	old = vq->last_used_idx;
> > > +	new = (vq->last_used_idx += n);
> > > +	/* If the driver never bothers to signal in a very long while,
> > > +	 * used index might wrap around. If that happens, invalidate
> > > +	 * signalled_used index we stored. TODO: make sure driver
> > > +	 * signals at least once in 2^16 and remove this.
> > > +	 */
> > > +	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
> > > +		vq->signalled_used_valid = false;
> > > +
> > > +	/* Make sure buffer is written before we update index. */
> > > +	smp_wmb();
> > > +	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
> > > +			   &vq->used->idx)) {
> > > +		vq_err(vq, "Failed to increment used idx");
> > > +		return -EFAULT;
> > > +	}
> > > +	if (unlikely(vq->log_used)) {
> > > +		/* Log used index update. */
> > > +		log_write(vq->log_base,
> > > +			  vq->log_addr + offsetof(struct vring_used, idx),
> > > +			  sizeof(vq->used->idx));
> > > +		if (vq->log_ctx)
> > > +			eventfd_signal(vq->log_ctx, 1);
> > > +	}
> > > +	return 0;
> > > +}
> > > +EXPORT_SYMBOL_GPL(vhost_add_used_idx);
> > > +
> > >   static int __vhost_add_used_n(struct vhost_virtqueue *vq,
> > >   			    struct vring_used_elem *heads,
> > >   			    unsigned count)
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index 16c2cb6..5dd6c05 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > >   void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
> > >   int vhost_vq_init_access(struct vhost_virtqueue *);
> > > +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
> > >   int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> > >   int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
> > >   		     unsigned count);
> > Please change the API to hide the fact that there's an index that needs
> > to be updated.
> 
> In fact, an interesting optimization on top is just call
> vhost_add_used_idx(vq, n) instead of n vhost_add_used_idx(vq, 1). That's the
> reason I leave n in the API.
> 
> Thanks

Right, but you could increment some internal counter in the vq
structure and then update the used index using some API
with a generic name, e.g. add_used_complete() or something like this.
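
Something along these lines, perhaps (purely illustrative; neither the
pending_used field nor these function names exist today):

	/* record completions locally, without touching the used ring yet */
	static inline void vhost_add_used_pending(struct vhost_virtqueue *vq, int n)
	{
		vq->pending_used += n;
	}

	/* publish everything recorded so far; callers never see the index */
	static int vhost_add_used_complete(struct vhost_virtqueue *vq)
	{
		int n = vq->pending_used;

		vq->pending_used = 0;
		return vhost_add_used_idx(vq, n);	/* helper from patch 3 */
	}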

> > 
> > > -- 
> > > 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
  2017-09-27  0:38     ` Jason Wang
@ 2017-09-27 22:58       ` Michael S. Tsirkin
  2017-09-27 22:58       ` Michael S. Tsirkin
  1 sibling, 0 replies; 69+ messages in thread
From: Michael S. Tsirkin @ 2017-09-27 22:58 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization

On Wed, Sep 27, 2017 at 08:38:24AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月27日 03:13, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:33PM +0800, Jason Wang wrote:
> > > This patch introduces a helper which just increase the used idx. This
> > > will be used in pair with vhost_prefetch_desc_indices() by batching
> > > code.
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
> > >   drivers/vhost/vhost.h |  1 +
> > >   2 files changed, 34 insertions(+)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index 8424166d..6532cda 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -2178,6 +2178,39 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
> > >   }
> > >   EXPORT_SYMBOL_GPL(vhost_add_used);
> > > +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n)
> > > +{
> > > +	u16 old, new;
> > > +
> > > +	old = vq->last_used_idx;
> > > +	new = (vq->last_used_idx += n);
> > > +	/* If the driver never bothers to signal in a very long while,
> > > +	 * used index might wrap around. If that happens, invalidate
> > > +	 * signalled_used index we stored. TODO: make sure driver
> > > +	 * signals at least once in 2^16 and remove this.
> > > +	 */
> > > +	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
> > > +		vq->signalled_used_valid = false;
> > > +
> > > +	/* Make sure buffer is written before we update index. */
> > > +	smp_wmb();
> > > +	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
> > > +			   &vq->used->idx)) {
> > > +		vq_err(vq, "Failed to increment used idx");
> > > +		return -EFAULT;
> > > +	}
> > > +	if (unlikely(vq->log_used)) {
> > > +		/* Log used index update. */
> > > +		log_write(vq->log_base,
> > > +			  vq->log_addr + offsetof(struct vring_used, idx),
> > > +			  sizeof(vq->used->idx));
> > > +		if (vq->log_ctx)
> > > +			eventfd_signal(vq->log_ctx, 1);
> > > +	}
> > > +	return 0;
> > > +}
> > > +EXPORT_SYMBOL_GPL(vhost_add_used_idx);
> > > +
> > >   static int __vhost_add_used_n(struct vhost_virtqueue *vq,
> > >   			    struct vring_used_elem *heads,
> > >   			    unsigned count)
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index 16c2cb6..5dd6c05 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > >   void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
> > >   int vhost_vq_init_access(struct vhost_virtqueue *);
> > > +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
> > >   int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> > >   int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
> > >   		     unsigned count);
> > Please change the API to hide the fact that there's an index that needs
> > to be updated.
> 
> In fact, an interesting optimization on top is just call
> vhost_add_used_idx(vq, n) instead of n vhost_add_used_idx(vq, 1). That's the
> reason I leave n in the API.
> 
> Thanks

Right, but you could increment some internal counter in the vq
structure and then update the used index using some API
with a generic name, e.g. add_used_complete() or something like this.

> > 
> > > -- 
> > > 2.7.4
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-22  8:02 ` [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index Jason Wang
                     ` (3 preceding siblings ...)
  2017-09-28  0:47   ` Willem de Bruijn
@ 2017-09-28  0:47   ` Willem de Bruijn
  2017-09-28  7:44     ` Jason Wang
  2017-09-28  7:44     ` Jason Wang
  4 siblings, 2 replies; 69+ messages in thread
From: Willem de Bruijn @ 2017-09-28  0:47 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, virtualization, Network Development, LKML, kvm

On Fri, Sep 22, 2017 at 4:02 AM, Jason Wang <jasowang@redhat.com> wrote:
> This patch introduces vhost_prefetch_desc_indices() which could batch
> descriptor indices fetching and used ring updating. This intends to
> reduce the cache misses of indices fetching and updating and reduce
> cache line bounce when virtqueue is almost full. copy_to_user() was
> used in order to benefit from modern cpus that support fast string
> copy. Batched virtqueue processing will be the first user.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  3 +++
>  2 files changed, 58 insertions(+)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f87ec75..8424166d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>  }
>  EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +                               struct vring_used_elem *heads,
> +                               u16 num, bool used_update)
> +{
> +       int ret, ret2;
> +       u16 last_avail_idx, last_used_idx, total, copied;
> +       __virtio16 avail_idx;
> +       struct vring_used_elem __user *used;
> +       int i;
> +
> +       if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
> +               vq_err(vq, "Failed to access avail idx at %p\n",
> +                      &vq->avail->idx);
> +               return -EFAULT;
> +       }
> +       last_avail_idx = vq->last_avail_idx & (vq->num - 1);
> +       vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> +       total = vq->avail_idx - vq->last_avail_idx;
> +       ret = total = min(total, num);
> +
> +       for (i = 0; i < ret; i++) {
> +               ret2 = vhost_get_avail(vq, heads[i].id,
> +                                     &vq->avail->ring[last_avail_idx]);
> +               if (unlikely(ret2)) {
> +                       vq_err(vq, "Failed to get descriptors\n");
> +                       return -EFAULT;
> +               }
> +               last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
> +       }

This is understandably very similar to the existing logic in vhost_get_vq_desc.
Can that be extracted to a helper to avoid code duplication?

Perhaps one helper to update vq->avail_idx and return num, and
another to call vhost_get_avail one or more times.
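
For instance, the split could look roughly like this (hypothetical names;
the error reporting from the original is left out):

	/* refresh vq->avail_idx and report how many entries are ready */
	static int vhost_update_avail_idx(struct vhost_virtqueue *vq, u16 num)
	{
		__virtio16 avail_idx;

		if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx)))
			return -EFAULT;
		vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
		return min((u16)(vq->avail_idx - vq->last_avail_idx), num);
	}

	/* copy count avail ring entries into heads[] */
	static int vhost_get_avail_heads(struct vhost_virtqueue *vq,
					 struct vring_used_elem *heads, u16 count)
	{
		u16 idx = vq->last_avail_idx & (vq->num - 1);
		int i;

		for (i = 0; i < count; i++) {
			if (unlikely(vhost_get_avail(vq, heads[i].id,
						     &vq->avail->ring[idx])))
				return -EFAULT;
			idx = (idx + 1) & (vq->num - 1);
		}
		return count;
	}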

> +
> +       if (!used_update)
> +               return ret;
> +
> +       last_used_idx = vq->last_used_idx & (vq->num - 1);
> +       while (total) {
> +               copied = min((u16)(vq->num - last_used_idx), total);
> +               ret2 = vhost_copy_to_user(vq,
> +                                         &vq->used->ring[last_used_idx],
> +                                         &heads[ret - total],
> +                                         copied * sizeof(*used));
> +
> +               if (unlikely(ret2)) {
> +                       vq_err(vq, "Failed to update used ring!\n");
> +                       return -EFAULT;
> +               }
> +
> +               last_used_idx = 0;
> +               total -= copied;
> +       }

This second part seems unrelated and could be a separate function?

Also, no need for ret2 and the double assignment "ret = total =" if
total is not modified in the second loop:

  for (i = 0; i < total; ) {
    ...
    i += copied;
  }
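
Fully written out, that would be roughly (sketch only, not tested):

  last_used_idx = vq->last_used_idx & (vq->num - 1);
  for (i = 0; i < total; i += copied) {
          copied = min((u16)(vq->num - last_used_idx), (u16)(total - i));
          if (unlikely(vhost_copy_to_user(vq,
                                          &vq->used->ring[last_used_idx],
                                          &heads[i],
                                          copied * sizeof(*used)))) {
                  vq_err(vq, "Failed to update used ring!\n");
                  return -EFAULT;
          }
          /* a second chunk, if any, starts at the beginning of the ring */
          last_used_idx = 0;
  }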

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
  2017-09-22  8:02 ` Jason Wang
@ 2017-09-28  0:55     ` Willem de Bruijn
  2017-09-26 19:25   ` Michael S. Tsirkin
  2017-09-28  0:55     ` Willem de Bruijn
  2 siblings, 0 replies; 69+ messages in thread
From: Willem de Bruijn @ 2017-09-28  0:55 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, virtualization, Network Development, LKML, kvm

> @@ -461,6 +460,7 @@ static void handle_tx(struct vhost_net *net)
>         struct socket *sock;
>         struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
>         bool zcopy, zcopy_used;
> +       int i, batched = VHOST_NET_BATCH;
>
>         mutex_lock(&vq->mutex);
>         sock = vq->private_data;
> @@ -475,6 +475,12 @@ static void handle_tx(struct vhost_net *net)
>         hdr_size = nvq->vhost_hlen;
>         zcopy = nvq->ubufs;
>
> +       /* Disable zerocopy batched fetching for simplicity */

This special case can perhaps be avoided if we no longer block
on vhost_exceeds_maxpend, but revert to copying.
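
Roughly: instead of breaking out of the loop when the limit is hit,
fold the check into the zcopy_used condition (sketch, untested, keeping
whatever other conditions are already there):

  zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN &&
               !vhost_exceeds_maxpend(net) &&
               vhost_net_tx_select_zcopy(net);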

> +       if (zcopy) {
> +               heads = &used;

Can this special case of batchsize 1 not use vq->heads?

> +               batched = 1;
> +       }
> +
>         for (;;) {
>                 /* Release DMAs done buffers first */
>                 if (zcopy)
> @@ -486,95 +492,114 @@ static void handle_tx(struct vhost_net *net)
>                 if (unlikely(vhost_exceeds_maxpend(net)))
>                         break;

> +                       /* TODO: Check specific error and bomb out
> +                        * unless ENOBUFS?
> +                        */
> +                       err = sock->ops->sendmsg(sock, &msg, len);
> +                       if (unlikely(err < 0)) {
> +                               if (zcopy_used) {
> +                                       vhost_net_ubuf_put(ubufs);
> +                                       nvq->upend_idx =
> +                                  ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV;
> +                               }
> +                               vhost_discard_vq_desc(vq, 1);
> +                               goto out;
> +                       }
> +                       if (err != len)
> +                               pr_debug("Truncated TX packet: "
> +                                       " len %d != %zd\n", err, len);
> +                       if (!zcopy) {
> +                               vhost_add_used_idx(vq, 1);
> +                               vhost_signal(&net->dev, vq);
> +                       } else if (!zcopy_used) {
> +                               vhost_add_used_and_signal(&net->dev,
> +                                                         vq, head, 0);

While batching, perhaps this producer index update can also be moved
out of the loop, using vhost_add_used_and_signal_n.
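
E.g. (rough sketch, assuming the heads are refilled with the
descriptor ids as the batch is processed):

  /* inside the batch loop, after a successful sendmsg(): */
  vq->heads[done].id = cpu_to_vhost32(vq, head);
  vq->heads[done].len = 0;
  done++;

  /* once after the batch loop: */
  if (done)
          vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, done);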

> +                       } else
> +                               vhost_zerocopy_signal_used(net, vq);
> +                       vhost_net_tx_packet(net);
> +                       if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> +                               vhost_poll_queue(&vq->poll);
> +                               goto out;
>                         }
> -                       vhost_discard_vq_desc(vq, 1);
> -                       break;
> -               }
> -               if (err != len)
> -                       pr_debug("Truncated TX packet: "
> -                                " len %d != %zd\n", err, len);
> -               if (!zcopy_used)
> -                       vhost_add_used_and_signal(&net->dev, vq, head, 0);
> -               else
> -                       vhost_zerocopy_signal_used(net, vq);
> -               vhost_net_tx_packet(net);
> -               if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> -                       vhost_poll_queue(&vq->poll);
> -                       break;

This patch touches many lines just for indentation. If having to touch
these lines anyway (dirtying git blame), it may be a good time to move
the single-descriptor processing code into a separate helper function.
And while breaking things up, perhaps add another helper for setting up
ubuf_info. If you agree, preferably do that in a separate noop refactor
patch that precedes the functional changes.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
  2017-09-27 22:58       ` Michael S. Tsirkin
  2017-09-28  0:59         ` Willem de Bruijn
@ 2017-09-28  0:59         ` Willem de Bruijn
  2017-09-28  7:19         ` Jason Wang
  2017-09-28  7:19         ` Jason Wang
  3 siblings, 0 replies; 69+ messages in thread
From: Willem de Bruijn @ 2017-09-28  0:59 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, virtualization, Network Development, LKML, kvm

>> > > @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>> > >   void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>> > >   int vhost_vq_init_access(struct vhost_virtqueue *);
>> > > +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
>> > >   int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
>> > >   int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
>> > >                        unsigned count);
>> > Please change the API to hide the fact that there's an index that needs
>> > to be updated.
>>
>> In fact, an interesting optimization on top is just call
>> vhost_add_used_idx(vq, n) instead of n vhost_add_used_idx(vq, 1). That's the
>> reason I leave n in the API.
>>
>> Thanks
>
> Right but you could increment some internal counter in the vq
> structure then update the used index using some api
> with a generic name, e.g.  add_used_complete or something like this.

That adds a layer of information hiding. If the count can be kept close
to the computation in a local variable and passed directly to
vhost_add_used_idx_n(), that is easier to follow.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
  2017-09-27 22:19       ` Michael S. Tsirkin
  2017-09-28  7:02         ` Jason Wang
@ 2017-09-28  7:02         ` Jason Wang
  2017-09-28  7:52         ` Jason Wang
  2017-09-28  7:52         ` Jason Wang
  3 siblings, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-28  7:02 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm



On 2017年09月28日 06:19, Michael S. Tsirkin wrote:
> On Wed, Sep 27, 2017 at 10:04:18AM +0800, Jason Wang wrote:
>>
>> On 2017年09月27日 03:25, Michael S. Tsirkin wrote:
>>> On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
>>>> This patch implements basic batched processing of tx virtqueue by
>>>> prefetching desc indices and updating used ring in a batch. For
>>>> non-zerocopy case, vq->heads were used for storing the prefetched
>>>> indices and updating used ring. It is also a requirement for doing
>>>> more batching on top. For zerocopy case and for simplicity, batched
>>>> processing were simply disabled by only fetching and processing one
>>>> descriptor at a time, this could be optimized in the future.
>>>>
>>>> XDP_DROP (without touching skb) on tun (with Moongen in guest) with
>>>> zerocopy disabled:
>>>>
>>>> Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
>>>> Before: 3.20Mpps
>>>> After:  3.90Mpps (+22%)
>>>>
>>>> No differences were seen with zerocopy enabled.
>>>>
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>> So where is the speedup coming from? I'd guess the ring is
>>> hot in cache, it's faster to access it in one go, then
>>> pass many packets to net stack. Is that right?
>>>
>>> Another possibility is better code cache locality.
>> Yes, I think the speed up comes from:
>>
>> - less cache misses
>> - less cache line bounce when virtqueue is about to be full (guest is faster
>> than host which is the case of MoonGen)
>> - less memory barriers
>> - possible faster copy speed by using copy_to_user() on modern CPUs
>>
>>> So how about this patchset is refactored:
>>>
>>> 1. use existing APIs just first get packets then
>>>      transmit them all then use them all
>> Looks like current API can not get packets first, it only support get packet
>> one by one (if you mean vhost_get_vq_desc()). And used ring updating may get
>> more misses in this case.
> Right. So if you do
>
> for (...)
> 	vhost_get_vq_desc
>
>
> then later
>
> for (...)
> 	vhost_add_used
>
>
> then you get most of benefits except maybe code cache misses
> and copy_to_user.
>

If you mean vhost_add_used_n(), that means more barriers and more
userspace memory accesses as well. And you still need to cache the
vring_used_elem entries in vq->heads or elsewhere. Since the avail ring
indexes are not read in a batch, more write misses will happen when the
ring is about to be full. So it looks slower than what is done in this
patch?
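
To make the comparison concrete, the two-pass variant with the existing
APIs would look roughly like this (sketch, not tested):

  struct vring_used_elem heads[VHOST_NET_BATCH];
  unsigned out, in;
  int head, n = 0;

  /* pass 1: fetch and transmit, one descriptor at a time */
  while (n < VHOST_NET_BATCH) {
          head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
                                   &out, &in, NULL, NULL);
          if (head < 0 || head == vq->num)
                  break;
          /* ... build the msg and sock->ops->sendmsg() ... */
          heads[n].id = cpu_to_vhost32(vq, head);
          heads[n].len = 0;
          n++;
  }

  /* pass 2: publish all used entries in one go */
  if (n)
          vhost_add_used_and_signal_n(&net->dev, vq, heads, n);

So the heads still have to be cached somewhere, and the avail ring is
still read one entry at a time in pass 1.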

And if we want to do more batching on top (do we want this?), we still
need new APIs anyway.

Thanks

>
>
>
>
>
>>> 2. add new APIs and move the loop into vhost core
>>>      for more speedups
>> I don't see any advantages, looks like just need some e.g callbacks in this
>> case.
>>
>> Thanks
> IUC callbacks pretty much destroy the code cache locality advantages,
> IP is jumping around too much.
>
>

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
  2017-09-27 22:28       ` Michael S. Tsirkin
  (?)
  (?)
@ 2017-09-28  7:16       ` Jason Wang
  -1 siblings, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-28  7:16 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm



On 2017年09月28日 06:28, Michael S. Tsirkin wrote:
> On Wed, Sep 27, 2017 at 08:27:37AM +0800, Jason Wang wrote:
>>
>> On 2017年09月26日 21:45, Michael S. Tsirkin wrote:
>>> On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
>>>> Hi:
>>>>
>>>> This series tries to implement basic tx batched processing. This is
>>>> done by prefetching descriptor indices and update used ring in a
>>>> batch. This intends to speed up used ring updating and improve the
>>>> cache utilization.
>>> Interesting, thanks for the patches. So IIUC most of the gain is really
>>> overcoming some of the shortcomings of virtio 1.0 wrt cache utilization?
>> Yes.
>>
>> Actually, looks like batching in 1.1 is not as easy as in 1.0.
>>
>> In 1.0, we could do something like:
>>
>> batch update used ring by user copy_to_user()
>> smp_wmb()
>> update used_idx
>> In 1.1, we need more memory barriers, can't benefit from fast copy helpers?
>>
>> for () {
>>      update desc.addr
>>      smp_wmb()
>>      update desc.flag
>> }
> Yes but smp_wmb is a NOP on e.g. x86. We can switch to other types of
> barriers as well.

We need to consider non-x86 platforms as well. And we run into similar
issues with batched reading.

>    We do need to do the updates in order, so we might
> need new APIs for that to avoid re-doing the translation all the time.
>
> In 1.0 the last update is a cache miss always. You need batching to get
> less misses. In 1.1 you don't have it so fundamentally there is less
> need for batching. But batching does not always work.  DPDK guys (which
> batch things aggressively) already tried 1.1 and saw performance gains
> so we do not need to argue theoretically.

Did they test on non-x86 CPUs? And the prototype should be reviewed
carefully, since a bug can inflate the performance numbers in this case.

>
>
>
>>> Which is fair enough (1.0 is already deployed) but I would like to avoid
>>> making 1.1 support harder, and this patchset does this unfortunately,
>> I think the new APIs do not expose more internal data structure of virtio
>> than before? (vq->heads has already been used by vhost_net for years).
> For sure we might need to change vring_used_elem.
>
>> Consider the layout is re-designed completely, I don't see an easy method to
>> reuse current 1.0 API for 1.1.
> Current API just says you get buffers then you use them. It is not tied
> to actual separate used ring.

So does this patch, I think? It introduces:

int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
                 struct vring_used_elem *heads,
                 u16 num);
int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);

There's nothing new exposed to vhost_net (vring_used_elem was already
used in many places in net.c before this patch).
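
The intended pairing in handle_tx() is roughly (simplified sketch of
patch 5/5; error handling and the used_update flag are omitted):

  n = vhost_prefetch_desc_indices(vq, vq->heads, VHOST_NET_BATCH);
  for (i = 0; i < n; i++) {
          /* translate vq->heads[i].id, build the msg, sendmsg() */
          vhost_add_used_idx(vq, 1);
          vhost_signal(&net->dev, vq);
  }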

>
>
>>> see comments on individual patches. I'm sure it can be addressed though.
>>>
>>>> Test shows about ~22% improvement in tx pss.
>>> Is this with or without tx napi in guest?
>> MoonGen is used in guest for better numbers.
>>
>> Thanks
> Not sure I understand. Did you set napi_tx to true or false?

MoonGen uses DPDK, so the virtio-net PMD is used in the guest. (See
http://conferences.sigcomm.org/imc/2015/papers/p275.pdf)

Thanks

>
>>>> Please review.
>>>>
>>>> Jason Wang (5):
>>>>     vhost: split out ring head fetching logic
>>>>     vhost: introduce helper to prefetch desc index
>>>>     vhost: introduce vhost_add_used_idx()
>>>>     vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
>>>>     vhost_net: basic tx virtqueue batched processing
>>>>
>>>>    drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
>>>>    drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
>>>>    drivers/vhost/vhost.h |   9 ++
>>>>    3 files changed, 270 insertions(+), 125 deletions(-)
>>>>
>>>> -- 
>>>> 2.7.4

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-27 22:57         ` Michael S. Tsirkin
  (?)
  (?)
@ 2017-09-28  7:18         ` Jason Wang
  -1 siblings, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-28  7:18 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm



On 2017年09月28日 06:57, Michael S. Tsirkin wrote:
> On Wed, Sep 27, 2017 at 08:35:47AM +0800, Jason Wang wrote:
>>
>> On 2017年09月27日 03:19, Michael S. Tsirkin wrote:
>>> On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
>>>> This patch introduces vhost_prefetch_desc_indices() which could batch
>>>> descriptor indices fetching and used ring updating. This intends to
>>>> reduce the cache misses of indices fetching and updating and reduce
>>>> cache line bounce when virtqueue is almost full. copy_to_user() was
>>>> used in order to benefit from modern cpus that support fast string
>>>> copy. Batched virtqueue processing will be the first user.
>>>>
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>>> ---
>>>>    drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    drivers/vhost/vhost.h |  3 +++
>>>>    2 files changed, 58 insertions(+)
>>>>
>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>> index f87ec75..8424166d 100644
>>>> --- a/drivers/vhost/vhost.c
>>>> +++ b/drivers/vhost/vhost.c
>>>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>>>> +				struct vring_used_elem *heads,
>>>> +				u16 num, bool used_update)
>>> why do you need to combine used update with prefetch?
>> For better performance
>
> Why is sticking a branch in there better than requesting the update
> conditionally from the caller?

OK, I get your point, I can split this into two functions.
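
Something like (sketch, names invented, not tested):

  /* only fetch the avail indices into heads[] */
  int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
                                  struct vring_used_elem *heads, u16 num);

  /* separately copy heads[] into the used ring, not touching used->idx */
  int vhost_update_used_ring(struct vhost_virtqueue *vq,
                             struct vring_used_elem *heads, u16 count);

where the second one is basically the while () copy loop of this patch
moved out, and the caller decides whether and when to call it.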

>
>
>
>> and I believe we don't care about the overhead when
>> we meet errors in tx.
> That's a separate question, I do not really understand how
> you can fetch a descriptor and update the used ring at the same
> time. This allows the guest to overwrite the buffer.
> I might be misunderstanding what is going on here though.

We don't update the used idx, so the guest can't overwrite the buffer,
I think?

Thanks

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
  2017-09-27 22:58       ` Michael S. Tsirkin
  2017-09-28  0:59         ` Willem de Bruijn
  2017-09-28  0:59         ` Willem de Bruijn
@ 2017-09-28  7:19         ` Jason Wang
  2017-09-28  7:19         ` Jason Wang
  3 siblings, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-28  7:19 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm



On 2017年09月28日 06:58, Michael S. Tsirkin wrote:
> On Wed, Sep 27, 2017 at 08:38:24AM +0800, Jason Wang wrote:
>> On 2017年09月27日 03:13, Michael S. Tsirkin wrote:
>>> On Fri, Sep 22, 2017 at 04:02:33PM +0800, Jason Wang wrote:
>>>> This patch introduces a helper which just increase the used idx. This
>>>> will be used in pair with vhost_prefetch_desc_indices() by batching
>>>> code.
>>>>
>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>> ---
>>>>    drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
>>>>    drivers/vhost/vhost.h |  1 +
>>>>    2 files changed, 34 insertions(+)
>>>>
>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>> index 8424166d..6532cda 100644
>>>> --- a/drivers/vhost/vhost.c
>>>> +++ b/drivers/vhost/vhost.c
>>>> @@ -2178,6 +2178,39 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(vhost_add_used);
>>>> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n)
>>>> +{
>>>> +	u16 old, new;
>>>> +
>>>> +	old = vq->last_used_idx;
>>>> +	new = (vq->last_used_idx += n);
>>>> +	/* If the driver never bothers to signal in a very long while,
>>>> +	 * used index might wrap around. If that happens, invalidate
>>>> +	 * signalled_used index we stored. TODO: make sure driver
>>>> +	 * signals at least once in 2^16  and remove this.
>>>> +	 */
>>>> +	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
>>>> +		vq->signalled_used_valid = false;
>>>> +
>>>> +	/* Make sure buffer is written before we update index. */
>>>> +	smp_wmb();
>>>> +	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>>>> +			   &vq->used->idx)) {
>>>> +		vq_err(vq, "Failed to increment used idx");
>>>> +		return -EFAULT;
>>>> +	}
>>>> +	if (unlikely(vq->log_used)) {
>>>> +		/* Log used index update. */
>>>> +		log_write(vq->log_base,
>>>> +			  vq->log_addr + offsetof(struct vring_used, idx),
>>>> +			  sizeof(vq->used->idx));
>>>> +		if (vq->log_ctx)
>>>> +			eventfd_signal(vq->log_ctx, 1);
>>>> +	}
>>>> +	return 0;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(vhost_add_used_idx);
>>>> +
>>>>    static int __vhost_add_used_n(struct vhost_virtqueue *vq,
>>>>    			    struct vring_used_elem *heads,
>>>>    			    unsigned count)
>>>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>>>> index 16c2cb6..5dd6c05 100644
>>>> --- a/drivers/vhost/vhost.h
>>>> +++ b/drivers/vhost/vhost.h
>>>> @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>>>>    void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>>>>    int vhost_vq_init_access(struct vhost_virtqueue *);
>>>> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
>>>>    int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
>>>>    int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
>>>>    		     unsigned count);
>>> Please change the API to hide the fact that there's an index that needs
>>> to be updated.
>> In fact, an interesting optimization on top is just call
>> vhost_add_used_idx(vq, n) instead of n vhost_add_used_idx(vq, 1). That's the
>> reason I leave n in the API.
>>
>> Thanks
> Right but you could increment some internal counter in the vq
> structure then update the used index using some api
> with a generic name, e.g.  add_used_complete or something like this.
>

Right, I see.

Thanks

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
  2017-09-28  0:47   ` Willem de Bruijn
  2017-09-28  7:44     ` Jason Wang
@ 2017-09-28  7:44     ` Jason Wang
  1 sibling, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-28  7:44 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Michael S. Tsirkin, virtualization, Network Development, LKML, kvm



On 2017年09月28日 08:47, Willem de Bruijn wrote:
> On Fri, Sep 22, 2017 at 4:02 AM, Jason Wang <jasowang@redhat.com> wrote:
>> This patch introduces vhost_prefetch_desc_indices() which could batch
>> descriptor indices fetching and used ring updating. This intends to
>> reduce the cache misses of indices fetching and updating and reduce
>> cache line bounce when virtqueue is almost full. copy_to_user() was
>> used in order to benefit from modern cpus that support fast string
>> copy. Batched virtqueue processing will be the first user.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>   drivers/vhost/vhost.h |  3 +++
>>   2 files changed, 58 insertions(+)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index f87ec75..8424166d 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +                               struct vring_used_elem *heads,
>> +                               u16 num, bool used_update)
>> +{
>> +       int ret, ret2;
>> +       u16 last_avail_idx, last_used_idx, total, copied;
>> +       __virtio16 avail_idx;
>> +       struct vring_used_elem __user *used;
>> +       int i;
>> +
>> +       if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
>> +               vq_err(vq, "Failed to access avail idx at %p\n",
>> +                      &vq->avail->idx);
>> +               return -EFAULT;
>> +       }
>> +       last_avail_idx = vq->last_avail_idx & (vq->num - 1);
>> +       vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
>> +       total = vq->avail_idx - vq->last_avail_idx;
>> +       ret = total = min(total, num);
>> +
>> +       for (i = 0; i < ret; i++) {
>> +               ret2 = vhost_get_avail(vq, heads[i].id,
>> +                                     &vq->avail->ring[last_avail_idx]);
>> +               if (unlikely(ret2)) {
>> +                       vq_err(vq, "Failed to get descriptors\n");
>> +                       return -EFAULT;
>> +               }
>> +               last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
>> +       }
> This is understandably very similar to the existing logic in vhost_get_vq_desc.
> Can that be extracted to a helper to avoid code duplication?
>
> Perhaps one helper to update vq->avail_idx and return num, and
> another to call vhost_get_avail one or more times.

Yes it can.

>
>> +
>> +       if (!used_update)
>> +               return ret;
>> +
>> +       last_used_idx = vq->last_used_idx & (vq->num - 1);
>> +       while (total) {
>> +               copied = min((u16)(vq->num - last_used_idx), total);
>> +               ret2 = vhost_copy_to_user(vq,
>> +                                         &vq->used->ring[last_used_idx],
>> +                                         &heads[ret - total],
>> +                                         copied * sizeof(*used));
>> +
>> +               if (unlikely(ret2)) {
>> +                       vq_err(vq, "Failed to update used ring!\n");
>> +                       return -EFAULT;
>> +               }
>> +
>> +               last_used_idx = 0;
>> +               total -= copied;
>> +       }
> This second part seems unrelated and could be a separate function?

Yes.

>
> Also, no need for ret2 and double assignment "ret = total =" if not
> modifying total
> in the the second loop:
>
>    for (i = 0; i < total; ) {
>      ...
>      i += copied;
>    }

Right, will do this in V2.

Thanks

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
  2017-09-28  0:55     ` Willem de Bruijn
  (?)
  (?)
@ 2017-09-28  7:50     ` Jason Wang
  -1 siblings, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-28  7:50 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Michael S. Tsirkin, virtualization, Network Development, LKML, kvm



On 2017年09月28日 08:55, Willem de Bruijn wrote:
>> @@ -461,6 +460,7 @@ static void handle_tx(struct vhost_net *net)
>>          struct socket *sock;
>>          struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
>>          bool zcopy, zcopy_used;
>> +       int i, batched = VHOST_NET_BATCH;
>>
>>          mutex_lock(&vq->mutex);
>>          sock = vq->private_data;
>> @@ -475,6 +475,12 @@ static void handle_tx(struct vhost_net *net)
>>          hdr_size = nvq->vhost_hlen;
>>          zcopy = nvq->ubufs;
>>
>> +       /* Disable zerocopy batched fetching for simplicity */
> This special case can perhaps be avoided if we no longer block
> on vhost_exceeds_maxpend, but revert to copying.

Yes, I think so. For simplicity, I did it for the data copy path first.
If the idea is accepted, I will try to do zerocopy on top.

>
>> +       if (zcopy) {
>> +               heads = &used;
> Can this special case of batchsize 1 not use vq->heads?

It doesn't in fact?

>
>> +               batched = 1;
>> +       }
>> +
>>          for (;;) {
>>                  /* Release DMAs done buffers first */
>>                  if (zcopy)
>> @@ -486,95 +492,114 @@ static void handle_tx(struct vhost_net *net)
>>                  if (unlikely(vhost_exceeds_maxpend(net)))
>>                          break;
>> +                       /* TODO: Check specific error and bomb out
>> +                        * unless ENOBUFS?
>> +                        */
>> +                       err = sock->ops->sendmsg(sock, &msg, len);
>> +                       if (unlikely(err < 0)) {
>> +                               if (zcopy_used) {
>> +                                       vhost_net_ubuf_put(ubufs);
>> +                                       nvq->upend_idx =
>> +                                  ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV;
>> +                               }
>> +                               vhost_discard_vq_desc(vq, 1);
>> +                               goto out;
>> +                       }
>> +                       if (err != len)
>> +                               pr_debug("Truncated TX packet: "
>> +                                       " len %d != %zd\n", err, len);
>> +                       if (!zcopy) {
>> +                               vhost_add_used_idx(vq, 1);
>> +                               vhost_signal(&net->dev, vq);
>> +                       } else if (!zcopy_used) {
>> +                               vhost_add_used_and_signal(&net->dev,
>> +                                                         vq, head, 0);
> While batching, perhaps can also move this producer index update
> out of the loop and using vhost_add_used_and_signal_n.

Yes.

>
>> +                       } else
>> +                               vhost_zerocopy_signal_used(net, vq);
>> +                       vhost_net_tx_packet(net);
>> +                       if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>> +                               vhost_poll_queue(&vq->poll);
>> +                               goto out;
>>                          }
>> -                       vhost_discard_vq_desc(vq, 1);
>> -                       break;
>> -               }
>> -               if (err != len)
>> -                       pr_debug("Truncated TX packet: "
>> -                                " len %d != %zd\n", err, len);
>> -               if (!zcopy_used)
>> -                       vhost_add_used_and_signal(&net->dev, vq, head, 0);
>> -               else
>> -                       vhost_zerocopy_signal_used(net, vq);
>> -               vhost_net_tx_packet(net);
>> -               if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>> -                       vhost_poll_queue(&vq->poll);
>> -                       break;
> This patch touches many lines just for indentation. If having to touch
> these lines anyway (dirtying git blame), it may be a good time to move
> the processing of a single descriptor code into a separate helper function.
> And while breaking up, perhaps another helper for setting up ubuf_info.
> If you agree, preferably in a separate noop refactor patch that precedes
> the functional changes.

Right, and it looks better. I will try to do this.
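
A possible shape for that refactor, as a sketch only (the helper name and 
parameter list below are made up for illustration, not taken from any 
posted patch):

    /* Hypothetical helper: build and transmit one packet for the
     * descriptor chain starting at 'head'; returns 0 or a negative errno.
     */
    static int vhost_net_tx_one(struct vhost_net *net,
                                struct vhost_net_virtqueue *nvq,
                                struct socket *sock, struct msghdr *msg,
                                int head, size_t len)
    {
            /* iov setup, optional ubuf_info setup for zerocopy,
             * sock->ops->sendmsg() and error handling would move here.
             */
            return 0;
    }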

Thanks

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
  2017-09-27 22:19       ` Michael S. Tsirkin
                           ` (2 preceding siblings ...)
  2017-09-28  7:52         ` Jason Wang
@ 2017-09-28  7:52         ` Jason Wang
  3 siblings, 0 replies; 69+ messages in thread
From: Jason Wang @ 2017-09-28  7:52 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm



On 2017-09-28 06:19, Michael S. Tsirkin wrote:
>
>>> 2. add new APIs and move the loop into vhost core
>>>      for more speedups
>> I don't see any advantages, looks like just need some e.g callbacks in this
>> case.
>>
>> Thanks
> IUC callbacks pretty much destroy the code cache locality advantages,
> IP is jumping around too much.

I may be missing something, but we need net-specific code anyway in this case, don't we?
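
To make the trade-off concrete, a callback-based core API would look 
roughly like the declaration below. This is entirely hypothetical (no such 
interface exists in vhost); the point is that the net-specific work becomes 
an indirect call per buffer, which is the instruction-cache locality 
concern raised above.

    /* Hypothetical: core-side batch loop invoking a per-buffer
     * callback supplied by vhost_net.
     */
    int vhost_vq_process(struct vhost_virtqueue *vq, int budget,
                         int (*xmit_one)(struct vhost_virtqueue *vq,
                                         int head, void *priv),
                         void *priv);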

Thanks

^ permalink raw reply	[flat|nested] 69+ messages in thread

end of thread, other threads:[~2017-09-28  7:53 UTC | newest]

Thread overview: 69+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-22  8:02 [PATCH net-next RFC 0/5] batched tx processing in vhost_net Jason Wang
2017-09-22  8:02 ` [PATCH net-next RFC 1/5] vhost: split out ring head fetching logic Jason Wang
2017-09-22  8:02 ` Jason Wang
2017-09-22  8:31   ` Stefan Hajnoczi
2017-09-22  8:31   ` Stefan Hajnoczi
2017-09-25  2:03     ` Jason Wang
2017-09-25  2:03     ` Jason Wang
2017-09-22  8:02 ` [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index Jason Wang
2017-09-22  9:02   ` Stefan Hajnoczi
2017-09-22  9:02     ` Stefan Hajnoczi
2017-09-25  2:04     ` Jason Wang
2017-09-25  2:04     ` Jason Wang
2017-09-26 19:19   ` Michael S. Tsirkin
2017-09-26 19:19   ` Michael S. Tsirkin
2017-09-27  0:35     ` Jason Wang
2017-09-27 22:57       ` Michael S. Tsirkin
2017-09-27 22:57         ` Michael S. Tsirkin
2017-09-28  7:18         ` Jason Wang
2017-09-28  7:18         ` Jason Wang
2017-09-27  0:35     ` Jason Wang
2017-09-28  0:47   ` Willem de Bruijn
2017-09-28  0:47   ` Willem de Bruijn
2017-09-28  7:44     ` Jason Wang
2017-09-28  7:44     ` Jason Wang
2017-09-22  8:02 ` Jason Wang
2017-09-22  8:02 ` [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx() Jason Wang
2017-09-22  9:07   ` Stefan Hajnoczi
2017-09-22  9:07   ` Stefan Hajnoczi
2017-09-26 19:13   ` Michael S. Tsirkin
2017-09-26 19:13   ` Michael S. Tsirkin
2017-09-27  0:38     ` Jason Wang
2017-09-27 22:58       ` Michael S. Tsirkin
2017-09-27 22:58       ` Michael S. Tsirkin
2017-09-28  0:59         ` Willem de Bruijn
2017-09-28  0:59         ` Willem de Bruijn
2017-09-28  7:19         ` Jason Wang
2017-09-28  7:19         ` Jason Wang
2017-09-27  0:38     ` Jason Wang
2017-09-22  8:02 ` [PATCH net-next RFC 4/5] vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH Jason Wang
2017-09-22  8:02 ` Jason Wang
2017-09-22  8:02 ` [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing Jason Wang
2017-09-22  8:02 ` Jason Wang
2017-09-26 19:25   ` Michael S. Tsirkin
2017-09-27  2:04     ` Jason Wang
2017-09-27 22:19       ` Michael S. Tsirkin
2017-09-28  7:02         ` Jason Wang
2017-09-28  7:02         ` Jason Wang
2017-09-28  7:52         ` Jason Wang
2017-09-28  7:52         ` Jason Wang
2017-09-27 22:19       ` Michael S. Tsirkin
2017-09-27  2:04     ` Jason Wang
2017-09-26 19:25   ` Michael S. Tsirkin
2017-09-28  0:55   ` Willem de Bruijn
2017-09-28  0:55     ` Willem de Bruijn
2017-09-28  7:50     ` Jason Wang
2017-09-28  7:50     ` Jason Wang
2017-09-26 13:45 ` [PATCH net-next RFC 0/5] batched tx processing in vhost_net Michael S. Tsirkin
2017-09-26 13:45   ` Michael S. Tsirkin
2017-09-27  0:27   ` Jason Wang
2017-09-27 22:28     ` Michael S. Tsirkin
2017-09-27 22:28       ` Michael S. Tsirkin
2017-09-28  7:16       ` Jason Wang
2017-09-28  7:16       ` Jason Wang
2017-09-27  0:27   ` Jason Wang
2017-09-26 19:26 ` Michael S. Tsirkin
2017-09-26 19:26 ` Michael S. Tsirkin
2017-09-27  2:06   ` Jason Wang
2017-09-27  2:06   ` Jason Wang
2017-09-22  8:02 Jason Wang
