* [PATCH] optimize vhost enqueue
@ 2016-08-16  3:50 Zhihong Wang
  2016-08-16 13:59 ` Maxime Coquelin
                   ` (5 more replies)
  0 siblings, 6 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-16  3:50 UTC (permalink / raw)
  To: dev; +Cc: Zhihong Wang

This patch optimizes the vhost enqueue function: rte_vhost_enqueue_burst.

Currently there're 2 callbacks for vhost enqueue:
 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The virtio_dev_merge_rx doesn't provide optimal performance, and it is also
reported to have a compatibility issue when working with Windows VMs.

Besides, having 2 separate functions increases maintenance effort.

This patch uses a single function logic to replace the current 2 for
better maintainability, and provides better performance by optimizing
caching behavior especially for mrg_rxbuf turned on cases.
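
For reference, the header written in front of each packet differs between the
two cases; roughly, per the virtio spec (types simplified here to stdint), the
mergeable variant just appends a num_buffers field so one packet can span
several descriptor chains:

#include <stdint.h>

struct virtio_net_hdr {                 /* mrg_rxbuf turned off */
	uint8_t  flags;
	uint8_t  gso_type;
	uint16_t hdr_len;
	uint16_t gso_size;
	uint16_t csum_start;
	uint16_t csum_offset;
};

struct virtio_net_hdr_mrg_rxbuf {       /* mrg_rxbuf turned on */
	struct virtio_net_hdr hdr;
	uint16_t num_buffers;           /* descriptor chains used by this packet */
};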

It also fixes the issue working with Windows VMs.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost-net.h  |   6 +-
 lib/librte_vhost/vhost_rxtx.c | 582 ++++++++++++++----------------------------
 lib/librte_vhost/virtio-net.c |  15 +-
 3 files changed, 208 insertions(+), 395 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
 	uint32_t		size;
 
 	/* Last index used on the available ring */
-	volatile uint16_t	last_used_idx;
+	uint16_t		last_used_idx;
 #define VIRTIO_INVALID_EVENTFD		(-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD	(-2)
 
@@ -85,6 +85,10 @@ struct vhost_virtqueue {
 
 	/* Physical address of used ring, for logging */
 	uint64_t		log_guest_addr;
+
+	/* Shadow used ring for performance */
+	struct vring_used_elem	*shadow_used_ring;
+	uint32_t		shadow_used_idx;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..1263168 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
 	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }
 
-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
 	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -125,427 +125,227 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 	}
 }
 
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-		    struct virtio_net_hdr_mrg_rxbuf hdr)
-{
-	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-	else
-		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
-}
-
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint16_t count)
 {
-	uint32_t desc_avail, desc_offset;
-	uint32_t mbuf_avail, mbuf_offset;
-	uint32_t cpy_len;
+	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
+	struct vhost_virtqueue *vq;
 	struct vring_desc *desc;
-	uint64_t desc_addr;
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
-	desc = &vq->desc[desc_idx];
-	desc_addr = gpa_to_vva(dev, desc->addr);
-	/*
-	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-	 * otherwise stores offset on the stack instead of in a register.
-	 */
-	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-		return -1;
-
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-	desc_offset = dev->vhost_hlen;
-	desc_avail  = desc->len - dev->vhost_hlen;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current mbuf, fetch next */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
-		}
-
-		/* done with current desc buf, fetch next */
-		if (desc_avail == 0) {
-			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-				/* Room in vring buffer is not enough */
-				return -1;
-			}
-			if (unlikely(desc->next >= vq->size))
-				return -1;
-
-			desc = &vq->desc[desc->next];
-			desc_addr = gpa_to_vva(dev, desc->addr);
-			if (unlikely(!desc_addr))
-				return -1;
-
-			desc_offset = 0;
-			desc_avail  = desc->len;
-		}
-
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			     cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-	}
-
-	return 0;
-}
+	struct virtio_net *dev;
+	struct rte_mbuf *mbuf;
+	uint64_t desc_host_write_addr = 0;
+	uint32_t desc_chain_head = 0;
+	uint32_t desc_chain_len = 0;
+	uint32_t desc_current = 0;
+	uint32_t desc_write_offset = 0;
+	uint32_t used_idx_static = 0;
+	uint32_t pkt_idx = 0;
+	uint32_t pkt_left = 0;
+	uint32_t pkt_sent = 0;
+	uint32_t mbuf_len = 0;
+	uint32_t mbuf_len_left = 0;
+	uint32_t copy_len = 0;
+	uint32_t copy_virtio_hdr = 0;
+	uint32_t is_mrg_rxbuf = 0;
+	uint32_t is_virtio_1 = 0;
+
+	if (unlikely(count == 0))
+		return 0;
 
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
-	      struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint16_t avail_idx, free_entries, start_idx;
-	uint16_t desc_indexes[MAX_PKT_BURST];
-	uint16_t used_idx;
-	uint32_t i;
+	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
 
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
+	dev = get_device(vid);
+	if (unlikely(!dev))
 		return 0;
-	}
 
-	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
 		return 0;
 
-	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-	start_idx = vq->last_used_idx;
-	free_entries = avail_idx - start_idx;
-	count = RTE_MIN(count, free_entries);
-	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
-	if (count == 0)
+	vq = dev->virtqueue[queue_id];
+	if (unlikely(!vq->enabled))
 		return 0;
 
-	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
-		dev->vid, start_idx, start_idx + count);
+	if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+		is_mrg_rxbuf = 1;
+
+	if (dev->features & (1ULL << VIRTIO_F_VERSION_1))
+		is_virtio_1 = 1;
+
+	pkt_idx = 0;
+	pkt_left = count;
+	used_idx_static = vq->last_used_idx & (vq->size - 1);
+	vq->shadow_used_idx = 0;
+
+	while (pkt_left > 0) {
+		if (unlikely(vq->avail->idx == vq->last_used_idx))
+			goto done;
+
+		if (pkt_left > 1 && vq->avail->idx != vq->last_used_idx + 1)
+			rte_prefetch0(&vq->desc[
+					vq->avail->ring[
+					(vq->last_used_idx + 1) &
+					(vq->size - 1)]]);
+
+		mbuf = pkts[pkt_idx];
+		mbuf_len = rte_pktmbuf_data_len(mbuf);
+		mbuf_len_left = mbuf_len;
+		pkt_idx++;
+		pkt_left--;
+
+		desc_chain_head = vq->avail->ring[(vq->last_used_idx) &
+			(vq->size - 1)];
+		desc_current = desc_chain_head;
+		desc = &vq->desc[desc_current];
+		desc_host_write_addr = gpa_to_vva(dev, desc->addr);
+		if (unlikely(!desc_host_write_addr))
+			goto done;
+
+		virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)
+			(uintptr_t)desc_host_write_addr;
+		copy_virtio_hdr = 1;
+
+		vhost_log_write(dev, desc->addr, dev->vhost_hlen);
+		desc_write_offset = dev->vhost_hlen;
+		desc_chain_len = desc_write_offset;
+		desc_host_write_addr += desc_write_offset;
+
+		while (1) {
+			if (!mbuf_len_left) {
+				if (mbuf->next) {
+					mbuf = mbuf->next;
+					mbuf_len = rte_pktmbuf_data_len(mbuf);
+					mbuf_len_left = mbuf_len;
+				} else
+					break;
+			}
 
-	/* Retrieve all of the desc indexes first to avoid caching issues. */
-	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
-	for (i = 0; i < count; i++) {
-		used_idx = (start_idx + i) & (vq->size - 1);
-		desc_indexes[i] = vq->avail->ring[used_idx];
-		vq->used->ring[used_idx].id = desc_indexes[i];
-		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
-					       dev->vhost_hlen;
-		vhost_log_used_vring(dev, vq,
-			offsetof(struct vring_used, ring[used_idx]),
-			sizeof(vq->used->ring[used_idx]));
-	}
+			if (desc->len <= desc_write_offset) {
+				if (desc->flags & VRING_DESC_F_NEXT) {
+					desc_write_offset = 0;
+					desc_current = desc->next;
+					desc = &vq->desc[desc_current];
+					desc_host_write_addr =
+						gpa_to_vva(dev, desc->addr);
+					if (unlikely(!desc_host_write_addr))
+						goto rollback;
+				} else if (is_mrg_rxbuf) {
+					vq->shadow_used_ring[
+						vq->shadow_used_idx].id =
+						desc_chain_head;
+					vq->shadow_used_ring[
+						vq->shadow_used_idx].len =
+						desc_chain_len;
+					vq->shadow_used_idx++;
+					vq->last_used_idx++;
+					virtio_hdr->num_buffers++;
+					if (unlikely(vq->avail->idx ==
+							vq->last_used_idx))
+						goto rollback;
+
+					desc_chain_head = vq->avail->ring[
+						(vq->last_used_idx) &
+						(vq->size - 1)];
+					desc_current = desc_chain_head;
+					desc = &vq->desc[desc_current];
+					desc_host_write_addr =
+						gpa_to_vva(dev, desc->addr);
+					if (unlikely(!desc_host_write_addr))
+						goto rollback;
+
+					desc_chain_len = 0;
+					desc_write_offset = 0;
+				} else
+					goto rollback;
+			}
 
-	rte_prefetch0(&vq->desc[desc_indexes[0]]);
-	for (i = 0; i < count; i++) {
-		uint16_t desc_idx = desc_indexes[i];
-		int err;
+			copy_len = RTE_MIN(desc->len - desc_write_offset,
+					mbuf_len_left);
+			if (copy_virtio_hdr) {
+				copy_virtio_hdr = 0;
+				memset((void *)(uintptr_t)&(virtio_hdr->hdr),
+						0, dev->vhost_hlen);
+				virtio_enqueue_offload(mbuf,
+						&(virtio_hdr->hdr));
+				if (is_mrg_rxbuf || is_virtio_1)
+					virtio_hdr->num_buffers = 1;
+			}
 
-		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
-		if (unlikely(err)) {
-			used_idx = (start_idx + i) & (vq->size - 1);
-			vq->used->ring[used_idx].len = dev->vhost_hlen;
-			vhost_log_used_vring(dev, vq,
-				offsetof(struct vring_used, ring[used_idx]),
-				sizeof(vq->used->ring[used_idx]));
+			rte_memcpy((void *)(uintptr_t)desc_host_write_addr,
+					rte_pktmbuf_mtod_offset(mbuf, void *,
+						mbuf_len - mbuf_len_left),
+					copy_len);
+			vhost_log_write(dev, desc->addr + desc_write_offset,
+					copy_len);
+			mbuf_len_left -= copy_len;
+			desc_write_offset += copy_len;
+			desc_host_write_addr += copy_len;
+			desc_chain_len += copy_len;
 		}
 
-		if (i + 1 < count)
-			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
+		vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+		vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+		vq->shadow_used_idx++;
+		vq->last_used_idx++;
+		pkt_sent++;
 	}
 
-	rte_smp_wmb();
-
-	*(volatile uint16_t *)&vq->used->idx += count;
-	vq->last_used_idx += count;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, idx),
-		sizeof(vq->used->idx));
-
-	/* flush used->idx update before we read avail->flags. */
-	rte_mb();
-
-	/* Kick the guest if necessary. */
-	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-			&& (vq->callfd >= 0))
-		eventfd_write(vq->callfd, (eventfd_t)1);
-	return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-	     uint32_t *allocated, uint32_t *vec_idx,
-	     struct buf_vector *buf_vec)
-{
-	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
-	uint32_t vec_id = *vec_idx;
-	uint32_t len    = *allocated;
-
-	while (1) {
-		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
-			return -1;
-
-		len += vq->desc[idx].len;
-		buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
-		buf_vec[vec_id].buf_len  = vq->desc[idx].len;
-		buf_vec[vec_id].desc_idx = idx;
-		vec_id++;
-
-		if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
-			break;
-
-		idx = vq->desc[idx].next;
-	}
-
-	*allocated = len;
-	*vec_idx   = vec_id;
-
-	return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-			    uint16_t *end, struct buf_vector *buf_vec)
-{
-	uint16_t cur_idx;
-	uint16_t avail_idx;
-	uint32_t allocated = 0;
-	uint32_t vec_idx = 0;
-	uint16_t tries = 0;
-
-	cur_idx  = vq->last_used_idx;
-
-	while (1) {
-		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-		if (unlikely(cur_idx == avail_idx))
-			return -1;
-
-		if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
-					  &vec_idx, buf_vec) < 0))
-			return -1;
-
-		cur_idx++;
-		tries++;
-
-		if (allocated >= size)
-			break;
-
-		/*
-		 * if we tried all available ring items, and still
-		 * can't get enough buf, it means something abnormal
-		 * happened.
-		 */
-		if (unlikely(tries >= vq->size))
-			return -1;
-	}
-
-	*end = cur_idx;
-	return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			    uint16_t end_idx, struct rte_mbuf *m,
-			    struct buf_vector *buf_vec)
-{
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-	uint32_t vec_idx = 0;
-	uint16_t start_idx = vq->last_used_idx;
-	uint16_t cur_idx = start_idx;
-	uint64_t desc_addr;
-	uint32_t mbuf_offset, mbuf_avail;
-	uint32_t desc_offset, desc_avail;
-	uint32_t cpy_len;
-	uint16_t desc_idx, used_idx;
-
-	if (unlikely(m == NULL))
-		return 0;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
-		dev->vid, cur_idx, end_idx);
-
-	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
-		return 0;
-
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-	virtio_hdr.num_buffers = end_idx - start_idx;
-	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-		dev->vid, virtio_hdr.num_buffers);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-	desc_offset = dev->vhost_hlen;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current desc buf, get the next one */
-		if (desc_avail == 0) {
-			desc_idx = buf_vec[vec_idx].desc_idx;
-
-			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
-				/* Update used ring with desc information */
-				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id  = desc_idx;
-				vq->used->ring[used_idx].len = desc_offset;
-				vhost_log_used_vring(dev, vq,
+done:
+	if (likely(vq->shadow_used_idx > 0)) {
+		if (used_idx_static + vq->shadow_used_idx < vq->size) {
+			rte_memcpy(&vq->used->ring[used_idx_static],
+					&vq->shadow_used_ring[0],
+					vq->shadow_used_idx *
+					sizeof(struct vring_used_elem));
+			vhost_log_used_vring(dev, vq,
 					offsetof(struct vring_used,
-						 ring[used_idx]),
-					sizeof(vq->used->ring[used_idx]));
-			}
-
-			vec_idx++;
-			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-			if (unlikely(!desc_addr))
-				return 0;
-
-			/* Prefetch buffer address. */
-			rte_prefetch0((void *)(uintptr_t)desc_addr);
-			desc_offset = 0;
-			desc_avail  = buf_vec[vec_idx].buf_len;
-		}
-
-		/* done with current mbuf, get the next one */
-		if (mbuf_avail == 0) {
-			m = m->next;
+						ring[used_idx_static]),
+					vq->shadow_used_idx *
+					sizeof(struct vring_used_elem));
+		} else {
+			uint32_t part_1 = vq->size - used_idx_static;
+			uint32_t part_2 = vq->shadow_used_idx - part_1;
 
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
+			rte_memcpy(&vq->used->ring[used_idx_static],
+					&vq->shadow_used_ring[0],
+					part_1 *
+					sizeof(struct vring_used_elem));
+			vhost_log_used_vring(dev, vq,
+					offsetof(struct vring_used,
+						ring[used_idx_static]),
+					part_1 *
+					sizeof(struct vring_used_elem));
+			rte_memcpy(&vq->used->ring[0],
+					&vq->shadow_used_ring[part_1],
+					part_2 *
+					sizeof(struct vring_used_elem));
+			vhost_log_used_vring(dev, vq,
+					offsetof(struct vring_used,
+						ring[0]),
+					part_2 *
+					sizeof(struct vring_used_elem));
 		}
-
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
-			cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
 	}
 
-	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-	vq->used->ring[used_idx].len = desc_offset;
+	rte_smp_wmb();
+	vq->used->idx = vq->last_used_idx;
 	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, ring[used_idx]),
-		sizeof(vq->used->ring[used_idx]));
-
-	return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint32_t pkt_idx = 0, nr_used = 0;
-	uint16_t end;
-	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
-		return 0;
-	}
-
-	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
-		return 0;
-
-	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
-	if (count == 0)
-		return 0;
-
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
-		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
-							 &end, buf_vec) < 0)) {
-			LOG_DEBUG(VHOST_DATA,
-				"(%d) failed to get enough desc from vring\n",
-				dev->vid);
-			break;
-		}
-
-		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
-						      pkts[pkt_idx], buf_vec);
-		rte_smp_wmb();
-
-		*(volatile uint16_t *)&vq->used->idx += nr_used;
-		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+			offsetof(struct vring_used, idx),
 			sizeof(vq->used->idx));
-		vq->last_used_idx += nr_used;
-	}
-
-	if (likely(pkt_idx)) {
-		/* flush used->idx update before we read avail->flags. */
-		rte_mb();
-
-		/* Kick the guest if necessary. */
+	rte_mb();
+	if (likely(pkt_sent)) {
 		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 				&& (vq->callfd >= 0))
 			eventfd_write(vq->callfd, (eventfd_t)1);
 	}
 
-	return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint16_t count)
-{
-	struct virtio_net *dev = get_device(vid);
+	return pkt_sent;
 
-	if (!dev)
-		return 0;
+rollback:
+	if (is_mrg_rxbuf || is_virtio_1)
+		vq->last_used_idx -= virtio_hdr->num_buffers - 1;
 
-	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
-		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
-	else
-		return virtio_dev_rx(dev, queue_id, pkts, count);
+	goto done;
 }
 
 static void
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1785695..87d09fa 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -152,10 +152,14 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 free_device(struct virtio_net *dev)
 {
+	struct vhost_virtqueue *vq;
 	uint32_t i;
 
-	for (i = 0; i < dev->virt_qp_nb; i++)
-		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+	for (i = 0; i < dev->virt_qp_nb; i++) {
+		vq = dev->virtqueue[i * VIRTIO_QNUM];
+		rte_free(vq->shadow_used_ring);
+		rte_free(vq);
+	}
 
 	rte_free(dev);
 }
@@ -418,13 +422,18 @@ int
 vhost_set_vring_num(int vid, struct vhost_vring_state *state)
 {
 	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
 
 	dev = get_device(vid);
 	if (dev == NULL)
 		return -1;
 
 	/* State->index refers to the queue index. The txq is 1, rxq is 0. */
-	dev->virtqueue[state->index]->size = state->num;
+	vq = dev->virtqueue[state->index];
+	vq->size = state->num;
+	vq->shadow_used_ring = rte_malloc("",
+			vq->size * sizeof(struct vring_used_elem),
+			RTE_CACHE_LINE_SIZE);
 
 	return 0;
 }
-- 
2.7.4


* Re: [PATCH] optimize vhost enqueue
  2016-08-16  3:50 [PATCH] optimize vhost enqueue Zhihong Wang
@ 2016-08-16 13:59 ` Maxime Coquelin
  2016-08-17  1:45   ` Wang, Zhihong
  2016-08-18  6:33 ` [PATCH v2 0/6] vhost: optimize enqueue Zhihong Wang
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 141+ messages in thread
From: Maxime Coquelin @ 2016-08-16 13:59 UTC (permalink / raw)
  To: Zhihong Wang, dev

Hi Zhihong,

On 08/16/2016 05:50 AM, Zhihong Wang wrote:
> This patch optimizes the vhost enqueue function: rte_vhost_enqueue_burst.
>
> Currently there're 2 callbacks for vhost enqueue:
>  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
>  *  virtio_dev_rx for mrg_rxbuf turned off cases.
>
> The virtio_dev_merge_rx doesn't provide optimal performance, also it is
> reported having compatibility issue working with Windows VMs.
Could you tell us more please about this compatibility issue?

>
> Besides, having 2 separated functions increases maintenance efforts.
>
> This patch uses a single function logic to replace the current 2 for
> better maintainability, and provides better performance by optimizing
> caching behavior especially for mrg_rxbuf turned on cases.
Do you have some benchmark comparison before and after your change?

Also, for maintainability, I would suggest that the enqueue
function be split, because vhost_enqueue_burst becomes very long (220
LoC) and the max level of indentation is too high (6).

It makes the code hard to understand, and makes it easy to miss bugs during
review and maintenance.

>
> It also fixes the issue working with Windows VMs.
Ideally, the fix should be sent separately, before the rework.
Indeed, we might want to have the fix in the stable branch, without
picking the optimization.

>
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---
>  lib/librte_vhost/vhost-net.h  |   6 +-
>  lib/librte_vhost/vhost_rxtx.c | 582 ++++++++++++++----------------------------
>  lib/librte_vhost/virtio-net.c |  15 +-
>  3 files changed, 208 insertions(+), 395 deletions(-)
582 lines changed is a huge patch.
If possible, it would be better to split it into incremental changes,
making the review process easier.

Also, for v2, please prefix the commit title with "vhost:".

Thanks for your contribution, I'm looking forward for the v2.
- Maxime


* Re: [PATCH] optimize vhost enqueue
  2016-08-16 13:59 ` Maxime Coquelin
@ 2016-08-17  1:45   ` Wang, Zhihong
  2016-08-17  2:38     ` Yuanhan Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-17  1:45 UTC (permalink / raw)
  To: Maxime Coquelin, dev



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Tuesday, August 16, 2016 10:00 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
> 
> Hi Zhihong,
> 
> On 08/16/2016 05:50 AM, Zhihong Wang wrote:
> > This patch optimizes the vhost enqueue function: rte_vhost_enqueue_burst.
> >
> > Currently there're 2 callbacks for vhost enqueue:
> >  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
> >  *  virtio_dev_rx for mrg_rxbuf turned off cases.
> >
> > The virtio_dev_merge_rx doesn't provide optimal performance, also it is
> > reported having compatibility issue working with Windows VMs.
> Could you tell us more please about this compatibility issue?


For example, when you have testpmd in the host and a Windows VM as the guest,
with mrg_rxbuf turned on, the guest will hang once there are packets enqueued
by virtio_dev_merge_rx.

Let me know if you see the same issue.


> 
> >
> > Besides, having 2 separated functions increases maintenance efforts.
> >
> > This patch uses a single function logic to replace the current 2 for
> > better maintainability, and provides better performance by optimizing
> > caching behavior especially for mrg_rxbuf turned on cases.
> Do you have some benchmark comparison before and after your change?
> 
> Also, for maintainability, I would suggest the that the enqueue
> function be split. Because vhost_enqueue_burst becomes very long (220
> LoC), and max level of indentation is too high (6).
> 
> It makes the code hard to understand, and prone to miss bugs during
> review and maintenance.


This is something I've thought about while writing the code, the reason I
keep it as one function body is that:

 1. This function is very performance sensitive, and we need full control of
    code ordering (You can compare with the current performance with the
    mrg_rxbuf feature turned on to see the difference).

 2. I somehow find that a single-function logic makes it easier to understand;
    surely I can add comments to make it easier to read.

Please let me know if you still insist, we can discuss more on it.


> 
> >
> > It also fixes the issue working with Windows VMs.
> Ideally, the fix should be sent separately, before the rework.
> Indeed, we might want to have the fix in the stable branch, without
> picking the optimization.
> 
> >
> > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > ---
> >  lib/librte_vhost/vhost-net.h  |   6 +-
> >  lib/librte_vhost/vhost_rxtx.c | 582 ++++++++++++++----------------------------
> >  lib/librte_vhost/virtio-net.c |  15 +-
> >  3 files changed, 208 insertions(+), 395 deletions(-)
> 582 lines changed is a huge patch.
> If possible, it would be better splitting it in incremental changes,
> making the review process easier.


It looks like a huge patch, but it simply deletes the current implementation
and adds the new code. I think perhaps we can split it into 2: the 1st one to
replace just rte_vhost_enqueue_burst, the 2nd one to delete all the obsolete
functions. It should make the patch clearer, what do you think?  :)


> 
> Also, for v2, please prefix the commit title with "vhost:".

Thanks for the hint! Will do.

> 
> Thanks for your contribution, I'm looking forward for the v2.
> - Maxime


* Re: [PATCH] optimize vhost enqueue
  2016-08-17  1:45   ` Wang, Zhihong
@ 2016-08-17  2:38     ` Yuanhan Liu
  2016-08-17  6:41       ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-08-17  2:38 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Maxime Coquelin, dev

On Wed, Aug 17, 2016 at 01:45:26AM +0000, Wang, Zhihong wrote:
> 
> 
> > -----Original Message-----
> > From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> > Sent: Tuesday, August 16, 2016 10:00 PM
> > To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
> > 
> > Hi Zhihong,
> > 
> > On 08/16/2016 05:50 AM, Zhihong Wang wrote:
> > > This patch optimizes the vhost enqueue function: rte_vhost_enqueue_burst.
> > >
> > > Currently there're 2 callbacks for vhost enqueue:
> > >  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
> > >  *  virtio_dev_rx for mrg_rxbuf turned off cases.
> > >
> > > The virtio_dev_merge_rx doesn't provide optimal performance, also it is
> > > reported having compatibility issue working with Windows VMs.
> > Could you tell us more please about this compatibility issue?
> 
> 
> For example, when you have testpmd in the host and Window VM as the guest,
> with mrg_rxbuf turned on, the guest will hang once there's packets enqueued
> by virtio_dev_merge_rx.

You should put it into commit log.

> Let me know if you see the same issue.
> 
> 
> > 
> > >
> > > Besides, having 2 separated functions increases maintenance efforts.
> > >
> > > This patch uses a single function logic to replace the current 2 for
> > > better maintainability, and provides better performance by optimizing
> > > caching behavior especially for mrg_rxbuf turned on cases.

Here, this sounds like two parts to me:

- one to unite mergeable and non-mergeable Rx

- another one to optimize the mergeable path

That means you should do it in two patches, so that we can clearly understand
which change brings the performance boost. It also helps review.

> > Do you have some benchmark comparison before and after your change?
> > 
> > Also, for maintainability, I would suggest the that the enqueue
> > function be split. Because vhost_enqueue_burst becomes very long (220
> > LoC), and max level of indentation is too high (6).
> > 
> > It makes the code hard to understand, and prone to miss bugs during
> > review and maintenance.

Agreed.

> 
> This is something I've thought about while writing the code, the reason I
> keep it as one function body is that:
> 
>  1. This function is very performance sensitive, and we need full control of
>     code ordering (You can compare with the current performance with the
>     mrg_rxbuf feature turned on to see the difference).

Will inline functions help?

>  2. I somehow find that a single function logic makes it easier to understand,
>     surely I can add comments to make it easiler to read for .
> 
> Please let me know if you still insist, we can discuss more on it.

I am personally not a fan of huge functions; I would try hard to avoid
too many levels of indentation as well.

> 
> > 
> > >
> > > It also fixes the issue working with Windows VMs.
> > Ideally, the fix should be sent separately, before the rework.
> > Indeed, we might want to have the fix in the stable branch, without
> > picking the optimization.

Agreed.

> > 
> > >
> > > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > > ---
> > >  lib/librte_vhost/vhost-net.h  |   6 +-
> > >  lib/librte_vhost/vhost_rxtx.c | 582 ++++++++++++++----------------------------
> > >  lib/librte_vhost/virtio-net.c |  15 +-
> > >  3 files changed, 208 insertions(+), 395 deletions(-)
> > 582 lines changed is a huge patch.
> > If possible, it would be better splitting it in incremental changes,
> > making the review process easier.
> 
> 
> It looks like a huge patch, but it simply deletes the current implementation
> and add the new code. I think perhaps split it into 2, 1st one to replace
> just the rte_vhost_enqueue_burst, 2nd one to delete all the obsolete functions.
> It should make the patch clear, how do you think?  :)

Nope, it's not working in that way. It should be:

- one patch to fix the hang issue for windows guest

  Please cc it to stable@dpdk.org as well so that we could pick it for
  v16.07 stable release.

- one patch to unite the two different Rx code path

- another patch to optimize mergeable code path

	--yliu


* Re: [PATCH] optimize vhost enqueue
  2016-08-17  2:38     ` Yuanhan Liu
@ 2016-08-17  6:41       ` Wang, Zhihong
  2016-08-17  9:17         ` Maxime Coquelin
  0 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-17  6:41 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: Maxime Coquelin, dev



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Wednesday, August 17, 2016 10:38 AM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
> 
> On Wed, Aug 17, 2016 at 01:45:26AM +0000, Wang, Zhihong wrote:
> >
> >
> > > -----Original Message-----
> > > From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> > > Sent: Tuesday, August 16, 2016 10:00 PM
> > > To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
> > >
> > > Hi Zhihong,
> > >
> > > On 08/16/2016 05:50 AM, Zhihong Wang wrote:
> > > > This patch optimizes the vhost enqueue function:
> rte_vhost_enqueue_burst.
> > > >
> > > > Currently there're 2 callbacks for vhost enqueue:
> > > >  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
> > > >  *  virtio_dev_rx for mrg_rxbuf turned off cases.
> > > >
> > > > The virtio_dev_merge_rx doesn't provide optimal performance, also it is
> > > > reported having compatibility issue working with Windows VMs.
> > > Could you tell us more please about this compatibility issue?
> >
> >
> > For example, when you have testpmd in the host and Window VM as the
> guest,
> > with mrg_rxbuf turned on, the guest will hang once there's packets enqueued
> > by virtio_dev_merge_rx.
> 
> You should put it into commit log.


Okay.


> 
> > Let me know if you see the same issue.
> >
> >
> > >
> > > >
> > > > Besides, having 2 separated functions increases maintenance efforts.
> > > >
> > > > This patch uses a single function logic to replace the current 2 for
> > > > better maintainability, and provides better performance by optimizing
> > > > caching behavior especially for mrg_rxbuf turned on cases.
> 
> Here, here sounds two parts to me:
> 
> - one to unite mergeable and non-mergeable Rx
> 
> - another one to optimize the mergeable path
> 
> That means you should do it in two patches, with that we can have clear
> understanding what changes the performance boost. It also helps review.


Please see explanation below.


> 
> > > Do you have some benchmark comparison before and after your change?
> > >
> > > Also, for maintainability, I would suggest the that the enqueue
> > > function be split. Because vhost_enqueue_burst becomes very long (220
> > > LoC), and max level of indentation is too high (6).
> > >
> > > It makes the code hard to understand, and prone to miss bugs during
> > > review and maintenance.
> 
> Agreed.
> 
> >
> > This is something I've thought about while writing the code, the reason I
> > keep it as one function body is that:
> >
> >  1. This function is very performance sensitive, and we need full control of
> >     code ordering (You can compare with the current performance with the
> >     mrg_rxbuf feature turned on to see the difference).
> 
> Will inline functions help?


The optimization in this patch actually reorganizes the logic of the code,
so it's not suitable for splitting into separate functions.

I'll explain this in v2.


> 
> >  2. I somehow find that a single function logic makes it easier to understand,
> >     surely I can add comments to make it easiler to read for .
> >
> > Please let me know if you still insist, we can discuss more on it.
> 
> I am personally not a fan of huge function; I would try hard to avoid
> too many levels of indentation as well.
> 
> >
> > >
> > > >
> > > > It also fixes the issue working with Windows VMs.
> > > Ideally, the fix should be sent separately, before the rework.
> > > Indeed, we might want to have the fix in the stable branch, without
> > > picking the optimization.
> 
> Agreed.


The fact is that I don't have much time to debug the current code,
since it's messy and I don't have the Windows virtio code or the debugging
environment.

This patch doesn't try to fix this issue directly; it rewrites the logic
entirely, and that somehow fixes the issue.

Do you think integrating this whole patch into the stable branch will work?
Personally I think it makes more sense.


> 
> > >
> > > >
> > > > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > > > ---
> > > >  lib/librte_vhost/vhost-net.h  |   6 +-
> > > >  lib/librte_vhost/vhost_rxtx.c | 582
> ++++++++++++++----------------------------
> > > >  lib/librte_vhost/virtio-net.c |  15 +-
> > > >  3 files changed, 208 insertions(+), 395 deletions(-)
> > > 582 lines changed is a huge patch.
> > > If possible, it would be better splitting it in incremental changes,
> > > making the review process easier.
> >
> >
> > It looks like a huge patch, but it simply deletes the current implementation
> > and add the new code. I think perhaps split it into 2, 1st one to replace
> > just the rte_vhost_enqueue_burst, 2nd one to delete all the obsolete
> functions.
> > It should make the patch clear, how do you think?  :)
> 
> Nope, it's not working in that way. It should be:
> 
> - one patch to fix the hang issue for windows guest
> 
>   Please cc it to stable@dpdk.org as well so that we could pick it for
>   v16.07 stable release.
> 
> - one patch to unite the two different Rx code path
> 
> - another patch to optimize mergeable code path


I can separate the optimization from the basic code in v2. However, as I
explained, this patch is built from scratch and doesn't take anything from the
existing code, so there's no way to transform the existing code incrementally
into the new code.


> 
> 	--yliu


* Re: [PATCH] optimize vhost enqueue
  2016-08-17  6:41       ` Wang, Zhihong
@ 2016-08-17  9:17         ` Maxime Coquelin
  2016-08-17  9:51           ` Yuanhan Liu
  2016-08-17 10:07           ` Wang, Zhihong
  0 siblings, 2 replies; 141+ messages in thread
From: Maxime Coquelin @ 2016-08-17  9:17 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu; +Cc: dev



On 08/17/2016 08:41 AM, Wang, Zhihong wrote:
>
>
>> -----Original Message-----
>> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
>> Sent: Wednesday, August 17, 2016 10:38 AM
>> To: Wang, Zhihong <zhihong.wang@intel.com>
>> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
>>
>> On Wed, Aug 17, 2016 at 01:45:26AM +0000, Wang, Zhihong wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>>> Sent: Tuesday, August 16, 2016 10:00 PM
>>>> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
>>>> Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
>>>>
>>>> Hi Zhihong,
>>>>
>>>> On 08/16/2016 05:50 AM, Zhihong Wang wrote:
>>>>> This patch optimizes the vhost enqueue function:
>> rte_vhost_enqueue_burst.
>>>>>
>>>>> Currently there're 2 callbacks for vhost enqueue:
>>>>>  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
>>>>>  *  virtio_dev_rx for mrg_rxbuf turned off cases.
>>>>>
>>>>> The virtio_dev_merge_rx doesn't provide optimal performance, also it is
>>>>> reported having compatibility issue working with Windows VMs.
>>>> Could you tell us more please about this compatibility issue?
>>>
>>>
>>> For example, when you have testpmd in the host and Window VM as the
>> guest,
>>> with mrg_rxbuf turned on, the guest will hang once there's packets enqueued
>>> by virtio_dev_merge_rx.
>>
>> You should put it into commit log.
>
>
> Okay.
>
>
>>
>>> Let me know if you see the same issue.
>>>
>>>
>>>>
>>>>>
>>>>> Besides, having 2 separated functions increases maintenance efforts.
>>>>>
>>>>> This patch uses a single function logic to replace the current 2 for
>>>>> better maintainability, and provides better performance by optimizing
>>>>> caching behavior especially for mrg_rxbuf turned on cases.
>>
>> Here, here sounds two parts to me:
>>
>> - one to unite mergeable and non-mergeable Rx
>>
>> - another one to optimize the mergeable path
>>
>> That means you should do it in two patches, with that we can have clear
>> understanding what changes the performance boost. It also helps review.
>
>
> Please see explanation below.
>
>
>>
>>>> Do you have some benchmark comparison before and after your change?
>>>>
>>>> Also, for maintainability, I would suggest the that the enqueue
>>>> function be split. Because vhost_enqueue_burst becomes very long (220
>>>> LoC), and max level of indentation is too high (6).
>>>>
>>>> It makes the code hard to understand, and prone to miss bugs during
>>>> review and maintenance.
>>
>> Agreed.
>>
>>>
>>> This is something I've thought about while writing the code, the reason I
>>> keep it as one function body is that:
>>>
>>>  1. This function is very performance sensitive, and we need full control of
>>>     code ordering (You can compare with the current performance with the
>>>     mrg_rxbuf feature turned on to see the difference).
>>
>> Will inline functions help?
>
>
> Optimization in this patch actually reorganizes the code from its logic,
> so it's not suitable for making separated functions.
>
> I'll explain this in v2.

I agree with Yuanhan.
Inline functions should not break the optimizations.
IMHO, this is mandatory for the patch to be accepted.
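
As a rough illustration (not from the patch; names below are made up), a
helper declared always_inline is folded back into its caller by GCC, so
splitting the big loop into such helpers should leave the generated code,
and thus the carefully ordered fast path, essentially unchanged:

#include <stdint.h>
#include <string.h>

/* hypothetical helper split out of the enqueue loop */
static inline void __attribute__((always_inline))
copy_to_desc(uint8_t *desc_buf, const uint8_t *mbuf_data, uint32_t len)
{
	memcpy(desc_buf, mbuf_data, len);
}

/* after inlining, the caller still compiles to one flat instruction stream */
void
enqueue_chunks(uint8_t *desc_buf, const uint8_t *mbuf_data,
		uint32_t total, uint32_t chunk)
{
	uint32_t off;

	for (off = 0; off + chunk <= total; off += chunk)
		copy_to_desc(desc_buf + off, mbuf_data + off, chunk);
}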

>
>
>>
>>>  2. I somehow find that a single function logic makes it easier to understand,
>>>     surely I can add comments to make it easiler to read for .
>>>
>>> Please let me know if you still insist, we can discuss more on it.
>>
>> I am personally not a fan of huge function; I would try hard to avoid
>> too many levels of indentation as well.
>>
>>>
>>>>
>>>>>
>>>>> It also fixes the issue working with Windows VMs.
>>>> Ideally, the fix should be sent separately, before the rework.
>>>> Indeed, we might want to have the fix in the stable branch, without
>>>> picking the optimization.
>>
>> Agreed.
>
>
> The fact is that I don't have much time to debug with the current code
> since it's messy and I don't have Windows virtio code and the debugging
> environment.

It seems you are not the only one facing the issue:
https://github.com/YanVugenfirer/kvm-guest-drivers-windows/issues/70

So a dedicated fix is really important.

> This patch doesn't try to fix this issue, it rewrites the logic totally,
> and somehow fixes this issue.
>
> Do you think integrating this whole patch into the stable branch will work?
> Personally I think it makes more sense.

No.
We don't even know why/how it fixes the Windows issue, which would be
the first thing to understand before integrating a fix in stable branch.

And the stable branch is not meant for integrating such big reworks;
it is only meant to fix bugs.

The risk of regressions has to be avoided as much as possible.

>
>
>>
>>>>
>>>>>
>>>>> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
>>>>> ---
>>>>>  lib/librte_vhost/vhost-net.h  |   6 +-
>>>>>  lib/librte_vhost/vhost_rxtx.c | 582
>> ++++++++++++++----------------------------
>>>>>  lib/librte_vhost/virtio-net.c |  15 +-
>>>>>  3 files changed, 208 insertions(+), 395 deletions(-)
>>>> 582 lines changed is a huge patch.
>>>> If possible, it would be better splitting it in incremental changes,
>>>> making the review process easier.
>>>
>>>
>>> It looks like a huge patch, but it simply deletes the current implementation
>>> and add the new code. I think perhaps split it into 2, 1st one to replace
>>> just the rte_vhost_enqueue_burst, 2nd one to delete all the obsolete
>> functions.
>>> It should make the patch clear, how do you think?  :)
>>
>> Nope, it's not working in that way. It should be:
>>
>> - one patch to fix the hang issue for windows guest
>>
>>   Please cc it to stable@dpdk.org as well so that we could pick it for
>>   v16.07 stable release.
>>
>> - one patch to unite the two different Rx code path
>>
>> - another patch to optimize mergeable code path
>
>
> I can separate optimization from the basic code in v2, however as I explained
> this patch is built from scratch and doesn't take anything from the existing
> code, so there's no way to transform from the existing code incrementally into
> the new code.
>
>
>>
>> 	--yliu


* Re: [PATCH] optimize vhost enqueue
  2016-08-17  9:17         ` Maxime Coquelin
@ 2016-08-17  9:51           ` Yuanhan Liu
  2016-08-18 13:44             ` Wang, Zhihong
  2016-08-17 10:07           ` Wang, Zhihong
  1 sibling, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-08-17  9:51 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: Wang, Zhihong, dev

On Wed, Aug 17, 2016 at 11:17:46AM +0200, Maxime Coquelin wrote:
> >>>This is something I've thought about while writing the code, the reason I
> >>>keep it as one function body is that:
> >>>
> >>> 1. This function is very performance sensitive, and we need full control of
> >>>    code ordering (You can compare with the current performance with the
> >>>    mrg_rxbuf feature turned on to see the difference).
> >>
> >>Will inline functions help?
> >
> >
> >Optimization in this patch actually reorganizes the code from its logic,
> >so it's not suitable for making separated functions.
> >
> >I'll explain this in v2.
> 
> I agree with Yuanhan.
> Inline functions should not break the optimizations.
> IMHO, this is mandatory for the patch to be accepted.

Yes.

> It seems you are not the only one facing the issue:
> https://github.com/YanVugenfirer/kvm-guest-drivers-windows/issues/70
> 
> So a dedicated fix is really important.

Yes.

> 
> >This patch doesn't try to fix this issue, it rewrites the logic totally,
> >and somehow fixes this issue.
> >
> >Do you think integrating this whole patch into the stable branch will work?
> >Personally I think it makes more sense.
> 
> No.
> We don't even know why/how it fixes the Windows issue, which would be
> the first thing to understand before integrating a fix in stable branch.

Yes.

> 
> And the stable branch is not meant for integrating such big reworks,
> it is only meant to fix bugs.

Yes.

> The risk of regressions have to be avoided as much as possible.

Yes.

	--yliu


* Re: [PATCH] optimize vhost enqueue
  2016-08-17  9:17         ` Maxime Coquelin
  2016-08-17  9:51           ` Yuanhan Liu
@ 2016-08-17 10:07           ` Wang, Zhihong
  1 sibling, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-17 10:07 UTC (permalink / raw)
  To: Maxime Coquelin, Yuanhan Liu; +Cc: dev



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Wednesday, August 17, 2016 5:18 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> <yuanhan.liu@linux.intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
> 
> 
> 
> On 08/17/2016 08:41 AM, Wang, Zhihong wrote:
> >
> >
> >> -----Original Message-----
> >> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> >> Sent: Wednesday, August 17, 2016 10:38 AM
> >> To: Wang, Zhihong <zhihong.wang@intel.com>
> >> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
> >>
> >> On Wed, Aug 17, 2016 at 01:45:26AM +0000, Wang, Zhihong wrote:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >>>> Sent: Tuesday, August 16, 2016 10:00 PM
> >>>> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> >>>> Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
> >>>>
> >>>> Hi Zhihong,
> >>>>
> >>>> On 08/16/2016 05:50 AM, Zhihong Wang wrote:
> >>>>> This patch optimizes the vhost enqueue function:
> >> rte_vhost_enqueue_burst.
> >>>>>
> >>>>> Currently there're 2 callbacks for vhost enqueue:
> >>>>>  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
> >>>>>  *  virtio_dev_rx for mrg_rxbuf turned off cases.
> >>>>>
> >>>>> The virtio_dev_merge_rx doesn't provide optimal performance, also it is
> >>>>> reported having compatibility issue working with Windows VMs.
> >>>> Could you tell us more please about this compatibility issue?
> >>>
> >>>
> >>> For example, when you have testpmd in the host and Window VM as the
> >> guest,
> >>> with mrg_rxbuf turned on, the guest will hang once there's packets
> enqueued
> >>> by virtio_dev_merge_rx.
> >>
> >> You should put it into commit log.
> >
> >
> > Okay.
> >
> >
> >>
> >>> Let me know if you see the same issue.
> >>>
> >>>
> >>>>
> >>>>>
> >>>>> Besides, having 2 separated functions increases maintenance efforts.
> >>>>>
> >>>>> This patch uses a single function logic to replace the current 2 for
> >>>>> better maintainability, and provides better performance by optimizing
> >>>>> caching behavior especially for mrg_rxbuf turned on cases.
> >>
> >> Here, here sounds two parts to me:
> >>
> >> - one to unite mergeable and non-mergeable Rx
> >>
> >> - another one to optimize the mergeable path
> >>
> >> That means you should do it in two patches, with that we can have clear
> >> understanding what changes the performance boost. It also helps review.
> >
> >
> > Please see explanation below.
> >
> >
> >>
> >>>> Do you have some benchmark comparison before and after your change?
> >>>>
> >>>> Also, for maintainability, I would suggest the that the enqueue
> >>>> function be split. Because vhost_enqueue_burst becomes very long (220
> >>>> LoC), and max level of indentation is too high (6).
> >>>>
> >>>> It makes the code hard to understand, and prone to miss bugs during
> >>>> review and maintenance.
> >>
> >> Agreed.
> >>
> >>>
> >>> This is something I've thought about while writing the code, the reason I
> >>> keep it as one function body is that:
> >>>
> >>>  1. This function is very performance sensitive, and we need full control of
> >>>     code ordering (You can compare with the current performance with the
> >>>     mrg_rxbuf feature turned on to see the difference).
> >>
> >> Will inline functions help?
> >
> >
> > Optimization in this patch actually reorganizes the code from its logic,
> > so it's not suitable for making separated functions.
> >
> > I'll explain this in v2.
> 
> I agree with Yuanhan.
> Inline functions should not break the optimizations.
> IMHO, this is mandatory for the patch to be accepted.


Excellent!


> 
> >
> >
> >>
> >>>  2. I somehow find that a single function logic makes it easier to understand,
> >>>     surely I can add comments to make it easiler to read for .
> >>>
> >>> Please let me know if you still insist, we can discuss more on it.
> >>
> >> I am personally not a fan of huge function; I would try hard to avoid
> >> too many levels of indentation as well.
> >>
> >>>
> >>>>
> >>>>>
> >>>>> It also fixes the issue working with Windows VMs.
> >>>> Ideally, the fix should be sent separately, before the rework.
> >>>> Indeed, we might want to have the fix in the stable branch, without
> >>>> picking the optimization.
> >>
> >> Agreed.
> >
> >
> > The fact is that I don't have much time to debug with the current code
> > since it's messy and I don't have Windows virtio code and the debugging
> > environment.
> 
> It seems you are not the only one facing the issue:
> https://github.com/YanVugenfirer/kvm-guest-drivers-windows/issues/70
> 
> So a dedicated fix is really important.


Yeah that's me raising this issue there.

But I think it's another standalone task to identify the root cause and
find the fix for the existing code.


> 
> > This patch doesn't try to fix this issue, it rewrites the logic totally,
> > and somehow fixes this issue.
> >
> > Do you think integrating this whole patch into the stable branch will work?
> > Personally I think it makes more sense.
> 
> No.
> We don't even know why/how it fixes the Windows issue, which would be
> the first thing to understand before integrating a fix in stable branch.
> 
> And the stable branch is not meant for integrating such big reworks,
> it is only meant to fix bugs.
> 
> The risk of regressions have to be avoided as much as possible.
> 
> >
> >
> >>
> >>>>
> >>>>>
> >>>>> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> >>>>> ---
> >>>>>  lib/librte_vhost/vhost-net.h  |   6 +-
> >>>>>  lib/librte_vhost/vhost_rxtx.c | 582
> >> ++++++++++++++----------------------------
> >>>>>  lib/librte_vhost/virtio-net.c |  15 +-
> >>>>>  3 files changed, 208 insertions(+), 395 deletions(-)
> >>>> 582 lines changed is a huge patch.
> >>>> If possible, it would be better splitting it in incremental changes,
> >>>> making the review process easier.
> >>>
> >>>
> >>> It looks like a huge patch, but it simply deletes the current implementation
> >>> and add the new code. I think perhaps split it into 2, 1st one to replace
> >>> just the rte_vhost_enqueue_burst, 2nd one to delete all the obsolete
> >> functions.
> >>> It should make the patch clear, how do you think?  :)
> >>
> >> Nope, it's not working in that way. It should be:
> >>
> >> - one patch to fix the hang issue for windows guest
> >>
> >>   Please cc it to stable@dpdk.org as well so that we could pick it for
> >>   v16.07 stable release.
> >>
> >> - one patch to unite the two different Rx code path
> >>
> >> - another patch to optimize mergeable code path
> >
> >
> > I can separate optimization from the basic code in v2, however as I explained
> > this patch is built from scratch and doesn't take anything from the existing
> > code, so there's no way to transform from the existing code incrementally into
> > the new code.
> >
> >
> >>
> >> 	--yliu


* [PATCH v2 0/6] vhost: optimize enqueue
  2016-08-16  3:50 [PATCH] optimize vhost enqueue Zhihong Wang
  2016-08-16 13:59 ` Maxime Coquelin
@ 2016-08-18  6:33 ` Zhihong Wang
  2016-08-18  6:33   ` [PATCH v2 1/6] vhost: rewrite enqueue Zhihong Wang
                     ` (5 more replies)
  2016-08-19  5:43 ` [PATCH v3 0/5] vhost: optimize enqueue Zhihong Wang
                   ` (3 subsequent siblings)
  5 siblings, 6 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-18  6:33 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu

This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

 *  For fast frontends (e.g. DPDK virtio pmd), higher performance (maximum
    throughput) can be achieved.

 *  For slow frontends (e.g. kernel virtio-net), better scalability can be
    achieved: each vhost core can support more connections since it takes
    fewer cycles to handle each single frontend.

The main optimization techniques are:

 1. Reorder code to reduce CPU pipeline stall cycles.

 2. Batch update the used ring for better efficiency (see the sketch after
    this list).

 3. Prefetch descriptor to hide cache latency.

 4. Remove useless volatile attribute to allow compiler optimization.
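
A minimal sketch of technique 2, assuming a simplified host-local shadow
array and a plain used ring (the real code works on struct vring_used_elem
and also logs the dirty pages):

#include <stdint.h>
#include <string.h>

struct used_elem { uint32_t id; uint32_t len; };

/* Used entries are first accumulated in a shadow array, then copied into
 * the guest-visible used ring in at most two bursts (the ring may wrap),
 * instead of doing one scattered write per packet. */
static void
flush_shadow_used(struct used_elem *used_ring, uint32_t ring_size,
		uint32_t used_idx, const struct used_elem *shadow,
		uint32_t shadow_count)
{
	uint32_t tail_room = ring_size - used_idx;

	if (shadow_count <= tail_room) {
		memcpy(&used_ring[used_idx], shadow,
			shadow_count * sizeof(*shadow));
	} else {
		memcpy(&used_ring[used_idx], shadow,
			tail_room * sizeof(*shadow));
		memcpy(&used_ring[0], &shadow[tail_room],
			(shadow_count - tail_room) * sizeof(*shadow));
	}
}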

In the existing code there're 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Also, having 2 separate functions increases
maintenance effort.

---
Changes in v2:

 1. Split the big function into several small ones

 2. Use multiple patches to explain each optimization

 3. Add comments

Zhihong Wang (6):
  vhost: rewrite enqueue
  vhost: remove obsolete
  vhost: remove useless volatile
  vhost: add desc prefetch
  vhost: batch update used ring
  vhost: optimize cache access

 lib/librte_vhost/vhost-net.h  |   6 +-
 lib/librte_vhost/vhost_rxtx.c | 582 +++++++++++++++---------------------------
 lib/librte_vhost/virtio-net.c |  15 +-
 3 files changed, 228 insertions(+), 375 deletions(-)

-- 
2.7.4


* [PATCH v2 1/6] vhost: rewrite enqueue
  2016-08-18  6:33 ` [PATCH v2 0/6] vhost: optimize enqueue Zhihong Wang
@ 2016-08-18  6:33   ` Zhihong Wang
  2016-08-19  2:39     ` Yuanhan Liu
  2016-08-18  6:33   ` [PATCH v2 2/6] vhost: remove obsolete Zhihong Wang
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 141+ messages in thread
From: Zhihong Wang @ 2016-08-18  6:33 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, Zhihong Wang

This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 212 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 205 insertions(+), 7 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..8e6d782 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
 	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }
 
-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
 	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -533,19 +533,217 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	return pkt_idx;
 }
 
+static inline uint32_t __attribute__((always_inline))
+loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
+{
+	if (pkt_left == 0 || avail_idx == vq->last_used_idx)
+		return 1;
+
+	return 0;
+}
+
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint16_t avail_idx, struct rte_mbuf *mbuf,
+		uint32_t is_mrg_rxbuf)
+{
+	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
+	struct vring_desc *desc;
+	uint64_t desc_host_write_addr = 0;
+	uint32_t desc_chain_head = 0;
+	uint32_t desc_chain_len = 0;
+	uint32_t desc_current = 0;
+	uint32_t desc_write_offset = 0;
+	uint32_t mbuf_len = 0;
+	uint32_t mbuf_len_left = 0;
+	uint32_t copy_len = 0;
+	uint32_t extra_buffers = 0;
+	uint32_t used_idx_round = 0;
+
+	/* start with the first mbuf of the packet */
+	mbuf_len = rte_pktmbuf_data_len(mbuf);
+	mbuf_len_left = mbuf_len;
+
+	/* get the current desc */
+	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+	desc_chain_head = desc_current;
+	desc = &vq->desc[desc_current];
+	desc_host_write_addr = gpa_to_vva(dev, desc->addr);
+	if (unlikely(!desc_host_write_addr))
+		goto error;
+
+	/* handle virtio header */
+	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)
+		(uintptr_t)desc_host_write_addr;
+	memset((void *)(uintptr_t)&(virtio_hdr->hdr),
+			0, dev->vhost_hlen);
+	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
+	desc_write_offset = dev->vhost_hlen;
+	desc_chain_len = desc_write_offset;
+	desc_host_write_addr += desc_write_offset;
+	if (is_mrg_rxbuf)
+		virtio_hdr->num_buffers = 1;
+
+	/* start copy from mbuf to desc */
+	while (1) {
+		/* get the next mbuf if the current done */
+		if (!mbuf_len_left) {
+			if (mbuf->next) {
+				mbuf = mbuf->next;
+				mbuf_len = rte_pktmbuf_data_len(mbuf);
+				mbuf_len_left = mbuf_len;
+			} else
+				break;
+		}
+
+		/* get the next desc if the current done */
+		if (desc->len <= desc_write_offset) {
+			if (desc->flags & VRING_DESC_F_NEXT) {
+				/* go on with the current desc chain */
+				desc_write_offset = 0;
+				desc_current = desc->next;
+				desc = &vq->desc[desc_current];
+				desc_host_write_addr =
+					gpa_to_vva(dev, desc->addr);
+				if (unlikely(!desc_host_write_addr))
+					goto rollback;
+			} else if (is_mrg_rxbuf) {
+				/* start with the next desc chain */
+				used_idx_round = vq->last_used_idx
+					& (vq->size - 1);
+				vq->used->ring[used_idx_round].id =
+					desc_chain_head;
+				vq->used->ring[used_idx_round].len =
+					desc_chain_len;
+				vhost_log_used_vring(dev, vq,
+					offsetof(struct vring_used,
+						ring[used_idx_round]),
+					sizeof(vq->used->ring[
+						used_idx_round]));
+				vq->last_used_idx++;
+				extra_buffers++;
+				virtio_hdr->num_buffers++;
+				if (avail_idx == vq->last_used_idx)
+					goto rollback;
+
+				desc_current =
+					vq->avail->ring[(vq->last_used_idx) &
+					(vq->size - 1)];
+				desc_chain_head = desc_current;
+				desc = &vq->desc[desc_current];
+				desc_host_write_addr =
+					gpa_to_vva(dev, desc->addr);
+				if (unlikely(!desc_host_write_addr))
+					goto rollback;
+
+				desc_chain_len = 0;
+				desc_write_offset = 0;
+			} else
+				goto rollback;
+		}
+
+		/* copy mbuf data */
+		copy_len = RTE_MIN(desc->len - desc_write_offset,
+				mbuf_len_left);
+		rte_memcpy((void *)(uintptr_t)desc_host_write_addr,
+				rte_pktmbuf_mtod_offset(mbuf, void *,
+					mbuf_len - mbuf_len_left),
+				copy_len);
+		vhost_log_write(dev, desc->addr + desc_write_offset,
+				copy_len);
+		mbuf_len_left -= copy_len;
+		desc_write_offset += copy_len;
+		desc_host_write_addr += copy_len;
+		desc_chain_len += copy_len;
+	}
+
+	used_idx_round = vq->last_used_idx & (vq->size - 1);
+	vq->used->ring[used_idx_round].id = desc_chain_head;
+	vq->used->ring[used_idx_round].len = desc_chain_len;
+	vhost_log_used_vring(dev, vq,
+		offsetof(struct vring_used, ring[used_idx_round]),
+		sizeof(vq->used->ring[used_idx_round]));
+	vq->last_used_idx++;
+
+	return 0;
+
+rollback:
+	/* rollback on any error if last_used_idx update on-the-fly */
+	if (is_mrg_rxbuf)
+		vq->last_used_idx -= extra_buffers;
+
+error:
+	return 1;
+}
+
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	rte_smp_wmb();
+	vq->used->idx = vq->last_used_idx;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+			sizeof(vq->used->idx));
+	rte_mb();
+	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+			&& (vq->callfd >= 0))
+		eventfd_write(vq->callfd, (eventfd_t)1);
+}
+
 uint16_t
 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 	struct rte_mbuf **pkts, uint16_t count)
 {
-	struct virtio_net *dev = get_device(vid);
+	struct vhost_virtqueue *vq;
+	struct virtio_net *dev;
+	uint32_t pkt_idx = 0;
+	uint32_t pkt_left = 0;
+	uint32_t pkt_sent = 0;
+	uint32_t is_mrg_rxbuf = 0;
+	uint16_t avail_idx = 0;
+
+	/* precheck */
+	if (unlikely(count == 0))
+		return 0;
 
-	if (!dev)
+	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
+
+	dev = get_device(vid);
+	if (unlikely(!dev))
 		return 0;
 
-	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
-		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
-	else
-		return virtio_dev_rx(dev, queue_id, pkts, count);
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
+		return 0;
+
+	vq = dev->virtqueue[queue_id];
+	if (unlikely(!vq->enabled))
+		return 0;
+
+	if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+		is_mrg_rxbuf = 1;
+
+	/* start enqueuing packets 1 by 1 */
+	pkt_idx = 0;
+	pkt_left = count;
+	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+	while (1) {
+		if (loop_check(vq, avail_idx, pkt_left))
+			break;
+
+		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+					is_mrg_rxbuf))
+			break;
+
+		pkt_idx++;
+		pkt_sent++;
+		pkt_left--;
+	}
+
+	/* update used idx and kick the guest if necessary */
+	if (pkt_sent)
+		notify_guest(dev, vq);
+
+	return pkt_sent;
 }
 
 static void
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v2 2/6] vhost: remove obsolete
  2016-08-18  6:33 ` [PATCH v2 0/6] vhost: optimize enqueue Zhihong Wang
  2016-08-18  6:33   ` [PATCH v2 1/6] vhost: rewrite enqueue Zhihong Wang
@ 2016-08-18  6:33   ` Zhihong Wang
  2016-08-19  2:32     ` Yuanhan Liu
  2016-08-18  6:33   ` [PATCH v2 3/6] vhost: remove useless volatile Zhihong Wang
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 141+ messages in thread
From: Zhihong Wang @ 2016-08-18  6:33 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, Zhihong Wang

This patch removes obsolete functions.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 408 ------------------------------------------
 1 file changed, 408 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 8e6d782..939957d 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -125,414 +125,6 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 	}
 }
 
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-		    struct virtio_net_hdr_mrg_rxbuf hdr)
-{
-	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-	else
-		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
-}
-
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx)
-{
-	uint32_t desc_avail, desc_offset;
-	uint32_t mbuf_avail, mbuf_offset;
-	uint32_t cpy_len;
-	struct vring_desc *desc;
-	uint64_t desc_addr;
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
-	desc = &vq->desc[desc_idx];
-	desc_addr = gpa_to_vva(dev, desc->addr);
-	/*
-	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-	 * otherwise stores offset on the stack instead of in a register.
-	 */
-	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-		return -1;
-
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-	desc_offset = dev->vhost_hlen;
-	desc_avail  = desc->len - dev->vhost_hlen;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current mbuf, fetch next */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
-		}
-
-		/* done with current desc buf, fetch next */
-		if (desc_avail == 0) {
-			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-				/* Room in vring buffer is not enough */
-				return -1;
-			}
-			if (unlikely(desc->next >= vq->size))
-				return -1;
-
-			desc = &vq->desc[desc->next];
-			desc_addr = gpa_to_vva(dev, desc->addr);
-			if (unlikely(!desc_addr))
-				return -1;
-
-			desc_offset = 0;
-			desc_avail  = desc->len;
-		}
-
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			     cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-	}
-
-	return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
-	      struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint16_t avail_idx, free_entries, start_idx;
-	uint16_t desc_indexes[MAX_PKT_BURST];
-	uint16_t used_idx;
-	uint32_t i;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
-		return 0;
-	}
-
-	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
-		return 0;
-
-	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-	start_idx = vq->last_used_idx;
-	free_entries = avail_idx - start_idx;
-	count = RTE_MIN(count, free_entries);
-	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
-	if (count == 0)
-		return 0;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
-		dev->vid, start_idx, start_idx + count);
-
-	/* Retrieve all of the desc indexes first to avoid caching issues. */
-	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
-	for (i = 0; i < count; i++) {
-		used_idx = (start_idx + i) & (vq->size - 1);
-		desc_indexes[i] = vq->avail->ring[used_idx];
-		vq->used->ring[used_idx].id = desc_indexes[i];
-		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
-					       dev->vhost_hlen;
-		vhost_log_used_vring(dev, vq,
-			offsetof(struct vring_used, ring[used_idx]),
-			sizeof(vq->used->ring[used_idx]));
-	}
-
-	rte_prefetch0(&vq->desc[desc_indexes[0]]);
-	for (i = 0; i < count; i++) {
-		uint16_t desc_idx = desc_indexes[i];
-		int err;
-
-		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
-		if (unlikely(err)) {
-			used_idx = (start_idx + i) & (vq->size - 1);
-			vq->used->ring[used_idx].len = dev->vhost_hlen;
-			vhost_log_used_vring(dev, vq,
-				offsetof(struct vring_used, ring[used_idx]),
-				sizeof(vq->used->ring[used_idx]));
-		}
-
-		if (i + 1 < count)
-			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
-	}
-
-	rte_smp_wmb();
-
-	*(volatile uint16_t *)&vq->used->idx += count;
-	vq->last_used_idx += count;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, idx),
-		sizeof(vq->used->idx));
-
-	/* flush used->idx update before we read avail->flags. */
-	rte_mb();
-
-	/* Kick the guest if necessary. */
-	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-			&& (vq->callfd >= 0))
-		eventfd_write(vq->callfd, (eventfd_t)1);
-	return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-	     uint32_t *allocated, uint32_t *vec_idx,
-	     struct buf_vector *buf_vec)
-{
-	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
-	uint32_t vec_id = *vec_idx;
-	uint32_t len    = *allocated;
-
-	while (1) {
-		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
-			return -1;
-
-		len += vq->desc[idx].len;
-		buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
-		buf_vec[vec_id].buf_len  = vq->desc[idx].len;
-		buf_vec[vec_id].desc_idx = idx;
-		vec_id++;
-
-		if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
-			break;
-
-		idx = vq->desc[idx].next;
-	}
-
-	*allocated = len;
-	*vec_idx   = vec_id;
-
-	return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-			    uint16_t *end, struct buf_vector *buf_vec)
-{
-	uint16_t cur_idx;
-	uint16_t avail_idx;
-	uint32_t allocated = 0;
-	uint32_t vec_idx = 0;
-	uint16_t tries = 0;
-
-	cur_idx  = vq->last_used_idx;
-
-	while (1) {
-		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-		if (unlikely(cur_idx == avail_idx))
-			return -1;
-
-		if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
-					  &vec_idx, buf_vec) < 0))
-			return -1;
-
-		cur_idx++;
-		tries++;
-
-		if (allocated >= size)
-			break;
-
-		/*
-		 * if we tried all available ring items, and still
-		 * can't get enough buf, it means something abnormal
-		 * happened.
-		 */
-		if (unlikely(tries >= vq->size))
-			return -1;
-	}
-
-	*end = cur_idx;
-	return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			    uint16_t end_idx, struct rte_mbuf *m,
-			    struct buf_vector *buf_vec)
-{
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-	uint32_t vec_idx = 0;
-	uint16_t start_idx = vq->last_used_idx;
-	uint16_t cur_idx = start_idx;
-	uint64_t desc_addr;
-	uint32_t mbuf_offset, mbuf_avail;
-	uint32_t desc_offset, desc_avail;
-	uint32_t cpy_len;
-	uint16_t desc_idx, used_idx;
-
-	if (unlikely(m == NULL))
-		return 0;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
-		dev->vid, cur_idx, end_idx);
-
-	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
-		return 0;
-
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-	virtio_hdr.num_buffers = end_idx - start_idx;
-	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-		dev->vid, virtio_hdr.num_buffers);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-	desc_offset = dev->vhost_hlen;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current desc buf, get the next one */
-		if (desc_avail == 0) {
-			desc_idx = buf_vec[vec_idx].desc_idx;
-
-			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
-				/* Update used ring with desc information */
-				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id  = desc_idx;
-				vq->used->ring[used_idx].len = desc_offset;
-				vhost_log_used_vring(dev, vq,
-					offsetof(struct vring_used,
-						 ring[used_idx]),
-					sizeof(vq->used->ring[used_idx]));
-			}
-
-			vec_idx++;
-			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-			if (unlikely(!desc_addr))
-				return 0;
-
-			/* Prefetch buffer address. */
-			rte_prefetch0((void *)(uintptr_t)desc_addr);
-			desc_offset = 0;
-			desc_avail  = buf_vec[vec_idx].buf_len;
-		}
-
-		/* done with current mbuf, get the next one */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
-		}
-
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
-			cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-	}
-
-	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-	vq->used->ring[used_idx].len = desc_offset;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, ring[used_idx]),
-		sizeof(vq->used->ring[used_idx]));
-
-	return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint32_t pkt_idx = 0, nr_used = 0;
-	uint16_t end;
-	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
-		return 0;
-	}
-
-	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
-		return 0;
-
-	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
-	if (count == 0)
-		return 0;
-
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
-		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
-							 &end, buf_vec) < 0)) {
-			LOG_DEBUG(VHOST_DATA,
-				"(%d) failed to get enough desc from vring\n",
-				dev->vid);
-			break;
-		}
-
-		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
-						      pkts[pkt_idx], buf_vec);
-		rte_smp_wmb();
-
-		*(volatile uint16_t *)&vq->used->idx += nr_used;
-		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-			sizeof(vq->used->idx));
-		vq->last_used_idx += nr_used;
-	}
-
-	if (likely(pkt_idx)) {
-		/* flush used->idx update before we read avail->flags. */
-		rte_mb();
-
-		/* Kick the guest if necessary. */
-		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-				&& (vq->callfd >= 0))
-			eventfd_write(vq->callfd, (eventfd_t)1);
-	}
-
-	return pkt_idx;
-}
-
 static inline uint32_t __attribute__((always_inline))
 loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
 {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v2 3/6] vhost: remove useless volatile
  2016-08-18  6:33 ` [PATCH v2 0/6] vhost: optimize enqueue Zhihong Wang
  2016-08-18  6:33   ` [PATCH v2 1/6] vhost: rewrite enqueue Zhihong Wang
  2016-08-18  6:33   ` [PATCH v2 2/6] vhost: remove obsolete Zhihong Wang
@ 2016-08-18  6:33   ` Zhihong Wang
  2016-08-18  6:33   ` [PATCH v2 4/6] vhost: add desc prefetch Zhihong Wang
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-18  6:33 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, Zhihong Wang

This patch removes useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost-net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..51fdf3d 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
 	uint32_t		size;
 
 	/* Last index used on the available ring */
-	volatile uint16_t	last_used_idx;
+	uint16_t		last_used_idx;
 #define VIRTIO_INVALID_EVENTFD		(-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD	(-2)
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v2 4/6] vhost: add desc prefetch
  2016-08-18  6:33 ` [PATCH v2 0/6] vhost: optimize enqueue Zhihong Wang
                     ` (2 preceding siblings ...)
  2016-08-18  6:33   ` [PATCH v2 3/6] vhost: remove useless volatile Zhihong Wang
@ 2016-08-18  6:33   ` Zhihong Wang
  2016-08-18  6:33   ` [PATCH v2 5/6] vhost: batch update used ring Zhihong Wang
  2016-08-18  6:33   ` [PATCH v2 6/6] vhost: optimize cache access Zhihong Wang
  5 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-18  6:33 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, Zhihong Wang

This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 939957d..7db83d0 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -131,6 +131,11 @@ loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
 	if (pkt_left == 0 || avail_idx == vq->last_used_idx)
 		return 1;
 
+	/* prefetch the next desc */
+	if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+		rte_prefetch0(&vq->desc[vq->avail->ring[
+				(vq->last_used_idx + 1) & (vq->size - 1)]]);
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v2 5/6] vhost: batch update used ring
  2016-08-18  6:33 ` [PATCH v2 0/6] vhost: optimize enqueue Zhihong Wang
                     ` (3 preceding siblings ...)
  2016-08-18  6:33   ` [PATCH v2 4/6] vhost: add desc prefetch Zhihong Wang
@ 2016-08-18  6:33   ` Zhihong Wang
  2016-08-18  6:33   ` [PATCH v2 6/6] vhost: optimize cache access Zhihong Wang
  5 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-18  6:33 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, Zhihong Wang

This patch enables batch update of the used ring for better efficiency.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost-net.h  |  4 +++
 lib/librte_vhost/vhost_rxtx.c | 68 +++++++++++++++++++++++++++++++++----------
 lib/librte_vhost/virtio-net.c | 15 ++++++++--
 3 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 51fdf3d..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {
 
 	/* Physical address of used ring, for logging */
 	uint64_t		log_guest_addr;
+
+	/* Shadow used ring for performance */
+	struct vring_used_elem	*shadow_used_ring;
+	uint32_t		shadow_used_idx;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 7db83d0..60d63d3 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -155,7 +155,6 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t mbuf_len_left = 0;
 	uint32_t copy_len = 0;
 	uint32_t extra_buffers = 0;
-	uint32_t used_idx_round = 0;
 
 	/* start with the first mbuf of the packet */
 	mbuf_len = rte_pktmbuf_data_len(mbuf);
@@ -207,17 +206,11 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 					goto rollback;
 			} else if (is_mrg_rxbuf) {
 				/* start with the next desc chain */
-				used_idx_round = vq->last_used_idx
-					& (vq->size - 1);
-				vq->used->ring[used_idx_round].id =
+				vq->shadow_used_ring[vq->shadow_used_idx].id =
 					desc_chain_head;
-				vq->used->ring[used_idx_round].len =
+				vq->shadow_used_ring[vq->shadow_used_idx].len =
 					desc_chain_len;
-				vhost_log_used_vring(dev, vq,
-					offsetof(struct vring_used,
-						ring[used_idx_round]),
-					sizeof(vq->used->ring[
-						used_idx_round]));
+				vq->shadow_used_idx++;
 				vq->last_used_idx++;
 				extra_buffers++;
 				virtio_hdr->num_buffers++;
@@ -255,12 +248,9 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		desc_chain_len += copy_len;
 	}
 
-	used_idx_round = vq->last_used_idx & (vq->size - 1);
-	vq->used->ring[used_idx_round].id = desc_chain_head;
-	vq->used->ring[used_idx_round].len = desc_chain_len;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, ring[used_idx_round]),
-		sizeof(vq->used->ring[used_idx_round]));
+	vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+	vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+	vq->shadow_used_idx++;
 	vq->last_used_idx++;
 
 	return 0;
@@ -275,6 +265,45 @@ error:
 }
 
 static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t used_idx_start)
+{
+	if (used_idx_start + vq->shadow_used_idx < vq->size) {
+		rte_memcpy(&vq->used->ring[used_idx_start],
+				&vq->shadow_used_ring[0],
+				vq->shadow_used_idx *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[used_idx_start]),
+				vq->shadow_used_idx *
+				sizeof(struct vring_used_elem));
+	} else {
+		uint32_t part_1 = vq->size - used_idx_start;
+		uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+		rte_memcpy(&vq->used->ring[used_idx_start],
+				&vq->shadow_used_ring[0],
+				part_1 *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[used_idx_start]),
+				part_1 *
+				sizeof(struct vring_used_elem));
+		rte_memcpy(&vq->used->ring[0],
+				&vq->shadow_used_ring[part_1],
+				part_2 *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[0]),
+				part_2 *
+				sizeof(struct vring_used_elem));
+	}
+}
+
+static inline void __attribute__((always_inline))
 notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
 	rte_smp_wmb();
@@ -293,6 +322,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 {
 	struct vhost_virtqueue *vq;
 	struct virtio_net *dev;
+	uint32_t used_idx_start = 0;
 	uint32_t pkt_idx = 0;
 	uint32_t pkt_left = 0;
 	uint32_t pkt_sent = 0;
@@ -322,6 +352,8 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 	/* start enqueuing packets 1 by 1 */
 	pkt_idx = 0;
 	pkt_left = count;
+	vq->shadow_used_idx = 0;
+	used_idx_start = vq->last_used_idx & (vq->size - 1);
 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
 	while (1) {
 		if (loop_check(vq, avail_idx, pkt_left))
@@ -336,6 +368,10 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 		pkt_left--;
 	}
 
+	/* batch update used ring for better performance */
+	if (likely(vq->shadow_used_idx > 0))
+		update_used_ring(dev, vq, used_idx_start);
+
 	/* update used idx and kick the guest if necessary */
 	if (pkt_sent)
 		notify_guest(dev, vq);
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1785695..87d09fa 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -152,10 +152,14 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 free_device(struct virtio_net *dev)
 {
+	struct vhost_virtqueue *vq;
 	uint32_t i;
 
-	for (i = 0; i < dev->virt_qp_nb; i++)
-		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+	for (i = 0; i < dev->virt_qp_nb; i++) {
+		vq = dev->virtqueue[i * VIRTIO_QNUM];
+		rte_free(vq->shadow_used_ring);
+		rte_free(vq);
+	}
 
 	rte_free(dev);
 }
@@ -418,13 +422,18 @@ int
 vhost_set_vring_num(int vid, struct vhost_vring_state *state)
 {
 	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
 
 	dev = get_device(vid);
 	if (dev == NULL)
 		return -1;
 
 	/* State->index refers to the queue index. The txq is 1, rxq is 0. */
-	dev->virtqueue[state->index]->size = state->num;
+	vq = dev->virtqueue[state->index];
+	vq->size = state->num;
+	vq->shadow_used_ring = rte_malloc("",
+			vq->size * sizeof(struct vring_used_elem),
+			RTE_CACHE_LINE_SIZE);
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v2 6/6] vhost: optimize cache access
  2016-08-18  6:33 ` [PATCH v2 0/6] vhost: optimize enqueue Zhihong Wang
                     ` (4 preceding siblings ...)
  2016-08-18  6:33   ` [PATCH v2 5/6] vhost: batch update used ring Zhihong Wang
@ 2016-08-18  6:33   ` Zhihong Wang
  5 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-18  6:33 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, Zhihong Wang

This patch reorders the code to delay the virtio header write, which optimizes
cache access efficiency for cases where the mrg_rxbuf feature is turned on and
reduces CPU pipeline stall cycles significantly.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 60d63d3..15f7f9c 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -154,6 +154,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t mbuf_len = 0;
 	uint32_t mbuf_len_left = 0;
 	uint32_t copy_len = 0;
+	uint32_t copy_virtio_hdr = 0;
 	uint32_t extra_buffers = 0;
 
 	/* start with the first mbuf of the packet */
@@ -168,18 +169,17 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (unlikely(!desc_host_write_addr))
 		goto error;
 
-	/* handle virtio header */
+	/*
+	 * handle virtio header, the actual write operation
+	 * is delayed for cache optimization.
+	 */
 	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)
 		(uintptr_t)desc_host_write_addr;
-	memset((void *)(uintptr_t)&(virtio_hdr->hdr),
-			0, dev->vhost_hlen);
-	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+	copy_virtio_hdr = 1;
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	desc_write_offset = dev->vhost_hlen;
 	desc_chain_len = desc_write_offset;
 	desc_host_write_addr += desc_write_offset;
-	if (is_mrg_rxbuf)
-		virtio_hdr->num_buffers = 1;
 
 	/* start copy from mbuf to desc */
 	while (1) {
@@ -233,9 +233,18 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				goto rollback;
 		}
 
-		/* copy mbuf data */
+		/* copy virtio header and mbuf data */
 		copy_len = RTE_MIN(desc->len - desc_write_offset,
 				mbuf_len_left);
+		if (copy_virtio_hdr) {
+			copy_virtio_hdr = 0;
+			memset((void *)(uintptr_t)&(virtio_hdr->hdr),
+					0, dev->vhost_hlen);
+			virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+			if (is_mrg_rxbuf)
+				virtio_hdr->num_buffers = extra_buffers + 1;
+		}
+
 		rte_memcpy((void *)(uintptr_t)desc_host_write_addr,
 				rte_pktmbuf_mtod_offset(mbuf, void *,
 					mbuf_len - mbuf_len_left),
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* Re: [PATCH] optimize vhost enqueue
  2016-08-17  9:51           ` Yuanhan Liu
@ 2016-08-18 13:44             ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-18 13:44 UTC (permalink / raw)
  To: Yuanhan Liu, Maxime Coquelin; +Cc: dev

Thanks Maxime and Yuanhan for your review and suggestions!
Please help review v2 of this patch.


> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Wednesday, August 17, 2016 5:51 PM
> To: Maxime Coquelin <maxime.coquelin@redhat.com>
> Cc: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
> 
> On Wed, Aug 17, 2016 at 11:17:46AM +0200, Maxime Coquelin wrote:
> > >>>This is something I've thought about while writing the code, the reason I
> > >>>keep it as one function body is that:
> > >>>
> > >>> 1. This function is very performance sensitive, and we need full control of
> > >>>    code ordering (You can compare with the current performance with
> the
> > >>>    mrg_rxbuf feature turned on to see the difference).
> > >>
> > >>Will inline functions help?
> > >
> > >
> > >Optimization in this patch actually reorganizes the code from its logic,
> > >so it's not suitable for making separated functions.
> > >
> > >I'll explain this in v2.
> >
> > I agree with Yuanhan.
> > Inline functions should not break the optimizations.
> > IMHO, this is mandatory for the patch to be accepted.
> 
> Yes.
> 
> > It seems you are not the only one facing the issue:
> > https://github.com/YanVugenfirer/kvm-guest-drivers-windows/issues/70
> >
> > So a dedicated fix is really important.
> 
> Yes.
> 
> >
> > >This patch doesn't try to fix this issue, it rewrites the logic totally,
> > >and somehow fixes this issue.
> > >
> > >Do you think integrating this whole patch into the stable branch will work?
> > >Personally I think it makes more sense.
> >
> > No.
> > We don't even know why/how it fixes the Windows issue, which would be
> > the first thing to understand before integrating a fix in stable branch.
> 
> Yes.
> 
> >
> > And the stable branch is not meant for integrating such big reworks,
> > it is only meant to fix bugs.
> 
> Yes.
> 
> > The risk of regressions have to be avoided as much as possible.
> 
> Yes.
> 
> 	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v2 2/6] vhost: remove obsolete
  2016-08-18  6:33   ` [PATCH v2 2/6] vhost: remove obsolete Zhihong Wang
@ 2016-08-19  2:32     ` Yuanhan Liu
  2016-08-19  7:08       ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-08-19  2:32 UTC (permalink / raw)
  To: Zhihong Wang; +Cc: dev, maxime.coquelin

On Thu, Aug 18, 2016 at 02:33:07AM -0400, Zhihong Wang wrote:
> This patch removes obsolete functions.

Splitting patches doesn't work this way: the removal should be in the first
patch. Otherwise, the build breaks in the first patch, as some functions are
defined but not used.
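
For reference, a minimal hypothetical reproducer of that failure (the file
name and build command below are illustrative, not part of the series): with
warnings treated as errors, as the DPDK build does, a static function that
loses its last caller stops the build.

/* unused_func.c -- hypothetical reproducer, not part of this series.
 * Compile with: gcc -Wall -Werror -c unused_func.c
 * GCC rejects it with a diagnostic along the lines of:
 *   error: 'copy_mbuf_to_desc' defined but not used [-Werror=unused-function]
 */
static int
copy_mbuf_to_desc(void)		/* still defined, but no caller remains */
{
	return 0;
}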

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v2 1/6] vhost: rewrite enqueue
  2016-08-18  6:33   ` [PATCH v2 1/6] vhost: rewrite enqueue Zhihong Wang
@ 2016-08-19  2:39     ` Yuanhan Liu
  2016-08-19  7:07       ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-08-19  2:39 UTC (permalink / raw)
  To: Zhihong Wang; +Cc: dev, maxime.coquelin

On Thu, Aug 18, 2016 at 02:33:06AM -0400, Zhihong Wang wrote:
> This patch implements the vhost logic from scratch into a single function
> designed for high performance and better maintainability.
> 
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---
>  lib/librte_vhost/vhost_rxtx.c | 212 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 205 insertions(+), 7 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 08a73fd..8e6d782 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
>  	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
>  }
>  
> -static void
> +static inline void __attribute__((always_inline))
>  virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  {
>  	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
> @@ -533,19 +533,217 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
>  	return pkt_idx;
>  }
>  
> +static inline uint32_t __attribute__((always_inline))
> +loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
> +{
> +	if (pkt_left == 0 || avail_idx == vq->last_used_idx)
> +		return 1;
> +
> +	return 0;
> +}

Hmmm, I don't see any benefit from making such a simple check into a
function.
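
For illustration, the check could simply be folded into the caller's loop
condition; a rough sketch using the same variables as the loop in this patch
(not a tested change):

	/* rough sketch: termination check folded into the loop condition
	 * instead of a separate loop_check() helper */
	while (pkt_left > 0 && avail_idx != vq->last_used_idx) {
		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
					is_mrg_rxbuf))
			break;

		pkt_idx++;
		pkt_sent++;
		pkt_left--;
	}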

> +static inline uint32_t __attribute__((always_inline))
> +enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		uint16_t avail_idx, struct rte_mbuf *mbuf,
> +		uint32_t is_mrg_rxbuf)
> +{
> +	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
> +	struct vring_desc *desc;
> +	uint64_t desc_host_write_addr = 0;
> +	uint32_t desc_chain_head = 0;
> +	uint32_t desc_chain_len = 0;
> +	uint32_t desc_current = 0;
> +	uint32_t desc_write_offset = 0;
> +	uint32_t mbuf_len = 0;
> +	uint32_t mbuf_len_left = 0;
> +	uint32_t copy_len = 0;

The dequeue function uses variable names like desc_addr, desc_avail,
desc_offset, mbuf_avail, ...; I see no reason to use something different here.
This breaks code consistency. Besides that, a variable name like
desc_host_write_addr is redundant; desc_addr is much cleaner.

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* [PATCH v3 0/5] vhost: optimize enqueue
  2016-08-16  3:50 [PATCH] optimize vhost enqueue Zhihong Wang
  2016-08-16 13:59 ` Maxime Coquelin
  2016-08-18  6:33 ` [PATCH v2 0/6] vhost: optimize enqueue Zhihong Wang
@ 2016-08-19  5:43 ` Zhihong Wang
  2016-08-19  5:43   ` [PATCH v3 1/5] vhost: rewrite enqueue Zhihong Wang
                     ` (5 more replies)
  2016-08-30  3:35 ` [PATCH v4 0/6] " Zhihong Wang
                   ` (2 subsequent siblings)
  5 siblings, 6 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-19  5:43 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu

This patch set optimizes the vhost enqueue function.

It rewrites the vhost enqueue logic from scratch as a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

 *  For fast frontends (e.g. the DPDK virtio pmd), higher performance (maximum
    throughput) can be achieved.

 *  For slow frontends (e.g. kernel virtio-net), better scalability can be
    achieved: each vhost core can support more connections, since it takes
    fewer cycles to handle each frontend.

The main optimization techniques are:

 1. Reorder code to reduce CPU pipeline stall cycles.

 2. Batch update the used ring for better efficiency.

 3. Prefetch descriptor to hide cache latency.

 4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.

In the existing code there are 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Also, having 2 separate functions increases
maintenance effort.

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 3. Rewrite enqueue and delete the obsolete functions in the same patch.

---
Changes in v2:

 1. Split the big function into several small ones.

 2. Use multiple patches to explain each optimization.

 3. Add comments.

Zhihong Wang (5):
  vhost: rewrite enqueue
  vhost: remove useless volatile
  vhost: add desc prefetch
  vhost: batch update used ring
  vhost: optimize cache access

 lib/librte_vhost/vhost-net.h  |   6 +-
 lib/librte_vhost/vhost_rxtx.c | 573 +++++++++++++++---------------------------
 lib/librte_vhost/virtio-net.c |  15 +-
 3 files changed, 220 insertions(+), 374 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 141+ messages in thread

* [PATCH v3 1/5] vhost: rewrite enqueue
  2016-08-19  5:43 ` [PATCH v3 0/5] vhost: optimize enqueue Zhihong Wang
@ 2016-08-19  5:43   ` Zhihong Wang
  2016-08-22  9:35     ` Maxime Coquelin
  2016-08-19  5:43   ` [PATCH v3 2/5] vhost: remove useless volatile Zhihong Wang
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 141+ messages in thread
From: Zhihong Wang @ 2016-08-19  5:43 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, Zhihong Wang

This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

---
Changes in v3:

 1. Rewrite enqueue and delete the obsolete functions in the same patch.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 537 +++++++++++++-----------------------------
 1 file changed, 160 insertions(+), 377 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..b09a9c3 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
 	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }
 
-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
 	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -125,427 +125,210 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 	}
 }
 
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-		    struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline uint32_t __attribute__((always_inline))
+loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
 {
-	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-	else
-		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+	if (pkt_left == 0 || avail_idx == vq->last_used_idx)
+		return 1;
+
+	return 0;
 }
 
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx)
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint16_t avail_idx, struct rte_mbuf *mbuf,
+		uint32_t is_mrg_rxbuf)
 {
-	uint32_t desc_avail, desc_offset;
-	uint32_t mbuf_avail, mbuf_offset;
-	uint32_t cpy_len;
+	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
 	struct vring_desc *desc;
-	uint64_t desc_addr;
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
-	desc = &vq->desc[desc_idx];
+	uint64_t desc_addr = 0;
+	uint32_t desc_chain_head = 0;
+	uint32_t desc_chain_len = 0;
+	uint32_t desc_current = 0;
+	uint32_t desc_offset = 0;
+	uint32_t mbuf_len = 0;
+	uint32_t mbuf_avail = 0;
+	uint32_t copy_len = 0;
+	uint32_t extra_buffers = 0;
+	uint32_t used_idx_round = 0;
+
+	/* start with the first mbuf of the packet */
+	mbuf_len = rte_pktmbuf_data_len(mbuf);
+	mbuf_avail = mbuf_len;
+
+	/* get the current desc */
+	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+	desc_chain_head = desc_current;
+	desc = &vq->desc[desc_current];
 	desc_addr = gpa_to_vva(dev, desc->addr);
-	/*
-	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-	 * otherwise stores offset on the stack instead of in a register.
-	 */
-	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-		return -1;
-
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
+	if (unlikely(!desc_addr))
+		goto error;
 
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+	/* handle virtio header */
+	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
 	desc_offset = dev->vhost_hlen;
-	desc_avail  = desc->len - dev->vhost_hlen;
+	desc_chain_len = desc_offset;
+	desc_addr += desc_offset;
+	if (is_mrg_rxbuf)
+		virtio_hdr->num_buffers = 1;
 
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current mbuf, fetch next */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
+	/* start copy from mbuf to desc */
+	while (1) {
+		/* get the next mbuf if the current done */
+		if (!mbuf_avail) {
+			if (mbuf->next) {
+				mbuf = mbuf->next;
+				mbuf_len = rte_pktmbuf_data_len(mbuf);
+				mbuf_avail = mbuf_len;
+			} else
+				break;
 		}
 
-		/* done with current desc buf, fetch next */
-		if (desc_avail == 0) {
-			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-				/* Room in vring buffer is not enough */
-				return -1;
-			}
-			if (unlikely(desc->next >= vq->size))
-				return -1;
-
-			desc = &vq->desc[desc->next];
-			desc_addr = gpa_to_vva(dev, desc->addr);
-			if (unlikely(!desc_addr))
-				return -1;
-
-			desc_offset = 0;
-			desc_avail  = desc->len;
+		/* get the next desc if the current done */
+		if (desc->len <= desc_offset) {
+			if (desc->flags & VRING_DESC_F_NEXT) {
+				/* go on with the current desc chain */
+				desc_offset = 0;
+				desc_current = desc->next;
+				desc = &vq->desc[desc_current];
+				desc_addr = gpa_to_vva(dev, desc->addr);
+				if (unlikely(!desc_addr))
+					goto rollback;
+			} else if (is_mrg_rxbuf) {
+				/* start with the next desc chain */
+				used_idx_round = vq->last_used_idx
+					& (vq->size - 1);
+				vq->used->ring[used_idx_round].id =
+					desc_chain_head;
+				vq->used->ring[used_idx_round].len =
+					desc_chain_len;
+				vhost_log_used_vring(dev, vq,
+					offsetof(struct vring_used,
+						ring[used_idx_round]),
+					sizeof(vq->used->ring[
+						used_idx_round]));
+				vq->last_used_idx++;
+				extra_buffers++;
+				virtio_hdr->num_buffers++;
+				if (avail_idx == vq->last_used_idx)
+					goto rollback;
+
+				desc_current =
+					vq->avail->ring[(vq->last_used_idx) &
+					(vq->size - 1)];
+				desc_chain_head = desc_current;
+				desc = &vq->desc[desc_current];
+				desc_addr = gpa_to_vva(dev, desc->addr);
+				if (unlikely(!desc_addr))
+					goto rollback;
+
+				desc_chain_len = 0;
+				desc_offset = 0;
+			} else
+				goto rollback;
 		}
 
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			     cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-	}
-
-	return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
-	      struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint16_t avail_idx, free_entries, start_idx;
-	uint16_t desc_indexes[MAX_PKT_BURST];
-	uint16_t used_idx;
-	uint32_t i;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
-		return 0;
+		/* copy mbuf data */
+		copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+		rte_memcpy((void *)(uintptr_t)desc_addr,
+				rte_pktmbuf_mtod_offset(mbuf, void *,
+					mbuf_len - mbuf_avail),
+				copy_len);
+		vhost_log_write(dev, desc->addr + desc_offset, copy_len);
+		mbuf_avail -= copy_len;
+		desc_offset += copy_len;
+		desc_addr += copy_len;
+		desc_chain_len += copy_len;
 	}
 
-	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
-		return 0;
-
-	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-	start_idx = vq->last_used_idx;
-	free_entries = avail_idx - start_idx;
-	count = RTE_MIN(count, free_entries);
-	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
-	if (count == 0)
-		return 0;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
-		dev->vid, start_idx, start_idx + count);
-
-	/* Retrieve all of the desc indexes first to avoid caching issues. */
-	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
-	for (i = 0; i < count; i++) {
-		used_idx = (start_idx + i) & (vq->size - 1);
-		desc_indexes[i] = vq->avail->ring[used_idx];
-		vq->used->ring[used_idx].id = desc_indexes[i];
-		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
-					       dev->vhost_hlen;
-		vhost_log_used_vring(dev, vq,
-			offsetof(struct vring_used, ring[used_idx]),
-			sizeof(vq->used->ring[used_idx]));
-	}
+	used_idx_round = vq->last_used_idx & (vq->size - 1);
+	vq->used->ring[used_idx_round].id = desc_chain_head;
+	vq->used->ring[used_idx_round].len = desc_chain_len;
+	vhost_log_used_vring(dev, vq,
+		offsetof(struct vring_used, ring[used_idx_round]),
+		sizeof(vq->used->ring[used_idx_round]));
+	vq->last_used_idx++;
 
-	rte_prefetch0(&vq->desc[desc_indexes[0]]);
-	for (i = 0; i < count; i++) {
-		uint16_t desc_idx = desc_indexes[i];
-		int err;
+	return 0;
 
-		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
-		if (unlikely(err)) {
-			used_idx = (start_idx + i) & (vq->size - 1);
-			vq->used->ring[used_idx].len = dev->vhost_hlen;
-			vhost_log_used_vring(dev, vq,
-				offsetof(struct vring_used, ring[used_idx]),
-				sizeof(vq->used->ring[used_idx]));
-		}
+rollback:
+	/* rollback on any error if last_used_idx update on-the-fly */
+	if (is_mrg_rxbuf)
+		vq->last_used_idx -= extra_buffers;
 
-		if (i + 1 < count)
-			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
-	}
+error:
+	return 1;
+}
 
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
 	rte_smp_wmb();
-
-	*(volatile uint16_t *)&vq->used->idx += count;
-	vq->last_used_idx += count;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, idx),
-		sizeof(vq->used->idx));
-
-	/* flush used->idx update before we read avail->flags. */
+	vq->used->idx = vq->last_used_idx;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+			sizeof(vq->used->idx));
 	rte_mb();
-
-	/* Kick the guest if necessary. */
 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 			&& (vq->callfd >= 0))
 		eventfd_write(vq->callfd, (eventfd_t)1);
-	return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-	     uint32_t *allocated, uint32_t *vec_idx,
-	     struct buf_vector *buf_vec)
-{
-	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
-	uint32_t vec_id = *vec_idx;
-	uint32_t len    = *allocated;
-
-	while (1) {
-		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
-			return -1;
-
-		len += vq->desc[idx].len;
-		buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
-		buf_vec[vec_id].buf_len  = vq->desc[idx].len;
-		buf_vec[vec_id].desc_idx = idx;
-		vec_id++;
-
-		if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
-			break;
-
-		idx = vq->desc[idx].next;
-	}
-
-	*allocated = len;
-	*vec_idx   = vec_id;
-
-	return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-			    uint16_t *end, struct buf_vector *buf_vec)
-{
-	uint16_t cur_idx;
-	uint16_t avail_idx;
-	uint32_t allocated = 0;
-	uint32_t vec_idx = 0;
-	uint16_t tries = 0;
-
-	cur_idx  = vq->last_used_idx;
-
-	while (1) {
-		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-		if (unlikely(cur_idx == avail_idx))
-			return -1;
-
-		if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
-					  &vec_idx, buf_vec) < 0))
-			return -1;
-
-		cur_idx++;
-		tries++;
-
-		if (allocated >= size)
-			break;
-
-		/*
-		 * if we tried all available ring items, and still
-		 * can't get enough buf, it means something abnormal
-		 * happened.
-		 */
-		if (unlikely(tries >= vq->size))
-			return -1;
-	}
-
-	*end = cur_idx;
-	return 0;
 }
 
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			    uint16_t end_idx, struct rte_mbuf *m,
-			    struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint16_t count)
 {
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-	uint32_t vec_idx = 0;
-	uint16_t start_idx = vq->last_used_idx;
-	uint16_t cur_idx = start_idx;
-	uint64_t desc_addr;
-	uint32_t mbuf_offset, mbuf_avail;
-	uint32_t desc_offset, desc_avail;
-	uint32_t cpy_len;
-	uint16_t desc_idx, used_idx;
-
-	if (unlikely(m == NULL))
+	struct vhost_virtqueue *vq;
+	struct virtio_net *dev;
+	uint32_t pkt_idx = 0;
+	uint32_t pkt_left = 0;
+	uint32_t pkt_sent = 0;
+	uint32_t is_mrg_rxbuf = 0;
+	uint16_t avail_idx = 0;
+
+	/* precheck */
+	if (unlikely(count == 0))
 		return 0;
 
-	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
-		dev->vid, cur_idx, end_idx);
+	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
 
-	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+	dev = get_device(vid);
+	if (unlikely(!dev))
 		return 0;
 
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-	virtio_hdr.num_buffers = end_idx - start_idx;
-	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-		dev->vid, virtio_hdr.num_buffers);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-	desc_offset = dev->vhost_hlen;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current desc buf, get the next one */
-		if (desc_avail == 0) {
-			desc_idx = buf_vec[vec_idx].desc_idx;
-
-			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
-				/* Update used ring with desc information */
-				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id  = desc_idx;
-				vq->used->ring[used_idx].len = desc_offset;
-				vhost_log_used_vring(dev, vq,
-					offsetof(struct vring_used,
-						 ring[used_idx]),
-					sizeof(vq->used->ring[used_idx]));
-			}
-
-			vec_idx++;
-			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-			if (unlikely(!desc_addr))
-				return 0;
-
-			/* Prefetch buffer address. */
-			rte_prefetch0((void *)(uintptr_t)desc_addr);
-			desc_offset = 0;
-			desc_avail  = buf_vec[vec_idx].buf_len;
-		}
-
-		/* done with current mbuf, get the next one */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
-		}
-
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
-			cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-	}
-
-	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-	vq->used->ring[used_idx].len = desc_offset;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, ring[used_idx]),
-		sizeof(vq->used->ring[used_idx]));
-
-	return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint32_t pkt_idx = 0, nr_used = 0;
-	uint16_t end;
-	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
 		return 0;
-	}
 
 	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
+	if (unlikely(!vq->enabled))
 		return 0;
 
-	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
-	if (count == 0)
-		return 0;
+	if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+		is_mrg_rxbuf = 1;
 
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
-		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
-							 &end, buf_vec) < 0)) {
-			LOG_DEBUG(VHOST_DATA,
-				"(%d) failed to get enough desc from vring\n",
-				dev->vid);
+	/* start enqueuing packets 1 by 1 */
+	pkt_idx = 0;
+	pkt_left = count;
+	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+	while (1) {
+		if (loop_check(vq, avail_idx, pkt_left))
 			break;
-		}
-
-		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
-						      pkts[pkt_idx], buf_vec);
-		rte_smp_wmb();
 
-		*(volatile uint16_t *)&vq->used->idx += nr_used;
-		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-			sizeof(vq->used->idx));
-		vq->last_used_idx += nr_used;
-	}
-
-	if (likely(pkt_idx)) {
-		/* flush used->idx update before we read avail->flags. */
-		rte_mb();
+		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+					is_mrg_rxbuf))
+			break;
 
-		/* Kick the guest if necessary. */
-		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-				&& (vq->callfd >= 0))
-			eventfd_write(vq->callfd, (eventfd_t)1);
+		pkt_idx++;
+		pkt_sent++;
+		pkt_left--;
 	}
 
-	return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint16_t count)
-{
-	struct virtio_net *dev = get_device(vid);
-
-	if (!dev)
-		return 0;
+	/* update used idx and kick the guest if necessary */
+	if (pkt_sent)
+		notify_guest(dev, vq);
 
-	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
-		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
-	else
-		return virtio_dev_rx(dev, queue_id, pkts, count);
+	return pkt_sent;
 }
 
 static void
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v3 2/5] vhost: remove useless volatile
  2016-08-19  5:43 ` [PATCH v3 0/5] vhost: optimize enqueue Zhihong Wang
  2016-08-19  5:43   ` [PATCH v3 1/5] vhost: rewrite enqueue Zhihong Wang
@ 2016-08-19  5:43   ` Zhihong Wang
  2016-08-19  5:43   ` [PATCH v3 3/5] vhost: add desc prefetch Zhihong Wang
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-19  5:43 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, Zhihong Wang

This patch removes useless volatile attribute to allow compiler
optimization.
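
As a standalone illustration (not part of this patch), with simplified
stand-in types: without the volatile, the compiler is free to keep the
field in a register instead of reloading and storing it on every access.

#include <stdint.h>

struct vq {
	/* volatile uint16_t last_used_idx; -- forces a load/store per use */
	uint16_t last_used_idx;	/* plain field: may live in a register */
};

/*
 * With the volatile gone, the compiler can keep last_used_idx in a
 * register for the whole loop and write it back once at the end.
 */
void advance(struct vq *vq, int n)
{
	int i;

	for (i = 0; i < n; i++)
		vq->last_used_idx++;
}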

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost-net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..51fdf3d 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
 	uint32_t		size;
 
 	/* Last index used on the available ring */
-	volatile uint16_t	last_used_idx;
+	uint16_t		last_used_idx;
 #define VIRTIO_INVALID_EVENTFD		(-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD	(-2)
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v3 3/5] vhost: add desc prefetch
  2016-08-19  5:43 ` [PATCH v3 0/5] vhost: optimize enqueue Zhihong Wang
  2016-08-19  5:43   ` [PATCH v3 1/5] vhost: rewrite enqueue Zhihong Wang
  2016-08-19  5:43   ` [PATCH v3 2/5] vhost: remove useless volatile Zhihong Wang
@ 2016-08-19  5:43   ` Zhihong Wang
  2016-08-19  5:43   ` [PATCH v3 4/5] vhost: batch update used ring Zhihong Wang
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-19  5:43 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, Zhihong Wang

This patch adds descriptor prefetch to hide cache access latency.
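
A rough standalone sketch of the idea, with simplified stand-in types and
the generic __builtin_prefetch builtin standing in for rte_prefetch0:
while descriptor i is being processed, descriptor i + 1 is prefetched so
its cache line is already in flight when it is needed.

#include <stdint.h>

struct desc {
	uint64_t addr;
	uint32_t len;
	uint16_t flags;
	uint16_t next;
};

/* walk an array of descriptors, prefetching one entry ahead */
uint64_t walk_descs(const struct desc *d, int n)
{
	uint64_t total = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (i + 1 < n)
			__builtin_prefetch(&d[i + 1]);
		total += d[i].len;
	}
	return total;
}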

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index b09a9c3..7523b2d 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -131,6 +131,11 @@ loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
 	if (pkt_left == 0 || avail_idx == vq->last_used_idx)
 		return 1;
 
+	/* prefetch the next desc */
+	if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+		rte_prefetch0(&vq->desc[vq->avail->ring[
+				(vq->last_used_idx + 1) & (vq->size - 1)]]);
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v3 4/5] vhost: batch update used ring
  2016-08-19  5:43 ` [PATCH v3 0/5] vhost: optimize enqueue Zhihong Wang
                     ` (2 preceding siblings ...)
  2016-08-19  5:43   ` [PATCH v3 3/5] vhost: add desc prefetch Zhihong Wang
@ 2016-08-19  5:43   ` Zhihong Wang
  2016-08-25  3:48     ` Yuanhan Liu
  2016-08-19  5:43   ` [PATCH v3 5/5] vhost: optimize cache access Zhihong Wang
  2016-08-22  8:11   ` [PATCH v3 0/5] vhost: optimize enqueue Maxime Coquelin
  5 siblings, 1 reply; 141+ messages in thread
From: Zhihong Wang @ 2016-08-19  5:43 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, Zhihong Wang

This patch enables batch update of the used ring for better efficiency.
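
The core idea is to collect used ring entries in a private shadow array
and flush them with at most two memcpy() calls, two only when the ring
wraps. A standalone sketch of the flush, with simplified stand-in types
(not the actual vhost code):

#include <stdint.h>
#include <string.h>

struct elem {
	uint32_t id;
	uint32_t len;
};

/*
 * Copy 'count' shadow entries into a ring of 'size' slots starting at
 * slot 'start', splitting the copy in two when it wraps past the end.
 */
void flush_shadow(struct elem *ring, uint32_t size, uint32_t start,
		const struct elem *shadow, uint32_t count)
{
	if (start + count <= size) {
		memcpy(&ring[start], shadow, count * sizeof(*shadow));
	} else {
		uint32_t part_1 = size - start;
		uint32_t part_2 = count - part_1;

		memcpy(&ring[start], shadow, part_1 * sizeof(*shadow));
		memcpy(&ring[0], &shadow[part_1], part_2 * sizeof(*shadow));
	}
}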

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost-net.h  |  4 +++
 lib/librte_vhost/vhost_rxtx.c | 68 +++++++++++++++++++++++++++++++++----------
 lib/librte_vhost/virtio-net.c | 15 ++++++++--
 3 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 51fdf3d..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {
 
 	/* Physical address of used ring, for logging */
 	uint64_t		log_guest_addr;
+
+	/* Shadow used ring for performance */
+	struct vring_used_elem	*shadow_used_ring;
+	uint32_t		shadow_used_idx;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 7523b2d..c4abaf1 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -155,7 +155,6 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t mbuf_avail = 0;
 	uint32_t copy_len = 0;
 	uint32_t extra_buffers = 0;
-	uint32_t used_idx_round = 0;
 
 	/* start with the first mbuf of the packet */
 	mbuf_len = rte_pktmbuf_data_len(mbuf);
@@ -203,17 +202,11 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 					goto rollback;
 			} else if (is_mrg_rxbuf) {
 				/* start with the next desc chain */
-				used_idx_round = vq->last_used_idx
-					& (vq->size - 1);
-				vq->used->ring[used_idx_round].id =
+				vq->shadow_used_ring[vq->shadow_used_idx].id =
 					desc_chain_head;
-				vq->used->ring[used_idx_round].len =
+				vq->shadow_used_ring[vq->shadow_used_idx].len =
 					desc_chain_len;
-				vhost_log_used_vring(dev, vq,
-					offsetof(struct vring_used,
-						ring[used_idx_round]),
-					sizeof(vq->used->ring[
-						used_idx_round]));
+				vq->shadow_used_idx++;
 				vq->last_used_idx++;
 				extra_buffers++;
 				virtio_hdr->num_buffers++;
@@ -248,12 +241,9 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		desc_chain_len += copy_len;
 	}
 
-	used_idx_round = vq->last_used_idx & (vq->size - 1);
-	vq->used->ring[used_idx_round].id = desc_chain_head;
-	vq->used->ring[used_idx_round].len = desc_chain_len;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, ring[used_idx_round]),
-		sizeof(vq->used->ring[used_idx_round]));
+	vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+	vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+	vq->shadow_used_idx++;
 	vq->last_used_idx++;
 
 	return 0;
@@ -268,6 +258,45 @@ error:
 }
 
 static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t used_idx_start)
+{
+	if (used_idx_start + vq->shadow_used_idx < vq->size) {
+		rte_memcpy(&vq->used->ring[used_idx_start],
+				&vq->shadow_used_ring[0],
+				vq->shadow_used_idx *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[used_idx_start]),
+				vq->shadow_used_idx *
+				sizeof(struct vring_used_elem));
+	} else {
+		uint32_t part_1 = vq->size - used_idx_start;
+		uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+		rte_memcpy(&vq->used->ring[used_idx_start],
+				&vq->shadow_used_ring[0],
+				part_1 *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[used_idx_start]),
+				part_1 *
+				sizeof(struct vring_used_elem));
+		rte_memcpy(&vq->used->ring[0],
+				&vq->shadow_used_ring[part_1],
+				part_2 *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[0]),
+				part_2 *
+				sizeof(struct vring_used_elem));
+	}
+}
+
+static inline void __attribute__((always_inline))
 notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
 	rte_smp_wmb();
@@ -286,6 +315,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 {
 	struct vhost_virtqueue *vq;
 	struct virtio_net *dev;
+	uint32_t used_idx_start = 0;
 	uint32_t pkt_idx = 0;
 	uint32_t pkt_left = 0;
 	uint32_t pkt_sent = 0;
@@ -315,6 +345,8 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 	/* start enqueuing packets 1 by 1 */
 	pkt_idx = 0;
 	pkt_left = count;
+	vq->shadow_used_idx = 0;
+	used_idx_start = vq->last_used_idx & (vq->size - 1);
 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
 	while (1) {
 		if (loop_check(vq, avail_idx, pkt_left))
@@ -329,6 +361,10 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 		pkt_left--;
 	}
 
+	/* batch update used ring for better performance */
+	if (likely(vq->shadow_used_idx > 0))
+		update_used_ring(dev, vq, used_idx_start);
+
 	/* update used idx and kick the guest if necessary */
 	if (pkt_sent)
 		notify_guest(dev, vq);
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1785695..87d09fa 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -152,10 +152,14 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 free_device(struct virtio_net *dev)
 {
+	struct vhost_virtqueue *vq;
 	uint32_t i;
 
-	for (i = 0; i < dev->virt_qp_nb; i++)
-		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+	for (i = 0; i < dev->virt_qp_nb; i++) {
+		vq = dev->virtqueue[i * VIRTIO_QNUM];
+		rte_free(vq->shadow_used_ring);
+		rte_free(vq);
+	}
 
 	rte_free(dev);
 }
@@ -418,13 +422,18 @@ int
 vhost_set_vring_num(int vid, struct vhost_vring_state *state)
 {
 	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
 
 	dev = get_device(vid);
 	if (dev == NULL)
 		return -1;
 
 	/* State->index refers to the queue index. The txq is 1, rxq is 0. */
-	dev->virtqueue[state->index]->size = state->num;
+	vq = dev->virtqueue[state->index];
+	vq->size = state->num;
+	vq->shadow_used_ring = rte_malloc("",
+			vq->size * sizeof(struct vring_used_elem),
+			RTE_CACHE_LINE_SIZE);
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v3 5/5] vhost: optimize cache access
  2016-08-19  5:43 ` [PATCH v3 0/5] vhost: optimize enqueue Zhihong Wang
                     ` (3 preceding siblings ...)
  2016-08-19  5:43   ` [PATCH v3 4/5] vhost: batch update used ring Zhihong Wang
@ 2016-08-19  5:43   ` Zhihong Wang
  2016-08-22  8:11   ` [PATCH v3 0/5] vhost: optimize enqueue Maxime Coquelin
  5 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-19  5:43 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, Zhihong Wang

This patch reorders the code to delay virtio header write to optimize cache
access efficiency for cases where the mrg_rxbuf feature is turned on. It
reduces CPU pipeline stall cycles significantly.
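
The pattern, shown as a standalone toy (simplified stand-in code, not the
actual vhost path): a flag remembers that the header still has to be
written, and the store is issued right before the first payload copy into
the same buffer, so both writes land on the destination cache line while
it is hot.

#include <stdint.h>
#include <string.h>

/* toy buffer fill: the 2-byte "header" store is deferred until just
 * before the first payload copy */
void fill_buf(uint8_t *dst, const uint8_t *src, uint32_t len,
		uint16_t num_buffers, uint32_t hdr_len)
{
	uint32_t copy_hdr = 1;	/* header write pending */
	uint32_t off = hdr_len;
	uint32_t done = 0;

	while (done < len) {
		uint32_t chunk = (len - done > 64) ? 64 : (len - done);

		if (copy_hdr) {
			copy_hdr = 0;
			memcpy(dst, &num_buffers, sizeof(num_buffers));
		}
		memcpy(dst + off, src + done, chunk);
		off += chunk;
		done += chunk;
	}
}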

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index c4abaf1..e3ba4e0 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -154,6 +154,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t mbuf_len = 0;
 	uint32_t mbuf_avail = 0;
 	uint32_t copy_len = 0;
+	uint32_t copy_virtio_hdr = 0;
 	uint32_t extra_buffers = 0;
 
 	/* start with the first mbuf of the packet */
@@ -168,15 +169,16 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (unlikely(!desc_addr))
 		goto error;
 
-	/* handle virtio header */
+	/*
+	 * handle virtio header, the actual write operation
+	 * is delayed for cache optimization.
+	 */
 	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
-	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+	copy_virtio_hdr = 1;
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	desc_offset = dev->vhost_hlen;
 	desc_chain_len = desc_offset;
 	desc_addr += desc_offset;
-	if (is_mrg_rxbuf)
-		virtio_hdr->num_buffers = 1;
 
 	/* start copy from mbuf to desc */
 	while (1) {
@@ -228,8 +230,15 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				goto rollback;
 		}
 
-		/* copy mbuf data */
+		/* copy virtio header and mbuf data */
 		copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+		if (copy_virtio_hdr) {
+			copy_virtio_hdr = 0;
+			virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+			if (is_mrg_rxbuf)
+				virtio_hdr->num_buffers = extra_buffers + 1;
+		}
+
 		rte_memcpy((void *)(uintptr_t)desc_addr,
 				rte_pktmbuf_mtod_offset(mbuf, void *,
 					mbuf_len - mbuf_avail),
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* Re: [PATCH v2 1/6] vhost: rewrite enqueue
  2016-08-19  2:39     ` Yuanhan Liu
@ 2016-08-19  7:07       ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-19  7:07 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: dev, maxime.coquelin



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Friday, August 19, 2016 10:39 AM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org; maxime.coquelin@redhat.com
> Subject: Re: [PATCH v2 1/6] vhost: rewrite enqueue
> 
> On Thu, Aug 18, 2016 at 02:33:06AM -0400, Zhihong Wang wrote:
> > This patch implements the vhost logic from scratch into a single function
> > designed for high performance and better maintainability.
> >
> > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > ---
> >  lib/librte_vhost/vhost_rxtx.c | 212
> ++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 205 insertions(+), 7 deletions(-)
> >
> > diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> > index 08a73fd..8e6d782 100644
> > --- a/lib/librte_vhost/vhost_rxtx.c
> > +++ b/lib/librte_vhost/vhost_rxtx.c
> > @@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t
> qp_nb)
> >  	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
> >  }
> >
> > -static void
> > +static inline void __attribute__((always_inline))
> >  virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr
> *net_hdr)
> >  {
> >  	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
> > @@ -533,19 +533,217 @@ virtio_dev_merge_rx(struct virtio_net *dev,
> uint16_t queue_id,
> >  	return pkt_idx;
> >  }
> >
> > +static inline uint32_t __attribute__((always_inline))
> > +loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
> > +{
> > +	if (pkt_left == 0 || avail_idx == vq->last_used_idx)
> > +		return 1;
> > +
> > +	return 0;
> > +}
> 
> Hmmm, I don't see any benefit from making such a simple check into a
> function.

It's a placeholder for the prefetch code that will be merged into this function later.

> 
> > +static inline uint32_t __attribute__((always_inline))
> > +enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > +		uint16_t avail_idx, struct rte_mbuf *mbuf,
> > +		uint32_t is_mrg_rxbuf)
> > +{
> > +	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
> > +	struct vring_desc *desc;
> > +	uint64_t desc_host_write_addr = 0;
> > +	uint32_t desc_chain_head = 0;
> > +	uint32_t desc_chain_len = 0;
> > +	uint32_t desc_current = 0;
> > +	uint32_t desc_write_offset = 0;
> > +	uint32_t mbuf_len = 0;
> > +	uint32_t mbuf_len_left = 0;
> > +	uint32_t copy_len = 0;
> 
> The dequeue function uses var like desc_addr, desc_avail, desc_offset,
> mbuf_avail, ..., I see no reason to use something different here. This
> breaks the code consistency. Besides that, var name like desc_host_write_addr
> looks redundant; desc_addr is much cleaner.

Okay.

> 
> 	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v2 2/6] vhost: remove obsolete
  2016-08-19  2:32     ` Yuanhan Liu
@ 2016-08-19  7:08       ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-19  7:08 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: dev, maxime.coquelin



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Friday, August 19, 2016 10:33 AM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org; maxime.coquelin@redhat.com
> Subject: Re: [PATCH v2 2/6] vhost: remove obsolete
> 
> On Thu, Aug 18, 2016 at 02:33:07AM -0400, Zhihong Wang wrote:
> > This patch removes obsolete functions.
> 
> Splitting patches doesn't work in this way: this should be in the first
> patch. Otherwise, build breaks in the first patch, as some functions are
> defined but not used.

Thanks. I'll send out v3 soon, also to fix a small glitch
seen when running on older platforms like SNB and IVB.

> 
> 	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-08-19  5:43 ` [PATCH v3 0/5] vhost: optimize enqueue Zhihong Wang
                     ` (4 preceding siblings ...)
  2016-08-19  5:43   ` [PATCH v3 5/5] vhost: optimize cache access Zhihong Wang
@ 2016-08-22  8:11   ` Maxime Coquelin
  2016-08-22 10:01     ` Maxime Coquelin
                       ` (2 more replies)
  5 siblings, 3 replies; 141+ messages in thread
From: Maxime Coquelin @ 2016-08-22  8:11 UTC (permalink / raw)
  To: Zhihong Wang, dev; +Cc: yuanhan.liu

Hi Zhihong,

On 08/19/2016 07:43 AM, Zhihong Wang wrote:
> This patch set optimizes the vhost enqueue function.
>
> It implements the vhost logic from scratch into a single function designed
> for high performance and good maintainability, and improves CPU efficiency
> significantly by optimizing cache access, which means:
>
>  *  For fast frontends (eg. DPDK virtio pmd), higher performance (maximum
>     throughput) can be achieved.
>
>  *  For slow frontends (eg. kernel virtio-net), better scalability can be
>     achieved, each vhost core can support more connections since it takes
>     less cycles to handle each single frontend.
>
> The main optimization techniques are:
>
>  1. Reorder code to reduce CPU pipeline stall cycles.
>
>  2. Batch update the used ring for better efficiency.
>
>  3. Prefetch descriptor to hide cache latency.
>
>  4. Remove useless volatile attribute to allow compiler optimization.

Thanks for these details, they are helpful for understanding where the perf
gain comes from.
I would suggest adding this information as comments in the code
where/if it makes sense. If it is more of a general comment, at least add it
in the commit message of the patch introducing it.
Indeed, adding it to the cover letter is fine, but the information is
lost as soon as the series is applied.

You don't mention any figures, so I set up a benchmark on my side to
evaluate your series. It indeed shows an interesting performance gain.

My setup consists of one host running a guest.
The guest generates as many 64-byte packets as possible using
pktgen-dpdk. The host forwards received packets back to the guest
using testpmd on the vhost PMD interface. Guest's vCPUs are pinned to
physical CPUs.

I tested it with and without your v1 patch, with and without the
rx-mergeable feature turned ON.
Results are the average of 8 runs of 60 seconds:

Rx-Mergeable ON : 7.72Mpps
Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
Rx-Mergeable OFF: 10.52Mpps
Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps

Regards,
Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 1/5] vhost: rewrite enqueue
  2016-08-19  5:43   ` [PATCH v3 1/5] vhost: rewrite enqueue Zhihong Wang
@ 2016-08-22  9:35     ` Maxime Coquelin
  2016-08-23  2:27       ` Wang, Zhihong
  2016-08-25  4:00       ` Yuanhan Liu
  0 siblings, 2 replies; 141+ messages in thread
From: Maxime Coquelin @ 2016-08-22  9:35 UTC (permalink / raw)
  To: Zhihong Wang, dev; +Cc: yuanhan.liu



On 08/19/2016 07:43 AM, Zhihong Wang wrote:
> This patch implements the vhost logic from scratch into a single function
> designed for high performance and better maintainability.
>
> ---
> Changes in v3:
>
>  1. Rewrite enqueue and delete the obsolete in the same patch.
>
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---
>  lib/librte_vhost/vhost_rxtx.c | 537 +++++++++++++-----------------------------
>  1 file changed, 160 insertions(+), 377 deletions(-)
>
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 08a73fd..b09a9c3 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
>  	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
>  }
>
> -static void
> +static inline void __attribute__((always_inline))
>  virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  {
>  	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
> @@ -125,427 +125,210 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  	}
>  }
>
> -static inline void
> -copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
> -		    struct virtio_net_hdr_mrg_rxbuf hdr)
> +static inline uint32_t __attribute__((always_inline))
> +loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
Creating a function just for doing this doesn't make much sense.
And the function name doesn't help.
I think you should just remove this function.

>  {
> -	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
> -		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
> -	else
> -		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
> +	if (pkt_left == 0 || avail_idx == vq->last_used_idx)
> +		return 1;
> +
> +	return 0;
>  }
>
> -static inline int __attribute__((always_inline))
> -copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -		  struct rte_mbuf *m, uint16_t desc_idx)
> +static inline uint32_t __attribute__((always_inline))
> +enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		uint16_t avail_idx, struct rte_mbuf *mbuf,
> +		uint32_t is_mrg_rxbuf)
>  {
> -	uint32_t desc_avail, desc_offset;
> -	uint32_t mbuf_avail, mbuf_offset;
> -	uint32_t cpy_len;
> +	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
>  	struct vring_desc *desc;
> -	uint64_t desc_addr;
> -	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
> -
> -	desc = &vq->desc[desc_idx];
> +	uint64_t desc_addr = 0;
> +	uint32_t desc_chain_head = 0;
> +	uint32_t desc_chain_len = 0;
> +	uint32_t desc_current = 0;
> +	uint32_t desc_offset = 0;
> +	uint32_t mbuf_len = 0;
> +	uint32_t mbuf_avail = 0;
> +	uint32_t copy_len = 0;
> +	uint32_t extra_buffers = 0;
> +	uint32_t used_idx_round = 0;
Most of these variables don't need to be initialized.

> +
> +	/* start with the first mbuf of the packet */
> +	mbuf_len = rte_pktmbuf_data_len(mbuf);
> +	mbuf_avail = mbuf_len;
> +
> +	/* get the current desc */
> +	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
> +	desc_chain_head = desc_current;
> +	desc = &vq->desc[desc_current];
>  	desc_addr = gpa_to_vva(dev, desc->addr);
> -	/*
> -	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
> -	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
> -	 * otherwise stores offset on the stack instead of in a register.
> -	 */
> -	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
> -		return -1;
> -
> -	rte_prefetch0((void *)(uintptr_t)desc_addr);
> +	if (unlikely(!desc_addr))
> +		goto error;
>
> -	virtio_enqueue_offload(m, &virtio_hdr.hdr);
> -	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
> +	/* handle virtio header */
> +	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
> +	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
Parentheses around virtio_hdr->hdr shouldn't be needed.
>  	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
> -	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
Looks like you removed the PRINT_PACKET calls.
Does it impact performance?
In any case, it should be mentioned in the commit message.

> -
>  	desc_offset = dev->vhost_hlen;
> -	desc_avail  = desc->len - dev->vhost_hlen;
> +	desc_chain_len = desc_offset;
> +	desc_addr += desc_offset;
> +	if (is_mrg_rxbuf)
> +		virtio_hdr->num_buffers = 1;
>
> -	mbuf_avail  = rte_pktmbuf_data_len(m);
> -	mbuf_offset = 0;
> -	while (mbuf_avail != 0 || m->next != NULL) {
> -		/* done with current mbuf, fetch next */
> -		if (mbuf_avail == 0) {
> -			m = m->next;
> -
> -			mbuf_offset = 0;
> -			mbuf_avail  = rte_pktmbuf_data_len(m);
> +	/* start copy from mbuf to desc */
> +	while (1) {
Please avoid while(1) when you can check for a real condition:
while (mbuf_avail || mbuf->next) ?

Compiler should optimize this properly, no?
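
Something of this shape, roughly (a standalone sketch with simplified
stand-in types, untested):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct seg {
	const uint8_t *data;
	uint32_t len;
	struct seg *next;
};

/* flatten a segment chain into dst with the explicit loop condition */
size_t copy_chain(uint8_t *dst, const struct seg *m)
{
	uint32_t avail = m->len;
	size_t copied = 0;

	while (avail || m->next) {
		if (!avail) {		/* current segment done, fetch next */
			m = m->next;
			avail = m->len;
			continue;
		}
		memcpy(dst + copied, m->data + (m->len - avail), avail);
		copied += avail;
		avail = 0;
	}
	return copied;
}
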
> +		/* get the next mbuf if the current done */
> +		if (!mbuf_avail) {
> +			if (mbuf->next) {
> +				mbuf = mbuf->next;
> +				mbuf_len = rte_pktmbuf_data_len(mbuf);
> +				mbuf_avail = mbuf_len;
> +			} else
> +				break;
>  		}
>
> -		/* done with current desc buf, fetch next */
> -		if (desc_avail == 0) {
> -			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
> -				/* Room in vring buffer is not enough */
> -				return -1;
> -			}
> -			if (unlikely(desc->next >= vq->size))
> -				return -1;
> -
> -			desc = &vq->desc[desc->next];
> -			desc_addr = gpa_to_vva(dev, desc->addr);
> -			if (unlikely(!desc_addr))
> -				return -1;
> -
> -			desc_offset = 0;
> -			desc_avail  = desc->len;
> +		/* get the next desc if the current done */
> +		if (desc->len <= desc_offset) {
> +			if (desc->flags & VRING_DESC_F_NEXT) {
> +				/* go on with the current desc chain */
> +				desc_offset = 0;
> +				desc_current = desc->next;
> +				desc = &vq->desc[desc_current];
> +				desc_addr = gpa_to_vva(dev, desc->addr);
> +				if (unlikely(!desc_addr))
> +					goto rollback;
You could go to "error" directly and decrement last_used_idx right under
the "error" label, since extra_buffers will be zero otherwise.

Also, except for the desc_current assignment, all the above code is common
with the mergeable case, so you should avoid the duplication.
> +			} else if (is_mrg_rxbuf) {
> +				/* start with the next desc chain */
> +				used_idx_round = vq->last_used_idx
> +					& (vq->size - 1);
> +				vq->used->ring[used_idx_round].id =
> +					desc_chain_head;
> +				vq->used->ring[used_idx_round].len =
> +					desc_chain_len;
> +				vhost_log_used_vring(dev, vq,
> +					offsetof(struct vring_used,
> +						ring[used_idx_round]),
> +					sizeof(vq->used->ring[
> +						used_idx_round]));
> +				vq->last_used_idx++;
> +				extra_buffers++;
> +				virtio_hdr->num_buffers++;
> +				if (avail_idx == vq->last_used_idx)
> +					goto rollback;
> +
> +				desc_current =
> +					vq->avail->ring[(vq->last_used_idx) &
> +					(vq->size - 1)];
> +				desc_chain_head = desc_current;
> +				desc = &vq->desc[desc_current];
> +				desc_addr = gpa_to_vva(dev, desc->addr);
> +				if (unlikely(!desc_addr))
> +					goto rollback;
> +
> +				desc_chain_len = 0;
> +				desc_offset = 0;
> +			} else
> +				goto rollback;
>  		}
>
> -		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
> -		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
> -			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
> -			cpy_len);
> -		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
> -		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
> -			     cpy_len, 0);
> -
> -		mbuf_avail  -= cpy_len;
> -		mbuf_offset += cpy_len;
> -		desc_avail  -= cpy_len;
> -		desc_offset += cpy_len;
> -	}
> -
> -	return 0;
> -}
> -
> -/**
> - * This function adds buffers to the virtio devices RX virtqueue. Buffers can
> - * be received from the physical port or from another virtio device. A packet
> - * count is returned to indicate the number of packets that are succesfully
> - * added to the RX queue. This function works when the mbuf is scattered, but
> - * it doesn't support the mergeable feature.
> - */
> -static inline uint32_t __attribute__((always_inline))
> -virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
> -	      struct rte_mbuf **pkts, uint32_t count)
> -{
> -	struct vhost_virtqueue *vq;
> -	uint16_t avail_idx, free_entries, start_idx;
> -	uint16_t desc_indexes[MAX_PKT_BURST];
> -	uint16_t used_idx;
> -	uint32_t i;
> -
> -	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
> -	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
> -		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
> -			dev->vid, __func__, queue_id);
> -		return 0;
> +		/* copy mbuf data */
> +		copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
> +		rte_memcpy((void *)(uintptr_t)desc_addr,
> +				rte_pktmbuf_mtod_offset(mbuf, void *,
> +					mbuf_len - mbuf_avail),
> +				copy_len);
> +		vhost_log_write(dev, desc->addr + desc_offset, copy_len);
> +		mbuf_avail -= copy_len;
> +		desc_offset += copy_len;
> +		desc_addr += copy_len;
> +		desc_chain_len += copy_len;
>  	}
>
> -	vq = dev->virtqueue[queue_id];
> -	if (unlikely(vq->enabled == 0))
> -		return 0;
> -
> -	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> -	start_idx = vq->last_used_idx;
> -	free_entries = avail_idx - start_idx;
> -	count = RTE_MIN(count, free_entries);
> -	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
> -	if (count == 0)
> -		return 0;
> -
> -	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
> -		dev->vid, start_idx, start_idx + count);
> -
> -	/* Retrieve all of the desc indexes first to avoid caching issues. */
> -	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
> -	for (i = 0; i < count; i++) {
> -		used_idx = (start_idx + i) & (vq->size - 1);
> -		desc_indexes[i] = vq->avail->ring[used_idx];
> -		vq->used->ring[used_idx].id = desc_indexes[i];
> -		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
> -					       dev->vhost_hlen;
> -		vhost_log_used_vring(dev, vq,
> -			offsetof(struct vring_used, ring[used_idx]),
> -			sizeof(vq->used->ring[used_idx]));
> -	}
> +	used_idx_round = vq->last_used_idx & (vq->size - 1);
> +	vq->used->ring[used_idx_round].id = desc_chain_head;
> +	vq->used->ring[used_idx_round].len = desc_chain_len;
> +	vhost_log_used_vring(dev, vq,
> +		offsetof(struct vring_used, ring[used_idx_round]),
> +		sizeof(vq->used->ring[used_idx_round]));
> +	vq->last_used_idx++;
All this code is duplicated from the rx_mergeable base.
I think a dedicated inline function would really make sense here.

>
> -	rte_prefetch0(&vq->desc[desc_indexes[0]]);
> -	for (i = 0; i < count; i++) {
> -		uint16_t desc_idx = desc_indexes[i];
> -		int err;
> +	return 0;
>
> -		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
> -		if (unlikely(err)) {
> -			used_idx = (start_idx + i) & (vq->size - 1);
> -			vq->used->ring[used_idx].len = dev->vhost_hlen;
> -			vhost_log_used_vring(dev, vq,
> -				offsetof(struct vring_used, ring[used_idx]),
> -				sizeof(vq->used->ring[used_idx]));
> -		}
> +rollback:
> +	/* rollback on any error if last_used_idx update on-the-fly */
> +	if (is_mrg_rxbuf)
If (!is_mrg_rxbuf), extra_buffers will be zero, so just remove the test,
and place the below line directly under error:, as explained above.
> +		vq->last_used_idx -= extra_buffers;
>
> -		if (i + 1 < count)
> -			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
> -	}
> +error:
> +	return 1;
> +}
>
> +static inline void __attribute__((always_inline))
> +notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
> +{
>  	rte_smp_wmb();
> -
> -	*(volatile uint16_t *)&vq->used->idx += count;
> -	vq->last_used_idx += count;
> -	vhost_log_used_vring(dev, vq,
> -		offsetof(struct vring_used, idx),
> -		sizeof(vq->used->idx));
> -
> -	/* flush used->idx update before we read avail->flags. */
> +	vq->used->idx = vq->last_used_idx;
> +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> +			sizeof(vq->used->idx));
>  	rte_mb();
> -
> -	/* Kick the guest if necessary. */
>  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>  			&& (vq->callfd >= 0))
>  		eventfd_write(vq->callfd, (eventfd_t)1);
> -	return count;
> -}
> -
> -static inline int
> -fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
> -	     uint32_t *allocated, uint32_t *vec_idx,
> -	     struct buf_vector *buf_vec)
> -{
> -	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
> -	uint32_t vec_id = *vec_idx;
> -	uint32_t len    = *allocated;
> -
> -	while (1) {
> -		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
> -			return -1;
> -
> -		len += vq->desc[idx].len;
> -		buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
> -		buf_vec[vec_id].buf_len  = vq->desc[idx].len;
> -		buf_vec[vec_id].desc_idx = idx;
> -		vec_id++;
> -
> -		if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
> -			break;
> -
> -		idx = vq->desc[idx].next;
> -	}
> -
> -	*allocated = len;
> -	*vec_idx   = vec_id;
> -
> -	return 0;
> -}
> -
> -/*
> - * Returns -1 on fail, 0 on success
> - */
> -static inline int
> -reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
> -			    uint16_t *end, struct buf_vector *buf_vec)
> -{
> -	uint16_t cur_idx;
> -	uint16_t avail_idx;
> -	uint32_t allocated = 0;
> -	uint32_t vec_idx = 0;
> -	uint16_t tries = 0;
> -
> -	cur_idx  = vq->last_used_idx;
> -
> -	while (1) {
> -		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> -		if (unlikely(cur_idx == avail_idx))
> -			return -1;
> -
> -		if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
> -					  &vec_idx, buf_vec) < 0))
> -			return -1;
> -
> -		cur_idx++;
> -		tries++;
> -
> -		if (allocated >= size)
> -			break;
> -
> -		/*
> -		 * if we tried all available ring items, and still
> -		 * can't get enough buf, it means something abnormal
> -		 * happened.
> -		 */
> -		if (unlikely(tries >= vq->size))
> -			return -1;
> -	}
> -
> -	*end = cur_idx;
> -	return 0;
>  }
>
> -static inline uint32_t __attribute__((always_inline))
> -copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -			    uint16_t end_idx, struct rte_mbuf *m,
> -			    struct buf_vector *buf_vec)
> +uint16_t
> +rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> +	struct rte_mbuf **pkts, uint16_t count)
>  {
> -	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
> -	uint32_t vec_idx = 0;
> -	uint16_t start_idx = vq->last_used_idx;
> -	uint16_t cur_idx = start_idx;
> -	uint64_t desc_addr;
> -	uint32_t mbuf_offset, mbuf_avail;
> -	uint32_t desc_offset, desc_avail;
> -	uint32_t cpy_len;
> -	uint16_t desc_idx, used_idx;
> -
> -	if (unlikely(m == NULL))
> +	struct vhost_virtqueue *vq;
> +	struct virtio_net *dev;
> +	uint32_t pkt_idx = 0;
> +	uint32_t pkt_left = 0;
> +	uint32_t pkt_sent = 0;
> +	uint32_t is_mrg_rxbuf = 0;
> +	uint16_t avail_idx = 0;
> +
> +	/* precheck */
Comment not very informative here.
> +	if (unlikely(count == 0))
>  		return 0;
>
> -	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
> -		dev->vid, cur_idx, end_idx);
> +	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
>
> -	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
> -	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
> +	dev = get_device(vid);
> +	if (unlikely(!dev))
>  		return 0;
>
> -	rte_prefetch0((void *)(uintptr_t)desc_addr);
> -
> -	virtio_hdr.num_buffers = end_idx - start_idx;
> -	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
> -		dev->vid, virtio_hdr.num_buffers);
> -
> -	virtio_enqueue_offload(m, &virtio_hdr.hdr);
> -	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
> -	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
> -	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
> -
> -	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
> -	desc_offset = dev->vhost_hlen;
> -
> -	mbuf_avail  = rte_pktmbuf_data_len(m);
> -	mbuf_offset = 0;
> -	while (mbuf_avail != 0 || m->next != NULL) {
> -		/* done with current desc buf, get the next one */
> -		if (desc_avail == 0) {
> -			desc_idx = buf_vec[vec_idx].desc_idx;
> -
> -			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
> -				/* Update used ring with desc information */
> -				used_idx = cur_idx++ & (vq->size - 1);
> -				vq->used->ring[used_idx].id  = desc_idx;
> -				vq->used->ring[used_idx].len = desc_offset;
> -				vhost_log_used_vring(dev, vq,
> -					offsetof(struct vring_used,
> -						 ring[used_idx]),
> -					sizeof(vq->used->ring[used_idx]));
> -			}
> -
> -			vec_idx++;
> -			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
> -			if (unlikely(!desc_addr))
> -				return 0;
> -
> -			/* Prefetch buffer address. */
> -			rte_prefetch0((void *)(uintptr_t)desc_addr);
> -			desc_offset = 0;
> -			desc_avail  = buf_vec[vec_idx].buf_len;
> -		}
> -
> -		/* done with current mbuf, get the next one */
> -		if (mbuf_avail == 0) {
> -			m = m->next;
> -
> -			mbuf_offset = 0;
> -			mbuf_avail  = rte_pktmbuf_data_len(m);
> -		}
> -
> -		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
> -		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
> -			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
> -			cpy_len);
> -		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
> -			cpy_len);
> -		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
> -			cpy_len, 0);
> -
> -		mbuf_avail  -= cpy_len;
> -		mbuf_offset += cpy_len;
> -		desc_avail  -= cpy_len;
> -		desc_offset += cpy_len;
> -	}
> -
> -	used_idx = cur_idx & (vq->size - 1);
> -	vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
> -	vq->used->ring[used_idx].len = desc_offset;
> -	vhost_log_used_vring(dev, vq,
> -		offsetof(struct vring_used, ring[used_idx]),
> -		sizeof(vq->used->ring[used_idx]));
> -
> -	return end_idx - start_idx;
> -}
> -
> -static inline uint32_t __attribute__((always_inline))
> -virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
> -	struct rte_mbuf **pkts, uint32_t count)
> -{
> -	struct vhost_virtqueue *vq;
> -	uint32_t pkt_idx = 0, nr_used = 0;
> -	uint16_t end;
> -	struct buf_vector buf_vec[BUF_VECTOR_MAX];
> -
> -	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
> -	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
> -		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
> -			dev->vid, __func__, queue_id);
> +	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
>  		return 0;
> -	}
>
>  	vq = dev->virtqueue[queue_id];
> -	if (unlikely(vq->enabled == 0))
> +	if (unlikely(!vq->enabled))
>  		return 0;
>
> -	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
> -	if (count == 0)
> -		return 0;
> +	if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
> +		is_mrg_rxbuf = 1;
>
> -	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> -		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
> -
> -		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
> -							 &end, buf_vec) < 0)) {
> -			LOG_DEBUG(VHOST_DATA,
> -				"(%d) failed to get enough desc from vring\n",
> -				dev->vid);
> +	/* start enqueuing packets 1 by 1 */
> +	pkt_idx = 0;
> +	pkt_left = count;
> +	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> +	while (1) {
> +		if (loop_check(vq, avail_idx, pkt_left))
What about:
while (pkt_left && avail_idx != vq->last_used_idx) {

>  			break;
> -		}
> -
> -		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
> -						      pkts[pkt_idx], buf_vec);
> -		rte_smp_wmb();
>
> -		*(volatile uint16_t *)&vq->used->idx += nr_used;
> -		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> -			sizeof(vq->used->idx));
> -		vq->last_used_idx += nr_used;
> -	}
> -
> -	if (likely(pkt_idx)) {
> -		/* flush used->idx update before we read avail->flags. */
> -		rte_mb();
> +		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
> +					is_mrg_rxbuf))
> +			break;
>
> -		/* Kick the guest if necessary. */
> -		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
> -				&& (vq->callfd >= 0))
> -			eventfd_write(vq->callfd, (eventfd_t)1);
> +		pkt_idx++;
> +		pkt_sent++;
> +		pkt_left--;
>  	}
>
> -	return pkt_idx;
> -}
> -
> -uint16_t
> -rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> -	struct rte_mbuf **pkts, uint16_t count)
> -{
> -	struct virtio_net *dev = get_device(vid);
> -
> -	if (!dev)
> -		return 0;
> +	/* update used idx and kick the guest if necessary */
> +	if (pkt_sent)
> +		notify_guest(dev, vq);
>
> -	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> -		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
> -	else
> -		return virtio_dev_rx(dev, queue_id, pkts, count);
> +	return pkt_sent;
>  }
>
>  static void
>

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-08-22  8:11   ` [PATCH v3 0/5] vhost: optimize enqueue Maxime Coquelin
@ 2016-08-22 10:01     ` Maxime Coquelin
  2016-08-22 10:35       ` Thomas Monjalon
  2016-08-23  2:31       ` Wang, Zhihong
  2016-08-23  2:15     ` Wang, Zhihong
  2016-09-21  8:50     ` Jianbo Liu
  2 siblings, 2 replies; 141+ messages in thread
From: Maxime Coquelin @ 2016-08-22 10:01 UTC (permalink / raw)
  To: Zhihong Wang, dev; +Cc: yuanhan.liu


On 08/22/2016 10:11 AM, Maxime Coquelin wrote:
> Hi Zhihong,
>
> On 08/19/2016 07:43 AM, Zhihong Wang wrote:
> > This patch set optimizes the vhost enqueue function.
> >
> > It implements the vhost logic from scratch into a single function
> > designed
> > for high performance and good maintainability, and improves CPU
> > efficiency
> > significantly by optimizing cache access, which means:
> >
> >  *  For fast frontends (eg. DPDK virtio pmd), higher performance (maximum
> >     throughput) can be achieved.
> >
> >  *  For slow frontends (eg. kernel virtio-net), better scalability can be
> >     achieved, each vhost core can support more connections since it takes
> >     less cycles to handle each single frontend.
> >
> > The main optimization techniques are:
> >
> >  1. Reorder code to reduce CPU pipeline stall cycles.
> >
> >  2. Batch update the used ring for better efficiency.
> >
> >  3. Prefetch descriptor to hide cache latency.
> >
> >  4. Remove useless volatile attribute to allow compiler optimization.
>
> Thanks for these details, this is helpful to understand where the perf
> gain comes from.
> I would suggest to add these information as comments in the code
> where/if it makes sense. If more a general comment, at least add it in
> the commit message of the patch introducing it.
> Indeed, adding it to the cover letter is fine, but the information is
> lost as soon as the series is applied.
>
> You don't mention any figures, so I set up a benchmark on my side to
> evaluate your series. It indeed shows an interesting performance gain.
>
> My setup consists of one host running a guest.
> The guest generates as much 64bytes packets as possible using
> pktgen-dpdk. The hosts forwards received packets back to the guest
> using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
> physical CPUs.
>
> I tested it with and without your v1 patch, with and without
> rx-mergeable feature turned ON.
> Results are the average of 8 runs of 60 seconds:
>
> Rx-Mergeable ON : 7.72Mpps
> Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
> Rx-Mergeable OFF: 10.52Mpps
> Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
>
I forgot to add that, before this series, I think we should first fix the Windows bug.
Else we will need a dedicated fix for the stable branch.

Regards,
Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-08-22 10:01     ` Maxime Coquelin
@ 2016-08-22 10:35       ` Thomas Monjalon
  2016-08-24  3:37         ` Wang, Zhihong
  2016-08-23  2:31       ` Wang, Zhihong
  1 sibling, 1 reply; 141+ messages in thread
From: Thomas Monjalon @ 2016-08-22 10:35 UTC (permalink / raw)
  To: Maxime Coquelin, Zhihong Wang, yuanhan.liu; +Cc: dev

2016-08-22 12:01, Maxime Coquelin:
> I forgot to add that before this series, I think we should first fix the windows bug.
> Else we will need a dedicated fix for the stable branch.

This is a funny situation :)
If Zhihong had reworked the code without mentioning it is fixing a scenario
with Windows guests, maybe nobody would have noticed ;)
That's probably why it is not written in v2/v3. But thanks to the v1,
we all know it:
	"It also fixes the issue working with Windows VMs."

So yes, it would be a lot better to find the root cause and try to have a
minimal fix for 16.07, then rework the code for performance in 16.11.
I think we must avoid silent fixes, and even more, avoid writing specific
fixes for stable branches without validating them in the master branch and
its large user base.

Thanks for your good work, guys; DPDK vhost is improving very well.

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-08-22  8:11   ` [PATCH v3 0/5] vhost: optimize enqueue Maxime Coquelin
  2016-08-22 10:01     ` Maxime Coquelin
@ 2016-08-23  2:15     ` Wang, Zhihong
  2016-09-21  8:50     ` Jianbo Liu
  2 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-23  2:15 UTC (permalink / raw)
  To: Maxime Coquelin, dev; +Cc: yuanhan.liu

> Subject: Re: [PATCH v3 0/5] vhost: optimize enqueue
> 
> Hi Zhihong,
> 
[...]
> > The main optimization techniques are:
> >
> >  1. Reorder code to reduce CPU pipeline stall cycles.
> >
> >  2. Batch update the used ring for better efficiency.
> >
> >  3. Prefetch descriptor to hide cache latency.
> >
> >  4. Remove useless volatile attribute to allow compiler optimization.
> 
> Thanks for these details, this is helpful to understand where the perf
> gain comes from.
> I would suggest to add these information as comments in the code
> where/if it makes sense. If more a general comment, at least add it in
> the commit message of the patch introducing it.
> Indeed, adding it to the cover letter is fine, but the information is
> lost as soon as the series is applied.

Hi Maxime,

I did add this info in the later optimization patches to explain each
optimization technique. The v1 was indeed hard to read.


> 
> You don't mention any figures, so I set up a benchmark on my side to
> evaluate your series. It indeed shows an interesting performance gain.
> 
> My setup consists of one host running a guest.
> The guest generates as much 64bytes packets as possible using
> pktgen-dpdk. The hosts forwards received packets back to the guest
> using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
> physical CPUs.
> 

Thanks for doing the test!

I didn't publish any numbers since the gain varies across different platforms
and test setups.

In my phy-to-VM test on both IVB and HSW, where testpmd in the host receives
from the NIC and enqueues to the guest, the v3 patch makes the enqueue path
(in cycles per packet) 2.4x and 1.4x as fast as the current code for
mergeable on and mergeable off respectively.


> I tested it with and without your v1 patch, with and without
> rx-mergeable feature turned ON.
> Results are the average of 8 runs of 60 seconds:
> 
> Rx-Mergeable ON : 7.72Mpps
> Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
> Rx-Mergeable OFF: 10.52Mpps
> Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
> 
> Regards,
> Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 1/5] vhost: rewrite enqueue
  2016-08-22  9:35     ` Maxime Coquelin
@ 2016-08-23  2:27       ` Wang, Zhihong
  2016-08-25  4:00       ` Yuanhan Liu
  1 sibling, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-23  2:27 UTC (permalink / raw)
  To: Maxime Coquelin, dev; +Cc: yuanhan.liu

Hi Maxime,

Thanks very much for the detailed review.

> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Monday, August 22, 2016 5:36 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> Cc: yuanhan.liu@linux.intel.com
> Subject: Re: [PATCH v3 1/5] vhost: rewrite enqueue
> 
> 
> 
> On 08/19/2016 07:43 AM, Zhihong Wang wrote:
> > This patch implements the vhost logic from scratch into a single function
> > designed for high performance and better maintainability.
> >
> > ---
> > Changes in v3:
> >
> >  1. Rewrite enqueue and delete the obsolete in the same patch.
> >
> > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > ---
> >  lib/librte_vhost/vhost_rxtx.c | 537 +++++++++++++-----------------------------
> >  1 file changed, 160 insertions(+), 377 deletions(-)
> >
> > diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> > index 08a73fd..b09a9c3 100644
> > --- a/lib/librte_vhost/vhost_rxtx.c
> > +++ b/lib/librte_vhost/vhost_rxtx.c
> > @@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t
> qp_nb)
> >  	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
> >  }
> >
> > -static void
> > +static inline void __attribute__((always_inline))
> >  virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr
> *net_hdr)
> >  {
> >  	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
> > @@ -125,427 +125,210 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf,
> struct virtio_net_hdr *net_hdr)
> >  	}
> >  }
> >
> > -static inline void
> > -copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
> > -		    struct virtio_net_hdr_mrg_rxbuf hdr)
> > +static inline uint32_t __attribute__((always_inline))
> > +loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
> Creating a function just for doing this doesn't make much sense.
> And the function name doesn't help.
> I think you should just remove this function.

Okay.

> 
> >  {
> > -	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
> > -		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
> > -	else
> > -		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
> > +	if (pkt_left == 0 || avail_idx == vq->last_used_idx)
> > +		return 1;
> > +
> > +	return 0;
> >  }
> >
> > -static inline int __attribute__((always_inline))
> > -copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > -		  struct rte_mbuf *m, uint16_t desc_idx)
> > +static inline uint32_t __attribute__((always_inline))
> > +enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > +		uint16_t avail_idx, struct rte_mbuf *mbuf,
> > +		uint32_t is_mrg_rxbuf)
> >  {
> > -	uint32_t desc_avail, desc_offset;
> > -	uint32_t mbuf_avail, mbuf_offset;
> > -	uint32_t cpy_len;
> > +	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
> >  	struct vring_desc *desc;
> > -	uint64_t desc_addr;
> > -	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
> > -
> > -	desc = &vq->desc[desc_idx];
> > +	uint64_t desc_addr = 0;
> > +	uint32_t desc_chain_head = 0;
> > +	uint32_t desc_chain_len = 0;
> > +	uint32_t desc_current = 0;
> > +	uint32_t desc_offset = 0;
> > +	uint32_t mbuf_len = 0;
> > +	uint32_t mbuf_avail = 0;
> > +	uint32_t copy_len = 0;
> > +	uint32_t extra_buffers = 0;
> > +	uint32_t used_idx_round = 0;
> Most of these variables don't need to be initialized.

Okay.

> 
> > +
> > +	/* start with the first mbuf of the packet */
> > +	mbuf_len = rte_pktmbuf_data_len(mbuf);
> > +	mbuf_avail = mbuf_len;
> > +
> > +	/* get the current desc */
> > +	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
> > +	desc_chain_head = desc_current;
> > +	desc = &vq->desc[desc_current];
> >  	desc_addr = gpa_to_vva(dev, desc->addr);
> > -	/*
> > -	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
> > -	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
> > -	 * otherwise stores offset on the stack instead of in a register.
> > -	 */
> > -	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
> > -		return -1;
> > -
> > -	rte_prefetch0((void *)(uintptr_t)desc_addr);
> > +	if (unlikely(!desc_addr))
> > +		goto error;
> >
> > -	virtio_enqueue_offload(m, &virtio_hdr.hdr);
> > -	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
> > +	/* handle virtio header */
> > +	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
> > +	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
> Parenthesis around virtio_hdr->hdr shouldn't be needed.
> >  	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
> > -	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
> Looks like you remove the PRINT_PACKET calls.
> Does it impact performance?
> In any case, it should be mentionned in the commit message.

Will add this.

> 
> > -
> >  	desc_offset = dev->vhost_hlen;
> > -	desc_avail  = desc->len - dev->vhost_hlen;
> > +	desc_chain_len = desc_offset;
> > +	desc_addr += desc_offset;
> > +	if (is_mrg_rxbuf)
> > +		virtio_hdr->num_buffers = 1;
> >
> > -	mbuf_avail  = rte_pktmbuf_data_len(m);
> > -	mbuf_offset = 0;
> > -	while (mbuf_avail != 0 || m->next != NULL) {
> > -		/* done with current mbuf, fetch next */
> > -		if (mbuf_avail == 0) {
> > -			m = m->next;
> > -
> > -			mbuf_offset = 0;
> > -			mbuf_avail  = rte_pktmbuf_data_len(m);
> > +	/* start copy from mbuf to desc */
> > +	while (1) {
> Please avoid while(1) when you can check for a real condition:
> while (mbuf_avail || mbuf->next) ?

Will rewrite this logic.

> 
> Compiler should optimize this properly, no?
> > +		/* get the next mbuf if the current done */
> > +		if (!mbuf_avail) {
> > +			if (mbuf->next) {
> > +				mbuf = mbuf->next;
> > +				mbuf_len = rte_pktmbuf_data_len(mbuf);
> > +				mbuf_avail = mbuf_len;
> > +			} else
> > +				break;
> >  		}
> >
> > -		/* done with current desc buf, fetch next */
> > -		if (desc_avail == 0) {
> > -			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
> > -				/* Room in vring buffer is not enough */
> > -				return -1;
> > -			}
> > -			if (unlikely(desc->next >= vq->size))
> > -				return -1;
> > -
> > -			desc = &vq->desc[desc->next];
> > -			desc_addr = gpa_to_vva(dev, desc->addr);
> > -			if (unlikely(!desc_addr))
> > -				return -1;
> > -
> > -			desc_offset = 0;
> > -			desc_avail  = desc->len;
> > +		/* get the next desc if the current done */
> > +		if (desc->len <= desc_offset) {
> > +			if (desc->flags & VRING_DESC_F_NEXT) {
> > +				/* go on with the current desc chain */
> > +				desc_offset = 0;
> > +				desc_current = desc->next;
> > +				desc = &vq->desc[desc_current];
> > +				desc_addr = gpa_to_vva(dev, desc->addr);
> > +				if (unlikely(!desc_addr))
> > +					goto rollback;
> you could goto directly to error, and decrement last_used_idx directly
> under "error"'s goto since extra_buffers will be zero otherwise.

Good call.

> 
> Also, except desc_current affectation, all the above code is common
> with mergeable case, so you should avoid duplication.
> > +			} else if (is_mrg_rxbuf) {
> > +				/* start with the next desc chain */
> > +				used_idx_round = vq->last_used_idx
> > +					& (vq->size - 1);
> > +				vq->used->ring[used_idx_round].id =
> > +					desc_chain_head;
> > +				vq->used->ring[used_idx_round].len =
> > +					desc_chain_len;
> > +				vhost_log_used_vring(dev, vq,
> > +					offsetof(struct vring_used,
> > +						ring[used_idx_round]),
> > +					sizeof(vq->used->ring[
> > +						used_idx_round]));
> > +				vq->last_used_idx++;
> > +				extra_buffers++;
> > +				virtio_hdr->num_buffers++;
> > +				if (avail_idx == vq->last_used_idx)
> > +					goto rollback;
> > +
> > +				desc_current =
> > +					vq->avail->ring[(vq->last_used_idx) &
> > +					(vq->size - 1)];
> > +				desc_chain_head = desc_current;
> > +				desc = &vq->desc[desc_current];
> > +				desc_addr = gpa_to_vva(dev, desc->addr);
> > +				if (unlikely(!desc_addr))
> > +					goto rollback;
> > +
> > +				desc_chain_len = 0;
> > +				desc_offset = 0;
> > +			} else
> > +				goto rollback;
> >  		}
> >
> > -		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
> > -		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
> > -			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
> > -			cpy_len);
> > -		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
> > -		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
> > -			     cpy_len, 0);
> > -
> > -		mbuf_avail  -= cpy_len;
> > -		mbuf_offset += cpy_len;
> > -		desc_avail  -= cpy_len;
> > -		desc_offset += cpy_len;
> > -	}
> > -
> > -	return 0;
> > -}
> > -
> > -/**
> > - * This function adds buffers to the virtio devices RX virtqueue. Buffers can
> > - * be received from the physical port or from another virtio device. A packet
> > - * count is returned to indicate the number of packets that are succesfully
> > - * added to the RX queue. This function works when the mbuf is scattered, but
> > - * it doesn't support the mergeable feature.
> > - */
> > -static inline uint32_t __attribute__((always_inline))
> > -virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
> > -	      struct rte_mbuf **pkts, uint32_t count)
> > -{
> > -	struct vhost_virtqueue *vq;
> > -	uint16_t avail_idx, free_entries, start_idx;
> > -	uint16_t desc_indexes[MAX_PKT_BURST];
> > -	uint16_t used_idx;
> > -	uint32_t i;
> > -
> > -	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
> > -	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
> > -		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
> > -			dev->vid, __func__, queue_id);
> > -		return 0;
> > +		/* copy mbuf data */
> > +		copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
> > +		rte_memcpy((void *)(uintptr_t)desc_addr,
> > +				rte_pktmbuf_mtod_offset(mbuf, void *,
> > +					mbuf_len - mbuf_avail),
> > +				copy_len);
> > +		vhost_log_write(dev, desc->addr + desc_offset, copy_len);
> > +		mbuf_avail -= copy_len;
> > +		desc_offset += copy_len;
> > +		desc_addr += copy_len;
> > +		desc_chain_len += copy_len;
> >  	}
> >
> > -	vq = dev->virtqueue[queue_id];
> > -	if (unlikely(vq->enabled == 0))
> > -		return 0;
> > -
> > -	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> > -	start_idx = vq->last_used_idx;
> > -	free_entries = avail_idx - start_idx;
> > -	count = RTE_MIN(count, free_entries);
> > -	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
> > -	if (count == 0)
> > -		return 0;
> > -
> > -	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
> > -		dev->vid, start_idx, start_idx + count);
> > -
> > -	/* Retrieve all of the desc indexes first to avoid caching issues. */
> > -	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
> > -	for (i = 0; i < count; i++) {
> > -		used_idx = (start_idx + i) & (vq->size - 1);
> > -		desc_indexes[i] = vq->avail->ring[used_idx];
> > -		vq->used->ring[used_idx].id = desc_indexes[i];
> > -		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
> > -					       dev->vhost_hlen;
> > -		vhost_log_used_vring(dev, vq,
> > -			offsetof(struct vring_used, ring[used_idx]),
> > -			sizeof(vq->used->ring[used_idx]));
> > -	}
> > +	used_idx_round = vq->last_used_idx & (vq->size - 1);
> > +	vq->used->ring[used_idx_round].id = desc_chain_head;
> > +	vq->used->ring[used_idx_round].len = desc_chain_len;
> > +	vhost_log_used_vring(dev, vq,
> > +		offsetof(struct vring_used, ring[used_idx_round]),
> > +		sizeof(vq->used->ring[used_idx_round]));
> > +	vq->last_used_idx++;
> All this code is duplicated from the rx_mergeable base.
> I think a dedicated inline function would really make sense here.

Good catch. Will make a function for this.
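
Probably something along these lines (just a sketch; the final name and
exact arguments may differ in the next version):

static inline void __attribute__((always_inline))
update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint32_t desc_chain_head, uint32_t desc_chain_len)
{
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);

	/* one used ring entry describes one whole descriptor chain */
	vq->used->ring[used_idx].id  = desc_chain_head;
	vq->used->ring[used_idx].len = desc_chain_len;
	vhost_log_used_vring(dev, vq,
			offsetof(struct vring_used, ring[used_idx]),
			sizeof(vq->used->ring[used_idx]));
}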

> 
> >
> > -	rte_prefetch0(&vq->desc[desc_indexes[0]]);
> > -	for (i = 0; i < count; i++) {
> > -		uint16_t desc_idx = desc_indexes[i];
> > -		int err;
> > +	return 0;
> >
> > -		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
> > -		if (unlikely(err)) {
> > -			used_idx = (start_idx + i) & (vq->size - 1);
> > -			vq->used->ring[used_idx].len = dev->vhost_hlen;
> > -			vhost_log_used_vring(dev, vq,
> > -				offsetof(struct vring_used, ring[used_idx]),
> > -				sizeof(vq->used->ring[used_idx]));
> > -		}
> > +rollback:
> > +	/* rollback on any error if last_used_idx update on-the-fly */
> > +	if (is_mrg_rxbuf)
> If (!is_mrg_rxbuf), extra_buffers will be zero, so just remove the test
> and place the line below directly under error:, as explained above.

Sure. Thanks.

> > +		vq->last_used_idx -= extra_buffers;
> >
> > -		if (i + 1 < count)
> > -			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
> > -	}
> > +error:
> > +	return 1;
> > +}
> >
> > +static inline void __attribute__((always_inline))
> > +notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
> > +{
> >  	rte_smp_wmb();
> > -
> > -	*(volatile uint16_t *)&vq->used->idx += count;
> > -	vq->last_used_idx += count;
> > -	vhost_log_used_vring(dev, vq,
> > -		offsetof(struct vring_used, idx),
> > -		sizeof(vq->used->idx));
> > -
> > -	/* flush used->idx update before we read avail->flags. */
> > +	vq->used->idx = vq->last_used_idx;
> > +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> > +			sizeof(vq->used->idx));
> >  	rte_mb();
> > -
> > -	/* Kick the guest if necessary. */
> >  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
> >  			&& (vq->callfd >= 0))
> >  		eventfd_write(vq->callfd, (eventfd_t)1);
> > -	return count;
> > -}
> > -
> > -static inline int
> > -fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
> > -	     uint32_t *allocated, uint32_t *vec_idx,
> > -	     struct buf_vector *buf_vec)
> > -{
> > -	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
> > -	uint32_t vec_id = *vec_idx;
> > -	uint32_t len    = *allocated;
> > -
> > -	while (1) {
> > -		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
> > -			return -1;
> > -
> > -		len += vq->desc[idx].len;
> > -		buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
> > -		buf_vec[vec_id].buf_len  = vq->desc[idx].len;
> > -		buf_vec[vec_id].desc_idx = idx;
> > -		vec_id++;
> > -
> > -		if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
> > -			break;
> > -
> > -		idx = vq->desc[idx].next;
> > -	}
> > -
> > -	*allocated = len;
> > -	*vec_idx   = vec_id;
> > -
> > -	return 0;
> > -}
> > -
> > -/*
> > - * Returns -1 on fail, 0 on success
> > - */
> > -static inline int
> > -reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
> > -			    uint16_t *end, struct buf_vector *buf_vec)
> > -{
> > -	uint16_t cur_idx;
> > -	uint16_t avail_idx;
> > -	uint32_t allocated = 0;
> > -	uint32_t vec_idx = 0;
> > -	uint16_t tries = 0;
> > -
> > -	cur_idx  = vq->last_used_idx;
> > -
> > -	while (1) {
> > -		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> > -		if (unlikely(cur_idx == avail_idx))
> > -			return -1;
> > -
> > -		if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
> > -					  &vec_idx, buf_vec) < 0))
> > -			return -1;
> > -
> > -		cur_idx++;
> > -		tries++;
> > -
> > -		if (allocated >= size)
> > -			break;
> > -
> > -		/*
> > -		 * if we tried all available ring items, and still
> > -		 * can't get enough buf, it means something abnormal
> > -		 * happened.
> > -		 */
> > -		if (unlikely(tries >= vq->size))
> > -			return -1;
> > -	}
> > -
> > -	*end = cur_idx;
> > -	return 0;
> >  }
> >
> > -static inline uint32_t __attribute__((always_inline))
> > -copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
> > -			    uint16_t end_idx, struct rte_mbuf *m,
> > -			    struct buf_vector *buf_vec)
> > +uint16_t
> > +rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> > +	struct rte_mbuf **pkts, uint16_t count)
> >  {
> > -	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
> > -	uint32_t vec_idx = 0;
> > -	uint16_t start_idx = vq->last_used_idx;
> > -	uint16_t cur_idx = start_idx;
> > -	uint64_t desc_addr;
> > -	uint32_t mbuf_offset, mbuf_avail;
> > -	uint32_t desc_offset, desc_avail;
> > -	uint32_t cpy_len;
> > -	uint16_t desc_idx, used_idx;
> > -
> > -	if (unlikely(m == NULL))
> > +	struct vhost_virtqueue *vq;
> > +	struct virtio_net *dev;
> > +	uint32_t pkt_idx = 0;
> > +	uint32_t pkt_left = 0;
> > +	uint32_t pkt_sent = 0;
> > +	uint32_t is_mrg_rxbuf = 0;
> > +	uint16_t avail_idx = 0;
> > +
> > +	/* precheck */
> Comment not very informative here.

Okay.

> > +	if (unlikely(count == 0))
> >  		return 0;
> >
> > -	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
> > -		dev->vid, cur_idx, end_idx);
> > +	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
> >
> > -	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
> > -	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
> > +	dev = get_device(vid);
> > +	if (unlikely(!dev))
> >  		return 0;
> >
> > -	rte_prefetch0((void *)(uintptr_t)desc_addr);
> > -
> > -	virtio_hdr.num_buffers = end_idx - start_idx;
> > -	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
> > -		dev->vid, virtio_hdr.num_buffers);
> > -
> > -	virtio_enqueue_offload(m, &virtio_hdr.hdr);
> > -	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
> > -	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
> > -	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
> > -
> > -	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
> > -	desc_offset = dev->vhost_hlen;
> > -
> > -	mbuf_avail  = rte_pktmbuf_data_len(m);
> > -	mbuf_offset = 0;
> > -	while (mbuf_avail != 0 || m->next != NULL) {
> > -		/* done with current desc buf, get the next one */
> > -		if (desc_avail == 0) {
> > -			desc_idx = buf_vec[vec_idx].desc_idx;
> > -
> > -			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
> > -				/* Update used ring with desc information */
> > -				used_idx = cur_idx++ & (vq->size - 1);
> > -				vq->used->ring[used_idx].id  = desc_idx;
> > -				vq->used->ring[used_idx].len = desc_offset;
> > -				vhost_log_used_vring(dev, vq,
> > -					offsetof(struct vring_used,
> > -						 ring[used_idx]),
> > -					sizeof(vq->used->ring[used_idx]));
> > -			}
> > -
> > -			vec_idx++;
> > -			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
> > -			if (unlikely(!desc_addr))
> > -				return 0;
> > -
> > -			/* Prefetch buffer address. */
> > -			rte_prefetch0((void *)(uintptr_t)desc_addr);
> > -			desc_offset = 0;
> > -			desc_avail  = buf_vec[vec_idx].buf_len;
> > -		}
> > -
> > -		/* done with current mbuf, get the next one */
> > -		if (mbuf_avail == 0) {
> > -			m = m->next;
> > -
> > -			mbuf_offset = 0;
> > -			mbuf_avail  = rte_pktmbuf_data_len(m);
> > -		}
> > -
> > -		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
> > -		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
> > -			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
> > -			cpy_len);
> > -		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
> > -			cpy_len);
> > -		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
> > -			cpy_len, 0);
> > -
> > -		mbuf_avail  -= cpy_len;
> > -		mbuf_offset += cpy_len;
> > -		desc_avail  -= cpy_len;
> > -		desc_offset += cpy_len;
> > -	}
> > -
> > -	used_idx = cur_idx & (vq->size - 1);
> > -	vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
> > -	vq->used->ring[used_idx].len = desc_offset;
> > -	vhost_log_used_vring(dev, vq,
> > -		offsetof(struct vring_used, ring[used_idx]),
> > -		sizeof(vq->used->ring[used_idx]));
> > -
> > -	return end_idx - start_idx;
> > -}
> > -
> > -static inline uint32_t __attribute__((always_inline))
> > -virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
> > -	struct rte_mbuf **pkts, uint32_t count)
> > -{
> > -	struct vhost_virtqueue *vq;
> > -	uint32_t pkt_idx = 0, nr_used = 0;
> > -	uint16_t end;
> > -	struct buf_vector buf_vec[BUF_VECTOR_MAX];
> > -
> > -	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
> > -	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
> > -		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
> > -			dev->vid, __func__, queue_id);
> > +	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
> >  		return 0;
> > -	}
> >
> >  	vq = dev->virtqueue[queue_id];
> > -	if (unlikely(vq->enabled == 0))
> > +	if (unlikely(!vq->enabled))
> >  		return 0;
> >
> > -	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
> > -	if (count == 0)
> > -		return 0;
> > +	if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
> > +		is_mrg_rxbuf = 1;
> >
> > -	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> > -		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
> > -
> > -		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
> > -							 &end, buf_vec) < 0)) {
> > -			LOG_DEBUG(VHOST_DATA,
> > -				"(%d) failed to get enough desc from vring\n",
> > -				dev->vid);
> > +	/* start enqueuing packets 1 by 1 */
> > +	pkt_idx = 0;
> > +	pkt_left = count;
> > +	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> > +	while (1) {
> > +		if (loop_check(vq, avail_idx, pkt_left))
> What about:
> while (pkt_left && avail_idx != vq->last_used_idx) {

Will rewrite it.
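
Probably along these lines for the next version (sketch only):

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	while (pkt_left && avail_idx != vq->last_used_idx) {
		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
					is_mrg_rxbuf))
			break;
		pkt_idx++;
		pkt_sent++;
		pkt_left--;
	}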

> 
> >  			break;
> > -		}
> > -
> > -		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
> > -						      pkts[pkt_idx], buf_vec);
> > -		rte_smp_wmb();
> >
> > -		*(volatile uint16_t *)&vq->used->idx += nr_used;
> > -		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> > -			sizeof(vq->used->idx));
> > -		vq->last_used_idx += nr_used;
> > -	}
> > -
> > -	if (likely(pkt_idx)) {
> > -		/* flush used->idx update before we read avail->flags. */
> > -		rte_mb();
> > +		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
> > +					is_mrg_rxbuf))
> > +			break;
> >
> > -		/* Kick the guest if necessary. */
> > -		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
> > -				&& (vq->callfd >= 0))
> > -			eventfd_write(vq->callfd, (eventfd_t)1);
> > +		pkt_idx++;
> > +		pkt_sent++;
> > +		pkt_left--;
> >  	}
> >
> > -	return pkt_idx;
> > -}
> > -
> > -uint16_t
> > -rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> > -	struct rte_mbuf **pkts, uint16_t count)
> > -{
> > -	struct virtio_net *dev = get_device(vid);
> > -
> > -	if (!dev)
> > -		return 0;
> > +	/* update used idx and kick the guest if necessary */
> > +	if (pkt_sent)
> > +		notify_guest(dev, vq);
> >
> > -	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> > -		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
> > -	else
> > -		return virtio_dev_rx(dev, queue_id, pkts, count);
> > +	return pkt_sent;
> >  }
> >
> >  static void
> >

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-08-22 10:01     ` Maxime Coquelin
  2016-08-22 10:35       ` Thomas Monjalon
@ 2016-08-23  2:31       ` Wang, Zhihong
  2016-08-23 10:43         ` Wang, Zhihong
  1 sibling, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-23  2:31 UTC (permalink / raw)
  To: Maxime Coquelin, dev; +Cc: yuanhan.liu



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Monday, August 22, 2016 6:02 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> Cc: yuanhan.liu@linux.intel.com
> Subject: Re: [PATCH v3 0/5] vhost: optimize enqueue
> 
> 
> On 08/22/2016 10:11 AM, Maxime Coquelin wrote:
> > Hi Zhihong,
> >
> > On 08/19/2016 07:43 AM, Zhihong Wang wrote:
> > > This patch set optimizes the vhost enqueue function.
> > >
> > > It implements the vhost logic from scratch into a single function
> > > designed
> > > for high performance and good maintainability, and improves CPU
> > > efficiency
> > > significantly by optimizing cache access, which means:
> > >
> > >  *  For fast frontends (eg. DPDK virtio pmd), higher performance
> (maximum
> > >     throughput) can be achieved.
> > >
> > >  *  For slow frontends (eg. kernel virtio-net), better scalability can be
> > >     achieved, each vhost core can support more connections since it takes
> > >     less cycles to handle each single frontend.
> > >
> > > The main optimization techniques are:
> > >
> > >  1. Reorder code to reduce CPU pipeline stall cycles.
> > >
> > >  2. Batch update the used ring for better efficiency.
> > >
> > >  3. Prefetch descriptor to hide cache latency.
> > >
> > >  4. Remove useless volatile attribute to allow compiler optimization.
> >
> > Thanks for these details, this is helpful to understand where the perf
> > gain comes from.
> > I would suggest to add these information as comments in the code
> > where/if it makes sense. If more a general comment, at least add it in
> > the commit message of the patch introducing it.
> > Indeed, adding it to the cover letter is fine, but the information is
> > lost as soon as the series is applied.
> >
> > You don't mention any figures, so I set up a benchmark on my side to
> > evaluate your series. It indeed shows an interesting performance gain.
> >
> > My setup consists of one host running a guest.
> > The guest generates as much 64bytes packets as possible using
> > pktgen-dpdk. The hosts forwards received packets back to the guest
> > using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
> > physical CPUs.
> >
> > I tested it with and without your v1 patch, with and without
> > rx-mergeable feature turned ON.
> > Results are the average of 8 runs of 60 seconds:
> >
> > Rx-Mergeable ON : 7.72Mpps
> > Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
> > Rx-Mergeable OFF: 10.52Mpps
> > Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
> >
> I forgot to add that before this series, I think we should first fix the windows bug.
> Else we will need a dedicated fix for the stable branch.

Okay I'll try to fix it, though I can't make any promises at present.

I tried once but stopped since we don't have enough debug info from the
frontend side, so basically I was debugging the backend based on guesses.


> 
> Regards,
> Maxime


^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-08-23  2:31       ` Wang, Zhihong
@ 2016-08-23 10:43         ` Wang, Zhihong
  2016-08-23 12:16           ` Maxime Coquelin
  2016-08-23 12:22           ` Yuanhan Liu
  0 siblings, 2 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-23 10:43 UTC (permalink / raw)
  To: 'Maxime Coquelin', 'dev@dpdk.org',
	'yuanhan.liu@linux.intel.com'



> -----Original Message-----
> From: Wang, Zhihong
> Sent: Tuesday, August 23, 2016 10:31 AM
> To: Maxime Coquelin <maxime.coquelin@redhat.com>; dev@dpdk.org
> Cc: yuanhan.liu@linux.intel.com
> Subject: RE: [PATCH v3 0/5] vhost: optimize enqueue
> 
> 
> 
> > -----Original Message-----
> > From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> > Sent: Monday, August 22, 2016 6:02 PM
> > To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> > Cc: yuanhan.liu@linux.intel.com
> > Subject: Re: [PATCH v3 0/5] vhost: optimize enqueue
> >
> >
> > On 08/22/2016 10:11 AM, Maxime Coquelin wrote:
> > > Hi Zhihong,
> > >
> > > On 08/19/2016 07:43 AM, Zhihong Wang wrote:
> > > > This patch set optimizes the vhost enqueue function.
> > > >
> > > > It implements the vhost logic from scratch into a single function
> > > > designed
> > > > for high performance and good maintainability, and improves CPU
> > > > efficiency
> > > > significantly by optimizing cache access, which means:
> > > >
> > > >  *  For fast frontends (eg. DPDK virtio pmd), higher performance
> > (maximum
> > > >     throughput) can be achieved.
> > > >
> > > >  *  For slow frontends (eg. kernel virtio-net), better scalability can be
> > > >     achieved, each vhost core can support more connections since it takes
> > > >     less cycles to handle each single frontend.
> > > >
> > > > The main optimization techniques are:
> > > >
> > > >  1. Reorder code to reduce CPU pipeline stall cycles.
> > > >
> > > >  2. Batch update the used ring for better efficiency.
> > > >
> > > >  3. Prefetch descriptor to hide cache latency.
> > > >
> > > >  4. Remove useless volatile attribute to allow compiler optimization.
> > >
> > > Thanks for these details, this is helpful to understand where the perf
> > > gain comes from.
> > > I would suggest to add these information as comments in the code
> > > where/if it makes sense. If more a general comment, at least add it in
> > > the commit message of the patch introducing it.
> > > Indeed, adding it to the cover letter is fine, but the information is
> > > lost as soon as the series is applied.
> > >
> > > You don't mention any figures, so I set up a benchmark on my side to
> > > evaluate your series. It indeed shows an interesting performance gain.
> > >
> > > My setup consists of one host running a guest.
> > > The guest generates as much 64bytes packets as possible using
> > > pktgen-dpdk. The hosts forwards received packets back to the guest
> > > using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
> > > physical CPUs.
> > >
> > > I tested it with and without your v1 patch, with and without
> > > rx-mergeable feature turned ON.
> > > Results are the average of 8 runs of 60 seconds:
> > >
> > > Rx-Mergeable ON : 7.72Mpps
> > > Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
> > > Rx-Mergeable OFF: 10.52Mpps
> > > Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
> > >
> > I forgot to add that before this series, I think we should first fix the windows
> bug.
> > Else we will need a dedicated fix for the stable branch.
> 
> Okay I'll try to fix it, though I can't make any promises at present.
> 
> Have tried once but stopped since we don't have enough debug info from the
> frontend side so basically I was debugging the backend based on guesses.

Hi Maxime, Yuanhan,

I've identified the root cause, do you think it makes sense to put the fix
in the same patch set? Or send it as a separated patch?


Thanks
Zhihong

> 
> 
> >
> > Regards,
> > Maxime


^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-08-23 10:43         ` Wang, Zhihong
@ 2016-08-23 12:16           ` Maxime Coquelin
  2016-08-23 12:22           ` Yuanhan Liu
  1 sibling, 0 replies; 141+ messages in thread
From: Maxime Coquelin @ 2016-08-23 12:16 UTC (permalink / raw)
  To: Wang, Zhihong, 'dev@dpdk.org',
	'yuanhan.liu@linux.intel.com'



On 08/23/2016 12:43 PM, Wang, Zhihong wrote:
>
>
>> -----Original Message-----
>> From: Wang, Zhihong
>> Sent: Tuesday, August 23, 2016 10:31 AM
>> To: Maxime Coquelin <maxime.coquelin@redhat.com>; dev@dpdk.org
>> Cc: yuanhan.liu@linux.intel.com
>> Subject: RE: [PATCH v3 0/5] vhost: optimize enqueue
>>
>>
>>
>>> -----Original Message-----
>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>> Sent: Monday, August 22, 2016 6:02 PM
>>> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
>>> Cc: yuanhan.liu@linux.intel.com
>>> Subject: Re: [PATCH v3 0/5] vhost: optimize enqueue
..
>>>>
>>> I forgot to add that before this series, I think we should first fix the windows
>> bug.
>>> Else we will need a dedicated fix for the stable branch.
>>
>> Okay I'll try to fix it, though I can't make any promises at present.
>>
>> Have tried once but stopped since we don't have enough debug info from the
>> frontend side so basically I was debugging the backend based on guesses.
>
> Hi Maxime, Yuanhan,
>
> I've identified the root cause, do you think it makes sense to put the fix
> in the same patch set? Or send it as a separated patch?

Good work!

Agree with Yuanhan, send it before the optimization series.

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-08-23 10:43         ` Wang, Zhihong
  2016-08-23 12:16           ` Maxime Coquelin
@ 2016-08-23 12:22           ` Yuanhan Liu
  1 sibling, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-08-23 12:22 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: 'Maxime Coquelin', 'dev@dpdk.org'

On Tue, Aug 23, 2016 at 10:43:36AM +0000, Wang, Zhihong wrote:
> > > I forgot to add that before this series, I think we should first fix the windows
> > bug.
> > > Else we will need a dedicated fix for the stable branch.
> > 
> > Okay I'll try to fix it, though I can't make any promises at present.
> > 
> > Have tried once but stopped since we don't have enough debug info from the
> > frontend side so basically I was debugging the backend based on guesses.
> 
> Hi Maxime, Yuanhan,
> 
> I've identified the root cause, do you think it makes sense to put the fix
> in the same patch set? Or send it as a separated patch?

Great!

Yes, it's okay to put it in the patch set (normally, as the first patch,
before the rewrite).

Please also add following line before your Signed-off-by in the commit
log:

    Cc: <stable@dpdk.org>

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-08-22 10:35       ` Thomas Monjalon
@ 2016-08-24  3:37         ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-24  3:37 UTC (permalink / raw)
  To: Thomas Monjalon, Maxime Coquelin, yuanhan.liu; +Cc: dev



> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Monday, August 22, 2016 6:35 PM
> To: Maxime Coquelin <maxime.coquelin@redhat.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; yuanhan.liu@linux.intel.com
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> 2016-08-22 12:01, Maxime Coquelin:
> > I forgot to add that before this series, I think we should first fix the
> windows bug.
> > Else we will need a dedicated fix for the stable branch.
> 
> This is a funny situation :)
> If Zhihong had reworked the code without mentioning it is fixing a scenario
> with Windows guests, maybe nobody would have noticed ;) That's
> probably why it is not written in v2/v3. But thanks to the v1, we all know it:
> 	"It also fixes the issue working with Windows VMs."

I thought it'd be more appropriate to send a dedicated fix for the stable branch.
So I removed this info.

> 
> So yes, it would be a lot better to find the root cause and try to have a
> minimal fix for 16.07, then rework the code for performance in 16.11.
> I think we must avoid silent fixes, and even more, avoid writing specific fixes
> for stable branches without validating them in the master branch and its large
> user base.

Okay, that's also what Maxime and Yuanhan suggest.

BTW the root cause has been identified and fix will be in v4.

> 
> Thanks for your good works guys, DPDK vhost is improving very well.

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 4/5] vhost: batch update used ring
  2016-08-19  5:43   ` [PATCH v3 4/5] vhost: batch update used ring Zhihong Wang
@ 2016-08-25  3:48     ` Yuanhan Liu
  2016-08-25  5:19       ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-08-25  3:48 UTC (permalink / raw)
  To: Zhihong Wang; +Cc: dev, maxime.coquelin

On Fri, Aug 19, 2016 at 01:43:49AM -0400, Zhihong Wang wrote:
> This patch enables batch update of the used ring for better efficiency.
> 
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
...
> diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
> index 1785695..87d09fa 100644
> --- a/lib/librte_vhost/virtio-net.c
> +++ b/lib/librte_vhost/virtio-net.c
> @@ -152,10 +152,14 @@ cleanup_device(struct virtio_net *dev, int destroy)
>  static void
>  free_device(struct virtio_net *dev)
>  {
> +	struct vhost_virtqueue *vq;
>  	uint32_t i;
>  
> -	for (i = 0; i < dev->virt_qp_nb; i++)
> -		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
> +	for (i = 0; i < dev->virt_qp_nb; i++) {
> +		vq = dev->virtqueue[i * VIRTIO_QNUM];
> +		rte_free(vq->shadow_used_ring);
> +		rte_free(vq);
> +	}
>  	rte_free(dev);
>  }
> @@ -418,13 +422,18 @@ int
>  vhost_set_vring_num(int vid, struct vhost_vring_state *state)
>  {
>  	struct virtio_net *dev;
> +	struct vhost_virtqueue *vq;
>  
>  	dev = get_device(vid);
>  	if (dev == NULL)
>  		return -1;
>  
>  	/* State->index refers to the queue index. The txq is 1, rxq is 0. */
> -	dev->virtqueue[state->index]->size = state->num;
> +	vq = dev->virtqueue[state->index];
> +	vq->size = state->num;
> +	vq->shadow_used_ring = rte_malloc("",
> +			vq->size * sizeof(struct vring_used_elem),
> +			RTE_CACHE_LINE_SIZE);

Few notes here:

- I think the typical way to not specify a string type is to use NULL,
  not "".

- You should check the return value of rte_malloc: it could fail.

- Note that free_device() is invoked only when the vhost-user connection
  is broken (say the guest is halted). However, vhost_set_vring_num() could
  be invoked many times for a connection, say when you restart testpmd
  many times. This would lead to memory leak.

  The right way is to free it on get_vring_base().

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 1/5] vhost: rewrite enqueue
  2016-08-22  9:35     ` Maxime Coquelin
  2016-08-23  2:27       ` Wang, Zhihong
@ 2016-08-25  4:00       ` Yuanhan Liu
  1 sibling, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-08-25  4:00 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: Zhihong Wang, dev

On Mon, Aug 22, 2016 at 11:35:47AM +0200, Maxime Coquelin wrote:
> >-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
> >-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
> >+	/* handle virtio header */
> >+	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
> >+	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
> Parentheses around virtio_hdr->hdr shouldn't be needed.
> > 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
> >-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
> Looks like you removed the PRINT_PACKET calls.
> Does it impact performance?

Yes, it does. But it's only enabled in debug mode. Besides that,
it's just a NOOP.

> In any case, it should be mentioned in the commit message.

Agreed. But in this case, we should not remove it: it breaks
debuggability.

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 4/5] vhost: batch update used ring
  2016-08-25  3:48     ` Yuanhan Liu
@ 2016-08-25  5:19       ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-08-25  5:19 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: dev, maxime.coquelin



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Thursday, August 25, 2016 11:48 AM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org; maxime.coquelin@redhat.com
> Subject: Re: [PATCH v3 4/5] vhost: batch update used ring
> 
> On Fri, Aug 19, 2016 at 01:43:49AM -0400, Zhihong Wang wrote:
> > This patch enables batch update of the used ring for better efficiency.
> >
> > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ...
> > diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
> > index 1785695..87d09fa 100644
> > --- a/lib/librte_vhost/virtio-net.c
> > +++ b/lib/librte_vhost/virtio-net.c
> > @@ -152,10 +152,14 @@ cleanup_device(struct virtio_net *dev, int
> destroy)
> >  static void
> >  free_device(struct virtio_net *dev)
> >  {
> > +	struct vhost_virtqueue *vq;
> >  	uint32_t i;
> >
> > -	for (i = 0; i < dev->virt_qp_nb; i++)
> > -		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
> > +	for (i = 0; i < dev->virt_qp_nb; i++) {
> > +		vq = dev->virtqueue[i * VIRTIO_QNUM];
> > +		rte_free(vq->shadow_used_ring);
> > +		rte_free(vq);
> > +	}
> >  	rte_free(dev);
> >  }
> > @@ -418,13 +422,18 @@ int
> >  vhost_set_vring_num(int vid, struct vhost_vring_state *state)
> >  {
> >  	struct virtio_net *dev;
> > +	struct vhost_virtqueue *vq;
> >
> >  	dev = get_device(vid);
> >  	if (dev == NULL)
> >  		return -1;
> >
> >  	/* State->index refers to the queue index. The txq is 1, rxq is 0. */
> > -	dev->virtqueue[state->index]->size = state->num;
> > +	vq = dev->virtqueue[state->index];
> > +	vq->size = state->num;
> > +	vq->shadow_used_ring = rte_malloc("",
> > +			vq->size * sizeof(struct vring_used_elem),
> > +			RTE_CACHE_LINE_SIZE);
> 
> Few notes here:
> 
> - I think the typical way to not specific a string type is using NULL,
>   but not "".
> 
> - You should check the return value of rte_malloc: it could fail.
> 
> - Note that free_device() is invoked only when the vhost-user connection
>   is broken (say the guest is halt). However, vhost_set_vring_num() could
>   be invoked many times for a connection, say when you restart testpmd
>   many times. This would lead to memory leak.
> 
>   The right way is to free it on get_vring_base().

Good catch! Thanks.
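
Will rework it roughly like below (sketch only; the rte_free() will move
to get_vring_base() as you suggested):

	vq = dev->virtqueue[state->index];
	vq->size = state->num;
	vq->shadow_used_ring = rte_malloc(NULL,
			vq->size * sizeof(struct vring_used_elem),
			RTE_CACHE_LINE_SIZE);
	if (!vq->shadow_used_ring)
		return -1;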

> 
> 	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* [PATCH v4 0/6] vhost: optimize enqueue
  2016-08-16  3:50 [PATCH] optimize vhost enqueue Zhihong Wang
                   ` (2 preceding siblings ...)
  2016-08-19  5:43 ` [PATCH v3 0/5] vhost: optimize enqueue Zhihong Wang
@ 2016-08-30  3:35 ` Zhihong Wang
  2016-08-30  3:35   ` [PATCH v4 1/6] vhost: fix windows vm hang Zhihong Wang
                     ` (5 more replies)
  2016-09-09  3:39 ` [PATCH v5 0/6] vhost: optimize enqueue Zhihong Wang
  2016-09-20  2:00 ` [PATCH v6 " Zhihong Wang
  5 siblings, 6 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-30  3:35 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon

This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

 *  Higher maximum throughput can be achieved for fast frontends like DPDK
    virtio pmd.

 *  Better scalability can be achieved: each vhost core can support
    more connections because it takes fewer cycles to handle each single
    frontend.

This patch set contains:

 1. A Windows VM compatibility fix for vhost enqueue in the 16.07 release.

 2. A baseline patch to rewrite the vhost logic.

 3. A series of optimization patches added upon the baseline.

The main optimization techniques are:

 1. Reorder code to reduce CPU pipeline stall cycles.

 2. Batch update the used ring for better efficiency.

 3. Prefetch descriptor to hide cache latency.

 4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.
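
To illustrate the batching, a rough sketch of the idea (the real code is in
the "vhost: batch update used ring" patch; this only reuses the shadow ring
fields added there and may differ in detail):

	/* stage used ring updates in a per-virtqueue shadow ring ... */
	vq->shadow_used_ring[vq->shadow_used_idx].id  = desc_chain_head;
	vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
	vq->shadow_used_idx++;

	/* ... and flush them once per burst instead of once per chain */
	for (i = 0; i < vq->shadow_used_idx; i++) {
		used_idx = (vq->last_used_idx + i) & (vq->size - 1);
		vq->used->ring[used_idx] = vq->shadow_used_ring[i];
		vhost_log_used_vring(dev, vq,
				offsetof(struct vring_used, ring[used_idx]),
				sizeof(vq->used->ring[used_idx]));
	}
	vq->last_used_idx += vq->shadow_used_idx;
	vq->shadow_used_idx = 0;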

In the existing code there're 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Besides, having 2 callback paths increases
the maintenance effort.

Also, there's a compatibility issue in the existing code which causes
Windows VMs to hang when the mrg_rxbuf feature is turned on.

---
Changes in v4:

 1. Fix a Windows VM compatibility issue.

 2. Free shadow used ring in the right place.

 3. Add failure check for shadow used ring malloc.

 4. Refactor the code for clearer logic.

 5. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 3. Rewrite enqueue and delete the obsolete in the same patch.

---
Changes in v2:

 1. Split the big function into several small ones.

 2. Use multiple patches to explain each optimization.

 3. Add comments.

Zhihong Wang (6):
  vhost: fix windows vm hang
  vhost: rewrite enqueue
  vhost: remove useless volatile
  vhost: add desc prefetch
  vhost: batch update used ring
  vhost: optimize cache access

 lib/librte_vhost/vhost-net.h  |   6 +-
 lib/librte_vhost/vhost_rxtx.c | 572 +++++++++++++++---------------------------
 lib/librte_vhost/virtio-net.c |  42 +++-
 3 files changed, 244 insertions(+), 376 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 141+ messages in thread

* [PATCH v4 1/6] vhost: fix windows vm hang
  2016-08-30  3:35 ` [PATCH v4 0/6] " Zhihong Wang
@ 2016-08-30  3:35   ` Zhihong Wang
  2016-09-05  5:24     ` [dpdk-stable] " Yuanhan Liu
  2016-08-30  3:36   ` [PATCH v4 2/6] vhost: rewrite enqueue Zhihong Wang
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 141+ messages in thread
From: Zhihong Wang @ 2016-08-30  3:35 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, stable, Zhihong Wang

This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost code,
which causes the guest to hang once any packets are enqueued when mrg_rxbuf
is turned on.

How to test?

 1. Start testpmd in the host with a vhost port.

 2. Start a Windows VM image with qemu and connect to the vhost port.

 3. Start io forwarding with tx_first in host testpmd.

For 16.07 code, the Windows VM will hang once any packets are enqueued.

Cc: <stable@dpdk.org>
Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..5806f99 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint16_t start_idx = vq->last_used_idx;
 	uint16_t cur_idx = start_idx;
 	uint64_t desc_addr;
+	uint32_t desc_chain_head;
+	uint32_t desc_chain_len;
 	uint32_t mbuf_offset, mbuf_avail;
 	uint32_t desc_offset, desc_avail;
 	uint32_t cpy_len;
@@ -412,6 +414,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
 	desc_offset = dev->vhost_hlen;
+	desc_chain_head = buf_vec[vec_idx].desc_idx;
+	desc_chain_len = desc_offset;
 
 	mbuf_avail  = rte_pktmbuf_data_len(m);
 	mbuf_offset = 0;
@@ -419,19 +423,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		/* done with current desc buf, get the next one */
 		if (desc_avail == 0) {
 			desc_idx = buf_vec[vec_idx].desc_idx;
+			vec_idx++;
 
 			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
 				/* Update used ring with desc information */
 				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id  = desc_idx;
-				vq->used->ring[used_idx].len = desc_offset;
+				vq->used->ring[used_idx].id = desc_chain_head;
+				vq->used->ring[used_idx].len = desc_chain_len;
 				vhost_log_used_vring(dev, vq,
 					offsetof(struct vring_used,
 						 ring[used_idx]),
 					sizeof(vq->used->ring[used_idx]));
+				desc_chain_head = buf_vec[vec_idx].desc_idx;
+				desc_chain_len = 0;
 			}
 
-			vec_idx++;
 			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
 			if (unlikely(!desc_addr))
 				return 0;
@@ -463,11 +469,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		mbuf_offset += cpy_len;
 		desc_avail  -= cpy_len;
 		desc_offset += cpy_len;
+		desc_chain_len += cpy_len;
 	}
 
 	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-	vq->used->ring[used_idx].len = desc_offset;
+	vq->used->ring[used_idx].id = desc_chain_head;
+	vq->used->ring[used_idx].len = desc_chain_len;
 	vhost_log_used_vring(dev, vq,
 		offsetof(struct vring_used, ring[used_idx]),
 		sizeof(vq->used->ring[used_idx]));
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v4 2/6] vhost: rewrite enqueue
  2016-08-30  3:35 ` [PATCH v4 0/6] " Zhihong Wang
  2016-08-30  3:35   ` [PATCH v4 1/6] vhost: fix windows vm hang Zhihong Wang
@ 2016-08-30  3:36   ` Zhihong Wang
  2016-09-05  6:39     ` Yuanhan Liu
  2016-08-30  3:36   ` [PATCH v4 3/6] vhost: remove useless volatile Zhihong Wang
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 141+ messages in thread
From: Zhihong Wang @ 2016-08-30  3:36 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.

---
Changes in v4:

 1. Refactor the code for clearer logic.

 2. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Rewrite enqueue and delete the obsolete in the same patch.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 525 ++++++++++++------------------------------
 1 file changed, 145 insertions(+), 380 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 5806f99..629e8ae 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
 	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }
 
-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
 	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 						cksum));
 			break;
 		}
+	} else {
+		net_hdr->flags = 0;
+		net_hdr->csum_start = 0;
+		net_hdr->csum_offset = 0;
 	}
 
 	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,437 +126,198 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 		net_hdr->gso_size = m_buf->tso_segsz;
 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
 					+ m_buf->l4_len;
+	} else {
+		net_hdr->gso_type = 0;
+		net_hdr->hdr_len = 0;
+		net_hdr->gso_size = 0;
 	}
 }
 
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-		    struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t desc_chain_head, uint32_t desc_chain_len)
 {
-	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-	else
-		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+	uint32_t used_idx_round = vq->last_used_idx & (vq->size - 1);
+
+	vq->used->ring[used_idx_round].id = desc_chain_head;
+	vq->used->ring[used_idx_round].len = desc_chain_len;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+				ring[used_idx_round]),
+			sizeof(vq->used->ring[used_idx_round]));
 }
 
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx)
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint16_t avail_idx, struct rte_mbuf *mbuf,
+		uint32_t is_mrg_rxbuf)
 {
-	uint32_t desc_avail, desc_offset;
-	uint32_t mbuf_avail, mbuf_offset;
-	uint32_t cpy_len;
+	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
 	struct vring_desc *desc;
 	uint64_t desc_addr;
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
-	desc = &vq->desc[desc_idx];
+	uint32_t desc_chain_head;
+	uint32_t desc_chain_len;
+	uint32_t desc_current;
+	uint32_t desc_offset;
+	uint32_t mbuf_len;
+	uint32_t mbuf_avail;
+	uint32_t copy_len;
+	uint32_t extra_buffers = 0;
+
+	/* start with the first mbuf of the packet */
+	mbuf_len = rte_pktmbuf_data_len(mbuf);
+	mbuf_avail = mbuf_len;
+
+	/* get the current desc */
+	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+	desc_chain_head = desc_current;
+	desc = &vq->desc[desc_current];
 	desc_addr = gpa_to_vva(dev, desc->addr);
-	/*
-	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-	 * otherwise stores offset on the stack instead of in a register.
-	 */
-	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-		return -1;
+	if (unlikely(!desc_addr))
+		goto error;
 
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
+	/* handle virtio header */
+	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+	if (is_mrg_rxbuf)
+		virtio_hdr->num_buffers = extra_buffers + 1;
 
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
 	desc_offset = dev->vhost_hlen;
-	desc_avail  = desc->len - dev->vhost_hlen;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current mbuf, fetch next */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
+	desc_chain_len = desc_offset;
+	desc_addr += desc_offset;
+
+	/* start copy from mbuf to desc */
+	while (mbuf_avail || mbuf->next) {
+		/* get the next mbuf if the current done */
+		if (!mbuf_avail) {
+			mbuf = mbuf->next;
+			mbuf_len = rte_pktmbuf_data_len(mbuf);
+			mbuf_avail = mbuf_len;
 		}
 
-		/* done with current desc buf, fetch next */
-		if (desc_avail == 0) {
-			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-				/* Room in vring buffer is not enough */
-				return -1;
-			}
-			if (unlikely(desc->next >= vq->size))
-				return -1;
+		/* get the next desc if the current done */
+		if (desc->len <= desc_offset) {
+			if (desc->flags & VRING_DESC_F_NEXT) {
+				/* go on with the current desc chain */
+				desc_offset = 0;
+				desc_current = desc->next;
+				desc = &vq->desc[desc_current];
+				desc_addr = gpa_to_vva(dev, desc->addr);
+				if (unlikely(!desc_addr))
+					goto error;
+			} else if (is_mrg_rxbuf) {
+				/* start with the next desc chain */
+				update_used_ring(dev, vq, desc_chain_head,
+						desc_chain_len);
+				vq->last_used_idx++;
+				extra_buffers++;
+				virtio_hdr->num_buffers++;
+				if (avail_idx == vq->last_used_idx)
+					goto error;
+
+				desc_current =
+					vq->avail->ring[(vq->last_used_idx) &
+					(vq->size - 1)];
+				desc_chain_head = desc_current;
+				desc = &vq->desc[desc_current];
+				desc_addr = gpa_to_vva(dev, desc->addr);
+				if (unlikely(!desc_addr))
+					goto error;
 
-			desc = &vq->desc[desc->next];
-			desc_addr = gpa_to_vva(dev, desc->addr);
-			if (unlikely(!desc_addr))
-				return -1;
-
-			desc_offset = 0;
-			desc_avail  = desc->len;
+				desc_chain_len = 0;
+				desc_offset = 0;
+			} else
+				goto error;
 		}
 
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			     cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
+		/* copy mbuf data */
+		copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+		rte_memcpy((void *)(uintptr_t)desc_addr,
+				rte_pktmbuf_mtod_offset(mbuf, void *,
+					mbuf_len - mbuf_avail),
+				copy_len);
+		vhost_log_write(dev, desc->addr + desc_offset, copy_len);
+		PRINT_PACKET(dev, (uintptr_t)desc_addr, copy_len, 0);
+		mbuf_avail -= copy_len;
+		desc_offset += copy_len;
+		desc_addr += copy_len;
+		desc_chain_len += copy_len;
 	}
 
-	return 0;
-}
+	update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
+	vq->last_used_idx++;
 
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
-	      struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint16_t avail_idx, free_entries, start_idx;
-	uint16_t desc_indexes[MAX_PKT_BURST];
-	uint16_t used_idx;
-	uint32_t i;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
-		return 0;
-	}
-
-	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
-		return 0;
-
-	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-	start_idx = vq->last_used_idx;
-	free_entries = avail_idx - start_idx;
-	count = RTE_MIN(count, free_entries);
-	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
-	if (count == 0)
-		return 0;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
-		dev->vid, start_idx, start_idx + count);
-
-	/* Retrieve all of the desc indexes first to avoid caching issues. */
-	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
-	for (i = 0; i < count; i++) {
-		used_idx = (start_idx + i) & (vq->size - 1);
-		desc_indexes[i] = vq->avail->ring[used_idx];
-		vq->used->ring[used_idx].id = desc_indexes[i];
-		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
-					       dev->vhost_hlen;
-		vhost_log_used_vring(dev, vq,
-			offsetof(struct vring_used, ring[used_idx]),
-			sizeof(vq->used->ring[used_idx]));
-	}
-
-	rte_prefetch0(&vq->desc[desc_indexes[0]]);
-	for (i = 0; i < count; i++) {
-		uint16_t desc_idx = desc_indexes[i];
-		int err;
+	return 0;
 
-		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
-		if (unlikely(err)) {
-			used_idx = (start_idx + i) & (vq->size - 1);
-			vq->used->ring[used_idx].len = dev->vhost_hlen;
-			vhost_log_used_vring(dev, vq,
-				offsetof(struct vring_used, ring[used_idx]),
-				sizeof(vq->used->ring[used_idx]));
-		}
+error:
+	/* rollback on any error if last_used_idx update on-the-fly */
+	vq->last_used_idx -= extra_buffers;
 
-		if (i + 1 < count)
-			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
-	}
+	return 1;
+}
 
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
 	rte_smp_wmb();
-
-	*(volatile uint16_t *)&vq->used->idx += count;
-	vq->last_used_idx += count;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, idx),
-		sizeof(vq->used->idx));
-
-	/* flush used->idx update before we read avail->flags. */
+	vq->used->idx = vq->last_used_idx;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+			sizeof(vq->used->idx));
 	rte_mb();
-
-	/* Kick the guest if necessary. */
 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 			&& (vq->callfd >= 0))
 		eventfd_write(vq->callfd, (eventfd_t)1);
-	return count;
 }
 
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-	     uint32_t *allocated, uint32_t *vec_idx,
-	     struct buf_vector *buf_vec)
-{
-	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
-	uint32_t vec_id = *vec_idx;
-	uint32_t len    = *allocated;
-
-	while (1) {
-		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
-			return -1;
-
-		len += vq->desc[idx].len;
-		buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
-		buf_vec[vec_id].buf_len  = vq->desc[idx].len;
-		buf_vec[vec_id].desc_idx = idx;
-		vec_id++;
-
-		if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
-			break;
-
-		idx = vq->desc[idx].next;
-	}
-
-	*allocated = len;
-	*vec_idx   = vec_id;
-
-	return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-			    uint16_t *end, struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint16_t count)
 {
-	uint16_t cur_idx;
+	struct vhost_virtqueue *vq;
+	struct virtio_net *dev;
+	uint32_t pkt_left = count;
+	uint32_t pkt_idx = 0;
+	uint32_t pkt_sent = 0;
+	uint32_t is_mrg_rxbuf = 0;
 	uint16_t avail_idx;
-	uint32_t allocated = 0;
-	uint32_t vec_idx = 0;
-	uint16_t tries = 0;
 
-	cur_idx  = vq->last_used_idx;
-
-	while (1) {
-		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-		if (unlikely(cur_idx == avail_idx))
-			return -1;
-
-		if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
-					  &vec_idx, buf_vec) < 0))
-			return -1;
-
-		cur_idx++;
-		tries++;
-
-		if (allocated >= size)
-			break;
-
-		/*
-		 * if we tried all available ring items, and still
-		 * can't get enough buf, it means something abnormal
-		 * happened.
-		 */
-		if (unlikely(tries >= vq->size))
-			return -1;
-	}
-
-	*end = cur_idx;
-	return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			    uint16_t end_idx, struct rte_mbuf *m,
-			    struct buf_vector *buf_vec)
-{
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-	uint32_t vec_idx = 0;
-	uint16_t start_idx = vq->last_used_idx;
-	uint16_t cur_idx = start_idx;
-	uint64_t desc_addr;
-	uint32_t desc_chain_head;
-	uint32_t desc_chain_len;
-	uint32_t mbuf_offset, mbuf_avail;
-	uint32_t desc_offset, desc_avail;
-	uint32_t cpy_len;
-	uint16_t desc_idx, used_idx;
-
-	if (unlikely(m == NULL))
+	if (unlikely(!pkt_left))
 		return 0;
 
-	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
-		dev->vid, cur_idx, end_idx);
+	pkt_left = RTE_MIN((uint32_t)MAX_PKT_BURST, pkt_left);
 
-	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+	dev = get_device(vid);
+	if (unlikely(!dev))
 		return 0;
 
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-	virtio_hdr.num_buffers = end_idx - start_idx;
-	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-		dev->vid, virtio_hdr.num_buffers);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-	desc_offset = dev->vhost_hlen;
-	desc_chain_head = buf_vec[vec_idx].desc_idx;
-	desc_chain_len = desc_offset;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current desc buf, get the next one */
-		if (desc_avail == 0) {
-			desc_idx = buf_vec[vec_idx].desc_idx;
-			vec_idx++;
-
-			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
-				/* Update used ring with desc information */
-				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id = desc_chain_head;
-				vq->used->ring[used_idx].len = desc_chain_len;
-				vhost_log_used_vring(dev, vq,
-					offsetof(struct vring_used,
-						 ring[used_idx]),
-					sizeof(vq->used->ring[used_idx]));
-				desc_chain_head = buf_vec[vec_idx].desc_idx;
-				desc_chain_len = 0;
-			}
-
-			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-			if (unlikely(!desc_addr))
-				return 0;
-
-			/* Prefetch buffer address. */
-			rte_prefetch0((void *)(uintptr_t)desc_addr);
-			desc_offset = 0;
-			desc_avail  = buf_vec[vec_idx].buf_len;
-		}
-
-		/* done with current mbuf, get the next one */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
-		}
-
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
-			cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-		desc_chain_len += cpy_len;
-	}
-
-	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = desc_chain_head;
-	vq->used->ring[used_idx].len = desc_chain_len;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, ring[used_idx]),
-		sizeof(vq->used->ring[used_idx]));
-
-	return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint32_t pkt_idx = 0, nr_used = 0;
-	uint16_t end;
-	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
 		return 0;
-	}
 
 	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
+	if (unlikely(!vq->enabled))
 		return 0;
 
-	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
-	if (count == 0)
-		return 0;
-
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+	if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+		is_mrg_rxbuf = 1;
 
-		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
-							 &end, buf_vec) < 0)) {
-			LOG_DEBUG(VHOST_DATA,
-				"(%d) failed to get enough desc from vring\n",
-				dev->vid);
+	/* start enqueuing packets 1 by 1 */
+	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+	while (pkt_left && avail_idx != vq->last_used_idx) {
+		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+					is_mrg_rxbuf))
 			break;
-		}
-
-		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
-						      pkts[pkt_idx], buf_vec);
-		rte_smp_wmb();
-
-		*(volatile uint16_t *)&vq->used->idx += nr_used;
-		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-			sizeof(vq->used->idx));
-		vq->last_used_idx += nr_used;
-	}
-
-	if (likely(pkt_idx)) {
-		/* flush used->idx update before we read avail->flags. */
-		rte_mb();
 
-		/* Kick the guest if necessary. */
-		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-				&& (vq->callfd >= 0))
-			eventfd_write(vq->callfd, (eventfd_t)1);
+		pkt_idx++;
+		pkt_sent++;
+		pkt_left--;
 	}
 
-	return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint16_t count)
-{
-	struct virtio_net *dev = get_device(vid);
-
-	if (!dev)
-		return 0;
+	/* update used idx and kick the guest if necessary */
+	if (pkt_sent)
+		notify_guest(dev, vq);
 
-	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
-		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
-	else
-		return virtio_dev_rx(dev, queue_id, pkts, count);
+	return pkt_sent;
 }
 
 static void
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v4 3/6] vhost: remove useless volatile
  2016-08-30  3:35 ` [PATCH v4 0/6] " Zhihong Wang
  2016-08-30  3:35   ` [PATCH v4 1/6] vhost: fix windows vm hang Zhihong Wang
  2016-08-30  3:36   ` [PATCH v4 2/6] vhost: rewrite enqueue Zhihong Wang
@ 2016-08-30  3:36   ` Zhihong Wang
  2016-08-30  3:36   ` [PATCH v4 4/6] vhost: add desc prefetch Zhihong Wang
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-30  3:36 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch removes the useless volatile attribute to allow compiler
optimization.
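
For illustration only (a toy sketch, not DPDK code and not part of the patch):
with a volatile field the compiler must reload the value from memory on every
access, while a plain field can stay in a register across a loop such as the
enqueue loop.

#include <stdint.h>

/* hypothetical toy struct, standing in for vhost_virtqueue */
struct toy_vq {
    uint16_t last_used_idx;    /* was: volatile uint16_t */
};

static int
drain(struct toy_vq *vq, uint16_t avail_idx)
{
    int n = 0;

    /* without volatile, last_used_idx may be kept in a register for
     * the whole loop instead of being reloaded on every iteration
     */
    while (vq->last_used_idx != avail_idx) {
        vq->last_used_idx++;
        n++;
    }
    return n;
}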

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost-net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..51fdf3d 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
 	uint32_t		size;
 
 	/* Last index used on the available ring */
-	volatile uint16_t	last_used_idx;
+	uint16_t		last_used_idx;
 #define VIRTIO_INVALID_EVENTFD		(-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD	(-2)
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v4 4/6] vhost: add desc prefetch
  2016-08-30  3:35 ` [PATCH v4 0/6] " Zhihong Wang
                     ` (2 preceding siblings ...)
  2016-08-30  3:36   ` [PATCH v4 3/6] vhost: remove useless volatile Zhihong Wang
@ 2016-08-30  3:36   ` Zhihong Wang
  2016-08-30  3:36   ` [PATCH v4 5/6] vhost: batch update used ring Zhihong Wang
  2016-08-30  3:36   ` [PATCH v4 6/6] vhost: optimize cache access Zhihong Wang
  5 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-30  3:36 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch adds descriptor prefetch to hide cache access latency.
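
Roughly, the pattern is to issue a prefetch for descriptor i + 1 while
descriptor i is being processed, so its cache line is (hopefully) already
resident by the next iteration. A minimal sketch, where process_one() is a
made-up placeholder and the vq/desc types and rte_prefetch0() come from the
surrounding vhost code:

static void
process_all(struct vhost_virtqueue *vq, const uint16_t *idx, uint32_t count)
{
    uint32_t i;

    for (i = 0; i < count; i++) {
        /* warm the cache line of the descriptor needed next */
        if (i + 1 < count)
            rte_prefetch0(&vq->desc[idx[i + 1]]);

        process_one(vq, idx[i]);    /* hypothetical per-desc work */
    }
}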

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 629e8ae..927896c 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -304,6 +304,12 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 	/* start enqueuing packets 1 by 1 */
 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
 	while (pkt_left && avail_idx != vq->last_used_idx) {
+		/* prefetch the next desc */
+		if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+			rte_prefetch0(&vq->desc[vq->avail->ring[
+					(vq->last_used_idx + 1) &
+					(vq->size - 1)]]);
+
 		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
 					is_mrg_rxbuf))
 			break;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v4 5/6] vhost: batch update used ring
  2016-08-30  3:35 ` [PATCH v4 0/6] " Zhihong Wang
                     ` (3 preceding siblings ...)
  2016-08-30  3:36   ` [PATCH v4 4/6] vhost: add desc prefetch Zhihong Wang
@ 2016-08-30  3:36   ` Zhihong Wang
  2016-08-30  3:36   ` [PATCH v4 6/6] vhost: optimize cache access Zhihong Wang
  5 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-30  3:36 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch enables batch update of the used ring for better efficiency.
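
The idea, sketched in simplified form below (helper names shadow_one() and
flush_shadow() are made up, the ring wrap-around handled by the real patch is
ignored, and the shadow_used_ring/shadow_used_idx fields added by this patch
are assumed): used-ring entries are staged in a private array while the burst
is processed, then written to the guest-visible ring in one copy at the end.

/* stage one entry per completed descriptor chain */
static inline void
shadow_one(struct vhost_virtqueue *vq, uint32_t head, uint32_t len)
{
    vq->shadow_used_ring[vq->shadow_used_idx].id  = head;
    vq->shadow_used_ring[vq->shadow_used_idx].len = len;
    vq->shadow_used_idx++;
}

/* flush the whole batch once per burst (no wrap handling here) */
static inline void
flush_shadow(struct virtio_net *dev, struct vhost_virtqueue *vq,
        uint32_t used_idx_start)
{
    rte_memcpy(&vq->used->ring[used_idx_start],
            vq->shadow_used_ring,
            vq->shadow_used_idx * sizeof(struct vring_used_elem));
    vhost_log_used_vring(dev, vq,
            offsetof(struct vring_used, ring[used_idx_start]),
            vq->shadow_used_idx * sizeof(struct vring_used_elem));
}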

---
Changes in v4:

 1. Free shadow used ring in the right place.

 2. Add failure check for shadow used ring malloc.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost-net.h  |  4 +++
 lib/librte_vhost/vhost_rxtx.c | 62 ++++++++++++++++++++++++++++++++++++-------
 lib/librte_vhost/virtio-net.c | 42 ++++++++++++++++++++++++++---
 3 files changed, 95 insertions(+), 13 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 51fdf3d..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {
 
 	/* Physical address of used ring, for logging */
 	uint64_t		log_guest_addr;
+
+	/* Shadow used ring for performance */
+	struct vring_used_elem	*shadow_used_ring;
+	uint32_t		shadow_used_idx;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 927896c..ddc7b21 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -134,16 +134,51 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 }
 
 static inline void __attribute__((always_inline))
-update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		uint32_t desc_chain_head, uint32_t desc_chain_len)
+update_used_ring(struct vhost_virtqueue *vq, uint32_t desc_chain_head,
+		uint32_t desc_chain_len)
 {
-	uint32_t used_idx_round = vq->last_used_idx & (vq->size - 1);
+	vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+	vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+	vq->shadow_used_idx++;
+}
+
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t used_idx_start)
+{
+	if (used_idx_start + vq->shadow_used_idx < vq->size) {
+		rte_memcpy(&vq->used->ring[used_idx_start],
+				&vq->shadow_used_ring[0],
+				vq->shadow_used_idx *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[used_idx_start]),
+				vq->shadow_used_idx *
+				sizeof(struct vring_used_elem));
+	} else {
+		uint32_t part_1 = vq->size - used_idx_start;
+		uint32_t part_2 = vq->shadow_used_idx - part_1;
 
-	vq->used->ring[used_idx_round].id = desc_chain_head;
-	vq->used->ring[used_idx_round].len = desc_chain_len;
-	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
-				ring[used_idx_round]),
-			sizeof(vq->used->ring[used_idx_round]));
+		rte_memcpy(&vq->used->ring[used_idx_start],
+				&vq->shadow_used_ring[0],
+				part_1 *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[used_idx_start]),
+				part_1 *
+				sizeof(struct vring_used_elem));
+		rte_memcpy(&vq->used->ring[0],
+				&vq->shadow_used_ring[part_1],
+				part_2 *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[0]),
+				part_2 *
+				sizeof(struct vring_used_elem));
+	}
 }
 
 static inline uint32_t __attribute__((always_inline))
@@ -208,7 +243,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 					goto error;
 			} else if (is_mrg_rxbuf) {
 				/* start with the next desc chain */
-				update_used_ring(dev, vq, desc_chain_head,
+				update_used_ring(vq, desc_chain_head,
 						desc_chain_len);
 				vq->last_used_idx++;
 				extra_buffers++;
@@ -245,7 +280,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		desc_chain_len += copy_len;
 	}
 
-	update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
+	update_used_ring(vq, desc_chain_head, desc_chain_len);
 	vq->last_used_idx++;
 
 	return 0;
@@ -276,6 +311,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 {
 	struct vhost_virtqueue *vq;
 	struct virtio_net *dev;
+	uint32_t used_idx_start;
 	uint32_t pkt_left = count;
 	uint32_t pkt_idx = 0;
 	uint32_t pkt_sent = 0;
@@ -302,6 +338,8 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 		is_mrg_rxbuf = 1;
 
 	/* start enqueuing packets 1 by 1 */
+	vq->shadow_used_idx = 0;
+	used_idx_start = vq->last_used_idx & (vq->size - 1);
 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
 	while (pkt_left && avail_idx != vq->last_used_idx) {
 		/* prefetch the next desc */
@@ -319,6 +357,10 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 		pkt_left--;
 	}
 
+	/* batch update used ring for better performance */
+	if (likely(vq->shadow_used_idx > 0))
+		flush_used_ring(dev, vq, used_idx_start);
+
 	/* update used idx and kick the guest if necessary */
 	if (pkt_sent)
 		notify_guest(dev, vq);
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1785695..7416079 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -152,10 +152,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 free_device(struct virtio_net *dev)
 {
+	struct vhost_virtqueue *vq_0;
+	struct vhost_virtqueue *vq_1;
 	uint32_t i;
 
-	for (i = 0; i < dev->virt_qp_nb; i++)
-		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+	for (i = 0; i < dev->virt_qp_nb; i++) {
+		vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+		if (vq_0->shadow_used_ring) {
+			rte_free(vq_0->shadow_used_ring);
+			vq_0->shadow_used_ring = NULL;
+		}
+
+		vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+		if (vq_1->shadow_used_ring) {
+			rte_free(vq_1->shadow_used_ring);
+			vq_1->shadow_used_ring = NULL;
+		}
+
+		/* malloc together, free together */
+		rte_free(vq_0);
+	}
 
 	rte_free(dev);
 }
@@ -418,13 +434,26 @@ int
 vhost_set_vring_num(int vid, struct vhost_vring_state *state)
 {
 	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
 
 	dev = get_device(vid);
 	if (dev == NULL)
 		return -1;
 
 	/* State->index refers to the queue index. The txq is 1, rxq is 0. */
-	dev->virtqueue[state->index]->size = state->num;
+	vq = dev->virtqueue[state->index];
+	vq->size = state->num;
+	if (!vq->shadow_used_ring) {
+		vq->shadow_used_ring = rte_malloc(NULL,
+				vq->size * sizeof(struct vring_used_elem),
+				RTE_CACHE_LINE_SIZE);
+		if (!vq->shadow_used_ring) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"Failed to allocate memory"
+				" for shadow used ring.\n");
+			return -1;
+		}
+	}
 
 	return 0;
 }
@@ -610,6 +639,7 @@ int
 vhost_get_vring_base(int vid, uint32_t index,
 	struct vhost_vring_state *state)
 {
+	struct vhost_virtqueue *vq;
 	struct virtio_net *dev;
 
 	dev = get_device(vid);
@@ -617,6 +647,12 @@ vhost_get_vring_base(int vid, uint32_t index,
 		return -1;
 
 	state->index = index;
+	vq = dev->virtqueue[state->index];
+	if (vq->shadow_used_ring) {
+		rte_free(vq->shadow_used_ring);
+		vq->shadow_used_ring = NULL;
+	}
+
 	/* State->index refers to the queue index. The txq is 1, rxq is 0. */
 	state->num = dev->virtqueue[state->index]->last_used_idx;
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v4 6/6] vhost: optimize cache access
  2016-08-30  3:35 ` [PATCH v4 0/6] " Zhihong Wang
                     ` (4 preceding siblings ...)
  2016-08-30  3:36   ` [PATCH v4 5/6] vhost: batch update used ring Zhihong Wang
@ 2016-08-30  3:36   ` Zhihong Wang
  5 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-08-30  3:36 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch reorders the code to delay the virtio header write, to optimize
cache access efficiency for cases where the mrg_rxbuf feature is turned on.
It reduces CPU pipeline stall cycles significantly.
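
In rough terms (toy illustration only, with made-up types and sizes, not the
patch code itself): instead of storing the virtio header as soon as the
descriptor address is known, the write is remembered and performed together
with the first payload copy, so the header area is touched while that part of
the descriptor buffer is being written anyway.

#include <stdint.h>
#include <string.h>

struct toy_hdr {
    uint16_t num_buffers;
};

static void
copy_with_lazy_hdr(struct toy_hdr *hdr, uint8_t *dst,
        const uint8_t *src, size_t len, uint16_t num_buffers)
{
    int hdr_pending = 1;
    size_t off = 0;

    while (off < len) {
        size_t chunk = (len - off > 64) ? 64 : len - off;

        if (hdr_pending) {
            hdr_pending = 0;
            hdr->num_buffers = num_buffers;    /* delayed write */
        }
        memcpy(dst + off, src + off, chunk);
        off += chunk;
    }
}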

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index ddc7b21..fc5dc4a 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -196,6 +196,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t mbuf_len;
 	uint32_t mbuf_avail;
 	uint32_t copy_len;
+	uint32_t copy_virtio_hdr;
 	uint32_t extra_buffers = 0;
 
 	/* start with the first mbuf of the packet */
@@ -210,12 +211,12 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (unlikely(!desc_addr))
 		goto error;
 
-	/* handle virtio header */
+	/*
+	 * handle virtio header, the actual write operation is delayed
+	 * for cache optimization, to reduce CPU pipeline stall cycles.
+	 */
 	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
-	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
-	if (is_mrg_rxbuf)
-		virtio_hdr->num_buffers = extra_buffers + 1;
-
+	copy_virtio_hdr = 1;
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
 	desc_offset = dev->vhost_hlen;
@@ -266,8 +267,15 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				goto error;
 		}
 
-		/* copy mbuf data */
+		/* copy virtio header and mbuf data */
 		copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+		if (copy_virtio_hdr) {
+			copy_virtio_hdr = 0;
+			virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+			if (is_mrg_rxbuf)
+				virtio_hdr->num_buffers = extra_buffers + 1;
+		}
+
 		rte_memcpy((void *)(uintptr_t)desc_addr,
 				rte_pktmbuf_mtod_offset(mbuf, void *,
 					mbuf_len - mbuf_avail),
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* Re: [dpdk-stable] [PATCH v4 1/6] vhost: fix windows vm hang
  2016-08-30  3:35   ` [PATCH v4 1/6] vhost: fix windows vm hang Zhihong Wang
@ 2016-09-05  5:24     ` Yuanhan Liu
  2016-09-05  5:25       ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-09-05  5:24 UTC (permalink / raw)
  To: Zhihong Wang; +Cc: dev, maxime.coquelin, yuanhan.liu, thomas.monjalon, stable

On Mon, Aug 29, 2016 at 11:35:59PM -0400, Zhihong Wang wrote:
> This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost code,
> which causes the guest to hang once any packets are enqueued when mrg_rxbuf
> is turned on.

This commit log lacks two important pieces: why the hang happens and
how your patch fixes it.

> How to test?
> 
>  1. Start testpmd in the host with a vhost port.
> 
>  2. Start a Windows VM image with qemu and connect to the vhost port.
> 
>  3. Start io forwarding with tx_first in host testpmd.
> 
> For 16.07 code, the Windows VM will hang once any packets are enqueued.
> 
> Cc: <stable@dpdk.org>
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---
>  lib/librte_vhost/vhost_rxtx.c | 17 ++++++++++++-----
>  1 file changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 08a73fd..5806f99 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	uint16_t start_idx = vq->last_used_idx;
>  	uint16_t cur_idx = start_idx;
>  	uint64_t desc_addr;
> +	uint32_t desc_chain_head;
> +	uint32_t desc_chain_len;

What's the point of introducing "desc_chain_len"? It has the same value
as desc_offset.

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [dpdk-stable] [PATCH v4 1/6] vhost: fix windows vm hang
  2016-09-05  5:24     ` [dpdk-stable] " Yuanhan Liu
@ 2016-09-05  5:25       ` Wang, Zhihong
  2016-09-05  5:40         ` Yuanhan Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-05  5:25 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: dev, maxime.coquelin, thomas.monjalon, stable



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Monday, September 5, 2016 1:25 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org; maxime.coquelin@redhat.com;
> yuanhan.liu@linux.intel.com; thomas.monjalon@6wind.com;
> stable@dpdk.org
> Subject: Re: [dpdk-stable] [PATCH v4 1/6] vhost: fix windows vm hang
> 
> On Mon, Aug 29, 2016 at 11:35:59PM -0400, Zhihong Wang wrote:
> > This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost
> code,
> > which causes the guest to hang once any packets are enqueued when
> mrg_rxbuf
> > is turned on.
> 
> This commit log lacks two important pieces: why the hang happens and
> how your patch fixes it.

Okay, I'll add it in v5.

> 
> > How to test?
> >
> >  1. Start testpmd in the host with a vhost port.
> >
> >  2. Start a Windows VM image with qemu and connect to the vhost port.
> >
> >  3. Start io forwarding with tx_first in host testpmd.
> >
> > For 16.07 code, the Windows VM will hang once any packets are enqueued.
> >
> > Cc: <stable@dpdk.org>
> > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > ---
> >  lib/librte_vhost/vhost_rxtx.c | 17 ++++++++++++-----
> >  1 file changed, 12 insertions(+), 5 deletions(-)
> >
> > diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> > index 08a73fd..5806f99 100644
> > --- a/lib/librte_vhost/vhost_rxtx.c
> > +++ b/lib/librte_vhost/vhost_rxtx.c
> > @@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net
> *dev, struct vhost_virtqueue *vq,
> >  	uint16_t start_idx = vq->last_used_idx;
> >  	uint16_t cur_idx = start_idx;
> >  	uint64_t desc_addr;
> > +	uint32_t desc_chain_head;
> > +	uint32_t desc_chain_len;
> 
> What's the point of introducing "desc_chain_len"? It has the same value
> as desc_offset.

No it's not: desc_offset is the offset within the current desc only.
That's where the old code goes wrong.

If you take a look at the virtio spec:

/* le32 is used here for ids for padding reasons. */
struct vring_used_elem {
/* Index of start of used descriptor chain. */
le32 id;
/* Total length of the descriptor chain which was written to. */
le32 len;
};
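
To make the difference concrete, a small worked example with made-up sizes
(vhost_hlen = 12, a 128-byte packet, a two-descriptor chain of 100 + 60 bytes):

uint32_t desc_chain_len = 0;

desc_chain_len += 12;    /* virtio header written into desc[0]      */
desc_chain_len += 88;    /* packet data filling the rest of desc[0] */
desc_chain_len += 40;    /* packet tail written into desc[1]        */

/* used->ring[].len must report all 140 bytes written to the chain;
 * desc_offset, by contrast, ends at 40 inside the last descriptor.
 */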

> 
> 	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [dpdk-stable] [PATCH v4 1/6] vhost: fix windows vm hang
  2016-09-05  5:25       ` Wang, Zhihong
@ 2016-09-05  5:40         ` Yuanhan Liu
  0 siblings, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-09-05  5:40 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Yuanhan Liu, dev, maxime.coquelin, thomas.monjalon, stable

On Mon, Sep 05, 2016 at 05:25:31AM +0000, Wang, Zhihong wrote:
> 
> 
> > -----Original Message-----
> > From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> > Sent: Monday, September 5, 2016 1:25 PM
> > To: Wang, Zhihong <zhihong.wang@intel.com>
> > Cc: dev@dpdk.org; maxime.coquelin@redhat.com;
> > yuanhan.liu@linux.intel.com; thomas.monjalon@6wind.com;
> > stable@dpdk.org
> > Subject: Re: [dpdk-stable] [PATCH v4 1/6] vhost: fix windows vm hang
> > 
> > On Mon, Aug 29, 2016 at 11:35:59PM -0400, Zhihong Wang wrote:
> > > This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost
> > code,
> > > which causes the guest to hang once any packets are enqueued when
> > mrg_rxbuf
> > > is turned on.
> > 
> > This commit log lacks two important pieces: why the hang happens and
> > how your patch fixes it.
> 
> Okay, I'll add it in v5.
> 
> > 
> > > How to test?
> > >
> > >  1. Start testpmd in the host with a vhost port.
> > >
> > >  2. Start a Windows VM image with qemu and connect to the vhost port.
> > >
> > >  3. Start io forwarding with tx_first in host testpmd.
> > >
> > > For 16.07 code, the Windows VM will hang once any packets are enqueued.
> > >
> > > Cc: <stable@dpdk.org>
> > > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > > ---
> > >  lib/librte_vhost/vhost_rxtx.c | 17 ++++++++++++-----
> > >  1 file changed, 12 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> > > index 08a73fd..5806f99 100644
> > > --- a/lib/librte_vhost/vhost_rxtx.c
> > > +++ b/lib/librte_vhost/vhost_rxtx.c
> > > @@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net
> > *dev, struct vhost_virtqueue *vq,
> > >  	uint16_t start_idx = vq->last_used_idx;
> > >  	uint16_t cur_idx = start_idx;
> > >  	uint64_t desc_addr;
> > > +	uint32_t desc_chain_head;
> > > +	uint32_t desc_chain_len;
> > 
> > What's the point of introducing "desc_chain_len"? It has the same value
> > as desc_offset.
> 
> No it's not: desc_offset is the offset within the current desc only.
> That's where the old code goes wrong.

Oh, right.

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v4 2/6] vhost: rewrite enqueue
  2016-08-30  3:36   ` [PATCH v4 2/6] vhost: rewrite enqueue Zhihong Wang
@ 2016-09-05  6:39     ` Yuanhan Liu
  2016-09-07  5:33       ` Yuanhan Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-09-05  6:39 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: dev, maxime.coquelin, thomas.monjalon

On Mon, Aug 29, 2016 at 11:36:00PM -0400, Zhihong Wang wrote:
> This patch implements the vhost logic from scratch into a single function
> designed for high performance and better maintainability.
> 
> This is the baseline version of the new code, more optimization will be
> added in the following patches in this patch set.
> 
> ---
> Changes in v4:
> 
>  1. Refactor the code for clearer logic.
> 
>  2. Add PRINT_PACKET for debugging.
> 
> ---
> Changes in v3:
> 
>  1. Rewrite enqueue and delete the obsolete in the same patch.

Change log should go ----> 

> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---

... here, after the SoB.

>  lib/librte_vhost/vhost_rxtx.c | 525 ++++++++++++------------------------------
>  1 file changed, 145 insertions(+), 380 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 5806f99..629e8ae 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
>  	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
>  }
>  
> -static void
> +static inline void __attribute__((always_inline))
>  virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  {
>  	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
> @@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  						cksum));
>  			break;
>  		}
> +	} else {
> +		net_hdr->flags = 0;
> +		net_hdr->csum_start = 0;
> +		net_hdr->csum_offset = 0;
>  	}
>  
>  	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
> @@ -122,437 +126,198 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  		net_hdr->gso_size = m_buf->tso_segsz;
>  		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
>  					+ m_buf->l4_len;
> +	} else {
> +		net_hdr->gso_type = 0;
> +		net_hdr->hdr_len = 0;
> +		net_hdr->gso_size = 0;
>  	}
>  }
>  
> -static inline void
> -copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
> -		    struct virtio_net_hdr_mrg_rxbuf hdr)
> +static inline void __attribute__((always_inline))
> +update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		uint32_t desc_chain_head, uint32_t desc_chain_len)
>  {
> -	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
> -		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
> -	else
> -		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
> +	uint32_t used_idx_round = vq->last_used_idx & (vq->size - 1);

I'd suggest using "used_idx" instead of "used_idx_round".

> +
> +	vq->used->ring[used_idx_round].id = desc_chain_head;
> +	vq->used->ring[used_idx_round].len = desc_chain_len;
> +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
> +				ring[used_idx_round]),
> +			sizeof(vq->used->ring[used_idx_round]));
>  }
>  
> -static inline int __attribute__((always_inline))
> -copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -		  struct rte_mbuf *m, uint16_t desc_idx)
> +static inline uint32_t __attribute__((always_inline))
> +enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		uint16_t avail_idx, struct rte_mbuf *mbuf,
> +		uint32_t is_mrg_rxbuf)
>  {
> -	uint32_t desc_avail, desc_offset;
> -	uint32_t mbuf_avail, mbuf_offset;
> -	uint32_t cpy_len;
> +	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
>  	struct vring_desc *desc;
>  	uint64_t desc_addr;
> -	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
> -
> -	desc = &vq->desc[desc_idx];
> +	uint32_t desc_chain_head;
> +	uint32_t desc_chain_len;
> +	uint32_t desc_current;
> +	uint32_t desc_offset;
> +	uint32_t mbuf_len;
> +	uint32_t mbuf_avail;
> +	uint32_t copy_len;
> +	uint32_t extra_buffers = 0;

I'd name it "num_buffers", to keep consistent with the virtio hdr
naming style.

> +
> +	/* start with the first mbuf of the packet */
> +	mbuf_len = rte_pktmbuf_data_len(mbuf);
> +	mbuf_avail = mbuf_len;
> +
> +	/* get the current desc */
> +	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
> +	desc_chain_head = desc_current;
> +	desc = &vq->desc[desc_current];
>  	desc_addr = gpa_to_vva(dev, desc->addr);
> -	/*
> -	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
> -	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
> -	 * otherwise stores offset on the stack instead of in a register.
> -	 */
> -	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
> -		return -1;
> +	if (unlikely(!desc_addr))
> +		goto error;
>  
> -	rte_prefetch0((void *)(uintptr_t)desc_addr);
> +	/* handle virtio header */
> +	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
> +	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
> +	if (is_mrg_rxbuf)
> +		virtio_hdr->num_buffers = extra_buffers + 1;
>  
> -	virtio_enqueue_offload(m, &virtio_hdr.hdr);
> -	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
>  	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
>  	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
> -
>  	desc_offset = dev->vhost_hlen;
> -	desc_avail  = desc->len - dev->vhost_hlen;
> -
> -	mbuf_avail  = rte_pktmbuf_data_len(m);
> -	mbuf_offset = 0;
> -	while (mbuf_avail != 0 || m->next != NULL) {
> -		/* done with current mbuf, fetch next */
> -		if (mbuf_avail == 0) {
> -			m = m->next;
> -
> -			mbuf_offset = 0;
> -			mbuf_avail  = rte_pktmbuf_data_len(m);
> +	desc_chain_len = desc_offset;
> +	desc_addr += desc_offset;
> +
> +	/* start copy from mbuf to desc */
> +	while (mbuf_avail || mbuf->next) {
> +		/* get the next mbuf if the current done */
> +		if (!mbuf_avail) {
> +			mbuf = mbuf->next;
> +			mbuf_len = rte_pktmbuf_data_len(mbuf);
> +			mbuf_avail = mbuf_len;
>  		}
>  
> -		/* done with current desc buf, fetch next */
> -		if (desc_avail == 0) {
> -			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
> -				/* Room in vring buffer is not enough */
> -				return -1;
> -			}
> -			if (unlikely(desc->next >= vq->size))
> -				return -1;
> +		/* get the next desc if the current done */
> +		if (desc->len <= desc_offset) {
> +			if (desc->flags & VRING_DESC_F_NEXT) {
> +				/* go on with the current desc chain */
> +				desc_offset = 0;
> +				desc_current = desc->next;
> +				desc = &vq->desc[desc_current];
> +				desc_addr = gpa_to_vva(dev, desc->addr);
> +				if (unlikely(!desc_addr))
> +					goto error;
> +			} else if (is_mrg_rxbuf) {
> +				/* start with the next desc chain */
> +				update_used_ring(dev, vq, desc_chain_head,
> +						desc_chain_len);
> +				vq->last_used_idx++;

Why not put "vq->last_used_idx++" into update_used_ring()?

> +				extra_buffers++;
> +				virtio_hdr->num_buffers++;
> +				if (avail_idx == vq->last_used_idx)
> +					goto error;
> +
> +				desc_current =
> +					vq->avail->ring[(vq->last_used_idx) &
> +					(vq->size - 1)];
> +				desc_chain_head = desc_current;
> +				desc = &vq->desc[desc_current];
> +				desc_addr = gpa_to_vva(dev, desc->addr);
> +				if (unlikely(!desc_addr))
> +					goto error;
>  
> -			desc = &vq->desc[desc->next];
> -			desc_addr = gpa_to_vva(dev, desc->addr);
> -			if (unlikely(!desc_addr))
> -				return -1;
> -
> -			desc_offset = 0;
> -			desc_avail  = desc->len;
> +				desc_chain_len = 0;
> +				desc_offset = 0;
> +			} else
> +				goto error;
>  		}
>  
> -		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
> -		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
> -			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
> -			cpy_len);
> -		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
> -		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
> -			     cpy_len, 0);
> -
> -		mbuf_avail  -= cpy_len;
> -		mbuf_offset += cpy_len;
> -		desc_avail  -= cpy_len;
> -		desc_offset += cpy_len;
> +		/* copy mbuf data */
> +		copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);

TBH, I'm okay with copy_len (actually, I slightly prefer it). However,
the old code uses cpy_len and the current dequeue function also uses cpy_len,
so I see no good reason to use copy_len here. It's really not a good idea
to use two different naming styles in one source file.


> +		rte_memcpy((void *)(uintptr_t)desc_addr,
> +				rte_pktmbuf_mtod_offset(mbuf, void *,
> +					mbuf_len - mbuf_avail),

I would keep the old var "mbuf_offset" and not introduce "mbuf_len".
This would avoid the above calculation and make it straightforward.

> +				copy_len);
> +		vhost_log_write(dev, desc->addr + desc_offset, copy_len);
> +		PRINT_PACKET(dev, (uintptr_t)desc_addr, copy_len, 0);
> +		mbuf_avail -= copy_len;
> +		desc_offset += copy_len;
> +		desc_addr += copy_len;
> +		desc_chain_len += copy_len;

Vertical alignment[0] is not a must, but as you can see, it's a style I
prefer. So, if possible, please follow it.

[0]: https://en.wikipedia.org/wiki/Programming_style#Vertical_alignment

>  	}
>  
> -	return 0;
> -}
> +	update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
> +	vq->last_used_idx++;
>  
...
> -	rte_prefetch0(&vq->desc[desc_indexes[0]]);
> -	for (i = 0; i < count; i++) {
> -		uint16_t desc_idx = desc_indexes[i];
> -		int err;
> +	return 0;
>  
> -		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
> -		if (unlikely(err)) {
> -			used_idx = (start_idx + i) & (vq->size - 1);
> -			vq->used->ring[used_idx].len = dev->vhost_hlen;
> -			vhost_log_used_vring(dev, vq,
> -				offsetof(struct vring_used, ring[used_idx]),
> -				sizeof(vq->used->ring[used_idx]));
> -		}
> +error:
> +	/* rollback on any error if last_used_idx update on-the-fly */
> +	vq->last_used_idx -= extra_buffers;
>  
> -		if (i + 1 < count)
> -			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
> -	}
> +	return 1;

We normally return -1 (not 1) on error.

>  
> +static inline void __attribute__((always_inline))
> +notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
> +{
>  	rte_smp_wmb();
> -
> -	*(volatile uint16_t *)&vq->used->idx += count;
> -	vq->last_used_idx += count;
> -	vhost_log_used_vring(dev, vq,
> -		offsetof(struct vring_used, idx),
> -		sizeof(vq->used->idx));
> -
> -	/* flush used->idx update before we read avail->flags. */
> +	vq->used->idx = vq->last_used_idx;

I will not drop the "volatile" cast here, sliently. You know this kind of
stuff is tricky and would be painful to debug if it cause any issue. Such
removal deserves a patch, as well as some explanations.

> +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> +			sizeof(vq->used->idx));
>  	rte_mb();
> -
> -	/* Kick the guest if necessary. */
>  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>  			&& (vq->callfd >= 0))
>  		eventfd_write(vq->callfd, (eventfd_t)1);
> -	return count;
>  }
>  
...
> +	/* start enqueuing packets 1 by 1 */
> +	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> +	while (pkt_left && avail_idx != vq->last_used_idx) {
> +		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
> +					is_mrg_rxbuf))
>  			break;
> -		}
> -
> -		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
> -						      pkts[pkt_idx], buf_vec);
> -		rte_smp_wmb();
> -
> -		*(volatile uint16_t *)&vq->used->idx += nr_used;
> -		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> -			sizeof(vq->used->idx));
> -		vq->last_used_idx += nr_used;
> -	}
> -
> -	if (likely(pkt_idx)) {
> -		/* flush used->idx update before we read avail->flags. */
> -		rte_mb();
>  
> -		/* Kick the guest if necessary. */
> -		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
> -				&& (vq->callfd >= 0))
> -			eventfd_write(vq->callfd, (eventfd_t)1);
> +		pkt_idx++;
> +		pkt_sent++;

pkt_idx and pkt_sent are duplicates here.

	--yliu

> +		pkt_left--;
>  	}
>  
> -	return pkt_idx;
> -}
> -
> -uint16_t
> -rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> -	struct rte_mbuf **pkts, uint16_t count)
> -{
> -	struct virtio_net *dev = get_device(vid);
> -
> -	if (!dev)
> -		return 0;
> +	/* update used idx and kick the guest if necessary */
> +	if (pkt_sent)
> +		notify_guest(dev, vq);
>  
> -	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> -		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
> -	else
> -		return virtio_dev_rx(dev, queue_id, pkts, count);
> +	return pkt_sent;
>  }
>  
>  static void
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v4 2/6] vhost: rewrite enqueue
  2016-09-05  6:39     ` Yuanhan Liu
@ 2016-09-07  5:33       ` Yuanhan Liu
  2016-09-07  5:39         ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-09-07  5:33 UTC (permalink / raw)
  To: Zhihong Wang; +Cc: dev, maxime.coquelin, thomas.monjalon

Hmmm, yet another email didn't send out successfully. Resend.

BTW, please work out v5 on top of the latest next-virtio tree.

Thanks.

	--yliu

On Mon, Sep 05, 2016 at 02:39:25PM +0800, Yuanhan Liu wrote:
----
On Mon, Aug 29, 2016 at 11:36:00PM -0400, Zhihong Wang wrote:
> This patch implements the vhost logic from scratch into a single function
> designed for high performance and better maintainability.
> 
> This is the baseline version of the new code, more optimization will be
> added in the following patches in this patch set.
> 
> ---
> Changes in v4:
> 
>  1. Refactor the code for clearer logic.
> 
>  2. Add PRINT_PACKET for debugging.
> 
> ---
> Changes in v3:
> 
>  1. Rewrite enqueue and delete the obsolete in the same patch.

Change log should go ----> 

> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---

... here, after the SoB.

>  lib/librte_vhost/vhost_rxtx.c | 525 ++++++++++++------------------------------
>  1 file changed, 145 insertions(+), 380 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 5806f99..629e8ae 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
>  	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
>  }
>  
> -static void
> +static inline void __attribute__((always_inline))
>  virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  {
>  	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
> @@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  						cksum));
>  			break;
>  		}
> +	} else {
> +		net_hdr->flags = 0;
> +		net_hdr->csum_start = 0;
> +		net_hdr->csum_offset = 0;
>  	}
>  
>  	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
> @@ -122,437 +126,198 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  		net_hdr->gso_size = m_buf->tso_segsz;
>  		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
>  					+ m_buf->l4_len;
> +	} else {
> +		net_hdr->gso_type = 0;
> +		net_hdr->hdr_len = 0;
> +		net_hdr->gso_size = 0;
>  	}
>  }
>  
> -static inline void
> -copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
> -		    struct virtio_net_hdr_mrg_rxbuf hdr)
> +static inline void __attribute__((always_inline))
> +update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		uint32_t desc_chain_head, uint32_t desc_chain_len)
>  {
> -	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
> -		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
> -	else
> -		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
> +	uint32_t used_idx_round = vq->last_used_idx & (vq->size - 1);

I'd suggest using "used_idx" instead of "used_idx_round".

> +
> +	vq->used->ring[used_idx_round].id = desc_chain_head;
> +	vq->used->ring[used_idx_round].len = desc_chain_len;
> +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
> +				ring[used_idx_round]),
> +			sizeof(vq->used->ring[used_idx_round]));
>  }
>  
> -static inline int __attribute__((always_inline))
> -copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -		  struct rte_mbuf *m, uint16_t desc_idx)
> +static inline uint32_t __attribute__((always_inline))
> +enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		uint16_t avail_idx, struct rte_mbuf *mbuf,
> +		uint32_t is_mrg_rxbuf)
>  {
> -	uint32_t desc_avail, desc_offset;
> -	uint32_t mbuf_avail, mbuf_offset;
> -	uint32_t cpy_len;
> +	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
>  	struct vring_desc *desc;
>  	uint64_t desc_addr;
> -	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
> -
> -	desc = &vq->desc[desc_idx];
> +	uint32_t desc_chain_head;
> +	uint32_t desc_chain_len;
> +	uint32_t desc_current;
> +	uint32_t desc_offset;
> +	uint32_t mbuf_len;
> +	uint32_t mbuf_avail;
> +	uint32_t copy_len;
> +	uint32_t extra_buffers = 0;

I'd name it "num_buffers", to keep consistent with the virtio hdr
naming style.

> +
> +	/* start with the first mbuf of the packet */
> +	mbuf_len = rte_pktmbuf_data_len(mbuf);
> +	mbuf_avail = mbuf_len;
> +
> +	/* get the current desc */
> +	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
> +	desc_chain_head = desc_current;
> +	desc = &vq->desc[desc_current];
>  	desc_addr = gpa_to_vva(dev, desc->addr);
> -	/*
> -	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
> -	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
> -	 * otherwise stores offset on the stack instead of in a register.
> -	 */
> -	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
> -		return -1;
> +	if (unlikely(!desc_addr))
> +		goto error;
>  
> -	rte_prefetch0((void *)(uintptr_t)desc_addr);
> +	/* handle virtio header */
> +	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
> +	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
> +	if (is_mrg_rxbuf)
> +		virtio_hdr->num_buffers = extra_buffers + 1;
>  
> -	virtio_enqueue_offload(m, &virtio_hdr.hdr);
> -	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
>  	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
>  	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
> -
>  	desc_offset = dev->vhost_hlen;
> -	desc_avail  = desc->len - dev->vhost_hlen;
> -
> -	mbuf_avail  = rte_pktmbuf_data_len(m);
> -	mbuf_offset = 0;
> -	while (mbuf_avail != 0 || m->next != NULL) {
> -		/* done with current mbuf, fetch next */
> -		if (mbuf_avail == 0) {
> -			m = m->next;
> -
> -			mbuf_offset = 0;
> -			mbuf_avail  = rte_pktmbuf_data_len(m);
> +	desc_chain_len = desc_offset;
> +	desc_addr += desc_offset;
> +
> +	/* start copy from mbuf to desc */
> +	while (mbuf_avail || mbuf->next) {
> +		/* get the next mbuf if the current done */
> +		if (!mbuf_avail) {
> +			mbuf = mbuf->next;
> +			mbuf_len = rte_pktmbuf_data_len(mbuf);
> +			mbuf_avail = mbuf_len;
>  		}
>  
> -		/* done with current desc buf, fetch next */
> -		if (desc_avail == 0) {
> -			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
> -				/* Room in vring buffer is not enough */
> -				return -1;
> -			}
> -			if (unlikely(desc->next >= vq->size))
> -				return -1;
> +		/* get the next desc if the current done */
> +		if (desc->len <= desc_offset) {
> +			if (desc->flags & VRING_DESC_F_NEXT) {
> +				/* go on with the current desc chain */
> +				desc_offset = 0;
> +				desc_current = desc->next;
> +				desc = &vq->desc[desc_current];
> +				desc_addr = gpa_to_vva(dev, desc->addr);
> +				if (unlikely(!desc_addr))
> +					goto error;
> +			} else if (is_mrg_rxbuf) {
> +				/* start with the next desc chain */
> +				update_used_ring(dev, vq, desc_chain_head,
> +						desc_chain_len);
> +				vq->last_used_idx++;

Why not put "vq->last_used_idx++" into update_used_ring()?

> +				extra_buffers++;
> +				virtio_hdr->num_buffers++;
> +				if (avail_idx == vq->last_used_idx)
> +					goto error;
> +
> +				desc_current =
> +					vq->avail->ring[(vq->last_used_idx) &
> +					(vq->size - 1)];
> +				desc_chain_head = desc_current;
> +				desc = &vq->desc[desc_current];
> +				desc_addr = gpa_to_vva(dev, desc->addr);
> +				if (unlikely(!desc_addr))
> +					goto error;
>  
> -			desc = &vq->desc[desc->next];
> -			desc_addr = gpa_to_vva(dev, desc->addr);
> -			if (unlikely(!desc_addr))
> -				return -1;
> -
> -			desc_offset = 0;
> -			desc_avail  = desc->len;
> +				desc_chain_len = 0;
> +				desc_offset = 0;
> +			} else
> +				goto error;
>  		}
>  
> -		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
> -		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
> -			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
> -			cpy_len);
> -		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
> -		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
> -			     cpy_len, 0);
> -
> -		mbuf_avail  -= cpy_len;
> -		mbuf_offset += cpy_len;
> -		desc_avail  -= cpy_len;
> -		desc_offset += cpy_len;
> +		/* copy mbuf data */
> +		copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);

TBH, I'm okay with copy_len (actually, I slightly prefer it). However,
the old code uses cpy_len and the current dequeue function also uses cpy_len,
so I see no good reason to use copy_len here. It's really not a good idea
to use two different naming styles in one source file.


> +		rte_memcpy((void *)(uintptr_t)desc_addr,
> +				rte_pktmbuf_mtod_offset(mbuf, void *,
> +					mbuf_len - mbuf_avail),

I would keep the old var "mbuf_offset" and not introduce "mbuf_len".
This would avoid the above calculation and make it straightforward.

> +				copy_len);
> +		vhost_log_write(dev, desc->addr + desc_offset, copy_len);
> +		PRINT_PACKET(dev, (uintptr_t)desc_addr, copy_len, 0);
> +		mbuf_avail -= copy_len;
> +		desc_offset += copy_len;
> +		desc_addr += copy_len;
> +		desc_chain_len += copy_len;

Vertical alignment[0] is not a must, but as you can see, it's a style I
prefer. So, if possible, please follow it.

[0]: https://en.wikipedia.org/wiki/Programming_style#Vertical_alignment

>  	}
>  
> -	return 0;
> -}
> +	update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
> +	vq->last_used_idx++;
>  
...
> -	rte_prefetch0(&vq->desc[desc_indexes[0]]);
> -	for (i = 0; i < count; i++) {
> -		uint16_t desc_idx = desc_indexes[i];
> -		int err;
> +	return 0;
>  
> -		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
> -		if (unlikely(err)) {
> -			used_idx = (start_idx + i) & (vq->size - 1);
> -			vq->used->ring[used_idx].len = dev->vhost_hlen;
> -			vhost_log_used_vring(dev, vq,
> -				offsetof(struct vring_used, ring[used_idx]),
> -				sizeof(vq->used->ring[used_idx]));
> -		}
> +error:
> +	/* rollback on any error if last_used_idx update on-the-fly */
> +	vq->last_used_idx -= extra_buffers;
>  
> -		if (i + 1 < count)
> -			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
> -	}
> +	return 1;

We normally return -1 (not 1) on error.

>  
> +static inline void __attribute__((always_inline))
> +notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
> +{
>  	rte_smp_wmb();
> -
> -	*(volatile uint16_t *)&vq->used->idx += count;
> -	vq->last_used_idx += count;
> -	vhost_log_used_vring(dev, vq,
> -		offsetof(struct vring_used, idx),
> -		sizeof(vq->used->idx));
> -
> -	/* flush used->idx update before we read avail->flags. */
> +	vq->used->idx = vq->last_used_idx;

I will not drop the "volatile" cast here, sliently. You know this kind of
stuff is tricky and would be painful to debug if it cause any issue. Such
removal deserves a patch, as well as some explanations.

> +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> +			sizeof(vq->used->idx));
>  	rte_mb();
> -
> -	/* Kick the guest if necessary. */
>  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>  			&& (vq->callfd >= 0))
>  		eventfd_write(vq->callfd, (eventfd_t)1);
> -	return count;
>  }
>  
...
> +	/* start enqueuing packets 1 by 1 */
> +	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> +	while (pkt_left && avail_idx != vq->last_used_idx) {
> +		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
> +					is_mrg_rxbuf))
>  			break;
> -		}
> -
> -		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
> -						      pkts[pkt_idx], buf_vec);
> -		rte_smp_wmb();
> -
> -		*(volatile uint16_t *)&vq->used->idx += nr_used;
> -		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> -			sizeof(vq->used->idx));
> -		vq->last_used_idx += nr_used;
> -	}
> -
> -	if (likely(pkt_idx)) {
> -		/* flush used->idx update before we read avail->flags. */
> -		rte_mb();
>  
> -		/* Kick the guest if necessary. */
> -		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
> -				&& (vq->callfd >= 0))
> -			eventfd_write(vq->callfd, (eventfd_t)1);
> +		pkt_idx++;
> +		pkt_sent++;

pkt_idx and pkt_sent are duplicates here.

	--yliu

> +		pkt_left--;
>  	}
>  
> -	return pkt_idx;
> -}
> -
> -uint16_t
> -rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> -	struct rte_mbuf **pkts, uint16_t count)
> -{
> -	struct virtio_net *dev = get_device(vid);
> -
> -	if (!dev)
> -		return 0;
> +	/* update used idx and kick the guest if necessary */
> +	if (pkt_sent)
> +		notify_guest(dev, vq);
>  
> -	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> -		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
> -	else
> -		return virtio_dev_rx(dev, queue_id, pkts, count);
> +	return pkt_sent;
>  }
>  
>  static void
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v4 2/6] vhost: rewrite enqueue
  2016-09-07  5:33       ` Yuanhan Liu
@ 2016-09-07  5:39         ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-07  5:39 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: dev, maxime.coquelin, thomas.monjalon



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Wednesday, September 7, 2016 1:33 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org; maxime.coquelin@redhat.com;
> thomas.monjalon@6wind.com
> Subject: Re: [PATCH v4 2/6] vhost: rewrite enqueue
> 
> Hmmm, yet another email didn't send out successfully. Resend.
> 
> BTW, please work out v5 on top of the latest next-virtio tree.
> 
> Thanks.

Okay. Thanks.

> 
> 	--yliu
> 
> On Mon, Sep 05, 2016 at 02:39:25PM +0800, Yuanhan Liu wrote:
> ----
> On Mon, Aug 29, 2016 at 11:36:00PM -0400, Zhihong Wang wrote:
> > This patch implements the vhost logic from scratch into a single function
> > designed for high performance and better maintainability.
> >
> > This is the baseline version of the new code, more optimization will be
> > added in the following patches in this patch set.
> >
> > ---
> > Changes in v4:
> >
> >  1. Refactor the code for clearer logic.
> >
> >  2. Add PRINT_PACKET for debugging.
> >
> > ---
> > Changes in v3:
> >
> >  1. Rewrite enqueue and delete the obsolete in the same patch.
> 
> Change log should go ---->
> 
> > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > ---
> 
> ... here, after the SoB.
> 
> >  lib/librte_vhost/vhost_rxtx.c | 525 ++++++++++++-----------------------------
> -
> >  1 file changed, 145 insertions(+), 380 deletions(-)
> >
> > diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> > index 5806f99..629e8ae 100644
> > --- a/lib/librte_vhost/vhost_rxtx.c
> > +++ b/lib/librte_vhost/vhost_rxtx.c
> > @@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx,
> uint32_t qp_nb)
> >  	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
> >  }
> >
> > -static void
> > +static inline void __attribute__((always_inline))
> >  virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr
> *net_hdr)
> >  {
> >  	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
> > @@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf,
> struct virtio_net_hdr *net_hdr)
> >  						cksum));
> >  			break;
> >  		}
> > +	} else {
> > +		net_hdr->flags = 0;
> > +		net_hdr->csum_start = 0;
> > +		net_hdr->csum_offset = 0;
> >  	}
> >
> >  	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
> > @@ -122,437 +126,198 @@ virtio_enqueue_offload(struct rte_mbuf
> *m_buf, struct virtio_net_hdr *net_hdr)
> >  		net_hdr->gso_size = m_buf->tso_segsz;
> >  		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
> >  					+ m_buf->l4_len;
> > +	} else {
> > +		net_hdr->gso_type = 0;
> > +		net_hdr->hdr_len = 0;
> > +		net_hdr->gso_size = 0;
> >  	}
> >  }
> >
> > -static inline void
> > -copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
> > -		    struct virtio_net_hdr_mrg_rxbuf hdr)
> > +static inline void __attribute__((always_inline))
> > +update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > +		uint32_t desc_chain_head, uint32_t desc_chain_len)
> >  {
> > -	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
> > -		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr =
> hdr;
> > -	else
> > -		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
> > +	uint32_t used_idx_round = vq->last_used_idx & (vq->size - 1);
> 
> I'd suggest using "used_idx" instead of "used_idx_round".
> 
> > +
> > +	vq->used->ring[used_idx_round].id = desc_chain_head;
> > +	vq->used->ring[used_idx_round].len = desc_chain_len;
> > +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
> > +				ring[used_idx_round]),
> > +			sizeof(vq->used->ring[used_idx_round]));
> >  }
> >
> > -static inline int __attribute__((always_inline))
> > -copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > -		  struct rte_mbuf *m, uint16_t desc_idx)
> > +static inline uint32_t __attribute__((always_inline))
> > +enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > +		uint16_t avail_idx, struct rte_mbuf *mbuf,
> > +		uint32_t is_mrg_rxbuf)
> >  {
> > -	uint32_t desc_avail, desc_offset;
> > -	uint32_t mbuf_avail, mbuf_offset;
> > -	uint32_t cpy_len;
> > +	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
> >  	struct vring_desc *desc;
> >  	uint64_t desc_addr;
> > -	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
> > -
> > -	desc = &vq->desc[desc_idx];
> > +	uint32_t desc_chain_head;
> > +	uint32_t desc_chain_len;
> > +	uint32_t desc_current;
> > +	uint32_t desc_offset;
> > +	uint32_t mbuf_len;
> > +	uint32_t mbuf_avail;
> > +	uint32_t copy_len;
> > +	uint32_t extra_buffers = 0;
> 
> I'd name it "num_buffers", to keep consistent with the virtio hdr
> naming style.
> 
> > +
> > +	/* start with the first mbuf of the packet */
> > +	mbuf_len = rte_pktmbuf_data_len(mbuf);
> > +	mbuf_avail = mbuf_len;
> > +
> > +	/* get the current desc */
> > +	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
> > +	desc_chain_head = desc_current;
> > +	desc = &vq->desc[desc_current];
> >  	desc_addr = gpa_to_vva(dev, desc->addr);
> > -	/*
> > -	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
> > -	 * performance issue with some versions of gcc (4.8.4 and 5.3.0)
> which
> > -	 * otherwise stores offset on the stack instead of in a register.
> > -	 */
> > -	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
> > -		return -1;
> > +	if (unlikely(!desc_addr))
> > +		goto error;
> >
> > -	rte_prefetch0((void *)(uintptr_t)desc_addr);
> > +	/* handle virtio header */
> > +	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf
> *)(uintptr_t)desc_addr;
> > +	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
> > +	if (is_mrg_rxbuf)
> > +		virtio_hdr->num_buffers = extra_buffers + 1;
> >
> > -	virtio_enqueue_offload(m, &virtio_hdr.hdr);
> > -	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
> >  	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
> >  	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
> > -
> >  	desc_offset = dev->vhost_hlen;
> > -	desc_avail  = desc->len - dev->vhost_hlen;
> > -
> > -	mbuf_avail  = rte_pktmbuf_data_len(m);
> > -	mbuf_offset = 0;
> > -	while (mbuf_avail != 0 || m->next != NULL) {
> > -		/* done with current mbuf, fetch next */
> > -		if (mbuf_avail == 0) {
> > -			m = m->next;
> > -
> > -			mbuf_offset = 0;
> > -			mbuf_avail  = rte_pktmbuf_data_len(m);
> > +	desc_chain_len = desc_offset;
> > +	desc_addr += desc_offset;
> > +
> > +	/* start copy from mbuf to desc */
> > +	while (mbuf_avail || mbuf->next) {
> > +		/* get the next mbuf if the current done */
> > +		if (!mbuf_avail) {
> > +			mbuf = mbuf->next;
> > +			mbuf_len = rte_pktmbuf_data_len(mbuf);
> > +			mbuf_avail = mbuf_len;
> >  		}
> >
> > -		/* done with current desc buf, fetch next */
> > -		if (desc_avail == 0) {
> > -			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
> > -				/* Room in vring buffer is not enough */
> > -				return -1;
> > -			}
> > -			if (unlikely(desc->next >= vq->size))
> > -				return -1;
> > +		/* get the next desc if the current done */
> > +		if (desc->len <= desc_offset) {
> > +			if (desc->flags & VRING_DESC_F_NEXT) {
> > +				/* go on with the current desc chain */
> > +				desc_offset = 0;
> > +				desc_current = desc->next;
> > +				desc = &vq->desc[desc_current];
> > +				desc_addr = gpa_to_vva(dev, desc->addr);
> > +				if (unlikely(!desc_addr))
> > +					goto error;
> > +			} else if (is_mrg_rxbuf) {
> > +				/* start with the next desc chain */
> > +				update_used_ring(dev, vq,
> desc_chain_head,
> > +						desc_chain_len);
> > +				vq->last_used_idx++;
> 
> Why not put "vq->last_used_idx++" into update_used_ring()?
> 
> > +				extra_buffers++;
> > +				virtio_hdr->num_buffers++;
> > +				if (avail_idx == vq->last_used_idx)
> > +					goto error;
> > +
> > +				desc_current =
> > +					vq->avail->ring[(vq->last_used_idx) &
> > +					(vq->size - 1)];
> > +				desc_chain_head = desc_current;
> > +				desc = &vq->desc[desc_current];
> > +				desc_addr = gpa_to_vva(dev, desc->addr);
> > +				if (unlikely(!desc_addr))
> > +					goto error;
> >
> > -			desc = &vq->desc[desc->next];
> > -			desc_addr = gpa_to_vva(dev, desc->addr);
> > -			if (unlikely(!desc_addr))
> > -				return -1;
> > -
> > -			desc_offset = 0;
> > -			desc_avail  = desc->len;
> > +				desc_chain_len = 0;
> > +				desc_offset = 0;
> > +			} else
> > +				goto error;
> >  		}
> >
> > -		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
> > -		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
> > -			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
> > -			cpy_len);
> > -		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
> > -		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
> > -			     cpy_len, 0);
> > -
> > -		mbuf_avail  -= cpy_len;
> > -		mbuf_offset += cpy_len;
> > -		desc_avail  -= cpy_len;
> > -		desc_offset += cpy_len;
> > +		/* copy mbuf data */
> > +		copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
> 
> TBH, I'm okay with copy_len (actually, I prefer it slightly). However,
> the old code uses cpy_len and the current dequeue function also uses
> cpy_len, so I see no good reason to use copy_len here. It's really not a
> good idea to use two different naming styles in one source file.
> 
> 
> > +		rte_memcpy((void *)(uintptr_t)desc_addr,
> > +				rte_pktmbuf_mtod_offset(mbuf, void *,
> > +					mbuf_len - mbuf_avail),
> 
> I would keep the old var "mbuf_offset" and not introduce "mbuf_len".
> This would avoid the above calculation and make it straightforward.
> 
> > +				copy_len);
> > +		vhost_log_write(dev, desc->addr + desc_offset, copy_len);
> > +		PRINT_PACKET(dev, (uintptr_t)desc_addr, copy_len, 0);
> > +		mbuf_avail -= copy_len;
> > +		desc_offset += copy_len;
> > +		desc_addr += copy_len;
> > +		desc_chain_len += copy_len;
> 
> Vertical alignment[0] is not a must, but as you can see, it's a style I
> prefer. Meaning: if possible, please follow it.
> 
> [0]: https://en.wikipedia.org/wiki/Programming_style#Vertical_alignment
> 
> >  	}
> >
> > -	return 0;
> > -}
> > +	update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
> > +	vq->last_used_idx++;
> >
> ...
> > -	rte_prefetch0(&vq->desc[desc_indexes[0]]);
> > -	for (i = 0; i < count; i++) {
> > -		uint16_t desc_idx = desc_indexes[i];
> > -		int err;
> > +	return 0;
> >
> > -		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
> > -		if (unlikely(err)) {
> > -			used_idx = (start_idx + i) & (vq->size - 1);
> > -			vq->used->ring[used_idx].len = dev->vhost_hlen;
> > -			vhost_log_used_vring(dev, vq,
> > -				offsetof(struct vring_used, ring[used_idx]),
> > -				sizeof(vq->used->ring[used_idx]));
> > -		}
> > +error:
> > +	/* rollback on any error if last_used_idx update on-the-fly */
> > +	vq->last_used_idx -= extra_buffers;
> >
> > -		if (i + 1 < count)
> > -			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
> > -	}
> > +	return 1;
> 
> We normally return -1 (not 1) on error.
> 
> >
> > +static inline void __attribute__((always_inline))
> > +notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
> > +{
> >  	rte_smp_wmb();
> > -
> > -	*(volatile uint16_t *)&vq->used->idx += count;
> > -	vq->last_used_idx += count;
> > -	vhost_log_used_vring(dev, vq,
> > -		offsetof(struct vring_used, idx),
> > -		sizeof(vq->used->idx));
> > -
> > -	/* flush used->idx update before we read avail->flags. */
> > +	vq->used->idx = vq->last_used_idx;
> 
> I will not drop the "volatile" cast here silently. You know this kind of
> stuff is tricky and would be painful to debug if it causes any issue. Such
> a removal deserves its own patch, as well as some explanation.
> 
> > +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> > +			sizeof(vq->used->idx));
> >  	rte_mb();
> > -
> > -	/* Kick the guest if necessary. */
> >  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
> >  			&& (vq->callfd >= 0))
> >  		eventfd_write(vq->callfd, (eventfd_t)1);
> > -	return count;
> >  }
> >
> ...
> > +	/* start enqueuing packets 1 by 1 */
> > +	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> > +	while (pkt_left && avail_idx != vq->last_used_idx) {
> > +		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
> > +					is_mrg_rxbuf))
> >  			break;
> > -		}
> > -
> > -		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
> > -						      pkts[pkt_idx], buf_vec);
> > -		rte_smp_wmb();
> > -
> > -		*(volatile uint16_t *)&vq->used->idx += nr_used;
> > -		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> > -			sizeof(vq->used->idx));
> > -		vq->last_used_idx += nr_used;
> > -	}
> > -
> > -	if (likely(pkt_idx)) {
> > -		/* flush used->idx update before we read avail->flags. */
> > -		rte_mb();
> >
> > -		/* Kick the guest if necessary. */
> > -		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
> > -				&& (vq->callfd >= 0))
> > -			eventfd_write(vq->callfd, (eventfd_t)1);
> > +		pkt_idx++;
> > +		pkt_sent++;
> 
> pkt_idx and pkt_sent are duplicates here.
> 
> 	--yliu
> 
> > +		pkt_left--;
> >  	}
> >
> > -	return pkt_idx;
> > -}
> > -
> > -uint16_t
> > -rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> > -	struct rte_mbuf **pkts, uint16_t count)
> > -{
> > -	struct virtio_net *dev = get_device(vid);
> > -
> > -	if (!dev)
> > -		return 0;
> > +	/* update used idx and kick the guest if necessary */
> > +	if (pkt_sent)
> > +		notify_guest(dev, vq);
> >
> > -	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> > -		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
> > -	else
> > -		return virtio_dev_rx(dev, queue_id, pkts, count);
> > +	return pkt_sent;
> >  }
> >
> >  static void
> > --
> > 2.7.4

^ permalink raw reply	[flat|nested] 141+ messages in thread

* [PATCH v5 0/6] vhost: optimize enqueue
  2016-08-16  3:50 [PATCH] optimize vhost enqueue Zhihong Wang
                   ` (3 preceding siblings ...)
  2016-08-30  3:35 ` [PATCH v4 0/6] " Zhihong Wang
@ 2016-09-09  3:39 ` Zhihong Wang
  2016-09-09  3:39   ` [PATCH v5 1/6] vhost: fix windows vm hang Zhihong Wang
                     ` (6 more replies)
  2016-09-20  2:00 ` [PATCH v6 " Zhihong Wang
  5 siblings, 7 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-09-09  3:39 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon

This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

 *  Higher maximum throughput can be achieved for fast frontends like DPDK
    virtio pmd.

 *  Better scalability can be achieved in that each vhost core can support
    more connections, because it takes fewer cycles to handle each
    frontend.

This patch set contains:

 1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.

 2. A baseline patch to rewrite the vhost logic.

 3. A series of optimization patches added upon the baseline.

The main optimization techniques are:

 1. Reorder code to reduce CPU pipeline stall cycles.

 2. Batch update the used ring for better efficiency.

 3. Prefetch descriptor to hide cache latency.

 4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.
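
As a rough illustration of the batch used ring update (technique 2 above):
used ring entries are first staged in a host-local shadow array and then
written to the guest-visible used ring with at most two copies per burst.
The sketch below shows only the flush step, assumes the shadow_used_ring
and shadow_used_idx fields introduced later in this series, and omits dirty
page logging; patch 5/6 carries the actual implementation.

static inline void
flush_shadow_used_ring(struct vhost_virtqueue *vq, uint32_t used_idx_start)
{
	uint32_t count = vq->shadow_used_idx;

	if (used_idx_start + count <= vq->size) {
		/* no wrap: a single contiguous copy */
		rte_memcpy(&vq->used->ring[used_idx_start],
				vq->shadow_used_ring,
				count * sizeof(struct vring_used_elem));
	} else {
		/* wrap: copy up to the end of the ring, then the rest */
		uint32_t part_1 = vq->size - used_idx_start;

		rte_memcpy(&vq->used->ring[used_idx_start],
				vq->shadow_used_ring,
				part_1 * sizeof(struct vring_used_elem));
		rte_memcpy(&vq->used->ring[0],
				&vq->shadow_used_ring[part_1],
				(count - part_1) * sizeof(struct vring_used_elem));
	}
}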

In the existing code there're 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Besides, having 2 callback paths increases
maintenance efforts.

Also, there's a compatibility issue in the existing code which causes a
Windows VM to hang when the mrg_rxbuf feature is turned on.

---
Changes in v5:

 1. Rebase to the latest branch.

 2. Rename variables to keep consistent in naming style.

 3. Small changes like return value adjustment and vertical alignment.

 4. Add details in commit log.

---
Changes in v4:

 1. Fix a Windows VM compatibility issue.

 2. Free shadow used ring in the right place.

 3. Add failure check for shadow used ring malloc.

 4. Refactor the code for clearer logic.

 5. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 3. Rewrite enqueue and delete the obsolete code in the same patch.

---
Changes in v2:

 1. Split the big function into several small ones.

 2. Use multiple patches to explain each optimization.

 3. Add comments.

Zhihong Wang (6):
  vhost: fix windows vm hang
  vhost: rewrite enqueue
  vhost: remove useless volatile
  vhost: add desc prefetch
  vhost: batch update used ring
  vhost: optimize cache access

 lib/librte_vhost/vhost.c      |  20 +-
 lib/librte_vhost/vhost.h      |   6 +-
 lib/librte_vhost/vhost_user.c |  31 ++-
 lib/librte_vhost/virtio_net.c | 561 +++++++++++++++---------------------------
 4 files changed, 242 insertions(+), 376 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 141+ messages in thread

* [PATCH v5 1/6] vhost: fix windows vm hang
  2016-09-09  3:39 ` [PATCH v5 0/6] vhost: optimize enqueue Zhihong Wang
@ 2016-09-09  3:39   ` Zhihong Wang
  2016-09-09  3:39   ` [PATCH v5 2/6] vhost: rewrite enqueue Zhihong Wang
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-09-09  3:39 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, stable, Zhihong Wang

This patch fixes a Windows VM compatibility issue in the DPDK 16.07 vhost
code, which causes the guest to hang once any packets are enqueued when
mrg_rxbuf is turned on. The fix is to set the right id and len in the used
ring.

As defined in virtio spec 0.95 and 1.0, in each used ring element, id means
the index of the start of the used descriptor chain, and len means the total
length of the descriptor chain which was written to. In the 16.07 code,
however, the index of the last descriptor is assigned to id, and the length
of the last descriptor is assigned to len.
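
For illustration, a minimal sketch of the spec-compliant update follows,
assuming the vhost_virtqueue layout used in this library (the helper name
is made up for the example, and dirty page logging is omitted):

static inline void
used_ring_report_chain(struct vhost_virtqueue *vq,
		uint32_t desc_chain_head, uint32_t desc_chain_len)
{
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);

	/* id: index of the first desc of the chain, not the last one */
	vq->used->ring[used_idx].id  = desc_chain_head;
	/* len: total bytes written to the chain, not the last desc's len */
	vq->used->ring[used_idx].len = desc_chain_len;
	vq->last_used_idx++;
}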

How to test?

 1. Start testpmd in the host with a vhost port.

 2. Start a Windows VM image with qemu and connect to the vhost port.

 3. Start io forwarding with tx_first in host testpmd.

For 16.07 code, the Windows VM will hang once any packets are enqueued.

Cc: <stable@dpdk.org>
Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
Changes in v5:

 1. Add details in commit log.

 lib/librte_vhost/virtio_net.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..0d6e7d9 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint16_t start_idx = vq->last_used_idx;
 	uint16_t cur_idx = start_idx;
 	uint64_t desc_addr;
+	uint32_t desc_chain_head;
+	uint32_t desc_chain_len;
 	uint32_t mbuf_offset, mbuf_avail;
 	uint32_t desc_offset, desc_avail;
 	uint32_t cpy_len;
@@ -412,6 +414,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
 	desc_offset = dev->vhost_hlen;
+	desc_chain_head = buf_vec[vec_idx].desc_idx;
+	desc_chain_len = desc_offset;
 
 	mbuf_avail  = rte_pktmbuf_data_len(m);
 	mbuf_offset = 0;
@@ -419,19 +423,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		/* done with current desc buf, get the next one */
 		if (desc_avail == 0) {
 			desc_idx = buf_vec[vec_idx].desc_idx;
+			vec_idx++;
 
 			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
 				/* Update used ring with desc information */
 				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id  = desc_idx;
-				vq->used->ring[used_idx].len = desc_offset;
+				vq->used->ring[used_idx].id = desc_chain_head;
+				vq->used->ring[used_idx].len = desc_chain_len;
 				vhost_log_used_vring(dev, vq,
 					offsetof(struct vring_used,
 						 ring[used_idx]),
 					sizeof(vq->used->ring[used_idx]));
+				desc_chain_head = buf_vec[vec_idx].desc_idx;
+				desc_chain_len = 0;
 			}
 
-			vec_idx++;
 			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
 			if (unlikely(!desc_addr))
 				return 0;
@@ -463,11 +469,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		mbuf_offset += cpy_len;
 		desc_avail  -= cpy_len;
 		desc_offset += cpy_len;
+		desc_chain_len += cpy_len;
 	}
 
 	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-	vq->used->ring[used_idx].len = desc_offset;
+	vq->used->ring[used_idx].id = desc_chain_head;
+	vq->used->ring[used_idx].len = desc_chain_len;
 	vhost_log_used_vring(dev, vq,
 		offsetof(struct vring_used, ring[used_idx]),
 		sizeof(vq->used->ring[used_idx]));
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v5 2/6] vhost: rewrite enqueue
  2016-09-09  3:39 ` [PATCH v5 0/6] vhost: optimize enqueue Zhihong Wang
  2016-09-09  3:39   ` [PATCH v5 1/6] vhost: fix windows vm hang Zhihong Wang
@ 2016-09-09  3:39   ` Zhihong Wang
  2016-09-12 15:42     ` Maxime Coquelin
                       ` (2 more replies)
  2016-09-09  3:39   ` [PATCH v5 3/6] vhost: remove useless volatile Zhihong Wang
                     ` (4 subsequent siblings)
  6 siblings, 3 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-09-09  3:39 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
Changes in v5:

 1. Rebase to the latest branch.

 2. Rename variables to keep consistent in naming style.

 3. Small changes like return value adjustment and vertical alignment.

---
Changes in v4:

 1. Refactor the code for clearer logic.

 2. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Rewrite enqueue and delete the obsolete code in the same patch.

 lib/librte_vhost/virtio_net.c | 514 ++++++++++++------------------------------
 1 file changed, 138 insertions(+), 376 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0d6e7d9..6f63968 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
 	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }
 
-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
 	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 						cksum));
 			break;
 		}
+	} else {
+		net_hdr->flags       = 0;
+		net_hdr->csum_start  = 0;
+		net_hdr->csum_offset = 0;
 	}
 
 	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,439 +126,197 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 		net_hdr->gso_size = m_buf->tso_segsz;
 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
 					+ m_buf->l4_len;
+	} else {
+		net_hdr->gso_type = 0;
+		net_hdr->hdr_len  = 0;
+		net_hdr->gso_size = 0;
 	}
 }
 
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-		    struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t desc_chain_head, uint32_t desc_chain_len)
 {
-	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-	else
-		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+	vq->used->ring[used_idx].id = desc_chain_head;
+	vq->used->ring[used_idx].len = desc_chain_len;
+	vq->last_used_idx++;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+				ring[used_idx]),
+			sizeof(vq->used->ring[used_idx]));
 }
 
 static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx)
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint16_t avail_idx, struct rte_mbuf *mbuf,
+		uint32_t is_mrg_rxbuf)
 {
-	uint32_t desc_avail, desc_offset;
-	uint32_t mbuf_avail, mbuf_offset;
-	uint32_t cpy_len;
+	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
 	struct vring_desc *desc;
 	uint64_t desc_addr;
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+	uint32_t desc_chain_head;
+	uint32_t desc_chain_len;
+	uint32_t desc_current;
+	uint32_t desc_offset;
+	uint32_t mbuf_len;
+	uint32_t mbuf_avail;
+	uint32_t cpy_len;
+	uint32_t num_buffers = 0;
 
-	desc = &vq->desc[desc_idx];
+	/* start with the first mbuf of the packet */
+	mbuf_len = rte_pktmbuf_data_len(mbuf);
+	mbuf_avail = mbuf_len;
+
+	/* get the current desc */
+	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+	desc_chain_head = desc_current;
+	desc = &vq->desc[desc_current];
 	desc_addr = gpa_to_vva(dev, desc->addr);
-	/*
-	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-	 * otherwise stores offset on the stack instead of in a register.
-	 */
-	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-		return -1;
+	if (unlikely(!desc_addr))
+		goto error;
 
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
+	/* handle virtio header */
+	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+	if (is_mrg_rxbuf)
+		virtio_hdr->num_buffers = 1;
 
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
 	desc_offset = dev->vhost_hlen;
-	desc_avail  = desc->len - dev->vhost_hlen;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current mbuf, fetch next */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
+	desc_chain_len = desc_offset;
+	desc_addr += desc_offset;
+
+	/* start copy from mbuf to desc */
+	while (mbuf_avail || mbuf->next) {
+		/* get the next mbuf if the current done */
+		if (!mbuf_avail) {
+			mbuf = mbuf->next;
+			mbuf_len = rte_pktmbuf_data_len(mbuf);
+			mbuf_avail = mbuf_len;
 		}
 
-		/* done with current desc buf, fetch next */
-		if (desc_avail == 0) {
-			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-				/* Room in vring buffer is not enough */
-				return -1;
-			}
-			if (unlikely(desc->next >= vq->size))
-				return -1;
+		/* get the next desc if the current done */
+		if (desc->len <= desc_offset) {
+			if (desc->flags & VRING_DESC_F_NEXT) {
+				/* go on with the current desc chain */
+				desc_offset = 0;
+				desc_current = desc->next;
+				desc = &vq->desc[desc_current];
+				desc_addr = gpa_to_vva(dev, desc->addr);
+				if (unlikely(!desc_addr))
+					goto error;
+			} else if (is_mrg_rxbuf) {
+				/* start with the next desc chain */
+				update_used_ring(dev, vq, desc_chain_head,
+						desc_chain_len);
+				num_buffers++;
+				virtio_hdr->num_buffers++;
+				if (avail_idx == vq->last_used_idx)
+					goto error;
+
+				desc_current =
+					vq->avail->ring[(vq->last_used_idx) &
+					(vq->size - 1)];
+				desc_chain_head = desc_current;
+				desc = &vq->desc[desc_current];
+				desc_addr = gpa_to_vva(dev, desc->addr);
+				if (unlikely(!desc_addr))
+					goto error;
 
-			desc = &vq->desc[desc->next];
-			desc_addr = gpa_to_vva(dev, desc->addr);
-			if (unlikely(!desc_addr))
-				return -1;
-
-			desc_offset = 0;
-			desc_avail  = desc->len;
+				desc_chain_len = 0;
+				desc_offset = 0;
+			} else
+				goto error;
 		}
 
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
+		/* copy mbuf data */
+		cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+		rte_memcpy((void *)(uintptr_t)desc_addr,
+				rte_pktmbuf_mtod_offset(mbuf, void *,
+					mbuf_len - mbuf_avail),
+				cpy_len);
 		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			     cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-	}
-
-	return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
-	      struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint16_t avail_idx, free_entries, start_idx;
-	uint16_t desc_indexes[MAX_PKT_BURST];
-	uint16_t used_idx;
-	uint32_t i;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
-		return 0;
+		PRINT_PACKET(dev, (uintptr_t)desc_addr, cpy_len, 0);
+		mbuf_avail     -= cpy_len;
+		desc_addr      += cpy_len;
+		desc_offset    += cpy_len;
+		desc_chain_len += cpy_len;
 	}
 
-	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
-		return 0;
+	update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
 
-	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-	start_idx = vq->last_used_idx;
-	free_entries = avail_idx - start_idx;
-	count = RTE_MIN(count, free_entries);
-	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
-	if (count == 0)
-		return 0;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
-		dev->vid, start_idx, start_idx + count);
-
-	/* Retrieve all of the desc indexes first to avoid caching issues. */
-	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
-	for (i = 0; i < count; i++) {
-		used_idx = (start_idx + i) & (vq->size - 1);
-		desc_indexes[i] = vq->avail->ring[used_idx];
-		vq->used->ring[used_idx].id = desc_indexes[i];
-		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
-					       dev->vhost_hlen;
-		vhost_log_used_vring(dev, vq,
-			offsetof(struct vring_used, ring[used_idx]),
-			sizeof(vq->used->ring[used_idx]));
-	}
-
-	rte_prefetch0(&vq->desc[desc_indexes[0]]);
-	for (i = 0; i < count; i++) {
-		uint16_t desc_idx = desc_indexes[i];
-		int err;
+	return 0;
 
-		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
-		if (unlikely(err)) {
-			used_idx = (start_idx + i) & (vq->size - 1);
-			vq->used->ring[used_idx].len = dev->vhost_hlen;
-			vhost_log_used_vring(dev, vq,
-				offsetof(struct vring_used, ring[used_idx]),
-				sizeof(vq->used->ring[used_idx]));
-		}
+error:
+	/* rollback on any error if last_used_idx update on-the-fly */
+	vq->last_used_idx -= num_buffers;
 
-		if (i + 1 < count)
-			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
-	}
+	return -1;
+}
 
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
 	rte_smp_wmb();
-
-	*(volatile uint16_t *)&vq->used->idx += count;
-	vq->last_used_idx += count;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, idx),
-		sizeof(vq->used->idx));
-
-	/* flush used->idx update before we read avail->flags. */
+	*(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+			sizeof(vq->used->idx));
 	rte_mb();
-
-	/* Kick the guest if necessary. */
 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 			&& (vq->callfd >= 0))
 		eventfd_write(vq->callfd, (eventfd_t)1);
-	return count;
 }
 
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-	     uint32_t *allocated, uint32_t *vec_idx,
-	     struct buf_vector *buf_vec)
-{
-	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
-	uint32_t vec_id = *vec_idx;
-	uint32_t len    = *allocated;
-
-	while (1) {
-		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
-			return -1;
-
-		len += vq->desc[idx].len;
-		buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
-		buf_vec[vec_id].buf_len  = vq->desc[idx].len;
-		buf_vec[vec_id].desc_idx = idx;
-		vec_id++;
-
-		if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
-			break;
-
-		idx = vq->desc[idx].next;
-	}
-
-	*allocated = len;
-	*vec_idx   = vec_id;
-
-	return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-			    uint16_t *end, struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint16_t count)
 {
-	uint16_t cur_idx;
+	struct vhost_virtqueue *vq;
+	struct virtio_net *dev;
+	uint32_t is_mrg_rxbuf = 0;
+	uint32_t pkt_idx      = 0;
+	uint32_t pkt_left     = count;
 	uint16_t avail_idx;
-	uint32_t allocated = 0;
-	uint32_t vec_idx = 0;
-	uint16_t tries = 0;
-
-	cur_idx  = vq->last_used_idx;
-
-	while (1) {
-		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-		if (unlikely(cur_idx == avail_idx))
-			return -1;
-
-		if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
-					  &vec_idx, buf_vec) < 0))
-			return -1;
-
-		cur_idx++;
-		tries++;
-
-		if (allocated >= size)
-			break;
-
-		/*
-		 * if we tried all available ring items, and still
-		 * can't get enough buf, it means something abnormal
-		 * happened.
-		 */
-		if (unlikely(tries >= vq->size))
-			return -1;
-	}
 
-	*end = cur_idx;
-	return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			    uint16_t end_idx, struct rte_mbuf *m,
-			    struct buf_vector *buf_vec)
-{
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-	uint32_t vec_idx = 0;
-	uint16_t start_idx = vq->last_used_idx;
-	uint16_t cur_idx = start_idx;
-	uint64_t desc_addr;
-	uint32_t desc_chain_head;
-	uint32_t desc_chain_len;
-	uint32_t mbuf_offset, mbuf_avail;
-	uint32_t desc_offset, desc_avail;
-	uint32_t cpy_len;
-	uint16_t desc_idx, used_idx;
-
-	if (unlikely(m == NULL))
+	if (unlikely(!pkt_left))
 		return 0;
 
-	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
-		dev->vid, cur_idx, end_idx);
+	pkt_left = RTE_MIN((uint32_t)MAX_PKT_BURST, pkt_left);
 
-	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+	dev = get_device(vid);
+	if (unlikely(!dev))
 		return 0;
 
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-	virtio_hdr.num_buffers = end_idx - start_idx;
-	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-		dev->vid, virtio_hdr.num_buffers);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-	desc_offset = dev->vhost_hlen;
-	desc_chain_head = buf_vec[vec_idx].desc_idx;
-	desc_chain_len = desc_offset;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current desc buf, get the next one */
-		if (desc_avail == 0) {
-			desc_idx = buf_vec[vec_idx].desc_idx;
-			vec_idx++;
-
-			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
-				/* Update used ring with desc information */
-				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id = desc_chain_head;
-				vq->used->ring[used_idx].len = desc_chain_len;
-				vhost_log_used_vring(dev, vq,
-					offsetof(struct vring_used,
-						 ring[used_idx]),
-					sizeof(vq->used->ring[used_idx]));
-				desc_chain_head = buf_vec[vec_idx].desc_idx;
-				desc_chain_len = 0;
-			}
-
-			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-			if (unlikely(!desc_addr))
-				return 0;
-
-			/* Prefetch buffer address. */
-			rte_prefetch0((void *)(uintptr_t)desc_addr);
-			desc_offset = 0;
-			desc_avail  = buf_vec[vec_idx].buf_len;
-		}
-
-		/* done with current mbuf, get the next one */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
-		}
-
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
-			cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-		desc_chain_len += cpy_len;
-	}
-
-	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = desc_chain_head;
-	vq->used->ring[used_idx].len = desc_chain_len;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, ring[used_idx]),
-		sizeof(vq->used->ring[used_idx]));
-
-	return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint32_t pkt_idx = 0, nr_used = 0;
-	uint16_t end;
-	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
 		return 0;
-	}
 
 	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
+	if (unlikely(!vq->enabled))
 		return 0;
 
-	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
-	if (count == 0)
-		return 0;
-
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+	if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+		is_mrg_rxbuf = 1;
 
-		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
-							 &end, buf_vec) < 0)) {
-			LOG_DEBUG(VHOST_DATA,
-				"(%d) failed to get enough desc from vring\n",
-				dev->vid);
+	/* start enqueuing packets 1 by 1 */
+	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+	while (pkt_left && avail_idx != vq->last_used_idx) {
+		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+					is_mrg_rxbuf))
 			break;
-		}
-
-		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
-						      pkts[pkt_idx], buf_vec);
-		rte_smp_wmb();
 
-		*(volatile uint16_t *)&vq->used->idx += nr_used;
-		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-			sizeof(vq->used->idx));
-		vq->last_used_idx += nr_used;
+		pkt_idx++;
+		pkt_left--;
 	}
 
-	if (likely(pkt_idx)) {
-		/* flush used->idx update before we read avail->flags. */
-		rte_mb();
-
-		/* Kick the guest if necessary. */
-		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-				&& (vq->callfd >= 0))
-			eventfd_write(vq->callfd, (eventfd_t)1);
-	}
+	/* update used idx and kick the guest if necessary */
+	if (pkt_idx)
+		notify_guest(dev, vq);
 
 	return pkt_idx;
 }
 
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint16_t count)
-{
-	struct virtio_net *dev = get_device(vid);
-
-	if (!dev)
-		return 0;
-
-	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
-		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
-	else
-		return virtio_dev_rx(dev, queue_id, pkts, count);
-}
-
 static void
 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
 {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v5 3/6] vhost: remove useless volatile
  2016-09-09  3:39 ` [PATCH v5 0/6] vhost: optimize enqueue Zhihong Wang
  2016-09-09  3:39   ` [PATCH v5 1/6] vhost: fix windows vm hang Zhihong Wang
  2016-09-09  3:39   ` [PATCH v5 2/6] vhost: rewrite enqueue Zhihong Wang
@ 2016-09-09  3:39   ` Zhihong Wang
  2016-09-09  3:39   ` [PATCH v5 4/6] vhost: add desc prefetch Zhihong Wang
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-09-09  3:39 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch removes useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index c2dfc3c..9707dfc 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
 	uint32_t		size;
 
 	/* Last index used on the available ring */
-	volatile uint16_t	last_used_idx;
+	uint16_t		last_used_idx;
 #define VIRTIO_INVALID_EVENTFD		(-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD	(-2)
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v5 4/6] vhost: add desc prefetch
  2016-09-09  3:39 ` [PATCH v5 0/6] vhost: optimize enqueue Zhihong Wang
                     ` (2 preceding siblings ...)
  2016-09-09  3:39   ` [PATCH v5 3/6] vhost: remove useless volatile Zhihong Wang
@ 2016-09-09  3:39   ` Zhihong Wang
  2016-09-09  3:39   ` [PATCH v5 5/6] vhost: batch update used ring Zhihong Wang
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-09-09  3:39 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 6f63968..b38f18f 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -302,6 +302,12 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 	/* start enqueuing packets 1 by 1 */
 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
 	while (pkt_left && avail_idx != vq->last_used_idx) {
+		/* prefetch the next desc */
+		if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+			rte_prefetch0(&vq->desc[vq->avail->ring[
+					(vq->last_used_idx + 1) &
+					(vq->size - 1)]]);
+
 		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
 					is_mrg_rxbuf))
 			break;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v5 5/6] vhost: batch update used ring
  2016-09-09  3:39 ` [PATCH v5 0/6] vhost: optimize enqueue Zhihong Wang
                     ` (3 preceding siblings ...)
  2016-09-09  3:39   ` [PATCH v5 4/6] vhost: add desc prefetch Zhihong Wang
@ 2016-09-09  3:39   ` Zhihong Wang
  2016-09-12 15:45     ` Maxime Coquelin
  2016-09-09  3:39   ` [PATCH v5 6/6] vhost: optimize cache access Zhihong Wang
  2016-09-12 13:52   ` [PATCH v5 0/6] vhost: optimize enqueue Maxime Coquelin
  6 siblings, 1 reply; 141+ messages in thread
From: Zhihong Wang @ 2016-09-09  3:39 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch enables batch update of the used ring for better efficiency.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
Changes in v4:

 1. Free shadow used ring in the right place.

 2. Add failure check for shadow used ring malloc.

 lib/librte_vhost/vhost.c      | 20 ++++++++++++--
 lib/librte_vhost/vhost.h      |  4 +++
 lib/librte_vhost/vhost_user.c | 31 +++++++++++++++++----
 lib/librte_vhost/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++--------
 4 files changed, 101 insertions(+), 18 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..cb31cdd 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 free_device(struct virtio_net *dev)
 {
+	struct vhost_virtqueue *vq_0;
+	struct vhost_virtqueue *vq_1;
 	uint32_t i;
 
-	for (i = 0; i < dev->virt_qp_nb; i++)
-		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+	for (i = 0; i < dev->virt_qp_nb; i++) {
+		vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+		if (vq_0->shadow_used_ring) {
+			rte_free(vq_0->shadow_used_ring);
+			vq_0->shadow_used_ring = NULL;
+		}
+
+		vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+		if (vq_1->shadow_used_ring) {
+			rte_free(vq_1->shadow_used_ring);
+			vq_1->shadow_used_ring = NULL;
+		}
+
+		/* malloc together, free together */
+		rte_free(vq_0);
+	}
 
 	rte_free(dev);
 }
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 9707dfc..381dc27 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {
 
 	/* Physical address of used ring, for logging */
 	uint64_t		log_guest_addr;
+
+	/* Shadow used ring for performance */
+	struct vring_used_elem	*shadow_used_ring;
+	uint32_t		shadow_used_idx;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index eee99e9..d7cf1ed 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -193,7 +193,21 @@ static int
 vhost_user_set_vring_num(struct virtio_net *dev,
 			 struct vhost_vring_state *state)
 {
-	dev->virtqueue[state->index]->size = state->num;
+	struct vhost_virtqueue *vq;
+
+	vq = dev->virtqueue[state->index];
+	vq->size = state->num;
+	if (!vq->shadow_used_ring) {
+		vq->shadow_used_ring = rte_malloc(NULL,
+				vq->size * sizeof(struct vring_used_elem),
+				RTE_CACHE_LINE_SIZE);
+		if (!vq->shadow_used_ring) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"Failed to allocate memory"
+				" for shadow used ring.\n");
+			return -1;
+		}
+	}
 
 	return 0;
 }
@@ -611,14 +625,21 @@ static int
 vhost_user_get_vring_base(struct virtio_net *dev,
 			  struct vhost_vring_state *state)
 {
+	struct vhost_virtqueue *vq;
+
 	/* We have to stop the queue (virtio) if it is running. */
 	if (dev->flags & VIRTIO_DEV_RUNNING) {
 		dev->flags &= ~VIRTIO_DEV_RUNNING;
 		notify_ops->destroy_device(dev->vid);
 	}
 
+	vq = dev->virtqueue[state->index];
 	/* Here we are safe to get the last used index */
-	state->num = dev->virtqueue[state->index]->last_used_idx;
+	state->num = vq->last_used_idx;
+	if (vq->shadow_used_ring) {
+		rte_free(vq->shadow_used_ring);
+		vq->shadow_used_ring = NULL;
+	}
 
 	RTE_LOG(INFO, VHOST_CONFIG,
 		"vring base idx:%d file:%d\n", state->index, state->num);
@@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net *dev,
 	 * sent and only sent in vhost_vring_stop.
 	 * TODO: cleanup the vring, it isn't usable since here.
 	 */
-	if (dev->virtqueue[state->index]->kickfd >= 0)
-		close(dev->virtqueue[state->index]->kickfd);
+	if (vq->kickfd >= 0)
+		close(vq->kickfd);
 
-	dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
 
 	return 0;
 }
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index b38f18f..e9f6353 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -134,17 +134,52 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 }
 
 static inline void __attribute__((always_inline))
-update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		uint32_t desc_chain_head, uint32_t desc_chain_len)
+update_used_ring(struct vhost_virtqueue *vq, uint32_t desc_chain_head,
+		uint32_t desc_chain_len)
 {
-	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
-
-	vq->used->ring[used_idx].id = desc_chain_head;
-	vq->used->ring[used_idx].len = desc_chain_len;
+	vq->shadow_used_ring[vq->shadow_used_idx].id  = desc_chain_head;
+	vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+	vq->shadow_used_idx++;
 	vq->last_used_idx++;
-	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
-				ring[used_idx]),
-			sizeof(vq->used->ring[used_idx]));
+}
+
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t used_idx_start)
+{
+	if (used_idx_start + vq->shadow_used_idx < vq->size) {
+		rte_memcpy(&vq->used->ring[used_idx_start],
+				&vq->shadow_used_ring[0],
+				vq->shadow_used_idx *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[used_idx_start]),
+				vq->shadow_used_idx *
+				sizeof(struct vring_used_elem));
+	} else {
+		uint32_t part_1 = vq->size - used_idx_start;
+		uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+		rte_memcpy(&vq->used->ring[used_idx_start],
+				&vq->shadow_used_ring[0],
+				part_1 *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[used_idx_start]),
+				part_1 *
+				sizeof(struct vring_used_elem));
+		rte_memcpy(&vq->used->ring[0],
+				&vq->shadow_used_ring[part_1],
+				part_2 *
+				sizeof(struct vring_used_elem));
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used,
+					ring[0]),
+				part_2 *
+				sizeof(struct vring_used_elem));
+	}
 }
 
 static inline int __attribute__((always_inline))
@@ -209,7 +244,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 					goto error;
 			} else if (is_mrg_rxbuf) {
 				/* start with the next desc chain */
-				update_used_ring(dev, vq, desc_chain_head,
+				update_used_ring(vq, desc_chain_head,
 						desc_chain_len);
 				num_buffers++;
 				virtio_hdr->num_buffers++;
@@ -245,7 +280,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		desc_chain_len += cpy_len;
 	}
 
-	update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
+	update_used_ring(vq, desc_chain_head, desc_chain_len);
 
 	return 0;
 
@@ -275,6 +310,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 {
 	struct vhost_virtqueue *vq;
 	struct virtio_net *dev;
+	uint32_t used_idx_start;
 	uint32_t is_mrg_rxbuf = 0;
 	uint32_t pkt_idx      = 0;
 	uint32_t pkt_left     = count;
@@ -300,6 +336,8 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 		is_mrg_rxbuf = 1;
 
 	/* start enqueuing packets 1 by 1 */
+	vq->shadow_used_idx = 0;
+	used_idx_start = vq->last_used_idx & (vq->size - 1);
 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
 	while (pkt_left && avail_idx != vq->last_used_idx) {
 		/* prefetch the next desc */
@@ -316,6 +354,10 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 		pkt_left--;
 	}
 
+	/* batch update used ring for better performance */
+	if (likely(vq->shadow_used_idx > 0))
+		flush_used_ring(dev, vq, used_idx_start);
+
 	/* update used idx and kick the guest if necessary */
 	if (pkt_idx)
 		notify_guest(dev, vq);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v5 6/6] vhost: optimize cache access
  2016-09-09  3:39 ` [PATCH v5 0/6] vhost: optimize enqueue Zhihong Wang
                     ` (4 preceding siblings ...)
  2016-09-09  3:39   ` [PATCH v5 5/6] vhost: batch update used ring Zhihong Wang
@ 2016-09-09  3:39   ` Zhihong Wang
  2016-09-12 13:52   ` [PATCH v5 0/6] vhost: optimize enqueue Maxime Coquelin
  6 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-09-09  3:39 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch reorders the code to delay the virtio header write, to optimize
cache access efficiency for cases where the mrg_rxbuf feature is turned on.
It reduces CPU pipeline stall cycles significantly.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 lib/librte_vhost/virtio_net.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index e9f6353..0086bcb 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -197,6 +197,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t mbuf_len;
 	uint32_t mbuf_avail;
 	uint32_t cpy_len;
+	uint32_t copy_virtio_hdr;
 	uint32_t num_buffers = 0;
 
 	/* start with the first mbuf of the packet */
@@ -211,12 +212,12 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (unlikely(!desc_addr))
 		goto error;
 
-	/* handle virtio header */
+	/*
+	 * handle virtio header, the actual write operation is delayed
+	 * for cache optimization, to reduce CPU pipeline stall cycles.
+	 */
 	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
-	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
-	if (is_mrg_rxbuf)
-		virtio_hdr->num_buffers = 1;
-
+	copy_virtio_hdr = 1;
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
 	desc_offset = dev->vhost_hlen;
@@ -266,8 +267,15 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				goto error;
 		}
 
-		/* copy mbuf data */
+		/* copy virtio header and mbuf data */
 		cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+		if (copy_virtio_hdr) {
+			copy_virtio_hdr = 0;
+			virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+			if (is_mrg_rxbuf)
+				virtio_hdr->num_buffers = num_buffers + 1;
+		}
+
 		rte_memcpy((void *)(uintptr_t)desc_addr,
 				rte_pktmbuf_mtod_offset(mbuf, void *,
 					mbuf_len - mbuf_avail),
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 0/6] vhost: optimize enqueue
  2016-09-09  3:39 ` [PATCH v5 0/6] vhost: optimize enqueue Zhihong Wang
                     ` (5 preceding siblings ...)
  2016-09-09  3:39   ` [PATCH v5 6/6] vhost: optimize cache access Zhihong Wang
@ 2016-09-12 13:52   ` Maxime Coquelin
  2016-09-12 13:56     ` Maxime Coquelin
  2016-09-12 14:01     ` Yuanhan Liu
  6 siblings, 2 replies; 141+ messages in thread
From: Maxime Coquelin @ 2016-09-12 13:52 UTC (permalink / raw)
  To: Zhihong Wang, dev; +Cc: yuanhan.liu, thomas.monjalon

Hi,

On 09/09/2016 05:39 AM, Zhihong Wang wrote:
> This patch set optimizes the vhost enqueue function.
>
> It implements the vhost logic from scratch into a single function designed
> for high performance and good maintainability, and improves CPU efficiency
> significantly by optimizing cache access, which means:
>
>  *  Higher maximum throughput can be achieved for fast frontends like DPDK
>     virtio pmd.
>
>  *  Better scalability can be achieved in that each vhost core can support
>     more connections, because it takes fewer cycles to handle each
>     frontend.
>
> This patch set contains:
>
>  1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.
>
>  2. A baseline patch to rewrite the vhost logic.
>
>  3. A series of optimization patches added upon the baseline.
>
> The main optimization techniques are:
>
>  1. Reorder code to reduce CPU pipeline stall cycles.
>
>  2. Batch update the used ring for better efficiency.
>
>  3. Prefetch descriptor to hide cache latency.
>
>  4. Remove useless volatile attribute to allow compiler optimization.
>
> Code reordering and batch used ring update bring most of the performance
> improvements.
>
> In the existing code there're 2 callbacks for vhost enqueue:
>
>  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
>
>  *  virtio_dev_rx for mrg_rxbuf turned off cases.
>
> The performance of the existing code is not optimal, especially when the
> mrg_rxbuf feature is turned on. Besides, having 2 callback paths increases
> maintenance efforts.
>
> Also, there's a compatibility issue in the existing code which causes a
> Windows VM to hang when the mrg_rxbuf feature is turned on.
>
> ---
> Changes in v5:
>
>  1. Rebase to the latest branch.
>
>  2. Rename variables to keep consistent in naming style.
>
>  3. Small changes like return value adjustment and vertical alignment.
>
>  4. Add details in commit log.
Just tried to apply your series without success.
Apparently, it is not based directly on master branch,
as it lacks some SHA-1 information.

Could you rebase it against master please?

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 0/6] vhost: optimize enqueue
  2016-09-12 13:52   ` [PATCH v5 0/6] vhost: optimize enqueue Maxime Coquelin
@ 2016-09-12 13:56     ` Maxime Coquelin
  2016-09-12 14:01     ` Yuanhan Liu
  1 sibling, 0 replies; 141+ messages in thread
From: Maxime Coquelin @ 2016-09-12 13:56 UTC (permalink / raw)
  To: Zhihong Wang, dev; +Cc: yuanhan.liu, thomas.monjalon



On 09/12/2016 03:52 PM, Maxime Coquelin wrote:
> Hi,
>
> On 09/09/2016 05:39 AM, Zhihong Wang wrote:
>> This patch set optimizes the vhost enqueue function.
>>
>> It implements the vhost logic from scratch into a single function
>> designed
>> for high performance and good maintainability, and improves CPU
>> efficiency
>> significantly by optimizing cache access, which means:
>>
>>  *  Higher maximum throughput can be achieved for fast frontends like
>> DPDK
>>     virtio pmd.
>>
>>  *  Better scalability can be achieved in that each vhost core can support
>>     more connections, because it takes fewer cycles to handle each
>>     frontend.
>>
>> This patch set contains:
>>
>>  1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.
>>
>>  2. A baseline patch to rewrite the vhost logic.
>>
>>  3. A series of optimization patches added upon the baseline.
>>
>> The main optimization techniques are:
>>
>>  1. Reorder code to reduce CPU pipeline stall cycles.
>>
>>  2. Batch update the used ring for better efficiency.
>>
>>  3. Prefetch descriptor to hide cache latency.
>>
>>  4. Remove useless volatile attribute to allow compiler optimization.
>>
>> Code reordering and batch used ring update bring most of the performance
>> improvements.
>>
>> In the existing code there're 2 callbacks for vhost enqueue:
>>
>>  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
>>
>>  *  virtio_dev_rx for mrg_rxbuf turned off cases.
>>
>> The performance of the existing code is not optimal, especially when the
>> mrg_rxbuf feature is turned on. Besides, having 2 callback paths increases
>> maintenance efforts.
>>
>> Also, there's a compatibility issue in the existing code which causes a
>> Windows VM to hang when the mrg_rxbuf feature is turned on.
>>
>> ---
>> Changes in v5:
>>
>>  1. Rebase to the latest branch.
>>
>>  2. Rename variables to keep consistent in naming style.
>>
>>  3. Small changes like return value adjustment and vertical alignment.
>>
>>  4. Add details in commit log.
> Just tried to apply your series without success.
> Apparently, it is not based directly on master branch,
> as it lacks some SHA-1 information.
>
> Could you rebase it against master please?

Ok, it is in fact based on top of:
git://dpdk.org/next/dpdk-next-virtio master

For v6, if any, could you add this info to the cover letter please?

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 0/6] vhost: optimize enqueue
  2016-09-12 13:52   ` [PATCH v5 0/6] vhost: optimize enqueue Maxime Coquelin
  2016-09-12 13:56     ` Maxime Coquelin
@ 2016-09-12 14:01     ` Yuanhan Liu
  1 sibling, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-09-12 14:01 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: Zhihong Wang, dev, thomas.monjalon

On Mon, Sep 12, 2016 at 03:52:12PM +0200, Maxime Coquelin wrote:
> Just tried to apply your series without success.
> Apparently, it is not based directly on master branch,
> as it lacks some SHA-1 information.
> 
> Could you rebase it against master please?

It's rebased against the dpdk-next-virtio tree [0], where all the
virtio/vhost patches are applied first.

[0]: http://dpdk.org/browse/next/dpdk-next-virtio/

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 2/6] vhost: rewrite enqueue
  2016-09-09  3:39   ` [PATCH v5 2/6] vhost: rewrite enqueue Zhihong Wang
@ 2016-09-12 15:42     ` Maxime Coquelin
  2016-09-14  8:20       ` Wang, Zhihong
  2016-09-12 16:26     ` Maxime Coquelin
  2016-09-18 14:19     ` Yuanhan Liu
  2 siblings, 1 reply; 141+ messages in thread
From: Maxime Coquelin @ 2016-09-12 15:42 UTC (permalink / raw)
  To: Zhihong Wang, dev; +Cc: yuanhan.liu, thomas.monjalon

Hi,

On 09/09/2016 05:39 AM, Zhihong Wang wrote:
> This patch implements the vhost logic from scratch into a single function
> designed for high performance and better maintainability.
>
> This is the baseline version of the new code, more optimization will be
> added in the following patches in this patch set.
>
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---
> Changes in v5:
>
>  1. Rebase to the latest branch.
>
>  2. Rename variables to keep consistent in naming style.
>
>  3. Small changes like return value adjustment and vertical alignment.
>
> ---
> Changes in v4:
>
>  1. Refactor the code for clearer logic.
>
>  2. Add PRINT_PACKET for debugging.
>
> ---
> Changes in v3:
>
> 1. Rewrite enqueue and delete the obsolete code in the same patch.
>
>  lib/librte_vhost/virtio_net.c | 514 ++++++++++++------------------------------
>  1 file changed, 138 insertions(+), 376 deletions(-)
>
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 0d6e7d9..6f63968 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
>  	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
>  }
>
> -static void
> +static inline void __attribute__((always_inline))
>  virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  {
>  	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
> @@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  						cksum));
>  			break;
>  		}
> +	} else {
> +		net_hdr->flags       = 0;
> +		net_hdr->csum_start  = 0;
> +		net_hdr->csum_offset = 0;
>  	}
>
>  	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
> @@ -122,439 +126,197 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  		net_hdr->gso_size = m_buf->tso_segsz;
>  		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
>  					+ m_buf->l4_len;
> +	} else {
> +		net_hdr->gso_type = 0;
> +		net_hdr->hdr_len  = 0;
> +		net_hdr->gso_size = 0;
>  	}
>  }
>
> -static inline void
> -copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
> -		    struct virtio_net_hdr_mrg_rxbuf hdr)
> +static inline void __attribute__((always_inline))
> +update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		uint32_t desc_chain_head, uint32_t desc_chain_len)
>  {
> -	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
> -		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
> -	else
> -		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
> +	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
> +
> +	vq->used->ring[used_idx].id = desc_chain_head;
> +	vq->used->ring[used_idx].len = desc_chain_len;
> +	vq->last_used_idx++;
> +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
> +				ring[used_idx]),
> +			sizeof(vq->used->ring[used_idx]));
>  }
>
>  static inline int __attribute__((always_inline))
> -copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -		  struct rte_mbuf *m, uint16_t desc_idx)
> +enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		uint16_t avail_idx, struct rte_mbuf *mbuf,
> +		uint32_t is_mrg_rxbuf)
>  {
> -	uint32_t desc_avail, desc_offset;
> -	uint32_t mbuf_avail, mbuf_offset;
> -	uint32_t cpy_len;
> +	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
>  	struct vring_desc *desc;
>  	uint64_t desc_addr;
> -	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
> +	uint32_t desc_chain_head;
> +	uint32_t desc_chain_len;
> +	uint32_t desc_current;
> +	uint32_t desc_offset;
> +	uint32_t mbuf_len;
> +	uint32_t mbuf_avail;
> +	uint32_t cpy_len;
> +	uint32_t num_buffers = 0;
>
> -	desc = &vq->desc[desc_idx];
> +	/* start with the first mbuf of the packet */
> +	mbuf_len = rte_pktmbuf_data_len(mbuf);
> +	mbuf_avail = mbuf_len;
> +
> +	/* get the current desc */
> +	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
> +	desc_chain_head = desc_current;
> +	desc = &vq->desc[desc_current];
>  	desc_addr = gpa_to_vva(dev, desc->addr);
> -	/*
> -	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
> -	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
> -	 * otherwise stores offset on the stack instead of in a register.
> -	 */
> -	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
> -		return -1;
> +	if (unlikely(!desc_addr))
> +		goto error;
>
> -	rte_prefetch0((void *)(uintptr_t)desc_addr);
> +	/* handle virtio header */
> +	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
> +	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
> +	if (is_mrg_rxbuf)
> +		virtio_hdr->num_buffers = 1;
>
> -	virtio_enqueue_offload(m, &virtio_hdr.hdr);
> -	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
>  	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
>  	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
> -
>  	desc_offset = dev->vhost_hlen;
> -	desc_avail  = desc->len - dev->vhost_hlen;
> -
> -	mbuf_avail  = rte_pktmbuf_data_len(m);
> -	mbuf_offset = 0;
> -	while (mbuf_avail != 0 || m->next != NULL) {
> -		/* done with current mbuf, fetch next */
> -		if (mbuf_avail == 0) {
> -			m = m->next;
> -
> -			mbuf_offset = 0;
> -			mbuf_avail  = rte_pktmbuf_data_len(m);
> +	desc_chain_len = desc_offset;
> +	desc_addr += desc_offset;
> +
> +	/* start copy from mbuf to desc */
> +	while (mbuf_avail || mbuf->next) {
> +		/* get the next mbuf if the current done */
> +		if (!mbuf_avail) {
> +			mbuf = mbuf->next;
> +			mbuf_len = rte_pktmbuf_data_len(mbuf);
> +			mbuf_avail = mbuf_len;
>  		}
>
> -		/* done with current desc buf, fetch next */
> -		if (desc_avail == 0) {
> -			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
> -				/* Room in vring buffer is not enough */
> -				return -1;
> -			}
> -			if (unlikely(desc->next >= vq->size))
> -				return -1;
> +		/* get the next desc if the current done */
> +		if (desc->len <= desc_offset) {
> +			if (desc->flags & VRING_DESC_F_NEXT) {
> +				/* go on with the current desc chain */
> +				desc_offset = 0;
> +				desc_current = desc->next;
> +				desc = &vq->desc[desc_current];
> +				desc_addr = gpa_to_vva(dev, desc->addr);
> +				if (unlikely(!desc_addr))
> +					goto error;
> +			} else if (is_mrg_rxbuf) {
> +				/* start with the next desc chain */
> +				update_used_ring(dev, vq, desc_chain_head,
> +						desc_chain_len);
> +				num_buffers++;
> +				virtio_hdr->num_buffers++;
> +				if (avail_idx == vq->last_used_idx)
> +					goto error;
> +
> +				desc_current =
> +					vq->avail->ring[(vq->last_used_idx) &
> +					(vq->size - 1)];
> +				desc_chain_head = desc_current;
> +				desc = &vq->desc[desc_current];
> +				desc_addr = gpa_to_vva(dev, desc->addr);
> +				if (unlikely(!desc_addr))
> +					goto error;
>
> -			desc = &vq->desc[desc->next];
> -			desc_addr = gpa_to_vva(dev, desc->addr);
> -			if (unlikely(!desc_addr))
> -				return -1;
> -
> -			desc_offset = 0;
> -			desc_avail  = desc->len;
> +				desc_chain_len = 0;
> +				desc_offset = 0;
As I commented on v3, there is code duplication between the next-flag and
mrg-buf cases:
desc_offset = 0;

and:

desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
if (unlikely(!desc_addr))
     goto error;

Regards,
Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 5/6] vhost: batch update used ring
  2016-09-09  3:39   ` [PATCH v5 5/6] vhost: batch update used ring Zhihong Wang
@ 2016-09-12 15:45     ` Maxime Coquelin
  2016-09-14  8:43       ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Maxime Coquelin @ 2016-09-12 15:45 UTC (permalink / raw)
  To: Zhihong Wang, dev; +Cc: yuanhan.liu, thomas.monjalon



On 09/09/2016 05:39 AM, Zhihong Wang wrote:
> This patch enables batch update of the used ring for better efficiency.
>
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---
> Changes in v4:
>
>  1. Free shadow used ring in the right place.
>
>  2. Add failure check for shadow used ring malloc.
>
>  lib/librte_vhost/vhost.c      | 20 ++++++++++++--
>  lib/librte_vhost/vhost.h      |  4 +++
>  lib/librte_vhost/vhost_user.c | 31 +++++++++++++++++----
>  lib/librte_vhost/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++--------
>  4 files changed, 101 insertions(+), 18 deletions(-)
>
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 46095c3..cb31cdd 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
>  static void
>  free_device(struct virtio_net *dev)
>  {
> +	struct vhost_virtqueue *vq_0;
> +	struct vhost_virtqueue *vq_1;
>  	uint32_t i;
>
> -	for (i = 0; i < dev->virt_qp_nb; i++)
> -		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
> +	for (i = 0; i < dev->virt_qp_nb; i++) {
> +		vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
> +		if (vq_0->shadow_used_ring) {
> +			rte_free(vq_0->shadow_used_ring);
> +			vq_0->shadow_used_ring = NULL;
> +		}
> +
> +		vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
> +		if (vq_1->shadow_used_ring) {
> +			rte_free(vq_1->shadow_used_ring);
> +			vq_1->shadow_used_ring = NULL;
> +		}
> +
> +		/* malloc together, free together */
> +		rte_free(vq_0);
> +	}
>
>  	rte_free(dev);
>  }
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 9707dfc..381dc27 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -85,6 +85,10 @@ struct vhost_virtqueue {
>
>  	/* Physical address of used ring, for logging */
>  	uint64_t		log_guest_addr;
> +
> +	/* Shadow used ring for performance */
> +	struct vring_used_elem	*shadow_used_ring;
> +	uint32_t		shadow_used_idx;
>  } __rte_cache_aligned;
>
>  /* Old kernels have no such macro defined */
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index eee99e9..d7cf1ed 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -193,7 +193,21 @@ static int
>  vhost_user_set_vring_num(struct virtio_net *dev,
>  			 struct vhost_vring_state *state)
>  {
> -	dev->virtqueue[state->index]->size = state->num;
> +	struct vhost_virtqueue *vq;
> +
> +	vq = dev->virtqueue[state->index];
> +	vq->size = state->num;
> +	if (!vq->shadow_used_ring) {
> +		vq->shadow_used_ring = rte_malloc(NULL,
> +				vq->size * sizeof(struct vring_used_elem),
> +				RTE_CACHE_LINE_SIZE);
> +		if (!vq->shadow_used_ring) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"Failed to allocate memory"
> +				" for shadow used ring.\n");
> +			return -1;
> +		}
> +	}
>
>  	return 0;
>  }
> @@ -611,14 +625,21 @@ static int
>  vhost_user_get_vring_base(struct virtio_net *dev,
>  			  struct vhost_vring_state *state)
>  {
> +	struct vhost_virtqueue *vq;
> +
>  	/* We have to stop the queue (virtio) if it is running. */
>  	if (dev->flags & VIRTIO_DEV_RUNNING) {
>  		dev->flags &= ~VIRTIO_DEV_RUNNING;
>  		notify_ops->destroy_device(dev->vid);
>  	}
>
> +	vq = dev->virtqueue[state->index];
>  	/* Here we are safe to get the last used index */
> -	state->num = dev->virtqueue[state->index]->last_used_idx;
> +	state->num = vq->last_used_idx;
> +	if (vq->shadow_used_ring) {
> +		rte_free(vq->shadow_used_ring);
> +		vq->shadow_used_ring = NULL;
> +	}
>
>  	RTE_LOG(INFO, VHOST_CONFIG,
>  		"vring base idx:%d file:%d\n", state->index, state->num);
> @@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net *dev,
>  	 * sent and only sent in vhost_vring_stop.
>  	 * TODO: cleanup the vring, it isn't usable since here.
>  	 */
> -	if (dev->virtqueue[state->index]->kickfd >= 0)
> -		close(dev->virtqueue[state->index]->kickfd);
> +	if (vq->kickfd >= 0)
> +		close(vq->kickfd);
>
> -	dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
> +	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
>
>  	return 0;
>  }
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index b38f18f..e9f6353 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -134,17 +134,52 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  }
>
>  static inline void __attribute__((always_inline))
> -update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -		uint32_t desc_chain_head, uint32_t desc_chain_len)
> +update_used_ring(struct vhost_virtqueue *vq, uint32_t desc_chain_head,
> +		uint32_t desc_chain_len)
>  {
> -	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
> -
> -	vq->used->ring[used_idx].id = desc_chain_head;
> -	vq->used->ring[used_idx].len = desc_chain_len;
> +	vq->shadow_used_ring[vq->shadow_used_idx].id  = desc_chain_head;
> +	vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
> +	vq->shadow_used_idx++;
>  	vq->last_used_idx++;
> -	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
> -				ring[used_idx]),
> -			sizeof(vq->used->ring[used_idx]));
> +}
> +
> +static inline void __attribute__((always_inline))
> +flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		uint32_t used_idx_start)
> +{
> +	if (used_idx_start + vq->shadow_used_idx < vq->size) {
> +		rte_memcpy(&vq->used->ring[used_idx_start],
> +				&vq->shadow_used_ring[0],
> +				vq->shadow_used_idx *
> +				sizeof(struct vring_used_elem));
> +		vhost_log_used_vring(dev, vq,
> +				offsetof(struct vring_used,
> +					ring[used_idx_start]),
> +				vq->shadow_used_idx *
> +				sizeof(struct vring_used_elem));
> +	} else {
> +		uint32_t part_1 = vq->size - used_idx_start;
> +		uint32_t part_2 = vq->shadow_used_idx - part_1;
> +
> +		rte_memcpy(&vq->used->ring[used_idx_start],
> +				&vq->shadow_used_ring[0],
> +				part_1 *
> +				sizeof(struct vring_used_elem));
> +		vhost_log_used_vring(dev, vq,
> +				offsetof(struct vring_used,
> +					ring[used_idx_start]),
> +				part_1 *
> +				sizeof(struct vring_used_elem));
> +		rte_memcpy(&vq->used->ring[0],
> +				&vq->shadow_used_ring[part_1],
> +				part_2 *
> +				sizeof(struct vring_used_elem));
> +		vhost_log_used_vring(dev, vq,
> +				offsetof(struct vring_used,
> +					ring[0]),
> +				part_2 *
> +				sizeof(struct vring_used_elem));
> +	}
>  }
Is expanding the code done for performance purposes?
Or maybe we could have a loop to do that?
Something like this (not compiled, not tested):

static inline void __attribute__((always_inline))
flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
{
	uint32_t to = used_idx_start;
	uint32_t from = 0;
	uint32_t count;

	if (used_idx_start + vq->shadow_used_idx < vq->size)
		count = vq->shadow_used_idx;
	else
		count = vq->size - used_idx_start;

	do {
		rte_memcpy(&vq->used->ring[to],
				&vq->shadow_used_ring[from],
				count * sizeof(struct vring_used_elem));
		vhost_log_used_vring(dev, vq,
				offsetof(struct vring_used, ring[to]),
				count * sizeof(struct vring_used_elem));

		to = (to + count) & (vq->size - 1);
		from += count;
		count = vq->shadow_used_idx - count;
	} while (count);
}

Regards,
Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 2/6] vhost: rewrite enqueue
  2016-09-09  3:39   ` [PATCH v5 2/6] vhost: rewrite enqueue Zhihong Wang
  2016-09-12 15:42     ` Maxime Coquelin
@ 2016-09-12 16:26     ` Maxime Coquelin
  2016-09-14  8:22       ` Wang, Zhihong
  2016-09-18 14:19     ` Yuanhan Liu
  2 siblings, 1 reply; 141+ messages in thread
From: Maxime Coquelin @ 2016-09-12 16:26 UTC (permalink / raw)
  To: Zhihong Wang, dev; +Cc: yuanhan.liu, thomas.monjalon



On 09/09/2016 05:39 AM, Zhihong Wang wrote:
>
> +static inline void __attribute__((always_inline))
> +notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
> +{
>  	rte_smp_wmb();
> -
> -	*(volatile uint16_t *)&vq->used->idx += count;
> -	vq->last_used_idx += count;
> -	vhost_log_used_vring(dev, vq,
> -		offsetof(struct vring_used, idx),
> -		sizeof(vq->used->idx));
> -
> -	/* flush used->idx update before we read avail->flags. */
Please don't remove comments if not justified.
Here the comment is important, as it explains why the barrier is needed.
> +	*(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
> +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> +			sizeof(vq->used->idx));
>  	rte_mb();
> -
> -	/* Kick the guest if necessary. */
>  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>  			&& (vq->callfd >= 0))
>  		eventfd_write(vq->callfd, (eventfd_t)1);
> -	return count;
>  }

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 2/6] vhost: rewrite enqueue
  2016-09-12 15:42     ` Maxime Coquelin
@ 2016-09-14  8:20       ` Wang, Zhihong
  2016-09-15 16:35         ` Maxime Coquelin
  0 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-14  8:20 UTC (permalink / raw)
  To: Maxime Coquelin, dev; +Cc: yuanhan.liu, thomas.monjalon

> > +				desc_current =
> > +					vq->avail->ring[(vq->last_used_idx) &
> > +					(vq->size - 1)];
> > +				desc_chain_head = desc_current;
> > +				desc = &vq->desc[desc_current];
> > +				desc_addr = gpa_to_vva(dev, desc->addr);
> > +				if (unlikely(!desc_addr))
> > +					goto error;
> >
> > -			desc = &vq->desc[desc->next];
> > -			desc_addr = gpa_to_vva(dev, desc->addr);
> > -			if (unlikely(!desc_addr))
> > -				return -1;
> > -
> > -			desc_offset = 0;
> > -			desc_avail  = desc->len;
> > +				desc_chain_len = 0;
> > +				desc_offset = 0;
> As I commented on v3, there is code duplication between next flag, and
> mrg buf cases:
> desc_offset = 0;
> 
> and:
> 
> desc = &vq->desc[desc_current];
> desc_addr = gpa_to_vva(dev, desc->addr);
> if (unlikely(!desc_addr))
>      goto error;
> 

Do you mean to add something like:

static inline int __attribute__((always_inline))
get_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
                uint32_t desc_idx, struct vring_desc **desc,
                uint64_t *desc_addr)
{
        *desc = &vq->desc[desc_idx];
        *desc_addr = gpa_to_vva(dev, (*desc)->addr);
        if (unlikely(!(*desc_addr)))
                return -1;

        return 0;
}
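
A hypothetical call site, just to show how such a helper would fold the
duplicated lookup (illustration only, not code from the posted series):

        if (unlikely(get_desc(dev, vq, desc_current, &desc, &desc_addr) < 0))
                goto error;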


> Regards,
> Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 2/6] vhost: rewrite enqueue
  2016-09-12 16:26     ` Maxime Coquelin
@ 2016-09-14  8:22       ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-14  8:22 UTC (permalink / raw)
  To: Maxime Coquelin, dev; +Cc: yuanhan.liu, thomas.monjalon



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Tuesday, September 13, 2016 12:27 AM
> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> Cc: yuanhan.liu@linux.intel.com; thomas.monjalon@6wind.com
> Subject: Re: [PATCH v5 2/6] vhost: rewrite enqueue
> 
> 
> 
> On 09/09/2016 05:39 AM, Zhihong Wang wrote:
> >
> > +static inline void __attribute__((always_inline))
> > +notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
> > +{
> >  	rte_smp_wmb();
> > -
> > -	*(volatile uint16_t *)&vq->used->idx += count;
> > -	vq->last_used_idx += count;
> > -	vhost_log_used_vring(dev, vq,
> > -		offsetof(struct vring_used, idx),
> > -		sizeof(vq->used->idx));
> > -
> > -	/* flush used->idx update before we read avail->flags. */
> Please don't remove comments if not justified.
> Here the comment is important, as it explains why the barrier is needed.

Okay.

> > +	*(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
> > +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> > +			sizeof(vq->used->idx));
> >  	rte_mb();
> > -
> > -	/* Kick the guest if necessary. */
> >  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
> >  			&& (vq->callfd >= 0))
> >  		eventfd_write(vq->callfd, (eventfd_t)1);
> > -	return count;
> >  }

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 5/6] vhost: batch update used ring
  2016-09-12 15:45     ` Maxime Coquelin
@ 2016-09-14  8:43       ` Wang, Zhihong
  2016-09-15 16:38         ` Maxime Coquelin
  0 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-14  8:43 UTC (permalink / raw)
  To: Maxime Coquelin, dev; +Cc: yuanhan.liu, thomas.monjalon



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Monday, September 12, 2016 11:46 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> Cc: yuanhan.liu@linux.intel.com; thomas.monjalon@6wind.com
> Subject: Re: [PATCH v5 5/6] vhost: batch update used ring
> 
> 
> 
> On 09/09/2016 05:39 AM, Zhihong Wang wrote:
> > This patch enables batch update of the used ring for better efficiency.
> >
> > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > ---
> > Changes in v4:
> >
> >  1. Free shadow used ring in the right place.
> >
> >  2. Add failure check for shadow used ring malloc.
> >
> >  lib/librte_vhost/vhost.c      | 20 ++++++++++++--
> >  lib/librte_vhost/vhost.h      |  4 +++
> >  lib/librte_vhost/vhost_user.c | 31 +++++++++++++++++----
> >  lib/librte_vhost/virtio_net.c | 64
> +++++++++++++++++++++++++++++++++++--------
> >  4 files changed, 101 insertions(+), 18 deletions(-)
> >
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> > index 46095c3..cb31cdd 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int
> destroy)
> >  static void
> >  free_device(struct virtio_net *dev)
> >  {
> > +	struct vhost_virtqueue *vq_0;
> > +	struct vhost_virtqueue *vq_1;
> >  	uint32_t i;
> >
> > -	for (i = 0; i < dev->virt_qp_nb; i++)
> > -		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
> > +	for (i = 0; i < dev->virt_qp_nb; i++) {
> > +		vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
> > +		if (vq_0->shadow_used_ring) {
> > +			rte_free(vq_0->shadow_used_ring);
> > +			vq_0->shadow_used_ring = NULL;
> > +		}
> > +
> > +		vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
> > +		if (vq_1->shadow_used_ring) {
> > +			rte_free(vq_1->shadow_used_ring);
> > +			vq_1->shadow_used_ring = NULL;
> > +		}
> > +
> > +		/* malloc together, free together */
> > +		rte_free(vq_0);
> > +	}
> >
> >  	rte_free(dev);
> >  }
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index 9707dfc..381dc27 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -85,6 +85,10 @@ struct vhost_virtqueue {
> >
> >  	/* Physical address of used ring, for logging */
> >  	uint64_t		log_guest_addr;
> > +
> > +	/* Shadow used ring for performance */
> > +	struct vring_used_elem	*shadow_used_ring;
> > +	uint32_t		shadow_used_idx;
> >  } __rte_cache_aligned;
> >
> >  /* Old kernels have no such macro defined */
> > diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> > index eee99e9..d7cf1ed 100644
> > --- a/lib/librte_vhost/vhost_user.c
> > +++ b/lib/librte_vhost/vhost_user.c
> > @@ -193,7 +193,21 @@ static int
> >  vhost_user_set_vring_num(struct virtio_net *dev,
> >  			 struct vhost_vring_state *state)
> >  {
> > -	dev->virtqueue[state->index]->size = state->num;
> > +	struct vhost_virtqueue *vq;
> > +
> > +	vq = dev->virtqueue[state->index];
> > +	vq->size = state->num;
> > +	if (!vq->shadow_used_ring) {
> > +		vq->shadow_used_ring = rte_malloc(NULL,
> > +				vq->size * sizeof(struct vring_used_elem),
> > +				RTE_CACHE_LINE_SIZE);
> > +		if (!vq->shadow_used_ring) {
> > +			RTE_LOG(ERR, VHOST_CONFIG,
> > +				"Failed to allocate memory"
> > +				" for shadow used ring.\n");
> > +			return -1;
> > +		}
> > +	}
> >
> >  	return 0;
> >  }
> > @@ -611,14 +625,21 @@ static int
> >  vhost_user_get_vring_base(struct virtio_net *dev,
> >  			  struct vhost_vring_state *state)
> >  {
> > +	struct vhost_virtqueue *vq;
> > +
> >  	/* We have to stop the queue (virtio) if it is running. */
> >  	if (dev->flags & VIRTIO_DEV_RUNNING) {
> >  		dev->flags &= ~VIRTIO_DEV_RUNNING;
> >  		notify_ops->destroy_device(dev->vid);
> >  	}
> >
> > +	vq = dev->virtqueue[state->index];
> >  	/* Here we are safe to get the last used index */
> > -	state->num = dev->virtqueue[state->index]->last_used_idx;
> > +	state->num = vq->last_used_idx;
> > +	if (vq->shadow_used_ring) {
> > +		rte_free(vq->shadow_used_ring);
> > +		vq->shadow_used_ring = NULL;
> > +	}
> >
> >  	RTE_LOG(INFO, VHOST_CONFIG,
> >  		"vring base idx:%d file:%d\n", state->index, state->num);
> > @@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net
> *dev,
> >  	 * sent and only sent in vhost_vring_stop.
> >  	 * TODO: cleanup the vring, it isn't usable since here.
> >  	 */
> > -	if (dev->virtqueue[state->index]->kickfd >= 0)
> > -		close(dev->virtqueue[state->index]->kickfd);
> > +	if (vq->kickfd >= 0)
> > +		close(vq->kickfd);
> >
> > -	dev->virtqueue[state->index]->kickfd =
> VIRTIO_UNINITIALIZED_EVENTFD;
> > +	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
> >
> >  	return 0;
> >  }
> > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > index b38f18f..e9f6353 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -134,17 +134,52 @@ virtio_enqueue_offload(struct rte_mbuf
> *m_buf, struct virtio_net_hdr *net_hdr)
> >  }
> >
> >  static inline void __attribute__((always_inline))
> > -update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > -		uint32_t desc_chain_head, uint32_t desc_chain_len)
> > +update_used_ring(struct vhost_virtqueue *vq, uint32_t
> desc_chain_head,
> > +		uint32_t desc_chain_len)
> >  {
> > -	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
> > -
> > -	vq->used->ring[used_idx].id = desc_chain_head;
> > -	vq->used->ring[used_idx].len = desc_chain_len;
> > +	vq->shadow_used_ring[vq->shadow_used_idx].id  =
> desc_chain_head;
> > +	vq->shadow_used_ring[vq->shadow_used_idx].len =
> desc_chain_len;
> > +	vq->shadow_used_idx++;
> >  	vq->last_used_idx++;
> > -	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
> > -				ring[used_idx]),
> > -			sizeof(vq->used->ring[used_idx]));
> > +}
> > +
> > +static inline void __attribute__((always_inline))
> > +flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > +		uint32_t used_idx_start)
> > +{
> > +	if (used_idx_start + vq->shadow_used_idx < vq->size) {
> > +		rte_memcpy(&vq->used->ring[used_idx_start],
> > +				&vq->shadow_used_ring[0],
> > +				vq->shadow_used_idx *
> > +				sizeof(struct vring_used_elem));
> > +		vhost_log_used_vring(dev, vq,
> > +				offsetof(struct vring_used,
> > +					ring[used_idx_start]),
> > +				vq->shadow_used_idx *
> > +				sizeof(struct vring_used_elem));
> > +	} else {
> > +		uint32_t part_1 = vq->size - used_idx_start;
> > +		uint32_t part_2 = vq->shadow_used_idx - part_1;
> > +
> > +		rte_memcpy(&vq->used->ring[used_idx_start],
> > +				&vq->shadow_used_ring[0],
> > +				part_1 *
> > +				sizeof(struct vring_used_elem));
> > +		vhost_log_used_vring(dev, vq,
> > +				offsetof(struct vring_used,
> > +					ring[used_idx_start]),
> > +				part_1 *
> > +				sizeof(struct vring_used_elem));
> > +		rte_memcpy(&vq->used->ring[0],
> > +				&vq->shadow_used_ring[part_1],
> > +				part_2 *
> > +				sizeof(struct vring_used_elem));
> > +		vhost_log_used_vring(dev, vq,
> > +				offsetof(struct vring_used,
> > +					ring[0]),
> > +				part_2 *
> > +				sizeof(struct vring_used_elem));
> > +	}
> >  }
> Is expanding the code done for performance purpose?

Hi Maxime,

Yes, theoretically this has the fewest branches.
And I think the logic is simpler this way.

Thanks
Zhihong

> Or maybe we could have a loop to do that?
> Something like this (not compiled, not tested):
> 
> static inline void __attribute__((always_inline))
> flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> {
> 	uint32_t to = used_idx_start;
> 	uint32_t from = 0;
> 	uint32_t count;
> 
> 	if (used_idx_start + vq->shadow_used_idx < vq->size)
> 		count = vq->shadow_used_idx;
> 	else
> 		count = vq->size - used_idx_start;
> 
> 	do {
> 		rte_memcpy(&vq->used->ring[to],
> 				&vq->shadow_used_ring[from],
> 				count * sizeof(struct vring_used_elem));
> 		vhost_log_used_vring(dev, vq,
> 				offsetof(struct vring_used, ring[to]),
> 				count * sizeof(struct vring_used_elem));
> 
> 		to = (to + count) & (vq->size - 1);
> 		from += count;
> 		count = vq->shadow_used_idx - count;
> 	} while (count);
> }
> 
> Regards,
> Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 2/6] vhost: rewrite enqueue
  2016-09-14  8:20       ` Wang, Zhihong
@ 2016-09-15 16:35         ` Maxime Coquelin
  0 siblings, 0 replies; 141+ messages in thread
From: Maxime Coquelin @ 2016-09-15 16:35 UTC (permalink / raw)
  To: Wang, Zhihong, dev; +Cc: yuanhan.liu, thomas.monjalon

Hi,
On 09/14/2016 10:20 AM, Wang, Zhihong wrote:
>>> +				desc_current =
>>> +				desc_current = vq->avail->ring[(vq->last_used_idx) &
>>> +					(vq->size - 1)];
>>> +				desc_chain_head = desc_current;
>>> +				desc = &vq->desc[desc_current];
>>> +				desc_addr = gpa_to_vva(dev, desc->addr);
>>> +				if (unlikely(!desc_addr))
>>> +					goto error;
>>>
>>> -			desc = &vq->desc[desc->next];
>>> -			desc_addr = gpa_to_vva(dev, desc->addr);
>>> -			if (unlikely(!desc_addr))
>>> -				return -1;
>>> -
>>> -			desc_offset = 0;
>>> -			desc_avail  = desc->len;
>>> +				desc_chain_len = 0;
>>> +				desc_offset = 0;
>> As I commented on v3, there is code duplication between next flag, and
>> mrg buf cases:
>> desc_offset = 0;
>>
>> and:
>>
>> desc = &vq->desc[desc_current];
>> desc_addr = gpa_to_vva(dev, desc->addr);
>> if (unlikely(!desc_addr))
>>      goto error;
>>
>
> Do you mean to add something like:
>
> static inline int __attribute__((always_inline))
> get_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>                 uint32_t desc_idx, struct vring_desc **desc,
>                 uint64_t *desc_addr)
> {
>         *desc = &vq->desc[desc_idx];
>         *desc_addr = gpa_to_vva(dev, (*desc)->addr);
>         if (unlikely(!(*desc_addr)))
>                 return -1;
>
>         return 0;
> }

I meant: move this code after the if/else.
You can factor it into a function if the same sequence is needed elsewhere
in the file.
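
A rough sketch of the restructured block (based on the v5 code quoted above;
the exact v6 hunk may differ):

	/* get the next desc if the current one is done */
	if (desc->len <= desc_offset) {
		if (desc->flags & VRING_DESC_F_NEXT) {
			/* go on with the current desc chain */
			desc_current = desc->next;
		} else if (is_mrg_rxbuf) {
			/* start with the next desc chain */
			update_used_ring(dev, vq, desc_chain_head,
					desc_chain_len);
			num_buffers++;
			virtio_hdr->num_buffers++;
			if (avail_idx == vq->last_used_idx)
				goto error;

			desc_current = vq->avail->ring[(vq->last_used_idx) &
					(vq->size - 1)];
			desc_chain_head = desc_current;
			desc_chain_len = 0;
		} else {
			goto error;
		}

		/* common tail: fetch the selected descriptor */
		desc_offset = 0;
		desc = &vq->desc[desc_current];
		desc_addr = gpa_to_vva(dev, desc->addr);
		if (unlikely(!desc_addr))
			goto error;
	}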

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 5/6] vhost: batch update used ring
  2016-09-14  8:43       ` Wang, Zhihong
@ 2016-09-15 16:38         ` Maxime Coquelin
  2016-09-18  2:55           ` Yuanhan Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Maxime Coquelin @ 2016-09-15 16:38 UTC (permalink / raw)
  To: Wang, Zhihong, dev; +Cc: yuanhan.liu, thomas.monjalon



On 09/14/2016 10:43 AM, Wang, Zhihong wrote:
>
>
>> -----Original Message-----
>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>> Sent: Monday, September 12, 2016 11:46 PM
>> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
>> Cc: yuanhan.liu@linux.intel.com; thomas.monjalon@6wind.com
>> Subject: Re: [PATCH v5 5/6] vhost: batch update used ring
>>
>>
>>
>> On 09/09/2016 05:39 AM, Zhihong Wang wrote:
>>> This patch enables batch update of the used ring for better efficiency.
>>>
>>> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
>>> ---
>>> Changes in v4:
>>>
>>>  1. Free shadow used ring in the right place.
>>>
>>>  2. Add failure check for shadow used ring malloc.
>>>
>>>  lib/librte_vhost/vhost.c      | 20 ++++++++++++--
>>>  lib/librte_vhost/vhost.h      |  4 +++
>>>  lib/librte_vhost/vhost_user.c | 31 +++++++++++++++++----
>>>  lib/librte_vhost/virtio_net.c | 64
>> +++++++++++++++++++++++++++++++++++--------
>>>  4 files changed, 101 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
>>> index 46095c3..cb31cdd 100644
>>> --- a/lib/librte_vhost/vhost.c
>>> +++ b/lib/librte_vhost/vhost.c
>>> @@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int
>> destroy)
>>>  static void
>>>  free_device(struct virtio_net *dev)
>>>  {
>>> +	struct vhost_virtqueue *vq_0;
>>> +	struct vhost_virtqueue *vq_1;
>>>  	uint32_t i;
>>>
>>> -	for (i = 0; i < dev->virt_qp_nb; i++)
>>> -		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
>>> +	for (i = 0; i < dev->virt_qp_nb; i++) {
>>> +		vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
>>> +		if (vq_0->shadow_used_ring) {
>>> +			rte_free(vq_0->shadow_used_ring);
>>> +			vq_0->shadow_used_ring = NULL;
>>> +		}
>>> +
>>> +		vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
>>> +		if (vq_1->shadow_used_ring) {
>>> +			rte_free(vq_1->shadow_used_ring);
>>> +			vq_1->shadow_used_ring = NULL;
>>> +		}
>>> +
>>> +		/* malloc together, free together */
>>> +		rte_free(vq_0);
>>> +	}
>>>
>>>  	rte_free(dev);
>>>  }
>>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
>>> index 9707dfc..381dc27 100644
>>> --- a/lib/librte_vhost/vhost.h
>>> +++ b/lib/librte_vhost/vhost.h
>>> @@ -85,6 +85,10 @@ struct vhost_virtqueue {
>>>
>>>  	/* Physical address of used ring, for logging */
>>>  	uint64_t		log_guest_addr;
>>> +
>>> +	/* Shadow used ring for performance */
>>> +	struct vring_used_elem	*shadow_used_ring;
>>> +	uint32_t		shadow_used_idx;
>>>  } __rte_cache_aligned;
>>>
>>>  /* Old kernels have no such macro defined */
>>> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
>>> index eee99e9..d7cf1ed 100644
>>> --- a/lib/librte_vhost/vhost_user.c
>>> +++ b/lib/librte_vhost/vhost_user.c
>>> @@ -193,7 +193,21 @@ static int
>>>  vhost_user_set_vring_num(struct virtio_net *dev,
>>>  			 struct vhost_vring_state *state)
>>>  {
>>> -	dev->virtqueue[state->index]->size = state->num;
>>> +	struct vhost_virtqueue *vq;
>>> +
>>> +	vq = dev->virtqueue[state->index];
>>> +	vq->size = state->num;
>>> +	if (!vq->shadow_used_ring) {
>>> +		vq->shadow_used_ring = rte_malloc(NULL,
>>> +				vq->size * sizeof(struct vring_used_elem),
>>> +				RTE_CACHE_LINE_SIZE);
>>> +		if (!vq->shadow_used_ring) {
>>> +			RTE_LOG(ERR, VHOST_CONFIG,
>>> +				"Failed to allocate memory"
>>> +				" for shadow used ring.\n");
>>> +			return -1;
>>> +		}
>>> +	}
>>>
>>>  	return 0;
>>>  }
>>> @@ -611,14 +625,21 @@ static int
>>>  vhost_user_get_vring_base(struct virtio_net *dev,
>>>  			  struct vhost_vring_state *state)
>>>  {
>>> +	struct vhost_virtqueue *vq;
>>> +
>>>  	/* We have to stop the queue (virtio) if it is running. */
>>>  	if (dev->flags & VIRTIO_DEV_RUNNING) {
>>>  		dev->flags &= ~VIRTIO_DEV_RUNNING;
>>>  		notify_ops->destroy_device(dev->vid);
>>>  	}
>>>
>>> +	vq = dev->virtqueue[state->index];
>>>  	/* Here we are safe to get the last used index */
>>> -	state->num = dev->virtqueue[state->index]->last_used_idx;
>>> +	state->num = vq->last_used_idx;
>>> +	if (vq->shadow_used_ring) {
>>> +		rte_free(vq->shadow_used_ring);
>>> +		vq->shadow_used_ring = NULL;
>>> +	}
>>>
>>>  	RTE_LOG(INFO, VHOST_CONFIG,
>>>  		"vring base idx:%d file:%d\n", state->index, state->num);
>>> @@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net
>> *dev,
>>>  	 * sent and only sent in vhost_vring_stop.
>>>  	 * TODO: cleanup the vring, it isn't usable since here.
>>>  	 */
>>> -	if (dev->virtqueue[state->index]->kickfd >= 0)
>>> -		close(dev->virtqueue[state->index]->kickfd);
>>> +	if (vq->kickfd >= 0)
>>> +		close(vq->kickfd);
>>>
>>> -	dev->virtqueue[state->index]->kickfd =
>> VIRTIO_UNINITIALIZED_EVENTFD;
>>> +	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
>>>
>>>  	return 0;
>>>  }
>>> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
>>> index b38f18f..e9f6353 100644
>>> --- a/lib/librte_vhost/virtio_net.c
>>> +++ b/lib/librte_vhost/virtio_net.c
>>> @@ -134,17 +134,52 @@ virtio_enqueue_offload(struct rte_mbuf
>> *m_buf, struct virtio_net_hdr *net_hdr)
>>>  }
>>>
>>>  static inline void __attribute__((always_inline))
>>> -update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>> -		uint32_t desc_chain_head, uint32_t desc_chain_len)
>>> +update_used_ring(struct vhost_virtqueue *vq, uint32_t
>> desc_chain_head,
>>> +		uint32_t desc_chain_len)
>>>  {
>>> -	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
>>> -
>>> -	vq->used->ring[used_idx].id = desc_chain_head;
>>> -	vq->used->ring[used_idx].len = desc_chain_len;
>>> +	vq->shadow_used_ring[vq->shadow_used_idx].id  =
>> desc_chain_head;
>>> +	vq->shadow_used_ring[vq->shadow_used_idx].len =
>> desc_chain_len;
>>> +	vq->shadow_used_idx++;
>>>  	vq->last_used_idx++;
>>> -	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
>>> -				ring[used_idx]),
>>> -			sizeof(vq->used->ring[used_idx]));
>>> +}
>>> +
>>> +static inline void __attribute__((always_inline))
>>> +flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>> +		uint32_t used_idx_start)
>>> +{
>>> +	if (used_idx_start + vq->shadow_used_idx < vq->size) {
>>> +		rte_memcpy(&vq->used->ring[used_idx_start],
>>> +				&vq->shadow_used_ring[0],
>>> +				vq->shadow_used_idx *
>>> +				sizeof(struct vring_used_elem));
>>> +		vhost_log_used_vring(dev, vq,
>>> +				offsetof(struct vring_used,
>>> +					ring[used_idx_start]),
>>> +				vq->shadow_used_idx *
>>> +				sizeof(struct vring_used_elem));
>>> +	} else {
>>> +		uint32_t part_1 = vq->size - used_idx_start;
>>> +		uint32_t part_2 = vq->shadow_used_idx - part_1;
>>> +
>>> +		rte_memcpy(&vq->used->ring[used_idx_start],
>>> +				&vq->shadow_used_ring[0],
>>> +				part_1 *
>>> +				sizeof(struct vring_used_elem));
>>> +		vhost_log_used_vring(dev, vq,
>>> +				offsetof(struct vring_used,
>>> +					ring[used_idx_start]),
>>> +				part_1 *
>>> +				sizeof(struct vring_used_elem));
>>> +		rte_memcpy(&vq->used->ring[0],
>>> +				&vq->shadow_used_ring[part_1],
>>> +				part_2 *
>>> +				sizeof(struct vring_used_elem));
>>> +		vhost_log_used_vring(dev, vq,
>>> +				offsetof(struct vring_used,
>>> +					ring[0]),
>>> +				part_2 *
>>> +				sizeof(struct vring_used_elem));
>>> +	}
>>>  }
>> Is expanding the code done for performance purpose?
>
> Hi Maxime,
>
> Yes theoretically this has the least branch number.
> And I think the logic is simpler this way.
Ok, in that case, maybe you could create a function to
do the rte_memcpy and the vhost_log_used on a given range.
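
Something like this, as a sketch (hypothetical name, not compiled, not
tested):

static inline void __attribute__((always_inline))
copy_used_ring_range(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint32_t to, uint32_t from, uint32_t count)
{
	/* write 'count' shadow entries into the shared used ring at 'to' */
	rte_memcpy(&vq->used->ring[to],
			&vq->shadow_used_ring[from],
			count * sizeof(struct vring_used_elem));
	/* log the same range for dirty-page tracking */
	vhost_log_used_vring(dev, vq,
			offsetof(struct vring_used, ring[to]),
			count * sizeof(struct vring_used_elem));
}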

I don't have a strong opinion on this; if Yuanhan is fine
with the current code, that's OK for me.
Thanks,
Maxime
>
> Thanks
> Zhihong
>
>> Or maybe we could have a loop to do that?
>> Something like this (not compiled, not tested):
>>
>> static inline void __attribute__((always_inline))
>> flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> {
>> 	uint32_t to = used_idx_start;
>> 	uint32_t from = 0;
>> 	uint32_t count;
>>
>> 	if (used_idx_start + vq->shadow_used_idx < vq->size)
>> 		count = vq->shadow_used_idx;
>> 	else
>> 		count = vq->size - used_idx_start;
>>
>> 	do {
>> 		rte_memcpy(&vq->used->ring[to],
>> 				&vq->shadow_used_ring[from],
>> 				count * sizeof(struct vring_used_elem));
>> 		vhost_log_used_vring(dev, vq,
>> 				offsetof(struct vring_used, ring[to]),
>> 				count * sizeof(struct vring_used_elem));
>>
>> 		to = (to + count) & (vq->size - 1);
>> 		from += count;
>> 		count = vq->shadow_used_idx - count;
>> 	} while (count);
>> }
>>
>> Regards,
>> Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 5/6] vhost: batch update used ring
  2016-09-15 16:38         ` Maxime Coquelin
@ 2016-09-18  2:55           ` Yuanhan Liu
  2016-09-18  2:57             ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-09-18  2:55 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: Wang, Zhihong, dev, thomas.monjalon

On Thu, Sep 15, 2016 at 06:38:06PM +0200, Maxime Coquelin wrote:
> >>>+static inline void __attribute__((always_inline))
> >>>+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> >>>+		uint32_t used_idx_start)
> >>>+{
> >>>+	if (used_idx_start + vq->shadow_used_idx < vq->size) {
> >>>+		rte_memcpy(&vq->used->ring[used_idx_start],
> >>>+				&vq->shadow_used_ring[0],
> >>>+				vq->shadow_used_idx *
> >>>+				sizeof(struct vring_used_elem));
> >>>+		vhost_log_used_vring(dev, vq,
> >>>+				offsetof(struct vring_used,
> >>>+					ring[used_idx_start]),
> >>>+				vq->shadow_used_idx *
> >>>+				sizeof(struct vring_used_elem));
> >>>+	} else {
> >>>+		uint32_t part_1 = vq->size - used_idx_start;
> >>>+		uint32_t part_2 = vq->shadow_used_idx - part_1;
> >>>+
> >>>+		rte_memcpy(&vq->used->ring[used_idx_start],
> >>>+				&vq->shadow_used_ring[0],
> >>>+				part_1 *
> >>>+				sizeof(struct vring_used_elem));
> >>>+		vhost_log_used_vring(dev, vq,
> >>>+				offsetof(struct vring_used,
> >>>+					ring[used_idx_start]),
> >>>+				part_1 *
> >>>+				sizeof(struct vring_used_elem));
> >>>+		rte_memcpy(&vq->used->ring[0],
> >>>+				&vq->shadow_used_ring[part_1],
> >>>+				part_2 *
> >>>+				sizeof(struct vring_used_elem));
> >>>+		vhost_log_used_vring(dev, vq,
> >>>+				offsetof(struct vring_used,
> >>>+					ring[0]),
> >>>+				part_2 *
> >>>+				sizeof(struct vring_used_elem));
> >>>+	}
> >>> }
> >>Is expanding the code done for performance purpose?
> >
> >Hi Maxime,
> >
> >Yes theoretically this has the least branch number.
> >And I think the logic is simpler this way.
> Ok, in that case, maybe you could create a function to
> do the rte_memcpy and the vhost_log_used on a given range.

Agreed, that will be better; it would avoid repeating a similar code
block 3 times.

> I don't have a strong opinion on this, if Yuanhan is fine
> with current code, that's ok for me.

From what I know, that's kind of the DPDK preferred way: to expand code
when necessary. For example, 9ec201f5d6e7 ("mbuf: provide bulk
allocation").

So I'm fine with it.

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 5/6] vhost: batch update used ring
  2016-09-18  2:55           ` Yuanhan Liu
@ 2016-09-18  2:57             ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-18  2:57 UTC (permalink / raw)
  To: Yuanhan Liu, Maxime Coquelin; +Cc: dev, thomas.monjalon



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Sunday, September 18, 2016 10:56 AM
> To: Maxime Coquelin <maxime.coquelin@redhat.com>
> Cc: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org;
> thomas.monjalon@6wind.com
> Subject: Re: [PATCH v5 5/6] vhost: batch update used ring
> 
> On Thu, Sep 15, 2016 at 06:38:06PM +0200, Maxime Coquelin wrote:
> > >>>+static inline void __attribute__((always_inline))
> > >>>+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > >>>+		uint32_t used_idx_start)
> > >>>+{
> > >>>+	if (used_idx_start + vq->shadow_used_idx < vq->size) {
> > >>>+		rte_memcpy(&vq->used->ring[used_idx_start],
> > >>>+				&vq->shadow_used_ring[0],
> > >>>+				vq->shadow_used_idx *
> > >>>+				sizeof(struct vring_used_elem));
> > >>>+		vhost_log_used_vring(dev, vq,
> > >>>+				offsetof(struct vring_used,
> > >>>+					ring[used_idx_start]),
> > >>>+				vq->shadow_used_idx *
> > >>>+				sizeof(struct vring_used_elem));
> > >>>+	} else {
> > >>>+		uint32_t part_1 = vq->size - used_idx_start;
> > >>>+		uint32_t part_2 = vq->shadow_used_idx - part_1;
> > >>>+
> > >>>+		rte_memcpy(&vq->used->ring[used_idx_start],
> > >>>+				&vq->shadow_used_ring[0],
> > >>>+				part_1 *
> > >>>+				sizeof(struct vring_used_elem));
> > >>>+		vhost_log_used_vring(dev, vq,
> > >>>+				offsetof(struct vring_used,
> > >>>+					ring[used_idx_start]),
> > >>>+				part_1 *
> > >>>+				sizeof(struct vring_used_elem));
> > >>>+		rte_memcpy(&vq->used->ring[0],
> > >>>+				&vq->shadow_used_ring[part_1],
> > >>>+				part_2 *
> > >>>+				sizeof(struct vring_used_elem));
> > >>>+		vhost_log_used_vring(dev, vq,
> > >>>+				offsetof(struct vring_used,
> > >>>+					ring[0]),
> > >>>+				part_2 *
> > >>>+				sizeof(struct vring_used_elem));
> > >>>+	}
> > >>> }
> > >>Is expanding the code done for performance purpose?
> > >
> > >Hi Maxime,
> > >
> > >Yes theoretically this has the least branch number.
> > >And I think the logic is simpler this way.
> > Ok, in that case, maybe you could create a function to
> > do the rte_memcpy and the vhost_log_used on a given range.
> 
> Agreed, that will be better; it could avoid repeating similar code
> block 3 times.

Okay. Thanks for the suggestion, Maxime and Yuanhan.

> 
> > I don't have a strong opinion on this, if Yuanhan is fine
> > with current code, that's ok for me.
> 
> From what I know, that's kind of DPDK prefered way, to expand code
> when necessary. For example, 9ec201f5d6e7 ("mbuf: provide bulk
> allocation").
> 
> So I'm fine with it.
> 
> 	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 2/6] vhost: rewrite enqueue
  2016-09-09  3:39   ` [PATCH v5 2/6] vhost: rewrite enqueue Zhihong Wang
  2016-09-12 15:42     ` Maxime Coquelin
  2016-09-12 16:26     ` Maxime Coquelin
@ 2016-09-18 14:19     ` Yuanhan Liu
  2016-09-19  3:29       ` Wang, Zhihong
  2 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-09-18 14:19 UTC (permalink / raw)
  To: Zhihong Wang; +Cc: dev, maxime.coquelin, thomas.monjalon

On Thu, Sep 08, 2016 at 11:39:24PM -0400, Zhihong Wang wrote:
> This patch implements the vhost logic from scratch into a single function
> designed for high performance and better maintainability.

As always, your commit log just states what has been done, but doesn't
tell why such changes have been made. For example, you said "it's designed
for high performance", so you'd better explain why your version would
deliver higher performance. You need a reason, as well as some numbers
(percent change) to prove it: it's not right to keep those numbers
internal; I'm sure people outside Intel are also willing and happy to know
them.

For this patch, I think it's more about the maintainability improvement
than about performance: the performance tuning patches come later in the
set, after all.

Another example is, in patch 6, you said "It reduces CPU pipeline stall
cycles significantly", but you didn't say why there is a pipeline stall
before and why your patch reduces it.

All those are important things that deserve some explanation. So, I'd
ask you to revisit all your patches in this set and think about what you
could add to make the commit messages better and more informative.

Besides that, I think this patchset looks fine to me. I may just need
one more pass to look at it more carefully, and then I think I can merge (v6).

BTW, thanks for the great work!

	--yliu

> This is the baseline version of the new code, more optimization will be
> added in the following patches in this patch set.
> 
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v5 2/6] vhost: rewrite enqueue
  2016-09-18 14:19     ` Yuanhan Liu
@ 2016-09-19  3:29       ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-19  3:29 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: dev, maxime.coquelin, thomas.monjalon



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Sunday, September 18, 2016 10:19 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org; maxime.coquelin@redhat.com;
> thomas.monjalon@6wind.com
> Subject: Re: [PATCH v5 2/6] vhost: rewrite enqueue
> 
> On Thu, Sep 08, 2016 at 11:39:24PM -0400, Zhihong Wang wrote:
> > This patch implements the vhost logic from scratch into a single function
> > designed for high performance and better maintainability.
> 
> As always, your commit log just states what have been done, but doesn't
> tell why such changes have been made. For example, you said "it's designed
> for high performance", then you'd better explain why your version would
> introduce high performance. You need a reason, as well as some numbers
> (percent change) to prove it: it's not that right to keep the numbers
> inside: I'm sure people outside intel are also willing and happy to know
> those numbers.
> 
> For this patch, I think it's more about the maintainability improvement
> but not performance: the performance tunning patches are done later
> after all.
> 
> Another example is, in patch 6, you said "It reduces CPU pipeline stall
> cycles significantly", but you didn't say why there is pipeline stall
> before and why your patch reduces it.
> 
> All those are important things that deserves some explanation. So, I'd
> ask you to re-visit all your patches in this set, to think what you
> could add to make the commit better and more informative.

Okay. I'll add more detailed commit logs.

> 
> Besides that, I think this patchset looks fine to me. I may just need
> another time to look it more carefully, then I think I can merge (v6).
> 
> BTW, thanks for the great work!
> 
> 	--yliu
> 
> > This is the baseline version of the new code, more optimization will be
> > added in the following patches in this patch set.
> >
> > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > ---

^ permalink raw reply	[flat|nested] 141+ messages in thread

* [PATCH v6 0/6] vhost: optimize enqueue
  2016-08-16  3:50 [PATCH] optimize vhost enqueue Zhihong Wang
                   ` (4 preceding siblings ...)
  2016-09-09  3:39 ` [PATCH v5 0/6] vhost: optimize enqueue Zhihong Wang
@ 2016-09-20  2:00 ` Zhihong Wang
  2016-09-20  2:00   ` [PATCH v6 1/6] vhost: fix windows vm hang Zhihong Wang
                     ` (7 more replies)
  5 siblings, 8 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-09-20  2:00 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon

This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

 *  Higher maximum throughput can be achieved for fast frontends like DPDK
    virtio pmd.

 *  Better scalability can be achieved: each vhost core can support
    more connections because it takes fewer cycles to handle each single
    frontend.

This patch set contains:

 1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.

 2. A baseline patch to rewrite the vhost logic.

 3. A series of optimization patches added upon the baseline.

The main optimization techniques are:

 1. Reorder code to reduce CPU pipeline stall cycles.

 2. Batch update the used ring for better efficiency.

 3. Prefetch descriptor to hide cache latency.

 4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.
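
To illustrate the batching idea (a simplified sketch based on patch 5/6 in
this set, not the literal hunk): each packet stages its used-ring element in
a host-local shadow ring, and the shared used ring is written back once per
burst.

	/* per packet: stage the element locally, no shared-memory write */
	vq->shadow_used_ring[vq->shadow_used_idx].id  = desc_chain_head;
	vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
	vq->shadow_used_idx++;
	vq->last_used_idx++;

	/* per burst: one bulk copy (two if the ring wraps) plus one log call */
	rte_memcpy(&vq->used->ring[used_idx_start], &vq->shadow_used_ring[0],
			vq->shadow_used_idx * sizeof(struct vring_used_elem));
	vhost_log_used_vring(dev, vq,
			offsetof(struct vring_used, ring[used_idx_start]),
			vq->shadow_used_idx * sizeof(struct vring_used_elem));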

In the existing code there're 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Besides, having 2 callback paths increases
the maintenance effort.

Also, there's a compatibility issue in the existing code which causes
Windows VMs to hang when the mrg_rxbuf feature is turned on.

---
Changes in v6:

 1. Merge duplicated code.

 2. Introduce a function for used ring write.

 3. Add necessary comments.

---
Changes in v5:

 1. Rebase to dpdk-next-virtio master.

 2. Rename variables to keep consistent in naming style.

 3. Small changes like return value adjustment and vertical alignment.

 4. Add details in commit log.

---
Changes in v4:

 1. Fix a Windows VM compatibility issue.

 2. Free shadow used ring in the right place.

 3. Add failure check for shadow used ring malloc.

 4. Refactor the code for clearer logic.

 5. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 3. Rewrite enqueue and delete the obsolete in the same patch.

---
Changes in v2:

 1. Split the big function into several small ones.

 2. Use multiple patches to explain each optimization.

 3. Add comments.

Zhihong Wang (6):
  vhost: fix windows vm hang
  vhost: rewrite enqueue
  vhost: remove useless volatile
  vhost: add desc prefetch
  vhost: batch update used ring
  vhost: optimize cache access

 lib/librte_vhost/vhost.c      |  20 +-
 lib/librte_vhost/vhost.h      |   6 +-
 lib/librte_vhost/vhost_user.c |  31 ++-
 lib/librte_vhost/virtio_net.c | 541 ++++++++++++++----------------------------
 4 files changed, 225 insertions(+), 373 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 141+ messages in thread

* [PATCH v6 1/6] vhost: fix windows vm hang
  2016-09-20  2:00 ` [PATCH v6 " Zhihong Wang
@ 2016-09-20  2:00   ` Zhihong Wang
  2016-10-13  6:18     ` [dpdk-stable] " Yuanhan Liu
  2016-09-20  2:00   ` [PATCH v6 2/6] vhost: rewrite enqueue Zhihong Wang
                     ` (6 subsequent siblings)
  7 siblings, 1 reply; 141+ messages in thread
From: Zhihong Wang @ 2016-09-20  2:00 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, stable, Zhihong Wang

This patch fixes a Windows VM compatibility issue in the DPDK 16.07 vhost
code, which causes the guest to hang once any packets are enqueued when
mrg_rxbuf is turned on, by setting the right id and len in the used ring.

As defined in virtio spec 0.95 and 1.0, in each used ring element, id means
the index of the start of the used descriptor chain, and len means the total
length of the descriptor chain which was written to. In the 16.07 code,
however, the index of the last descriptor is assigned to id, and the length
of the last descriptor is assigned to len.

How to test?

 1. Start testpmd in the host with a vhost port.

 2. Start a Windows VM image with qemu and connect to the vhost port.

 3. Start io forwarding with tx_first in host testpmd.

For 16.07 code, the Windows VM will hang once any packets are enqueued.

Cc: <stable@dpdk.org>
Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
Changes in v5:

 1. Add details in commit log.

 lib/librte_vhost/virtio_net.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..0d6e7d9 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint16_t start_idx = vq->last_used_idx;
 	uint16_t cur_idx = start_idx;
 	uint64_t desc_addr;
+	uint32_t desc_chain_head;
+	uint32_t desc_chain_len;
 	uint32_t mbuf_offset, mbuf_avail;
 	uint32_t desc_offset, desc_avail;
 	uint32_t cpy_len;
@@ -412,6 +414,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
 	desc_offset = dev->vhost_hlen;
+	desc_chain_head = buf_vec[vec_idx].desc_idx;
+	desc_chain_len = desc_offset;
 
 	mbuf_avail  = rte_pktmbuf_data_len(m);
 	mbuf_offset = 0;
@@ -419,19 +423,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		/* done with current desc buf, get the next one */
 		if (desc_avail == 0) {
 			desc_idx = buf_vec[vec_idx].desc_idx;
+			vec_idx++;
 
 			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
 				/* Update used ring with desc information */
 				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id  = desc_idx;
-				vq->used->ring[used_idx].len = desc_offset;
+				vq->used->ring[used_idx].id = desc_chain_head;
+				vq->used->ring[used_idx].len = desc_chain_len;
 				vhost_log_used_vring(dev, vq,
 					offsetof(struct vring_used,
 						 ring[used_idx]),
 					sizeof(vq->used->ring[used_idx]));
+				desc_chain_head = buf_vec[vec_idx].desc_idx;
+				desc_chain_len = 0;
 			}
 
-			vec_idx++;
 			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
 			if (unlikely(!desc_addr))
 				return 0;
@@ -463,11 +469,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		mbuf_offset += cpy_len;
 		desc_avail  -= cpy_len;
 		desc_offset += cpy_len;
+		desc_chain_len += cpy_len;
 	}
 
 	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-	vq->used->ring[used_idx].len = desc_offset;
+	vq->used->ring[used_idx].id = desc_chain_head;
+	vq->used->ring[used_idx].len = desc_chain_len;
 	vhost_log_used_vring(dev, vq,
 		offsetof(struct vring_used, ring[used_idx]),
 		sizeof(vq->used->ring[used_idx]));
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v6 2/6] vhost: rewrite enqueue
  2016-09-20  2:00 ` [PATCH v6 " Zhihong Wang
  2016-09-20  2:00   ` [PATCH v6 1/6] vhost: fix windows vm hang Zhihong Wang
@ 2016-09-20  2:00   ` Zhihong Wang
  2016-09-22  9:58     ` Jianbo Liu
  2016-09-20  2:00   ` [PATCH v6 3/6] vhost: remove useless volatile Zhihong Wang
                     ` (5 subsequent siblings)
  7 siblings, 1 reply; 141+ messages in thread
From: Zhihong Wang @ 2016-09-20  2:00 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch implements the vhost logic from scratch into a single function
to improve maintainability. This is the baseline version of the new code,
more optimization will be added in the following patches in this patch set.

In the existing code there're 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

Having 2 callback paths increases the maintenance effort. Also, the
performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
Changes in v6:

 1. Merge duplicated code.

 2. Add necessary comments.

---
Changes in v5:

 1. Rebase to dpdk-next-virtio master.

 2. Rename variables to keep consistent in naming style.

 3. Small changes like return value adjustment and vertical alignment.

---
Changes in v4:

 1. Refactor the code for clearer logic.

 2. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Rewrite enqueue and delete the obsolete in the same patch.

 lib/librte_vhost/virtio_net.c | 508 +++++++++++-------------------------------
 1 file changed, 134 insertions(+), 374 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0d6e7d9..0ada32b 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
 	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }
 
-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
 	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 						cksum));
 			break;
 		}
+	} else {
+		net_hdr->flags       = 0;
+		net_hdr->csum_start  = 0;
+		net_hdr->csum_offset = 0;
 	}
 
 	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,439 +126,195 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 		net_hdr->gso_size = m_buf->tso_segsz;
 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
 					+ m_buf->l4_len;
+	} else {
+		net_hdr->gso_type = 0;
+		net_hdr->hdr_len  = 0;
+		net_hdr->gso_size = 0;
 	}
 }
 
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-		    struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t desc_chain_head, uint32_t desc_chain_len)
 {
-	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-	else
-		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+	vq->used->ring[used_idx].id = desc_chain_head;
+	vq->used->ring[used_idx].len = desc_chain_len;
+	vq->last_used_idx++;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+				ring[used_idx]),
+			sizeof(vq->used->ring[used_idx]));
 }
 
 static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx)
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint16_t avail_idx, struct rte_mbuf *mbuf,
+		uint32_t is_mrg_rxbuf)
 {
-	uint32_t desc_avail, desc_offset;
-	uint32_t mbuf_avail, mbuf_offset;
-	uint32_t cpy_len;
+	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
 	struct vring_desc *desc;
 	uint64_t desc_addr;
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+	uint32_t desc_chain_head;
+	uint32_t desc_chain_len;
+	uint32_t desc_current;
+	uint32_t desc_offset;
+	uint32_t mbuf_len;
+	uint32_t mbuf_avail;
+	uint32_t cpy_len;
+	uint32_t num_buffers = 0;
 
-	desc = &vq->desc[desc_idx];
+	/* start with the first mbuf of the packet */
+	mbuf_len = rte_pktmbuf_data_len(mbuf);
+	mbuf_avail = mbuf_len;
+
+	/* get the current desc */
+	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+	desc_chain_head = desc_current;
+	desc = &vq->desc[desc_current];
 	desc_addr = gpa_to_vva(dev, desc->addr);
-	/*
-	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-	 * otherwise stores offset on the stack instead of in a register.
-	 */
-	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-		return -1;
+	if (unlikely(!desc_addr))
+		goto error;
 
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
+	/* handle virtio header */
+	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+	if (is_mrg_rxbuf)
+		virtio_hdr->num_buffers = 1;
 
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
 	desc_offset = dev->vhost_hlen;
-	desc_avail  = desc->len - dev->vhost_hlen;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current mbuf, fetch next */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
+	desc_chain_len = desc_offset;
+	desc_addr += desc_offset;
+
+	/* start copy from mbuf to desc */
+	while (mbuf_avail || mbuf->next) {
+		/* get the next mbuf if the current done */
+		if (!mbuf_avail) {
+			mbuf = mbuf->next;
+			mbuf_len = rte_pktmbuf_data_len(mbuf);
+			mbuf_avail = mbuf_len;
 		}
 
-		/* done with current desc buf, fetch next */
-		if (desc_avail == 0) {
-			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-				/* Room in vring buffer is not enough */
-				return -1;
-			}
-			if (unlikely(desc->next >= vq->size))
-				return -1;
+		/* get the next desc if the current done */
+		if (desc->len <= desc_offset) {
+			if (desc->flags & VRING_DESC_F_NEXT) {
+				/* go on with the current desc chain */
+				desc_current = desc->next;
+			} else if (is_mrg_rxbuf) {
+				/* start with the next desc chain */
+				update_used_ring(dev, vq, desc_chain_head,
+						desc_chain_len);
+				num_buffers++;
+				virtio_hdr->num_buffers++;
+				if (avail_idx == vq->last_used_idx)
+					goto error;
+
+				desc_current =
+					vq->avail->ring[(vq->last_used_idx) &
+					(vq->size - 1)];
+				desc_chain_head = desc_current;
+				desc_chain_len = 0;
+			} else
+				goto error;
 
-			desc = &vq->desc[desc->next];
+			desc_offset = 0;
+			desc = &vq->desc[desc_current];
 			desc_addr = gpa_to_vva(dev, desc->addr);
 			if (unlikely(!desc_addr))
-				return -1;
-
-			desc_offset = 0;
-			desc_avail  = desc->len;
+				goto error;
 		}
 
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
+		/* copy mbuf data */
+		cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+		rte_memcpy((void *)(uintptr_t)desc_addr,
+				rte_pktmbuf_mtod_offset(mbuf, void *,
+					mbuf_len - mbuf_avail),
+				cpy_len);
 		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			     cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-	}
-
-	return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
-	      struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint16_t avail_idx, free_entries, start_idx;
-	uint16_t desc_indexes[MAX_PKT_BURST];
-	uint16_t used_idx;
-	uint32_t i;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
-		return 0;
+		PRINT_PACKET(dev, (uintptr_t)desc_addr, cpy_len, 0);
+		mbuf_avail     -= cpy_len;
+		desc_addr      += cpy_len;
+		desc_offset    += cpy_len;
+		desc_chain_len += cpy_len;
 	}
 
-	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
-		return 0;
+	update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
 
-	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-	start_idx = vq->last_used_idx;
-	free_entries = avail_idx - start_idx;
-	count = RTE_MIN(count, free_entries);
-	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
-	if (count == 0)
-		return 0;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
-		dev->vid, start_idx, start_idx + count);
-
-	/* Retrieve all of the desc indexes first to avoid caching issues. */
-	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
-	for (i = 0; i < count; i++) {
-		used_idx = (start_idx + i) & (vq->size - 1);
-		desc_indexes[i] = vq->avail->ring[used_idx];
-		vq->used->ring[used_idx].id = desc_indexes[i];
-		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
-					       dev->vhost_hlen;
-		vhost_log_used_vring(dev, vq,
-			offsetof(struct vring_used, ring[used_idx]),
-			sizeof(vq->used->ring[used_idx]));
-	}
-
-	rte_prefetch0(&vq->desc[desc_indexes[0]]);
-	for (i = 0; i < count; i++) {
-		uint16_t desc_idx = desc_indexes[i];
-		int err;
+	return 0;
 
-		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
-		if (unlikely(err)) {
-			used_idx = (start_idx + i) & (vq->size - 1);
-			vq->used->ring[used_idx].len = dev->vhost_hlen;
-			vhost_log_used_vring(dev, vq,
-				offsetof(struct vring_used, ring[used_idx]),
-				sizeof(vq->used->ring[used_idx]));
-		}
+error:
+	/* rollback on any error if last_used_idx update on-the-fly */
+	vq->last_used_idx -= num_buffers;
 
-		if (i + 1 < count)
-			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
-	}
+	return -1;
+}
 
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	/* flush changes before updating used->idx */
 	rte_smp_wmb();
-
-	*(volatile uint16_t *)&vq->used->idx += count;
-	vq->last_used_idx += count;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, idx),
-		sizeof(vq->used->idx));
-
-	/* flush used->idx update before we read avail->flags. */
+	*(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+			sizeof(vq->used->idx));
+	/* flush used->idx update before reading avail->flags */
 	rte_mb();
-
-	/* Kick the guest if necessary. */
+	/* kick the guest if necessary */
 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 			&& (vq->callfd >= 0))
 		eventfd_write(vq->callfd, (eventfd_t)1);
-	return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-	     uint32_t *allocated, uint32_t *vec_idx,
-	     struct buf_vector *buf_vec)
-{
-	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
-	uint32_t vec_id = *vec_idx;
-	uint32_t len    = *allocated;
-
-	while (1) {
-		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
-			return -1;
-
-		len += vq->desc[idx].len;
-		buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
-		buf_vec[vec_id].buf_len  = vq->desc[idx].len;
-		buf_vec[vec_id].desc_idx = idx;
-		vec_id++;
-
-		if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
-			break;
-
-		idx = vq->desc[idx].next;
-	}
-
-	*allocated = len;
-	*vec_idx   = vec_id;
-
-	return 0;
 }
 
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-			    uint16_t *end, struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint16_t count)
 {
-	uint16_t cur_idx;
+	struct vhost_virtqueue *vq;
+	struct virtio_net *dev;
+	uint32_t is_mrg_rxbuf = 0;
+	uint32_t pkt_idx      = 0;
+	uint32_t pkt_left     = count;
 	uint16_t avail_idx;
-	uint32_t allocated = 0;
-	uint32_t vec_idx = 0;
-	uint16_t tries = 0;
-
-	cur_idx  = vq->last_used_idx;
-
-	while (1) {
-		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-		if (unlikely(cur_idx == avail_idx))
-			return -1;
-
-		if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
-					  &vec_idx, buf_vec) < 0))
-			return -1;
-
-		cur_idx++;
-		tries++;
-
-		if (allocated >= size)
-			break;
-
-		/*
-		 * if we tried all available ring items, and still
-		 * can't get enough buf, it means something abnormal
-		 * happened.
-		 */
-		if (unlikely(tries >= vq->size))
-			return -1;
-	}
 
-	*end = cur_idx;
-	return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			    uint16_t end_idx, struct rte_mbuf *m,
-			    struct buf_vector *buf_vec)
-{
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-	uint32_t vec_idx = 0;
-	uint16_t start_idx = vq->last_used_idx;
-	uint16_t cur_idx = start_idx;
-	uint64_t desc_addr;
-	uint32_t desc_chain_head;
-	uint32_t desc_chain_len;
-	uint32_t mbuf_offset, mbuf_avail;
-	uint32_t desc_offset, desc_avail;
-	uint32_t cpy_len;
-	uint16_t desc_idx, used_idx;
-
-	if (unlikely(m == NULL))
+	if (unlikely(!pkt_left))
 		return 0;
 
-	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
-		dev->vid, cur_idx, end_idx);
+	pkt_left = RTE_MIN((uint32_t)MAX_PKT_BURST, pkt_left);
 
-	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+	dev = get_device(vid);
+	if (unlikely(!dev))
 		return 0;
 
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-	virtio_hdr.num_buffers = end_idx - start_idx;
-	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-		dev->vid, virtio_hdr.num_buffers);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-	desc_offset = dev->vhost_hlen;
-	desc_chain_head = buf_vec[vec_idx].desc_idx;
-	desc_chain_len = desc_offset;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current desc buf, get the next one */
-		if (desc_avail == 0) {
-			desc_idx = buf_vec[vec_idx].desc_idx;
-			vec_idx++;
-
-			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
-				/* Update used ring with desc information */
-				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id = desc_chain_head;
-				vq->used->ring[used_idx].len = desc_chain_len;
-				vhost_log_used_vring(dev, vq,
-					offsetof(struct vring_used,
-						 ring[used_idx]),
-					sizeof(vq->used->ring[used_idx]));
-				desc_chain_head = buf_vec[vec_idx].desc_idx;
-				desc_chain_len = 0;
-			}
-
-			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-			if (unlikely(!desc_addr))
-				return 0;
-
-			/* Prefetch buffer address. */
-			rte_prefetch0((void *)(uintptr_t)desc_addr);
-			desc_offset = 0;
-			desc_avail  = buf_vec[vec_idx].buf_len;
-		}
-
-		/* done with current mbuf, get the next one */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
-		}
-
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
-			cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-		desc_chain_len += cpy_len;
-	}
-
-	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = desc_chain_head;
-	vq->used->ring[used_idx].len = desc_chain_len;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, ring[used_idx]),
-		sizeof(vq->used->ring[used_idx]));
-
-	return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint32_t pkt_idx = 0, nr_used = 0;
-	uint16_t end;
-	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
 		return 0;
-	}
 
 	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
+	if (unlikely(!vq->enabled))
 		return 0;
 
-	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
-	if (count == 0)
-		return 0;
-
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+	if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+		is_mrg_rxbuf = 1;
 
-		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
-							 &end, buf_vec) < 0)) {
-			LOG_DEBUG(VHOST_DATA,
-				"(%d) failed to get enough desc from vring\n",
-				dev->vid);
+	/* start enqueuing packets 1 by 1 */
+	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+	while (pkt_left && avail_idx != vq->last_used_idx) {
+		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+					is_mrg_rxbuf))
 			break;
-		}
-
-		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
-						      pkts[pkt_idx], buf_vec);
-		rte_smp_wmb();
 
-		*(volatile uint16_t *)&vq->used->idx += nr_used;
-		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-			sizeof(vq->used->idx));
-		vq->last_used_idx += nr_used;
+		pkt_idx++;
+		pkt_left--;
 	}
 
-	if (likely(pkt_idx)) {
-		/* flush used->idx update before we read avail->flags. */
-		rte_mb();
-
-		/* Kick the guest if necessary. */
-		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-				&& (vq->callfd >= 0))
-			eventfd_write(vq->callfd, (eventfd_t)1);
-	}
+	/* update used idx and kick the guest if necessary */
+	if (pkt_idx)
+		notify_guest(dev, vq);
 
 	return pkt_idx;
 }
 
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint16_t count)
-{
-	struct virtio_net *dev = get_device(vid);
-
-	if (!dev)
-		return 0;
-
-	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
-		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
-	else
-		return virtio_dev_rx(dev, queue_id, pkts, count);
-}
-
 static void
 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
 {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v6 3/6] vhost: remove useless volatile
  2016-09-20  2:00 ` [PATCH v6 " Zhihong Wang
  2016-09-20  2:00   ` [PATCH v6 1/6] vhost: fix windows vm hang Zhihong Wang
  2016-09-20  2:00   ` [PATCH v6 2/6] vhost: rewrite enqueue Zhihong Wang
@ 2016-09-20  2:00   ` Zhihong Wang
  2016-09-20  2:00   ` [PATCH v6 4/6] vhost: add desc prefetch Zhihong Wang
                     ` (4 subsequent siblings)
  7 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-09-20  2:00 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch removes the useless volatile attribute from last_used_idx to allow
compiler optimization.
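
Note that the guest-visible used->idx is still updated through a volatile
cast in the enqueue path, so only vhost's private last_used_idx bookkeeping
loses the qualifier.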

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index c2dfc3c..9707dfc 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
 	uint32_t		size;
 
 	/* Last index used on the available ring */
-	volatile uint16_t	last_used_idx;
+	uint16_t		last_used_idx;
 #define VIRTIO_INVALID_EVENTFD		(-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD	(-2)
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v6 4/6] vhost: add desc prefetch
  2016-09-20  2:00 ` [PATCH v6 " Zhihong Wang
                     ` (2 preceding siblings ...)
  2016-09-20  2:00   ` [PATCH v6 3/6] vhost: remove useless volatile Zhihong Wang
@ 2016-09-20  2:00   ` Zhihong Wang
  2016-09-20  2:00   ` [PATCH v6 5/6] vhost: batch update used ring Zhihong Wang
                     ` (3 subsequent siblings)
  7 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-09-20  2:00 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch adds descriptor prefetch to hide cache access latency.
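
Concretely, while one packet is being enqueued, the head descriptor of the
next packet (looked up through the avail ring) is prefetched, so its cache
line is already warm when the following iteration needs it.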

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0ada32b..f32a143 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -300,6 +300,12 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 	/* start enqueuing packets 1 by 1 */
 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
 	while (pkt_left && avail_idx != vq->last_used_idx) {
+		/* prefetch the next desc */
+		if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+			rte_prefetch0(&vq->desc[vq->avail->ring[
+					(vq->last_used_idx + 1) &
+					(vq->size - 1)]]);
+
 		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
 					is_mrg_rxbuf))
 			break;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v6 5/6] vhost: batch update used ring
  2016-09-20  2:00 ` [PATCH v6 " Zhihong Wang
                     ` (3 preceding siblings ...)
  2016-09-20  2:00   ` [PATCH v6 4/6] vhost: add desc prefetch Zhihong Wang
@ 2016-09-20  2:00   ` Zhihong Wang
  2016-09-20  2:00   ` [PATCH v6 6/6] vhost: optimize cache access Zhihong Wang
                     ` (2 subsequent siblings)
  7 siblings, 0 replies; 141+ messages in thread
From: Zhihong Wang @ 2016-09-20  2:00 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch enables batch update of the used ring for better efficiency.
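
As a worked illustration of the flush logic below (ring size and index values
are assumed for the example only): used-ring entries produced during a burst
are first collected in a per-queue shadow ring, and flush_used_ring() then
writes them to the real used ring with at most two rte_memcpy() calls. With a
ring of 256 entries, a flush starting at used index 250 with 10 shadow entries
becomes one copy of 6 entries at index 250 followed by one copy of 4 entries
at index 0.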

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
Changes in v6:

 1. Introduce a function for used ring write.

---
Changes in v4:

 1. Free shadow used ring in the right place.

 2. Add failure check for shadow used ring malloc.

 lib/librte_vhost/vhost.c      | 20 +++++++++++++++--
 lib/librte_vhost/vhost.h      |  4 ++++
 lib/librte_vhost/vhost_user.c | 31 +++++++++++++++++++++-----
 lib/librte_vhost/virtio_net.c | 52 ++++++++++++++++++++++++++++++++++---------
 4 files changed, 89 insertions(+), 18 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..cb31cdd 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 free_device(struct virtio_net *dev)
 {
+	struct vhost_virtqueue *vq_0;
+	struct vhost_virtqueue *vq_1;
 	uint32_t i;
 
-	for (i = 0; i < dev->virt_qp_nb; i++)
-		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+	for (i = 0; i < dev->virt_qp_nb; i++) {
+		vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+		if (vq_0->shadow_used_ring) {
+			rte_free(vq_0->shadow_used_ring);
+			vq_0->shadow_used_ring = NULL;
+		}
+
+		vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+		if (vq_1->shadow_used_ring) {
+			rte_free(vq_1->shadow_used_ring);
+			vq_1->shadow_used_ring = NULL;
+		}
+
+		/* malloc together, free together */
+		rte_free(vq_0);
+	}
 
 	rte_free(dev);
 }
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 9707dfc..381dc27 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {
 
 	/* Physical address of used ring, for logging */
 	uint64_t		log_guest_addr;
+
+	/* Shadow used ring for performance */
+	struct vring_used_elem	*shadow_used_ring;
+	uint32_t		shadow_used_idx;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index eee99e9..d7cf1ed 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -193,7 +193,21 @@ static int
 vhost_user_set_vring_num(struct virtio_net *dev,
 			 struct vhost_vring_state *state)
 {
-	dev->virtqueue[state->index]->size = state->num;
+	struct vhost_virtqueue *vq;
+
+	vq = dev->virtqueue[state->index];
+	vq->size = state->num;
+	if (!vq->shadow_used_ring) {
+		vq->shadow_used_ring = rte_malloc(NULL,
+				vq->size * sizeof(struct vring_used_elem),
+				RTE_CACHE_LINE_SIZE);
+		if (!vq->shadow_used_ring) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"Failed to allocate memory"
+				" for shadow used ring.\n");
+			return -1;
+		}
+	}
 
 	return 0;
 }
@@ -611,14 +625,21 @@ static int
 vhost_user_get_vring_base(struct virtio_net *dev,
 			  struct vhost_vring_state *state)
 {
+	struct vhost_virtqueue *vq;
+
 	/* We have to stop the queue (virtio) if it is running. */
 	if (dev->flags & VIRTIO_DEV_RUNNING) {
 		dev->flags &= ~VIRTIO_DEV_RUNNING;
 		notify_ops->destroy_device(dev->vid);
 	}
 
+	vq = dev->virtqueue[state->index];
 	/* Here we are safe to get the last used index */
-	state->num = dev->virtqueue[state->index]->last_used_idx;
+	state->num = vq->last_used_idx;
+	if (vq->shadow_used_ring) {
+		rte_free(vq->shadow_used_ring);
+		vq->shadow_used_ring = NULL;
+	}
 
 	RTE_LOG(INFO, VHOST_CONFIG,
 		"vring base idx:%d file:%d\n", state->index, state->num);
@@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net *dev,
 	 * sent and only sent in vhost_vring_stop.
 	 * TODO: cleanup the vring, it isn't usable since here.
 	 */
-	if (dev->virtqueue[state->index]->kickfd >= 0)
-		close(dev->virtqueue[state->index]->kickfd);
+	if (vq->kickfd >= 0)
+		close(vq->kickfd);
 
-	dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
 
 	return 0;
 }
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index f32a143..8f2882b 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -134,17 +134,40 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 }
 
 static inline void __attribute__((always_inline))
-update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		uint32_t desc_chain_head, uint32_t desc_chain_len)
+update_used_ring(struct vhost_virtqueue *vq, uint32_t desc_chain_head,
+		uint32_t desc_chain_len)
 {
-	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
-
-	vq->used->ring[used_idx].id = desc_chain_head;
-	vq->used->ring[used_idx].len = desc_chain_len;
+	vq->shadow_used_ring[vq->shadow_used_idx].id  = desc_chain_head;
+	vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+	vq->shadow_used_idx++;
 	vq->last_used_idx++;
-	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
-				ring[used_idx]),
-			sizeof(vq->used->ring[used_idx]));
+}
+
+static inline void __attribute__((always_inline))
+write_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t used_idx, uint32_t used_idx_shadow, uint32_t size)
+{
+	rte_memcpy(&vq->used->ring[used_idx],
+			&vq->shadow_used_ring[used_idx_shadow],
+			size * sizeof(struct vring_used_elem));
+	vhost_log_used_vring(dev, vq,
+			offsetof(struct vring_used, ring[used_idx]),
+			size * sizeof(struct vring_used_elem));
+}
+
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t used_idx)
+{
+	if (used_idx + vq->shadow_used_idx < vq->size) {
+		write_used_ring(dev, vq, used_idx, 0, vq->shadow_used_idx);
+	} else {
+		uint32_t size_0 = vq->size - used_idx;
+		uint32_t size_1 = vq->shadow_used_idx - size_0;
+
+		write_used_ring(dev, vq, used_idx, 0, size_0);
+		write_used_ring(dev, vq, 0, size_0, size_1);
+	}
 }
 
 static inline int __attribute__((always_inline))
@@ -204,7 +227,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				desc_current = desc->next;
 			} else if (is_mrg_rxbuf) {
 				/* start with the next desc chain */
-				update_used_ring(dev, vq, desc_chain_head,
+				update_used_ring(vq, desc_chain_head,
 						desc_chain_len);
 				num_buffers++;
 				virtio_hdr->num_buffers++;
@@ -240,7 +263,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		desc_chain_len += cpy_len;
 	}
 
-	update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
+	update_used_ring(vq, desc_chain_head, desc_chain_len);
 
 	return 0;
 
@@ -273,6 +296,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 {
 	struct vhost_virtqueue *vq;
 	struct virtio_net *dev;
+	uint32_t used_idx;
 	uint32_t is_mrg_rxbuf = 0;
 	uint32_t pkt_idx      = 0;
 	uint32_t pkt_left     = count;
@@ -298,6 +322,8 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 		is_mrg_rxbuf = 1;
 
 	/* start enqueuing packets 1 by 1 */
+	vq->shadow_used_idx = 0;
+	used_idx = vq->last_used_idx & (vq->size - 1);
 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
 	while (pkt_left && avail_idx != vq->last_used_idx) {
 		/* prefetch the next desc */
@@ -314,6 +340,10 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 		pkt_left--;
 	}
 
+	/* batch update used ring for better performance */
+	if (likely(vq->shadow_used_idx > 0))
+		flush_used_ring(dev, vq, used_idx);
+
 	/* update used idx and kick the guest if necessary */
 	if (pkt_idx)
 		notify_guest(dev, vq);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v6 6/6] vhost: optimize cache access
  2016-09-20  2:00 ` [PATCH v6 " Zhihong Wang
                     ` (4 preceding siblings ...)
  2016-09-20  2:00   ` [PATCH v6 5/6] vhost: batch update used ring Zhihong Wang
@ 2016-09-20  2:00   ` Zhihong Wang
  2016-09-21  4:32     ` Maxime Coquelin
  2016-09-21  2:26   ` [PATCH v6 0/6] vhost: optimize enqueue Yuanhan Liu
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
  7 siblings, 1 reply; 141+ messages in thread
From: Zhihong Wang @ 2016-09-20  2:00 UTC (permalink / raw)
  To: dev; +Cc: maxime.coquelin, yuanhan.liu, thomas.monjalon, Zhihong Wang

This patch reorders the code to delay the virtio header write, to improve
cache access efficiency for cases where the mrg_rxbuf feature is turned
on. CPU pipeline stall cycles can be significantly reduced.

The virtio header write and the mbuf data copies are all remote store
operations, which take a long time to finish. It's a good idea to put them
together to remove bubbles in between, to let as many remote store
instructions as possible enter the store buffer at the same time to hide
latency, and to let the H/W prefetcher go to work as early as possible.

On a Haswell machine, about 100 cycles can be saved per packet by this
patch alone. Taking 64B packet traffic as an example, this means about a 60%
efficiency improvement for the enqueue operation.
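
Concretely, in the diff below the header is no longer written as soon as the
first descriptor is resolved; a copy_virtio_hdr flag defers the write into the
copy loop, so the header store is issued back to back with the first mbuf data
copy.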

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 lib/librte_vhost/virtio_net.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8f2882b..11a2c1a 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -185,6 +185,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t mbuf_len;
 	uint32_t mbuf_avail;
 	uint32_t cpy_len;
+	uint32_t copy_virtio_hdr;
 	uint32_t num_buffers = 0;
 
 	/* start with the first mbuf of the packet */
@@ -199,12 +200,12 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (unlikely(!desc_addr))
 		goto error;
 
-	/* handle virtio header */
+	/*
+	 * handle virtio header, the actual write operation is delayed
+	 * for cache optimization, to reduce CPU pipeline stall cycles.
+	 */
 	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
-	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
-	if (is_mrg_rxbuf)
-		virtio_hdr->num_buffers = 1;
-
+	copy_virtio_hdr = 1;
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
 	desc_offset = dev->vhost_hlen;
@@ -249,8 +250,15 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				goto error;
 		}
 
-		/* copy mbuf data */
+		/* copy virtio header and mbuf data */
 		cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+		if (copy_virtio_hdr) {
+			copy_virtio_hdr = 0;
+			virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+			if (is_mrg_rxbuf)
+				virtio_hdr->num_buffers = num_buffers + 1;
+		}
+
 		rte_memcpy((void *)(uintptr_t)desc_addr,
 				rte_pktmbuf_mtod_offset(mbuf, void *,
 					mbuf_len - mbuf_avail),
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* Re: [PATCH v6 0/6] vhost: optimize enqueue
  2016-09-20  2:00 ` [PATCH v6 " Zhihong Wang
                     ` (5 preceding siblings ...)
  2016-09-20  2:00   ` [PATCH v6 6/6] vhost: optimize cache access Zhihong Wang
@ 2016-09-21  2:26   ` Yuanhan Liu
  2016-09-21  4:39     ` Maxime Coquelin
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
  7 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-09-21  2:26 UTC (permalink / raw)
  To: maxime.coquelin; +Cc: Zhihong Wang, dev, thomas.monjalon

Hi Maxime,

Do you have more comments about this set? If no, I think I could merge
it shortly.

Thanks.

	--yliu

On Mon, Sep 19, 2016 at 10:00:11PM -0400, Zhihong Wang wrote:
> This patch set optimizes the vhost enqueue function.
> 
> It implements the vhost logic from scratch into a single function designed
> for high performance and good maintainability, and improves CPU efficiency
> significantly by optimizing cache access, which means:
> 
>  *  Higher maximum throughput can be achieved for fast frontends like DPDK
>     virtio pmd.
> 
>  *  Better scalability can be achieved that each vhost core can support
>     more connections because it takes less cycles to handle each single
>     frontend.
> 
> This patch set contains:
> 
>  1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.
> 
>  2. A baseline patch to rewrite the vhost logic.
> 
>  3. A series of optimization patches added upon the baseline.
> 
> The main optimization techniques are:
> 
>  1. Reorder code to reduce CPU pipeline stall cycles.
> 
>  2. Batch update the used ring for better efficiency.
> 
>  3. Prefetch descriptor to hide cache latency.
> 
>  4. Remove useless volatile attribute to allow compiler optimization.
> 
> Code reordering and batch used ring update bring most of the performance
> improvements.
> 
> In the existing code there're 2 callbacks for vhost enqueue:
> 
>  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
> 
>  *  virtio_dev_rx for mrg_rxbuf turned off cases.
> 
> The performance of the existing code is not optimal, especially when the
> mrg_rxbuf feature turned on. Besides, having 2 callback paths increases
> maintenance efforts.
> 
> Also, there's a compatibility issue in the existing code which causes
> Windows VM to hang when the mrg_rxbuf feature turned on.
> 
> ---
> Changes in v6:
> 
>  1. Merge duplicated code.
> 
>  2. Introduce a function for used ring write.
> 
>  3. Add necessary comments.
> 
> ---
> Changes in v5:
> 
>  1. Rebase to dpdk-next-virtio master.
> 
>  2. Rename variables to keep consistent in naming style.
> 
>  3. Small changes like return value adjustment and vertical alignment.
> 
>  4. Add details in commit log.
> 
> ---
> Changes in v4:
> 
>  1. Fix a Windows VM compatibility issue.
> 
>  2. Free shadow used ring in the right place.
> 
>  3. Add failure check for shadow used ring malloc.
> 
>  4. Refactor the code for clearer logic.
> 
>  5. Add PRINT_PACKET for debugging.
> 
> ---
> Changes in v3:
> 
>  1. Remove unnecessary memset which causes frontend stall on SNB & IVB.
> 
>  2. Rename variables to follow naming convention.
> 
>  3. Rewrite enqueue and delete the obsolete in the same patch.
> 
> ---
> Changes in v2:
> 
>  1. Split the big function into several small ones.
> 
>  2. Use multiple patches to explain each optimization.
> 
>  3. Add comments.
> 
> Zhihong Wang (6):
>   vhost: fix windows vm hang
>   vhost: rewrite enqueue
>   vhost: remove useless volatile
>   vhost: add desc prefetch
>   vhost: batch update used ring
>   vhost: optimize cache access
> 
>  lib/librte_vhost/vhost.c      |  20 +-
>  lib/librte_vhost/vhost.h      |   6 +-
>  lib/librte_vhost/vhost_user.c |  31 ++-
>  lib/librte_vhost/virtio_net.c | 541 ++++++++++++++----------------------------
>  4 files changed, 225 insertions(+), 373 deletions(-)
> 
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v6 6/6] vhost: optimize cache access
  2016-09-20  2:00   ` [PATCH v6 6/6] vhost: optimize cache access Zhihong Wang
@ 2016-09-21  4:32     ` Maxime Coquelin
  0 siblings, 0 replies; 141+ messages in thread
From: Maxime Coquelin @ 2016-09-21  4:32 UTC (permalink / raw)
  To: Zhihong Wang, dev; +Cc: yuanhan.liu, thomas.monjalon



On 09/20/2016 04:00 AM, Zhihong Wang wrote:
> This patch reorders the code to delay virtio header write to improve
> cache access efficiency for cases where the mrg_rxbuf feature is turned
> on. CPU pipeline stall cycles can be significantly reduced.
>
> Virtio header write and mbuf data copy are all remote store operations
> which takes a long time to finish. It's a good idea to put them together
> to remove bubbles in between, to let as many remote store instructions
> as possible go into store buffer at the same time to hide latency, and
> to let the H/W prefetcher goes to work as early as possible.
>
> On a Haswell machine, about 100 cycles can be saved per packet by this
> patch alone. Taking 64B packets traffic for example, this means about 60%
> efficiency improvement for the enqueue operation.

Thanks for the detailed information, I appreciate it.

Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v6 0/6] vhost: optimize enqueue
  2016-09-21  2:26   ` [PATCH v6 0/6] vhost: optimize enqueue Yuanhan Liu
@ 2016-09-21  4:39     ` Maxime Coquelin
  0 siblings, 0 replies; 141+ messages in thread
From: Maxime Coquelin @ 2016-09-21  4:39 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: Zhihong Wang, dev, thomas.monjalon

Hi Yuanhan,

On 09/21/2016 04:26 AM, Yuanhan Liu wrote:
> Hi Maxime,
>
> Do you have more comments about this set? If no, I think I could merge
> it shortly.

No more comments, this is good to me.

Feel free to add:
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime

> Thanks.
>
> 	--yliu
>
> On Mon, Sep 19, 2016 at 10:00:11PM -0400, Zhihong Wang wrote:
>> This patch set optimizes the vhost enqueue function.
>>
>> It implements the vhost logic from scratch into a single function designed
>> for high performance and good maintainability, and improves CPU efficiency
>> significantly by optimizing cache access, which means:
>>
>>  *  Higher maximum throughput can be achieved for fast frontends like DPDK
>>     virtio pmd.
>>
>>  *  Better scalability can be achieved that each vhost core can support
>>     more connections because it takes less cycles to handle each single
>>     frontend.
>>
>> This patch set contains:
>>
>>  1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.
>>
>>  2. A baseline patch to rewrite the vhost logic.
>>
>>  3. A series of optimization patches added upon the baseline.
>>
>> The main optimization techniques are:
>>
>>  1. Reorder code to reduce CPU pipeline stall cycles.
>>
>>  2. Batch update the used ring for better efficiency.
>>
>>  3. Prefetch descriptor to hide cache latency.
>>
>>  4. Remove useless volatile attribute to allow compiler optimization.
>>
>> Code reordering and batch used ring update bring most of the performance
>> improvements.
>>
>> In the existing code there're 2 callbacks for vhost enqueue:
>>
>>  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
>>
>>  *  virtio_dev_rx for mrg_rxbuf turned off cases.
>>
>> The performance of the existing code is not optimal, especially when the
>> mrg_rxbuf feature turned on. Besides, having 2 callback paths increases
>> maintenance efforts.
>>
>> Also, there's a compatibility issue in the existing code which causes
>> Windows VM to hang when the mrg_rxbuf feature turned on.
>>
>> ---
>> Changes in v6:
>>
>>  1. Merge duplicated code.
>>
>>  2. Introduce a function for used ring write.
>>
>>  3. Add necessary comments.
>>
>> ---
>> Changes in v5:
>>
>>  1. Rebase to dpdk-next-virtio master.
>>
>>  2. Rename variables to keep consistent in naming style.
>>
>>  3. Small changes like return value adjustment and vertical alignment.
>>
>>  4. Add details in commit log.
>>
>> ---
>> Changes in v4:
>>
>>  1. Fix a Windows VM compatibility issue.
>>
>>  2. Free shadow used ring in the right place.
>>
>>  3. Add failure check for shadow used ring malloc.
>>
>>  4. Refactor the code for clearer logic.
>>
>>  5. Add PRINT_PACKET for debugging.
>>
>> ---
>> Changes in v3:
>>
>>  1. Remove unnecessary memset which causes frontend stall on SNB & IVB.
>>
>>  2. Rename variables to follow naming convention.
>>
>>  3. Rewrite enqueue and delete the obsolete in the same patch.
>>
>> ---
>> Changes in v2:
>>
>>  1. Split the big function into several small ones.
>>
>>  2. Use multiple patches to explain each optimization.
>>
>>  3. Add comments.
>>
>> Zhihong Wang (6):
>>   vhost: fix windows vm hang
>>   vhost: rewrite enqueue
>>   vhost: remove useless volatile
>>   vhost: add desc prefetch
>>   vhost: batch update used ring
>>   vhost: optimize cache access
>>
>>  lib/librte_vhost/vhost.c      |  20 +-
>>  lib/librte_vhost/vhost.h      |   6 +-
>>  lib/librte_vhost/vhost_user.c |  31 ++-
>>  lib/librte_vhost/virtio_net.c | 541 ++++++++++++++----------------------------
>>  4 files changed, 225 insertions(+), 373 deletions(-)
>>
>> --
>> 2.7.4

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-08-22  8:11   ` [PATCH v3 0/5] vhost: optimize enqueue Maxime Coquelin
  2016-08-22 10:01     ` Maxime Coquelin
  2016-08-23  2:15     ` Wang, Zhihong
@ 2016-09-21  8:50     ` Jianbo Liu
  2016-09-21  9:27       ` Wang, Zhihong
  2 siblings, 1 reply; 141+ messages in thread
From: Jianbo Liu @ 2016-09-21  8:50 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: Zhihong Wang, dev, yuanhan.liu

Hi Maxime,

On 22 August 2016 at 16:11, Maxime Coquelin <maxime.coquelin@redhat.com> wrote:
> Hi Zhihong,
>
> On 08/19/2016 07:43 AM, Zhihong Wang wrote:
>>
>> This patch set optimizes the vhost enqueue function.
>>
...

>
> My setup consists of one host running a guest.
> The guest generates as much 64bytes packets as possible using

Have you tested with other packet sizes?
My testing shows that performance drops when the packet size is more than 256.

> pktgen-dpdk. The hosts forwards received packets back to the guest
> using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
> physical CPUs.
>
> I tested it with and without your v1 patch, with and without
> rx-mergeable feature turned ON.
> Results are the average of 8 runs of 60 seconds:
>
> Rx-Mergeable ON : 7.72Mpps
> Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
> Rx-Mergeable OFF: 10.52Mpps
> Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
>
> Regards,
> Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-21  8:50     ` Jianbo Liu
@ 2016-09-21  9:27       ` Wang, Zhihong
  2016-09-21 12:54         ` Jianbo Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-21  9:27 UTC (permalink / raw)
  To: Jianbo Liu, Maxime Coquelin; +Cc: dev, yuanhan.liu



> -----Original Message-----
> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> Sent: Wednesday, September 21, 2016 4:50 PM
> To: Maxime Coquelin <maxime.coquelin@redhat.com>
> Cc: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org;
> yuanhan.liu@linux.intel.com
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> Hi Maxime,
> 
> On 22 August 2016 at 16:11, Maxime Coquelin
> <maxime.coquelin@redhat.com> wrote:
> > Hi Zhihong,
> >
> > On 08/19/2016 07:43 AM, Zhihong Wang wrote:
> >>
> >> This patch set optimizes the vhost enqueue function.
> >>
> ...
> 
> >
> > My setup consists of one host running a guest.
> > The guest generates as much 64bytes packets as possible using
> 
> Have you tested with other different packet size?
> My testing shows that performance is dropping when packet size is more
> than 256.


Hi Jianbo,

Thanks for reporting this.

 1. Are you running the vector frontend with mrg_rxbuf=off?

 2. Could you please specify what CPU you're running? Is it Haswell
    or Ivy Bridge?

 3. What percentage of drop are you seeing?

I expected this, because I've already found the root cause and the way to
optimize it; but since it missed the v0 deadline and requires changes in
eal/memcpy, I'll postpone it to the next release.

After the upcoming optimization the performance for packets larger
than 256 will be improved, and the new code will be much faster than
the current code.


Thanks
Zhihong


> 
> > pktgen-dpdk. The hosts forwards received packets back to the guest
> > using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
> > physical CPUs.
> >
> > I tested it with and without your v1 patch, with and without
> > rx-mergeable feature turned ON.
> > Results are the average of 8 runs of 60 seconds:
> >
> > Rx-Mergeable ON : 7.72Mpps
> > Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
> > Rx-Mergeable OFF: 10.52Mpps
> > Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
> >
> > Regards,
> > Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-21  9:27       ` Wang, Zhihong
@ 2016-09-21 12:54         ` Jianbo Liu
  2016-09-22  2:11           ` Wang, Zhihong
  2016-09-22  2:29           ` Yuanhan Liu
  0 siblings, 2 replies; 141+ messages in thread
From: Jianbo Liu @ 2016-09-21 12:54 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Maxime Coquelin, dev, yuanhan.liu

On 21 September 2016 at 17:27, Wang, Zhihong <zhihong.wang@intel.com> wrote:
>
>
>> -----Original Message-----
>> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
>> Sent: Wednesday, September 21, 2016 4:50 PM
>> To: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Cc: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org;
>> yuanhan.liu@linux.intel.com
>> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
>>
>> Hi Maxime,
>>
>> On 22 August 2016 at 16:11, Maxime Coquelin
>> <maxime.coquelin@redhat.com> wrote:
>> > Hi Zhihong,
>> >
>> > On 08/19/2016 07:43 AM, Zhihong Wang wrote:
>> >>
>> >> This patch set optimizes the vhost enqueue function.
>> >>
>> ...
>>
>> >
>> > My setup consists of one host running a guest.
>> > The guest generates as much 64bytes packets as possible using
>>
>> Have you tested with other different packet size?
>> My testing shows that performance is dropping when packet size is more
>> than 256.
>
>
> Hi Jianbo,
>
> Thanks for reporting this.
>
>  1. Are you running the vector frontend with mrg_rxbuf=off?
>
>  2. Could you please specify what CPU you're running? Is it Haswell
>     or Ivy Bridge?
>
>  3. How many percentage of drop are you seeing?
>
> This is expected by me because I've already found the root cause and
> the way to optimize it, but since it missed the v0 deadline and
> requires changes in eal/memcpy, I postpone it to the next release.
>
> After the upcoming optimization the performance for packets larger
> than 256 will be improved, and the new code will be much faster than
> the current code.
>

Sorry, I tested on an ARM server, but I wonder if there is the same
issue for x86 platform.

>> > pktgen-dpdk. The hosts forwards received packets back to the guest
>> > using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
>> > physical CPUs.
>> >
>> > I tested it with and without your v1 patch, with and without
>> > rx-mergeable feature turned ON.
>> > Results are the average of 8 runs of 60 seconds:
>> >
>> > Rx-Mergeable ON : 7.72Mpps
>> > Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
>> > Rx-Mergeable OFF: 10.52Mpps
>> > Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
>> >

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-21 12:54         ` Jianbo Liu
@ 2016-09-22  2:11           ` Wang, Zhihong
  2016-09-22  2:29           ` Yuanhan Liu
  1 sibling, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-22  2:11 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: Maxime Coquelin, dev, yuanhan.liu



> -----Original Message-----
> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> Sent: Wednesday, September 21, 2016 8:54 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>; dev@dpdk.org;
> yuanhan.liu@linux.intel.com
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> On 21 September 2016 at 17:27, Wang, Zhihong <zhihong.wang@intel.com>
> wrote:
> >
> >
> >> -----Original Message-----
> >> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> >> Sent: Wednesday, September 21, 2016 4:50 PM
> >> To: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Cc: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org;
> >> yuanhan.liu@linux.intel.com
> >> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> >>
> >> Hi Maxime,
> >>
> >> On 22 August 2016 at 16:11, Maxime Coquelin
> >> <maxime.coquelin@redhat.com> wrote:
> >> > Hi Zhihong,
> >> >
> >> > On 08/19/2016 07:43 AM, Zhihong Wang wrote:
> >> >>
> >> >> This patch set optimizes the vhost enqueue function.
> >> >>
> >> ...
> >>
> >> >
> >> > My setup consists of one host running a guest.
> >> > The guest generates as much 64bytes packets as possible using
> >>
> >> Have you tested with other different packet size?
> >> My testing shows that performance is dropping when packet size is more
> >> than 256.
> >
> >
> > Hi Jianbo,
> >
> > Thanks for reporting this.
> >
> >  1. Are you running the vector frontend with mrg_rxbuf=off?
> >
> >  2. Could you please specify what CPU you're running? Is it Haswell
> >     or Ivy Bridge?
> >
> >  3. How many percentage of drop are you seeing?
> >
> > This is expected by me because I've already found the root cause and
> > the way to optimize it, but since it missed the v0 deadline and
> > requires changes in eal/memcpy, I postpone it to the next release.
> >
> > After the upcoming optimization the performance for packets larger
> > than 256 will be improved, and the new code will be much faster than
> > the current code.
> >
> 
> Sorry, I tested on an ARM server, but I wonder if there is the same
> issue for x86 platform.


For the mrg_rxbuf=off path there might be a slight drop for packets larger
than 256B (~3% for 512B and ~1% for 1024B), and no drop for other cases.

This is not a bug or issue; we only need to enhance memcpy to complete the
whole optimization, which should be done in a separate patch. Unfortunately
it misses this release window.


> 
> >> > pktgen-dpdk. The hosts forwards received packets back to the guest
> >> > using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
> >> > physical CPUs.
> >> >
> >> > I tested it with and without your v1 patch, with and without
> >> > rx-mergeable feature turned ON.
> >> > Results are the average of 8 runs of 60 seconds:
> >> >
> >> > Rx-Mergeable ON : 7.72Mpps
> >> > Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
> >> > Rx-Mergeable OFF: 10.52Mpps
> >> > Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
> >> >

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-21 12:54         ` Jianbo Liu
  2016-09-22  2:11           ` Wang, Zhihong
@ 2016-09-22  2:29           ` Yuanhan Liu
  2016-09-22  5:47             ` Jianbo Liu
  1 sibling, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-09-22  2:29 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: Wang, Zhihong, Maxime Coquelin, dev

On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
> >> > My setup consists of one host running a guest.
> >> > The guest generates as much 64bytes packets as possible using
> >>
> >> Have you tested with other different packet size?
> >> My testing shows that performance is dropping when packet size is more
> >> than 256.
> >
> >
> > Hi Jianbo,
> >
> > Thanks for reporting this.
> >
> >  1. Are you running the vector frontend with mrg_rxbuf=off?
> >
> >  2. Could you please specify what CPU you're running? Is it Haswell
> >     or Ivy Bridge?
> >
> >  3. How many percentage of drop are you seeing?
> >
> > This is expected by me because I've already found the root cause and
> > the way to optimize it, but since it missed the v0 deadline and
> > requires changes in eal/memcpy, I postpone it to the next release.
> >
> > After the upcoming optimization the performance for packets larger
> > than 256 will be improved, and the new code will be much faster than
> > the current code.
> >
> 
> Sorry, I tested on an ARM server, but I wonder if there is the same
> issue for x86 platform.

Would you please provide more details? Say, answer the two remaining
questions from Zhihong?

Thanks.

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-22  2:29           ` Yuanhan Liu
@ 2016-09-22  5:47             ` Jianbo Liu
  2016-09-22  6:58               ` Wang, Zhihong
  2016-10-12  2:53               ` Yuanhan Liu
  0 siblings, 2 replies; 141+ messages in thread
From: Jianbo Liu @ 2016-09-22  5:47 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: Wang, Zhihong, Maxime Coquelin, dev

On 22 September 2016 at 10:29, Yuanhan Liu <yuanhan.liu@linux.intel.com> wrote:
> On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
>> >> > My setup consists of one host running a guest.
>> >> > The guest generates as much 64bytes packets as possible using
>> >>
>> >> Have you tested with other different packet size?
>> >> My testing shows that performance is dropping when packet size is more
>> >> than 256.
>> >
>> >
>> > Hi Jianbo,
>> >
>> > Thanks for reporting this.
>> >
>> >  1. Are you running the vector frontend with mrg_rxbuf=off?
>> >
Yes, my testing is with mrg_rxbuf=off, but not with the vector frontend PMD.

>> >  2. Could you please specify what CPU you're running? Is it Haswell
>> >     or Ivy Bridge?
>> >
It's an ARM server.

>> >  3. How many percentage of drop are you seeing?
The testing result:
size (bytes)     improvement (%)
64                   3.92
128                 11.51
256                  24.16
512                  -13.79
1024                -22.51
1500                -12.22
A correction is that performance is dropping if byte size is larger than 512.

>> >
>> > This is expected by me because I've already found the root cause and
>> > the way to optimize it, but since it missed the v0 deadline and
>> > requires changes in eal/memcpy, I postpone it to the next release.
>> >
>> > After the upcoming optimization the performance for packets larger
>> > than 256 will be improved, and the new code will be much faster than
>> > the current code.
>> >
>>
>> Sorry, I tested on an ARM server, but I wonder if there is the same
>> issue for x86 platform.
>
> Would you please provide more details? Say, answer the two left
> questions from Zhihong?
>
> Thanks.
>
>         --yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-22  5:47             ` Jianbo Liu
@ 2016-09-22  6:58               ` Wang, Zhihong
  2016-09-22  9:01                 ` Jianbo Liu
  2016-10-12  2:53               ` Yuanhan Liu
  1 sibling, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-22  6:58 UTC (permalink / raw)
  To: Jianbo Liu, Yuanhan Liu; +Cc: Maxime Coquelin, dev



> -----Original Message-----
> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> Sent: Thursday, September 22, 2016 1:48 PM
> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> On 22 September 2016 at 10:29, Yuanhan Liu <yuanhan.liu@linux.intel.com>
> wrote:
> > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
> >> >> > My setup consists of one host running a guest.
> >> >> > The guest generates as much 64bytes packets as possible using
> >> >>
> >> >> Have you tested with other different packet size?
> >> >> My testing shows that performance is dropping when packet size is
> more
> >> >> than 256.
> >> >
> >> >
> >> > Hi Jianbo,
> >> >
> >> > Thanks for reporting this.
> >> >
> >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
> >> >
> Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
> 
> >> >  2. Could you please specify what CPU you're running? Is it Haswell
> >> >     or Ivy Bridge?
> >> >
> It's an ARM server.
> 
> >> >  3. How many percentage of drop are you seeing?
> The testing result:
> size (bytes)     improvement (%)
> 64                   3.92
> 128                 11.51
> 256                  24.16
> 512                  -13.79
> 1024                -22.51
> 1500                -12.22
> A correction is that performance is dropping if byte size is larger than 512.


Jianbo,

Could you please verify whether this patch really causes the enqueue perf to drop?

You can test only the enqueue path by setting the guest to do rxonly, and
compare the Mpps shown by "show port stats all" in the guest.
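
For example, with interactive testpmd on both sides (the EAL and port options
are omitted here since they depend on the setup):

  host  testpmd> set fwd txonly
  host  testpmd> start
  guest testpmd> set fwd rxonly
  guest testpmd> start
  guest testpmd> show port stats all    <- compare the Rx rates here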


Thanks
Zhihong

> 
> >> >
> >> > This is expected by me because I've already found the root cause and
> >> > the way to optimize it, but since it missed the v0 deadline and
> >> > requires changes in eal/memcpy, I postpone it to the next release.
> >> >
> >> > After the upcoming optimization the performance for packets larger
> >> > than 256 will be improved, and the new code will be much faster than
> >> > the current code.
> >> >
> >>
> >> Sorry, I tested on an ARM server, but I wonder if there is the same
> >> issue for x86 platform.
> >
> > Would you please provide more details? Say, answer the two left
> > questions from Zhihong?
> >
> > Thanks.
> >
> >         --yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-22  6:58               ` Wang, Zhihong
@ 2016-09-22  9:01                 ` Jianbo Liu
  2016-09-22 10:04                   ` Wang, Zhihong
                                     ` (2 more replies)
  0 siblings, 3 replies; 141+ messages in thread
From: Jianbo Liu @ 2016-09-22  9:01 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Yuanhan Liu, Maxime Coquelin, dev

On 22 September 2016 at 14:58, Wang, Zhihong <zhihong.wang@intel.com> wrote:
>
>
>> -----Original Message-----
>> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
>> Sent: Thursday, September 22, 2016 1:48 PM
>> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
>> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
>> <maxime.coquelin@redhat.com>; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
>>
>> On 22 September 2016 at 10:29, Yuanhan Liu <yuanhan.liu@linux.intel.com>
>> wrote:
>> > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
>> >> >> > My setup consists of one host running a guest.
>> >> >> > The guest generates as much 64bytes packets as possible using
>> >> >>
>> >> >> Have you tested with other different packet size?
>> >> >> My testing shows that performance is dropping when packet size is
>> more
>> >> >> than 256.
>> >> >
>> >> >
>> >> > Hi Jianbo,
>> >> >
>> >> > Thanks for reporting this.
>> >> >
>> >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
>> >> >
>> Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
>>
>> >> >  2. Could you please specify what CPU you're running? Is it Haswell
>> >> >     or Ivy Bridge?
>> >> >
>> It's an ARM server.
>>
>> >> >  3. How many percentage of drop are you seeing?
>> The testing result:
>> size (bytes)     improvement (%)
>> 64                   3.92
>> 128                 11.51
>> 256                  24.16
>> 512                  -13.79
>> 1024                -22.51
>> 1500                -12.22
>> A correction is that performance is dropping if byte size is larger than 512.
>
>
> Jianbo,
>
> Could you please verify does this patch really cause enqueue perf to drop?
>
> You can test the enqueue path only by set guest to do rxonly, and compare
> the mpps by show port stats all in the guest.
>
>
Tested with testpmd, host: txonly, guest: rxonly
size (bytes)     improvement (%)
64                    4.12
128                   6
256                   2.65
512                   -1.12
1024                 -7.02

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v6 2/6] vhost: rewrite enqueue
  2016-09-20  2:00   ` [PATCH v6 2/6] vhost: rewrite enqueue Zhihong Wang
@ 2016-09-22  9:58     ` Jianbo Liu
  2016-09-22 10:13       ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Jianbo Liu @ 2016-09-22  9:58 UTC (permalink / raw)
  To: Zhihong Wang; +Cc: dev, Maxime Coquelin, Yuanhan Liu, Thomas Monjalon

On 20 September 2016 at 10:00, Zhihong Wang <zhihong.wang@intel.com> wrote:
> This patch implements the vhost logic from scratch into a single function
> to improve maintainability. This is the baseline version of the new code,
> more optimization will be added in the following patches in this patch set.
>
> In the existing code there're 2 callbacks for vhost enqueue:
>
>  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
>
>  *  virtio_dev_rx for mrg_rxbuf turned off cases.
>
> Having 2 callback paths increases maintenance effort. Also, the performance
> of the existing code is not optimal, especially when the mrg_rxbuf feature
> turned on.
>
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---
> Changes in v6:
.....

> -/*
> - * Returns -1 on fail, 0 on success
> - */
> -static inline int
> -reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
> -                           uint16_t *end, struct buf_vector *buf_vec)
> +uint16_t
> +rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> +       struct rte_mbuf **pkts, uint16_t count)
>  {
> -       uint16_t cur_idx;
> +       struct vhost_virtqueue *vq;
> +       struct virtio_net *dev;
> +       uint32_t is_mrg_rxbuf = 0;
> +       uint32_t pkt_idx      = 0;
> +       uint32_t pkt_left     = count;

Is pkt_left really needed?

>         uint16_t avail_idx;
> -       uint32_t allocated = 0;
> -       uint32_t vec_idx = 0;
> -       uint16_t tries = 0;
....

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-22  9:01                 ` Jianbo Liu
@ 2016-09-22 10:04                   ` Wang, Zhihong
  2016-09-22 14:41                     ` Jianbo Liu
  2016-09-26  5:37                   ` Luke Gorrie
  2016-09-27 10:21                   ` Yuanhan Liu
  2 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-22 10:04 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: Yuanhan Liu, Maxime Coquelin, dev



> -----Original Message-----
> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> Sent: Thursday, September 22, 2016 5:02 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: Yuanhan Liu <yuanhan.liu@linux.intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> On 22 September 2016 at 14:58, Wang, Zhihong <zhihong.wang@intel.com>
> wrote:
> >
> >
> >> -----Original Message-----
> >> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> >> Sent: Thursday, September 22, 2016 1:48 PM
> >> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> >> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
> >> <maxime.coquelin@redhat.com>; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> >>
> >> On 22 September 2016 at 10:29, Yuanhan Liu
> <yuanhan.liu@linux.intel.com>
> >> wrote:
> >> > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
> >> >> >> > My setup consists of one host running a guest.
> >> >> >> > The guest generates as much 64bytes packets as possible using
> >> >> >>
> >> >> >> Have you tested with other different packet size?
> >> >> >> My testing shows that performance is dropping when packet size is
> >> more
> >> >> >> than 256.
> >> >> >
> >> >> >
> >> >> > Hi Jianbo,
> >> >> >
> >> >> > Thanks for reporting this.
> >> >> >
> >> >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
> >> >> >
> >> Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
> >>
> >> >> >  2. Could you please specify what CPU you're running? Is it Haswell
> >> >> >     or Ivy Bridge?
> >> >> >
> >> It's an ARM server.
> >>
> >> >> >  3. How many percentage of drop are you seeing?
> >> The testing result:
> >> size (bytes)     improvement (%)
> >> 64                   3.92
> >> 128                 11.51
> >> 256                  24.16
> >> 512                  -13.79
> >> 1024                -22.51
> >> 1500                -12.22
> >> A correction is that performance is dropping if byte size is larger than 512.
> >
> >
> > Jianbo,
> >
> > Could you please verify does this patch really cause enqueue perf to drop?
> >
> > You can test the enqueue path only by set guest to do rxonly, and compare
> > the mpps by show port stats all in the guest.
> >
> >
> Tested with testpmd, host: txonly, guest: rxonly
> size (bytes)     improvement (%)
> 64                    4.12
> 128                   6
> 256                   2.65
> 512                   -1.12
> 1024                 -7.02



Your numbers are a little bit hard for me to understand, because this patch's
optimization contains 2 parts:

 1. ring operation: works for both mrg_rxbuf on and off

 2. remote write ordering: works for mrg_rxbuf=on only

So, for mrg_rxbuf=off, if this patch is good for 64B packets, then it
shouldn't do anything bad for larger packets.
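
As a minimal sketch of the ring-side idea (simplified for illustration --
the names and the fixed buffer size below are made up, this is not the
actual patch code): used-ring entries are buffered locally and flushed to
guest memory in one batch, so the guest-visible writes stay grouped
instead of being interleaved with the per-packet work.

#include <stdint.h>

/* Hypothetical, simplified types -- not the DPDK structures. */
struct used_elem {
	uint32_t id;
	uint32_t len;
};

struct shadow_used {
	struct used_elem buf[256];	/* pending used entries, cache-hot */
	uint32_t	 n;		/* number of buffered entries      */
};

/* Record one completed descriptor chain locally (cheap local write).
 * The caller guarantees a burst never exceeds the buffer size. */
static void
shadow_used_add(struct shadow_used *s, uint32_t desc_id, uint32_t len)
{
	s->buf[s->n].id  = desc_id;
	s->buf[s->n].len = len;
	s->n++;
}

/* Flush all buffered entries to the guest-visible used ring in one pass,
 * then bump the used index once for the whole burst. */
static void
shadow_used_flush(struct shadow_used *s, struct used_elem *used_ring,
		  uint16_t *used_idx, uint32_t ring_size)
{
	uint32_t i;

	for (i = 0; i < s->n; i++)
		used_ring[(*used_idx + i) & (ring_size - 1)] = s->buf[i];
	*used_idx = (uint16_t)(*used_idx + s->n);
	s->n = 0;
}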

This is the gain on x86 platform: host iofwd between nic and vhost,
guest rxonly.

nic2vm	enhancement
64	21.83%
128	16.97%
256	6.34%
512	0.01%
1024	0.00%

I suspect there's some complication in ARM's micro-arch.

Could you try v6 and apply all patches except the last one:
[PATCH v6 6/6] vhost: optimize cache access

And see if there's still perf drop?
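
For example, if you have the six v6 patches saved as mbox files (the
filenames below are hypothetical), apply everything up to and including
5/6 and leave 6/6 out:

    git am v6-0001-*.patch v6-0002-*.patch v6-0003-*.patch \
           v6-0004-*.patch v6-0005-*.patch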


Thanks
Zhihong


^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v6 2/6] vhost: rewrite enqueue
  2016-09-22  9:58     ` Jianbo Liu
@ 2016-09-22 10:13       ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-22 10:13 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: dev, Maxime Coquelin, Yuanhan Liu, Thomas Monjalon



> -----Original Message-----
> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> Sent: Thursday, September 22, 2016 5:58 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org; Maxime Coquelin <maxime.coquelin@redhat.com>;
> Yuanhan Liu <yuanhan.liu@linux.intel.com>; Thomas Monjalon
> <thomas.monjalon@6wind.com>
> Subject: Re: [dpdk-dev] [PATCH v6 2/6] vhost: rewrite enqueue
> 
> On 20 September 2016 at 10:00, Zhihong Wang <zhihong.wang@intel.com>
> wrote:
> > This patch implements the vhost logic from scratch into a single function
> > to improve maintainability. This is the baseline version of the new code,
> > more optimization will be added in the following patches in this patch set.
> >
> > In the existing code there're 2 callbacks for vhost enqueue:
> >
> >  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
> >
> >  *  virtio_dev_rx for mrg_rxbuf turned off cases.
> >
> > Having 2 callback paths increases maintenance effort. Also, the
> performance
> > of the existing code is not optimal, especially when the mrg_rxbuf feature
> > turned on.
> >
> > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > ---
> > Changes in v6:
> .....
> 
> > -/*
> > - * Returns -1 on fail, 0 on success
> > - */
> > -static inline int
> > -reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
> > -                           uint16_t *end, struct buf_vector *buf_vec)
> > +uint16_t
> > +rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> > +       struct rte_mbuf **pkts, uint16_t count)
> >  {
> > -       uint16_t cur_idx;
> > +       struct vhost_virtqueue *vq;
> > +       struct virtio_net *dev;
> > +       uint32_t is_mrg_rxbuf = 0;
> > +       uint32_t pkt_idx      = 0;
> > +       uint32_t pkt_left     = count;
> 
> Is pkt_left really needed?

It's a matter of coding style since there's no underlying difference.
I prefer this way personally.

> 
> >         uint16_t avail_idx;
> > -       uint32_t allocated = 0;
> > -       uint32_t vec_idx = 0;
> > -       uint16_t tries = 0;
> ....

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-22 10:04                   ` Wang, Zhihong
@ 2016-09-22 14:41                     ` Jianbo Liu
  2016-09-23  2:56                       ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Jianbo Liu @ 2016-09-22 14:41 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Yuanhan Liu, Maxime Coquelin, dev

On 22 September 2016 at 18:04, Wang, Zhihong <zhihong.wang@intel.com> wrote:
>
>
>> -----Original Message-----
>> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
>> Sent: Thursday, September 22, 2016 5:02 PM
>> To: Wang, Zhihong <zhihong.wang@intel.com>
>> Cc: Yuanhan Liu <yuanhan.liu@linux.intel.com>; Maxime Coquelin
>> <maxime.coquelin@redhat.com>; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
>>
>> On 22 September 2016 at 14:58, Wang, Zhihong <zhihong.wang@intel.com>
>> wrote:
>> >
>> >
>> >> -----Original Message-----
>> >> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
>> >> Sent: Thursday, September 22, 2016 1:48 PM
>> >> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
>> >> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
>> >> <maxime.coquelin@redhat.com>; dev@dpdk.org
>> >> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
>> >>
>> >> On 22 September 2016 at 10:29, Yuanhan Liu
>> <yuanhan.liu@linux.intel.com>
>> >> wrote:
>> >> > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
>> >> >> >> > My setup consists of one host running a guest.
>> >> >> >> > The guest generates as much 64bytes packets as possible using
>> >> >> >>
>> >> >> >> Have you tested with other different packet size?
>> >> >> >> My testing shows that performance is dropping when packet size is
>> >> more
>> >> >> >> than 256.
>> >> >> >
>> >> >> >
>> >> >> > Hi Jianbo,
>> >> >> >
>> >> >> > Thanks for reporting this.
>> >> >> >
>> >> >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
>> >> >> >
>> >> Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
>> >>
>> >> >> >  2. Could you please specify what CPU you're running? Is it Haswell
>> >> >> >     or Ivy Bridge?
>> >> >> >
>> >> It's an ARM server.
>> >>
>> >> >> >  3. How many percentage of drop are you seeing?
>> >> The testing result:
>> >> size (bytes)     improvement (%)
>> >> 64                   3.92
>> >> 128                 11.51
>> >> 256                  24.16
>> >> 512                  -13.79
>> >> 1024                -22.51
>> >> 1500                -12.22
>> >> A correction is that performance is dropping if byte size is larger than 512.
>> >
>> >
>> > Jianbo,
>> >
>> > Could you please verify does this patch really cause enqueue perf to drop?
>> >
>> > You can test the enqueue path only by set guest to do rxonly, and compare
>> > the mpps by show port stats all in the guest.
>> >
>> >
>> Tested with testpmd, host: txonly, guest: rxonly
>> size (bytes)     improvement (%)
>> 64                    4.12
>> 128                   6
>> 256                   2.65
>> 512                   -1.12
>> 1024                 -7.02
>
>
>
> I think your number is little bit hard to understand for me, this patch's
> optimization contains 2 parts:
>
>  1. ring operation: works for both mrg_rxbuf on and off
>
>  2. remote write ordering: works for mrg_rxbuf=on only
>
> So, for mrg_rxbuf=off, if this patch is good for 64B packets, then it
> shouldn't do anything bad for larger packets.
>
> This is the gain on x86 platform: host iofwd between nic and vhost,
> guest rxonly.
>
> nic2vm  enhancement
> 64      21.83%
> 128     16.97%
> 256     6.34%
> 512     0.01%
> 1024    0.00%
>
I boot up a VM with 2 virtual ports and stress the traffic between them.
First, I stressed it with pktgen-dpdk in the VM, and did iofwd in the host.
Then, as you suggested, I did rxonly in the VM and txonly in the host.

> I suspect there's some complication in ARM's micro-arch.
>
> Could you try v6 and apply all patches except the the last one:
> [PATCH v6 6/6] vhost: optimize cache access
>
> And see if there's still perf drop?
>
The last patch can improve the performance. The drop is actually
caused by the second patch.

Jianbo

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-22 14:41                     ` Jianbo Liu
@ 2016-09-23  2:56                       ` Wang, Zhihong
  2016-09-23 10:41                         ` Jianbo Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-23  2:56 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: Yuanhan Liu, Maxime Coquelin, dev



> -----Original Message-----
> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> Sent: Thursday, September 22, 2016 10:42 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: Yuanhan Liu <yuanhan.liu@linux.intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> On 22 September 2016 at 18:04, Wang, Zhihong <zhihong.wang@intel.com>
> wrote:
> >
> >
> >> -----Original Message-----
> >> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> >> Sent: Thursday, September 22, 2016 5:02 PM
> >> To: Wang, Zhihong <zhihong.wang@intel.com>
> >> Cc: Yuanhan Liu <yuanhan.liu@linux.intel.com>; Maxime Coquelin
> >> <maxime.coquelin@redhat.com>; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> >>
> >> On 22 September 2016 at 14:58, Wang, Zhihong <zhihong.wang@intel.com>
> >> wrote:
> >> >
> >> >
> >> >> -----Original Message-----
> >> >> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> >> >> Sent: Thursday, September 22, 2016 1:48 PM
> >> >> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> >> >> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
> >> >> <maxime.coquelin@redhat.com>; dev@dpdk.org
> >> >> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> >> >>
> >> >> On 22 September 2016 at 10:29, Yuanhan Liu
> >> <yuanhan.liu@linux.intel.com>
> >> >> wrote:
> >> >> > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
> >> >> >> >> > My setup consists of one host running a guest.
> >> >> >> >> > The guest generates as much 64bytes packets as possible using
> >> >> >> >>
> >> >> >> >> Have you tested with other different packet size?
> >> >> >> >> My testing shows that performance is dropping when packet size is
> >> >> more
> >> >> >> >> than 256.
> >> >> >> >
> >> >> >> >
> >> >> >> > Hi Jianbo,
> >> >> >> >
> >> >> >> > Thanks for reporting this.
> >> >> >> >
> >> >> >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
> >> >> >> >
> >> >> Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
> >> >>
> >> >> >> >  2. Could you please specify what CPU you're running? Is it Haswell
> >> >> >> >     or Ivy Bridge?
> >> >> >> >
> >> >> It's an ARM server.
> >> >>
> >> >> >> >  3. How many percentage of drop are you seeing?
> >> >> The testing result:
> >> >> size (bytes)     improvement (%)
> >> >> 64                   3.92
> >> >> 128                 11.51
> >> >> 256                  24.16
> >> >> 512                  -13.79
> >> >> 1024                -22.51
> >> >> 1500                -12.22
> >> >> A correction is that performance is dropping if byte size is larger than 512.
> >> >
> >> >
> >> > Jianbo,
> >> >
> >> > Could you please verify does this patch really cause enqueue perf to drop?
> >> >
> >> > You can test the enqueue path only by set guest to do rxonly, and compare
> >> > the mpps by show port stats all in the guest.
> >> >
> >> >
> >> Tested with testpmd, host: txonly, guest: rxonly
> >> size (bytes)     improvement (%)
> >> 64                    4.12
> >> 128                   6
> >> 256                   2.65
> >> 512                   -1.12
> >> 1024                 -7.02
> >
> >
> >
> > I think your number is little bit hard to understand for me, this patch's
> > optimization contains 2 parts:
> >
> >  1. ring operation: works for both mrg_rxbuf on and off
> >
> >  2. remote write ordering: works for mrg_rxbuf=on only
> >
> > So, for mrg_rxbuf=off, if this patch is good for 64B packets, then it
> > shouldn't do anything bad for larger packets.
> >
> > This is the gain on x86 platform: host iofwd between nic and vhost,
> > guest rxonly.
> >
> > nic2vm  enhancement
> > 64      21.83%
> > 128     16.97%
> > 256     6.34%
> > 512     0.01%
> > 1024    0.00%
> >
> I bootup a VM with 2 virtual port, and stress the traffic between them.
> First, I stressed with pktgen-dpdk in VM, and did iofwd in host.
> Then, as you told, I did rxonly in VM, and txonly in host.
> 
> > I suspect there's some complication in ARM's micro-arch.
> >
> > Could you try v6 and apply all patches except the the last one:
> > [PATCH v6 6/6] vhost: optimize cache access
> >
> > And see if there's still perf drop?
> >
> The last patch can improve the performance. The drop is actually
> caused by the second patch.


This is expected because the 2nd patch is just a baseline and all optimization
patches are organized in the rest of this patch set.

I think you can do bottleneck analysis on ARM to see what's slowing down the
perf; there might be some micro-arch complications there, most likely in
memcpy.

Do you use glibc's memcpy? I suggest hand-crafting one of your own.
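
Purely as an illustration of what I mean by hand-crafting (this is not
rte_memcpy and not the patch code, just a sketch): copy in fixed 64-byte
chunks so the compiler can inline each chunk into a few wide load/store
instructions, and only fall back to a generic copy for the tail.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of a specialized copy for the vhost enqueue hot path. */
static inline void
vhost_copy(void *dst, const void *src, size_t len)
{
	uint8_t *d = (uint8_t *)dst;
	const uint8_t *s = (const uint8_t *)src;

	while (len >= 64) {
		memcpy(d, s, 64);	/* fixed size, usually inlined */
		d += 64;
		s += 64;
		len -= 64;
	}
	if (len)
		memcpy(d, s, len);	/* short tail */
}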

Could you also publish the mrg_rxbuf=on data? It's the more widely used
configuration in terms of spec integrity.


Thanks
Zhihong


> 
> Jianbo

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-23  2:56                       ` Wang, Zhihong
@ 2016-09-23 10:41                         ` Jianbo Liu
  2016-09-23 13:41                           ` Thomas Monjalon
  0 siblings, 1 reply; 141+ messages in thread
From: Jianbo Liu @ 2016-09-23 10:41 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Yuanhan Liu, Maxime Coquelin, dev

On 23 September 2016 at 10:56, Wang, Zhihong <zhihong.wang@intel.com> wrote:
.....
> This is expected because the 2nd patch is just a baseline and all optimization
> patches are organized in the rest of this patch set.
>
> I think you can do bottleneck analysis on ARM to see what's slowing down the
> perf, there might be some micro-arch complications there, mostly likely in
> memcpy.
>
> Do you use glibc's memcpy? I suggest to hand-crafted it on your own.
>
> Could you publish the mrg_rxbuf=on data also? Since it's more widely used
> in terms of spec integrity.
>
I don't think it will be helpful for you, considering the differences
between x86 and arm.
So please move on with this patchset...

Thanks!
Jianbo

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-23 10:41                         ` Jianbo Liu
@ 2016-09-23 13:41                           ` Thomas Monjalon
  2016-09-25  5:41                             ` Wang, Zhihong
  2016-09-26  4:24                             ` Jianbo Liu
  0 siblings, 2 replies; 141+ messages in thread
From: Thomas Monjalon @ 2016-09-23 13:41 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: dev, Wang, Zhihong, Yuanhan Liu, Maxime Coquelin

2016-09-23 18:41, Jianbo Liu:
> On 23 September 2016 at 10:56, Wang, Zhihong <zhihong.wang@intel.com> wrote:
> .....
> > This is expected because the 2nd patch is just a baseline and all optimization
> > patches are organized in the rest of this patch set.
> >
> > I think you can do bottleneck analysis on ARM to see what's slowing down the
> > perf, there might be some micro-arch complications there, mostly likely in
> > memcpy.
> >
> > Do you use glibc's memcpy? I suggest to hand-crafted it on your own.
> >
> > Could you publish the mrg_rxbuf=on data also? Since it's more widely used
> > in terms of spec integrity.
> >
> I don't think it will be helpful for you, considering the differences
> between x86 and arm.
> So please move on with this patchset...

Jianbo,
I don't understand.
You said that the 2nd patch is a regression:
-       volatile uint16_t       last_used_idx;
+       uint16_t                last_used_idx;

And the overall series leads to a performance regression
for packets > 512 B, right?
But we don't know whether you have tested the v6 or not.

Zhihong talked about some improvements possible in rte_memcpy.
ARM64 is using libc memcpy in rte_memcpy.

Now you seem to give up.
Does it mean you accept having a regression in 16.11 release?
Are you working on rte_memcpy?

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-23 13:41                           ` Thomas Monjalon
@ 2016-09-25  5:41                             ` Wang, Zhihong
  2016-09-26  5:12                               ` Jianbo Liu
  2016-09-26  4:24                             ` Jianbo Liu
  1 sibling, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-25  5:41 UTC (permalink / raw)
  To: Thomas Monjalon, Jianbo Liu; +Cc: dev, Yuanhan Liu, Maxime Coquelin



> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Friday, September 23, 2016 9:41 PM
> To: Jianbo Liu <jianbo.liu@linaro.org>
> Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> <yuanhan.liu@linux.intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> 2016-09-23 18:41, Jianbo Liu:
> > On 23 September 2016 at 10:56, Wang, Zhihong <zhihong.wang@intel.com>
> wrote:
> > .....
> > > This is expected because the 2nd patch is just a baseline and all optimization
> > > patches are organized in the rest of this patch set.
> > >
> > > I think you can do bottleneck analysis on ARM to see what's slowing down the
> > > perf, there might be some micro-arch complications there, mostly likely in
> > > memcpy.
> > >
> > > Do you use glibc's memcpy? I suggest to hand-crafted it on your own.
> > >
> > > Could you publish the mrg_rxbuf=on data also? Since it's more widely used
> > > in terms of spec integrity.
> > >
> > I don't think it will be helpful for you, considering the differences
> > between x86 and arm.


Hi Jianbo,

This patch does help on ARM for small packets like 64B sized ones;
this actually proves the similarity between x86 and ARM in terms
of the caching optimization in this patch.

My estimation is based on:

 1. The last patch is for mrg_rxbuf=on, and since you said it helps
    perf, we can ignore it for now when we discuss mrg_rxbuf=off

 2. Vhost enqueue perf =
    Ring overhead + Virtio header overhead + Data memcpy overhead
    (a rough worked example follows this list)

 3. This patch helps small-packet traffic, which means it helps
    ring + virtio header operations

 4. So, when you say the perf drops when packet size is larger than 512B,
    this is most likely caused by memcpy on ARM not working well
    with this patch
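
To put rough numbers on point 2 (the numbers are purely assumed for
illustration, not measured): if ring + header handling costs ~100 cycles
per packet and the data copy costs ~0.5 cycles per byte, a 64B packet
takes ~100 + 32 = 132 cycles, so the ring/header part is ~75% of the
total and a ring-side saving is clearly visible; a 1024B packet takes
~100 + 512 = 612 cycles, so the same saving is largely diluted, and any
regression at that size points at the copy itself.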

I'm not saying glibc's memcpy is not good enough; it's just that
this is a rather special use case. And since we see that a specialized
memcpy + this patch gives significantly better performance than other
combinations on x86, we suggest hand-crafting a specialized memcpy
for it.

Of course on ARM this is still just my speculation, and we need to
either prove it or find the actual root cause.

It would be **REALLY HELPFUL** if you could help to test this patch on
ARM for the mrg_rxbuf=on cases, to see if this patch is in fact helpful
to ARM at all, since mrg_rxbuf=on is the more widely used case.


Thanks
Zhihong


> > So please move on with this patchset...
> 
> Jianbo,
> I don't understand.
> You said that the 2nd patch is a regression:
> -       volatile uint16_t       last_used_idx;
> +       uint16_t                last_used_idx;
> 
> And the overrall series lead to performance regression
> for packets > 512 B, right?
> But we don't know wether you have tested the v6 or not.
> 
> Zhihong talked about some improvements possible in rte_memcpy.
> ARM64 is using libc memcpy in rte_memcpy.
> 
> Now you seem to give up.
> Does it mean you accept having a regression in 16.11 release?
> Are you working on rte_memcpy?

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-23 13:41                           ` Thomas Monjalon
  2016-09-25  5:41                             ` Wang, Zhihong
@ 2016-09-26  4:24                             ` Jianbo Liu
  1 sibling, 0 replies; 141+ messages in thread
From: Jianbo Liu @ 2016-09-26  4:24 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, Wang, Zhihong, Yuanhan Liu, Maxime Coquelin

Hi Thomas,

On 23 September 2016 at 21:41, Thomas Monjalon
<thomas.monjalon@6wind.com> wrote:
> 2016-09-23 18:41, Jianbo Liu:
>> On 23 September 2016 at 10:56, Wang, Zhihong <zhihong.wang@intel.com> wrote:
>> .....
>> > This is expected because the 2nd patch is just a baseline and all optimization
>> > patches are organized in the rest of this patch set.
>> >
>> > I think you can do bottleneck analysis on ARM to see what's slowing down the
>> > perf, there might be some micro-arch complications there, mostly likely in
>> > memcpy.
>> >
>> > Do you use glibc's memcpy? I suggest to hand-crafted it on your own.
>> >
>> > Could you publish the mrg_rxbuf=on data also? Since it's more widely used
>> > in terms of spec integrity.
>> >
>> I don't think it will be helpful for you, considering the differences
>> between x86 and arm.
>> So please move on with this patchset...
>
> Jianbo,
> I don't understand.
> You said that the 2nd patch is a regression:
> -       volatile uint16_t       last_used_idx;
> +       uint16_t                last_used_idx;
>
No, I meant "vhost: rewrite enqueue".

> And the overrall series lead to performance regression
> for packets > 512 B, right?
> But we don't know wether you have tested the v6 or not.
Yes, I tested v6, and found a performance regression for sizes >= 512B.

>
> Zhihong talked about some improvements possible in rte_memcpy.
> ARM64 is using libc memcpy in rte_memcpy.
>
> Now you seem to give up.
> Does it mean you accept having a regression in 16.11 release?
> Are you working on rte_memcpy?
This patchset actually improves performance according to Zhihong's
results on the x86 platform. And I also get an improvement, at least with
small-size packets, on ARM.
I don't want to give up, but I need more time to find out the reason
for the regression. I think rte_memcpy is definitely one of the ways
to improve performance, but could it really be the reason for the regression?

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-25  5:41                             ` Wang, Zhihong
@ 2016-09-26  5:12                               ` Jianbo Liu
  2016-09-26  5:25                                 ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Jianbo Liu @ 2016-09-26  5:12 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Thomas Monjalon, dev, Yuanhan Liu, Maxime Coquelin

On 25 September 2016 at 13:41, Wang, Zhihong <zhihong.wang@intel.com> wrote:
>
>
>> -----Original Message-----
>> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
>> Sent: Friday, September 23, 2016 9:41 PM
>> To: Jianbo Liu <jianbo.liu@linaro.org>
>> Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
>> <yuanhan.liu@linux.intel.com>; Maxime Coquelin
>> <maxime.coquelin@redhat.com>
....
> This patch does help in ARM for small packets like 64B sized ones,
> this actually proves the similarity between x86 and ARM in terms
> of caching optimization in this patch.
>
> My estimation is based on:
>
>  1. The last patch are for mrg_rxbuf=on, and since you said it helps
>     perf, we can ignore it for now when we discuss mrg_rxbuf=off
>
>  2. Vhost enqueue perf =
>     Ring overhead + Virtio header overhead + Data memcpy overhead
>
>  3. This patch helps small packets traffic, which means it helps
>     ring + virtio header operations
>
>  4. So, when you say perf drop when packet size larger than 512B,
>     this is most likely caused by memcpy in ARM not working well
>     with this patch
>
> I'm not saying glibc's memcpy is not good enough, it's just that
> this is a rather special use case. And since we see specialized
> memcpy + this patch give better performance than other combinations
> significantly on x86, we suggest to hand-craft a specialized memcpy
> for it.
>
> Of course on ARM this is still just my speculation, and we need to
> either prove it or find the actual root cause.
>
> It can be **REALLY HELPFUL** if you could help to test this patch on
> ARM for mrg_rxbuf=on cases to see if this patch is in fact helpful
> to ARM at all, since mrg_rxbuf=on the more widely used cases.
>
Actually it's worse than mrg_rxbuf=off.

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-26  5:12                               ` Jianbo Liu
@ 2016-09-26  5:25                                 ` Wang, Zhihong
  2016-09-26  5:38                                   ` Jianbo Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-26  5:25 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: Thomas Monjalon, dev, Yuanhan Liu, Maxime Coquelin



> -----Original Message-----
> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> Sent: Monday, September 26, 2016 1:13 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: Thomas Monjalon <thomas.monjalon@6wind.com>; dev@dpdk.org; Yuanhan
> Liu <yuanhan.liu@linux.intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> On 25 September 2016 at 13:41, Wang, Zhihong <zhihong.wang@intel.com>
> wrote:
> >
> >
> >> -----Original Message-----
> >> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> >> Sent: Friday, September 23, 2016 9:41 PM
> >> To: Jianbo Liu <jianbo.liu@linaro.org>
> >> Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> >> <yuanhan.liu@linux.intel.com>; Maxime Coquelin
> >> <maxime.coquelin@redhat.com>
> ....
> > This patch does help in ARM for small packets like 64B sized ones,
> > this actually proves the similarity between x86 and ARM in terms
> > of caching optimization in this patch.
> >
> > My estimation is based on:
> >
> >  1. The last patch are for mrg_rxbuf=on, and since you said it helps
> >     perf, we can ignore it for now when we discuss mrg_rxbuf=off
> >
> >  2. Vhost enqueue perf =
> >     Ring overhead + Virtio header overhead + Data memcpy overhead
> >
> >  3. This patch helps small packets traffic, which means it helps
> >     ring + virtio header operations
> >
> >  4. So, when you say perf drop when packet size larger than 512B,
> >     this is most likely caused by memcpy in ARM not working well
> >     with this patch
> >
> > I'm not saying glibc's memcpy is not good enough, it's just that
> > this is a rather special use case. And since we see specialized
> > memcpy + this patch give better performance than other combinations
> > significantly on x86, we suggest to hand-craft a specialized memcpy
> > for it.
> >
> > Of course on ARM this is still just my speculation, and we need to
> > either prove it or find the actual root cause.
> >
> > It can be **REALLY HELPFUL** if you could help to test this patch on
> > ARM for mrg_rxbuf=on cases to see if this patch is in fact helpful
> > to ARM at all, since mrg_rxbuf=on the more widely used cases.
> >
> Actually it's worse than mrg_rxbuf=off.

I mean compare the perf of original vs. original + patch with
mrg_rxbuf turned on. Is there any perf improvement?


^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-22  9:01                 ` Jianbo Liu
  2016-09-22 10:04                   ` Wang, Zhihong
@ 2016-09-26  5:37                   ` Luke Gorrie
  2016-09-26  5:40                     ` Jianbo Liu
  2016-09-27 10:21                   ` Yuanhan Liu
  2 siblings, 1 reply; 141+ messages in thread
From: Luke Gorrie @ 2016-09-26  5:37 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: Wang, Zhihong, Yuanhan Liu, Maxime Coquelin, dev

On 22 September 2016 at 11:01, Jianbo Liu <jianbo.liu@linaro.org> wrote:

> Tested with testpmd, host: txonly, guest: rxonly
> size (bytes)     improvement (%)
> 64                    4.12
> 128                   6
> 256                   2.65
> 512                   -1.12
> 1024                 -7.02
>

Have you considered testing with more diverse workloads e.g. mixed packet
sizes that are not always multiples of the cache line & register sizes?

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-26  5:25                                 ` Wang, Zhihong
@ 2016-09-26  5:38                                   ` Jianbo Liu
  2016-09-26  6:00                                     ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Jianbo Liu @ 2016-09-26  5:38 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Thomas Monjalon, dev, Yuanhan Liu, Maxime Coquelin

On 26 September 2016 at 13:25, Wang, Zhihong <zhihong.wang@intel.com> wrote:
>
>
>> -----Original Message-----
>> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
>> Sent: Monday, September 26, 2016 1:13 PM
>> To: Wang, Zhihong <zhihong.wang@intel.com>
>> Cc: Thomas Monjalon <thomas.monjalon@6wind.com>; dev@dpdk.org; Yuanhan
>> Liu <yuanhan.liu@linux.intel.com>; Maxime Coquelin
>> <maxime.coquelin@redhat.com>
>> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
>>
>> On 25 September 2016 at 13:41, Wang, Zhihong <zhihong.wang@intel.com>
>> wrote:
>> >
>> >
>> >> -----Original Message-----
>> >> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
>> >> Sent: Friday, September 23, 2016 9:41 PM
>> >> To: Jianbo Liu <jianbo.liu@linaro.org>
>> >> Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
>> >> <yuanhan.liu@linux.intel.com>; Maxime Coquelin
>> >> <maxime.coquelin@redhat.com>
>> ....
>> > This patch does help in ARM for small packets like 64B sized ones,
>> > this actually proves the similarity between x86 and ARM in terms
>> > of caching optimization in this patch.
>> >
>> > My estimation is based on:
>> >
>> >  1. The last patch are for mrg_rxbuf=on, and since you said it helps
>> >     perf, we can ignore it for now when we discuss mrg_rxbuf=off
>> >
>> >  2. Vhost enqueue perf =
>> >     Ring overhead + Virtio header overhead + Data memcpy overhead
>> >
>> >  3. This patch helps small packets traffic, which means it helps
>> >     ring + virtio header operations
>> >
>> >  4. So, when you say perf drop when packet size larger than 512B,
>> >     this is most likely caused by memcpy in ARM not working well
>> >     with this patch
>> >
>> > I'm not saying glibc's memcpy is not good enough, it's just that
>> > this is a rather special use case. And since we see specialized
>> > memcpy + this patch give better performance than other combinations
>> > significantly on x86, we suggest to hand-craft a specialized memcpy
>> > for it.
>> >
>> > Of course on ARM this is still just my speculation, and we need to
>> > either prove it or find the actual root cause.
>> >
>> > It can be **REALLY HELPFUL** if you could help to test this patch on
>> > ARM for mrg_rxbuf=on cases to see if this patch is in fact helpful
>> > to ARM at all, since mrg_rxbuf=on the more widely used cases.
>> >
>> Actually it's worse than mrg_rxbuf=off.
>
> I mean compare the perf of original vs. original + patch with
> mrg_rxbuf turned on. Is there any perf improvement?
>
Yes, orig + patch + on is better than orig + on, but orig + patch + on
is worse than orig + patch + off.

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-26  5:37                   ` Luke Gorrie
@ 2016-09-26  5:40                     ` Jianbo Liu
  0 siblings, 0 replies; 141+ messages in thread
From: Jianbo Liu @ 2016-09-26  5:40 UTC (permalink / raw)
  To: Luke Gorrie; +Cc: Wang, Zhihong, Yuanhan Liu, Maxime Coquelin, dev

On 26 September 2016 at 13:37, Luke Gorrie <luke@snabb.co> wrote:
> On 22 September 2016 at 11:01, Jianbo Liu <jianbo.liu@linaro.org> wrote:
>>
>> Tested with testpmd, host: txonly, guest: rxonly
>> size (bytes)     improvement (%)
>> 64                    4.12
>> 128                   6
>> 256                   2.65
>> 512                   -1.12
>> 1024                 -7.02
>
>
> Have you considered testing with more diverse workloads e.g. mixed packet
> sizes that are not always multiples of the cache line & register sizes?
>
No. Can testpmd stress performance with mixed packet sizes?

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-26  5:38                                   ` Jianbo Liu
@ 2016-09-26  6:00                                     ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-26  6:00 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: Thomas Monjalon, dev, Yuanhan Liu, Maxime Coquelin



> -----Original Message-----
> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> Sent: Monday, September 26, 2016 1:39 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: Thomas Monjalon <thomas.monjalon@6wind.com>; dev@dpdk.org; Yuanhan
> Liu <yuanhan.liu@linux.intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> On 26 September 2016 at 13:25, Wang, Zhihong <zhihong.wang@intel.com>
> wrote:
> >
> >
> >> -----Original Message-----
> >> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> >> Sent: Monday, September 26, 2016 1:13 PM
> >> To: Wang, Zhihong <zhihong.wang@intel.com>
> >> Cc: Thomas Monjalon <thomas.monjalon@6wind.com>; dev@dpdk.org;
> Yuanhan
> >> Liu <yuanhan.liu@linux.intel.com>; Maxime Coquelin
> >> <maxime.coquelin@redhat.com>
> >> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> >>
> >> On 25 September 2016 at 13:41, Wang, Zhihong <zhihong.wang@intel.com>
> >> wrote:
> >> >
> >> >
> >> >> -----Original Message-----
> >> >> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> >> >> Sent: Friday, September 23, 2016 9:41 PM
> >> >> To: Jianbo Liu <jianbo.liu@linaro.org>
> >> >> Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan
> Liu
> >> >> <yuanhan.liu@linux.intel.com>; Maxime Coquelin
> >> >> <maxime.coquelin@redhat.com>
> >> ....
> >> > This patch does help in ARM for small packets like 64B sized ones,
> >> > this actually proves the similarity between x86 and ARM in terms
> >> > of caching optimization in this patch.
> >> >
> >> > My estimation is based on:
> >> >
> >> >  1. The last patch are for mrg_rxbuf=on, and since you said it helps
> >> >     perf, we can ignore it for now when we discuss mrg_rxbuf=off
> >> >
> >> >  2. Vhost enqueue perf =
> >> >     Ring overhead + Virtio header overhead + Data memcpy overhead
> >> >
> >> >  3. This patch helps small packets traffic, which means it helps
> >> >     ring + virtio header operations
> >> >
> >> >  4. So, when you say perf drop when packet size larger than 512B,
> >> >     this is most likely caused by memcpy in ARM not working well
> >> >     with this patch
> >> >
> >> > I'm not saying glibc's memcpy is not good enough, it's just that
> >> > this is a rather special use case. And since we see specialized
> >> > memcpy + this patch give better performance than other combinations
> >> > significantly on x86, we suggest to hand-craft a specialized memcpy
> >> > for it.
> >> >
> >> > Of course on ARM this is still just my speculation, and we need to
> >> > either prove it or find the actual root cause.
> >> >
> >> > It can be **REALLY HELPFUL** if you could help to test this patch on
> >> > ARM for mrg_rxbuf=on cases to see if this patch is in fact helpful
> >> > to ARM at all, since mrg_rxbuf=on the more widely used cases.
> >> >
> >> Actually it's worse than mrg_rxbuf=off.
> >
> > I mean compare the perf of original vs. original + patch with
> > mrg_rxbuf turned on. Is there any perf improvement?
> >
> Yes, orig + patch + on is better than orig + on, but orig + patch + on
> is worse than orig + patch + off.


Hi Jianbo,

That's the way it is for virtio: if you compare against the current enqueue
code, the mrg=on perf is even slower.

We should compare:

 1. mrg on: orig vs. orig + patch

 2. mrg off: orig vs. orig + patch

There's more memory being touched, especially in the frontend, which brings
down the performance when mrg is on.

Finally, even though mrg on is slower, it's still the mainstream use case
as far as I know.


Thanks
Zhihong


^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-22  9:01                 ` Jianbo Liu
  2016-09-22 10:04                   ` Wang, Zhihong
  2016-09-26  5:37                   ` Luke Gorrie
@ 2016-09-27 10:21                   ` Yuanhan Liu
  2016-09-27 16:45                     ` Wang, Zhihong
  2 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-09-27 10:21 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: Wang, Zhihong, Maxime Coquelin, dev

[-- Attachment #1: Type: text/plain, Size: 2973 bytes --]

On Thu, Sep 22, 2016 at 05:01:41PM +0800, Jianbo Liu wrote:
> On 22 September 2016 at 14:58, Wang, Zhihong <zhihong.wang@intel.com> wrote:
> >
> >
> >> -----Original Message-----
> >> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> >> Sent: Thursday, September 22, 2016 1:48 PM
> >> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> >> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
> >> <maxime.coquelin@redhat.com>; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> >>
> >> On 22 September 2016 at 10:29, Yuanhan Liu <yuanhan.liu@linux.intel.com>
> >> wrote:
> >> > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
> >> >> >> > My setup consists of one host running a guest.
> >> >> >> > The guest generates as much 64bytes packets as possible using
> >> >> >>
> >> >> >> Have you tested with other different packet size?
> >> >> >> My testing shows that performance is dropping when packet size is
> >> more
> >> >> >> than 256.
> >> >> >
> >> >> >
> >> >> > Hi Jianbo,
> >> >> >
> >> >> > Thanks for reporting this.
> >> >> >
> >> >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
> >> >> >
> >> Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
> >>
> >> >> >  2. Could you please specify what CPU you're running? Is it Haswell
> >> >> >     or Ivy Bridge?
> >> >> >
> >> It's an ARM server.
> >>
> >> >> >  3. How many percentage of drop are you seeing?
> >> The testing result:
> >> size (bytes)     improvement (%)
> >> 64                   3.92
> >> 128                 11.51
> >> 256                  24.16
> >> 512                  -13.79
> >> 1024                -22.51
> >> 1500                -12.22
> >> A correction is that performance is dropping if byte size is larger than 512.
> >
> >
> > Jianbo,
> >
> > Could you please verify does this patch really cause enqueue perf to drop?
> >
> > You can test the enqueue path only by set guest to do rxonly, and compare
> > the mpps by show port stats all in the guest.
> >
> >
> Tested with testpmd, host: txonly, guest: rxonly
> size (bytes)     improvement (%)
> 64                    4.12
> 128                   6
> 256                   2.65
> 512                   -1.12
> 1024                 -7.02

There is a difference between Zhihong's code and the old code that I spotted
the first time: Zhihong removed the avail_idx prefetch. I understand
the prefetch becomes a bit tricky when the mrg-rx code path is considered;
thus, I didn't comment on that.

That's one of the differences that, IMO, could cause a regression. I then
finally got a chance to add it back.

A rough test shows it improves the performance of 1400B packet size greatly
in the "txonly in host and rxonly in guest" case: +33% is the number I get
with my test server (Ivybridge).

I guess this might/would help your case as well. Mind having a test
and telling me the results?

BTW, I made it in a rush; I haven't tested the mrg-rx code path yet.

Thanks.

	--yliu

[-- Attachment #2: diff --]
[-- Type: text/plain, Size: 4341 bytes --]

commit e5852d04bf87c02d6d0d8e6d8ded4c33030b9c9e
Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Date:   Tue Sep 27 17:51:15 2016 +0800

    xxxx
    
    Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 381dc27..41bfeba 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -61,6 +61,8 @@ struct buf_vector {
 	uint32_t desc_idx;
 };
 
+#define NR_AVAIL_IDX_PREFETCH	32
+
 /**
  * Structure contains variables relevant to RX/TX virtqueues.
  */
@@ -70,7 +72,7 @@ struct vhost_virtqueue {
 	struct vring_used	*used;
 	uint32_t		size;
 
-	/* Last index used on the available ring */
+	uint16_t		last_avail_idx;
 	uint16_t		last_used_idx;
 #define VIRTIO_INVALID_EVENTFD		(-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD	(-2)
@@ -89,6 +91,9 @@ struct vhost_virtqueue {
 	/* Shadow used ring for performance */
 	struct vring_used_elem	*shadow_used_ring;
 	uint32_t		shadow_used_idx;
+
+	uint16_t		next_avail_idx;
+	uint16_t		avail_idx_buf[NR_AVAIL_IDX_PREFETCH];
 } __rte_cache_aligned;
 
 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 11a2c1a..1cc22fc 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -170,6 +170,41 @@ flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	}
 }
 
+/* Fetch NR_AVAIL_IDX_PREFETCH avail entries at once */
+static void
+prefetch_avail_idx(struct vhost_virtqueue *vq)
+{
+	int i;
+
+	for (i = 0; i < NR_AVAIL_IDX_PREFETCH; i++) {
+		vq->avail_idx_buf[i] = vq->avail->ring[(vq->last_avail_idx + i) &
+					(vq->size - 1)];
+	}
+}
+
+static uint16_t
+next_avail_idx(struct vhost_virtqueue *vq)
+{
+	if (vq->next_avail_idx >= NR_AVAIL_IDX_PREFETCH) {
+		prefetch_avail_idx(vq);
+		vq->next_avail_idx = 0;
+		vq->last_avail_idx += NR_AVAIL_IDX_PREFETCH;
+	}
+
+	return vq->avail_idx_buf[vq->next_avail_idx++];
+}
+
+/*
+ * Just peek, but don't move forward the "next_avail_idx" pointer
+ * The caller also has to make sure the point doesn't go beyond
+ * the array.
+ */
+static uint16_t
+peek_next_avail_idx(struct vhost_virtqueue *vq)
+{
+	return vq->avail_idx_buf[vq->next_avail_idx];
+}
+
 static inline int __attribute__((always_inline))
 enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		uint16_t avail_idx, struct rte_mbuf *mbuf,
@@ -193,7 +228,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	mbuf_avail = mbuf_len;
 
 	/* get the current desc */
-	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+	desc_current = next_avail_idx(vq);
 	desc_chain_head = desc_current;
 	desc = &vq->desc[desc_current];
 	desc_addr = gpa_to_vva(dev, desc->addr);
@@ -235,9 +270,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				if (avail_idx == vq->last_used_idx)
 					goto error;
 
-				desc_current =
-					vq->avail->ring[(vq->last_used_idx) &
-					(vq->size - 1)];
+				desc_current = next_avail_idx(vq);
 				desc_chain_head = desc_current;
 				desc_chain_len = 0;
 			} else
@@ -298,6 +331,7 @@ notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
 		eventfd_write(vq->callfd, (eventfd_t)1);
 }
 
+
 uint16_t
 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 	struct rte_mbuf **pkts, uint16_t count)
@@ -331,14 +365,15 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 
 	/* start enqueuing packets 1 by 1 */
 	vq->shadow_used_idx = 0;
+	vq->next_avail_idx  = 0;
 	used_idx = vq->last_used_idx & (vq->size - 1);
 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
 	while (pkt_left && avail_idx != vq->last_used_idx) {
 		/* prefetch the next desc */
-		if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
-			rte_prefetch0(&vq->desc[vq->avail->ring[
-					(vq->last_used_idx + 1) &
-					(vq->size - 1)]]);
+		if (pkt_left > 1 &&
+		    vq->next_avail_idx + 1 < NR_AVAIL_IDX_PREFETCH) {
+			rte_prefetch0(&vq->desc[peek_next_avail_idx(vq)]);
+		}
 
 		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
 					is_mrg_rxbuf))
@@ -347,6 +382,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 		pkt_idx++;
 		pkt_left--;
 	}
+	vq->last_avail_idx += vq->next_avail_idx;
 
 	/* batch update used ring for better performance */
 	if (likely(vq->shadow_used_idx > 0))

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-27 10:21                   ` Yuanhan Liu
@ 2016-09-27 16:45                     ` Wang, Zhihong
  2016-10-09 12:09                       ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-09-27 16:45 UTC (permalink / raw)
  To: Yuanhan Liu, Jianbo Liu; +Cc: Maxime Coquelin, dev



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Tuesday, September 27, 2016 6:21 PM
> To: Jianbo Liu <jianbo.liu@linaro.org>
> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> On Thu, Sep 22, 2016 at 05:01:41PM +0800, Jianbo Liu wrote:
> > On 22 September 2016 at 14:58, Wang, Zhihong <zhihong.wang@intel.com>
> wrote:
> > >
> > >
> > >> -----Original Message-----
> > >> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> > >> Sent: Thursday, September 22, 2016 1:48 PM
> > >> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> > >> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
> > >> <maxime.coquelin@redhat.com>; dev@dpdk.org
> > >> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> > >>
> > >> On 22 September 2016 at 10:29, Yuanhan Liu <yuanhan.liu@linux.intel.com>
> > >> wrote:
> > >> > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
> > >> >> >> > My setup consists of one host running a guest.
> > >> >> >> > The guest generates as much 64bytes packets as possible using
> > >> >> >>
> > >> >> >> Have you tested with other different packet size?
> > >> >> >> My testing shows that performance is dropping when packet size is
> > >> more
> > >> >> >> than 256.
> > >> >> >
> > >> >> >
> > >> >> > Hi Jianbo,
> > >> >> >
> > >> >> > Thanks for reporting this.
> > >> >> >
> > >> >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
> > >> >> >
> > >> Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
> > >>
> > >> >> >  2. Could you please specify what CPU you're running? Is it Haswell
> > >> >> >     or Ivy Bridge?
> > >> >> >
> > >> It's an ARM server.
> > >>
> > >> >> >  3. How many percentage of drop are you seeing?
> > >> The testing result:
> > >> size (bytes)     improvement (%)
> > >> 64                   3.92
> > >> 128                 11.51
> > >> 256                  24.16
> > >> 512                  -13.79
> > >> 1024                -22.51
> > >> 1500                -12.22
> > >> A correction is that performance is dropping if byte size is larger than 512.
> > >
> > >
> > > Jianbo,
> > >
> > > Could you please verify does this patch really cause enqueue perf to drop?
> > >
> > > You can test the enqueue path only by set guest to do rxonly, and compare
> > > the mpps by show port stats all in the guest.
> > >
> > >
> > Tested with testpmd, host: txonly, guest: rxonly
> > size (bytes)     improvement (%)
> > 64                    4.12
> > 128                   6
> > 256                   2.65
> > 512                   -1.12
> > 1024                 -7.02
> 
> There is a difference between Zhihong's code and the old I spotted in
> the first time: Zhihong removed the avail_idx prefetch. I understand
> the prefetch becomes a bit tricky when mrg-rx code path is considered;
> thus, I didn't comment on that.
> 
> That's one of the difference that, IMO, could drop a regression. I then
> finally got a chance to add it back.
> 
> A rough test shows it improves the performance of 1400B packet size greatly
> in the "txonly in host and rxonly in guest" case: +33% is the number I get
> with my test server (Ivybridge).

Thanks Yuanhan! I'll validate this on x86.

> 
> I guess this might/would help your case as well. Mind to have a test
> and tell me the results?
> 
> BTW, I made it in rush; I haven't tested the mrg-rx code path yet.
> 
> Thanks.
> 
> 	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-27 16:45                     ` Wang, Zhihong
@ 2016-10-09 12:09                       ` Wang, Zhihong
  2016-10-10  2:44                         ` Yuanhan Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-10-09 12:09 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu, Jianbo Liu; +Cc: Maxime Coquelin, dev



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Wang, Zhihong
> Sent: Wednesday, September 28, 2016 12:45 AM
> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>; Jianbo Liu
> <jianbo.liu@linaro.org>
> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> 
> 
> > -----Original Message-----
> > From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> > Sent: Tuesday, September 27, 2016 6:21 PM
> > To: Jianbo Liu <jianbo.liu@linaro.org>
> > Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
> > <maxime.coquelin@redhat.com>; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> >
> > On Thu, Sep 22, 2016 at 05:01:41PM +0800, Jianbo Liu wrote:
> > > On 22 September 2016 at 14:58, Wang, Zhihong
> <zhihong.wang@intel.com>
> > wrote:
> > > >
> > > >
> > > >> -----Original Message-----
> > > >> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> > > >> Sent: Thursday, September 22, 2016 1:48 PM
> > > >> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> > > >> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
> > > >> <maxime.coquelin@redhat.com>; dev@dpdk.org
> > > >> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> > > >>
> > > >> On 22 September 2016 at 10:29, Yuanhan Liu
> <yuanhan.liu@linux.intel.com>
> > > >> wrote:
> > > >> > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
> > > >> >> >> > My setup consists of one host running a guest.
> > > >> >> >> > The guest generates as much 64bytes packets as possible
> using
> > > >> >> >>
> > > >> >> >> Have you tested with other different packet size?
> > > >> >> >> My testing shows that performance is dropping when packet
> size is
> > > >> more
> > > >> >> >> than 256.
> > > >> >> >
> > > >> >> >
> > > >> >> > Hi Jianbo,
> > > >> >> >
> > > >> >> > Thanks for reporting this.
> > > >> >> >
> > > >> >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
> > > >> >> >
> > > >> Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
> > > >>
> > > >> >> >  2. Could you please specify what CPU you're running? Is it
> Haswell
> > > >> >> >     or Ivy Bridge?
> > > >> >> >
> > > >> It's an ARM server.
> > > >>
> > > >> >> >  3. How many percentage of drop are you seeing?
> > > >> The testing result:
> > > >> size (bytes)     improvement (%)
> > > >> 64                   3.92
> > > >> 128                 11.51
> > > >> 256                  24.16
> > > >> 512                  -13.79
> > > >> 1024                -22.51
> > > >> 1500                -12.22
> > > >> A correction is that performance is dropping if byte size is larger than
> 512.
> > > >
> > > >
> > > > Jianbo,
> > > >
> > > > Could you please verify does this patch really cause enqueue perf to
> drop?
> > > >
> > > > You can test the enqueue path only by set guest to do rxonly, and
> compare
> > > > the mpps by show port stats all in the guest.
> > > >
> > > >
> > > Tested with testpmd, host: txonly, guest: rxonly
> > > size (bytes)     improvement (%)
> > > 64                    4.12
> > > 128                   6
> > > 256                   2.65
> > > 512                   -1.12
> > > 1024                 -7.02
> >
> > There is a difference between Zhihong's code and the old I spotted in
> > the first time: Zhihong removed the avail_idx prefetch. I understand
> > the prefetch becomes a bit tricky when mrg-rx code path is considered;
> > thus, I didn't comment on that.
> >
> > That's one of the difference that, IMO, could drop a regression. I then
> > finally got a chance to add it back.
> >
> > A rough test shows it improves the performance of 1400B packet size
> greatly
> > in the "txonly in host and rxonly in guest" case: +33% is the number I get
> > with my test server (Ivybridge).
> 
> Thanks Yuanhan! I'll validate this on x86.

Hi Yuanhan,

Seems your code doesn't perform correctly. I wrote a new version
of the avail idx prefetch but didn't see any perf benefit.

To be honest I doubt the benefit of this idea. The previous mrg_off
code used this method but didn't give any benefit.

Even if this is useful, the benefits should be more significant for
small packets; it's unlikely this simple idx prefetch could bring
over 30% perf gain for large packets like 1400B ones.

But if you really do work it out like that, I'll be very glad to see it.

Thanks
Zhihong

> 
> >
> > I guess this might/would help your case as well. Mind to have a test
> > and tell me the results?
> >
> > BTW, I made it in rush; I haven't tested the mrg-rx code path yet.
> >
> > Thanks.
> >
> > 	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-09 12:09                       ` Wang, Zhihong
@ 2016-10-10  2:44                         ` Yuanhan Liu
  2016-10-10  5:31                           ` Jianbo Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-10  2:44 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Jianbo Liu, Maxime Coquelin, dev

On Sun, Oct 09, 2016 at 12:09:07PM +0000, Wang, Zhihong wrote:
> > > > Tested with testpmd, host: txonly, guest: rxonly
> > > > size (bytes)     improvement (%)
> > > > 64                    4.12
> > > > 128                   6
> > > > 256                   2.65
> > > > 512                   -1.12
> > > > 1024                 -7.02
> > >
> > > There is a difference between Zhihong's code and the old I spotted in
> > > the first time: Zhihong removed the avail_idx prefetch. I understand
> > > the prefetch becomes a bit tricky when mrg-rx code path is considered;
> > > thus, I didn't comment on that.
> > >
> > > That's one of the difference that, IMO, could drop a regression. I then
> > > finally got a chance to add it back.
> > >
> > > A rough test shows it improves the performance of 1400B packet size
> > greatly
> > > in the "txonly in host and rxonly in guest" case: +33% is the number I get
> > > with my test server (Ivybridge).
> > 
> > Thanks Yuanhan! I'll validate this on x86.
> 
> Hi Yuanhan,
> 
> Seems your code doesn't perform correctly. I write a new version
> of avail idx prefetch but didn't see any perf benefit.
> 
> To be honest I doubt the benefit of this idea. The previous mrg_off
> code has this method but doesn't give any benefits.

Good point. I thought of that before, too. But as you know, I made it
in a rush and didn't think further or test more.

I looked at the code a bit closer this time and spotted a bug: the prefetch
actually didn't happen, due to the following code piece:

	if (vq->next_avail_idx >= NR_AVAIL_IDX_PREFETCH) {
		prefetch_avail_idx(vq);
		...
	}

Since vq->next_avail_idx is set to 0 at the entrance of the enqueue path,
prefetch_avail_idx() will not be called. The fix is easy though: just put
prefetch_avail_idx before invoking enqueue_packet.
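
For clarity, a minimal sketch of that fix (the loop shape and the arguments
of enqueue_packet are guessed here; prefetch_avail_idx, enqueue_packet and
NR_AVAIL_IDX_PREFETCH are names from that experimental patch, not upstream):

	/* prefetch unconditionally before handling each packet, instead of
	 * gating it on vq->next_avail_idx >= NR_AVAIL_IDX_PREFETCH, which
	 * is false while next_avail_idx is still counting up from 0
	 */
	for (i = 0; i < count; i++) {
		prefetch_avail_idx(vq);
		if (enqueue_packet(dev, vq, pkts[i]) < 0)
			break;
	}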

In summary, Zhihong is right; I see no more gains with that fix :(

However, as stated, that's kind of the only difference I found between
yours and the old code, so maybe it's still worthwhile to have a
test on ARM, Jianbo?

	--yliu

> Even if this is useful, the benefits should be more significant for
> small packets, it's unlikely this simple idx prefetch could bring
> over 30% perf gain for large packets like 1400B ones.
> 
> But if you really do work it out like that I'll be very glad to see.
> 
> Thanks
> Zhihong
> 
> > 
> > >
> > > I guess this might/would help your case as well. Mind to have a test
> > > and tell me the results?
> > >
> > > BTW, I made it in rush; I haven't tested the mrg-rx code path yet.
> > >
> > > Thanks.
> > >
> > > 	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-10  2:44                         ` Yuanhan Liu
@ 2016-10-10  5:31                           ` Jianbo Liu
  2016-10-10  6:22                             ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Jianbo Liu @ 2016-10-10  5:31 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: Wang, Zhihong, Maxime Coquelin, dev

On 10 October 2016 at 10:44, Yuanhan Liu <yuanhan.liu@linux.intel.com> wrote:
> On Sun, Oct 09, 2016 at 12:09:07PM +0000, Wang, Zhihong wrote:
>> > > > Tested with testpmd, host: txonly, guest: rxonly
>> > > > size (bytes)     improvement (%)
>> > > > 64                    4.12
>> > > > 128                   6
>> > > > 256                   2.65
>> > > > 512                   -1.12
>> > > > 1024                 -7.02
>> > >
>> > > There is a difference between Zhihong's code and the old I spotted in
>> > > the first time: Zhihong removed the avail_idx prefetch. I understand
>> > > the prefetch becomes a bit tricky when mrg-rx code path is considered;
>> > > thus, I didn't comment on that.
>> > >
>> > > That's one of the difference that, IMO, could drop a regression. I then
>> > > finally got a chance to add it back.
>> > >
>> > > A rough test shows it improves the performance of 1400B packet size
>> > greatly
>> > > in the "txonly in host and rxonly in guest" case: +33% is the number I get
>> > > with my test server (Ivybridge).
>> >
>> > Thanks Yuanhan! I'll validate this on x86.
>>
>> Hi Yuanhan,
>>
>> Seems your code doesn't perform correctly. I write a new version
>> of avail idx prefetch but didn't see any perf benefit.
>>
>> To be honest I doubt the benefit of this idea. The previous mrg_off
>> code has this method but doesn't give any benefits.
>
> Good point. I thought of that before, too. But you know that I made it
> in rush, that I didn't think further and test more.
>
> I looked the code a bit closer this time, and spotted a bug: the prefetch
> actually didn't happen, due to following code piece:
>
>         if (vq->next_avail_idx >= NR_AVAIL_IDX_PREFETCH) {
>                 prefetch_avail_idx(vq);
>                 ...
>         }
>
> Since vq->next_avail_idx is set to 0 at the entrance of enqueue path,
> prefetch_avail_idx() will be called. The fix is easy though: just put
> prefetch_avail_idx before invoking enqueue_packet.
>
> In summary, Zhihong is right, I see no more gains with that fix :(
>
> However, as stated, that's kind of the only difference I found between
> yours and the old code, that maybe it's still worthwhile to have a
> test on ARM, Jianbo?
>
I haven't tested it, but I think there would be no improvement for ARM either.

A small suggestion for enqueue_packet:

.....
+       /* start copy from mbuf to desc */
+       while (mbuf_avail || mbuf->next) {
.....

Considering that pkt_len is in the first cache line (same as data_len),
while the next pointer is in the second cache line,
is it better to check the total packet length, instead of the last mbuf's
next pointer, to jump out of the while loop and avoid a possible cache miss?
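
A rough sketch of that idea, assuming the copy loop tracks how many bytes of
the packet have been copied (copied_len is an illustrative name; descriptor
handling is elided):

	uint32_t pkt_len = rte_pktmbuf_pkt_len(m);	/* 1st cache line */
	uint32_t copied_len = 0;
	uint32_t mbuf_avail = rte_pktmbuf_data_len(m);
	uint32_t mbuf_offset = 0;

	/* stop on the total packet length instead of mbuf->next, so the
	 * 2nd cache line is only touched when the segment is exhausted
	 */
	while (copied_len < pkt_len) {
		if (mbuf_avail == 0) {
			m = m->next;
			mbuf_avail  = rte_pktmbuf_data_len(m);
			mbuf_offset = 0;
		}

		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
		/* ... copy cpy_len bytes from mbuf to desc, advance desc ... */
		copied_len  += cpy_len;
		mbuf_avail  -= cpy_len;
		mbuf_offset += cpy_len;
	}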

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-10  5:31                           ` Jianbo Liu
@ 2016-10-10  6:22                             ` Wang, Zhihong
  2016-10-10  6:57                               ` Jianbo Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-10-10  6:22 UTC (permalink / raw)
  To: Jianbo Liu, Yuanhan Liu; +Cc: Maxime Coquelin, dev



> -----Original Message-----
> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> Sent: Monday, October 10, 2016 1:32 PM
> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> On 10 October 2016 at 10:44, Yuanhan Liu <yuanhan.liu@linux.intel.com>
> wrote:
> > On Sun, Oct 09, 2016 at 12:09:07PM +0000, Wang, Zhihong wrote:
> >> > > > Tested with testpmd, host: txonly, guest: rxonly
> >> > > > size (bytes)     improvement (%)
> >> > > > 64                    4.12
> >> > > > 128                   6
> >> > > > 256                   2.65
> >> > > > 512                   -1.12
> >> > > > 1024                 -7.02
> >> > >
> >> > > There is a difference between Zhihong's code and the old I spotted in
> >> > > the first time: Zhihong removed the avail_idx prefetch. I understand
> >> > > the prefetch becomes a bit tricky when mrg-rx code path is
> considered;
> >> > > thus, I didn't comment on that.
> >> > >
> >> > > That's one of the difference that, IMO, could drop a regression. I then
> >> > > finally got a chance to add it back.
> >> > >
> >> > > A rough test shows it improves the performance of 1400B packet size
> >> > greatly
> >> > > in the "txonly in host and rxonly in guest" case: +33% is the number I
> get
> >> > > with my test server (Ivybridge).
> >> >
> >> > Thanks Yuanhan! I'll validate this on x86.
> >>
> >> Hi Yuanhan,
> >>
> >> Seems your code doesn't perform correctly. I write a new version
> >> of avail idx prefetch but didn't see any perf benefit.
> >>
> >> To be honest I doubt the benefit of this idea. The previous mrg_off
> >> code has this method but doesn't give any benefits.
> >
> > Good point. I thought of that before, too. But you know that I made it
> > in rush, that I didn't think further and test more.
> >
> > I looked the code a bit closer this time, and spotted a bug: the prefetch
> > actually didn't happen, due to following code piece:
> >
> >         if (vq->next_avail_idx >= NR_AVAIL_IDX_PREFETCH) {
> >                 prefetch_avail_idx(vq);
> >                 ...
> >         }
> >
> > Since vq->next_avail_idx is set to 0 at the entrance of enqueue path,
> > prefetch_avail_idx() will be called. The fix is easy though: just put
> > prefetch_avail_idx before invoking enqueue_packet.
> >
> > In summary, Zhihong is right, I see no more gains with that fix :(
> >
> > However, as stated, that's kind of the only difference I found between
> > yours and the old code, that maybe it's still worthwhile to have a
> > test on ARM, Jianbo?
> >
> I haven't tested it, but I think it could be no improvement for ARM either.
> 
> A smalll suggestion for enqueue_packet:
> 
> .....
> +       /* start copy from mbuf to desc */
> +       while (mbuf_avail || mbuf->next) {
> .....
> 
> Considering pkt_len is in the first cache line (same as data_len),
> while next pointer is in the second cache line,
> is it better to check the total packet len, instead of the last mbuf's
> next pointer to jump out of while loop and avoid possible cache miss?

Jianbo,

Thanks for the reply!

This idea sounds good, but it won't help the general perf in my
opinion, since the 2nd cache line is accessed earlier anyway, in
virtio_enqueue_offload.

Also, this would bring a NULL check when actually accessing mbuf->next.

BTW, could you please publish the numbers for:

 1. mrg_rxbuf=on, comparison between original and original + this patch

 2. mrg_rxbuf=off, comparison between original and original + this patch

So we can have a whole picture of how this patch impacts the ARM platform.

Thanks
Zhihong


^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-10  6:22                             ` Wang, Zhihong
@ 2016-10-10  6:57                               ` Jianbo Liu
  2016-10-10  7:25                                 ` Wang, Zhihong
  0 siblings, 1 reply; 141+ messages in thread
From: Jianbo Liu @ 2016-10-10  6:57 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Yuanhan Liu, Maxime Coquelin, dev

On 10 October 2016 at 14:22, Wang, Zhihong <zhihong.wang@intel.com> wrote:
>
>
>> -----Original Message-----
>> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
>> Sent: Monday, October 10, 2016 1:32 PM
>> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
>> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
>> <maxime.coquelin@redhat.com>; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
>>
>> On 10 October 2016 at 10:44, Yuanhan Liu <yuanhan.liu@linux.intel.com>
>> wrote:
>> > On Sun, Oct 09, 2016 at 12:09:07PM +0000, Wang, Zhihong wrote:
>> >> > > > Tested with testpmd, host: txonly, guest: rxonly
>> >> > > > size (bytes)     improvement (%)
>> >> > > > 64                    4.12
>> >> > > > 128                   6
>> >> > > > 256                   2.65
>> >> > > > 512                   -1.12
>> >> > > > 1024                 -7.02
>> >> > >
>> >> > > There is a difference between Zhihong's code and the old I spotted in
>> >> > > the first time: Zhihong removed the avail_idx prefetch. I understand
>> >> > > the prefetch becomes a bit tricky when mrg-rx code path is
>> considered;
>> >> > > thus, I didn't comment on that.
>> >> > >
>> >> > > That's one of the difference that, IMO, could drop a regression. I then
>> >> > > finally got a chance to add it back.
>> >> > >
>> >> > > A rough test shows it improves the performance of 1400B packet size
>> >> > greatly
>> >> > > in the "txonly in host and rxonly in guest" case: +33% is the number I
>> get
>> >> > > with my test server (Ivybridge).
>> >> >
>> >> > Thanks Yuanhan! I'll validate this on x86.
>> >>
>> >> Hi Yuanhan,
>> >>
>> >> Seems your code doesn't perform correctly. I write a new version
>> >> of avail idx prefetch but didn't see any perf benefit.
>> >>
>> >> To be honest I doubt the benefit of this idea. The previous mrg_off
>> >> code has this method but doesn't give any benefits.
>> >
>> > Good point. I thought of that before, too. But you know that I made it
>> > in rush, that I didn't think further and test more.
>> >
>> > I looked the code a bit closer this time, and spotted a bug: the prefetch
>> > actually didn't happen, due to following code piece:
>> >
>> >         if (vq->next_avail_idx >= NR_AVAIL_IDX_PREFETCH) {
>> >                 prefetch_avail_idx(vq);
>> >                 ...
>> >         }
>> >
>> > Since vq->next_avail_idx is set to 0 at the entrance of enqueue path,
>> > prefetch_avail_idx() will be called. The fix is easy though: just put
>> > prefetch_avail_idx before invoking enqueue_packet.
>> >
>> > In summary, Zhihong is right, I see no more gains with that fix :(
>> >
>> > However, as stated, that's kind of the only difference I found between
>> > yours and the old code, that maybe it's still worthwhile to have a
>> > test on ARM, Jianbo?
>> >
>> I haven't tested it, but I think it could be no improvement for ARM either.
>>
>> A smalll suggestion for enqueue_packet:
>>
>> .....
>> +       /* start copy from mbuf to desc */
>> +       while (mbuf_avail || mbuf->next) {
>> .....
>>
>> Considering pkt_len is in the first cache line (same as data_len),
>> while next pointer is in the second cache line,
>> is it better to check the total packet len, instead of the last mbuf's
>> next pointer to jump out of while loop and avoid possible cache miss?
>
> Jianbo,
>
> Thanks for the reply!
>
> This idea sounds good, but it won't help the general perf in my
> opinion, since the 2nd cache line is accessed anyway prior in
> virtio_enqueue_offload.
>
Yes, you are right. I'm thinking of prefetching beforehand.
And if it's a chained mbuf, virtio_enqueue_offload will not be called
in the next loop iteration.

> Also this would bring a NULL check when actually access mbuf->next.
>
> BTW, could you please publish the number of:
>
>  1. mrg_rxbuf=on, comparison between original and original + this patch
>
>  2. mrg_rxbuf=off, comparison between original and original + this patch
>
> So we can have a whole picture of how this patch impact on ARM platform.
>
I think you have already got many results in my previous emails.
Sorry, I can't test right now as I'm busy with other things.

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-10  6:57                               ` Jianbo Liu
@ 2016-10-10  7:25                                 ` Wang, Zhihong
  0 siblings, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-10-10  7:25 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: Yuanhan Liu, Maxime Coquelin, dev



> -----Original Message-----
> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> Sent: Monday, October 10, 2016 2:58 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: Yuanhan Liu <yuanhan.liu@linux.intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> On 10 October 2016 at 14:22, Wang, Zhihong <zhihong.wang@intel.com>
> wrote:
> >
> >
> >> -----Original Message-----
> >> From: Jianbo Liu [mailto:jianbo.liu@linaro.org]
> >> Sent: Monday, October 10, 2016 1:32 PM
> >> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> >> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Maxime Coquelin
> >> <maxime.coquelin@redhat.com>; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> >>
> >> On 10 October 2016 at 10:44, Yuanhan Liu <yuanhan.liu@linux.intel.com>
> >> wrote:
> >> > On Sun, Oct 09, 2016 at 12:09:07PM +0000, Wang, Zhihong wrote:
> >> >> > > > Tested with testpmd, host: txonly, guest: rxonly
> >> >> > > > size (bytes)     improvement (%)
> >> >> > > > 64                    4.12
> >> >> > > > 128                   6
> >> >> > > > 256                   2.65
> >> >> > > > 512                   -1.12
> >> >> > > > 1024                 -7.02
> >> >> > >
> >> >> > > There is a difference between Zhihong's code and the old I spotted
> in
> >> >> > > the first time: Zhihong removed the avail_idx prefetch. I
> understand
> >> >> > > the prefetch becomes a bit tricky when mrg-rx code path is
> >> considered;
> >> >> > > thus, I didn't comment on that.
> >> >> > >
> >> >> > > That's one of the difference that, IMO, could drop a regression. I
> then
> >> >> > > finally got a chance to add it back.
> >> >> > >
> >> >> > > A rough test shows it improves the performance of 1400B packet
> size
> >> >> > greatly
> >> >> > > in the "txonly in host and rxonly in guest" case: +33% is the number
> I
> >> get
> >> >> > > with my test server (Ivybridge).
> >> >> >
> >> >> > Thanks Yuanhan! I'll validate this on x86.
> >> >>
> >> >> Hi Yuanhan,
> >> >>
> >> >> Seems your code doesn't perform correctly. I write a new version
> >> >> of avail idx prefetch but didn't see any perf benefit.
> >> >>
> >> >> To be honest I doubt the benefit of this idea. The previous mrg_off
> >> >> code has this method but doesn't give any benefits.
> >> >
> >> > Good point. I thought of that before, too. But you know that I made it
> >> > in rush, that I didn't think further and test more.
> >> >
> >> > I looked the code a bit closer this time, and spotted a bug: the prefetch
> >> > actually didn't happen, due to following code piece:
> >> >
> >> >         if (vq->next_avail_idx >= NR_AVAIL_IDX_PREFETCH) {
> >> >                 prefetch_avail_idx(vq);
> >> >                 ...
> >> >         }
> >> >
> >> > Since vq->next_avail_idx is set to 0 at the entrance of enqueue path,
> >> > prefetch_avail_idx() will be called. The fix is easy though: just put
> >> > prefetch_avail_idx before invoking enqueue_packet.
> >> >
> >> > In summary, Zhihong is right, I see no more gains with that fix :(
> >> >
> >> > However, as stated, that's kind of the only difference I found between
> >> > yours and the old code, that maybe it's still worthwhile to have a
> >> > test on ARM, Jianbo?
> >> >
> >> I haven't tested it, but I think it could be no improvement for ARM either.
> >>
> >> A smalll suggestion for enqueue_packet:
> >>
> >> .....
> >> +       /* start copy from mbuf to desc */
> >> +       while (mbuf_avail || mbuf->next) {
> >> .....
> >>
> >> Considering pkt_len is in the first cache line (same as data_len),
> >> while next pointer is in the second cache line,
> >> is it better to check the total packet len, instead of the last mbuf's
> >> next pointer to jump out of while loop and avoid possible cache miss?
> >
> > Jianbo,
> >
> > Thanks for the reply!
> >
> > This idea sounds good, but it won't help the general perf in my
> > opinion, since the 2nd cache line is accessed anyway prior in
> > virtio_enqueue_offload.
> >
> Yes, you are right. I'm thinking of prefetching beforehand.
> And if it's a chained mbuf, virtio_enqueue_offload will not be called
> in next loop.
> 
> > Also this would bring a NULL check when actually access mbuf->next.
> >
> > BTW, could you please publish the number of:
> >
> >  1. mrg_rxbuf=on, comparison between original and original + this patch
> >
> >  2. mrg_rxbuf=off, comparison between original and original + this patch
> >
> > So we can have a whole picture of how this patch impact on ARM platform.
> >
> I think you already have got many results in my previous emails.
> Sorry I can't test right now and busy with other things.

We're still missing the mrg_rxbuf=on data.


^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-09-22  5:47             ` Jianbo Liu
  2016-09-22  6:58               ` Wang, Zhihong
@ 2016-10-12  2:53               ` Yuanhan Liu
  2016-10-12 12:22                 ` Wang, Zhihong
  1 sibling, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-12  2:53 UTC (permalink / raw)
  To: Wang, Zhihong, Jianbo Liu; +Cc: Maxime Coquelin, dev, Thomas Monjalon

On Thu, Sep 22, 2016 at 01:47:45PM +0800, Jianbo Liu wrote:
> On 22 September 2016 at 10:29, Yuanhan Liu <yuanhan.liu@linux.intel.com> wrote:
> > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
> >> >> > My setup consists of one host running a guest.
> >> >> > The guest generates as much 64bytes packets as possible using
> >> >>
> >> >> Have you tested with other different packet size?
> >> >> My testing shows that performance is dropping when packet size is more
> >> >> than 256.
> >> >
> >> >
> >> > Hi Jianbo,
> >> >
> >> > Thanks for reporting this.
> >> >
> >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
> >> >
> Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
> 
> >> >  2. Could you please specify what CPU you're running? Is it Haswell
> >> >     or Ivy Bridge?
> >> >
> It's an ARM server.
> 
> >> >  3. How many percentage of drop are you seeing?
> The testing result:
> size (bytes)     improvement (%)
> 64                   3.92
> 128                 11.51
> 256                  24.16
> 512                  -13.79
> 1024                -22.51
> 1500                -12.22
> A correction is that performance is dropping if byte size is larger than 512.

I have thought about this twice. Unfortunately, I think I may need to NACK
this series.

Merging two code paths into one is really good: as you stated, it improves
maintainability. But only if we see no performance regression on either
path after the refactor. Unfortunately, that's not the case here: it hurts
the performance of one code path (non-mrg Rx).

That makes me think we perhaps should not do the code path merge at all. I
think that also aligns with what you said before (internally): we could do
the merge only if it gives comparable performance before and after.

Besides that, I don't quite like the way you did it in patch 2 (rewrite enqueue):
you made a lot of changes in one patch. That means if something goes wrong,
it is hard to narrow down which change introduced the regression. Sadly,
that's exactly what we hit here. Weeks have passed, and I see no progress.

That's the reason we like the idea of "one patch only does one thing, an
atomic thing".

So I will apply the first patch (it's a bug fixing patch) and ask you to
refactor the rest, without the code path merge.

I think we could still have a maintainable code base if we introduce
more common helper functions that can be used on both Rx paths, or even on
the Tx path (such as update_used_ring, or shadow_used_ring).

It's a bit late for too many changes in v16.11. I think you could just
apply patch 6 (vhost: optimize cache access) to the old mrg-Rx code path,
if that also helps the performance. Let us handle the rest, such as the
shadow used ring, in the next release.

Thanks.

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-12  2:53               ` Yuanhan Liu
@ 2016-10-12 12:22                 ` Wang, Zhihong
  2016-10-12 15:31                   ` Thomas Monjalon
  2016-10-13  5:33                   ` Yuanhan Liu
  0 siblings, 2 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-10-12 12:22 UTC (permalink / raw)
  To: Yuanhan Liu, Jianbo Liu, Thomas Monjalon, Maxime Coquelin; +Cc: dev



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Wednesday, October 12, 2016 10:53 AM
> To: Wang, Zhihong <zhihong.wang@intel.com>; Jianbo Liu <jianbo.liu@linaro.org>
> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>; dev@dpdk.org; Thomas
> Monjalon <thomas.monjalon@6wind.com>
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> On Thu, Sep 22, 2016 at 01:47:45PM +0800, Jianbo Liu wrote:
> > On 22 September 2016 at 10:29, Yuanhan Liu <yuanhan.liu@linux.intel.com>
> wrote:
> > > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
> > >> >> > My setup consists of one host running a guest.
> > >> >> > The guest generates as much 64bytes packets as possible using
> > >> >>
> > >> >> Have you tested with other different packet size?
> > >> >> My testing shows that performance is dropping when packet size is more
> > >> >> than 256.
> > >> >
> > >> >
> > >> > Hi Jianbo,
> > >> >
> > >> > Thanks for reporting this.
> > >> >
> > >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
> > >> >
> > Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
> >
> > >> >  2. Could you please specify what CPU you're running? Is it Haswell
> > >> >     or Ivy Bridge?
> > >> >
> > It's an ARM server.
> >
> > >> >  3. How many percentage of drop are you seeing?
> > The testing result:
> > size (bytes)     improvement (%)
> > 64                   3.92
> > 128                 11.51
> > 256                  24.16
> > 512                  -13.79
> > 1024                -22.51
> > 1500                -12.22
> > A correction is that performance is dropping if byte size is larger than 512.
> 
> I have thought of this twice. Unfortunately, I think I may need NACK this
> series.
> 
> Merging two code path into one is really good: as you stated, it improves
> the maintainability. But only if we see no performance regression on both
> path after the refactor. Unfortunately, that's not the case here: it hurts
> the performance for one code path (non-mrg Rx).
> 
> That makes me think we may should not do the code path merge at all. I think
> that also aligns with what you have said before (internally): we could do the
> merge if it gives comparable performance before and after that.
> 
> Besides that, I don't quite like the way you did in patch 2 (rewrite enqueue):
> you made a lot of changes in one patch. That means if something wrong
> happened,
> it is hard to narrow down which change introduces that regression. Badly,
> that's exactly what we met here. Weeks have been passed, I see no progress.
> 
> That's the reason we like the idea of "one patch only does one thing, an
> atomic thing".


Yuanhan, folks,

Thanks for the analysis. I disagree here though.

I analyze, develop, and benchmark on x86 platforms, where this patch
works great.

I've been trying to analyze on ARM too, but it takes time and I have a
tight schedule. Also, since the ARM perf issue only came up when the series
was already at v6, I might not be able to make it in time. However, that's
what I have to do for this patch to be merged in this or the next release.

In the meantime, may I suggest we consider the possibility of
having dedicated code for **perf-critical paths** on different
kinds of architecture?

It can be hard for a person to have both the knowledge and the
development environment for multiple archs at the same time.

Moreover, different optimization techniques might be required for
different archs, so it's hard and unnecessary to make one function
work well for all archs; sometimes it's just not the right thing to do.


Thanks
Zhihong


> 
> So I will apply the first patch (it's a bug fixing patch) and ask you to
> refactor the rest, without the code path merge.
> 
> I think we could still have a good maintainability code base if we introduce
> more common helper functions that can be used on both Rx path, or even on
> Tx path (such as update_used_ring, or shadow_used_ring).
> 
> It's a bit late for too many changes for v16.11. I think you could just
> grab patch 6 (vhost: optimize cache access) to the old mrg-Rx code path,
> if that also helps the performance? Let us handle the left in next release,
> such as shadow used ring.
> 
> Thanks.
> 
> 	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-12 12:22                 ` Wang, Zhihong
@ 2016-10-12 15:31                   ` Thomas Monjalon
  2016-10-13  1:21                     ` Wang, Zhihong
  2016-10-13  3:51                     ` Jianbo Liu
  2016-10-13  5:33                   ` Yuanhan Liu
  1 sibling, 2 replies; 141+ messages in thread
From: Thomas Monjalon @ 2016-10-12 15:31 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Yuanhan Liu, Jianbo Liu, Maxime Coquelin, dev

Sorry guys, you lost me in the discussion.

Is there some regression only on ARM?
Does it need some work specifically on memcpy for ARM,
or vhost for ARM?
Who can work on ARM optimization?

More comments below.

2016-10-12 12:22, Wang, Zhihong:
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> > > It's an ARM server.
> > >
> > > >> >  3. How many percentage of drop are you seeing?
> > > The testing result:
> > > size (bytes)     improvement (%)
> > > 64                   3.92
> > > 128                 11.51
> > > 256                  24.16
> > > 512                  -13.79
> > > 1024                -22.51
> > > 1500                -12.22
> > > A correction is that performance is dropping if byte size is larger than 512.
> > 
> > I have thought of this twice. Unfortunately, I think I may need NACK this
> > series.
> > 
> > Merging two code path into one is really good: as you stated, it improves
> > the maintainability. But only if we see no performance regression on both
> > path after the refactor. Unfortunately, that's not the case here: it hurts
> > the performance for one code path (non-mrg Rx).

+1

> > That makes me think we may should not do the code path merge at all. I think
> > that also aligns with what you have said before (internally): we could do the
> > merge if it gives comparable performance before and after that.
> > 
> > Besides that, I don't quite like the way you did in patch 2 (rewrite enqueue):
> > you made a lot of changes in one patch. That means if something wrong
> > happened,
> > it is hard to narrow down which change introduces that regression. Badly,
> > that's exactly what we met here. Weeks have been passed, I see no progress.

+1, it is important to have simple patches making changes step by step.

> > That's the reason we like the idea of "one patch only does one thing, an
> > atomic thing".
> 
> 
> Yuanhan, folks,
> 
> Thanks for the analysis. I disagree here though.
> 
> I analyze, develop, benchmark on x86 platforms, where this patch
> works great.
> 
> I've been trying to analyze on ARM too but it takes time and I've
> had a schedule. Also since the ARM perf issue comes when it's
> v6 already, I might not be able to make it in time. However
> that's what I have to do for this patch to be merged in this
> or the next release.
> 
> In the meantime, may I suggest we consider the possibility to
> have dedicated codes for **perf critical paths** for different
> kinds of architecture?

Yes that's what we do in several parts of DPDK.

> It can be hard for a person to have both the knowledge and the
> development environment for multiple archs at the same time.

Yes, we do not expect you to work on ARM.
So if nobody works on the ARM issue, you could make 2 code paths
in order to allow your optimization for x86 only.
But that's not the preferred way.
And you must split your rework to better identify which part causes
the regression on ARM.
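
For reference, a minimal sketch of such an arch-dedicated split, using the
standard RTE_ARCH_* build macros (enqueue_burst_x86 and enqueue_burst_generic
are placeholder names, not existing vhost functions):

	uint16_t
	rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
		struct rte_mbuf **pkts, uint16_t count)
	{
	#ifdef RTE_ARCH_X86_64
		/* x86-tuned path, e.g. the optimized enqueue from this series */
		return enqueue_burst_x86(vid, queue_id, pkts, count);
	#else
		/* conservative path for ARM, ppc and others */
		return enqueue_burst_generic(vid, queue_id, pkts, count);
	#endif
	}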

> Moreover, different optimization techniques might be required for
> different archs, so it's hard and unnecessary to make a function
> works for all archs, sometimes it's just not the right thing to do.

Yes, sometimes. Please help us be convinced in this case.

> > So I will apply the first patch (it's a bug fixing patch) and ask you to
> > refactor the rest, without the code path merge.
> > 
> > I think we could still have a good maintainability code base if we introduce
> > more common helper functions that can be used on both Rx path, or even on
> > Tx path (such as update_used_ring, or shadow_used_ring).

Yes it is a good step.
And the code path merge could be reconsidered later.

> > It's a bit late for too many changes for v16.11. I think you could just
> > grab patch 6 (vhost: optimize cache access) to the old mrg-Rx code path,
> > if that also helps the performance? Let us handle the left in next release,
> > such as shadow used ring.

Thank you

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-12 15:31                   ` Thomas Monjalon
@ 2016-10-13  1:21                     ` Wang, Zhihong
  2016-10-13  3:51                     ` Jianbo Liu
  1 sibling, 0 replies; 141+ messages in thread
From: Wang, Zhihong @ 2016-10-13  1:21 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: Yuanhan Liu, Jianbo Liu, Maxime Coquelin, dev



> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Wednesday, October 12, 2016 11:31 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: Yuanhan Liu <yuanhan.liu@linux.intel.com>; Jianbo Liu
> <jianbo.liu@linaro.org>; Maxime Coquelin <maxime.coquelin@redhat.com>;
> dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> Sorry guys, you lost me in the discussion.
> 
> Is there some regression only on ARM?

ARM is where we see it; there's no info on ppc yet.

> Does it need some work specifically on memcpy for ARM,
> or vhost for ARM?
> Who can work on ARM optimization?

These are still open questions. Jianbo, who reported this, doesn't
have time for more testing now, according to his reply.

I'm trying to do some tests in the hope of identifying the root cause.

> 
> More comments below.
> 
> 2016-10-12 12:22, Wang, Zhihong:
> > From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> > > > It's an ARM server.
> > > >
> > > > >> >  3. How many percentage of drop are you seeing?
> > > > The testing result:
> > > > size (bytes)     improvement (%)
> > > > 64                   3.92
> > > > 128                 11.51
> > > > 256                  24.16
> > > > 512                  -13.79
> > > > 1024                -22.51
> > > > 1500                -12.22
> > > > A correction is that performance is dropping if byte size is larger than
> 512.
> > >
> > > I have thought of this twice. Unfortunately, I think I may need NACK this
> > > series.
> > >
> > > Merging two code path into one is really good: as you stated, it improves
> > > the maintainability. But only if we see no performance regression on both
> > > path after the refactor. Unfortunately, that's not the case here: it hurts
> > > the performance for one code path (non-mrg Rx).
> 
> +1
> 
> > > That makes me think we may should not do the code path merge at all. I
> think
> > > that also aligns with what you have said before (internally): we could do
> the
> > > merge if it gives comparable performance before and after that.
> > >
> > > Besides that, I don't quite like the way you did in patch 2 (rewrite
> enqueue):
> > > you made a lot of changes in one patch. That means if something wrong
> > > happened,
> > > it is hard to narrow down which change introduces that regression. Badly,
> > > that's exactly what we met here. Weeks have been passed, I see no
> progress.
> 
> +1, it is important to have simple patches making changes step by step.
> 
> > > That's the reason we like the idea of "one patch only does one thing, an
> > > atomic thing".
> >
> >
> > Yuanhan, folks,
> >
> > Thanks for the analysis. I disagree here though.
> >
> > I analyze, develop, benchmark on x86 platforms, where this patch
> > works great.
> >
> > I've been trying to analyze on ARM too but it takes time and I've
> > had a schedule. Also since the ARM perf issue comes when it's
> > v6 already, I might not be able to make it in time. However
> > that's what I have to do for this patch to be merged in this
> > or the next release.
> >
> > In the meantime, may I suggest we consider the possibility to
> > have dedicated codes for **perf critical paths** for different
> > kinds of architecture?
> 
> Yes that's what we do in several parts of DPDK.
> 
> > It can be hard for a person to have both the knowledge and the
> > development environment for multiple archs at the same time.
> 
> Yes we do not expect you work on ARM.
> So if nobody work on the ARM issue, you could make 2 code paths
> in order to allow your optimization for x86 only.
> But that's not the preferred way.
> And you must split your rework to better identify which part is
> a regression on ARM.
> 
> > Moreover, different optimization techniques might be required for
> > different archs, so it's hard and unnecessary to make a function
> > works for all archs, sometimes it's just not the right thing to do.
> 
> Yes sometimes. Please help us to be convinced for this case.
> 
> > > So I will apply the first patch (it's a bug fixing patch) and ask you to
> > > refactor the rest, without the code path merge.
> > >
> > > I think we could still have a good maintainability code base if we introduce
> > > more common helper functions that can be used on both Rx path, or
> even on
> > > Tx path (such as update_used_ring, or shadow_used_ring).
> 
> Yes it is a good step.
> And the code path merge could be reconsidered later.
> 
> > > It's a bit late for too many changes for v16.11. I think you could just
> > > grab patch 6 (vhost: optimize cache access) to the old mrg-Rx code path,
> > > if that also helps the performance? Let us handle the left in next release,
> > > such as shadow used ring.
> 
> Thank you

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-12 15:31                   ` Thomas Monjalon
  2016-10-13  1:21                     ` Wang, Zhihong
@ 2016-10-13  3:51                     ` Jianbo Liu
  1 sibling, 0 replies; 141+ messages in thread
From: Jianbo Liu @ 2016-10-13  3:51 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: Wang, Zhihong, Yuanhan Liu, Maxime Coquelin, dev

Hi Thomas,

On 12 October 2016 at 23:31, Thomas Monjalon <thomas.monjalon@6wind.com> wrote:
> Sorry guys, you lost me in the discussion.
>
> Is there some regression only on ARM?
> Does it need some work specifically on memcpy for ARM,

I don't know if there is a common way to improve memcpy on different ARM
hardware. Even if there is, it could take time.
I have tried to do that using NEON (like SSE) instructions, but without success.

> or vhost for ARM?
> Who can work on ARM optimization?
>

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-12 12:22                 ` Wang, Zhihong
  2016-10-12 15:31                   ` Thomas Monjalon
@ 2016-10-13  5:33                   ` Yuanhan Liu
  2016-10-13  5:35                     ` Yuanhan Liu
  2016-10-13  6:02                     ` Wang, Zhihong
  1 sibling, 2 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-13  5:33 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Jianbo Liu, Thomas Monjalon, Maxime Coquelin, dev

On Wed, Oct 12, 2016 at 12:22:08PM +0000, Wang, Zhihong wrote:
> > > >> >  3. How many percentage of drop are you seeing?
> > > The testing result:
> > > size (bytes)     improvement (%)
> > > 64                   3.92
> > > 128                 11.51
> > > 256                  24.16
> > > 512                  -13.79
> > > 1024                -22.51
> > > 1500                -12.22
> > > A correction is that performance is dropping if byte size is larger than 512.
> > 
> > I have thought of this twice. Unfortunately, I think I may need NACK this
> > series.
> > 
> > Merging two code path into one is really good: as you stated, it improves
> > the maintainability. But only if we see no performance regression on both
> > path after the refactor. Unfortunately, that's not the case here: it hurts
> > the performance for one code path (non-mrg Rx).
> > 
> > That makes me think we may should not do the code path merge at all. I think
> > that also aligns with what you have said before (internally): we could do the
> > merge if it gives comparable performance before and after that.
> > 
> > Besides that, I don't quite like the way you did in patch 2 (rewrite enqueue):
> > you made a lot of changes in one patch. That means if something wrong
> > happened,
> > it is hard to narrow down which change introduces that regression. Badly,
> > that's exactly what we met here. Weeks have been passed, I see no progress.
> > 
> > That's the reason we like the idea of "one patch only does one thing, an
> > atomic thing".
> 
> 
> Yuanhan, folks,
> 
> Thanks for the analysis. I disagree here though.
> 
> I analyze, develop, benchmark on x86 platforms, where this patch
> works great.

Yes, that's a great effort! With your hard work, we know what the bottleneck
is and how it could be improved.

However, you don't have to do the code refactor (merge two code paths into
one) to apply those improvements. From what I know, in this patchset, there
are two factors that could improve the performance:

- copy the hdr together with the packet data

- shadow the used ring updates and apply them to the used ring at once

The overall performance boost I got with your v6 patchset on the mrg-Rx
code path is about 27% (in the PVP case). And I have just applied the 1st
optimization; it yields about a 20% boost. The rest could be covered if
we apply the 2nd optimization (I guess).
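
For the 2nd factor, a minimal sketch of what "shadow the used ring and apply
it at once" could look like, using the shadow_used_ring/shadow_used_idx fields
proposed in this series (update_shadow_used_ring and flush_shadow_used_ring
are illustrative helpers; ring wrap-around and dirty-page logging are left out
for brevity):

	static inline void
	update_shadow_used_ring(struct vhost_virtqueue *vq,
		uint16_t desc_idx, uint32_t len)
	{
		uint32_t i = vq->shadow_used_idx++;

		/* record the used element locally, off the guest-visible ring */
		vq->shadow_used_ring[i].id  = desc_idx;
		vq->shadow_used_ring[i].len = len;
	}

	static inline void
	flush_shadow_used_ring(struct vhost_virtqueue *vq)
	{
		uint16_t used_idx = vq->last_used_idx & (vq->size - 1);

		/* one burst copy into the used ring; a real version must split
		 * the copy when it wraps past the end of the ring
		 */
		rte_memcpy(&vq->used->ring[used_idx], vq->shadow_used_ring,
			vq->shadow_used_idx * sizeof(struct vring_used_elem));
		rte_smp_wmb();
		vq->used->idx += vq->shadow_used_idx;
		vq->last_used_idx += vq->shadow_used_idx;
		vq->shadow_used_idx = 0;
	}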

That would be a clean way to optimize the vhost mergeable Rx path:

- you don't touch the non-mrg Rx path (well, you could perhaps apply the
  shadow_used_ring trick to it as well)

  This would at least make sure we have no performance regression like
  the one reported by the ARM guys.

- you don't refactor the code

  A rewrite from scratch could introduce other issues, besides the
  performance regression. We may just not know it yet.


Does that make sense to you? If you agree, I think we could still make it in
this release: they would be small changes after all. For example,
below is a patch that applies the 1st optimization tip on top of
dpdk-next-virtio.

	--yliu

---------------------------------------------------------------
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..0ddb5af 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -379,7 +379,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			    uint16_t end_idx, struct rte_mbuf *m,
 			    struct buf_vector *buf_vec)
 {
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
 	uint32_t vec_idx = 0;
 	uint16_t start_idx = vq->last_used_idx;
 	uint16_t cur_idx = start_idx;
@@ -388,6 +388,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t desc_offset, desc_avail;
 	uint32_t cpy_len;
 	uint16_t desc_idx, used_idx;
+	uint16_t num_buffers = end_idx - start_idx;
+	int hdr_copied = 0;
 
 	if (unlikely(m == NULL))
 		return 0;
@@ -399,16 +401,11 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
 		return 0;
 
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
+	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+	rte_prefetch0(virtio_hdr);
 
-	virtio_hdr.num_buffers = end_idx - start_idx;
 	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-		dev->vid, virtio_hdr.num_buffers);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
+		dev->vid, num_buffers);
 
 	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
 	desc_offset = dev->vhost_hlen;
@@ -450,6 +447,15 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			mbuf_avail  = rte_pktmbuf_data_len(m);
 		}
 
+		if (hdr_copied == 0) {
+			virtio_hdr->num_buffers = num_buffers;
+			virtio_enqueue_offload(m, &virtio_hdr->hdr);
+			vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
+			PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
+
+			hdr_copied = 1;
+		}
+
 		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
 		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
 			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-13  5:33                   ` Yuanhan Liu
@ 2016-10-13  5:35                     ` Yuanhan Liu
  2016-10-13  6:02                     ` Wang, Zhihong
  1 sibling, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-13  5:35 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: Jianbo Liu, Thomas Monjalon, Maxime Coquelin, dev

On Thu, Oct 13, 2016 at 01:33:24PM +0800, Yuanhan Liu wrote:
> That would be a clean way to optimize vhost mergeable Rx path:
> 
> - you don't touch non-mrg Rx path (well, you may could apply the
>   shadow_used_ring trick to it as wel)

I meant "to non-mrg Rx as well".

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-13  5:33                   ` Yuanhan Liu
  2016-10-13  5:35                     ` Yuanhan Liu
@ 2016-10-13  6:02                     ` Wang, Zhihong
  2016-10-13  7:54                       ` Maxime Coquelin
  1 sibling, 1 reply; 141+ messages in thread
From: Wang, Zhihong @ 2016-10-13  6:02 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: Jianbo Liu, Thomas Monjalon, Maxime Coquelin, dev



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Thursday, October 13, 2016 1:33 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: Jianbo Liu <jianbo.liu@linaro.org>; Thomas Monjalon
> <thomas.monjalon@6wind.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
> 
> On Wed, Oct 12, 2016 at 12:22:08PM +0000, Wang, Zhihong wrote:
> > > > >> >  3. How many percentage of drop are you seeing?
> > > > The testing result:
> > > > size (bytes)     improvement (%)
> > > > 64                   3.92
> > > > 128                 11.51
> > > > 256                  24.16
> > > > 512                  -13.79
> > > > 1024                -22.51
> > > > 1500                -12.22
> > > > A correction is that performance is dropping if byte size is larger than
> 512.
> > >
> > > I have thought of this twice. Unfortunately, I think I may need NACK this
> > > series.
> > >
> > > Merging two code path into one is really good: as you stated, it improves
> > > the maintainability. But only if we see no performance regression on both
> > > path after the refactor. Unfortunately, that's not the case here: it hurts
> > > the performance for one code path (non-mrg Rx).
> > >
> > > That makes me think we may should not do the code path merge at all. I
> think
> > > that also aligns with what you have said before (internally): we could do
> the
> > > merge if it gives comparable performance before and after that.
> > >
> > > Besides that, I don't quite like the way you did in patch 2 (rewrite
> enqueue):
> > > you made a lot of changes in one patch. That means if something wrong
> > > happened,
> > > it is hard to narrow down which change introduces that regression. Badly,
> > > that's exactly what we met here. Weeks have been passed, I see no
> progress.
> > >
> > > That's the reason we like the idea of "one patch only does one thing, an
> > > atomic thing".
> >
> >
> > Yuanhan, folks,
> >
> > Thanks for the analysis. I disagree here though.
> >
> > I analyze, develop, benchmark on x86 platforms, where this patch
> > works great.
> 
> Yes, that's great effort! With your hardwork, we know what the bottleneck
> is and how it could be improved.
> 
> However, you don't have to do code refactor (merge two code path to one)
> to apply those improvements. From what I know, in this patchset, there
> are two factors could improve the performance:
> 
> - copy hdr together with packet data
> 
> - shadow used ring update and update at once
> 
> The overall performance boost I got with your v6 patchset with mrg-Rx
> code path is about 27% (in PVP case). And I have just applied the 1st
> optimization, it yields about 20% boosts. The left could be covered if
> we apply the 2nd optimization (I guess).
> 
> That would be a clean way to optimize vhost mergeable Rx path:
> 
> - you don't touch non-mrg Rx path (well, you may could apply the
>   shadow_used_ring trick to it as wel)
> 
>   This would at least make sure we will have no such performance
>   regression issue reported by ARM guys.
> 
> - you don't refactor the code
> 
>   The rewrite from scratch could introduce other issues, besides the
>   performance regression. We may just don't know it yet.
> 
> 
> Make sense to you? If you agree, I think we could still make it in
> this release: they would be some small changes after all. For example,
> below is the patch applies the 1st optimization tip on top of
> dpdk-next-virtio


Thanks for this great idea. I think it's a better way to do it.
I'll start to make the patch then.


> 
> 	--yliu
> 
> ---------------------------------------------------------------
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 8a151af..0ddb5af 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -379,7 +379,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net
> *dev, struct vhost_virtqueue *vq,
>  			    uint16_t end_idx, struct rte_mbuf *m,
>  			    struct buf_vector *buf_vec)
>  {
> -	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
> +	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
>  	uint32_t vec_idx = 0;
>  	uint16_t start_idx = vq->last_used_idx;
>  	uint16_t cur_idx = start_idx;
> @@ -388,6 +388,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net
> *dev, struct vhost_virtqueue *vq,
>  	uint32_t desc_offset, desc_avail;
>  	uint32_t cpy_len;
>  	uint16_t desc_idx, used_idx;
> +	uint16_t num_buffers = end_idx - start_idx;
> +	int hdr_copied = 0;
> 
>  	if (unlikely(m == NULL))
>  		return 0;
> @@ -399,16 +401,11 @@ copy_mbuf_to_desc_mergeable(struct virtio_net
> *dev, struct vhost_virtqueue *vq,
>  	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
>  		return 0;
> 
> -	rte_prefetch0((void *)(uintptr_t)desc_addr);
> +	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf
> *)(uintptr_t)desc_addr;
> +	rte_prefetch0(virtio_hdr);
> 
> -	virtio_hdr.num_buffers = end_idx - start_idx;
>  	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
> -		dev->vid, virtio_hdr.num_buffers);
> -
> -	virtio_enqueue_offload(m, &virtio_hdr.hdr);
> -	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
> -	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
> -	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
> +		dev->vid, num_buffers);
> 
>  	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
>  	desc_offset = dev->vhost_hlen;
> @@ -450,6 +447,15 @@ copy_mbuf_to_desc_mergeable(struct virtio_net
> *dev, struct vhost_virtqueue *vq,
>  			mbuf_avail  = rte_pktmbuf_data_len(m);
>  		}
> 
> +		if (hdr_copied == 0) {
> +			virtio_hdr->num_buffers = num_buffers;
> +			virtio_enqueue_offload(m, &virtio_hdr->hdr);
> +			vhost_log_write(dev, buf_vec[vec_idx].buf_addr,
> dev->vhost_hlen);
> +			PRINT_PACKET(dev, (uintptr_t)desc_addr, dev-
> >vhost_hlen, 0);
> +
> +			hdr_copied = 1;
> +		}
> +
>  		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
>  		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
>  			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [dpdk-stable] [PATCH v6 1/6] vhost: fix windows vm hang
  2016-09-20  2:00   ` [PATCH v6 1/6] vhost: fix windows vm hang Zhihong Wang
@ 2016-10-13  6:18     ` Yuanhan Liu
  0 siblings, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-13  6:18 UTC (permalink / raw)
  To: Zhihong Wang; +Cc: dev, maxime.coquelin, thomas.monjalon, stable

On Mon, Sep 19, 2016 at 10:00:12PM -0400, Zhihong Wang wrote:
> This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost code
> which causes the guest to hang once any packets are enqueued when mrg_rxbuf
> is turned on by setting the right id and len in the used ring.
> 
> As defined in virtio spec 0.95 and 1.0, in each used ring element, id means
> index of start of used descriptor chain, and len means total length of the
> descriptor chain which was written to. While in 16.07 code, index of the
> last descriptor is assigned to id, and the length of the last descriptor is
> assigned to len.
> 
> How to test?
> 
>  1. Start testpmd in the host with a vhost port.
> 
>  2. Start a Windows VM image with qemu and connect to the vhost port.
> 
>  3. Start io forwarding with tx_first in host testpmd.
> 
> For 16.07 code, the Windows VM will hang once any packets are enqueued.
> 
> Cc: <stable@dpdk.org>
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>

Applied to dpdk-next-virtio (this patch only).

Thanks.

	--yliu
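
As a side note on the fix described in the quoted commit message, a minimal
sketch of how each used-ring element should be filled (desc_chain_head and
desc_chain_len are illustrative names for the chain's start index and the
total bytes written to the chain):

	used_idx = vq->last_used_idx & (vq->size - 1);
	/* id: index of the first descriptor of the chain,
	 * len: total length written to the whole chain (header + data),
	 * not the index/length of the last descriptor as in the 16.07 code
	 */
	vq->used->ring[used_idx].id  = desc_chain_head;
	vq->used->ring[used_idx].len = desc_chain_len;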

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-13  6:02                     ` Wang, Zhihong
@ 2016-10-13  7:54                       ` Maxime Coquelin
  2016-10-13  9:23                         ` Maxime Coquelin
  0 siblings, 1 reply; 141+ messages in thread
From: Maxime Coquelin @ 2016-10-13  7:54 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu; +Cc: Jianbo Liu, Thomas Monjalon, dev



On 10/13/2016 08:02 AM, Wang, Zhihong wrote:
>> > Yes, that's great effort! With your hardwork, we know what the bottleneck
>> > is and how it could be improved.
>> >
>> > However, you don't have to do code refactor (merge two code path to one)
>> > to apply those improvements. From what I know, in this patchset, there
>> > are two factors could improve the performance:
>> >
>> > - copy hdr together with packet data
>> >
>> > - shadow used ring update and update at once
>> >
>> > The overall performance boost I got with your v6 patchset with mrg-Rx
>> > code path is about 27% (in PVP case). And I have just applied the 1st
>> > optimization, it yields about 20% boosts. The left could be covered if
>> > we apply the 2nd optimization (I guess).
>> >
>> > That would be a clean way to optimize vhost mergeable Rx path:
>> >
>> > - you don't touch non-mrg Rx path (well, you may could apply the
>> >   shadow_used_ring trick to it as wel)
>> >
>> >   This would at least make sure we will have no such performance
>> >   regression issue reported by ARM guys.
>> >
>> > - you don't refactor the code
>> >
>> >   The rewrite from scratch could introduce other issues, besides the
>> >   performance regression. We may just don't know it yet.
>> >
>> >
>> > Make sense to you? If you agree, I think we could still make it in
>> > this release: they would be some small changes after all. For example,
>> > below is the patch applies the 1st optimization tip on top of
>> > dpdk-next-virtio
>
> Thanks for this great idea. I think it's a better way to do it.
> I'll start to make the patch then.
>
>

I personally find having two paths better for maintenance as it is
easier to understand (IMHO).
So if we can have the performance gain while keeping the two paths,
I definitely support the idea.

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-13  7:54                       ` Maxime Coquelin
@ 2016-10-13  9:23                         ` Maxime Coquelin
  2016-10-14 10:11                           ` Yuanhan Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Maxime Coquelin @ 2016-10-13  9:23 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu; +Cc: Jianbo Liu, Thomas Monjalon, dev



On 10/13/2016 09:54 AM, Maxime Coquelin wrote:
>
>
> On 10/13/2016 08:02 AM, Wang, Zhihong wrote:
>>> > Yes, that's great effort! With your hardwork, we know what the
>>> bottleneck
>>> > is and how it could be improved.
>>> >
>>> > However, you don't have to do code refactor (merge two code path to
>>> one)
>>> > to apply those improvements. From what I know, in this patchset, there
>>> > are two factors could improve the performance:
>>> >
>>> > - copy hdr together with packet data
>>> >
>>> > - shadow used ring update and update at once
>>> >
>>> > The overall performance boost I got with your v6 patchset with mrg-Rx
>>> > code path is about 27% (in PVP case). And I have just applied the 1st
>>> > optimization, it yields about 20% boosts. The left could be covered if
>>> > we apply the 2nd optimization (I guess).
>>> >
>>> > That would be a clean way to optimize vhost mergeable Rx path:
>>> >
>>> > - you don't touch non-mrg Rx path (well, you may could apply the
>>> >   shadow_used_ring trick to it as wel)
>>> >
>>> >   This would at least make sure we will have no such performance
>>> >   regression issue reported by ARM guys.
>>> >
>>> > - you don't refactor the code
>>> >
>>> >   The rewrite from scratch could introduce other issues, besides the
>>> >   performance regression. We may just don't know it yet.
>>> >
>>> >
>>> > Make sense to you? If you agree, I think we could still make it in
>>> > this release: they would be some small changes after all. For example,
>>> > below is the patch applies the 1st optimization tip on top of
>>> > dpdk-next-virtio
>>
>> Thanks for this great idea. I think it's a better way to do it.
>> I'll start to make the patch then.
>>
>>
>
> I personally find having two paths better for maintenance as it is
> easier to understand (IMHO).
> So if we can have the performance gain while keeping the two paths,
> I definitely support the idea.

I was going to re-run some PVP benchmark with 0% pkt loss, as I had
some strange results last week.

Problem is that your series no more apply cleanly due to
next-virtio's master branch history rewrite.
Any chance you send me a rebased version so that I can apply the series?

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* [PATCH v7 0/7] vhost: optimize mergeable Rx path
  2016-09-20  2:00 ` [PATCH v6 " Zhihong Wang
                     ` (6 preceding siblings ...)
  2016-09-21  2:26   ` [PATCH v6 0/6] vhost: optimize enqueue Yuanhan Liu
@ 2016-10-14  9:34   ` Yuanhan Liu
  2016-10-14  9:34     ` [PATCH v7 1/7] vhost: remove useless volatile Yuanhan Liu
                       ` (9 more replies)
  7 siblings, 10 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-14  9:34 UTC (permalink / raw)
  To: dev; +Cc: Maxime Coquelin, Yuanhan Liu, Jianbo Liu

This is a new set of patches to optimize the mergeable Rx code path.
No refactoring (rewrite) was made this time. It just applies some
findings from Zhihong (kudos to him!) that could improve the mergeable
Rx path on the old code.

The two major factors that could improve the performance greatly are:

- copy the virtio header together with the packet data. This could remove
  the bubbles between the two copies to optimize the cache access.

  This is implemented in patch 2 "vhost: optimize cache access"

- shadow the used ring updates and flush them at once

  The basic idea is to update the used ring in a local buffer and flush
  it to the virtio used ring at once in the end. Again, this is for
  optimizing the cache access (a minimal sketch follows this list).

  This is implemented in patch 5 "vhost: shadow used ring update"
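
To make the second idea concrete, below is a minimal sketch of the fill
and flush steps, written against the shadow_used_ring/shadow_used_idx
fields that patch 5 adds. It deliberately ignores used ring wrap-around
and dirty-page logging, both of which the real patch handles:

	static inline void
	shadow_used_add(struct vhost_virtqueue *vq,
			uint16_t desc_idx, uint16_t len)
	{
		uint16_t i = vq->shadow_used_idx++;

		/* record the used element locally; nothing is written to
		 * the guest-visible used ring yet */
		vq->shadow_used_ring[i].id  = desc_idx;
		vq->shadow_used_ring[i].len = len;
	}

	static inline void
	shadow_used_flush(struct vhost_virtqueue *vq)
	{
		uint16_t used_idx = vq->last_used_idx & (vq->size - 1);

		/* one burst copy to the real used ring, then publish idx */
		rte_memcpy(&vq->used->ring[used_idx], vq->shadow_used_ring,
			vq->shadow_used_idx * sizeof(struct vring_used_elem));
		vq->last_used_idx += vq->shadow_used_idx;

		rte_smp_wmb();
		*(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
	}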

The two optimizations could yield 40+% performance improvement in micro
testing and 20+% in PVP case testing with 64B packet size.

Besides that, there are some tiny optimizations, such as prefetch
avail ring (patch 6) and retrieve avail head once (patch 7).

Note: the shadow used ring technique could also be applied to the non-mrg
Rx path (and even the dequeue path). I didn't do that for two reasons:

- we already update the used ring in batch in both paths: it's just not
  shadowed first.

- it's a bit too late to make many changes at this stage: RC1 is out.

Please help testing.

Thanks.

	--yliu

Cc: Jianbo Liu <jianbo.liu@linaro.org>
---
Yuanhan Liu (4):
  vhost: simplify mergeable Rx vring reservation
  vhost: use last avail idx for avail ring reservation
  vhost: prefetch avail ring
  vhost: retrieve avail head once

Zhihong Wang (3):
  vhost: remove useless volatile
  vhost: optimize cache access
  vhost: shadow used ring update

 lib/librte_vhost/vhost.c      |  13 ++-
 lib/librte_vhost/vhost.h      |   5 +-
 lib/librte_vhost/vhost_user.c |  23 +++--
 lib/librte_vhost/virtio_net.c | 193 +++++++++++++++++++++++++-----------------
 4 files changed, 149 insertions(+), 85 deletions(-)

-- 
1.9.0

^ permalink raw reply	[flat|nested] 141+ messages in thread

* [PATCH v7 1/7] vhost: remove useless volatile
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
@ 2016-10-14  9:34     ` Yuanhan Liu
  2016-10-14  9:34     ` [PATCH v7 2/7] vhost: optimize cache access Yuanhan Liu
                       ` (8 subsequent siblings)
  9 siblings, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-14  9:34 UTC (permalink / raw)
  To: dev; +Cc: Maxime Coquelin, Zhihong Wang

From: Zhihong Wang <zhihong.wang@intel.com>

last_used_idx is a local var; there is no need to decorate it
with "volatile".

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 53dbf33..17c557f 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -85,7 +85,7 @@ struct vhost_virtqueue {
 	uint32_t		size;
 
 	uint16_t		last_avail_idx;
-	volatile uint16_t	last_used_idx;
+	uint16_t		last_used_idx;
 #define VIRTIO_INVALID_EVENTFD		(-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD	(-2)
 
-- 
1.9.0

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v7 2/7] vhost: optimize cache access
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
  2016-10-14  9:34     ` [PATCH v7 1/7] vhost: remove useless volatile Yuanhan Liu
@ 2016-10-14  9:34     ` Yuanhan Liu
  2016-10-14  9:34     ` [PATCH v7 3/7] vhost: simplify mergeable Rx vring reservation Yuanhan Liu
                       ` (7 subsequent siblings)
  9 siblings, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-14  9:34 UTC (permalink / raw)
  To: dev; +Cc: Maxime Coquelin, Zhihong Wang, Yuanhan Liu

From: Zhihong Wang <zhihong.wang@intel.com>

This patch reorders the code to delay the virtio header write to improve
cache access efficiency for cases where the mrg_rxbuf feature is turned
on. CPU pipeline stall cycles can be significantly reduced.

The virtio header write and the mbuf data copy are both remote store
operations, which take a long time to finish. It's a good idea to put
them together to remove the bubbles in between, to let as many remote
store instructions as possible go into the store buffer at the same
time to hide latency, and to let the H/W prefetcher go to work as early
as possible.

On a Haswell machine, about 100 cycles can be saved per packet by this
patch alone. Taking 64B packet traffic for example, this means about 60%
efficiency improvement for the enqueue operation.
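
For readers skimming the diff below, the reordering boils down to
remembering the header address up front and filling the header in the
first iteration of the copy loop, so that the header store is issued
back to back with the first data copy. This is only an outline using
the names from the diff; the offset bookkeeping and descriptor
switching are omitted:

	hdr_mbuf = m;
	hdr_addr = desc_addr;		/* defer the header write ... */

	while (mbuf_avail != 0 || m->next != NULL) {
		if (hdr_addr) {		/* ... until the first data copy */
			virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr);
			copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr);
			hdr_addr = 0;
		}

		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
			cpy_len);
	}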

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
---
 lib/librte_vhost/virtio_net.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 812e5d3..d4fc62a 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -390,6 +390,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t desc_offset, desc_avail;
 	uint32_t cpy_len;
 	uint16_t desc_idx, used_idx;
+	uint64_t hdr_addr, hdr_phys_addr;
+	struct rte_mbuf *hdr_mbuf;
 
 	if (unlikely(m == NULL))
 		return 0;
@@ -401,17 +403,15 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
 		return 0;
 
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
+	hdr_mbuf = m;
+	hdr_addr = desc_addr;
+	hdr_phys_addr = buf_vec[vec_idx].buf_addr;
+	rte_prefetch0((void *)(uintptr_t)hdr_addr);
 
 	virtio_hdr.num_buffers = end_idx - start_idx;
 	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
 		dev->vid, virtio_hdr.num_buffers);
 
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
 	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
 	desc_offset = dev->vhost_hlen;
 	desc_chain_head = buf_vec[vec_idx].desc_idx;
@@ -456,6 +456,16 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			mbuf_avail  = rte_pktmbuf_data_len(m);
 		}
 
+		if (hdr_addr) {
+			virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr);
+			copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr);
+			vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen);
+			PRINT_PACKET(dev, (uintptr_t)hdr_addr,
+				     dev->vhost_hlen, 0);
+
+			hdr_addr = 0;
+		}
+
 		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
 		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
 			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-- 
1.9.0

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v7 3/7] vhost: simplify mergeable Rx vring reservation
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
  2016-10-14  9:34     ` [PATCH v7 1/7] vhost: remove useless volatile Yuanhan Liu
  2016-10-14  9:34     ` [PATCH v7 2/7] vhost: optimize cache access Yuanhan Liu
@ 2016-10-14  9:34     ` Yuanhan Liu
  2016-10-25 22:08       ` Thomas Monjalon
  2016-10-14  9:34     ` [PATCH v7 4/7] vhost: use last avail idx for avail ring reservation Yuanhan Liu
                       ` (6 subsequent siblings)
  9 siblings, 1 reply; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-14  9:34 UTC (permalink / raw)
  To: dev; +Cc: Maxime Coquelin, Yuanhan Liu, Zhihong Wang

Let it return "num_buffers" we reserved, so that we could re-use it
with copy_mbuf_to_desc_mergeable() directly, instead of calculating
it again there.

Meanwhile, the return type of copy_mbuf_to_desc_mergeable is changed
to "int". -1 will be return on error.

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index d4fc62a..1a40c91 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -336,7 +336,7 @@ fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
  */
 static inline int
 reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-			    uint16_t *end, struct buf_vector *buf_vec)
+			    struct buf_vector *buf_vec, uint16_t *num_buffers)
 {
 	uint16_t cur_idx;
 	uint16_t avail_idx;
@@ -370,19 +370,18 @@ reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
 			return -1;
 	}
 
-	*end = cur_idx;
+	*num_buffers = cur_idx - vq->last_used_idx;
 	return 0;
 }
 
-static inline uint32_t __attribute__((always_inline))
+static inline int __attribute__((always_inline))
 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			    uint16_t end_idx, struct rte_mbuf *m,
-			    struct buf_vector *buf_vec)
+			    struct rte_mbuf *m, struct buf_vector *buf_vec,
+			    uint16_t num_buffers)
 {
 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
 	uint32_t vec_idx = 0;
-	uint16_t start_idx = vq->last_used_idx;
-	uint16_t cur_idx = start_idx;
+	uint16_t cur_idx = vq->last_used_idx;
 	uint64_t desc_addr;
 	uint32_t desc_chain_head;
 	uint32_t desc_chain_len;
@@ -394,21 +393,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mbuf *hdr_mbuf;
 
 	if (unlikely(m == NULL))
-		return 0;
+		return -1;
 
 	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
 		dev->vid, cur_idx, end_idx);
 
 	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
 	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
-		return 0;
+		return -1;
 
 	hdr_mbuf = m;
 	hdr_addr = desc_addr;
 	hdr_phys_addr = buf_vec[vec_idx].buf_addr;
 	rte_prefetch0((void *)(uintptr_t)hdr_addr);
 
-	virtio_hdr.num_buffers = end_idx - start_idx;
+	virtio_hdr.num_buffers = num_buffers;
 	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
 		dev->vid, virtio_hdr.num_buffers);
 
@@ -440,7 +439,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
 			if (unlikely(!desc_addr))
-				return 0;
+				return -1;
 
 			/* Prefetch buffer address. */
 			rte_prefetch0((void *)(uintptr_t)desc_addr);
@@ -489,7 +488,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		offsetof(struct vring_used, ring[used_idx]),
 		sizeof(vq->used->ring[used_idx]));
 
-	return end_idx - start_idx;
+	return 0;
 }
 
 static inline uint32_t __attribute__((always_inline))
@@ -497,8 +496,8 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	struct rte_mbuf **pkts, uint32_t count)
 {
 	struct vhost_virtqueue *vq;
-	uint32_t pkt_idx = 0, nr_used = 0;
-	uint16_t end;
+	uint32_t pkt_idx = 0;
+	uint16_t num_buffers;
 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
 
 	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
@@ -519,22 +518,24 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
 
-		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
-							 &end, buf_vec) < 0)) {
+		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len, buf_vec,
+							 &num_buffers) < 0)) {
 			LOG_DEBUG(VHOST_DATA,
 				"(%d) failed to get enough desc from vring\n",
 				dev->vid);
 			break;
 		}
 
-		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
-						      pkts[pkt_idx], buf_vec);
+		if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
+						buf_vec, num_buffers) < 0)
+			break;
+
 		rte_smp_wmb();
 
-		*(volatile uint16_t *)&vq->used->idx += nr_used;
+		*(volatile uint16_t *)&vq->used->idx += num_buffers;
 		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
 			sizeof(vq->used->idx));
-		vq->last_used_idx += nr_used;
+		vq->last_used_idx += num_buffers;
 	}
 
 	if (likely(pkt_idx)) {
-- 
1.9.0

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v7 4/7] vhost: use last avail idx for avail ring reservation
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
                       ` (2 preceding siblings ...)
  2016-10-14  9:34     ` [PATCH v7 3/7] vhost: simplify mergeable Rx vring reservation Yuanhan Liu
@ 2016-10-14  9:34     ` Yuanhan Liu
  2016-10-14  9:34     ` [PATCH v7 5/7] vhost: shadow used ring update Yuanhan Liu
                       ` (5 subsequent siblings)
  9 siblings, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-14  9:34 UTC (permalink / raw)
  To: dev; +Cc: Maxime Coquelin, Yuanhan Liu, Zhihong Wang

The shadow_used_ring will be introduced later. From then on, the last
avail idx will no longer be updated together with the last used idx.

So, here we use last_avail_idx for the avail ring reservation.

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 1a40c91..b5ba633 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -344,7 +344,7 @@ reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
 	uint32_t vec_idx = 0;
 	uint16_t tries = 0;
 
-	cur_idx  = vq->last_used_idx;
+	cur_idx = vq->last_avail_idx;
 
 	while (1) {
 		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
@@ -370,7 +370,7 @@ reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
 			return -1;
 	}
 
-	*num_buffers = cur_idx - vq->last_used_idx;
+	*num_buffers = cur_idx - vq->last_avail_idx;
 	return 0;
 }
 
@@ -536,6 +536,7 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
 			sizeof(vq->used->idx));
 		vq->last_used_idx += num_buffers;
+		vq->last_avail_idx += num_buffers;
 	}
 
 	if (likely(pkt_idx)) {
-- 
1.9.0

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v7 5/7] vhost: shadow used ring update
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
                       ` (3 preceding siblings ...)
  2016-10-14  9:34     ` [PATCH v7 4/7] vhost: use last avail idx for avail ring reservation Yuanhan Liu
@ 2016-10-14  9:34     ` Yuanhan Liu
  2016-10-14  9:34     ` [PATCH v7 6/7] vhost: prefetch avail ring Yuanhan Liu
                       ` (4 subsequent siblings)
  9 siblings, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-14  9:34 UTC (permalink / raw)
  To: dev; +Cc: Maxime Coquelin, Zhihong Wang, Yuanhan Liu

From: Zhihong Wang <zhihong.wang@intel.com>

The basic idea is to shadow the used ring updates: write them into a
local buffer first, and then flush them all to the virtio used vring
at once in the end.

And since we do the avail ring reservation before enqueuing data, we
know which and how many descs will be used, which means we can update
the shadow used ring at reservation time. It also brings another slight
advantage: we no longer need to access desc->flags inside
copy_mbuf_to_desc_mergeable().

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
---
 lib/librte_vhost/vhost.c      |  13 +++-
 lib/librte_vhost/vhost.h      |   3 +
 lib/librte_vhost/vhost_user.c |  23 +++++--
 lib/librte_vhost/virtio_net.c | 138 +++++++++++++++++++++++++-----------------
 4 files changed, 113 insertions(+), 64 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 469117a..d8116ff 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -121,9 +121,18 @@ static void
 free_device(struct virtio_net *dev)
 {
 	uint32_t i;
+	struct vhost_virtqueue *rxq, *txq;
 
-	for (i = 0; i < dev->virt_qp_nb; i++)
-		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+	for (i = 0; i < dev->virt_qp_nb; i++) {
+		rxq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ];
+		txq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ];
+
+		rte_free(rxq->shadow_used_ring);
+		rte_free(txq->shadow_used_ring);
+
+		/* rxq and txq are allocated together as queue-pair */
+		rte_free(rxq);
+	}
 
 	rte_free(dev);
 }
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 17c557f..acec772 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -105,6 +105,9 @@ struct vhost_virtqueue {
 	uint16_t		last_zmbuf_idx;
 	struct zcopy_mbuf	*zmbufs;
 	struct zcopy_mbuf_list	zmbuf_list;
+
+	struct vring_used_elem  *shadow_used_ring;
+	uint16_t                shadow_used_idx;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 3074227..6b83c15 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -198,6 +198,15 @@ vhost_user_set_vring_num(struct virtio_net *dev,
 		}
 	}
 
+	vq->shadow_used_ring = rte_malloc(NULL,
+				vq->size * sizeof(struct vring_used_elem),
+				RTE_CACHE_LINE_SIZE);
+	if (!vq->shadow_used_ring) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to allocate memory for shadow used ring.\n");
+		return -1;
+	}
+
 	return 0;
 }
 
@@ -711,6 +720,8 @@ static int
 vhost_user_get_vring_base(struct virtio_net *dev,
 			  struct vhost_vring_state *state)
 {
+	struct vhost_virtqueue *vq = dev->virtqueue[state->index];
+
 	/* We have to stop the queue (virtio) if it is running. */
 	if (dev->flags & VIRTIO_DEV_RUNNING) {
 		dev->flags &= ~VIRTIO_DEV_RUNNING;
@@ -718,7 +729,7 @@ vhost_user_get_vring_base(struct virtio_net *dev,
 	}
 
 	/* Here we are safe to get the last used index */
-	state->num = dev->virtqueue[state->index]->last_used_idx;
+	state->num = vq->last_used_idx;
 
 	RTE_LOG(INFO, VHOST_CONFIG,
 		"vring base idx:%d file:%d\n", state->index, state->num);
@@ -727,13 +738,15 @@ vhost_user_get_vring_base(struct virtio_net *dev,
 	 * sent and only sent in vhost_vring_stop.
 	 * TODO: cleanup the vring, it isn't usable since here.
 	 */
-	if (dev->virtqueue[state->index]->kickfd >= 0)
-		close(dev->virtqueue[state->index]->kickfd);
+	if (vq->kickfd >= 0)
+		close(vq->kickfd);
 
-	dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
 
 	if (dev->dequeue_zero_copy)
-		free_zmbufs(dev->virtqueue[state->index]);
+		free_zmbufs(vq);
+	rte_free(vq->shadow_used_ring);
+	vq->shadow_used_ring = NULL;
 
 	return 0;
 }
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index b5ba633..2bdc2fe 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -91,6 +91,56 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
 	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }
 
+static inline void __attribute__((always_inline))
+do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+			  uint16_t to, uint16_t from, uint16_t size)
+{
+	rte_memcpy(&vq->used->ring[to],
+			&vq->shadow_used_ring[from],
+			size * sizeof(struct vring_used_elem));
+	vhost_log_used_vring(dev, vq,
+			offsetof(struct vring_used, ring[to]),
+			size * sizeof(struct vring_used_elem));
+}
+
+static inline void __attribute__((always_inline))
+flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+	if (used_idx + vq->shadow_used_idx <= vq->size) {
+		do_flush_shadow_used_ring(dev, vq, used_idx, 0,
+					  vq->shadow_used_idx);
+	} else {
+		uint16_t size;
+
+		/* update used ring interval [used_idx, vq->size] */
+		size = vq->size - used_idx;
+		do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);
+
+		/* update the left half used ring interval [0, left_size] */
+		do_flush_shadow_used_ring(dev, vq, 0, size,
+					  vq->shadow_used_idx - size);
+	}
+	vq->last_used_idx += vq->shadow_used_idx;
+
+	rte_smp_wmb();
+
+	*(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+		sizeof(vq->used->idx));
+}
+
+static inline void __attribute__((always_inline))
+update_shadow_used_ring(struct vhost_virtqueue *vq,
+			 uint16_t desc_idx, uint16_t len)
+{
+	uint16_t i = vq->shadow_used_idx++;
+
+	vq->shadow_used_ring[i].id  = desc_idx;
+	vq->shadow_used_ring[i].len = len;
+}
+
 static void
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
@@ -300,15 +350,16 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 	return count;
 }
 
-static inline int
+static inline int __attribute__((always_inline))
 fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-	     uint32_t *allocated, uint32_t *vec_idx,
-	     struct buf_vector *buf_vec)
+	     uint32_t *vec_idx, struct buf_vector *buf_vec,
+	     uint16_t *desc_chain_head, uint16_t *desc_chain_len)
 {
 	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
 	uint32_t vec_id = *vec_idx;
-	uint32_t len    = *allocated;
+	uint32_t len    = 0;
 
+	*desc_chain_head = idx;
 	while (1) {
 		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
 			return -1;
@@ -325,8 +376,8 @@ fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
 		idx = vq->desc[idx].next;
 	}
 
-	*allocated = len;
-	*vec_idx   = vec_id;
+	*desc_chain_len = len;
+	*vec_idx = vec_id;
 
 	return 0;
 }
@@ -340,26 +391,30 @@ reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
 {
 	uint16_t cur_idx;
 	uint16_t avail_idx;
-	uint32_t allocated = 0;
 	uint32_t vec_idx = 0;
 	uint16_t tries = 0;
 
-	cur_idx = vq->last_avail_idx;
+	uint16_t head_idx = 0;
+	uint16_t len = 0;
 
-	while (1) {
+	*num_buffers = 0;
+	cur_idx  = vq->last_avail_idx;
+
+	while (size > 0) {
 		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
 		if (unlikely(cur_idx == avail_idx))
 			return -1;
 
-		if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
-					  &vec_idx, buf_vec) < 0))
+		if (unlikely(fill_vec_buf(vq, cur_idx, &vec_idx, buf_vec,
+					  &head_idx, &len) < 0))
 			return -1;
+		len = RTE_MIN(len, size);
+		update_shadow_used_ring(vq, head_idx, len);
+		size -= len;
 
 		cur_idx++;
 		tries++;
-
-		if (allocated >= size)
-			break;
+		*num_buffers += 1;
 
 		/*
 		 * if we tried all available ring items, and still
@@ -370,25 +425,19 @@ reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
 			return -1;
 	}
 
-	*num_buffers = cur_idx - vq->last_avail_idx;
 	return 0;
 }
 
 static inline int __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			    struct rte_mbuf *m, struct buf_vector *buf_vec,
-			    uint16_t num_buffers)
+copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
+			    struct buf_vector *buf_vec, uint16_t num_buffers)
 {
 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
 	uint32_t vec_idx = 0;
-	uint16_t cur_idx = vq->last_used_idx;
 	uint64_t desc_addr;
-	uint32_t desc_chain_head;
-	uint32_t desc_chain_len;
 	uint32_t mbuf_offset, mbuf_avail;
 	uint32_t desc_offset, desc_avail;
 	uint32_t cpy_len;
-	uint16_t desc_idx, used_idx;
 	uint64_t hdr_addr, hdr_phys_addr;
 	struct rte_mbuf *hdr_mbuf;
 
@@ -409,34 +458,17 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 	virtio_hdr.num_buffers = num_buffers;
 	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-		dev->vid, virtio_hdr.num_buffers);
+		dev->vid, num_buffers);
 
 	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
 	desc_offset = dev->vhost_hlen;
-	desc_chain_head = buf_vec[vec_idx].desc_idx;
-	desc_chain_len = desc_offset;
 
 	mbuf_avail  = rte_pktmbuf_data_len(m);
 	mbuf_offset = 0;
 	while (mbuf_avail != 0 || m->next != NULL) {
 		/* done with current desc buf, get the next one */
 		if (desc_avail == 0) {
-			desc_idx = buf_vec[vec_idx].desc_idx;
 			vec_idx++;
-
-			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
-				/* Update used ring with desc information */
-				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id = desc_chain_head;
-				vq->used->ring[used_idx].len = desc_chain_len;
-				vhost_log_used_vring(dev, vq,
-					offsetof(struct vring_used,
-						 ring[used_idx]),
-					sizeof(vq->used->ring[used_idx]));
-				desc_chain_head = buf_vec[vec_idx].desc_idx;
-				desc_chain_len = 0;
-			}
-
 			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
 			if (unlikely(!desc_addr))
 				return -1;
@@ -478,16 +510,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		mbuf_offset += cpy_len;
 		desc_avail  -= cpy_len;
 		desc_offset += cpy_len;
-		desc_chain_len += cpy_len;
 	}
 
-	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = desc_chain_head;
-	vq->used->ring[used_idx].len = desc_chain_len;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, ring[used_idx]),
-		sizeof(vq->used->ring[used_idx]));
-
 	return 0;
 }
 
@@ -515,6 +539,7 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		return 0;
 
+	vq->shadow_used_idx = 0;
 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
 
@@ -523,23 +548,22 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 			LOG_DEBUG(VHOST_DATA,
 				"(%d) failed to get enough desc from vring\n",
 				dev->vid);
+			vq->shadow_used_idx -= num_buffers;
 			break;
 		}
 
-		if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
-						buf_vec, num_buffers) < 0)
+		if (copy_mbuf_to_desc_mergeable(dev, pkts[pkt_idx],
+						buf_vec, num_buffers) < 0) {
+			vq->shadow_used_idx -= num_buffers;
 			break;
+		}
 
-		rte_smp_wmb();
-
-		*(volatile uint16_t *)&vq->used->idx += num_buffers;
-		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-			sizeof(vq->used->idx));
-		vq->last_used_idx += num_buffers;
 		vq->last_avail_idx += num_buffers;
 	}
 
-	if (likely(pkt_idx)) {
+	if (likely(vq->shadow_used_idx)) {
+		flush_shadow_used_ring(dev, vq);
+
 		/* flush used->idx update before we read avail->flags. */
 		rte_mb();
 
-- 
1.9.0

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v7 6/7] vhost: prefetch avail ring
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
                       ` (4 preceding siblings ...)
  2016-10-14  9:34     ` [PATCH v7 5/7] vhost: shadow used ring update Yuanhan Liu
@ 2016-10-14  9:34     ` Yuanhan Liu
  2016-10-14  9:34     ` [PATCH v7 7/7] vhost: retrieve avail head once Yuanhan Liu
                       ` (3 subsequent siblings)
  9 siblings, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-14  9:34 UTC (permalink / raw)
  To: dev; +Cc: Maxime Coquelin, Yuanhan Liu, Zhihong Wang

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 2bdc2fe..12a037b 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -539,6 +539,8 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	if (count == 0)
 		return 0;
 
+	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
+
 	vq->shadow_used_idx = 0;
 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-- 
1.9.0

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* [PATCH v7 7/7] vhost: retrieve avail head once
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
                       ` (5 preceding siblings ...)
  2016-10-14  9:34     ` [PATCH v7 6/7] vhost: prefetch avail ring Yuanhan Liu
@ 2016-10-14  9:34     ` Yuanhan Liu
  2016-10-18  2:25     ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Jianbo Liu
                       ` (2 subsequent siblings)
  9 siblings, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-14  9:34 UTC (permalink / raw)
  To: dev; +Cc: Maxime Coquelin, Yuanhan Liu, Zhihong Wang

There is no need to retrieve the latest avail head every time we enqueue
a packet in the mergeable Rx path by

    avail_idx = *((volatile uint16_t *)&vq->avail->idx);

Instead, we could just retrieve it once at the beginning of the enqueue
path. This could diminish the cache penalty slightly, because the virtio
driver could be updating it while vhost is reading it (for each packet).
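
In other words, the volatile read is hoisted out of the per-packet loop
and the cached value is passed down instead; a sketch mirroring the diff
below:

	avail_head = *((volatile uint16_t *)&vq->avail->idx);	/* read once */

	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;

		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len, buf_vec,
					&num_buffers, avail_head) < 0))
			break;
		/* ... copy the packet and update the shadow used ring ... */
	}

reserve_avail_buf_mergeable() then compares cur_idx against the cached
avail_head rather than re-reading vq->avail->idx for every packet.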

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/virtio_net.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 12a037b..b784dba 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -387,10 +387,10 @@ fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
  */
 static inline int
 reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-			    struct buf_vector *buf_vec, uint16_t *num_buffers)
+			    struct buf_vector *buf_vec, uint16_t *num_buffers,
+			    uint16_t avail_head)
 {
 	uint16_t cur_idx;
-	uint16_t avail_idx;
 	uint32_t vec_idx = 0;
 	uint16_t tries = 0;
 
@@ -401,8 +401,7 @@ reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
 	cur_idx  = vq->last_avail_idx;
 
 	while (size > 0) {
-		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-		if (unlikely(cur_idx == avail_idx))
+		if (unlikely(cur_idx == avail_head))
 			return -1;
 
 		if (unlikely(fill_vec_buf(vq, cur_idx, &vec_idx, buf_vec,
@@ -523,6 +522,7 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	uint32_t pkt_idx = 0;
 	uint16_t num_buffers;
 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
+	uint16_t avail_head;
 
 	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
@@ -542,11 +542,12 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
 
 	vq->shadow_used_idx = 0;
+	avail_head = *((volatile uint16_t *)&vq->avail->idx);
 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
 
 		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len, buf_vec,
-							 &num_buffers) < 0)) {
+					&num_buffers, avail_head) < 0)) {
 			LOG_DEBUG(VHOST_DATA,
 				"(%d) failed to get enough desc from vring\n",
 				dev->vid);
-- 
1.9.0

^ permalink raw reply related	[flat|nested] 141+ messages in thread

* Re: [PATCH v3 0/5] vhost: optimize enqueue
  2016-10-13  9:23                         ` Maxime Coquelin
@ 2016-10-14 10:11                           ` Yuanhan Liu
  0 siblings, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-14 10:11 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: Wang, Zhihong, Jianbo Liu, Thomas Monjalon, dev

On Thu, Oct 13, 2016 at 11:23:44AM +0200, Maxime Coquelin wrote:
> I was going to re-run some PVP benchmark with 0% pkt loss, as I had
> some strange results last week.
> 
> Problem is that your series no more apply cleanly due to
> next-virtio's master branch history rewrite.
> Any chance you send me a rebased version so that I can apply the series?

I think it's pointless to do that now: it won't be merged after all.
We have sent out the new, refactored series; please help review it if
you have time :)

BTW, apologies that I forgot to include your Reviewed-by for the
first patch. I intended to do that ...

	--yliu

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v7 0/7] vhost: optimize mergeable Rx path
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
                       ` (6 preceding siblings ...)
  2016-10-14  9:34     ` [PATCH v7 7/7] vhost: retrieve avail head once Yuanhan Liu
@ 2016-10-18  2:25     ` Jianbo Liu
  2016-10-18 14:53     ` Maxime Coquelin
  2016-10-21  7:51     ` Yuanhan Liu
  9 siblings, 0 replies; 141+ messages in thread
From: Jianbo Liu @ 2016-10-18  2:25 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: dev, Maxime Coquelin

On 14 October 2016 at 17:34, Yuanhan Liu <yuanhan.liu@linux.intel.com> wrote:
> This is a new set of patches to optimize the mergeable Rx code path.
> No refactoring (rewrite) was made this time. It just applies some
> findings from Zhihong (kudos to him!) that could improve the mergeable
> Rx path on the old code.
......

> ---
> Yuanhan Liu (4):
>   vhost: simplify mergeable Rx vring reservation
>   vhost: use last avail idx for avail ring reservation
>   vhost: prefetch avail ring
>   vhost: retrieve avail head once
>
> Zhihong Wang (3):
>   vhost: remove useless volatile
>   vhost: optimize cache access
>   vhost: shadow used ring update
>
>  lib/librte_vhost/vhost.c      |  13 ++-
>  lib/librte_vhost/vhost.h      |   5 +-
>  lib/librte_vhost/vhost_user.c |  23 +++--
>  lib/librte_vhost/virtio_net.c | 193 +++++++++++++++++++++++++-----------------
>  4 files changed, 149 insertions(+), 85 deletions(-)
>

Reviewed-by: Jianbo Liu <jianbo.liu@linaro.org>

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v7 0/7] vhost: optimize mergeable Rx path
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
                       ` (7 preceding siblings ...)
  2016-10-18  2:25     ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Jianbo Liu
@ 2016-10-18 14:53     ` Maxime Coquelin
  2016-10-21  7:51     ` Yuanhan Liu
  9 siblings, 0 replies; 141+ messages in thread
From: Maxime Coquelin @ 2016-10-18 14:53 UTC (permalink / raw)
  To: Yuanhan Liu, dev; +Cc: Jianbo Liu

Hi Yuanhan,

On 10/14/2016 11:34 AM, Yuanhan Liu wrote:
> This is a new set of patches to optimize the mergeable Rx code path.
> No refactoring (rewrite) was made this time. It just applies some
> findings from Zhihong (kudos to him!) that could improve the mergeable
> Rx path on the old code.
>
> The two major factors that could improve the performance greatly are:
>
> - copy virtio header together with packet data. This could remove
>   the buubbles between the two copy to optimize the cache access.
>
>   This is implemented in patch 2 "vhost: optimize cache access"
>
> - shadow used ring update and update them at once
>
>   The basic idea is to update used ring in a local buffer and flush
>   them to the virtio used ring at once in the end. Again, this is
>   for optimizing the cache access.
>
>   This is implemented in patch 5 "vhost: shadow used ring update"
>
> The two optimizations could yield 40+% performance in micro testing
> and 20+% in PVP case testing with 64B packet size.
>
> Besides that, there are some tiny optimizations, such as prefetch
> avail ring (patch 6) and retrieve avail head once (patch 7).
>
> Note: the shadow used ring tech could also be applied to the non-mrg
> Rx path (and even the dequeu) path. I didn't do that for two reasons:
>
> - we already update used ring in batch in both path: it's not shadowed
>   first though.
>
> - it's a bit too late too make many changes at this stage: RC1 is out.
>
> Please help testing.

I tested the following use-cases without noticing any functional problems:
  - Windows Guests (mergeable ON & OFF, indirect disabled): ping other VM
  - Linux guests with Kernel driver (mergeable ON & OFF, indirect OFF):
    iperf between 2 VMs
  - Linux guest with Virtio PMD (mergeable ON & OFF): testpmd txonly on
    host, rxonly on guest.

Feel free to add my:
Tested-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v7 0/7] vhost: optimize mergeable Rx path
  2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
                       ` (8 preceding siblings ...)
  2016-10-18 14:53     ` Maxime Coquelin
@ 2016-10-21  7:51     ` Yuanhan Liu
  9 siblings, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-21  7:51 UTC (permalink / raw)
  To: dev; +Cc: Maxime Coquelin, Jianbo Liu

Applied to dpdk-next-virtio.

And thanks for testing and reviewing.

	--yliu

On Fri, Oct 14, 2016 at 05:34:31PM +0800, Yuanhan Liu wrote:
> This is a new set of patches to optimize the mergeable Rx code path.
> No refactoring (rewrite) was made this time. It just applies some
> findings from Zhihong (kudos to him!) that could improve the mergeable
> Rx path on the old code.
> 
> The two major factors that could improve the performance greatly are:
> 
> - copy virtio header together with packet data. This could remove
>   the buubbles between the two copy to optimize the cache access.
> 
>   This is implemented in patch 2 "vhost: optimize cache access"
> 
> - shadow used ring update and update them at once
> 
>   The basic idea is to update used ring in a local buffer and flush
>   them to the virtio used ring at once in the end. Again, this is
>   for optimizing the cache access.
> 
>   This is implemented in patch 5 "vhost: shadow used ring update"
> 
> The two optimizations could yield 40+% performance in micro testing
> and 20+% in PVP case testing with 64B packet size.
> 
> Besides that, there are some tiny optimizations, such as prefetch
> avail ring (patch 6) and retrieve avail head once (patch 7).
> 
> Note: the shadow used ring tech could also be applied to the non-mrg
> Rx path (and even the dequeu) path. I didn't do that for two reasons:
> 
> - we already update used ring in batch in both path: it's not shadowed
>   first though.
> 
> - it's a bit too late too make many changes at this stage: RC1 is out. 
> 
> Please help testing.
> 
> Thanks.
> 
> 	--yliu
> 
> Cc: Jianbo Liu <jianbo.liu@linaro.org>
> ---
> Yuanhan Liu (4):
>   vhost: simplify mergeable Rx vring reservation
>   vhost: use last avail idx for avail ring reservation
>   vhost: prefetch avail ring
>   vhost: retrieve avail head once
> 
> Zhihong Wang (3):
>   vhost: remove useless volatile
>   vhost: optimize cache access
>   vhost: shadow used ring update
> 
>  lib/librte_vhost/vhost.c      |  13 ++-
>  lib/librte_vhost/vhost.h      |   5 +-
>  lib/librte_vhost/vhost_user.c |  23 +++--
>  lib/librte_vhost/virtio_net.c | 193 +++++++++++++++++++++++++-----------------
>  4 files changed, 149 insertions(+), 85 deletions(-)
> 
> -- 
> 1.9.0

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v7 3/7] vhost: simplify mergeable Rx vring reservation
  2016-10-14  9:34     ` [PATCH v7 3/7] vhost: simplify mergeable Rx vring reservation Yuanhan Liu
@ 2016-10-25 22:08       ` Thomas Monjalon
  2016-10-26  2:56         ` Yuanhan Liu
  0 siblings, 1 reply; 141+ messages in thread
From: Thomas Monjalon @ 2016-10-25 22:08 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: dev

2016-10-14 17:34, Yuanhan Liu:
> -static inline uint32_t __attribute__((always_inline))
> +static inline int __attribute__((always_inline))
>  copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -			    uint16_t end_idx, struct rte_mbuf *m,
> -			    struct buf_vector *buf_vec)
> +			    struct rte_mbuf *m, struct buf_vector *buf_vec,
> +			    uint16_t num_buffers)
>  {
>  	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
>  	uint32_t vec_idx = 0;
> -	uint16_t start_idx = vq->last_used_idx;
> -	uint16_t cur_idx = start_idx;
> +	uint16_t cur_idx = vq->last_used_idx;
>  	uint64_t desc_addr;
>  	uint32_t desc_chain_head;
>  	uint32_t desc_chain_len;
> @@ -394,21 +393,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	struct rte_mbuf *hdr_mbuf;
>  
>  	if (unlikely(m == NULL))
> -		return 0;
> +		return -1;
>  
>  	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
>  		dev->vid, cur_idx, end_idx);

There is a build error:
	lib/librte_vhost/virtio_net.c:399:22: error: ‘end_idx’ undeclared

It is probably trivial and could be fixed directly in the already applied
commit in next-virtio.

^ permalink raw reply	[flat|nested] 141+ messages in thread

* Re: [PATCH v7 3/7] vhost: simplify mergeable Rx vring reservation
  2016-10-25 22:08       ` Thomas Monjalon
@ 2016-10-26  2:56         ` Yuanhan Liu
  0 siblings, 0 replies; 141+ messages in thread
From: Yuanhan Liu @ 2016-10-26  2:56 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

On Wed, Oct 26, 2016 at 12:08:49AM +0200, Thomas Monjalon wrote:
> 2016-10-14 17:34, Yuanhan Liu:
> > -static inline uint32_t __attribute__((always_inline))
> > +static inline int __attribute__((always_inline))
> >  copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > -			    uint16_t end_idx, struct rte_mbuf *m,
> > -			    struct buf_vector *buf_vec)
> > +			    struct rte_mbuf *m, struct buf_vector *buf_vec,
> > +			    uint16_t num_buffers)
> >  {
> >  	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
> >  	uint32_t vec_idx = 0;
> > -	uint16_t start_idx = vq->last_used_idx;
> > -	uint16_t cur_idx = start_idx;
> > +	uint16_t cur_idx = vq->last_used_idx;
> >  	uint64_t desc_addr;
> >  	uint32_t desc_chain_head;
> >  	uint32_t desc_chain_len;
> > @@ -394,21 +393,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
> >  	struct rte_mbuf *hdr_mbuf;
> >  
> >  	if (unlikely(m == NULL))
> > -		return 0;
> > +		return -1;
> >  
> >  	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
> >  		dev->vid, cur_idx, end_idx);
> 
> There is a build error:
> 	lib/librte_vhost/virtio_net.c:399:22: error: ‘end_idx’ undeclared

Oops...  you know, my robot has been broken since the holiday :(
I just made a quick fix. Hopefully, it will start working again...

> It is probably trivial and could be fixed directly in the already applied
> commit in next-virtio.

Yes, and FYI, here is the overall diff I made to fix this bug.

	--yliu

    ---
    diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
    index b784dba..eed0b1c 100644
    --- a/lib/librte_vhost/virtio_net.c
    +++ b/lib/librte_vhost/virtio_net.c
    @@ -443,9 +443,6 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
            if (unlikely(m == NULL))
                    return -1;
    
    -       LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
    -               dev->vid, cur_idx, end_idx);
    -
            desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
            if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
                    return -1;
    @@ -555,6 +552,10 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
                            break;
                    }
    
    +               LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
    +                       dev->vid, vq->last_avail_idx,
    +                       vq->last_avail_idx + num_buffers);
    +
                    if (copy_mbuf_to_desc_mergeable(dev, pkts[pkt_idx],
                                                    buf_vec, num_buffers) < 0) {
                            vq->shadow_used_idx -= num_buffers;

^ permalink raw reply	[flat|nested] 141+ messages in thread

end of thread, other threads:[~2016-10-26  2:55 UTC | newest]

Thread overview: 141+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-16  3:50 [PATCH] optimize vhost enqueue Zhihong Wang
2016-08-16 13:59 ` Maxime Coquelin
2016-08-17  1:45   ` Wang, Zhihong
2016-08-17  2:38     ` Yuanhan Liu
2016-08-17  6:41       ` Wang, Zhihong
2016-08-17  9:17         ` Maxime Coquelin
2016-08-17  9:51           ` Yuanhan Liu
2016-08-18 13:44             ` Wang, Zhihong
2016-08-17 10:07           ` Wang, Zhihong
2016-08-18  6:33 ` [PATCH v2 0/6] vhost: optimize enqueue Zhihong Wang
2016-08-18  6:33   ` [PATCH v2 1/6] vhost: rewrite enqueue Zhihong Wang
2016-08-19  2:39     ` Yuanhan Liu
2016-08-19  7:07       ` Wang, Zhihong
2016-08-18  6:33   ` [PATCH v2 2/6] vhost: remove obsolete Zhihong Wang
2016-08-19  2:32     ` Yuanhan Liu
2016-08-19  7:08       ` Wang, Zhihong
2016-08-18  6:33   ` [PATCH v2 3/6] vhost: remove useless volatile Zhihong Wang
2016-08-18  6:33   ` [PATCH v2 4/6] vhost: add desc prefetch Zhihong Wang
2016-08-18  6:33   ` [PATCH v2 5/6] vhost: batch update used ring Zhihong Wang
2016-08-18  6:33   ` [PATCH v2 6/6] vhost: optimize cache access Zhihong Wang
2016-08-19  5:43 ` [PATCH v3 0/5] vhost: optimize enqueue Zhihong Wang
2016-08-19  5:43   ` [PATCH v3 1/5] vhost: rewrite enqueue Zhihong Wang
2016-08-22  9:35     ` Maxime Coquelin
2016-08-23  2:27       ` Wang, Zhihong
2016-08-25  4:00       ` Yuanhan Liu
2016-08-19  5:43   ` [PATCH v3 2/5] vhost: remove useless volatile Zhihong Wang
2016-08-19  5:43   ` [PATCH v3 3/5] vhost: add desc prefetch Zhihong Wang
2016-08-19  5:43   ` [PATCH v3 4/5] vhost: batch update used ring Zhihong Wang
2016-08-25  3:48     ` Yuanhan Liu
2016-08-25  5:19       ` Wang, Zhihong
2016-08-19  5:43   ` [PATCH v3 5/5] vhost: optimize cache access Zhihong Wang
2016-08-22  8:11   ` [PATCH v3 0/5] vhost: optimize enqueue Maxime Coquelin
2016-08-22 10:01     ` Maxime Coquelin
2016-08-22 10:35       ` Thomas Monjalon
2016-08-24  3:37         ` Wang, Zhihong
2016-08-23  2:31       ` Wang, Zhihong
2016-08-23 10:43         ` Wang, Zhihong
2016-08-23 12:16           ` Maxime Coquelin
2016-08-23 12:22           ` Yuanhan Liu
2016-08-23  2:15     ` Wang, Zhihong
2016-09-21  8:50     ` Jianbo Liu
2016-09-21  9:27       ` Wang, Zhihong
2016-09-21 12:54         ` Jianbo Liu
2016-09-22  2:11           ` Wang, Zhihong
2016-09-22  2:29           ` Yuanhan Liu
2016-09-22  5:47             ` Jianbo Liu
2016-09-22  6:58               ` Wang, Zhihong
2016-09-22  9:01                 ` Jianbo Liu
2016-09-22 10:04                   ` Wang, Zhihong
2016-09-22 14:41                     ` Jianbo Liu
2016-09-23  2:56                       ` Wang, Zhihong
2016-09-23 10:41                         ` Jianbo Liu
2016-09-23 13:41                           ` Thomas Monjalon
2016-09-25  5:41                             ` Wang, Zhihong
2016-09-26  5:12                               ` Jianbo Liu
2016-09-26  5:25                                 ` Wang, Zhihong
2016-09-26  5:38                                   ` Jianbo Liu
2016-09-26  6:00                                     ` Wang, Zhihong
2016-09-26  4:24                             ` Jianbo Liu
2016-09-26  5:37                   ` Luke Gorrie
2016-09-26  5:40                     ` Jianbo Liu
2016-09-27 10:21                   ` Yuanhan Liu
2016-09-27 16:45                     ` Wang, Zhihong
2016-10-09 12:09                       ` Wang, Zhihong
2016-10-10  2:44                         ` Yuanhan Liu
2016-10-10  5:31                           ` Jianbo Liu
2016-10-10  6:22                             ` Wang, Zhihong
2016-10-10  6:57                               ` Jianbo Liu
2016-10-10  7:25                                 ` Wang, Zhihong
2016-10-12  2:53               ` Yuanhan Liu
2016-10-12 12:22                 ` Wang, Zhihong
2016-10-12 15:31                   ` Thomas Monjalon
2016-10-13  1:21                     ` Wang, Zhihong
2016-10-13  3:51                     ` Jianbo Liu
2016-10-13  5:33                   ` Yuanhan Liu
2016-10-13  5:35                     ` Yuanhan Liu
2016-10-13  6:02                     ` Wang, Zhihong
2016-10-13  7:54                       ` Maxime Coquelin
2016-10-13  9:23                         ` Maxime Coquelin
2016-10-14 10:11                           ` Yuanhan Liu
2016-08-30  3:35 ` [PATCH v4 0/6] " Zhihong Wang
2016-08-30  3:35   ` [PATCH v4 1/6] vhost: fix windows vm hang Zhihong Wang
2016-09-05  5:24     ` [dpdk-stable] " Yuanhan Liu
2016-09-05  5:25       ` Wang, Zhihong
2016-09-05  5:40         ` Yuanhan Liu
2016-08-30  3:36   ` [PATCH v4 2/6] vhost: rewrite enqueue Zhihong Wang
2016-09-05  6:39     ` Yuanhan Liu
2016-09-07  5:33       ` Yuanhan Liu
2016-09-07  5:39         ` Wang, Zhihong
2016-08-30  3:36   ` [PATCH v4 3/6] vhost: remove useless volatile Zhihong Wang
2016-08-30  3:36   ` [PATCH v4 4/6] vhost: add desc prefetch Zhihong Wang
2016-08-30  3:36   ` [PATCH v4 5/6] vhost: batch update used ring Zhihong Wang
2016-08-30  3:36   ` [PATCH v4 6/6] vhost: optimize cache access Zhihong Wang
2016-09-09  3:39 ` [PATCH v5 0/6] vhost: optimize enqueue Zhihong Wang
2016-09-09  3:39   ` [PATCH v5 1/6] vhost: fix windows vm hang Zhihong Wang
2016-09-09  3:39   ` [PATCH v5 2/6] vhost: rewrite enqueue Zhihong Wang
2016-09-12 15:42     ` Maxime Coquelin
2016-09-14  8:20       ` Wang, Zhihong
2016-09-15 16:35         ` Maxime Coquelin
2016-09-12 16:26     ` Maxime Coquelin
2016-09-14  8:22       ` Wang, Zhihong
2016-09-18 14:19     ` Yuanhan Liu
2016-09-19  3:29       ` Wang, Zhihong
2016-09-09  3:39   ` [PATCH v5 3/6] vhost: remove useless volatile Zhihong Wang
2016-09-09  3:39   ` [PATCH v5 4/6] vhost: add desc prefetch Zhihong Wang
2016-09-09  3:39   ` [PATCH v5 5/6] vhost: batch update used ring Zhihong Wang
2016-09-12 15:45     ` Maxime Coquelin
2016-09-14  8:43       ` Wang, Zhihong
2016-09-15 16:38         ` Maxime Coquelin
2016-09-18  2:55           ` Yuanhan Liu
2016-09-18  2:57             ` Wang, Zhihong
2016-09-09  3:39   ` [PATCH v5 6/6] vhost: optimize cache access Zhihong Wang
2016-09-12 13:52   ` [PATCH v5 0/6] vhost: optimize enqueue Maxime Coquelin
2016-09-12 13:56     ` Maxime Coquelin
2016-09-12 14:01     ` Yuanhan Liu
2016-09-20  2:00 ` [PATCH v6 " Zhihong Wang
2016-09-20  2:00   ` [PATCH v6 1/6] vhost: fix windows vm hang Zhihong Wang
2016-10-13  6:18     ` [dpdk-stable] " Yuanhan Liu
2016-09-20  2:00   ` [PATCH v6 2/6] vhost: rewrite enqueue Zhihong Wang
2016-09-22  9:58     ` Jianbo Liu
2016-09-22 10:13       ` Wang, Zhihong
2016-09-20  2:00   ` [PATCH v6 3/6] vhost: remove useless volatile Zhihong Wang
2016-09-20  2:00   ` [PATCH v6 4/6] vhost: add desc prefetch Zhihong Wang
2016-09-20  2:00   ` [PATCH v6 5/6] vhost: batch update used ring Zhihong Wang
2016-09-20  2:00   ` [PATCH v6 6/6] vhost: optimize cache access Zhihong Wang
2016-09-21  4:32     ` Maxime Coquelin
2016-09-21  2:26   ` [PATCH v6 0/6] vhost: optimize enqueue Yuanhan Liu
2016-09-21  4:39     ` Maxime Coquelin
2016-10-14  9:34   ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Yuanhan Liu
2016-10-14  9:34     ` [PATCH v7 1/7] vhost: remove useless volatile Yuanhan Liu
2016-10-14  9:34     ` [PATCH v7 2/7] vhost: optimize cache access Yuanhan Liu
2016-10-14  9:34     ` [PATCH v7 3/7] vhost: simplify mergeable Rx vring reservation Yuanhan Liu
2016-10-25 22:08       ` Thomas Monjalon
2016-10-26  2:56         ` Yuanhan Liu
2016-10-14  9:34     ` [PATCH v7 4/7] vhost: use last avail idx for avail ring reservation Yuanhan Liu
2016-10-14  9:34     ` [PATCH v7 5/7] vhost: shadow used ring update Yuanhan Liu
2016-10-14  9:34     ` [PATCH v7 6/7] vhost: prefetch avail ring Yuanhan Liu
2016-10-14  9:34     ` [PATCH v7 7/7] vhost: retrieve avail head once Yuanhan Liu
2016-10-18  2:25     ` [PATCH v7 0/7] vhost: optimize mergeable Rx path Jianbo Liu
2016-10-18 14:53     ` Maxime Coquelin
2016-10-21  7:51     ` Yuanhan Liu
