From mboxrd@z Thu Jan  1 00:00:00 1970
From: Zhihong Wang <zhihong.wang@intel.com>
Subject: [PATCH] optimize vhost enqueue
Date: Mon, 15 Aug 2016 23:50:02 -0400
Message-ID: <1471319402-112998-1-git-send-email-zhihong.wang@intel.com>
Cc: Zhihong Wang <zhihong.wang@intel.com>
To: dev@dpdk.org
Return-path: <dev-bounces@dpdk.org>
Received: from mga04.intel.com (mga04.intel.com [192.55.52.120])
 by dpdk.org (Postfix) with ESMTP id 417E36932
 for <dev@dpdk.org>; Tue, 16 Aug 2016 12:58:00 +0200 (CEST)
List-Id: patches and discussions about DPDK <dev.dpdk.org>
List-Unsubscribe: <http://dpdk.org/ml/options/dev>,
 <mailto:dev-request@dpdk.org?subject=unsubscribe>
List-Archive: <http://dpdk.org/ml/archives/dev/>
List-Post: <mailto:dev@dpdk.org>
List-Help: <mailto:dev-request@dpdk.org?subject=help>
List-Subscribe: <http://dpdk.org/ml/listinfo/dev>,
 <mailto:dev-request@dpdk.org?subject=subscribe>
Errors-To: dev-bounces@dpdk.org
Sender: "dev" <dev-bounces@dpdk.org>

This patch optimizes the vhost enqueue function: rte_vhost_enqueue_burst.

Currently there are two callbacks for vhost enqueue:
 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
 *  virtio_dev_rx for mrg_rxbuf turned off cases.

virtio_dev_merge_rx doesn't provide optimal performance, and it has also
been reported to have compatibility issues with Windows VMs.

Besides, having two separate functions increases the maintenance effort.

This patch replaces the current two functions with a single one for
better maintainability, and improves performance by optimizing caching
behavior, especially when mrg_rxbuf is turned on.

It also fixes the compatibility issue with Windows VMs.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
 lib/librte_vhost/vhost-net.h  |   6 +-
 lib/librte_vhost/vhost_rxtx.c | 582 ++++++++++++++----------------------------
 lib/librte_vhost/virtio-net.c |  15 +-
 3 files changed, 208 insertions(+), 395 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
 	uint32_t		size;
 
 	/* Last index used on the available ring */
-	volatile uint16_t	last_used_idx;
+	uint16_t		last_used_idx;
 #define VIRTIO_INVALID_EVENTFD		(-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD	(-2)
 
@@ -85,6 +85,10 @@ struct vhost_virtqueue {
 
 	/* Physical address of used ring, for logging */
 	uint64_t		log_guest_addr;
+
+	/* Shadow used ring for performance */
+	struct vring_used_elem	*shadow_used_ring;
+	uint32_t		shadow_used_idx;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..1263168 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
 	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }
 
-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
 	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -125,427 +125,227 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 	}
 }
 
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-		    struct virtio_net_hdr_mrg_rxbuf hdr)
-{
-	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-	else
-		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
-}
-
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint16_t count)
 {
-	uint32_t desc_avail, desc_offset;
-	uint32_t mbuf_avail, mbuf_offset;
-	uint32_t cpy_len;
+	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
+	struct vhost_virtqueue *vq;
 	struct vring_desc *desc;
-	uint64_t desc_addr;
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
-	desc = &vq->desc[desc_idx];
-	desc_addr = gpa_to_vva(dev, desc->addr);
-	/*
-	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-	 * otherwise stores offset on the stack instead of in a register.
-	 */
-	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-		return -1;
-
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-	desc_offset = dev->vhost_hlen;
-	desc_avail  = desc->len - dev->vhost_hlen;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current mbuf, fetch next */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
-		}
-
-		/* done with current desc buf, fetch next */
-		if (desc_avail == 0) {
-			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-				/* Room in vring buffer is not enough */
-				return -1;
-			}
-			if (unlikely(desc->next >= vq->size))
-				return -1;
-
-			desc = &vq->desc[desc->next];
-			desc_addr = gpa_to_vva(dev, desc->addr);
-			if (unlikely(!desc_addr))
-				return -1;
-
-			desc_offset = 0;
-			desc_avail  = desc->len;
-		}
-
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			     cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-	}
-
-	return 0;
-}
+	struct virtio_net *dev;
+	struct rte_mbuf *mbuf;
+	uint64_t desc_host_write_addr = 0;
+	uint32_t desc_chain_head = 0;
+	uint32_t desc_chain_len = 0;
+	uint32_t desc_current = 0;
+	uint32_t desc_write_offset = 0;
+	uint32_t used_idx_static = 0;
+	uint32_t pkt_idx = 0;
+	uint32_t pkt_left = 0;
+	uint32_t pkt_sent = 0;
+	uint32_t mbuf_len = 0;
+	uint32_t mbuf_len_left = 0;
+	uint32_t copy_len = 0;
+	uint32_t copy_virtio_hdr = 0;
+	uint32_t is_mrg_rxbuf = 0;
+	uint32_t is_virtio_1 = 0;
+
+	if (unlikely(count == 0))
+		return 0;
 
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
-	      struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint16_t avail_idx, free_entries, start_idx;
-	uint16_t desc_indexes[MAX_PKT_BURST];
-	uint16_t used_idx;
-	uint32_t i;
+	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
 
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
+	dev = get_device(vid);
+	if (unlikely(!dev))
 		return 0;
-	}
 
-	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
 		return 0;
 
-	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-	start_idx = vq->last_used_idx;
-	free_entries = avail_idx - start_idx;
-	count = RTE_MIN(count, free_entries);
-	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
-	if (count == 0)
+	vq = dev->virtqueue[queue_id];
+	if (unlikely(!vq->enabled))
 		return 0;
 
-	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
-		dev->vid, start_idx, start_idx + count);
+	if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+		is_mrg_rxbuf = 1;
+
+	if (dev->features & (1ULL << VIRTIO_F_VERSION_1))
+		is_virtio_1 = 1;
+
+	pkt_idx = 0;
+	pkt_left = count;
+	used_idx_static = vq->last_used_idx & (vq->size - 1);
+	vq->shadow_used_idx = 0;
+
+	while (pkt_left > 0) {
+		if (unlikely(vq->avail->idx == vq->last_used_idx))
+			goto done;
+
+		if (pkt_left > 1 && vq->avail->idx != vq->last_used_idx + 1)
+			rte_prefetch0(&vq->desc[
+					vq->avail->ring[
+					(vq->last_used_idx + 1) &
+					(vq->size - 1)]]);
+
+		mbuf = pkts[pkt_idx];
+		mbuf_len = rte_pktmbuf_data_len(mbuf);
+		mbuf_len_left = mbuf_len;
+		pkt_idx++;
+		pkt_left--;
+
+		desc_chain_head = vq->avail->ring[(vq->last_used_idx) &
+			(vq->size - 1)];
+		desc_current = desc_chain_head;
+		desc = &vq->desc[desc_current];
+		desc_host_write_addr = gpa_to_vva(dev, desc->addr);
+		if (unlikely(!desc_host_write_addr))
+			goto done;
+
+		virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)
+			(uintptr_t)desc_host_write_addr;
+		copy_virtio_hdr = 1;
+
+		vhost_log_write(dev, desc->addr, dev->vhost_hlen);
+		desc_write_offset = dev->vhost_hlen;
+		desc_chain_len = desc_write_offset;
+		desc_host_write_addr += desc_write_offset;
+
+		while (1) {
+			if (!mbuf_len_left) {
+				if (mbuf->next) {
+					mbuf = mbuf->next;
+					mbuf_len = rte_pktmbuf_data_len(mbuf);
+					mbuf_len_left = mbuf_len;
+				} else
+					break;
+			}
 
-	/* Retrieve all of the desc indexes first to avoid caching issues. */
-	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
-	for (i = 0; i < count; i++) {
-		used_idx = (start_idx + i) & (vq->size - 1);
-		desc_indexes[i] = vq->avail->ring[used_idx];
-		vq->used->ring[used_idx].id = desc_indexes[i];
-		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
-					       dev->vhost_hlen;
-		vhost_log_used_vring(dev, vq,
-			offsetof(struct vring_used, ring[used_idx]),
-			sizeof(vq->used->ring[used_idx]));
-	}
+			if (desc->len <= desc_write_offset) {
+				if (desc->flags & VRING_DESC_F_NEXT) {
+					desc_write_offset = 0;
+					desc_current = desc->next;
+					desc = &vq->desc[desc_current];
+					desc_host_write_addr =
+						gpa_to_vva(dev, desc->addr);
+					if (unlikely(!desc_host_write_addr))
+						goto rollback;
+				} else if (is_mrg_rxbuf) {
+					vq->shadow_used_ring[
+						vq->shadow_used_idx].id =
+						desc_chain_head;
+					vq->shadow_used_ring[
+						vq->shadow_used_idx].len =
+						desc_chain_len;
+					vq->shadow_used_idx++;
+					vq->last_used_idx++;
+					virtio_hdr->num_buffers++;
+					if (unlikely(vq->avail->idx ==
+							vq->last_used_idx))
+						goto rollback;
+
+					desc_chain_head = vq->avail->ring[
+						(vq->last_used_idx) &
+						(vq->size - 1)];
+					desc_current = desc_chain_head;
+					desc = &vq->desc[desc_current];
+					desc_host_write_addr =
+						gpa_to_vva(dev, desc->addr);
+					if (unlikely(!desc_host_write_addr))
+						goto rollback;
+
+					desc_chain_len = 0;
+					desc_write_offset = 0;
+				} else
+					goto rollback;
+			}
 
-	rte_prefetch0(&vq->desc[desc_indexes[0]]);
-	for (i = 0; i < count; i++) {
-		uint16_t desc_idx = desc_indexes[i];
-		int err;
+			copy_len = RTE_MIN(desc->len - desc_write_offset,
+					mbuf_len_left);
+			if (copy_virtio_hdr) {
+				copy_virtio_hdr = 0;
+				memset((void *)(uintptr_t)&(virtio_hdr->hdr),
+						0, dev->vhost_hlen);
+				virtio_enqueue_offload(mbuf,
+						&(virtio_hdr->hdr));
+				if (is_mrg_rxbuf || is_virtio_1)
+					virtio_hdr->num_buffers = 1;
+			}
 
-		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
-		if (unlikely(err)) {
-			used_idx = (start_idx + i) & (vq->size - 1);
-			vq->used->ring[used_idx].len = dev->vhost_hlen;
-			vhost_log_used_vring(dev, vq,
-				offsetof(struct vring_used, ring[used_idx]),
-				sizeof(vq->used->ring[used_idx]));
+			rte_memcpy((void *)(uintptr_t)desc_host_write_addr,
+					rte_pktmbuf_mtod_offset(mbuf, void *,
+						mbuf_len - mbuf_len_left),
+					copy_len);
+			vhost_log_write(dev, desc->addr + desc_write_offset,
+					copy_len);
+			mbuf_len_left -= copy_len;
+			desc_write_offset += copy_len;
+			desc_host_write_addr += copy_len;
+			desc_chain_len += copy_len;
 		}
 
-		if (i + 1 < count)
-			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
+		vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+		vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+		vq->shadow_used_idx++;
+		vq->last_used_idx++;
+		pkt_sent++;
 	}
 
-	rte_smp_wmb();
-
-	*(volatile uint16_t *)&vq->used->idx += count;
-	vq->last_used_idx += count;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, idx),
-		sizeof(vq->used->idx));
-
-	/* flush used->idx update before we read avail->flags. */
-	rte_mb();
-
-	/* Kick the guest if necessary. */
-	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-			&& (vq->callfd >= 0))
-		eventfd_write(vq->callfd, (eventfd_t)1);
-	return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-	     uint32_t *allocated, uint32_t *vec_idx,
-	     struct buf_vector *buf_vec)
-{
-	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
-	uint32_t vec_id = *vec_idx;
-	uint32_t len    = *allocated;
-
-	while (1) {
-		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
-			return -1;
-
-		len += vq->desc[idx].len;
-		buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
-		buf_vec[vec_id].buf_len  = vq->desc[idx].len;
-		buf_vec[vec_id].desc_idx = idx;
-		vec_id++;
-
-		if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
-			break;
-
-		idx = vq->desc[idx].next;
-	}
-
-	*allocated = len;
-	*vec_idx   = vec_id;
-
-	return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-			    uint16_t *end, struct buf_vector *buf_vec)
-{
-	uint16_t cur_idx;
-	uint16_t avail_idx;
-	uint32_t allocated = 0;
-	uint32_t vec_idx = 0;
-	uint16_t tries = 0;
-
-	cur_idx  = vq->last_used_idx;
-
-	while (1) {
-		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-		if (unlikely(cur_idx == avail_idx))
-			return -1;
-
-		if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
-					  &vec_idx, buf_vec) < 0))
-			return -1;
-
-		cur_idx++;
-		tries++;
-
-		if (allocated >= size)
-			break;
-
-		/*
-		 * if we tried all available ring items, and still
-		 * can't get enough buf, it means something abnormal
-		 * happened.
-		 */
-		if (unlikely(tries >= vq->size))
-			return -1;
-	}
-
-	*end = cur_idx;
-	return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			    uint16_t end_idx, struct rte_mbuf *m,
-			    struct buf_vector *buf_vec)
-{
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-	uint32_t vec_idx = 0;
-	uint16_t start_idx = vq->last_used_idx;
-	uint16_t cur_idx = start_idx;
-	uint64_t desc_addr;
-	uint32_t mbuf_offset, mbuf_avail;
-	uint32_t desc_offset, desc_avail;
-	uint32_t cpy_len;
-	uint16_t desc_idx, used_idx;
-
-	if (unlikely(m == NULL))
-		return 0;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
-		dev->vid, cur_idx, end_idx);
-
-	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
-		return 0;
-
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-	virtio_hdr.num_buffers = end_idx - start_idx;
-	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-		dev->vid, virtio_hdr.num_buffers);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-	desc_offset = dev->vhost_hlen;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current desc buf, get the next one */
-		if (desc_avail == 0) {
-			desc_idx = buf_vec[vec_idx].desc_idx;
-
-			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
-				/* Update used ring with desc information */
-				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id  = desc_idx;
-				vq->used->ring[used_idx].len = desc_offset;
-				vhost_log_used_vring(dev, vq,
+done:
+	if (likely(vq->shadow_used_idx > 0)) {
+		if (used_idx_static + vq->shadow_used_idx < vq->size) {
+			rte_memcpy(&vq->used->ring[used_idx_static],
+					&vq->shadow_used_ring[0],
+					vq->shadow_used_idx *
+					sizeof(struct vring_used_elem));
+			vhost_log_used_vring(dev, vq,
 					offsetof(struct vring_used,
-						 ring[used_idx]),
-					sizeof(vq->used->ring[used_idx]));
-			}
-
-			vec_idx++;
-			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-			if (unlikely(!desc_addr))
-				return 0;
-
-			/* Prefetch buffer address. */
-			rte_prefetch0((void *)(uintptr_t)desc_addr);
-			desc_offset = 0;
-			desc_avail  = buf_vec[vec_idx].buf_len;
-		}
-
-		/* done with current mbuf, get the next one */
-		if (mbuf_avail == 0) {
-			m = m->next;
+						ring[used_idx_static]),
+					vq->shadow_used_idx *
+					sizeof(struct vring_used_elem));
+		} else {
+			uint32_t part_1 = vq->size - used_idx_static;
+			uint32_t part_2 = vq->shadow_used_idx - part_1;
 
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
+			rte_memcpy(&vq->used->ring[used_idx_static],
+					&vq->shadow_used_ring[0],
+					part_1 *
+					sizeof(struct vring_used_elem));
+			vhost_log_used_vring(dev, vq,
+					offsetof(struct vring_used,
+						ring[used_idx_static]),
+					part_1 *
+					sizeof(struct vring_used_elem));
+			rte_memcpy(&vq->used->ring[0],
+					&vq->shadow_used_ring[part_1],
+					part_2 *
+					sizeof(struct vring_used_elem));
+			vhost_log_used_vring(dev, vq,
+					offsetof(struct vring_used,
+						ring[0]),
+					part_2 *
+					sizeof(struct vring_used_elem));
 		}
-
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
-			cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
 	}
 
-	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-	vq->used->ring[used_idx].len = desc_offset;
+	rte_smp_wmb();
+	vq->used->idx = vq->last_used_idx;
 	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, ring[used_idx]),
-		sizeof(vq->used->ring[used_idx]));
-
-	return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint32_t pkt_idx = 0, nr_used = 0;
-	uint16_t end;
-	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
-		return 0;
-	}
-
-	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
-		return 0;
-
-	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
-	if (count == 0)
-		return 0;
-
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
-		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
-							 &end, buf_vec) < 0)) {
-			LOG_DEBUG(VHOST_DATA,
-				"(%d) failed to get enough desc from vring\n",
-				dev->vid);
-			break;
-		}
-
-		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
-						      pkts[pkt_idx], buf_vec);
-		rte_smp_wmb();
-
-		*(volatile uint16_t *)&vq->used->idx += nr_used;
-		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+			offsetof(struct vring_used, idx),
 			sizeof(vq->used->idx));
-		vq->last_used_idx += nr_used;
-	}
-
-	if (likely(pkt_idx)) {
-		/* flush used->idx update before we read avail->flags. */
-		rte_mb();
-
-		/* Kick the guest if necessary. */
+	rte_mb();
+	if (likely(pkt_sent)) {
 		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 				&& (vq->callfd >= 0))
 			eventfd_write(vq->callfd, (eventfd_t)1);
 	}
 
-	return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint16_t count)
-{
-	struct virtio_net *dev = get_device(vid);
+	return pkt_sent;
 
-	if (!dev)
-		return 0;
+rollback:
+	if (is_mrg_rxbuf || is_virtio_1)
+		vq->last_used_idx -= virtio_hdr->num_buffers - 1;
 
-	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
-		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
-	else
-		return virtio_dev_rx(dev, queue_id, pkts, count);
+	goto done;
 }
 
 static void
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1785695..87d09fa 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -152,10 +152,14 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 free_device(struct virtio_net *dev)
 {
+	struct vhost_virtqueue *vq;
 	uint32_t i;
 
-	for (i = 0; i < dev->virt_qp_nb; i++)
-		rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+	for (i = 0; i < dev->virt_qp_nb; i++) {
+		vq = dev->virtqueue[i * VIRTIO_QNUM];
+		rte_free(vq->shadow_used_ring);
+		rte_free(vq);
+	}
 
 	rte_free(dev);
 }
@@ -418,13 +422,18 @@ int
 vhost_set_vring_num(int vid, struct vhost_vring_state *state)
 {
 	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
 
 	dev = get_device(vid);
 	if (dev == NULL)
 		return -1;
 
 	/* State->index refers to the queue index. The txq is 1, rxq is 0. */
-	dev->virtqueue[state->index]->size = state->num;
+	vq = dev->virtqueue[state->index];
+	vq->size = state->num;
+	vq->shadow_used_ring = rte_malloc("",
+			vq->size * sizeof(struct vring_used_elem),
+			RTE_CACHE_LINE_SIZE);
 
 	return 0;
 }
-- 
2.7.4