From: Joyce Kong <joyce.kong@arm.com>
To: maxime.coquelin@redhat.com, chenbo.xia@intel.com,
	jerinj@marvell.com, ruifeng.wang@arm.com,
	honnappa.nagarahalli@arm.com
Cc: dev@dpdk.org, nd@arm.com
Subject: [dpdk-dev] [PATCH v1 1/4] net/virtio: move AVX based Rx and Tx code to separate file
Date: Tue, 17 Nov 2020 18:06:32 +0800	[thread overview]
Message-ID: <20201117100635.27690-2-joyce.kong@arm.com> (raw)
In-Reply-To: <20201117100635.27690-1-joyce.kong@arm.com>

Split the AVX instruction based virtio packed ring Rx and Tx
implementation out into a separate file, so that the generic packed
ring Rx/Tx code can be shared with the NEON vector path added later
in this series.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 drivers/net/virtio/meson.build              |   4 +-
 drivers/net/virtio/virtio_rxtx.c            |   6 +-
 drivers/net/virtio/virtio_rxtx_packed.c     | 137 +++++
 drivers/net/virtio/virtio_rxtx_packed.h     | 298 ++++++++++
 drivers/net/virtio/virtio_rxtx_packed_avx.c | 626 --------------------
 drivers/net/virtio/virtio_rxtx_packed_avx.h | 239 ++++++++
 6 files changed, 678 insertions(+), 632 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed.c
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h
 delete mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.c
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.h
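
Note: besides moving code, this patch drops the #ifndef CC_AVX512_SUPPORT
guards around the stub Rx/Tx handlers in virtio_rxtx.c and marks the stubs
__rte_weak, so the strong definitions built from virtio_rxtx_packed.c
override them at link time when AVX512 support is compiled in. Below is a
minimal two-file sketch of that weak-symbol pattern (a hypothetical
example, not part of this patch; the burst_recv() name and file names are
made up for illustration, and in DPDK __rte_weak expands to a weak-symbol
attribute):

    /* generic.c: weak fallback, used only when no strong definition
     * of burst_recv() is linked into the binary
     */
    __attribute__((weak)) int burst_recv(void)
    {
    	return 0;	/* scalar stub: no packets received */
    }

    /* vec.c: strong definition, silently replaces the weak stub at
     * link time whenever this object is part of the build
     */
    int burst_recv(void)
    {
    	return 4;	/* vectorized path: a batch of four packets */
    }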

diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build
index eaed46373..01b8de6d4 100644
--- a/drivers/net/virtio/meson.build
+++ b/drivers/net/virtio/meson.build
@@ -13,12 +13,12 @@ if arch_subdir == 'x86'
 		if cc.has_argument('-mavx512f') and cc.has_argument('-mavx512vl') and cc.has_argument('-mavx512bw')
 			cflags += ['-DCC_AVX512_SUPPORT']
 			virtio_avx512_lib = static_library('virtio_avx512_lib',
-					      'virtio_rxtx_packed_avx.c',
+					      'virtio_rxtx_packed.c',
 					      dependencies: [static_rte_ethdev,
 						static_rte_kvargs, static_rte_bus_pci],
 					      include_directories: includes,
 					      c_args: [cflags, '-mavx512f', '-mavx512bw', '-mavx512vl'])
-			objs += virtio_avx512_lib.extract_objects('virtio_rxtx_packed_avx.c')
+			objs += virtio_avx512_lib.extract_objects('virtio_rxtx_packed.c')
 			if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
 				cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
 			elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 77934e8c5..622d4bf20 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -2025,8 +2025,7 @@ virtio_xmit_pkts_inorder(void *tx_queue,
 	return nb_tx;
 }
 
-#ifndef CC_AVX512_SUPPORT
-uint16_t
+__rte_weak uint16_t
 virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
 			    struct rte_mbuf **rx_pkts __rte_unused,
 			    uint16_t nb_pkts __rte_unused)
@@ -2034,11 +2033,10 @@ virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
 	return 0;
 }
 
-uint16_t
+__rte_weak uint16_t
 virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused,
 			    struct rte_mbuf **tx_pkts __rte_unused,
 			    uint16_t nb_pkts __rte_unused)
 {
 	return 0;
 }
-#endif /* ifndef CC_AVX512_SUPPORT */
diff --git a/drivers/net/virtio/virtio_rxtx_packed.c b/drivers/net/virtio/virtio_rxtx_packed.c
new file mode 100644
index 000000000..99d9a5a99
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed.c
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtio_rxtx_packed.h"
+#include "virtqueue.h"
+
+#ifdef CC_AVX512_SUPPORT
+#include "virtio_rxtx_packed_avx.h"
+#endif
+
+uint16_t
+virtio_xmit_pkts_packed_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+			uint16_t nb_pkts)
+{
+	struct virtnet_tx *txvq = tx_queue;
+	struct virtqueue *vq = txvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t nb_tx = 0;
+	uint16_t remained;
+
+	if (unlikely(hw->started == 0 && tx_pkts != hw->inject_pkts))
+		return nb_tx;
+
+	if (unlikely(nb_pkts < 1))
+		return nb_pkts;
+
+	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
+
+	if (vq->vq_free_cnt <= vq->vq_nentries - vq->vq_free_thresh)
+		virtio_xmit_cleanup_inorder_packed(vq, vq->vq_free_thresh);
+
+	remained = RTE_MIN(nb_pkts, vq->vq_free_cnt);
+
+	while (remained) {
+		if (remained >= PACKED_BATCH_SIZE) {
+			if (!virtqueue_enqueue_batch_packed_vec(txvq,
+						&tx_pkts[nb_tx])) {
+				nb_tx += PACKED_BATCH_SIZE;
+				remained -= PACKED_BATCH_SIZE;
+				continue;
+			}
+		}
+		if (!virtqueue_enqueue_single_packed_vec(txvq,
+					tx_pkts[nb_tx])) {
+			nb_tx++;
+			remained--;
+			continue;
+		}
+		break;
+	};
+
+	txvq->stats.packets += nb_tx;
+
+	if (likely(nb_tx)) {
+		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
+			virtqueue_notify(vq);
+			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
+		}
+	}
+
+	return nb_tx;
+}
+
+uint16_t
+virtio_recv_pkts_packed_vec(void *rx_queue,
+			    struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
+{
+	struct virtnet_rx *rxvq = rx_queue;
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t num, nb_rx = 0;
+	uint32_t nb_enqueued = 0;
+	uint16_t free_cnt = vq->vq_free_thresh;
+
+	if (unlikely(hw->started == 0))
+		return nb_rx;
+
+	num = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts);
+	if (likely(num > PACKED_BATCH_SIZE))
+		num = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE);
+
+	while (num) {
+		if (!virtqueue_dequeue_batch_packed_vec(rxvq,
+					&rx_pkts[nb_rx])) {
+			nb_rx += PACKED_BATCH_SIZE;
+			num -= PACKED_BATCH_SIZE;
+			continue;
+		}
+		if (!virtqueue_dequeue_single_packed_vec(rxvq,
+					&rx_pkts[nb_rx])) {
+			nb_rx++;
+			num--;
+			continue;
+		}
+		break;
+	};
+
+	PMD_RX_LOG(DEBUG, "dequeue:%d", num);
+
+	rxvq->stats.packets += nb_rx;
+
+	if (likely(vq->vq_free_cnt >= free_cnt)) {
+		struct rte_mbuf *new_pkts[free_cnt];
+		if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,
+						free_cnt) == 0)) {
+			virtio_recv_refill_packed_vec(rxvq, new_pkts,
+					free_cnt);
+			nb_enqueued += free_cnt;
+		} else {
+			struct rte_eth_dev *dev =
+				&rte_eth_devices[rxvq->port_id];
+			dev->data->rx_mbuf_alloc_failed += free_cnt;
+		}
+	}
+
+	if (likely(nb_enqueued)) {
+		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
+			virtqueue_notify(vq);
+			PMD_RX_LOG(DEBUG, "Notified");
+		}
+	}
+
+	return nb_rx;
+}
diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
new file mode 100644
index 000000000..b0b1d63ec
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed.h
@@ -0,0 +1,298 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#ifndef _VIRTIO_RXTX_PACKED_H_
+#define _VIRTIO_RXTX_PACKED_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+#define BYTE_SIZE 8
+/* flag bits offset in packed ring desc higher 64bits */
+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+
+#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
+	FLAGS_BITS_OFFSET)
+
+/* reference count offset in mbuf rearm data */
+#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
+	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
+/* segment number offset in mbuf rearm data */
+#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
+	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
+
+/* default rearm data */
+#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
+	1ULL << REFCNT_BITS_OFFSET)
+
+/* id bits offset in packed ring desc higher 64bits */
+#define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \
+	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+
+/* net hdr short size mask */
+#define NET_HDR_MASK 0x3F
+
+#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
+	sizeof(struct vring_packed_desc))
+#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
+
+#ifdef VIRTIO_GCC_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifdef VIRTIO_ICC_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifndef virtio_for_each_try_unroll
+#define virtio_for_each_try_unroll(iter, val, size) \
+	for (iter = val; iter < size; iter++)
+#endif
+
+static inline void
+virtio_update_batch_stats(struct virtnet_stats *stats,
+			  uint16_t pkt_len1,
+			  uint16_t pkt_len2,
+			  uint16_t pkt_len3,
+			  uint16_t pkt_len4)
+{
+	stats->bytes += pkt_len1;
+	stats->bytes += pkt_len2;
+	stats->bytes += pkt_len3;
+	stats->bytes += pkt_len4;
+}
+
+static inline int
+virtqueue_enqueue_single_packed_vec(struct virtnet_tx *txvq,
+				    struct rte_mbuf *txm)
+{
+	struct virtqueue *vq = txvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t hdr_size = hw->vtnet_hdr_size;
+	uint16_t slots, can_push = 0, use_indirect = 0;
+	int16_t need;
+
+	/* optimize ring usage */
+	if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
+	     vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
+	     rte_mbuf_refcnt_read(txm) == 1 && RTE_MBUF_DIRECT(txm) &&
+	     txm->nb_segs == 1 && rte_pktmbuf_headroom(txm) >= hdr_size)
+		can_push = 1;
+	else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
+		 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
+		use_indirect = 1;
+
+	/* How many main ring entries are needed for this Tx?
+	 * indirect   => 1
+	 * any_layout => number of segments
+	 * default    => number of segments + 1
+	 */
+	slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
+	need = slots - vq->vq_free_cnt;
+
+	/* Positive value indicates it needs free vring descriptors */
+	if (unlikely(need > 0)) {
+		virtio_xmit_cleanup_inorder_packed(vq, need);
+		need = slots - vq->vq_free_cnt;
+		if (unlikely(need > 0)) {
+			PMD_TX_LOG(ERR,
+				   "No free tx descriptors to transmit");
+			return -1;
+		}
+	}
+
+	/* Enqueue Packet buffers */
+	virtqueue_enqueue_xmit_packed(txvq, txm, slots, use_indirect,
+				can_push, 1);
+
+	txvq->stats.bytes += txm->pkt_len;
+	return 0;
+}
+
+/* Optionally fill offload information in structure */
+static inline int
+virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
+{
+	struct rte_net_hdr_lens hdr_lens;
+	uint32_t hdrlen, ptype;
+	int l4_supported = 0;
+
+	/* nothing to do */
+	if (hdr->flags == 0)
+		return 0;
+
+	/* GSO not supported in vec path, skip check */
+	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
+
+	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
+	m->packet_type = ptype;
+	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
+	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
+	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
+		l4_supported = 1;
+
+	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
+		if (hdr->csum_start <= hdrlen && l4_supported) {
+			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
+		} else {
+			/* Unknown proto or tunnel, do sw cksum. We can assume
+			 * the cksum field is in the first segment since the
+			 * buffers we provided to the host are large enough.
+			 * In case of SCTP, this will be wrong since it's a CRC
+			 * but there's nothing we can do.
+			 */
+			uint16_t csum = 0, off;
+
+			if (rte_raw_cksum_mbuf(m, hdr->csum_start,
+				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
+				&csum) < 0)
+				return -1;
+			if (likely(csum != 0xffff))
+				csum = ~csum;
+			off = hdr->csum_offset + hdr->csum_start;
+			if (rte_pktmbuf_data_len(m) >= off + 1)
+				*rte_pktmbuf_mtod_offset(m, uint16_t *,
+					off) = csum;
+		}
+	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
+		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
+	}
+
+	return 0;
+}
+
+static inline uint16_t
+virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq,
+				    struct rte_mbuf **rx_pkts)
+{
+	uint16_t used_idx, id;
+	uint32_t len;
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint32_t hdr_size = hw->vtnet_hdr_size;
+	struct virtio_net_hdr *hdr;
+	struct vring_packed_desc *desc;
+	struct rte_mbuf *cookie;
+
+	desc = vq->vq_packed.ring.desc;
+	used_idx = vq->vq_used_cons_idx;
+	if (!desc_is_used(&desc[used_idx], vq))
+		return -1;
+
+	len = desc[used_idx].len;
+	id = desc[used_idx].id;
+	cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
+	if (unlikely(cookie == NULL)) {
+		PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u",
+				vq->vq_used_cons_idx);
+		return -1;
+	}
+	rte_prefetch0(cookie);
+	rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
+
+	cookie->data_off = RTE_PKTMBUF_HEADROOM;
+	cookie->ol_flags = 0;
+	cookie->pkt_len = (uint32_t)(len - hdr_size);
+	cookie->data_len = (uint32_t)(len - hdr_size);
+
+	hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr +
+					RTE_PKTMBUF_HEADROOM - hdr_size);
+	if (hw->has_rx_offload)
+		virtio_vec_rx_offload(cookie, hdr);
+
+	*rx_pkts = cookie;
+
+	rxvq->stats.bytes += cookie->pkt_len;
+
+	vq->vq_free_cnt++;
+	vq->vq_used_cons_idx++;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
+
+static inline void
+virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq,
+			      struct rte_mbuf **cookie,
+			      uint16_t num)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
+	uint16_t flags = vq->vq_packed.cached_flags;
+	struct virtio_hw *hw = vq->hw;
+	struct vq_desc_extra *dxp;
+	uint16_t idx, i;
+	uint16_t batch_num, total_num = 0;
+	uint16_t head_idx = vq->vq_avail_idx;
+	uint16_t head_flag = vq->vq_packed.cached_flags;
+	uint64_t addr;
+
+	do {
+		idx = vq->vq_avail_idx;
+
+		batch_num = PACKED_BATCH_SIZE;
+		if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
+			batch_num = vq->vq_nentries - idx;
+		if (unlikely((total_num + batch_num) > num))
+			batch_num = num - total_num;
+
+		virtio_for_each_try_unroll(i, 0, batch_num) {
+			dxp = &vq->vq_descx[idx + i];
+			dxp->cookie = (void *)cookie[total_num + i];
+
+			addr = VIRTIO_MBUF_ADDR(cookie[total_num + i], vq) +
+				RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
+			start_dp[idx + i].addr = addr;
+			start_dp[idx + i].len = cookie[total_num + i]->buf_len
+				- RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
+			if (total_num || i) {
+				virtqueue_store_flags_packed(&start_dp[idx + i],
+						flags, hw->weak_barriers);
+			}
+		}
+
+		vq->vq_avail_idx += batch_num;
+		if (vq->vq_avail_idx >= vq->vq_nentries) {
+			vq->vq_avail_idx -= vq->vq_nentries;
+			vq->vq_packed.cached_flags ^=
+				VRING_PACKED_DESC_F_AVAIL_USED;
+			flags = vq->vq_packed.cached_flags;
+		}
+		total_num += batch_num;
+	} while (total_num < num);
+
+	virtqueue_store_flags_packed(&start_dp[head_idx], head_flag,
+				hw->weak_barriers);
+	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num);
+}
+
+#endif /* _VIRTIO_RXTX_PACKED_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.c b/drivers/net/virtio/virtio_rxtx_packed_avx.c
deleted file mode 100644
index 9bc62719e..000000000
--- a/drivers/net/virtio/virtio_rxtx_packed_avx.c
+++ /dev/null
@@ -1,626 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2020 Intel Corporation
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-
-#include <rte_net.h>
-
-#include "virtio_logs.h"
-#include "virtio_ethdev.h"
-#include "virtio_pci.h"
-#include "virtqueue.h"
-
-#define BYTE_SIZE 8
-/* flag bits offset in packed ring desc higher 64bits */
-#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
-	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
-
-#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
-	FLAGS_BITS_OFFSET)
-
-/* reference count offset in mbuf rearm data */
-#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
-	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
-/* segment number offset in mbuf rearm data */
-#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
-	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
-
-/* default rearm data */
-#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
-	1ULL << REFCNT_BITS_OFFSET)
-
-/* id bits offset in packed ring desc higher 64bits */
-#define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \
-	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
-
-/* net hdr short size mask */
-#define NET_HDR_MASK 0x3F
-
-#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
-	sizeof(struct vring_packed_desc))
-#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
-
-#ifdef VIRTIO_GCC_UNROLL_PRAGMA
-#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
-	for (iter = val; iter < size; iter++)
-#endif
-
-#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
-#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
-	for (iter = val; iter < size; iter++)
-#endif
-
-#ifdef VIRTIO_ICC_UNROLL_PRAGMA
-#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
-	for (iter = val; iter < size; iter++)
-#endif
-
-#ifndef virtio_for_each_try_unroll
-#define virtio_for_each_try_unroll(iter, val, num) \
-	for (iter = val; iter < num; iter++)
-#endif
-
-static inline void
-virtio_update_batch_stats(struct virtnet_stats *stats,
-			  uint16_t pkt_len1,
-			  uint16_t pkt_len2,
-			  uint16_t pkt_len3,
-			  uint16_t pkt_len4)
-{
-	stats->bytes += pkt_len1;
-	stats->bytes += pkt_len2;
-	stats->bytes += pkt_len3;
-	stats->bytes += pkt_len4;
-}
-
-static inline int
-virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq,
-				   struct rte_mbuf **tx_pkts)
-{
-	struct virtqueue *vq = txvq->vq;
-	uint16_t head_size = vq->hw->vtnet_hdr_size;
-	uint16_t idx = vq->vq_avail_idx;
-	struct virtio_net_hdr *hdr;
-	struct vq_desc_extra *dxp;
-	uint16_t i, cmp;
-
-	if (vq->vq_avail_idx & PACKED_BATCH_MASK)
-		return -1;
-
-	if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
-		return -1;
-
-	/* Load four mbufs rearm data */
-	RTE_BUILD_BUG_ON(REFCNT_BITS_OFFSET >= 64);
-	RTE_BUILD_BUG_ON(SEG_NUM_BITS_OFFSET >= 64);
-	__m256i mbufs = _mm256_set_epi64x(*tx_pkts[3]->rearm_data,
-					  *tx_pkts[2]->rearm_data,
-					  *tx_pkts[1]->rearm_data,
-					  *tx_pkts[0]->rearm_data);
-
-	/* refcnt=1 and nb_segs=1 */
-	__m256i mbuf_ref = _mm256_set1_epi64x(DEFAULT_REARM_DATA);
-	__m256i head_rooms = _mm256_set1_epi16(head_size);
-
-	/* Check refcnt and nb_segs */
-	const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12;
-	cmp = _mm256_mask_cmpneq_epu16_mask(mask, mbufs, mbuf_ref);
-	if (unlikely(cmp))
-		return -1;
-
-	/* Check headroom is enough */
-	const __mmask16 data_mask = 0x1 | 0x1 << 4 | 0x1 << 8 | 0x1 << 12;
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) !=
-		offsetof(struct rte_mbuf, rearm_data));
-	cmp = _mm256_mask_cmplt_epu16_mask(data_mask, mbufs, head_rooms);
-	if (unlikely(cmp))
-		return -1;
-
-	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-		dxp = &vq->vq_descx[idx + i];
-		dxp->ndescs = 1;
-		dxp->cookie = tx_pkts[i];
-	}
-
-	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-		tx_pkts[i]->data_off -= head_size;
-		tx_pkts[i]->data_len += head_size;
-	}
-
-	__m512i descs_base = _mm512_set_epi64(tx_pkts[3]->data_len,
-			VIRTIO_MBUF_ADDR(tx_pkts[3], vq),
-			tx_pkts[2]->data_len,
-			VIRTIO_MBUF_ADDR(tx_pkts[2], vq),
-			tx_pkts[1]->data_len,
-			VIRTIO_MBUF_ADDR(tx_pkts[1], vq),
-			tx_pkts[0]->data_len,
-			VIRTIO_MBUF_ADDR(tx_pkts[0], vq));
-
-	/* id offset and data offset */
-	__m512i data_offsets = _mm512_set_epi64((uint64_t)3 << ID_BITS_OFFSET,
-						tx_pkts[3]->data_off,
-						(uint64_t)2 << ID_BITS_OFFSET,
-						tx_pkts[2]->data_off,
-						(uint64_t)1 << ID_BITS_OFFSET,
-						tx_pkts[1]->data_off,
-						0, tx_pkts[0]->data_off);
-
-	__m512i new_descs = _mm512_add_epi64(descs_base, data_offsets);
-
-	uint64_t flags_temp = (uint64_t)idx << ID_BITS_OFFSET |
-		(uint64_t)vq->vq_packed.cached_flags << FLAGS_BITS_OFFSET;
-
-	/* flags offset and guest virtual address offset */
-	__m128i flag_offset = _mm_set_epi64x(flags_temp, 0);
-	__m512i v_offset = _mm512_broadcast_i32x4(flag_offset);
-	__m512i v_desc = _mm512_add_epi64(new_descs, v_offset);
-
-	if (!vq->hw->has_tx_offload) {
-		__m128i all_mask = _mm_set1_epi16(0xFFFF);
-		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
-					struct virtio_net_hdr *, -head_size);
-			__m128i v_hdr = _mm_loadu_si128((void *)hdr);
-			if (unlikely(_mm_mask_test_epi16_mask(NET_HDR_MASK,
-							v_hdr, all_mask))) {
-				__m128i all_zero = _mm_setzero_si128();
-				_mm_mask_storeu_epi16((void *)hdr,
-						NET_HDR_MASK, all_zero);
-			}
-		}
-	} else {
-		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
-					struct virtio_net_hdr *, -head_size);
-			virtqueue_xmit_offload(hdr, tx_pkts[i], true);
-		}
-	}
-
-	/* Enqueue Packet buffers */
-	_mm512_storeu_si512((void *)&vq->vq_packed.ring.desc[idx], v_desc);
-
-	virtio_update_batch_stats(&txvq->stats, tx_pkts[0]->pkt_len,
-			tx_pkts[1]->pkt_len, tx_pkts[2]->pkt_len,
-			tx_pkts[3]->pkt_len);
-
-	vq->vq_avail_idx += PACKED_BATCH_SIZE;
-	vq->vq_free_cnt -= PACKED_BATCH_SIZE;
-
-	if (vq->vq_avail_idx >= vq->vq_nentries) {
-		vq->vq_avail_idx -= vq->vq_nentries;
-		vq->vq_packed.cached_flags ^=
-			VRING_PACKED_DESC_F_AVAIL_USED;
-	}
-
-	return 0;
-}
-
-static inline int
-virtqueue_enqueue_single_packed_vec(struct virtnet_tx *txvq,
-				    struct rte_mbuf *txm)
-{
-	struct virtqueue *vq = txvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint16_t hdr_size = hw->vtnet_hdr_size;
-	uint16_t slots, can_push = 0, use_indirect = 0;
-	int16_t need;
-
-	/* optimize ring usage */
-	if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
-	      vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
-	    rte_mbuf_refcnt_read(txm) == 1 &&
-	    RTE_MBUF_DIRECT(txm) &&
-	    txm->nb_segs == 1 &&
-	    rte_pktmbuf_headroom(txm) >= hdr_size)
-		can_push = 1;
-	else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
-		 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
-		use_indirect = 1;
-	/* How many main ring entries are needed to this Tx?
-	 * indirect   => 1
-	 * any_layout => number of segments
-	 * default    => number of segments + 1
-	 */
-	slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
-	need = slots - vq->vq_free_cnt;
-
-	/* Positive value indicates it need free vring descriptors */
-	if (unlikely(need > 0)) {
-		virtio_xmit_cleanup_inorder_packed(vq, need);
-		need = slots - vq->vq_free_cnt;
-		if (unlikely(need > 0)) {
-			PMD_TX_LOG(ERR,
-				   "No free tx descriptors to transmit");
-			return -1;
-		}
-	}
-
-	/* Enqueue Packet buffers */
-	virtqueue_enqueue_xmit_packed(txvq, txm, slots, use_indirect,
-				can_push, 1);
-
-	txvq->stats.bytes += txm->pkt_len;
-	return 0;
-}
-
-uint16_t
-virtio_xmit_pkts_packed_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
-			uint16_t nb_pkts)
-{
-	struct virtnet_tx *txvq = tx_queue;
-	struct virtqueue *vq = txvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint16_t nb_tx = 0;
-	uint16_t remained;
-
-	if (unlikely(hw->started == 0 && tx_pkts != hw->inject_pkts))
-		return nb_tx;
-
-	if (unlikely(nb_pkts < 1))
-		return nb_pkts;
-
-	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
-
-	if (vq->vq_free_cnt <= vq->vq_nentries - vq->vq_free_thresh)
-		virtio_xmit_cleanup_inorder_packed(vq, vq->vq_free_thresh);
-
-	remained = RTE_MIN(nb_pkts, vq->vq_free_cnt);
-
-	while (remained) {
-		if (remained >= PACKED_BATCH_SIZE) {
-			if (!virtqueue_enqueue_batch_packed_vec(txvq,
-						&tx_pkts[nb_tx])) {
-				nb_tx += PACKED_BATCH_SIZE;
-				remained -= PACKED_BATCH_SIZE;
-				continue;
-			}
-		}
-		if (!virtqueue_enqueue_single_packed_vec(txvq,
-					tx_pkts[nb_tx])) {
-			nb_tx++;
-			remained--;
-			continue;
-		}
-		break;
-	};
-
-	txvq->stats.packets += nb_tx;
-
-	if (likely(nb_tx)) {
-		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
-			virtqueue_notify(vq);
-			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
-		}
-	}
-
-	return nb_tx;
-}
-
-/* Optionally fill offload information in structure */
-static inline int
-virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
-{
-	struct rte_net_hdr_lens hdr_lens;
-	uint32_t hdrlen, ptype;
-	int l4_supported = 0;
-
-	/* nothing to do */
-	if (hdr->flags == 0)
-		return 0;
-
-	/* GSO not support in vec path, skip check */
-	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
-
-	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
-	m->packet_type = ptype;
-	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
-	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
-	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
-		l4_supported = 1;
-
-	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
-		if (hdr->csum_start <= hdrlen && l4_supported) {
-			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
-		} else {
-			/* Unknown proto or tunnel, do sw cksum. We can assume
-			 * the cksum field is in the first segment since the
-			 * buffers we provided to the host are large enough.
-			 * In case of SCTP, this will be wrong since it's a CRC
-			 * but there's nothing we can do.
-			 */
-			uint16_t csum = 0, off;
-
-			if (rte_raw_cksum_mbuf(m, hdr->csum_start,
-				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
-				&csum) < 0)
-				return -1;
-			if (likely(csum != 0xffff))
-				csum = ~csum;
-			off = hdr->csum_offset + hdr->csum_start;
-			if (rte_pktmbuf_data_len(m) >= off + 1)
-				*rte_pktmbuf_mtod_offset(m, uint16_t *,
-					off) = csum;
-		}
-	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
-		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
-	}
-
-	return 0;
-}
-
-static inline uint16_t
-virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
-				   struct rte_mbuf **rx_pkts)
-{
-	struct virtqueue *vq = rxvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint16_t hdr_size = hw->vtnet_hdr_size;
-	uint64_t addrs[PACKED_BATCH_SIZE];
-	uint16_t id = vq->vq_used_cons_idx;
-	uint8_t desc_stats;
-	uint16_t i;
-	void *desc_addr;
-
-	if (id & PACKED_BATCH_MASK)
-		return -1;
-
-	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
-		return -1;
-
-	/* only care avail/used bits */
-#if defined(RTE_ARCH_I686)
-	__m512i v_mask = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0,
-					   PACKED_FLAGS_MASK, 0x0);
-#else
-	__m512i v_mask = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
-#endif
-	desc_addr = &vq->vq_packed.ring.desc[id];
-
-	__m512i v_desc = _mm512_loadu_si512(desc_addr);
-	__m512i v_flag = _mm512_and_epi64(v_desc, v_mask);
-
-	__m512i v_used_flag = _mm512_setzero_si512();
-	if (vq->vq_packed.used_wrap_counter)
-#if defined(RTE_ARCH_I686)
-		v_used_flag = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0,
-						PACKED_FLAGS_MASK, 0x0);
-#else
-		v_used_flag = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
-#endif
-
-	/* Check all descs are used */
-	desc_stats = _mm512_cmpneq_epu64_mask(v_flag, v_used_flag);
-	if (desc_stats)
-		return -1;
-
-	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-		rx_pkts[i] = (struct rte_mbuf *)vq->vq_descx[id + i].cookie;
-		rte_packet_prefetch(rte_pktmbuf_mtod(rx_pkts[i], void *));
-
-		addrs[i] = (uintptr_t)rx_pkts[i]->rx_descriptor_fields1;
-	}
-
-	/*
-	 * load len from desc, store into mbuf pkt_len and data_len
-	 * len limiated by l6bit buf_len, pkt_len[16:31] can be ignored
-	 */
-	const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12;
-	__m512i values = _mm512_maskz_shuffle_epi32(mask, v_desc, 0xAA);
-
-	/* reduce hdr_len from pkt_len and data_len */
-	__m512i mbuf_len_offset = _mm512_maskz_set1_epi32(mask,
-			(uint32_t)-hdr_size);
-
-	__m512i v_value = _mm512_add_epi32(values, mbuf_len_offset);
-
-	/* assert offset of data_len */
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
-		offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
-
-	__m512i v_index = _mm512_set_epi64(addrs[3] + 8, addrs[3],
-					   addrs[2] + 8, addrs[2],
-					   addrs[1] + 8, addrs[1],
-					   addrs[0] + 8, addrs[0]);
-	/* batch store into mbufs */
-	_mm512_i64scatter_epi64(0, v_index, v_value, 1);
-
-	if (hw->has_rx_offload) {
-		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-			char *addr = (char *)rx_pkts[i]->buf_addr +
-				RTE_PKTMBUF_HEADROOM - hdr_size;
-			virtio_vec_rx_offload(rx_pkts[i],
-					(struct virtio_net_hdr *)addr);
-		}
-	}
-
-	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
-			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
-			rx_pkts[3]->pkt_len);
-
-	vq->vq_free_cnt += PACKED_BATCH_SIZE;
-
-	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
-	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
-		vq->vq_used_cons_idx -= vq->vq_nentries;
-		vq->vq_packed.used_wrap_counter ^= 1;
-	}
-
-	return 0;
-}
-
-static uint16_t
-virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq,
-				    struct rte_mbuf **rx_pkts)
-{
-	uint16_t used_idx, id;
-	uint32_t len;
-	struct virtqueue *vq = rxvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint32_t hdr_size = hw->vtnet_hdr_size;
-	struct virtio_net_hdr *hdr;
-	struct vring_packed_desc *desc;
-	struct rte_mbuf *cookie;
-
-	desc = vq->vq_packed.ring.desc;
-	used_idx = vq->vq_used_cons_idx;
-	if (!desc_is_used(&desc[used_idx], vq))
-		return -1;
-
-	len = desc[used_idx].len;
-	id = desc[used_idx].id;
-	cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
-	if (unlikely(cookie == NULL)) {
-		PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u",
-				vq->vq_used_cons_idx);
-		return -1;
-	}
-	rte_prefetch0(cookie);
-	rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
-
-	cookie->data_off = RTE_PKTMBUF_HEADROOM;
-	cookie->ol_flags = 0;
-	cookie->pkt_len = (uint32_t)(len - hdr_size);
-	cookie->data_len = (uint32_t)(len - hdr_size);
-
-	hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr +
-					RTE_PKTMBUF_HEADROOM - hdr_size);
-	if (hw->has_rx_offload)
-		virtio_vec_rx_offload(cookie, hdr);
-
-	*rx_pkts = cookie;
-
-	rxvq->stats.bytes += cookie->pkt_len;
-
-	vq->vq_free_cnt++;
-	vq->vq_used_cons_idx++;
-	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
-		vq->vq_used_cons_idx -= vq->vq_nentries;
-		vq->vq_packed.used_wrap_counter ^= 1;
-	}
-
-	return 0;
-}
-
-static inline void
-virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq,
-			      struct rte_mbuf **cookie,
-			      uint16_t num)
-{
-	struct virtqueue *vq = rxvq->vq;
-	struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
-	uint16_t flags = vq->vq_packed.cached_flags;
-	struct virtio_hw *hw = vq->hw;
-	struct vq_desc_extra *dxp;
-	uint16_t idx, i;
-	uint16_t batch_num, total_num = 0;
-	uint16_t head_idx = vq->vq_avail_idx;
-	uint16_t head_flag = vq->vq_packed.cached_flags;
-	uint64_t addr;
-
-	do {
-		idx = vq->vq_avail_idx;
-
-		batch_num = PACKED_BATCH_SIZE;
-		if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
-			batch_num = vq->vq_nentries - idx;
-		if (unlikely((total_num + batch_num) > num))
-			batch_num = num - total_num;
-
-		virtio_for_each_try_unroll(i, 0, batch_num) {
-			dxp = &vq->vq_descx[idx + i];
-			dxp->cookie = (void *)cookie[total_num + i];
-
-			addr = VIRTIO_MBUF_ADDR(cookie[total_num + i], vq) +
-				RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
-			start_dp[idx + i].addr = addr;
-			start_dp[idx + i].len = cookie[total_num + i]->buf_len
-				- RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
-			if (total_num || i) {
-				virtqueue_store_flags_packed(&start_dp[idx + i],
-						flags, hw->weak_barriers);
-			}
-		}
-
-		vq->vq_avail_idx += batch_num;
-		if (vq->vq_avail_idx >= vq->vq_nentries) {
-			vq->vq_avail_idx -= vq->vq_nentries;
-			vq->vq_packed.cached_flags ^=
-				VRING_PACKED_DESC_F_AVAIL_USED;
-			flags = vq->vq_packed.cached_flags;
-		}
-		total_num += batch_num;
-	} while (total_num < num);
-
-	virtqueue_store_flags_packed(&start_dp[head_idx], head_flag,
-				hw->weak_barriers);
-	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num);
-}
-
-uint16_t
-virtio_recv_pkts_packed_vec(void *rx_queue,
-			    struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
-{
-	struct virtnet_rx *rxvq = rx_queue;
-	struct virtqueue *vq = rxvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint16_t num, nb_rx = 0;
-	uint32_t nb_enqueued = 0;
-	uint16_t free_cnt = vq->vq_free_thresh;
-
-	if (unlikely(hw->started == 0))
-		return nb_rx;
-
-	num = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts);
-	if (likely(num > PACKED_BATCH_SIZE))
-		num = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE);
-
-	while (num) {
-		if (!virtqueue_dequeue_batch_packed_vec(rxvq,
-					&rx_pkts[nb_rx])) {
-			nb_rx += PACKED_BATCH_SIZE;
-			num -= PACKED_BATCH_SIZE;
-			continue;
-		}
-		if (!virtqueue_dequeue_single_packed_vec(rxvq,
-					&rx_pkts[nb_rx])) {
-			nb_rx++;
-			num--;
-			continue;
-		}
-		break;
-	};
-
-	PMD_RX_LOG(DEBUG, "dequeue:%d", num);
-
-	rxvq->stats.packets += nb_rx;
-
-	if (likely(vq->vq_free_cnt >= free_cnt)) {
-		struct rte_mbuf *new_pkts[free_cnt];
-		if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,
-						free_cnt) == 0)) {
-			virtio_recv_refill_packed_vec(rxvq, new_pkts,
-					free_cnt);
-			nb_enqueued += free_cnt;
-		} else {
-			struct rte_eth_dev *dev =
-				&rte_eth_devices[rxvq->port_id];
-			dev->data->rx_mbuf_alloc_failed += free_cnt;
-		}
-	}
-
-	if (likely(nb_enqueued)) {
-		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
-			virtqueue_notify(vq);
-			PMD_RX_LOG(DEBUG, "Notified");
-		}
-	}
-
-	return nb_rx;
-}
diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.h b/drivers/net/virtio/virtio_rxtx_packed_avx.h
new file mode 100644
index 000000000..f83182884
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed_avx.h
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtio_rxtx_packed.h"
+#include "virtqueue.h"
+
+static inline int
+virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq,
+				   struct rte_mbuf **tx_pkts)
+{
+	struct virtqueue *vq = txvq->vq;
+	uint16_t head_size = vq->hw->vtnet_hdr_size;
+	uint16_t idx = vq->vq_avail_idx;
+	struct virtio_net_hdr *hdr;
+	struct vq_desc_extra *dxp;
+	uint16_t i, cmp;
+
+	if (vq->vq_avail_idx & PACKED_BATCH_MASK)
+		return -1;
+
+	if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
+		return -1;
+
+	/* Load four mbufs rearm data */
+	RTE_BUILD_BUG_ON(REFCNT_BITS_OFFSET >= 64);
+	RTE_BUILD_BUG_ON(SEG_NUM_BITS_OFFSET >= 64);
+	__m256i mbufs = _mm256_set_epi64x(*tx_pkts[3]->rearm_data,
+					  *tx_pkts[2]->rearm_data,
+					  *tx_pkts[1]->rearm_data,
+					  *tx_pkts[0]->rearm_data);
+
+	/* refcnt=1 and nb_segs=1 */
+	__m256i mbuf_ref = _mm256_set1_epi64x(DEFAULT_REARM_DATA);
+	__m256i head_rooms = _mm256_set1_epi16(head_size);
+
+	/* Check refcnt and nb_segs */
+	const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12;
+	cmp = _mm256_mask_cmpneq_epu16_mask(mask, mbufs, mbuf_ref);
+	if (unlikely(cmp))
+		return -1;
+
+	/* Check headroom is enough */
+	const __mmask16 data_mask = 0x1 | 0x1 << 4 | 0x1 << 8 | 0x1 << 12;
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) !=
+		offsetof(struct rte_mbuf, rearm_data));
+	cmp = _mm256_mask_cmplt_epu16_mask(data_mask, mbufs, head_rooms);
+	if (unlikely(cmp))
+		return -1;
+
+	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		dxp = &vq->vq_descx[idx + i];
+		dxp->ndescs = 1;
+		dxp->cookie = tx_pkts[i];
+	}
+
+	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		tx_pkts[i]->data_off -= head_size;
+		tx_pkts[i]->data_len += head_size;
+	}
+
+	__m512i descs_base = _mm512_set_epi64(tx_pkts[3]->data_len,
+			VIRTIO_MBUF_ADDR(tx_pkts[3], vq),
+			tx_pkts[2]->data_len,
+			VIRTIO_MBUF_ADDR(tx_pkts[2], vq),
+			tx_pkts[1]->data_len,
+			VIRTIO_MBUF_ADDR(tx_pkts[1], vq),
+			tx_pkts[0]->data_len,
+			VIRTIO_MBUF_ADDR(tx_pkts[0], vq));
+
+	/* id offset and data offset */
+	__m512i data_offsets = _mm512_set_epi64((uint64_t)3 << ID_BITS_OFFSET,
+						tx_pkts[3]->data_off,
+						(uint64_t)2 << ID_BITS_OFFSET,
+						tx_pkts[2]->data_off,
+						(uint64_t)1 << ID_BITS_OFFSET,
+						tx_pkts[1]->data_off,
+						0, tx_pkts[0]->data_off);
+
+	__m512i new_descs = _mm512_add_epi64(descs_base, data_offsets);
+
+	uint64_t flags_temp = (uint64_t)idx << ID_BITS_OFFSET |
+		(uint64_t)vq->vq_packed.cached_flags << FLAGS_BITS_OFFSET;
+
+	/* flags offset and guest virtual address offset */
+	__m128i flag_offset = _mm_set_epi64x(flags_temp, 0);
+	__m512i v_offset = _mm512_broadcast_i32x4(flag_offset);
+	__m512i v_desc = _mm512_add_epi64(new_descs, v_offset);
+
+	if (!vq->hw->has_tx_offload) {
+		__m128i all_mask = _mm_set1_epi16(0xFFFF);
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
+					struct virtio_net_hdr *, -head_size);
+			__m128i v_hdr = _mm_loadu_si128((void *)hdr);
+			if (unlikely(_mm_mask_test_epi16_mask(NET_HDR_MASK,
+							v_hdr, all_mask))) {
+				__m128i all_zero = _mm_setzero_si128();
+				_mm_mask_storeu_epi16((void *)hdr,
+						NET_HDR_MASK, all_zero);
+			}
+		}
+	} else {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
+					struct virtio_net_hdr *, -head_size);
+			virtqueue_xmit_offload(hdr, tx_pkts[i], true);
+		}
+	}
+
+	/* Enqueue Packet buffers */
+	_mm512_storeu_si512((void *)&vq->vq_packed.ring.desc[idx], v_desc);
+
+	virtio_update_batch_stats(&txvq->stats, tx_pkts[0]->pkt_len,
+			tx_pkts[1]->pkt_len, tx_pkts[2]->pkt_len,
+			tx_pkts[3]->pkt_len);
+
+	vq->vq_avail_idx += PACKED_BATCH_SIZE;
+	vq->vq_free_cnt -= PACKED_BATCH_SIZE;
+
+	if (vq->vq_avail_idx >= vq->vq_nentries) {
+		vq->vq_avail_idx -= vq->vq_nentries;
+		vq->vq_packed.cached_flags ^=
+			VRING_PACKED_DESC_F_AVAIL_USED;
+	}
+
+	return 0;
+}
+
+static inline uint16_t
+virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
+				   struct rte_mbuf **rx_pkts)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t hdr_size = hw->vtnet_hdr_size;
+	uint64_t addrs[PACKED_BATCH_SIZE];
+	uint16_t id = vq->vq_used_cons_idx;
+	uint8_t desc_stats;
+	uint16_t i;
+	void *desc_addr;
+
+	if (id & PACKED_BATCH_MASK)
+		return -1;
+
+	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
+		return -1;
+
+	/* only care avail/used bits */
+#if defined(RTE_ARCH_I686)
+	__m512i v_mask = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0,
+					   PACKED_FLAGS_MASK, 0x0);
+#else
+	__m512i v_mask = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
+#endif
+	desc_addr = &vq->vq_packed.ring.desc[id];
+
+	__m512i v_desc = _mm512_loadu_si512(desc_addr);
+	__m512i v_flag = _mm512_and_epi64(v_desc, v_mask);
+
+	__m512i v_used_flag = _mm512_setzero_si512();
+	if (vq->vq_packed.used_wrap_counter)
+#if defined(RTE_ARCH_I686)
+		v_used_flag = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0,
+						PACKED_FLAGS_MASK, 0x0);
+#else
+		v_used_flag = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
+#endif
+
+	/* Check all descs are used */
+	desc_stats = _mm512_cmpneq_epu64_mask(v_flag, v_used_flag);
+	if (desc_stats)
+		return -1;
+
+	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		rx_pkts[i] = (struct rte_mbuf *)vq->vq_descx[id + i].cookie;
+		rte_packet_prefetch(rte_pktmbuf_mtod(rx_pkts[i], void *));
+
+		addrs[i] = (uintptr_t)rx_pkts[i]->rx_descriptor_fields1;
+	}
+
+	/*
+	 * load len from desc, store into mbuf pkt_len and data_len
+	 * len limited by 16-bit buf_len, pkt_len[16:31] can be ignored
+	 */
+	const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12;
+	__m512i values = _mm512_maskz_shuffle_epi32(mask, v_desc, 0xAA);
+
+	/* reduce hdr_len from pkt_len and data_len */
+	__m512i mbuf_len_offset = _mm512_maskz_set1_epi32(mask,
+			(uint32_t)-hdr_size);
+
+	__m512i v_value = _mm512_add_epi32(values, mbuf_len_offset);
+
+	/* assert offset of data_len */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+		offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+
+	__m512i v_index = _mm512_set_epi64(addrs[3] + 8, addrs[3],
+					   addrs[2] + 8, addrs[2],
+					   addrs[1] + 8, addrs[1],
+					   addrs[0] + 8, addrs[0]);
+	/* batch store into mbufs */
+	_mm512_i64scatter_epi64(0, v_index, v_value, 1);
+
+	if (hw->has_rx_offload) {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			char *addr = (char *)rx_pkts[i]->buf_addr +
+				RTE_PKTMBUF_HEADROOM - hdr_size;
+			virtio_vec_rx_offload(rx_pkts[i],
+					(struct virtio_net_hdr *)addr);
+		}
+	}
+
+	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
+			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
+			rx_pkts[3]->pkt_len);
+
+	vq->vq_free_cnt += PACKED_BATCH_SIZE;
+
+	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
-- 
2.28.0


Thread overview: 26+ messages
2020-09-11 12:09 [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Joyce Kong
2020-09-11 12:09 ` [dpdk-dev] [RFC 1/3] net/virtio: move AVX based Rx and Tx code to separate file Joyce Kong
2020-09-11 12:09 ` [dpdk-dev] [RFC 2/3] net/virtio: add vectorized packed ring Rx NEON path Joyce Kong
2020-09-11 12:09 ` [dpdk-dev] [RFC 3/3] net/virtio: add election for packed vector " Joyce Kong
2020-10-05  7:34 ` [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Maxime Coquelin
2020-10-08  6:54   ` Joyce Kong
2020-10-15  9:01   ` Ruifeng Wang
2020-10-15  9:02     ` Maxime Coquelin
2020-11-17 10:06 ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX " Joyce Kong
2020-11-17 10:06   ` Joyce Kong [this message]
2021-01-05 14:06     ` [dpdk-dev] [PATCH v1 1/4] net/virtio: move AVX based Rx and Tx code to separate file Maxime Coquelin
2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path Joyce Kong
2021-01-05 14:16     ` Maxime Coquelin
2021-01-05 14:27       ` Maxime Coquelin
2021-01-07 10:39         ` Maxime Coquelin
2021-01-08  7:29           ` Joyce Kong
2021-01-08 17:02     ` Ferruh Yigit
2021-01-08 22:26       ` Honnappa Nagarahalli
2021-01-11 13:05         ` Aaron Conole
2021-01-11 10:45       ` Maxime Coquelin
2021-01-11 13:04       ` Aaron Conole
2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 3/4] net/virtio: add vectorized packed ring Tx " Joyce Kong
2021-01-05 14:33     ` Maxime Coquelin
2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 4/4] net/virtio: add election for packed vector " Joyce Kong
2021-01-05 14:42     ` Maxime Coquelin
2021-01-08  9:11   ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX path with NEON Maxime Coquelin
