From: Wenjun Wu <wenjun1.wu@intel.com>
To: dev@dpdk.org, jingjing.wu@intel.com, beilei.xing@intel.com,
	qi.z.zhang@intel.com
Cc: mingxia.liu@intel.com, Wenjun Wu <wenjun1.wu@intel.com>
Subject: [PATCH v2 1/1] common/idpf: add AVX512 data path for split queue model
Date: Wed, 14 Dec 2022 07:04:25 +0000
Message-ID: <20221214070425.970409-2-wenjun1.wu@intel.com>
In-Reply-To: <20221214070425.970409-1-wenjun1.wu@intel.com>

Add AVX512 vector data path support for the split queue model, covering
both the Rx and Tx burst functions and their queue setup helpers.

Signed-off-by: Wenjun Wu <wenjun1.wu@intel.com>
---
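Notes:

* Split queue Rx: only buffer queue 2 is used by the vector path.
  idpf_splitq_rx_vec_setup() prepares rxq->bufq2,
  idpf_splitq_rearm()/idpf_splitq_rearm_common() refill it in bursts of
  IDPF_RXQ_REARM_THRESH, and the Rx burst function consumes descriptors
  8 at a time, accepting a descriptor only when its DD bit is set and
  its GEN bit matches rxq->expected_gen_id (toggled whenever rx_tail
  wraps around the ring).

* Split queue Tx: data descriptors are posted on the flow scheduling
  ring while completions arrive on a separate completion queue.
  idpf_splitq_scan_cq_ring() tallies completions per Tx queue in
  txq->ctype[], and idpf_tx_splitq_free_bufs_avx512() recycles mbufs in
  rs_thresh batches once enough RS completions have accumulated; a
  condensed restatement of this loop is shown after the diffstat.
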
 drivers/common/idpf/idpf_common_rxtx.c        |  22 +-
 drivers/common/idpf/idpf_common_rxtx.h        |  19 +-
 drivers/common/idpf/idpf_common_rxtx_avx512.c | 797 +++++++++++++++++-
 drivers/common/idpf/version.map               |   5 +-
 drivers/net/idpf/idpf_rxtx.c                  |  44 +-
 drivers/net/idpf/idpf_rxtx_vec_common.h       |  17 +-
 6 files changed, 868 insertions(+), 36 deletions(-)
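
For reference, the core of the new split queue transmit path, condensed
from idpf_splitq_xmit_pkts_vec_avx512_cmn() below; only the explanatory
comments are new, and no names beyond those introduced by this patch
are used:

	while (nb_pkts) {
		uint16_t ret, num;

		/* Credit each Tx queue with the completions reported on
		 * the shared completion queue (txq->ctype[] counters).
		 */
		idpf_splitq_scan_cq_ring(txq->complq);

		/* Once enough RS completions have accumulated, recycle
		 * rs_thresh mbufs back to the mempool.
		 */
		if (txq->ctype[IDPF_TXD_COMPLT_RS] > txq->free_thresh)
			idpf_tx_splitq_free_bufs_avx512(txq);

		/* Post at most rs_thresh descriptors in one fixed burst. */
		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
		ret = idpf_splitq_xmit_fixed_burst_vec_avx512(tx_queue,
							      &tx_pkts[nb_tx],
							      num);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}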

diff --git a/drivers/common/idpf/idpf_common_rxtx.c b/drivers/common/idpf/idpf_common_rxtx.c
index 7618819e68..7f8311d8f6 100644
--- a/drivers/common/idpf/idpf_common_rxtx.c
+++ b/drivers/common/idpf/idpf_common_rxtx.c
@@ -161,6 +161,9 @@ reset_split_rx_bufq(struct idpf_rx_queue *rxq)
 	/* The number of descriptors which can be refilled. */
 	rxq->nb_rx_hold = rxq->nb_rx_desc - 1;
 
+	rxq->rxrearm_nb = 0;
+	rxq->rxrearm_start = 0;
+
 	rxq->bufq1 = NULL;
 	rxq->bufq2 = NULL;
 }
@@ -236,6 +239,10 @@ reset_split_tx_descq(struct idpf_tx_queue *txq)
 	txq->last_desc_cleaned = 0;
 	txq->sw_tail = 0;
 	txq->nb_free = txq->nb_tx_desc - 1;
+
+	memset(txq->ctype, 0, sizeof(txq->ctype));
+	txq->next_dd = txq->rs_thresh - 1;
+	txq->next_rs = txq->rs_thresh - 1;
 }
 
 void
@@ -1426,12 +1433,12 @@ release_rxq_mbufs_vec(struct idpf_rx_queue *rxq)
 	memset(rxq->sw_ring, 0, sizeof(rxq->sw_ring[0]) * rxq->nb_rx_desc);
 }
 
-static const struct idpf_rxq_ops def_singleq_rx_ops_vec = {
+static const struct idpf_rxq_ops def_rx_ops_vec = {
 	.release_mbufs = release_rxq_mbufs_vec,
 };
 
 static inline int
-idpf_singleq_rx_vec_setup_default(struct idpf_rx_queue *rxq)
+idpf_rxq_vec_setup_default(struct idpf_rx_queue *rxq)
 {
 	uintptr_t p;
 	struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
@@ -1451,6 +1458,13 @@ idpf_singleq_rx_vec_setup_default(struct idpf_rx_queue *rxq)
 int __rte_cold
 idpf_singleq_rx_vec_setup(struct idpf_rx_queue *rxq)
 {
-	rxq->ops = &def_singleq_rx_ops_vec;
-	return idpf_singleq_rx_vec_setup_default(rxq);
+	rxq->ops = &def_rx_ops_vec;
+	return idpf_rxq_vec_setup_default(rxq);
+}
+
+int __rte_cold
+idpf_splitq_rx_vec_setup(struct idpf_rx_queue *rxq)
+{
+	rxq->bufq2->ops = &def_rx_ops_vec;
+	return idpf_rxq_vec_setup_default(rxq->bufq2);
 }
diff --git a/drivers/common/idpf/idpf_common_rxtx.h b/drivers/common/idpf/idpf_common_rxtx.h
index 8a6e04af56..e23484d031 100644
--- a/drivers/common/idpf/idpf_common_rxtx.h
+++ b/drivers/common/idpf/idpf_common_rxtx.h
@@ -52,6 +52,8 @@
 #define IDPF_VPMD_TX_MAX_BURST		32
 #define IDPF_VPMD_DESCS_PER_LOOP	4
 #define IDPF_RXQ_REARM_THRESH		64
+#define IDPD_TXQ_SCAN_CQ_THRESH	64
+#define IDPF_TX_CTYPE_NUM	8
 
 /* MTS */
 #define GLTSYN_CMD_SYNC_0_0	(PF_TIMESYNC_BASE + 0x0)
@@ -185,6 +187,7 @@ struct idpf_tx_queue {
 	uint32_t tx_start_qid;
 	uint8_t expected_gen_id;
 	struct idpf_tx_queue *complq;
+	uint16_t ctype[IDPF_TX_CTYPE_NUM];
 };
 
 /* Offload features */
@@ -203,6 +206,12 @@ struct idpf_tx_vec_entry {
 	struct rte_mbuf *mbuf;
 };
 
+union idpf_tx_desc {
+	struct idpf_base_tx_desc *tx_ring;
+	struct idpf_flex_tx_sched_desc *desc_ring;
+	struct idpf_splitq_tx_compl_desc *compl_ring;
+};
+
 struct idpf_rxq_ops {
 	void (*release_mbufs)(struct idpf_rx_queue *rxq);
 };
@@ -265,14 +274,22 @@ uint16_t idpf_prep_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 __rte_internal
 int idpf_singleq_rx_vec_setup(struct idpf_rx_queue *rxq);
 __rte_internal
-int idpf_singleq_tx_vec_setup_avx512(struct idpf_tx_queue *txq);
+int idpf_splitq_rx_vec_setup(struct idpf_rx_queue *rxq);
+__rte_internal
+int idpf_tx_vec_setup_avx512(struct idpf_tx_queue *txq);
 __rte_internal
 uint16_t idpf_singleq_recv_pkts_avx512(void *rx_queue,
 				       struct rte_mbuf **rx_pkts,
 				       uint16_t nb_pkts);
 __rte_internal
+uint16_t idpf_splitq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+				      uint16_t nb_pkts);
+__rte_internal
 uint16_t idpf_singleq_xmit_pkts_avx512(void *tx_queue,
 				       struct rte_mbuf **tx_pkts,
 				       uint16_t nb_pkts);
+__rte_internal
+uint16_t idpf_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+				      uint16_t nb_pkts);
 
 #endif /* _IDPF_COMMON_RXTX_H_ */
diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index de52c6e45f..5a91ed610e 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -537,8 +537,462 @@ idpf_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
 }
 
+static __rte_always_inline void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+	struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+	volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+	uint16_t rx_id;
+	int i;
+
+	rxdp += rx_bufq->rxrearm_start;
+
+	/* Pull 'n' more MBUFs into the software ring */
+	if (rte_mempool_get_bulk(rx_bufq->mp,
+				 (void *)rxp,
+				 IDPF_RXQ_REARM_THRESH) < 0) {
+		if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+		    rx_bufq->nb_rx_desc) {
+			__m128i dma_addr0;
+
+			dma_addr0 = _mm_setzero_si128();
+			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+				rxp[i] = &rx_bufq->fake_mbuf;
+				_mm_store_si128((__m128i *)&rxdp[i],
+						dma_addr0);
+			}
+		}
+		rx_bufq->rx_stats.mbuf_alloc_failed += IDPF_RXQ_REARM_THRESH;
+		return;
+	}
+
+	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
+	for (i = 0; i < IDPF_RXQ_REARM_THRESH;
+			i += 8, rxp += 8, rxdp += 8) {
+		rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
+		rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
+		rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
+		rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
+		rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
+		rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
+		rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
+		rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
+	}
+
+	rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+	if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+		rx_bufq->rxrearm_start = 0;
+
+	rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+			     (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
+static __rte_always_inline void
+idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+	struct rte_mempool_cache *cache =
+		rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id());
+	struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+
+	rxdp += rx_bufq->rxrearm_start;
+
+	if (unlikely(!cache))
+		return idpf_splitq_rearm_common(rx_bufq);
+
+	/* We need to pull 'n' more MBUFs into the software ring from mempool
+	 * We inline the mempool function here, so we can vectorize the copy
+	 * from the cache into the shadow ring.
+	 */
+
+	/* Can this be satisfied from the cache? */
+	if (cache->len < IDPF_RXQ_REARM_THRESH) {
+		/* No. Backfill the cache first, and then fill from it */
+		uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
+							cache->len);
+
+		/* How many do we require i.e. number to fill the cache + the request */
+		int ret = rte_mempool_ops_dequeue_bulk
+				(rx_bufq->mp, &cache->objs[cache->len], req);
+		if (ret == 0) {
+			cache->len += req;
+		} else {
+			if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+			    rx_bufq->nb_rx_desc) {
+				__m128i dma_addr0;
+
+				dma_addr0 = _mm_setzero_si128();
+				for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+					rxp[i] = &rx_bufq->fake_mbuf;
+					_mm_storeu_si128((__m128i *)&rxdp[i],
+							 dma_addr0);
+				}
+			}
+			rx_bufq->rx_stats.mbuf_alloc_failed += IDPF_RXQ_REARM_THRESH;
+			return;
+		}
+	}
+
+	const __m512i iova_offsets =  _mm512_set1_epi64(offsetof
+							(struct rte_mbuf, buf_iova));
+	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+
+	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
+	 * from mempool cache and populating both shadow and HW rings
+	 */
+	for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) {
+		const __m512i mbuf_ptrs = _mm512_loadu_si512
+			(&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]);
+		_mm512_storeu_si512(rxp, mbuf_ptrs);
+
+		const __m512i iova_base_addrs = _mm512_i64gather_epi64
+				(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
+				 0, /* base */
+				 1  /* scale */);
+		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
+				headroom);
+
+		const __m512i iova_addrs_1 = _mm512_bsrli_epi128(iova_addrs, 8);
+
+		rxdp[0].split_rd.pkt_addr =
+			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 0));
+		rxdp[1].split_rd.pkt_addr =
+			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 0));
+		rxdp[2].split_rd.pkt_addr =
+			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 1));
+		rxdp[3].split_rd.pkt_addr =
+			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 1));
+		rxdp[4].split_rd.pkt_addr =
+			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 2));
+		rxdp[5].split_rd.pkt_addr =
+			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 2));
+		rxdp[6].split_rd.pkt_addr =
+			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 3));
+		rxdp[7].split_rd.pkt_addr =
+			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 3));
+
+		rxp += IDPF_DESCS_PER_LOOP_AVX;
+		rxdp += IDPF_DESCS_PER_LOOP_AVX;
+		cache->len -= IDPF_DESCS_PER_LOOP_AVX;
+	}
+
+	rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+	if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+		rx_bufq->rxrearm_start = 0;
+
+	rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+			   (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
+static __rte_always_inline uint16_t
+_idpf_splitq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq,
+				  struct rte_mbuf **rx_pkts,
+				  uint16_t nb_pkts)
+{
+	const uint32_t *type_table = rxq->adapter->ptype_tbl;
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
+						    rxq->bufq2->mbuf_initializer);
+	/* only handle bufq2 here */
+	struct rte_mbuf **sw_ring = &rxq->bufq2->sw_ring[rxq->rx_tail];
+	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
+
+	rxdp += rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+		idpf_splitq_rearm(rxq->bufq2);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (((rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id &
+	      VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) >>
+	     VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) != rxq->expected_gen_id)
+		return 0;
+
+	const __m512i dd_check = _mm512_set1_epi64(1);
+	const __m512i gen_check = _mm512_set1_epi64((uint64_t)1<<46);
+
+	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
+	const __m512i shuf_msk =
+		_mm512_set_epi32
+			(/* 1st descriptor */
+			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
+			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 2nd descriptor */
+			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
+			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 3rd descriptor */
+			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
+			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF,    /* pkt_type set as unknown */
+			 /* 4th descriptor */
+			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
+			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
+					/* octet 15~14, 16 bits data_len */
+			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
+					/* octet 15~14, low 16 bits pkt_len */
+			 0xFFFFFFFF     /* pkt_type set as unknown */
+			);
+	/**
+	 * compile-time check the above shuffle layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi
+	 * calls above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	uint16_t i, received;
+
+	for (i = 0, received = 0; i < nb_pkts;
+	     i += IDPF_DESCS_PER_LOOP_AVX,
+	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
+		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+				    _mm256_loadu_si256((void *)&sw_ring[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256
+			((void *)&rx_pkts[i + 4],
+			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
+#endif
+
+		__m512i raw_desc0_3, raw_desc4_7;
+		const __m128i raw_desc7 =
+			_mm_load_si128((void *)(rxdp + 7));
+		rte_compiler_barrier();
+		const __m128i raw_desc6 =
+			_mm_load_si128((void *)(rxdp + 6));
+		rte_compiler_barrier();
+		const __m128i raw_desc5 =
+			_mm_load_si128((void *)(rxdp + 5));
+		rte_compiler_barrier();
+		const __m128i raw_desc4 =
+			_mm_load_si128((void *)(rxdp + 4));
+		rte_compiler_barrier();
+		const __m128i raw_desc3 =
+			_mm_load_si128((void *)(rxdp + 3));
+		rte_compiler_barrier();
+		const __m128i raw_desc2 =
+			_mm_load_si128((void *)(rxdp + 2));
+		rte_compiler_barrier();
+		const __m128i raw_desc1 =
+			_mm_load_si128((void *)(rxdp + 1));
+		rte_compiler_barrier();
+		const __m128i raw_desc0 =
+			_mm_load_si128((void *)(rxdp + 0));
+
+		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
+		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
+		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
+		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
+
+		/**
+		 * convert descriptors 4-7 into mbufs, adjusting length and
+		 * re-arranging fields. Then write into the mbuf
+		 */
+		const __m512i len_mask = _mm512_set_epi32(0xffffffff, 0xffffffff,
+							  0xffff3fff, 0xffffffff,
+							  0xffffffff, 0xffffffff,
+							  0xffff3fff, 0xffffffff,
+							  0xffffffff, 0xffffffff,
+							  0xffff3fff, 0xffffffff,
+							  0xffffffff, 0xffffffff,
+							  0xffff3fff, 0xffffffff);
+		const __m512i desc4_7 = _mm512_and_epi32(raw_desc4_7, len_mask);
+		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
+
+		/**
+		 * to get packet types, shift 64-bit values down 16 bits
+		 * so that the ptype is in the lower 8 bits of each descriptor
+		 */
+		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16);
+		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
+		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
+		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16);
+		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0);
+		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16);
+		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0);
+
+		const __m512i ptype4_7 = _mm512_set_epi32
+			(0, 0, 0, type_table[ptype7],
+			 0, 0, 0, type_table[ptype6],
+			 0, 0, 0, type_table[ptype5],
+			 0, 0, 0, type_table[ptype4]);
+		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+
+		/**
+		 * convert descriptors 0-3 into mbufs, adjusting length and
+		 * re-arranging fields. Then write into the mbuf
+		 */
+		const __m512i desc0_3 = _mm512_and_epi32(raw_desc0_3, len_mask);
+		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
+
+		/* get the packet types */
+		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16);
+		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
+		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
+		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16);
+		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0);
+		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16);
+		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0);
+
+		const __m512i ptype0_3 = _mm512_set_epi32
+			(0, 0, 0, type_table[ptype3],
+			 0, 0, 0, type_table[ptype2],
+			 0, 0, 0, type_table[ptype1],
+			 0, 0, 0, type_table[ptype0]);
+		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+
+		/**
+		 * use permute/extract to get status and generation bit content
+		 * After the operations, the packets status flags are in the
+		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+		 */
+
+		const __m512i dd_permute_msk = _mm512_set_epi64
+			(11, 15, 3, 7, 9, 13, 1, 5);
+		const __m512i status0_7 = _mm512_permutex2var_epi64
+			(raw_desc4_7, dd_permute_msk, raw_desc0_3);
+		const __m512i gen_permute_msk = _mm512_set_epi64
+			(10, 14, 2, 6, 8, 12, 0, 4);
+		const __m512i raw_gen0_7 = _mm512_permutex2var_epi64
+			(raw_desc4_7, gen_permute_msk, raw_desc0_3);
+
+		/* now do flag manipulation */
+
+		/**
+		 * At this point, we have the 8 sets of flags in the low 16-bits
+		 * of each 32-bit value in vlan0.
+		 * We want to extract these, and merge them with the mbuf init
+		 * data so we can do a single write to the mbuf to set the flags
+		 * and all the other initialization fields. Extracting the
+		 * appropriate flags means that we have to do a shift and blend
+		 * for each mbuf before we do the write. However, we can also
+		 * add in the previously computed rx_descriptor fields to
+		 * make a single 256-bit write per mbuf
+		 */
+		/* check the structure matches expectations */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				 offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				 RTE_ALIGN(offsetof(struct rte_mbuf,
+						    rearm_data),
+						    16));
+				/* build up data and do writes */
+		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
+			rearm6, rearm7;
+		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
+		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
+		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
+		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
+
+		rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
+
+		/* write to mbuf */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
+				    rearm6);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
+				    rearm4);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
+				    rearm2);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
+				    rearm0);
+
+		rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
+
+		/* again write to mbufs */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
+				    rearm7);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
+				    rearm5);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
+				    rearm3);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
+				    rearm1);
+
+		const __mmask8 dd_mask = _mm512_cmpeq_epi64_mask(
+			_mm512_and_epi64(status0_7, dd_check), dd_check);
+		const __mmask8 gen_mask = _mm512_cmpeq_epi64_mask(
+			_mm512_and_epi64(raw_gen0_7, gen_check),
+			_mm512_set1_epi64((uint64_t)rxq->expected_gen_id << 46));
+		const __mmask8 recv_mask = _kand_mask8(dd_mask, gen_mask);
+		uint16_t burst = __builtin_popcount(_cvtmask8_u32(recv_mask));
+
+		received += burst;
+		if (burst != IDPF_DESCS_PER_LOOP_AVX)
+			break;
+	}
+
+	/* update tail pointers */
+	rxq->rx_tail += received;
+	rxq->expected_gen_id ^= ((rxq->rx_tail & rxq->nb_rx_desc) != 0);
+	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
+		rxq->rx_tail--;
+		received--;
+	}
+
+	rxq->bufq2->rxrearm_nb += received;
+	return received;
+}
+
+/* only bufq2 can receive pkts */
+uint16_t
+idpf_splitq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	return _idpf_splitq_recv_raw_pkts_avx512(rx_queue, rx_pkts,
+						 nb_pkts);
+}
+
 static __rte_always_inline int
-idpf_tx_free_bufs_avx512(struct idpf_tx_queue *txq)
+idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 {
 	struct idpf_tx_vec_entry *txep;
 	uint32_t n;
@@ -657,7 +1111,7 @@ tx_backlog_entry_avx512(struct idpf_tx_vec_entry *txep,
 
 #define IDPF_FLEX_TXD_QW1_BUF_SZ_S 48
 static __rte_always_inline void
-idpf_vtx1(volatile struct idpf_flex_tx_desc *txdp,
+idpf_singleq_vtx1(volatile struct idpf_flex_tx_desc *txdp,
 	  struct rte_mbuf *pkt, uint64_t flags)
 {
 	uint64_t high_qw =
@@ -673,7 +1127,7 @@ idpf_vtx1(volatile struct idpf_flex_tx_desc *txdp,
 #define IDPF_TX_LEN_MASK 0xAA
 #define IDPF_TX_OFF_MASK 0x55
 static __rte_always_inline void
-idpf_vtx(volatile struct idpf_flex_tx_desc *txdp,
+idpf_singleq_vtx(volatile struct idpf_flex_tx_desc *txdp,
 	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
 {
 	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_FLEX_DATA  |
@@ -681,7 +1135,7 @@ idpf_vtx(volatile struct idpf_flex_tx_desc *txdp,
 
 	/* if unaligned on 32-bit boundary, do one to align */
 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
-		idpf_vtx1(txdp, *pkt, flags);
+		idpf_singleq_vtx1(txdp, *pkt, flags);
 		nb_pkts--, txdp++, pkt++;
 	}
 
@@ -719,14 +1173,14 @@ idpf_vtx(volatile struct idpf_flex_tx_desc *txdp,
 
 	/* do any last ones */
 	while (nb_pkts) {
-		idpf_vtx1(txdp, *pkt, flags);
+		idpf_singleq_vtx1(txdp, *pkt, flags);
 		txdp++, pkt++, nb_pkts--;
 	}
 }
 
 static __rte_always_inline uint16_t
-idpf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
-				 uint16_t nb_pkts)
+idpf_singleq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+					 uint16_t nb_pkts)
 {
 	struct idpf_tx_queue *txq = tx_queue;
 	volatile struct idpf_flex_tx_desc *txdp;
@@ -739,7 +1193,7 @@ idpf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
 
 	if (txq->nb_free < txq->free_thresh)
-		idpf_tx_free_bufs_avx512(txq);
+		idpf_tx_singleq_free_bufs_avx512(txq);
 
 	nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
 	nb_commit = nb_pkts;
@@ -757,11 +1211,11 @@ idpf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	if (nb_commit >= n) {
 		tx_backlog_entry_avx512(txep, tx_pkts, n);
 
-		idpf_vtx(txdp, tx_pkts, n - 1, flags);
+		idpf_singleq_vtx(txdp, tx_pkts, n - 1, flags);
 		tx_pkts += (n - 1);
 		txdp += (n - 1);
 
-		idpf_vtx1(txdp, *tx_pkts++, rs);
+		idpf_singleq_vtx1(txdp, *tx_pkts++, rs);
 
 		nb_commit = (uint16_t)(nb_commit - n);
 
@@ -776,7 +1230,7 @@ idpf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
 
-	idpf_vtx(txdp, tx_pkts, nb_commit, flags);
+	idpf_singleq_vtx(txdp, tx_pkts, nb_commit, flags);
 
 	tx_id = (uint16_t)(tx_id + nb_commit);
 	if (tx_id > txq->next_rs) {
@@ -795,7 +1249,7 @@ idpf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 }
 
 static __rte_always_inline uint16_t
-idpf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
+idpf_singleq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 			      uint16_t nb_pkts)
 {
 	uint16_t nb_tx = 0;
@@ -805,7 +1259,7 @@ idpf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint16_t ret, num;
 
 		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
-		ret = idpf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
+		ret = idpf_singleq_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
 						       num);
 		nb_tx += ret;
 		nb_pkts -= ret;
@@ -820,11 +1274,313 @@ uint16_t
 idpf_singleq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 			     uint16_t nb_pkts)
 {
-	return idpf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts);
+	return idpf_singleq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts);
+}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct idpf_tx_queue *cq)
+{
+	struct idpf_splitq_tx_compl_desc *compl_ring;
+	struct idpf_tx_queue *txq;
+	uint16_t genid, txq_qid, cq_qid, i;
+	uint8_t ctype;
+
+	cq_qid = cq->tx_tail;
+
+	for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
+		if (cq_qid == cq->nb_tx_desc) {
+			cq_qid = 0;
+			cq->expected_gen_id ^= 1;
+		}
+		compl_ring = &cq->compl_ring[cq_qid];
+		genid = (compl_ring->qid_comptype_gen &
+			rte_cpu_to_le_64(IDPF_TXD_COMPLQ_GEN_M)) >> IDPF_TXD_COMPLQ_GEN_S;
+		if (genid != cq->expected_gen_id)
+			break;
+		ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+			IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+		txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+			IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+		txq = cq->txqs[txq_qid - cq->tx_start_qid];
+		txq->ctype[ctype]++;
+		cq_qid++;
+	}
+
+	cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline int
+idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
+{
+	struct idpf_tx_vec_entry *txep;
+	uint32_t n;
+	uint32_t i;
+	int nb_free = 0;
+	struct rte_mbuf *m, *free[txq->rs_thresh];
+
+	n = txq->rs_thresh;
+
+	 /* first buffer to free from S/W ring is at index
+	  * tx_next_dd - (tx_rs_thresh-1)
+	  */
+	txep = (void *)txq->sw_ring;
+	txep += txq->next_dd - (n - 1);
+
+	if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
+		struct rte_mempool *mp = txep[0].mbuf->pool;
+		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
+								rte_lcore_id());
+		void **cache_objs;
+
+		if (!cache || cache->len == 0)
+			goto normal;
+
+		cache_objs = &cache->objs[cache->len];
+
+		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+			goto done;
+		}
+
+		/* The cache is maintained as follows:
+		 *   1. Add the freed mbufs to the cache.
+		 *   2. If the cache then exceeds its flush threshold, return
+		 *   the entries above cache->size to the mempool ring.
+		 */
+		/* Add elements back into the cache */
+		uint32_t copied = 0;
+		/* n is multiple of 32 */
+		while (copied < n) {
+			const __m512i a = _mm512_loadu_si512(&txep[copied]);
+			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
+			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
+			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);
+
+			_mm512_storeu_si512(&cache_objs[copied], a);
+			_mm512_storeu_si512(&cache_objs[copied + 8], b);
+			_mm512_storeu_si512(&cache_objs[copied + 16], c);
+			_mm512_storeu_si512(&cache_objs[copied + 24], d);
+			copied += 32;
+		}
+		cache->len += n;
+
+		if (cache->len >= cache->flushthresh) {
+			rte_mempool_ops_enqueue_bulk(mp,
+						     &cache->objs[cache->size],
+						     cache->len - cache->size);
+			cache->len = cache->size;
+		}
+		goto done;
+	}
+
+normal:
+	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
+	if (likely(m)) {
+		free[0] = m;
+		nb_free = 1;
+		for (i = 1; i < n; i++) {
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (likely(m)) {
+				if (likely(m->pool == free[0]->pool)) {
+					free[nb_free++] = m;
+				} else {
+					rte_mempool_put_bulk(free[0]->pool,
+							     (void *)free,
+							     nb_free);
+					free[0] = m;
+					nb_free = 1;
+				}
+			}
+		}
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+	} else {
+		for (i = 1; i < n; i++) {
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (m)
+				rte_mempool_put(m->pool, m);
+		}
+	}
+
+done:
+	/* buffers were freed, update counters */
+	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
+	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
+	if (txq->next_dd >= txq->nb_tx_desc)
+		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
+	txq->ctype[IDPF_TXD_COMPLT_RS] -= txq->rs_thresh;
+
+	return txq->rs_thresh;
+}
+
+#define IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S	48
+
+static __rte_always_inline void
+idpf_splitq_vtx1(volatile struct idpf_flex_tx_sched_desc *txdp,
+	  struct rte_mbuf *pkt, uint64_t flags)
+{
+	uint64_t high_qw =
+		(IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+		 ((uint64_t)flags) |
+		 ((uint64_t)pkt->data_len << IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S));
+
+	__m128i descriptor = _mm_set_epi64x(high_qw,
+					    pkt->buf_iova + pkt->data_off);
+	_mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx(volatile struct idpf_flex_tx_sched_desc *txdp,
+	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+{
+	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE  |
+			((uint64_t)flags));
+
+	/* if unaligned on 32-bit boundary, do one to align */
+	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+		idpf_splitq_vtx1(txdp, *pkt, flags);
+		nb_pkts--, txdp++, pkt++;
+	}
+
+	/* do 4 at a time while possible, in bursts */
+	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
+		uint64_t hi_qw3 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[3]->data_len <<
+			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
+		uint64_t hi_qw2 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[2]->data_len <<
+			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
+		uint64_t hi_qw1 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[1]->data_len <<
+			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
+		uint64_t hi_qw0 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[0]->data_len <<
+			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
+
+		__m512i desc0_3 =
+			_mm512_set_epi64
+				(hi_qw3,
+				 pkt[3]->buf_iova + pkt[3]->data_off,
+				 hi_qw2,
+				 pkt[2]->buf_iova + pkt[2]->data_off,
+				 hi_qw1,
+				 pkt[1]->buf_iova + pkt[1]->data_off,
+				 hi_qw0,
+				 pkt[0]->buf_iova + pkt[0]->data_off);
+		_mm512_storeu_si512((void *)txdp, desc0_3);
+	}
+
+	/* do any last ones */
+	while (nb_pkts) {
+		idpf_splitq_vtx1(txdp, *pkt, flags);
+		txdp++, pkt++, nb_pkts--;
+	}
+}
+
+static __rte_always_inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+					uint16_t nb_pkts)
+{
+	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
+	volatile struct idpf_flex_tx_sched_desc *txdp;
+	struct idpf_tx_vec_entry *txep;
+	uint16_t n, nb_commit, tx_id;
+	/* bit2 is reserved and must be set to 1 according to Spec */
+	uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+
+	tx_id = txq->tx_tail;
+
+	/* crossing the rs_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
+
+	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->desc_ring[tx_id];
+	txep = (void *)txq->sw_ring;
+	txep += tx_id;
+
+	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		tx_backlog_entry_avx512(txep, tx_pkts, n);
+
+		idpf_splitq_vtx((void *)txdp, tx_pkts, n - 1, cmd_dtype);
+		tx_pkts += (n - 1);
+		txdp += (n - 1);
+
+		idpf_splitq_vtx1((void *)txdp, *tx_pkts++, cmd_dtype);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
+
+		/* avoid reach the end of ring */
+		txdp = &txq->desc_ring[tx_id];
+		txep = (void *)txq->sw_ring;
+		txep += tx_id;
+	}
+
+	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
+
+	idpf_splitq_vtx((void *)txdp, tx_pkts, nb_commit, cmd_dtype);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->next_rs)
+		txq->next_rs =
+			(uint16_t)(txq->next_rs + txq->rs_thresh);
+
+	txq->tx_tail = tx_id;
+
+	IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+static __rte_always_inline uint16_t
+idpf_splitq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
+				     uint16_t nb_pkts)
+{
+	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
+	uint16_t nb_tx = 0;
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		idpf_splitq_scan_cq_ring(txq->complq);
+
+		if (txq->ctype[IDPF_TXD_COMPLT_RS] > txq->free_thresh)
+			idpf_tx_splitq_free_bufs_avx512(txq);
+
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
+		ret = idpf_splitq_xmit_fixed_burst_vec_avx512(tx_queue,
+							      &tx_pkts[nb_tx],
+							      num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num)
+			break;
+	}
+
+	return nb_tx;
+}
+
+uint16_t
+idpf_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+			     uint16_t nb_pkts)
+{
+	return idpf_splitq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts);
 }
 
 static inline void
-idpf_singleq_tx_release_mbufs_avx512(struct idpf_tx_queue *txq)
+idpf_tx_release_mbufs_avx512(struct idpf_tx_queue *txq)
 {
 	unsigned int i;
 	const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);
@@ -843,13 +1599,16 @@ idpf_singleq_tx_release_mbufs_avx512(struct idpf_tx_queue *txq)
 	}
 }
 
-static const struct idpf_txq_ops avx512_singleq_tx_vec_ops = {
-	.release_mbufs = idpf_singleq_tx_release_mbufs_avx512,
+static const struct idpf_txq_ops avx512_tx_vec_ops = {
+	.release_mbufs = idpf_tx_release_mbufs_avx512,
 };
 
 int __rte_cold
-idpf_singleq_tx_vec_setup_avx512(struct idpf_tx_queue *txq)
+idpf_tx_vec_setup_avx512(struct idpf_tx_queue *txq)
 {
-	txq->ops = &avx512_singleq_tx_vec_ops;
+	if (!txq)
+		return 0;
+
+	txq->ops = &avx512_tx_vec_ops;
 	return 0;
 }
diff --git a/drivers/common/idpf/version.map b/drivers/common/idpf/version.map
index 33747a5c69..7018a1d695 100644
--- a/drivers/common/idpf/version.map
+++ b/drivers/common/idpf/version.map
@@ -44,9 +44,12 @@ INTERNAL {
 	idpf_singleq_xmit_pkts;
 	idpf_prep_pkts;
 	idpf_singleq_rx_vec_setup;
-	idpf_singleq_tx_vec_setup_avx512;
+	idpf_splitq_rx_vec_setup;
+	idpf_tx_vec_setup_avx512;
 	idpf_singleq_recv_pkts_avx512;
+	idpf_splitq_recv_pkts_avx512;
 	idpf_singleq_xmit_pkts_avx512;
+	idpf_splitq_xmit_pkts_avx512;
 
 	local: *;
 };
diff --git a/drivers/net/idpf/idpf_rxtx.c b/drivers/net/idpf/idpf_rxtx.c
index b6d4e5abd0..e30d7c56ee 100644
--- a/drivers/net/idpf/idpf_rxtx.c
+++ b/drivers/net/idpf/idpf_rxtx.c
@@ -761,7 +761,8 @@ idpf_set_rx_function(struct rte_eth_dev *dev)
 		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
 #ifdef CC_AVX512_SUPPORT
 			if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
-			    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)
+			    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
+			    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512DQ))
 				vport->rx_use_avx512 = true;
 #else
 		PMD_DRV_LOG(NOTICE,
@@ -774,6 +775,20 @@ idpf_set_rx_function(struct rte_eth_dev *dev)
 
 #ifdef RTE_ARCH_X86
 	if (vport->rxq_model == VIRTCHNL2_QUEUE_MODEL_SPLIT) {
+#ifdef RTE_ARCH_X86
+		if (vport->rx_vec_allowed) {
+			for (i = 0; i < dev->data->nb_rx_queues; i++) {
+				rxq = dev->data->rx_queues[i];
+				(void)idpf_splitq_rx_vec_setup(rxq);
+			}
+#ifdef CC_AVX512_SUPPORT
+			if (vport->rx_use_avx512) {
+				dev->rx_pkt_burst = idpf_splitq_recv_pkts_avx512;
+				return;
+			}
+#endif
+		}
+#endif
 		dev->rx_pkt_burst = idpf_splitq_recv_pkts;
 	} else {
 		if (vport->rx_vec_allowed) {
@@ -809,14 +824,22 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
 	int i;
 #endif /* CC_AVX512_SUPPORT */
 
-	if (idpf_rx_vec_dev_check_default(dev) == IDPF_VECTOR_PATH &&
+	if (idpf_tx_vec_dev_check_default(dev) == IDPF_VECTOR_PATH &&
 	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
 		vport->tx_vec_allowed = true;
 		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
 #ifdef CC_AVX512_SUPPORT
+		{
 			if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 			    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)
 				vport->tx_use_avx512 = true;
+			if (vport->tx_use_avx512) {
+				for (i = 0; i < dev->data->nb_tx_queues; i++) {
+					txq = dev->data->tx_queues[i];
+					idpf_tx_vec_setup_avx512(txq);
+				}
+			}
+		}
 #else
 		PMD_DRV_LOG(NOTICE,
 			    "AVX512 is not supported in build env");
@@ -827,6 +850,17 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
 #endif /* RTE_ARCH_X86 */
 
 	if (vport->txq_model == VIRTCHNL2_QUEUE_MODEL_SPLIT) {
+#ifdef RTE_ARCH_X86
+		if (vport->tx_vec_allowed) {
+#ifdef CC_AVX512_SUPPORT
+			if (vport->tx_use_avx512) {
+				dev->tx_pkt_burst = idpf_splitq_xmit_pkts_avx512;
+				dev->tx_pkt_prepare = idpf_prep_pkts;
+				return;
+			}
+#endif
+		}
+#endif
 		dev->tx_pkt_burst = idpf_splitq_xmit_pkts;
 		dev->tx_pkt_prepare = idpf_prep_pkts;
 	} else {
@@ -834,12 +868,6 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
 		if (vport->tx_vec_allowed) {
 #ifdef CC_AVX512_SUPPORT
 			if (vport->tx_use_avx512) {
-				for (i = 0; i < dev->data->nb_tx_queues; i++) {
-					txq = dev->data->tx_queues[i];
-					if (txq == NULL)
-						continue;
-					idpf_singleq_tx_vec_setup_avx512(txq);
-				}
 				dev->tx_pkt_burst = idpf_singleq_xmit_pkts_avx512;
 				dev->tx_pkt_prepare = idpf_prep_pkts;
 				return;
diff --git a/drivers/net/idpf/idpf_rxtx_vec_common.h b/drivers/net/idpf/idpf_rxtx_vec_common.h
index 0f4e10e154..650d81f596 100644
--- a/drivers/net/idpf/idpf_rxtx_vec_common.h
+++ b/drivers/net/idpf/idpf_rxtx_vec_common.h
@@ -63,16 +63,27 @@ idpf_tx_vec_queue_default(struct idpf_tx_queue *txq)
 	return IDPF_VECTOR_PATH;
 }
 
+static inline int
+idpf_rx_splitq_vec_default(struct idpf_rx_queue *rxq)
+{
+	if (rxq->bufq2->rx_buf_len < rxq->max_pkt_len)
+		return -1;
+
+	return IDPF_VECTOR_PATH;
+}
+
 static inline int
 idpf_rx_vec_dev_check_default(struct rte_eth_dev *dev)
 {
-	int i;
+	struct idpf_vport *vport = dev->data->dev_private;
 	struct idpf_rx_queue *rxq;
-	int ret = 0;
+	int i, ret = 0;
 
 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
 		rxq = dev->data->rx_queues[i];
-		ret = (idpf_rx_vec_queue_default(rxq));
+		ret = idpf_rx_vec_queue_default(rxq) ||
+		      (vport->rxq_model == VIRTCHNL2_QUEUE_MODEL_SPLIT &&
+		       idpf_rx_splitq_vec_default(rxq));
 		if (ret < 0)
 			return -1;
 	}
-- 
2.34.1


Thread overview: 25+ messages
2022-12-14  6:07 [PATCH v1] common/idpf: add AVX512 data path for split queue model Wenjun Wu
2022-12-14  7:04 ` [PATCH v2 0/1] Add support AVX512 split queue datapath Wenjun Wu
2022-12-14  7:04   ` Wenjun Wu [this message]
2023-01-06  6:46 ` [PATCH v3 " Wenjun Wu
2023-01-06  6:46   ` [PATCH v3 1/1] common/idpf: add AVX512 data path for split queue model Wenjun Wu
2023-01-11  7:13     ` Liu, Mingxia
2023-01-12  2:40 ` [PATCH v4 0/1] Add support AVX512 split queue datapath Wenjun Wu
2023-01-12  2:40   ` [PATCH v4 1/1] common/idpf: add AVX512 data path for split queue model Wenjun Wu
2023-01-12  7:27     ` Lu, Wenzhuo
2023-01-13  1:51 ` [PATCH v5 0/1] Add support AVX512 split queue datapath Wenjun Wu
2023-01-13  1:51   ` [PATCH v5 1/1] common/idpf: add AVX512 data path for split queue model Wenjun Wu
2023-01-14  3:54 ` [PATCH v6 0/1] Add support AVX512 split queue datapath Wenjun Wu
2023-01-14  3:54   ` [PATCH v6 1/1] common/idpf: add AVX512 data path for split queue model Wenjun Wu
2023-01-18  3:51 ` [PATCH v7 0/1] Add support AVX512 split queue datapath Wenjun Wu
2023-01-18  3:51   ` [PATCH v7 1/1] common/idpf: add AVX512 data path for split queue model Wenjun Wu
2023-02-06  3:34 ` [PATCH v8 0/1] Add support AVX512 split queue datapath Wenjun Wu
2023-02-06  3:34   ` [PATCH v8 1/1] common/idpf: add AVX512 data path for split queue model Wenjun Wu
2023-02-06  3:41 ` [PATCH v9 0/1] Add support AVX512 split queue datapath Wenjun Wu
2023-02-06  3:41   ` [PATCH v9 1/1] common/idpf: add AVX512 data path for split queue model Wenjun Wu
2023-02-07  2:11 ` [PATCH v10 0/1] Add support AVX512 split queue datapath Wenjun Wu
2023-02-07  2:11   ` [PATCH v10 1/1] common/idpf: add AVX512 data path for split queue model Wenjun Wu
2023-02-07  3:10     ` Zhang, Qi Z
2023-02-07  8:45 ` [PATCH v11 0/1] Add support AVX512 split queue datapath Wenjun Wu
2023-02-07  8:45   ` [PATCH v11 1/1] common/idpf: add AVX512 data path for split queue model Wenjun Wu
2023-02-08  1:35   ` [PATCH v11 0/1] Add support AVX512 split queue datapath Zhang, Qi Z
