All of lore.kernel.org
 help / color / mirror / Atom feed
From: Leyi Rong <leyi.rong@intel.com>
To: jingjing.wu@intel.com, qi.z.zhang@intel.com,
	beilei.xing@intel.com, xiaolong.ye@intel.com
Cc: dev@dpdk.org, Leyi Rong <leyi.rong@intel.com>
Subject: [dpdk-dev] [PATCH v6 04/11] net/iavf: flexible Rx descriptor support in AVX path
Date: Mon, 20 Apr 2020 14:16:18 +0800	[thread overview]
Message-ID: <20200420061625.21424-5-leyi.rong@intel.com> (raw)
In-Reply-To: <20200420061625.21424-1-leyi.rong@intel.com>

Support flexible Rx descriptor format in AVX
path of iAVF PMD.

Signed-off-by: Leyi Rong <leyi.rong@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c          |  24 +-
 drivers/net/iavf/iavf_rxtx.h          |   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c | 539 ++++++++++++++++++++++++++
 3 files changed, 563 insertions(+), 6 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 2eac1de2b..8a1292823 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2087,16 +2087,28 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 				    "Using %sVector Scattered Rx (port %d).",
 				    use_avx2 ? "avx2 " : "",
 				    dev->data->port_id);
-			dev->rx_pkt_burst = use_avx2 ?
-					    iavf_recv_scattered_pkts_vec_avx2 :
-					    iavf_recv_scattered_pkts_vec;
+			if (vf->vf_res->vf_cap_flags &
+				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+				dev->rx_pkt_burst = use_avx2 ?
+					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
+					iavf_recv_scattered_pkts_vec;
+			else
+				dev->rx_pkt_burst = use_avx2 ?
+					iavf_recv_scattered_pkts_vec_avx2 :
+					iavf_recv_scattered_pkts_vec;
 		} else {
 			PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
 				    use_avx2 ? "avx2 " : "",
 				    dev->data->port_id);
-			dev->rx_pkt_burst = use_avx2 ?
-					    iavf_recv_pkts_vec_avx2 :
-					    iavf_recv_pkts_vec;
+			if (vf->vf_res->vf_cap_flags &
+				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+				dev->rx_pkt_burst = use_avx2 ?
+					iavf_recv_pkts_vec_avx2_flex_rxd :
+					iavf_recv_pkts_vec;
+			else
+				dev->rx_pkt_burst = use_avx2 ?
+					iavf_recv_pkts_vec_avx2 :
+					iavf_recv_pkts_vec;
 		}
 
 		return;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 8246e797f..84ec39132 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -414,9 +414,15 @@ uint16_t iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 				  uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 				 uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue,
+					  struct rte_mbuf **rx_pkts,
+					  uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx2(void *rx_queue,
 					   struct rte_mbuf **rx_pkts,
 					   uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx2_flex_rxd(void *rx_queue,
+						    struct rte_mbuf **rx_pkts,
+						    uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 			    uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index 2587083d8..04603e7d4 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -614,6 +614,464 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 	return received;
 }
 
+static inline uint16_t
+_iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
+				      struct rte_mbuf **rx_pkts,
+				      uint16_t nb_pkts, uint8_t *split_packet)
+{
+#define IAVF_DESCS_PER_LOOP_AVX 8
+
+	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
+			0, rxq->mbuf_initializer);
+	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+	volatile union iavf_rx_flex_desc *rxdp =
+		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
+		iavf_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.status_error0 &
+			rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
+		return 0;
+
+	/* constants used in processing loop */
+	const __m256i crc_adjust =
+		_mm256_set_epi16
+			(/* first descriptor */
+			 0, 0, 0,       /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 0,             /* ignore high-16bits of pkt_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0, 0,          /* ignore pkt_type field */
+			 /* second descriptor */
+			 0, 0, 0,       /* ignore non-length fields */
+			 -rxq->crc_len, /* sub crc on data_len */
+			 0,             /* ignore high-16bits of pkt_len */
+			 -rxq->crc_len, /* sub crc on pkt_len */
+			 0, 0           /* ignore pkt_type field */
+			);
+
+	/* 8 packets DD mask, LSB in each 32-bit value */
+	const __m256i dd_check = _mm256_set1_epi32(1);
+
+	/* 8 packets EOP mask, second-LSB in each 32-bit value */
+	const __m256i eop_check = _mm256_slli_epi32(dd_check,
+			IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
+
+	/* mask to shuffle from desc. to mbuf (2 descriptors)*/
+	const __m256i shuf_msk =
+		_mm256_set_epi8
+			(/* first descriptor */
+			 15, 14,
+			 13, 12,	/* octet 12~15, 32 bits rss */
+			 11, 10,	/* octet 10~11, 16 bits vlan_macip */
+			 5, 4,		/* octet 4~5, 16 bits data_len */
+			 0xFF, 0xFF,	/* skip hi 16 bits pkt_len, zero out */
+			 5, 4,		/* octet 4~5, 16 bits pkt_len */
+			 0xFF, 0xFF,	/* pkt_type set as unknown */
+			 0xFF, 0xFF,	/*pkt_type set as unknown */
+			 /* second descriptor */
+			 15, 14,
+			 13, 12,	/* octet 12~15, 32 bits rss */
+			 11, 10,	/* octet 10~11, 16 bits vlan_macip */
+			 5, 4,		/* octet 4~5, 16 bits data_len */
+			 0xFF, 0xFF,	/* skip hi 16 bits pkt_len, zero out */
+			 5, 4,		/* octet 4~5, 16 bits pkt_len */
+			 0xFF, 0xFF,	/* pkt_type set as unknown */
+			 0xFF, 0xFF	/*pkt_type set as unknown */
+			);
+	/**
+	 * compile-time check the above crc and shuffle layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi
+	 * calls above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	/* Status/Error flag masks */
+	/**
+	 * mask everything except Checksum Reports, RSS indication
+	 * and VLAN indication.
+	 * bit6:4 for IP/L4 checksum errors.
+	 * bit12 is for RSS indication.
+	 * bit13 is for VLAN indication.
+	 */
+	const __m256i flags_mask =
+		 _mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
+	/**
+	 * data to be shuffled by the result of the flags mask shifted by 4
+	 * bits.  This gives use the l3_l4 flags.
+	 */
+	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+			/* shift right 1 bit to make sure it not exceed 255 */
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+			/* second 128-bits */
+			0, 0, 0, 0, 0, 0, 0, 0,
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
+	const __m256i cksum_mask =
+		 _mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+				   PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+				   PKT_RX_EIP_CKSUM_BAD);
+	/**
+	 * data to be shuffled by result of flag mask, shifted down 12.
+	 * If RSS(bit12)/VLAN(bit13) are set,
+	 * shuffle moves appropriate flags in place.
+	 */
+	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_RSS_HASH, 0,
+			/* end up 128-bits */
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_RSS_HASH, 0);
+
+	uint16_t i, received;
+
+	for (i = 0, received = 0; i < nb_pkts;
+	     i += IAVF_DESCS_PER_LOOP_AVX,
+	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
+		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+				    _mm256_loadu_si256((void *)&sw_ring[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256
+			((void *)&rx_pkts[i + 4],
+			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
+#endif
+
+		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
+
+		const __m128i raw_desc7 =
+			_mm_load_si128((void *)(rxdp + 7));
+		rte_compiler_barrier();
+		const __m128i raw_desc6 =
+			_mm_load_si128((void *)(rxdp + 6));
+		rte_compiler_barrier();
+		const __m128i raw_desc5 =
+			_mm_load_si128((void *)(rxdp + 5));
+		rte_compiler_barrier();
+		const __m128i raw_desc4 =
+			_mm_load_si128((void *)(rxdp + 4));
+		rte_compiler_barrier();
+		const __m128i raw_desc3 =
+			_mm_load_si128((void *)(rxdp + 3));
+		rte_compiler_barrier();
+		const __m128i raw_desc2 =
+			_mm_load_si128((void *)(rxdp + 2));
+		rte_compiler_barrier();
+		const __m128i raw_desc1 =
+			_mm_load_si128((void *)(rxdp + 1));
+		rte_compiler_barrier();
+		const __m128i raw_desc0 =
+			_mm_load_si128((void *)(rxdp + 0));
+
+		raw_desc6_7 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc6),
+				 raw_desc7, 1);
+		raw_desc4_5 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc4),
+				 raw_desc5, 1);
+		raw_desc2_3 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc2),
+				 raw_desc3, 1);
+		raw_desc0_1 =
+			_mm256_inserti128_si256
+				(_mm256_castsi128_si256(raw_desc0),
+				 raw_desc1, 1);
+
+		if (split_packet) {
+			int j;
+
+			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		/**
+		 * convert descriptors 4-7 into mbufs, re-arrange fields.
+		 * Then write into the mbuf.
+		 */
+		__m256i mb6_7 = _mm256_shuffle_epi8(raw_desc6_7, shuf_msk);
+		__m256i mb4_5 = _mm256_shuffle_epi8(raw_desc4_5, shuf_msk);
+
+		mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust);
+		mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust);
+		/**
+		 * to get packet types, ptype is located in bit16-25
+		 * of each 128bits
+		 */
+		const __m256i ptype_mask =
+			_mm256_set1_epi16(IAVF_RX_FLEX_DESC_PTYPE_M);
+		const __m256i ptypes6_7 =
+			_mm256_and_si256(raw_desc6_7, ptype_mask);
+		const __m256i ptypes4_5 =
+			_mm256_and_si256(raw_desc4_5, ptype_mask);
+		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
+		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
+		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
+		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
+
+		mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype7], 4);
+		mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype6], 0);
+		mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype5], 4);
+		mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype4], 0);
+		/* merge the status bits into one register */
+		const __m256i status4_7 = _mm256_unpackhi_epi32(raw_desc6_7,
+				raw_desc4_5);
+
+		/**
+		 * convert descriptors 0-3 into mbufs, re-arrange fields.
+		 * Then write into the mbuf.
+		 */
+		__m256i mb2_3 = _mm256_shuffle_epi8(raw_desc2_3, shuf_msk);
+		__m256i mb0_1 = _mm256_shuffle_epi8(raw_desc0_1, shuf_msk);
+
+		mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust);
+		mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust);
+		/**
+		 * to get packet types, ptype is located in bit16-25
+		 * of each 128bits
+		 */
+		const __m256i ptypes2_3 =
+			_mm256_and_si256(raw_desc2_3, ptype_mask);
+		const __m256i ptypes0_1 =
+			_mm256_and_si256(raw_desc0_1, ptype_mask);
+		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
+		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
+		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
+		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
+
+		mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype3], 4);
+		mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype2], 0);
+		mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype1], 4);
+		mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype0], 0);
+		/* merge the status bits into one register */
+		const __m256i status0_3 = _mm256_unpackhi_epi32(raw_desc2_3,
+								raw_desc0_1);
+
+		/**
+		 * take the two sets of status bits and merge to one
+		 * After merge, the packets status flags are in the
+		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+		 */
+		__m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
+							  status0_3);
+
+		/* now do flag manipulation */
+
+		/* get only flag/error bits we want */
+		const __m256i flag_bits =
+			_mm256_and_si256(status0_7, flags_mask);
+		/**
+		 * l3_l4_error flags, shuffle, then shift to correct adjustment
+		 * of flags in flags_shuf, and finally mask out extra bits
+		 */
+		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+				_mm256_srli_epi32(flag_bits, 4));
+		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+		/* set rss and vlan flags */
+		const __m256i rss_vlan_flag_bits =
+			_mm256_srli_epi32(flag_bits, 12);
+		const __m256i rss_vlan_flags =
+			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
+					    rss_vlan_flag_bits);
+
+		/* merge flags */
+		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
+				rss_vlan_flags);
+		/**
+		 * At this point, we have the 8 sets of flags in the low 16-bits
+		 * of each 32-bit value in vlan0.
+		 * We want to extract these, and merge them with the mbuf init
+		 * data so we can do a single write to the mbuf to set the flags
+		 * and all the other initialization fields. Extracting the
+		 * appropriate flags means that we have to do a shift and blend
+		 * for each mbuf before we do the write. However, we can also
+		 * add in the previously computed rx_descriptor fields to
+		 * make a single 256-bit write per mbuf
+		 */
+		/* check the structure matches expectations */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				 offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				 RTE_ALIGN(offsetof(struct rte_mbuf,
+						    rearm_data),
+					   16));
+		/* build up data and do writes */
+		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
+			rearm6, rearm7;
+		rearm6 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 8),
+					    0x04);
+		rearm4 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(mbuf_flags, 4),
+					    0x04);
+		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+		rearm0 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(mbuf_flags, 4),
+					    0x04);
+		/* permute to add in the rx_descriptor e.g. rss fields */
+		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		/* write to mbuf */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
+				    rearm6);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
+				    rearm4);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
+				    rearm2);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
+				    rearm0);
+
+		/* repeat for the odd mbufs */
+		const __m256i odd_flags =
+			_mm256_castsi128_si256
+				(_mm256_extracti128_si256(mbuf_flags, 1));
+		rearm7 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 8),
+					    0x04);
+		rearm5 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_slli_si256(odd_flags, 4),
+					    0x04);
+		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
+		rearm1 = _mm256_blend_epi32(mbuf_init,
+					    _mm256_srli_si256(odd_flags, 4),
+					    0x04);
+		/* since odd mbufs are already in hi 128-bits use blend */
+		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		/* again write to mbufs */
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
+				    rearm7);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
+				    rearm5);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
+				    rearm3);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
+				    rearm1);
+
+		/* extract and record EOP bit */
+		if (split_packet) {
+			const __m128i eop_mask =
+				_mm_set1_epi16(1 <<
+					       IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
+			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
+								     eop_check);
+			/* pack status bits into a single 128-bit register */
+			const __m128i eop_bits =
+				_mm_packus_epi32
+					(_mm256_castsi256_si128(eop_bits256),
+					 _mm256_extractf128_si256(eop_bits256,
+								  1));
+			/**
+			 * flip bits, and mask out the EOP bit, which is now
+			 * a split-packet bit i.e. !EOP, rather than EOP one.
+			 */
+			__m128i split_bits = _mm_andnot_si128(eop_bits,
+					eop_mask);
+			/**
+			 * eop bits are out of order, so we need to shuffle them
+			 * back into order again. In doing so, only use low 8
+			 * bits, which acts like another pack instruction
+			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
+			 * [Since we use epi8, the 16-bit positions are
+			 * multiplied by 2 in the eop_shuffle value.]
+			 */
+			__m128i eop_shuffle =
+				_mm_set_epi8(/* zero hi 64b */
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     0xFF, 0xFF, 0xFF, 0xFF,
+					     /* move values to lo 64b */
+					     8, 0, 10, 2,
+					     12, 4, 14, 6);
+			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
+			*(uint64_t *)split_packet =
+				_mm_cvtsi128_si64(split_bits);
+			split_packet += IAVF_DESCS_PER_LOOP_AVX;
+		}
+
+		/* perform dd_check */
+		status0_7 = _mm256_and_si256(status0_7, dd_check);
+		status0_7 = _mm256_packs_epi32(status0_7,
+					       _mm256_setzero_si256());
+
+		uint64_t burst = __builtin_popcountll
+					(_mm_cvtsi128_si64
+						(_mm256_extracti128_si256
+							(status0_7, 1)));
+		burst += __builtin_popcountll
+				(_mm_cvtsi128_si64
+					(_mm256_castsi256_si128(status0_7)));
+		received += burst;
+		if (burst != IAVF_DESCS_PER_LOOP_AVX)
+			break;
+	}
+
+	/* update tail pointers */
+	rxq->rx_tail += received;
+	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
+		rxq->rx_tail--;
+		received--;
+	}
+	rxq->rxrearm_nb += received;
+	return received;
+}
+
 /**
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
@@ -625,6 +1083,18 @@ iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _iavf_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
+/**
+ * Notice:
+ * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
+				 uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rx_queue, rx_pkts,
+						     nb_pkts, NULL);
+}
+
 /**
  * vPMD receive routine that reassembles single burst of 32 scattered packets
  * Notice:
@@ -690,6 +1160,75 @@ iavf_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 				rx_pkts + retval, nb_pkts);
 }
 
+/**
+ * vPMD receive routine that reassembles single burst of
+ * 32 scattered packets for flex RxD
+ * Notice:
+ * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
+ */
+static uint16_t
+iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
+					    struct rte_mbuf **rx_pkts,
+					    uint16_t nb_pkts)
+{
+	struct iavf_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rxq,
+					rx_pkts, nb_pkts, split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (!rxq->pkt_first_seg &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly*/
+	unsigned int i = 0;
+
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble then*/
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+					     &split_flags[i]);
+}
+
+/**
+ * vPMD receive routine that reassembles scattered packets for flex RxD.
+ * Main receive routine that can handle arbitrary burst sizes
+ * Notice:
+ * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec_avx2_flex_rxd(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
+		uint16_t burst =
+			iavf_recv_scattered_burst_vec_avx2_flex_rxd
+			(rx_queue, rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_MAX_BURST)
+			return retval;
+	}
+	return retval + iavf_recv_scattered_burst_vec_avx2_flex_rxd(rx_queue,
+				rx_pkts + retval, nb_pkts);
+}
+
 static inline void
 iavf_vtx1(volatile struct iavf_tx_desc *txdp,
 	  struct rte_mbuf *pkt, uint64_t flags)
-- 
2.17.1


  parent reply	other threads:[~2020-04-20  6:27 UTC|newest]

Thread overview: 80+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-03-16  7:45 [dpdk-dev] [PATCH 00/12] framework for advanced iAVF PMD Leyi Rong
2020-03-16  7:45 ` [dpdk-dev] [PATCH 01/12] net/iavf: remove 16B Rx descriptor compile option Leyi Rong
2020-03-16  7:45 ` [dpdk-dev] [PATCH 02/12] net/iavf: return error if opcode is mismatched Leyi Rong
2020-03-16  7:45 ` [dpdk-dev] [PATCH 03/12] net/iavf: support to query DDP package info Leyi Rong
2020-03-16  7:45 ` [dpdk-dev] [PATCH 04/12] net/iavf: flexible Rx descriptor support in normal path Leyi Rong
2020-03-26  6:50   ` Wu, Jingjing
2020-03-16  7:45 ` [dpdk-dev] [PATCH 05/12] net/iavf: flexible Rx descriptor support in AVX path Leyi Rong
2020-03-16  7:45 ` [dpdk-dev] [PATCH 06/12] net/iavf: flexible Rx descriptor support in SSE path Leyi Rong
2020-03-16  7:45 ` [dpdk-dev] [PATCH 07/12] net/iavf: add flow director enabled switch value Leyi Rong
2020-03-26  6:57   ` Wu, Jingjing
2020-03-16  7:45 ` [dpdk-dev] [PATCH 08/12] net/iavf: support flow mark in normal data path Leyi Rong
2020-03-16  7:46 ` [dpdk-dev] [PATCH 09/12] net/iavf: support flow mark in AVX path Leyi Rong
2020-03-16  7:46 ` [dpdk-dev] [PATCH 10/12] net/iavf: support flow mark in SSE path Leyi Rong
2020-03-16  7:46 ` [dpdk-dev] [PATCH 11/12] net/iavf: add RSS hash parsing in AVX path Leyi Rong
2020-03-16  7:46 ` [dpdk-dev] [PATCH 12/12] net/iavf: add RSS hash parsing in SSE path Leyi Rong
2020-03-31  8:02 ` [dpdk-dev] [PATCH v2 00/12] framework for advanced iAVF PMD Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 01/12] net/iavf: flexible Rx descriptor definitions Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 02/12] net/iavf: return error if opcode is mismatched Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 03/12] net/iavf: support to query DDP package info Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 04/12] net/iavf: flexible Rx descriptor support in normal path Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 05/12] net/iavf: flexible Rx descriptor support in AVX path Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 06/12] net/iavf: flexible Rx descriptor support in SSE path Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 07/12] net/iavf: add flow director enabled switch value Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 08/12] net/iavf: support flow mark in normal data path Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 09/12] net/iavf: support flow mark in AVX path Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 10/12] net/iavf: support flow mark in SSE path Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 11/12] net/iavf: add RSS hash parsing in AVX path Leyi Rong
2020-03-31  8:02   ` [dpdk-dev] [PATCH v2 12/12] net/iavf: add RSS hash parsing in SSE path Leyi Rong
2020-04-08  6:21 ` [dpdk-dev] [PATCH v3 00/11] framework for advanced iAVF PMD Leyi Rong
2020-04-08  6:22   ` [dpdk-dev] [PATCH v3 01/11] net/iavf: flexible Rx descriptor definitions Leyi Rong
2020-04-08  6:22   ` [dpdk-dev] [PATCH v3 02/11] net/iavf: return error if opcode is mismatched Leyi Rong
2020-04-08  6:22   ` [dpdk-dev] [PATCH v3 03/11] net/iavf: flexible Rx descriptor support in normal path Leyi Rong
2020-04-08  6:22   ` [dpdk-dev] [PATCH v3 04/11] net/iavf: flexible Rx descriptor support in AVX path Leyi Rong
2020-04-08  6:22   ` [dpdk-dev] [PATCH v3 05/11] net/iavf: flexible Rx descriptor support in SSE path Leyi Rong
2020-04-08  6:22   ` [dpdk-dev] [PATCH v3 06/11] net/iavf: add flow director enabled switch value Leyi Rong
2020-04-08  6:22   ` [dpdk-dev] [PATCH v3 07/11] net/iavf: support flow mark in normal data path Leyi Rong
2020-04-08  6:22   ` [dpdk-dev] [PATCH v3 08/11] net/iavf: support flow mark in AVX path Leyi Rong
2020-04-08  6:22   ` [dpdk-dev] [PATCH v3 09/11] net/iavf: support flow mark in SSE path Leyi Rong
2020-04-08  6:22   ` [dpdk-dev] [PATCH v3 10/11] net/iavf: add RSS hash parsing in AVX path Leyi Rong
2020-04-08  6:22   ` [dpdk-dev] [PATCH v3 11/11] net/iavf: add RSS hash parsing in SSE path Leyi Rong
2020-04-14  6:15 ` [dpdk-dev] [PATCH v4 00/11] framework for advanced iAVF PMD Leyi Rong
2020-04-14  6:15   ` [dpdk-dev] [PATCH v4 01/11] net/iavf: flexible Rx descriptor definitions Leyi Rong
2020-04-14  6:15   ` [dpdk-dev] [PATCH v4 02/11] net/iavf: return error if opcode is mismatched Leyi Rong
2020-04-14  6:15   ` [dpdk-dev] [PATCH v4 03/11] net/iavf: flexible Rx descriptor support in normal path Leyi Rong
2020-04-14  6:15   ` [dpdk-dev] [PATCH v4 04/11] net/iavf: flexible Rx descriptor support in AVX path Leyi Rong
2020-04-14  6:15   ` [dpdk-dev] [PATCH v4 05/11] net/iavf: flexible Rx descriptor support in SSE path Leyi Rong
2020-04-14  6:15   ` [dpdk-dev] [PATCH v4 06/11] net/iavf: add flow director enabled switch value Leyi Rong
2020-04-14  6:15   ` [dpdk-dev] [PATCH v4 07/11] net/iavf: support flow mark in normal data path Leyi Rong
2020-04-14  6:15   ` [dpdk-dev] [PATCH v4 08/11] net/iavf: support flow mark in AVX path Leyi Rong
2020-04-14  6:15   ` [dpdk-dev] [PATCH v4 09/11] net/iavf: support flow mark in SSE path Leyi Rong
2020-04-14  6:15   ` [dpdk-dev] [PATCH v4 10/11] net/iavf: add RSS hash parsing in AVX path Leyi Rong
2020-04-14  6:15   ` [dpdk-dev] [PATCH v4 11/11] net/iavf: add RSS hash parsing in SSE path Leyi Rong
2020-04-16  8:09 ` [dpdk-dev] [PATCH v5 00/11] framework for advanced iAVF PMD Leyi Rong
2020-04-16  8:09   ` [dpdk-dev] [PATCH v5 01/11] net/iavf: flexible Rx descriptor definitions Leyi Rong
2020-04-16  8:09   ` [dpdk-dev] [PATCH v5 02/11] net/iavf: return error if opcode is mismatched Leyi Rong
2020-04-16  8:09   ` [dpdk-dev] [PATCH v5 03/11] net/iavf: flexible Rx descriptor support in normal path Leyi Rong
2020-04-16  8:09   ` [dpdk-dev] [PATCH v5 04/11] net/iavf: flexible Rx descriptor support in AVX path Leyi Rong
2020-04-17 13:28     ` Ferruh Yigit
2020-04-16  8:09   ` [dpdk-dev] [PATCH v5 05/11] net/iavf: flexible Rx descriptor support in SSE path Leyi Rong
2020-04-16  8:09   ` [dpdk-dev] [PATCH v5 06/11] net/iavf: add flow director enabled switch value Leyi Rong
2020-04-16  8:09   ` [dpdk-dev] [PATCH v5 07/11] net/iavf: support flow mark in normal data path Leyi Rong
2020-04-16  8:09   ` [dpdk-dev] [PATCH v5 08/11] net/iavf: support flow mark in AVX path Leyi Rong
2020-04-16  8:09   ` [dpdk-dev] [PATCH v5 09/11] net/iavf: support flow mark in SSE path Leyi Rong
2020-04-16  8:09   ` [dpdk-dev] [PATCH v5 10/11] net/iavf: add RSS hash parsing in AVX path Leyi Rong
2020-04-16  8:09   ` [dpdk-dev] [PATCH v5 11/11] net/iavf: add RSS hash parsing in SSE path Leyi Rong
2020-04-16 12:21   ` [dpdk-dev] [PATCH v5 00/11] framework for advanced iAVF PMD Zhang, Qi Z
2020-04-17  4:11     ` Ye Xiaolong
2020-04-20  6:16 ` [dpdk-dev] [PATCH v6 " Leyi Rong
2020-04-20  6:16   ` [dpdk-dev] [PATCH v6 01/11] net/iavf: flexible Rx descriptor definitions Leyi Rong
2020-04-20  6:16   ` [dpdk-dev] [PATCH v6 02/11] net/iavf: return error if opcode is mismatched Leyi Rong
2020-04-20  6:16   ` [dpdk-dev] [PATCH v6 03/11] net/iavf: flexible Rx descriptor support in normal path Leyi Rong
2020-04-20  6:16   ` Leyi Rong [this message]
2020-04-20  6:16   ` [dpdk-dev] [PATCH v6 05/11] net/iavf: flexible Rx descriptor support in SSE path Leyi Rong
2020-04-20  6:16   ` [dpdk-dev] [PATCH v6 06/11] net/iavf: add flow director enabled switch value Leyi Rong
2020-04-20  6:16   ` [dpdk-dev] [PATCH v6 07/11] net/iavf: support flow mark in normal data path Leyi Rong
2020-04-20  6:16   ` [dpdk-dev] [PATCH v6 08/11] net/iavf: support flow mark in AVX path Leyi Rong
2020-04-20  6:16   ` [dpdk-dev] [PATCH v6 09/11] net/iavf: support flow mark in SSE path Leyi Rong
2020-04-20  6:16   ` [dpdk-dev] [PATCH v6 10/11] net/iavf: add RSS hash parsing in AVX path Leyi Rong
2020-04-20  6:16   ` [dpdk-dev] [PATCH v6 11/11] net/iavf: add RSS hash parsing in SSE path Leyi Rong
2020-04-21  3:14   ` [dpdk-dev] [PATCH v6 00/11] framework for advanced iAVF PMD Ye Xiaolong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200420061625.21424-5-leyi.rong@intel.com \
    --to=leyi.rong@intel.com \
    --cc=beilei.xing@intel.com \
    --cc=dev@dpdk.org \
    --cc=jingjing.wu@intel.com \
    --cc=qi.z.zhang@intel.com \
    --cc=xiaolong.ye@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.