* [PATCH 0/7] net/mlx5: improve single core performance
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC (permalink / raw)
  To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil

This series applies on top of
"[PATCH] eal: define generic vector types" [1][2]

Using built-in vector types forces compilers to consider SIMD instructions in
specific places in order to improve performance on both IBM POWER8 and Intel
architectures.

For example, testpmd single-thread I/O forwarding packets per second
performance is improved by 6% on Intel platforms.

 [1] http://dpdk.org/ml/archives/dev/2016-November/050261.html
 [2] http://dpdk.org/dev/patchwork/patch/17024/
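
As background for the numbers above, the mechanism can be sketched
outside of DPDK with the same GCC/clang vector extension the EAL patch
relies on; the type below mimics rte_v128u32_t, and all names and
values are illustrative only:

  #include <stdint.h>
  #include <stdio.h>

  /* Same shape as rte_v128u32_t from the EAL patch: four uint32_t
   * packed in a single 128-bit vector. */
  typedef uint32_t v128u32 __attribute__((vector_size(16)));

  int main(void)
  {
      uint32_t out[4] __attribute__((aligned(16)));

      /* One 128-bit store instead of four 32-bit stores; the
       * compiler may lower it to SSE on x86 or VSX on POWER8. */
      *(v128u32 *)out = (v128u32){ 1, 2, 3, 4 };
      printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
      return 0;
  }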

Nelio Laranjeiro (7):
  net/mlx5: prepare Tx vectorization
  net/mlx5: use work queue buffer as a raw buffer
  net/mlx5: use vector types to speed up processing
  net/mlx5: fix missing inline attributes
  net/mlx5: move static prototype
  net/mlx5: optimize copy of Ethernet header
  net/mlx5: remove inefficient prefetching

 drivers/net/mlx5/mlx5_prm.h  |  20 +++-
 drivers/net/mlx5/mlx5_rxtx.c | 243 +++++++++++++++++++++++--------------------
 drivers/net/mlx5/mlx5_rxtx.h |   2 +-
 drivers/net/mlx5/mlx5_txq.c  |   8 +-
 4 files changed, 150 insertions(+), 123 deletions(-)

-- 
2.1.4


* [PATCH 1/7] net/mlx5: prepare Tx vectorization
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC (permalink / raw)
  To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil, Elad Persiko

Prepare the code to write Work Queue Elements (WQEs) with vectorized
instructions.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Elad Persiko <eladpe@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 44 ++++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index ffd09ac..5dacd93 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -391,6 +391,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		uint32_t length;
 		unsigned int ds = 0;
 		uintptr_t addr;
+		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
+		uint8_t ehdr[2];
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		uint32_t total_length = 0;
 #endif
@@ -416,6 +418,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			rte_prefetch0(*pkts);
 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
 		length = DATA_LEN(buf);
+		ehdr[0] = ((uint8_t *)addr)[0];
+		ehdr[1] = ((uint8_t *)addr)[1];
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		total_length = length;
 #endif
@@ -439,24 +443,20 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		} else {
 			wqe->eseg.cs_flags = 0;
 		}
-		raw  = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
-		/* Start the know and common part of the WQE structure. */
-		wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
-		wqe->ctrl[2] = 0;
-		wqe->ctrl[3] = 0;
-		wqe->eseg.rsvd0 = 0;
-		wqe->eseg.rsvd1 = 0;
-		wqe->eseg.mss = 0;
-		wqe->eseg.rsvd2 = 0;
-		/* Start by copying the Ethernet Header. */
-		memcpy((uint8_t *)raw, ((uint8_t *)addr), 16);
+		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
+		/*
+		 * Start by copying the Ethernet header minus the first two
+		 * bytes which will be appended at the end of the Ethernet
+		 * segment.
+		 */
+		memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2, 16);
 		length -= MLX5_WQE_DWORD_SIZE;
 		addr += MLX5_WQE_DWORD_SIZE;
 		/* Replace the Ethernet type by the VLAN if necessary. */
 		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
 			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
 
-			memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE -
+			memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE - 2 -
 					   sizeof(vlan)),
 			       &vlan, sizeof(vlan));
 			addr -= sizeof(vlan);
@@ -468,10 +468,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				(uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
 			uint16_t max_inline =
 				txq->max_inline * RTE_CACHE_LINE_SIZE;
-			uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
 			uint16_t room;
 
-			raw += MLX5_WQE_DWORD_SIZE;
+			/*
+			 * raw starts two bytes before the boundary to
+			 * continue the above copy of packet data.
+			 */
+			raw += MLX5_WQE_DWORD_SIZE - 2;
 			room = end - (uintptr_t)raw;
 			if (room > max_inline) {
 				uintptr_t addr_end = (addr + max_inline) &
@@ -487,8 +490,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				/* Sanity check. */
 				assert(addr <= addr_end);
 			}
-			/* Store the inlined packet size in the WQE. */
-			wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
 			/*
 			 * 2 DWORDs consumed by the WQE header + 1 DSEG +
 			 * the size of the inline part of the packet.
@@ -570,7 +571,18 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			--pkts_n;
 next_pkt:
 		++i;
+		/* Initialize known and common part of the WQE structure. */
+		wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
 		wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
+		wqe->ctrl[2] = 0;
+		wqe->ctrl[3] = 0;
+		wqe->eseg.rsvd0 = 0;
+		wqe->eseg.rsvd1 = 0;
+		wqe->eseg.mss = 0;
+		wqe->eseg.rsvd2 = 0;
+		wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
+		wqe->eseg.inline_hdr[0] = ehdr[0];
+		wqe->eseg.inline_hdr[1] = ehdr[1];
 		txq->wqe_ci += (ds + 3) / 4;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
-- 
2.1.4

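To make the two-byte trick in this patch easier to follow: the Ethernet
segment ends with a 2-byte inline_hdr field just before a 16-byte
boundary, so the copy starts at offset 2 of the packet and the stashed
bytes are written last, together with the known part of the WQE. A
standalone sketch, with illustrative offsets rather than the driver's
real structures:

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
      uint8_t pkt[18] = "0123456789abcdef#";
      uint8_t wqe[64] = { 0 };
      uint8_t *inline_hdr = wqe + 30; /* last 2 bytes of the eseg */
      uint8_t *raw = wqe + 32;        /* 2 * 16-byte dwords in    */
      uint8_t ehdr[2] = { pkt[0], pkt[1] };

      /* Copy bytes 2..17 now... */
      memcpy(raw, pkt + 2, 16);
      /* ...and append the stashed first two bytes afterwards, when
       * the known part of the WQE is initialized. */
      inline_hdr[0] = ehdr[0];
      inline_hdr[1] = ehdr[1];
      printf("%.2s%.16s\n", (char *)inline_hdr, (char *)raw);
      return 0;
  }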

* [PATCH 2/7] net/mlx5: use work queue buffer as a raw buffer
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC (permalink / raw)
  To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil

Define a single work queue element type that encompasses all the
segment types.  It groups the control segment, the Ethernet segment and
the raw data in a single place.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_prm.h  |  13 ++++--
 drivers/net/mlx5/mlx5_rxtx.c | 103 ++++++++++++++++++++++---------------------
 drivers/net/mlx5/mlx5_rxtx.h |   2 +-
 drivers/net/mlx5/mlx5_txq.c  |   8 ++--
 4 files changed, 68 insertions(+), 58 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 7f31a2f..3dd4cbe 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -114,12 +114,19 @@ struct mlx5_wqe_eth_seg_small {
 	uint32_t rsvd2;
 	uint16_t inline_hdr_sz;
 	uint8_t inline_hdr[2];
-};
+} __rte_aligned(MLX5_WQE_DWORD_SIZE);
 
 struct mlx5_wqe_inl_small {
 	uint32_t byte_cnt;
 	uint8_t raw;
-};
+} __rte_aligned(MLX5_WQE_DWORD_SIZE);
+
+struct mlx5_wqe_ctrl {
+	uint32_t ctrl0;
+	uint32_t ctrl1;
+	uint32_t ctrl2;
+	uint32_t ctrl3;
+} __rte_aligned(MLX5_WQE_DWORD_SIZE);
 
 /* Small common part of the WQE. */
 struct mlx5_wqe {
@@ -131,7 +138,7 @@ struct mlx5_wqe {
 struct mlx5_wqe64 {
 	struct mlx5_wqe hdr;
 	uint8_t raw[32];
-} __rte_aligned(64);
+} __rte_aligned(MLX5_WQE_SIZE);
 
 /* MPW session status. */
 enum mlx5_mpw_state {
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 5dacd93..ada8e74 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -154,6 +154,24 @@ check_cqe(volatile struct mlx5_cqe *cqe,
 	return 0;
 }
 
+/**
+ * Return the address of the WQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param  wqe_ci
+ *   WQE consumer index.
+ *
+ * @return
+ *   WQE address.
+ */
+static inline uintptr_t *
+tx_mlx5_wqe(struct txq *txq, uint16_t ci)
+{
+	ci &= ((1 << txq->wqe_n) - 1);
+	return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
+}
+
 static inline void
 txq_complete(struct txq *txq) __attribute__((always_inline));
 
@@ -175,7 +193,7 @@ txq_complete(struct txq *txq)
 	uint16_t elts_tail;
 	uint16_t cq_ci = txq->cq_ci;
 	volatile struct mlx5_cqe *cqe = NULL;
-	volatile struct mlx5_wqe *wqe;
+	volatile struct mlx5_wqe_ctrl *ctrl;
 
 	do {
 		volatile struct mlx5_cqe *tmp;
@@ -201,9 +219,9 @@ txq_complete(struct txq *txq)
 	} while (1);
 	if (unlikely(cqe == NULL))
 		return;
-	wqe = &(*txq->wqes)[ntohs(cqe->wqe_counter) &
-			    ((1 << txq->wqe_n) - 1)].hdr;
-	elts_tail = wqe->ctrl[3];
+	ctrl = (volatile struct mlx5_wqe_ctrl *)
+		tx_mlx5_wqe(txq, ntohs(cqe->wqe_counter));
+	elts_tail = ctrl->ctrl3;
 	assert(elts_tail < (1 << txq->wqe_n));
 	/* Free buffers. */
 	while (elts_free != elts_tail) {
@@ -331,23 +349,6 @@ tx_prefetch_cqe(struct txq *txq, uint16_t ci)
 }
 
 /**
- * Prefetch a WQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param  wqe_ci
- *   WQE consumer index.
- */
-static inline void
-tx_prefetch_wqe(struct txq *txq, uint16_t ci)
-{
-	volatile struct mlx5_wqe64 *wqe;
-
-	wqe = &(*txq->wqes)[ci & ((1 << txq->wqe_n) - 1)];
-	rte_prefetch0(wqe);
-}
-
-/**
  * DPDK callback for TX.
  *
  * @param dpdk_txq
@@ -411,9 +412,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		--segs_n;
 		if (!segs_n)
 			--pkts_n;
-		wqe = &(*txq->wqes)[txq->wqe_ci &
-				    ((1 << txq->wqe_n) - 1)].hdr;
-		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+		wqe = (volatile struct mlx5_wqe *)
+			tx_mlx5_wqe(txq, txq->wqe_ci);
+		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
 		if (pkts_n > 1)
 			rte_prefetch0(*pkts);
 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
@@ -464,8 +465,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		}
 		/* Inline if enough room. */
 		if (txq->max_inline != 0) {
-			uintptr_t end =
-				(uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
+			uintptr_t end = (uintptr_t)
+				(((uintptr_t)txq->wqes) +
+				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
 			uint16_t max_inline =
 				txq->max_inline * RTE_CACHE_LINE_SIZE;
 			uint16_t room;
@@ -496,12 +498,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			 */
 			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
 			if (length > 0) {
-				dseg = (struct mlx5_wqe_data_seg *)
+				dseg = (volatile struct mlx5_wqe_data_seg *)
 					((uintptr_t)wqe +
 					 (ds * MLX5_WQE_DWORD_SIZE));
 				if ((uintptr_t)dseg >= end)
-					dseg = (struct mlx5_wqe_data_seg *)
-						((uintptr_t)&(*txq->wqes)[0]);
+					dseg = (volatile struct
+						mlx5_wqe_data_seg *)
+					       txq->wqes;
 				goto use_dseg;
 			} else if (!segs_n) {
 				goto next_pkt;
@@ -514,12 +517,12 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			 * Ethernet Header as been stored.
 			 */
 			wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
-			dseg = (struct mlx5_wqe_data_seg *)
+			dseg = (volatile struct mlx5_wqe_data_seg *)
 				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
 			ds = 3;
 use_dseg:
 			/* Add the remaining packet as a simple ds. */
-			*dseg = (struct mlx5_wqe_data_seg) {
+			*dseg = (volatile struct mlx5_wqe_data_seg) {
 				.addr = htonll(addr),
 				.byte_count = htonl(length),
 				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
@@ -542,9 +545,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
 				((1 << txq->wqe_n) - 1);
 
-			dseg = (struct mlx5_wqe_data_seg *)
-				((uintptr_t)&(*txq->wqes)[n]);
-			tx_prefetch_wqe(txq, n + 1);
+			dseg = (volatile struct mlx5_wqe_data_seg *)
+			       tx_mlx5_wqe(txq, n);
+			rte_prefetch0(tx_mlx5_wqe(txq, n + 1));
 		} else {
 			++dseg;
 		}
@@ -556,7 +559,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		total_length += length;
 #endif
 		/* Store segment information. */
-		*dseg = (struct mlx5_wqe_data_seg) {
+		*dseg = (volatile struct mlx5_wqe_data_seg) {
 			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
 			.byte_count = htonl(length),
 			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
@@ -629,13 +632,13 @@ mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
 		(volatile struct mlx5_wqe_data_seg (*)[])
-		(uintptr_t)&(*txq->wqes)[(idx + 1) & ((1 << txq->wqe_n) - 1)];
+		tx_mlx5_wqe(txq, idx + 1);
 
 	mpw->state = MLX5_MPW_STATE_OPENED;
 	mpw->pkts_n = 0;
 	mpw->len = length;
 	mpw->total_len = 0;
-	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
+	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
 	mpw->wqe->eseg.mss = htons(length);
 	mpw->wqe->eseg.inline_hdr_sz = 0;
 	mpw->wqe->eseg.rsvd0 = 0;
@@ -677,8 +680,8 @@ mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
 		++txq->wqe_ci;
 	else
 		txq->wqe_ci += 2;
-	tx_prefetch_wqe(txq, txq->wqe_ci);
-	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
+	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
 }
 
 /**
@@ -712,8 +715,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		return 0;
 	/* Prefetch first packet cacheline. */
 	tx_prefetch_cqe(txq, txq->cq_ci);
-	tx_prefetch_wqe(txq, txq->wqe_ci);
-	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
+	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
 	/* Start processing. */
 	txq_complete(txq);
 	max = (elts_n - (elts_head - txq->elts_tail));
@@ -841,7 +844,7 @@ mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 	mpw->pkts_n = 0;
 	mpw->len = length;
 	mpw->total_len = 0;
-	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
+	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
 	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
 				  (txq->wqe_ci << 8) |
 				  MLX5_OPCODE_TSO);
@@ -917,8 +920,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 		return 0;
 	/* Prefetch first packet cacheline. */
 	tx_prefetch_cqe(txq, txq->cq_ci);
-	tx_prefetch_wqe(txq, txq->wqe_ci);
-	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
+	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
 	/* Start processing. */
 	txq_complete(txq);
 	max = (elts_n - (elts_head - txq->elts_tail));
@@ -1019,14 +1022,15 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			(*txq->elts)[elts_head] = buf;
 			/* Maximum number of bytes before wrapping. */
-			max = ((uintptr_t)&(*txq->wqes)[1 << txq->wqe_n] -
+			max = ((((uintptr_t)(txq->wqes)) +
+				(1 << txq->wqe_n) *
+				MLX5_WQE_SIZE) -
 			       (uintptr_t)mpw.data.raw);
 			if (length > max) {
 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
 					   (void *)addr,
 					   max);
-				mpw.data.raw =
-					(volatile void *)&(*txq->wqes)[0];
+				mpw.data.raw = (volatile void *)txq->wqes;
 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
 					   (void *)(addr + max),
 					   length - max);
@@ -1038,9 +1042,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 				mpw.data.raw += length;
 			}
 			if ((uintptr_t)mpw.data.raw ==
-			    (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n])
-				mpw.data.raw =
-					(volatile void *)&(*txq->wqes)[0];
+			    (uintptr_t)tx_mlx5_wqe(txq, 1 << txq->wqe_n))
+				mpw.data.raw = (volatile void *)txq->wqes;
 			++mpw.pkts_n;
 			++j;
 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 8f2cddb..b9b90a7 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -259,7 +259,7 @@ struct txq {
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
-	volatile struct mlx5_wqe64 (*wqes)[]; /* Work queue. */
+	volatile void *wqes; /* Work queue (use volatile to write into). */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
 	volatile void *bf_reg; /* Blueflame register. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 053665d..f4c6682 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -82,7 +82,9 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 	for (i = 0; (i != elts_n); ++i)
 		(*txq_ctrl->txq.elts)[i] = NULL;
 	for (i = 0; (i != (1u << txq_ctrl->txq.wqe_n)); ++i) {
-		volatile struct mlx5_wqe64 *wqe = &(*txq_ctrl->txq.wqes)[i];
+		volatile struct mlx5_wqe64 *wqe =
+			(volatile struct mlx5_wqe64 *)
+			txq_ctrl->txq.wqes + i;
 
 		memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
 	}
@@ -214,9 +216,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 	}
 	tmpl->txq.cqe_n = log2above(ibcq->cqe);
 	tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
-	tmpl->txq.wqes =
-		(volatile struct mlx5_wqe64 (*)[])
-		(uintptr_t)qp->gen_data.sqstart;
+	tmpl->txq.wqes = qp->gen_data.sqstart;
 	tmpl->txq.wqe_n = log2above(qp->sq.wqe_cnt);
 	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
 	tmpl->txq.bf_reg = qp->gen_data.bf->reg;
-- 
2.1.4

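The new tx_mlx5_wqe() helper is just masked index arithmetic on a raw
buffer. A self-contained sketch, where a plain array stands in for the
real send queue:

  #include <stdint.h>
  #include <stdio.h>

  #define MLX5_WQE_SIZE 64

  /* Ring of 2^wqe_n fixed-size WQEs addressed through a raw
   * pointer, as done by tx_mlx5_wqe() in the patch. */
  static inline uintptr_t *
  tx_wqe(volatile void *wqes, unsigned int wqe_n, uint16_t ci)
  {
      ci &= (1 << wqe_n) - 1; /* wrap the consumer index */
      return (uintptr_t *)((uintptr_t)wqes + ci * MLX5_WQE_SIZE);
  }

  int main(void)
  {
      static uint8_t ring[8 * MLX5_WQE_SIZE];

      /* Index 9 wraps to slot 1 in an 8-entry (wqe_n = 3) ring,
       * so both calls print the same address. */
      printf("%p %p\n", (void *)tx_wqe(ring, 3, 9),
             (void *)tx_wqe(ring, 3, 1));
      return 0;
  }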

* [PATCH 3/7] net/mlx5: use vector types to speed up processing
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC (permalink / raw)
  To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil

Let the compiler automatically use the vector capabilities of the
target machine to optimize the generated instructions.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_prm.h  |  7 +++++
 drivers/net/mlx5/mlx5_rxtx.c | 74 +++++++++++++++++++++++---------------------
 2 files changed, 46 insertions(+), 35 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 3dd4cbe..9cd9fdf 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -44,6 +44,7 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
+#include <rte_vect.h>
 #include "mlx5_autoconf.h"
 
 /* Get CQE owner bit. */
@@ -134,6 +135,12 @@ struct mlx5_wqe {
 	struct mlx5_wqe_eth_seg_small eseg;
 };
 
+/* Vectorize WQE header. */
+struct mlx5_wqe_v {
+	rte_v128u32_t ctrl;
+	rte_v128u32_t eseg;
+};
+
 /* WQE. */
 struct mlx5_wqe64 {
 	struct mlx5_wqe hdr;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index ada8e74..e161cd9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -371,7 +371,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int j = 0;
 	unsigned int max;
 	unsigned int comp;
-	volatile struct mlx5_wqe *wqe = NULL;
+	volatile struct mlx5_wqe_v *wqe = NULL;
 	unsigned int segs_n = 0;
 	struct rte_mbuf *buf = NULL;
 	uint8_t *raw;
@@ -388,12 +388,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (max > elts_n)
 		max -= elts_n;
 	do {
-		volatile struct mlx5_wqe_data_seg *dseg = NULL;
+		volatile rte_v128u32_t *dseg = NULL;
 		uint32_t length;
 		unsigned int ds = 0;
 		uintptr_t addr;
 		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
 		uint8_t ehdr[2];
+		uint8_t cs_flags = 0;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		uint32_t total_length = 0;
 #endif
@@ -412,7 +413,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		--segs_n;
 		if (!segs_n)
 			--pkts_n;
-		wqe = (volatile struct mlx5_wqe *)
+		wqe = (volatile struct mlx5_wqe_v *)
 			tx_mlx5_wqe(txq, txq->wqe_ci);
 		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
 		if (pkts_n > 1)
@@ -438,11 +439,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
-			wqe->eseg.cs_flags =
-				MLX5_ETH_WQE_L3_CSUM |
-				MLX5_ETH_WQE_L4_CSUM;
-		} else {
-			wqe->eseg.cs_flags = 0;
+			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
 		}
 		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
 		/*
@@ -498,12 +495,11 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			 */
 			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
 			if (length > 0) {
-				dseg = (volatile struct mlx5_wqe_data_seg *)
+				dseg = (volatile rte_v128u32_t *)
 					((uintptr_t)wqe +
 					 (ds * MLX5_WQE_DWORD_SIZE));
 				if ((uintptr_t)dseg >= end)
-					dseg = (volatile struct
-						mlx5_wqe_data_seg *)
+					dseg = (volatile rte_v128u32_t *)
 					       txq->wqes;
 				goto use_dseg;
 			} else if (!segs_n) {
@@ -516,16 +512,17 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			 * No inline has been done in the packet, only the
 			 * Ethernet Header as been stored.
 			 */
-			wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
-			dseg = (volatile struct mlx5_wqe_data_seg *)
+			dseg = (volatile rte_v128u32_t *)
 				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
 			ds = 3;
 use_dseg:
 			/* Add the remaining packet as a simple ds. */
-			*dseg = (volatile struct mlx5_wqe_data_seg) {
-				.addr = htonll(addr),
-				.byte_count = htonl(length),
-				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+			addr = htonll(addr);
+			*dseg = (rte_v128u32_t){
+				htonl(length),
+				txq_mp2mr(txq, txq_mb2mp(buf)),
+				addr,
+				addr >> 32,
 			};
 			++ds;
 			if (!segs_n)
@@ -545,7 +542,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
 				((1 << txq->wqe_n) - 1);
 
-			dseg = (volatile struct mlx5_wqe_data_seg *)
+			dseg = (volatile rte_v128u32_t *)
 			       tx_mlx5_wqe(txq, n);
 			rte_prefetch0(tx_mlx5_wqe(txq, n + 1));
 		} else {
@@ -559,10 +556,12 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		total_length += length;
 #endif
 		/* Store segment information. */
-		*dseg = (volatile struct mlx5_wqe_data_seg) {
-			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
-			.byte_count = htonl(length),
-			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+		addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+		*dseg = (rte_v128u32_t){
+			htonl(length),
+			txq_mp2mr(txq, txq_mb2mp(buf)),
+			addr,
+			addr >> 32,
 		};
 		(*txq->elts)[elts_head] = buf;
 		elts_head = (elts_head + 1) & (elts_n - 1);
@@ -575,17 +574,19 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 next_pkt:
 		++i;
 		/* Initialize known and common part of the WQE structure. */
-		wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
-		wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
-		wqe->ctrl[2] = 0;
-		wqe->ctrl[3] = 0;
-		wqe->eseg.rsvd0 = 0;
-		wqe->eseg.rsvd1 = 0;
-		wqe->eseg.mss = 0;
-		wqe->eseg.rsvd2 = 0;
-		wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
-		wqe->eseg.inline_hdr[0] = ehdr[0];
-		wqe->eseg.inline_hdr[1] = ehdr[1];
+		wqe->ctrl = (rte_v128u32_t){
+			htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
+			htonl(txq->qp_num_8s | ds),
+			0,
+			0,
+		};
+		wqe->eseg = (rte_v128u32_t){
+			0,
+			cs_flags,
+			0,
+			(ehdr[1] << 24) | (ehdr[0] << 16) |
+			htons(pkt_inline_sz),
+		};
 		txq->wqe_ci += (ds + 3) / 4;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
@@ -598,10 +599,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	/* Check whether completion threshold has been reached. */
 	comp = txq->elts_comp + i + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
+		volatile struct mlx5_wqe_ctrl *w =
+			(volatile struct mlx5_wqe_ctrl *)wqe;
+
 		/* Request completion on last WQE. */
-		wqe->ctrl[2] = htonl(8);
+		w->ctrl2 = htonl(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
+		w->ctrl3 = elts_head;
 		txq->elts_comp = 0;
 	} else {
 		txq->elts_comp = comp;
-- 
2.1.4

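Here is a sketch of the new data-segment store using the same four-word
layout as the patch; cpu_to_be64() stands in for DPDK's htonll() and
assumes a little-endian host, and naddr matches the 64-bit variable
introduced by the 32-bit fix later in this thread:

  #include <stdint.h>
  #include <stdio.h>
  #include <arpa/inet.h> /* htonl() */

  typedef uint32_t v128u32 __attribute__((vector_size(16)));

  /* Stand-in for DPDK's htonll() on a little-endian host. */
  static uint64_t cpu_to_be64(uint64_t x)
  {
      return ((uint64_t)htonl((uint32_t)x) << 32) | htonl(x >> 32);
  }

  int main(void)
  {
      uint8_t dseg[16] __attribute__((aligned(16)));
      uint64_t naddr = cpu_to_be64(0x1234567890abcdefULL);
      uint32_t length = 64, lkey = 7;

      /* Same field order as the patch: byte count, lkey, then the
       * big-endian address split into two 32-bit halves. */
      *(v128u32 *)dseg = (v128u32){
          htonl(length),
          lkey,
          (uint32_t)naddr,
          (uint32_t)(naddr >> 32),
      };
      printf("byte_count word: 0x%08x\n", *(uint32_t *)dseg);
      return 0;
  }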

* [PATCH 4/7] net/mlx5: fix missing inline attributes
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC (permalink / raw)
  To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil, stable

These functions must be forced inline for better performance.

Fixes: 99c12dcca65d ("net/mlx5: handle Rx CQE compression")
Fixes: 1d88ba171942 ("net/mlx5: refactor Tx data path")
Fixes: 67fa62bc672d ("mlx5: support checksum offload")

CC: stable@dpdk.org
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index e161cd9..52733da 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -71,6 +71,31 @@
 #include "mlx5_defs.h"
 #include "mlx5_prm.h"
 
+static inline int
+check_cqe(volatile struct mlx5_cqe *cqe,
+	  unsigned int cqes_n, const uint16_t ci)
+	  __attribute__((always_inline));
+
+static inline uint32_t
+txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+	__attribute__((always_inline));
+
+static inline void
+mlx5_tx_dbrec(struct txq *txq) __attribute__((always_inline));
+
+static inline uint32_t
+rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
+	__attribute__((always_inline));
+
+static inline int
+mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
+		 uint16_t cqe_cnt, uint32_t *rss_hash)
+		 __attribute__((always_inline));
+
+static inline uint32_t
+rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
+		   __attribute__((always_inline));
+
 #ifndef NDEBUG
 
 /**
@@ -100,11 +125,6 @@ check_cqe_seen(volatile struct mlx5_cqe *cqe)
 
 #endif /* NDEBUG */
 
-static inline int
-check_cqe(volatile struct mlx5_cqe *cqe,
-	  unsigned int cqes_n, const uint16_t ci)
-	  __attribute__((always_inline));
-
 /**
  * Check whether CQE is valid.
  *
@@ -266,10 +286,6 @@ txq_mb2mp(struct rte_mbuf *buf)
 	return buf->pool;
 }
 
-static inline uint32_t
-txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
-	__attribute__((always_inline));
-
 /**
  * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
  * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
-- 
2.1.4

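The pattern used throughout this patch is a forward declaration
carrying the attribute, so the compiler inlines every call site
regardless of its size heuristics. A minimal sketch of that pattern:

  #include <stdio.h>

  /* Declare the prototype with always_inline before first use;
   * the later definition inherits the attribute. */
  static inline int add_one(int x) __attribute__((always_inline));

  static inline int add_one(int x)
  {
      return x + 1;
  }

  int main(void)
  {
      printf("%d\n", add_one(41));
      return 0;
  }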

* [PATCH 5/7] net/mlx5: move static prototype
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC (permalink / raw)
  To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil

Gather function prototypes at the beginning of the file.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 52733da..0d0b807 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -76,6 +76,9 @@ check_cqe(volatile struct mlx5_cqe *cqe,
 	  unsigned int cqes_n, const uint16_t ci)
 	  __attribute__((always_inline));
 
+static inline void
+txq_complete(struct txq *txq) __attribute__((always_inline));
+
 static inline uint32_t
 txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
 	__attribute__((always_inline));
@@ -192,9 +195,6 @@ tx_mlx5_wqe(struct txq *txq, uint16_t ci)
 	return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
 }
 
-static inline void
-txq_complete(struct txq *txq) __attribute__((always_inline));
-
 /**
  * Manage TX completions.
  *
-- 
2.1.4


* [PATCH 6/7] net/mlx5: optimize copy of Ethernet header
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC (permalink / raw)
  To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil

Use fewer instructions to copy the first two bytes of Ethernet headers to
work queue elements.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 0d0b807..4b8c197 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -409,7 +409,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		unsigned int ds = 0;
 		uintptr_t addr;
 		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
-		uint8_t ehdr[2];
+		uint16_t ehdr;
 		uint8_t cs_flags = 0;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		uint32_t total_length = 0;
@@ -436,8 +436,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			rte_prefetch0(*pkts);
 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
 		length = DATA_LEN(buf);
-		ehdr[0] = ((uint8_t *)addr)[0];
-		ehdr[1] = ((uint8_t *)addr)[1];
+		ehdr = (((uint8_t *)addr)[1] << 8) |
+		       ((uint8_t *)addr)[0];
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		total_length = length;
 #endif
@@ -600,8 +600,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			0,
 			cs_flags,
 			0,
-			(ehdr[1] << 24) | (ehdr[0] << 16) |
-			htons(pkt_inline_sz),
+			(ehdr << 16) | htons(pkt_inline_sz),
 		};
 		txq->wqe_ci += (ds + 3) / 4;
 #ifdef MLX5_PMD_SOFT_COUNTERS
-- 
2.1.4

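A sketch of the byte packing this patch switches to; the values are
illustrative:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      /* First two bytes of the Ethernet header. */
      uint8_t addr[2] = { 0xaa, 0xbb };
      /* One 16-bit value instead of a two-byte array, built
       * exactly as in the patch (second byte in the high half). */
      uint16_t ehdr = (addr[1] << 8) | addr[0];

      printf("0x%04x\n", ehdr); /* prints 0xbbaa */
      return 0;
  }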

* [PATCH 7/7] net/mlx5: remove inefficient prefetching
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC (permalink / raw)
  To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil

Prefetching completion queue entries is inefficient because too few CPU
cycles elapse between the prefetch and their use, which results in cache
misses anyway.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 4b8c197..9f74fd4 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -348,23 +348,6 @@ mlx5_tx_dbrec(struct txq *txq)
 }
 
 /**
- * Prefetch a CQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param cqe_ci
- *   CQE consumer index.
- */
-static inline void
-tx_prefetch_cqe(struct txq *txq, uint16_t ci)
-{
-	volatile struct mlx5_cqe *cqe;
-
-	cqe = &(*txq->cqes)[ci & ((1 << txq->cqe_n) - 1)];
-	rte_prefetch0(cqe);
-}
-
-/**
  * DPDK callback for TX.
  *
  * @param dpdk_txq
@@ -395,8 +378,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (unlikely(!pkts_n))
 		return 0;
 	/* Prefetch first packet cacheline. */
-	tx_prefetch_cqe(txq, txq->cq_ci);
-	tx_prefetch_cqe(txq, txq->cq_ci + 1);
 	rte_prefetch0(*pkts);
 	/* Start processing. */
 	txq_complete(txq);
@@ -733,7 +714,6 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (unlikely(!pkts_n))
 		return 0;
 	/* Prefetch first packet cacheline. */
-	tx_prefetch_cqe(txq, txq->cq_ci);
 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
 	/* Start processing. */
@@ -938,7 +918,6 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 	if (unlikely(!pkts_n))
 		return 0;
 	/* Prefetch first packet cacheline. */
-	tx_prefetch_cqe(txq, txq->cq_ci);
 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
 	/* Start processing. */
-- 
2.1.4

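For context, a portable stand-in for what rte_prefetch0() does (DPDK
itself uses per-architecture intrinsics). The point of the patch is
that a prefetch only pays off when enough independent work runs between
it and the dependent load:

  #include <stdint.h>

  /* Read prefetch into all cache levels (locality hint 3). */
  static inline void prefetch0(const volatile void *p)
  {
      __builtin_prefetch((const void *)p, 0 /* read */, 3);
  }

  int main(void)
  {
      static uint64_t ring[64];

      prefetch0(&ring[0]);
      /* ...enough independent work must happen here, otherwise
       * the load below misses the cache anyway... */
      return (int)ring[0];
  }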

* Re: [PATCH 0/7] net/mlx5: improve single core performance
From: Ferruh Yigit @ 2017-01-05 14:13 UTC (permalink / raw)
  To: Nelio Laranjeiro, dev; +Cc: Thomas Monjalon, Adrien Mazarguil

On 11/24/2016 4:03 PM, Nelio Laranjeiro wrote:
> This series applies on top of
> "[PATCH] eal: define generic vector types" [1][2]
> 
> Using built-in vector types forces compilers to consider SIMD instructions in
> specific places in order to improve performance on both IBM POWER8 and Intel
> architectures.
> 
> For example, testpmd single-thread I/O forwarding packets per second
> performance is improved by 6% on Intel platforms.
> 
>  [1] http://dpdk.org/ml/archives/dev/2016-November/050261.html
>  [2] http://dpdk.org/dev/patchwork/patch/17024/
> 
> Nelio Laranjeiro (7):
>   net/mlx5: prepare Tx vectorization
>   net/mlx5: use work queue buffer as a raw buffer
>   net/mlx5: use vector types to speed up processing
>   net/mlx5: fix missing inline attributes
>   net/mlx5: move static prototype
>   net/mlx5: optimize copy of Ethernet header
>   net/mlx5: remove inefficient prefetching
> 
<...>

Series applied to dpdk-next-net/master, thanks.


* Re: [PATCH 0/7] net/mlx5: improve single core performance
From: Thomas Monjalon @ 2017-01-18  7:20 UTC (permalink / raw)
  To: Ferruh Yigit, Nelio Laranjeiro; +Cc: dev, Adrien Mazarguil

2017-01-05 14:13, Ferruh Yigit:
> On 11/24/2016 4:03 PM, Nelio Laranjeiro wrote:
> > This series applies on top of
> > "[PATCH] eal: define generic vector types" [1][2]
> > 
> > Using built-in vector types forces compilers to consider SIMD instructions in
> > specific places in order to improve performance on both IBM POWER8 and Intel
> > architectures.
> > 
> > For example, testpmd single-thread I/O forwarding packets per second
> > performance is improved by 6% on Intel platforms.
> > 
> >  [1] http://dpdk.org/ml/archives/dev/2016-November/050261.html
> >  [2] http://dpdk.org/dev/patchwork/patch/17024/
> > 
> > Nelio Laranjeiro (7):
> >   net/mlx5: prepare Tx vectorization
> >   net/mlx5: use work queue buffer as a raw buffer
> >   net/mlx5: use vector types to speed up processing
> >   net/mlx5: fix missing inline attributes
> >   net/mlx5: move static prototype
> >   net/mlx5: optimize copy of Ethernet header
> >   net/mlx5: remove inefficient prefetching
> 
> Series applied to dpdk-next-net/master, thanks.

It will not be pulled into mainline because compilation fails on 32-bit:

drivers/net/mlx5/mlx5_rxtx.c: In function ‘mlx5_tx_burst’:
drivers/net/mlx5/mlx5_rxtx.c:523:10: error:
right shift count >= width of type [-Werror=shift-count-overflow]
     addr >> 32,
          ^~

Please Ferruh, remove the series from next-net.

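The failure is easy to reproduce: uintptr_t is only 32 bits wide on a
32-bit target, so shifting it right by 32 is out of range. A minimal
illustration of the problem and of the widening used in the fix below,
whose naddr variable is mirrored here:

  #include <stdint.h>
  #include <stdio.h>

  static uint32_t high_half(uintptr_t addr)
  {
      /* "addr >> 32" here is what -Werror=shift-count-overflow
       * rejects when uintptr_t is 32-bit; widen first instead. */
      uint64_t naddr = (uint64_t)addr;

      return (uint32_t)(naddr >> 32);
  }

  int main(void)
  {
      printf("0x%08x\n", high_half((uintptr_t)-1));
      return 0;
  }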

* Re: [PATCH 0/7] net/mlx5: improve single core performance
From: Nélio Laranjeiro @ 2017-01-18  9:23 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: Ferruh Yigit, dev, Adrien Mazarguil

On Wed, Jan 18, 2017 at 08:20:41AM +0100, Thomas Monjalon wrote:
> 2017-01-05 14:13, Ferruh Yigit:
> > On 11/24/2016 4:03 PM, Nelio Laranjeiro wrote:
> > > This series applies on top of
> > > "[PATCH] eal: define generic vector types" [1][2]
> > > 
> > > Using built-in vector types forces compilers to consider SIMD instructions in
> > > specific places in order to improve performance on both IBM POWER8 and Intel
> > > architectures.
> > > 
> > > For example, testpmd single-thread I/O forwarding packets per second
> > > performance is improved by 6% on Intel platforms.
> > > 
> > >  [1] http://dpdk.org/ml/archives/dev/2016-November/050261.html
> > >  [2] http://dpdk.org/dev/patchwork/patch/17024/
> > > 
> > > Nelio Laranjeiro (7):
> > >   net/mlx5: prepare Tx vectorization
> > >   net/mlx5: use work queue buffer as a raw buffer
> > >   net/mlx5: use vector types to speed up processing
> > >   net/mlx5: fix missing inline attributes
> > >   net/mlx5: move static prototype
> > >   net/mlx5: optimize copy of Ethernet header
> > >   net/mlx5: remove inefficient prefetching
> > 
> > Series applied to dpdk-next-net/master, thanks.
> 
> It will not be pulled into mainline because compilation fails on 32-bit:
> 
> drivers/net/mlx5/mlx5_rxtx.c: In function ‘mlx5_tx_burst’:
> drivers/net/mlx5/mlx5_rxtx.c:523:10: error:
> right shift count >= width of type [-Werror=shift-count-overflow]
>      addr >> 32,
>           ^~
> 
> Please Ferruh, remove the series from next-net.

Hi Thomas,

Wait, I'll submit a fix in a few minutes.

Regards,

-- 
Nélio Laranjeiro
6WIND


* [PATCH] net/mlx5: fix 32bits compilation issue
From: Nelio Laranjeiro @ 2017-01-18 10:01 UTC (permalink / raw)
  To: Thomas Monjalon, dev; +Cc: Adrien Mazarguil, Ferruh Yigit

Fixes: 02bb06aca20f ("net/mlx5: use vector types to speed up processing")

Reported-by: Thomas Monjalon <thomas.monjalon@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

---

Please squash it into the original patch if possible.
---
 drivers/net/mlx5/mlx5_rxtx.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 8cf68c5..0177428 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -390,6 +390,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		uint32_t length;
 		unsigned int ds = 0;
 		uintptr_t addr;
+		uint64_t naddr;
 		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
 		uint8_t ehdr[2];
 		uint8_t cs_flags = 0;
@@ -515,12 +516,12 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			ds = 3;
 use_dseg:
 			/* Add the remaining packet as a simple ds. */
-			addr = htonll(addr);
+			naddr = htonll(addr);
 			*dseg = (rte_v128u32_t){
 				htonl(length),
 				txq_mp2mr(txq, txq_mb2mp(buf)),
-				addr,
-				addr >> 32,
+				naddr,
+				naddr >> 32,
 			};
 			++ds;
 			if (!segs_n)
@@ -554,12 +555,12 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		total_length += length;
 #endif
 		/* Store segment information. */
-		addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+		naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
 		*dseg = (rte_v128u32_t){
 			htonl(length),
 			txq_mp2mr(txq, txq_mb2mp(buf)),
-			addr,
-			addr >> 32,
+			naddr,
+			naddr >> 32,
 		};
 		(*txq->elts)[elts_head] = buf;
 		elts_head = (elts_head + 1) & (elts_n - 1);
-- 
2.1.4


* Re: [PATCH] net/mlx5: fix 32bits compilation issue
From: Thomas Monjalon @ 2017-01-18 10:13 UTC (permalink / raw)
  To: Nelio Laranjeiro; +Cc: dev, Adrien Mazarguil, Ferruh Yigit

2017-01-18 11:01, Nelio Laranjeiro:
> Fixes: 02bb06aca20f ("net/mlx5: use vector types to speed up processing")
> 
> Reported-by: Thomas Monjalon <thomas.monjalon@6wind.com>
> Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> 
> ---
> 
> Please squash it into the original patch if possible.

It works, thanks.

Squashed with "net/mlx5: use vector types to speed up processing"
