* [PATCH 0/6] net/mlx5: performance improvement
@ 2016-09-07  7:09 Nelio Laranjeiro
  2016-09-07  7:09 ` [PATCH 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
                   ` (12 more replies)
  0 siblings, 13 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-07  7:09 UTC (permalink / raw)
  To: dev

 - Rework structure elements to reduce their size.
 - Remove a useless second loop in the Tx burst function.

This series should be applied on top of "net/mlx5: various fixes".

Nelio Laranjeiro (6):
  net/mlx5: rework hardware structures
  net/mlx5: reduce Tx and Rx structure size
  net/mlx5: reduce Tx and Rx structure size
  net/mlx5: reduce Tx structure size
  net/mlx5: reduce Tx and Rx structure size
  net/mlx5: remove gather loop on segments

 drivers/net/mlx5/mlx5_ethdev.c |   4 +-
 drivers/net/mlx5/mlx5_prm.h    |  70 ++-----
 drivers/net/mlx5/mlx5_rxq.c    |  12 +-
 drivers/net/mlx5/mlx5_rxtx.c   | 458 +++++++++++++++++++++--------------------
 drivers/net/mlx5/mlx5_rxtx.h   |  22 +-
 drivers/net/mlx5/mlx5_txq.c    |  18 +-
 6 files changed, 285 insertions(+), 299 deletions(-)

-- 
2.1.4


* [PATCH 1/6] net/mlx5: rework hardware structures
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
@ 2016-09-07  7:09 ` Nelio Laranjeiro
  2016-09-07  7:09 ` [PATCH 2/6] net/mlx5: reduce Tx and Rx structure size Nelio Laranjeiro
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-07  7:09 UTC (permalink / raw)
  To: dev

Rework Work Queue Element (aka WQE) structures to fit PMD needs.
A WQE is an aggregation of 16-byte elements known as "data segments"
(aka dseg).
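
For illustration, here is a minimal, self-contained sketch of the new
layout (it mirrors the MLX5_WQE_DWORD_SIZE, MLX5_WQE_SIZE and MLX5_WQE_DS
definitions introduced below; the inlined byte count is a hypothetical
example value, this is not driver code):

#include <stdio.h>

/* A WQE is addressed as 16-byte "data segments" (dseg). */
#define MLX5_WQE_DWORD_SIZE 16
/* A regular WQE spans four such segments, i.e. 64 bytes. */
#define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
/* Number of 16-byte segments needed to hold n bytes (rounded up). */
#define MLX5_WQE_DS(n) \
	(((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)

int main(void)
{
	unsigned int inlined = 42; /* hypothetical inlined byte count */

	/* 42 bytes need 3 data segments of 16 bytes each. */
	printf("DS(%u) = %u, WQE size = %u\n",
	       inlined, (unsigned int)MLX5_WQE_DS(inlined),
	       (unsigned int)MLX5_WQE_SIZE);
	return 0;
}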

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_prm.h  |  70 ++++++------------
 drivers/net/mlx5/mlx5_rxtx.c | 165 ++++++++++++++++++++++---------------------
 drivers/net/mlx5/mlx5_rxtx.h |   2 +-
 drivers/net/mlx5/mlx5_txq.c  |   4 +-
 4 files changed, 110 insertions(+), 131 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 5db219b..042562c 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -65,8 +65,15 @@
 /* Maximum number of packets a multi-packet WQE can handle. */
 #define MLX5_MPW_DSEG_MAX 5
 
-/* Room for inline data in regular work queue element. */
-#define MLX5_WQE64_INL_DATA 12
+/* WQE DWORD size */
+#define MLX5_WQE_DWORD_SIZE 16
+
+/* WQE size */
+#define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
+
+/* Compute the number of DS. */
+#define MLX5_WQE_DS(n) \
+	(((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)
 
 /* Room for inline data in multi-packet WQE. */
 #define MLX5_MWQE64_INL_DATA 28
@@ -79,59 +86,26 @@ struct mlx5_wqe_eth_seg_small {
 	uint16_t mss;
 	uint32_t rsvd2;
 	uint16_t inline_hdr_sz;
+	uint8_t inline_hdr[2];
 };
 
-/* Regular WQE. */
-struct mlx5_wqe_regular {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
-	struct mlx5_wqe_eth_seg eseg;
-	struct mlx5_wqe_data_seg dseg;
-} __rte_aligned(64);
-
-/* Inline WQE. */
-struct mlx5_wqe_inl {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
-	struct mlx5_wqe_eth_seg eseg;
+struct mlx5_wqe_inl_small {
 	uint32_t byte_cnt;
-	uint8_t data[MLX5_WQE64_INL_DATA];
-} __rte_aligned(64);
+	uint8_t raw;
+};
 
-/* Multi-packet WQE. */
-struct mlx5_wqe_mpw {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
+/* Small common part of the WQE. */
+struct mlx5_wqe {
+	uint32_t ctrl[4];
 	struct mlx5_wqe_eth_seg_small eseg;
-	struct mlx5_wqe_data_seg dseg[2];
-} __rte_aligned(64);
+};
 
-/* Multi-packet WQE with inline. */
-struct mlx5_wqe_mpw_inl {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
-	struct mlx5_wqe_eth_seg_small eseg;
-	uint32_t byte_cnt;
-	uint8_t data[MLX5_MWQE64_INL_DATA];
+/* WQE. */
+struct mlx5_wqe64 {
+	struct mlx5_wqe hdr;
+	uint8_t raw[32];
 } __rte_aligned(64);
 
-/* Union of all WQE types. */
-union mlx5_wqe {
-	struct mlx5_wqe_regular wqe;
-	struct mlx5_wqe_inl inl;
-	struct mlx5_wqe_mpw mpw;
-	struct mlx5_wqe_mpw_inl mpw_inl;
-	uint8_t data[64];
-};
-
 /* MPW session status. */
 enum mlx5_mpw_state {
 	MLX5_MPW_STATE_OPENED,
@@ -145,7 +119,7 @@ struct mlx5_mpw {
 	unsigned int pkts_n;
 	unsigned int len;
 	unsigned int total_len;
-	volatile union mlx5_wqe *wqe;
+	volatile struct mlx5_wqe *wqe;
 	union {
 		volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX];
 		volatile uint8_t *raw;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 67e0f37..5feeb3f 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -173,7 +173,7 @@ txq_complete(struct txq *txq)
 	uint16_t elts_tail;
 	uint16_t cq_ci = txq->cq_ci;
 	volatile struct mlx5_cqe64 *cqe = NULL;
-	volatile union mlx5_wqe *wqe;
+	volatile struct mlx5_wqe *wqe;
 
 	do {
 		volatile struct mlx5_cqe64 *tmp;
@@ -199,8 +199,8 @@ txq_complete(struct txq *txq)
 	} while (1);
 	if (unlikely(cqe == NULL))
 		return;
-	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)];
-	elts_tail = wqe->wqe.ctrl.data[3];
+	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)].hdr;
+	elts_tail = wqe->ctrl[3];
 	assert(elts_tail < txq->wqe_n);
 	/* Free buffers. */
 	while (elts_free != elts_tail) {
@@ -302,33 +302,33 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
  *   Number of DS elements consumed.
  */
 static inline unsigned int
-mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
+mlx5_wqe_write(struct txq *txq, volatile struct mlx5_wqe *wqe,
 	       struct rte_mbuf *buf, uint32_t length)
 {
-	uintptr_t raw = (uintptr_t)&wqe->wqe.eseg.inline_hdr_start;
+	uint8_t *raw = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
 	uint16_t ds;
-	uint16_t pkt_inline_sz = 16;
+	uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
 	uintptr_t addr = rte_pktmbuf_mtod(buf, uintptr_t);
 	struct mlx5_wqe_data_seg *dseg = NULL;
 
-	assert(length >= 16);
+	assert(length >= MLX5_WQE_DWORD_SIZE);
 	/* Start the know and common part of the WQE structure. */
-	wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
-	wqe->wqe.ctrl.data[2] = 0;
-	wqe->wqe.ctrl.data[3] = 0;
-	wqe->wqe.eseg.rsvd0 = 0;
-	wqe->wqe.eseg.rsvd1 = 0;
-	wqe->wqe.eseg.mss = 0;
-	wqe->wqe.eseg.rsvd2 = 0;
+	wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+	wqe->ctrl[2] = 0;
+	wqe->ctrl[3] = 0;
+	wqe->eseg.rsvd0 = 0;
+	wqe->eseg.rsvd1 = 0;
+	wqe->eseg.mss = 0;
+	wqe->eseg.rsvd2 = 0;
 	/* Start by copying the Ethernet Header. */
 	rte_mov16((uint8_t *)raw, (uint8_t *)addr);
-	length -= 16;
-	addr += 16;
+	length -= MLX5_WQE_DWORD_SIZE;
+	addr += MLX5_WQE_DWORD_SIZE;
 	/* Replace the Ethernet type by the VLAN if necessary. */
 	if (buf->ol_flags & PKT_TX_VLAN_PKT) {
 		uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
 
-		memcpy((uint8_t *)(raw + 16 - sizeof(vlan)),
+		memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE - sizeof(vlan)),
 		       &vlan, sizeof(vlan));
 		addr -= sizeof(vlan);
 		length += sizeof(vlan);
@@ -339,7 +339,7 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
 		uint16_t max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
 		uint16_t room;
 
-		raw += 16;
+		raw += MLX5_WQE_DWORD_SIZE;
 		room = end - (uintptr_t)raw;
 		if (room > max_inline) {
 			uintptr_t addr_end = (addr + max_inline) &
@@ -356,15 +356,15 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
 			assert(addr <= addr_end);
 		}
 		/* Store the inlined packet size in the WQE. */
-		wqe->wqe.eseg.inline_hdr_sz = htons(pkt_inline_sz);
+		wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
 		/*
 		 * 2 DWORDs consumed by the WQE header + 1 DSEG +
 		 * the size of the inline part of the packet.
 		 */
-		ds = 2 + ((pkt_inline_sz - 2 + 15) / 16);
+		ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
 		if (length > 0) {
 			dseg = (struct mlx5_wqe_data_seg *)
-				((uintptr_t)wqe + (ds * 16));
+				((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
 			if ((uintptr_t)dseg >= end)
 				dseg = (struct mlx5_wqe_data_seg *)
 					((uintptr_t)&(*txq->wqes)[0]);
@@ -377,9 +377,9 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
 		 * No inline has been done in the packet, only the Ethernet
 		 * Header as been stored.
 		 */
-		wqe->wqe.eseg.inline_hdr_sz = htons(16);
+		wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
 		dseg = (struct mlx5_wqe_data_seg *)
-			((uintptr_t)wqe + (ds * 16));
+			((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
 use_dseg:
 		*dseg = (struct mlx5_wqe_data_seg) {
 			.addr = htonll(addr),
@@ -388,7 +388,7 @@ use_dseg:
 		};
 		++ds;
 	}
-	wqe->wqe.ctrl.data[1] = htonl(txq->qp_num_8s | ds);
+	wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
 	return ds;
 }
 
@@ -444,7 +444,7 @@ tx_prefetch_cqe(struct txq *txq, uint16_t ci)
 static inline void
 tx_prefetch_wqe(struct txq *txq, uint16_t ci)
 {
-	volatile union mlx5_wqe *wqe;
+	volatile struct mlx5_wqe64 *wqe;
 
 	wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
 	rte_prefetch0(wqe);
@@ -473,7 +473,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int j = 0;
 	unsigned int max;
 	unsigned int comp;
-	volatile union mlx5_wqe *wqe = NULL;
+	volatile struct mlx5_wqe *wqe = NULL;
 
 	if (unlikely(!pkts_n))
 		return 0;
@@ -492,7 +492,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		uint32_t length;
 		unsigned int segs_n = buf->nb_segs;
 		volatile struct mlx5_wqe_data_seg *dseg;
-		unsigned int ds = sizeof(*wqe) / 16;
+		unsigned int ds = 0;
 
 		/*
 		 * Make sure there is enough room to store this packet and
@@ -504,7 +504,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		max -= segs_n;
 		--pkts_n;
 		elts_head_next = (elts_head + 1) & (elts_n - 1);
-		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)].hdr;
 		tx_prefetch_wqe(txq, txq->wqe_ci);
 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
 		if (pkts_n)
@@ -519,7 +519,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
-			wqe->wqe.eseg.cs_flags =
+			wqe->eseg.cs_flags =
 				MLX5_ETH_WQE_L3_CSUM |
 				MLX5_ETH_WQE_L4_CSUM;
 		} else {
@@ -529,15 +529,15 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (segs_n == 1)
 			goto skip_segs;
 		dseg = (volatile struct mlx5_wqe_data_seg *)
-			(((uintptr_t)wqe) + ds * 16);
+			(((uintptr_t)wqe) + ds * MLX5_WQE_DWORD_SIZE);
 		while (--segs_n) {
 			/*
 			 * Spill on next WQE when the current one does not have
 			 * enough room left. Size of WQE must a be a multiple
 			 * of data segment size.
 			 */
-			assert(!(sizeof(*wqe) % sizeof(*dseg)));
-			if (!(ds % (sizeof(*wqe) / 16)))
+			assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
+			if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE)))
 				dseg = (volatile void *)
 					&(*txq->wqes)[txq->wqe_ci++ &
 						      (txq->wqe_n - 1)];
@@ -558,8 +558,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			++j;
 		}
 		/* Update DS field in WQE. */
-		wqe->wqe.ctrl.data[1] &= htonl(0xffffffc0);
-		wqe->wqe.ctrl.data[1] |= htonl(ds & 0x3f);
+		wqe->ctrl[1] &= htonl(0xffffffc0);
+		wqe->ctrl[1] |= htonl(ds & 0x3f);
 skip_segs:
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
@@ -577,9 +577,9 @@ skip_segs:
 	comp = txq->elts_comp + i + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
 		/* Request completion on last WQE. */
-		wqe->wqe.ctrl.data[2] = htonl(8);
+		wqe->ctrl[2] = htonl(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->wqe.ctrl.data[3] = elts_head;
+		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
 	} else {
 		txq->elts_comp = comp;
@@ -616,19 +616,20 @@ mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 	mpw->pkts_n = 0;
 	mpw->len = length;
 	mpw->total_len = 0;
-	mpw->wqe = &(*txq->wqes)[idx];
-	mpw->wqe->mpw.eseg.mss = htons(length);
-	mpw->wqe->mpw.eseg.inline_hdr_sz = 0;
-	mpw->wqe->mpw.eseg.rsvd0 = 0;
-	mpw->wqe->mpw.eseg.rsvd1 = 0;
-	mpw->wqe->mpw.eseg.rsvd2 = 0;
-	mpw->wqe->mpw.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
-					   (txq->wqe_ci << 8) |
-					   MLX5_OPCODE_LSO_MPW);
-	mpw->wqe->mpw.ctrl.data[2] = 0;
-	mpw->wqe->mpw.ctrl.data[3] = 0;
-	mpw->data.dseg[0] = &mpw->wqe->mpw.dseg[0];
-	mpw->data.dseg[1] = &mpw->wqe->mpw.dseg[1];
+	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
+	mpw->wqe->eseg.mss = htons(length);
+	mpw->wqe->eseg.inline_hdr_sz = 0;
+	mpw->wqe->eseg.rsvd0 = 0;
+	mpw->wqe->eseg.rsvd1 = 0;
+	mpw->wqe->eseg.rsvd2 = 0;
+	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+				  (txq->wqe_ci << 8) | MLX5_OPCODE_LSO_MPW);
+	mpw->wqe->ctrl[2] = 0;
+	mpw->wqe->ctrl[3] = 0;
+	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
+		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
+	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
+		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
 	mpw->data.dseg[2] = &(*dseg)[0];
 	mpw->data.dseg[3] = &(*dseg)[1];
 	mpw->data.dseg[4] = &(*dseg)[2];
@@ -651,7 +652,7 @@ mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
 	 * count as 2.
 	 */
-	mpw->wqe->mpw.ctrl.data[1] = htonl(txq->qp_num_8s | (2 + num));
+	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
 	mpw->state = MLX5_MPW_STATE_CLOSED;
 	if (num < 3)
 		++txq->wqe_ci;
@@ -729,11 +730,11 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
 		    ((mpw.len != length) ||
 		     (segs_n != 1) ||
-		     (mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
+		     (mpw.wqe->eseg.cs_flags != cs_flags)))
 			mlx5_mpw_close(txq, &mpw);
 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
 			mlx5_mpw_new(txq, &mpw, length);
-			mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+			mpw.wqe->eseg.cs_flags = cs_flags;
 		}
 		/* Multi-segment packets must be alone in their MPW. */
 		assert((segs_n == 1) || (mpw.pkts_n == 0));
@@ -779,12 +780,12 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	/* "j" includes both packets and segments. */
 	comp = txq->elts_comp + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile union mlx5_wqe *wqe = mpw.wqe;
+		volatile struct mlx5_wqe *wqe = mpw.wqe;
 
 		/* Request completion on last WQE. */
-		wqe->mpw.ctrl.data[2] = htonl(8);
+		wqe->ctrl[2] = htonl(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->mpw.ctrl.data[3] = elts_head;
+		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
 	} else {
 		txq->elts_comp = comp;
@@ -815,24 +816,27 @@ static inline void
 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 {
 	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+	struct mlx5_wqe_inl_small *inl;
 
 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
 	mpw->pkts_n = 0;
 	mpw->len = length;
 	mpw->total_len = 0;
-	mpw->wqe = &(*txq->wqes)[idx];
-	mpw->wqe->mpw_inl.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
-					       (txq->wqe_ci << 8) |
-					       MLX5_OPCODE_LSO_MPW);
-	mpw->wqe->mpw_inl.ctrl.data[2] = 0;
-	mpw->wqe->mpw_inl.ctrl.data[3] = 0;
-	mpw->wqe->mpw_inl.eseg.mss = htons(length);
-	mpw->wqe->mpw_inl.eseg.inline_hdr_sz = 0;
-	mpw->wqe->mpw_inl.eseg.cs_flags = 0;
-	mpw->wqe->mpw_inl.eseg.rsvd0 = 0;
-	mpw->wqe->mpw_inl.eseg.rsvd1 = 0;
-	mpw->wqe->mpw_inl.eseg.rsvd2 = 0;
-	mpw->data.raw = &mpw->wqe->mpw_inl.data[0];
+	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
+	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+				  (txq->wqe_ci << 8) |
+				  MLX5_OPCODE_LSO_MPW);
+	mpw->wqe->ctrl[2] = 0;
+	mpw->wqe->ctrl[3] = 0;
+	mpw->wqe->eseg.mss = htons(length);
+	mpw->wqe->eseg.inline_hdr_sz = 0;
+	mpw->wqe->eseg.cs_flags = 0;
+	mpw->wqe->eseg.rsvd0 = 0;
+	mpw->wqe->eseg.rsvd1 = 0;
+	mpw->wqe->eseg.rsvd2 = 0;
+	inl = (struct mlx5_wqe_inl_small *)
+		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
+	mpw->data.raw = (uint8_t *)&inl->raw;
 }
 
 /**
@@ -847,17 +851,18 @@ static inline void
 mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
 {
 	unsigned int size;
+	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
+		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
 
-	size = sizeof(*mpw->wqe) - MLX5_MWQE64_INL_DATA + mpw->total_len;
+	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
 	/*
 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
 	 * count as 2.
 	 */
-	mpw->wqe->mpw_inl.ctrl.data[1] =
-		htonl(txq->qp_num_8s | ((size + 15) / 16));
+	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
 	mpw->state = MLX5_MPW_STATE_CLOSED;
-	mpw->wqe->mpw_inl.byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
-	txq->wqe_ci += (size + (sizeof(*mpw->wqe) - 1)) / sizeof(*mpw->wqe);
+	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
+	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
 }
 
 /**
@@ -930,13 +935,13 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
 			if ((mpw.len != length) ||
 			    (segs_n != 1) ||
-			    (mpw.wqe->mpw.eseg.cs_flags != cs_flags))
+			    (mpw.wqe->eseg.cs_flags != cs_flags))
 				mlx5_mpw_close(txq, &mpw);
 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
 			if ((mpw.len != length) ||
 			    (segs_n != 1) ||
 			    (length > inline_room) ||
-			    (mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
+			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
 				mlx5_mpw_inline_close(txq, &mpw);
 				inline_room =
 					txq->max_inline * RTE_CACHE_LINE_SIZE;
@@ -946,10 +951,10 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 			if ((segs_n != 1) ||
 			    (length > inline_room)) {
 				mlx5_mpw_new(txq, &mpw, length);
-				mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+				mpw.wqe->eseg.cs_flags = cs_flags;
 			} else {
 				mlx5_mpw_inline_new(txq, &mpw, length);
-				mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
+				mpw.wqe->eseg.cs_flags = cs_flags;
 			}
 		}
 		/* Multi-segment packets must be alone in their MPW. */
@@ -1042,12 +1047,12 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 	/* "j" includes both packets and segments. */
 	comp = txq->elts_comp + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile union mlx5_wqe *wqe = mpw.wqe;
+		volatile struct mlx5_wqe *wqe = mpw.wqe;
 
 		/* Request completion on last WQE. */
-		wqe->mpw_inl.ctrl.data[2] = htonl(8);
+		wqe->ctrl[2] = htonl(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->mpw_inl.ctrl.data[3] = elts_head;
+		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
 	} else {
 		txq->elts_comp = comp;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 8c568ad..fbc2a78 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -252,7 +252,7 @@ struct txq {
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
-	volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
+	volatile struct mlx5_wqe64 (*wqes)[]; /* Work queue. */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
 	volatile void *bf_reg; /* Blueflame register. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 5ddd2fb..e8ebbbe 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -82,7 +82,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 	for (i = 0; (i != elts_n); ++i)
 		(*txq_ctrl->txq.elts)[i] = NULL;
 	for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) {
-		volatile union mlx5_wqe *wqe = &(*txq_ctrl->txq.wqes)[i];
+		volatile struct mlx5_wqe64 *wqe = &(*txq_ctrl->txq.wqes)[i];
 
 		memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
 	}
@@ -215,7 +215,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 	tmpl->txq.cqe_n = ibcq->cqe + 1;
 	tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
 	tmpl->txq.wqes =
-		(volatile union mlx5_wqe (*)[])
+		(volatile struct mlx5_wqe64 (*)[])
 		(uintptr_t)qp->gen_data.sqstart;
 	tmpl->txq.wqe_n = qp->sq.wqe_cnt;
 	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
-- 
2.1.4


* [PATCH 2/6] net/mlx5: reduce Tx and Rx structure size
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
  2016-09-07  7:09 ` [PATCH 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
@ 2016-09-07  7:09 ` Nelio Laranjeiro
  2016-09-07  7:09 ` [PATCH 3/6] " Nelio Laranjeiro
                   ` (10 subsequent siblings)
  12 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-07  7:09 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil

The PMD only uses power-of-two numbers of descriptors; storing the element
count in log2 form reduces the size of the field that holds it.
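
As a minimal sketch of the idea (assuming a power-of-two ring size; the
structure and field below are hypothetical stand-ins, not the actual
txq/rxq definitions), a 4-bit log2 field is enough to describe the ring,
and the real count and wrap-around mask are recovered with a shift:

#include <stdint.h>
#include <stdio.h>

struct ring {
	uint16_t elts_n:4; /* log2 of the number of descriptors */
};

int main(void)
{
	struct ring r = { .elts_n = 9 };  /* hypothetical: 512 descriptors */
	unsigned int n = 1u << r.elts_n;  /* actual element count */
	unsigned int mask = n - 1;        /* index wrap-around mask */

	printf("descriptors = %u, mask = 0x%x\n", n, mask);
	return 0;
}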

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_ethdev.c |  4 ++--
 drivers/net/mlx5/mlx5_rxq.c    | 10 +++++-----
 drivers/net/mlx5/mlx5_rxtx.c   | 10 +++++-----
 drivers/net/mlx5/mlx5_rxtx.h   | 12 +++++++-----
 drivers/net/mlx5/mlx5_txq.c    |  6 +++---
 5 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 1ae80e5..137ea66 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -808,7 +808,7 @@ recover:
 		if (rehash)
 			ret = rxq_rehash(dev, rxq_ctrl);
 		else
-			ret = rxq_ctrl_setup(dev, rxq_ctrl, rxq->elts_n,
+			ret = rxq_ctrl_setup(dev, rxq_ctrl, 1 << rxq->elts_n,
 					     rxq_ctrl->socket, NULL, rxq->mp);
 		if (!ret)
 			continue;
@@ -1314,7 +1314,7 @@ mlx5_secondary_data_setup(struct priv *priv)
 		if (txq_ctrl != NULL) {
 			if (txq_ctrl_setup(priv->dev,
 					   primary_txq_ctrl,
-					   primary_txq->elts_n,
+					   1 << primary_txq->elts_n,
 					   primary_txq_ctrl->socket,
 					   NULL) == 0) {
 				txq_ctrl->txq.stats.idx =
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 44889d1..d9db368 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -723,7 +723,7 @@ rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
 	if (rxq_ctrl->rxq.elts == NULL)
 		return;
 
-	for (i = 0; (i != rxq_ctrl->rxq.elts_n); ++i) {
+	for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
 		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
 			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
 		(*rxq_ctrl->rxq.elts)[i] = NULL;
@@ -807,7 +807,7 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
 int
 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
 {
-	unsigned int elts_n = rxq_ctrl->rxq.elts_n;
+	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
 	unsigned int i;
 	struct ibv_exp_wq_attr mod;
 	int err;
@@ -870,7 +870,7 @@ rxq_setup(struct rxq_ctrl *tmpl)
 	struct ibv_cq *ibcq = tmpl->cq;
 	struct mlx5_cq *cq = to_mxxx(cq, cq);
 	struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
-	struct rte_mbuf *(*elts)[tmpl->rxq.elts_n] =
+	struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
 		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
 
 	if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
@@ -924,7 +924,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 		.priv = priv,
 		.socket = socket,
 		.rxq = {
-			.elts_n = desc,
+			.elts_n = log2above(desc),
 			.mp = mp,
 		},
 	};
@@ -1148,7 +1148,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 	}
 	/* Reuse buffers from original queue if possible. */
 	if (rxq_ctrl->rxq.elts_n) {
-		assert(rxq_ctrl->rxq.elts_n == desc);
+		assert(1 << rxq_ctrl->rxq.elts_n == desc);
 		assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
 		ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
 	} else
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 5feeb3f..e132727 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -166,8 +166,8 @@ txq_complete(struct txq *txq) __attribute__((always_inline));
 static inline void
 txq_complete(struct txq *txq)
 {
-	const unsigned int elts_n = txq->elts_n;
 	const unsigned int cqe_n = txq->cqe_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	const unsigned int cqe_cnt = cqe_n - 1;
 	uint16_t elts_free = txq->elts_tail;
 	uint16_t elts_tail;
@@ -468,7 +468,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
@@ -680,7 +680,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
@@ -884,7 +884,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
@@ -1272,8 +1272,8 @@ uint16_t
 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct rxq *rxq = dpdk_rxq;
-	const unsigned int wqe_cnt = rxq->elts_n - 1;
 	const unsigned int cqe_cnt = rxq->cqe_n - 1;
+	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
 	const unsigned int sges_n = rxq->sges_n;
 	struct rte_mbuf *pkt = NULL;
 	struct rte_mbuf *seg = NULL;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index fbc2a78..3ba3913 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -54,6 +54,7 @@
 #endif
 #include <rte_mbuf.h>
 #include <rte_mempool.h>
+#include <rte_common.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -109,16 +110,16 @@ struct rxq {
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
 	unsigned int crc_present:1; /* CRC must be subtracted. */
 	unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */
+	unsigned int elts_n:4; /* Log 2 of Mbufs. */
+	unsigned int port_id:8;
+	volatile uint32_t *rq_db;
+	volatile uint32_t *cq_db;
 	uint16_t rq_ci;
 	uint16_t cq_ci;
-	uint16_t elts_n;
 	uint16_t cqe_n; /* Number of CQ elements. */
-	uint16_t port_id;
 	volatile struct mlx5_wqe_data_seg(*wqes)[];
 	volatile struct mlx5_cqe(*cqes)[];
 	struct rxq_zip zip; /* Compressed context. */
-	volatile uint32_t *rq_db;
-	volatile uint32_t *cq_db;
 	struct rte_mbuf *(*elts)[];
 	struct rte_mempool *mp;
 	struct mlx5_rxq_stats stats;
@@ -238,15 +239,16 @@ struct hash_rxq {
 };
 
 /* TX queue descriptor. */
+RTE_STD_C11
 struct txq {
 	uint16_t elts_head; /* Current index in (*elts)[]. */
 	uint16_t elts_tail; /* First element awaiting completion. */
 	uint16_t elts_comp; /* Counter since last completion request. */
-	uint16_t elts_n; /* (*elts)[] length. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
 	uint16_t cqe_n; /* Number of CQ elements. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
 	uint16_t wqe_n; /* Number of WQ elements. */
+	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t bf_offset; /* Blueflame offset. */
 	uint16_t bf_buf_size; /* Blueflame size. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index e8ebbbe..9055016 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -101,7 +101,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 static void
 txq_free_elts(struct txq_ctrl *txq_ctrl)
 {
-	unsigned int elts_n = txq_ctrl->txq.elts_n;
+	unsigned int elts_n = 1 << txq_ctrl->txq.elts_n;
 	unsigned int elts_head = txq_ctrl->txq.elts_head;
 	unsigned int elts_tail = txq_ctrl->txq.elts_tail;
 	struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;
@@ -227,7 +227,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 		(volatile struct mlx5_cqe (*)[])
 		(uintptr_t)cq->active_buf->buf;
 	tmpl->txq.elts =
-		(struct rte_mbuf *(*)[tmpl->txq.elts_n])
+		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])
 		((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
 	return 0;
 }
@@ -277,7 +277,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 	}
 	(void)conf; /* Thresholds configuration (ignored). */
 	assert(desc > MLX5_TX_COMP_THRESH);
-	tmpl.txq.elts_n = desc;
+	tmpl.txq.elts_n = log2above(desc);
 	/* MRs will be registered in mp2mr[] later. */
 	attr.rd = (struct ibv_exp_res_domain_init_attr){
 		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
-- 
2.1.4


* [PATCH 3/6] net/mlx5: reduce Tx and Rx structure size
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
  2016-09-07  7:09 ` [PATCH 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
  2016-09-07  7:09 ` [PATCH 2/6] net/mlx5: reduce Tx and Rx structure size Nelio Laranjeiro
@ 2016-09-07  7:09 ` Nelio Laranjeiro
  2016-09-07  7:09 ` [PATCH 4/6] net/mlx5: reduce Tx " Nelio Laranjeiro
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-07  7:09 UTC (permalink / raw)
  To: dev

The PMD only uses power-of-two numbers of Completion Queue Elements; storing
the element count in log2 form reduces the size of the field that holds it.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxq.c  | 2 +-
 drivers/net/mlx5/mlx5_rxtx.c | 8 ++++----
 drivers/net/mlx5/mlx5_rxtx.h | 4 ++--
 drivers/net/mlx5/mlx5_txq.c  | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index d9db368..f6f4315 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -881,7 +881,7 @@ rxq_setup(struct rxq_ctrl *tmpl)
 	if (elts == NULL)
 		return ENOMEM;
 	tmpl->rxq.rq_db = rwq->rq.db;
-	tmpl->rxq.cqe_n = ibcq->cqe + 1;
+	tmpl->rxq.cqe_n = log2above(ibcq->cqe);
 	tmpl->rxq.cq_ci = 0;
 	tmpl->rxq.rq_ci = 0;
 	tmpl->rxq.cq_db = cq->dbrec;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index e132727..4f28aa9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -166,8 +166,8 @@ txq_complete(struct txq *txq) __attribute__((always_inline));
 static inline void
 txq_complete(struct txq *txq)
 {
-	const unsigned int cqe_n = txq->cqe_n;
 	const unsigned int elts_n = 1 << txq->elts_n;
+	const unsigned int cqe_n = 1 << txq->cqe_n;
 	const unsigned int cqe_cnt = cqe_n - 1;
 	uint16_t elts_free = txq->elts_tail;
 	uint16_t elts_tail;
@@ -427,9 +427,9 @@ mlx5_tx_dbrec(struct txq *txq)
 static inline void
 tx_prefetch_cqe(struct txq *txq, uint16_t ci)
 {
-	volatile struct mlx5_cqe64 *cqe;
+	volatile struct mlx5_cqe *cqe;
 
-	cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)].cqe64;
+	cqe = &(*txq->cqes)[ci & ((1 << txq->cqe_n) - 1)];
 	rte_prefetch0(cqe);
 }
 
@@ -1272,8 +1272,8 @@ uint16_t
 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct rxq *rxq = dpdk_rxq;
-	const unsigned int cqe_cnt = rxq->cqe_n - 1;
 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
+	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
 	const unsigned int sges_n = rxq->sges_n;
 	struct rte_mbuf *pkt = NULL;
 	struct rte_mbuf *seg = NULL;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3ba3913..224614e 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -110,13 +110,13 @@ struct rxq {
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
 	unsigned int crc_present:1; /* CRC must be subtracted. */
 	unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */
+	unsigned int cqe_n:4; /* Log 2 of CQ elements. */
 	unsigned int elts_n:4; /* Log 2 of Mbufs. */
 	unsigned int port_id:8;
 	volatile uint32_t *rq_db;
 	volatile uint32_t *cq_db;
 	uint16_t rq_ci;
 	uint16_t cq_ci;
-	uint16_t cqe_n; /* Number of CQ elements. */
 	volatile struct mlx5_wqe_data_seg(*wqes)[];
 	volatile struct mlx5_cqe(*cqes)[];
 	struct rxq_zip zip; /* Compressed context. */
@@ -245,10 +245,10 @@ struct txq {
 	uint16_t elts_tail; /* First element awaiting completion. */
 	uint16_t elts_comp; /* Counter since last completion request. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
-	uint16_t cqe_n; /* Number of CQ elements. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
 	uint16_t wqe_n; /* Number of WQ elements. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
+	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
 	uint16_t bf_offset; /* Blueflame offset. */
 	uint16_t bf_buf_size; /* Blueflame size. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9055016..6145b69 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -212,7 +212,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
 		return EINVAL;
 	}
-	tmpl->txq.cqe_n = ibcq->cqe + 1;
+	tmpl->txq.cqe_n = log2above(ibcq->cqe);
 	tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
 	tmpl->txq.wqes =
 		(volatile struct mlx5_wqe64 (*)[])
-- 
2.1.4


* [PATCH 4/6] net/mlx5: reduce Tx structure size
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
                   ` (2 preceding siblings ...)
  2016-09-07  7:09 ` [PATCH 3/6] " Nelio Laranjeiro
@ 2016-09-07  7:09 ` Nelio Laranjeiro
  2016-09-07  7:09 ` [PATCH 5/6] net/mlx5: reduce Tx and Rx " Nelio Laranjeiro
                   ` (8 subsequent siblings)
  12 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-07  7:09 UTC (permalink / raw)
  To: dev

Blue Flame is a buffer allocated with a power-of-two size; Verbs returns
that size, which can therefore be stored in log2 form.
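
A rough sketch of how the log2 form is used when ringing the doorbell
(field names follow this patch, but the structure below is a trimmed,
hypothetical excerpt rather than the real struct txq):

#include <stdint.h>

struct txq_bf {
	uint16_t bf_offset;     /* current Blue Flame offset */
	uint16_t bf_buf_size:4; /* log2 of the Blue Flame buffer size */
};

/* Alternate between the two halves of the Blue Flame buffer. */
static inline void
bf_toggle(struct txq_bf *t)
{
	t->bf_offset ^= (uint16_t)(1u << t->bf_buf_size);
}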

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 2 +-
 drivers/net/mlx5/mlx5_rxtx.h | 2 +-
 drivers/net/mlx5/mlx5_txq.c  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 4f28aa9..214922b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -413,7 +413,7 @@ mlx5_tx_dbrec(struct txq *txq)
 	/* Ensure ordering between DB record and BF copy. */
 	rte_wmb();
 	rte_mov16(dst, (uint8_t *)data);
-	txq->bf_offset ^= txq->bf_buf_size;
+	txq->bf_offset ^= (1 << txq->bf_buf_size);
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 224614e..3dca8ca 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -249,8 +249,8 @@ struct txq {
 	uint16_t wqe_n; /* Number of WQ elements. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
+	uint16_t bf_buf_size:4; /* Log2 Blueflame size. */
 	uint16_t bf_offset; /* Blueflame offset. */
-	uint16_t bf_buf_size; /* Blueflame size. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 6145b69..9919e37 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -221,7 +221,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
 	tmpl->txq.bf_reg = qp->gen_data.bf->reg;
 	tmpl->txq.bf_offset = qp->gen_data.bf->offset;
-	tmpl->txq.bf_buf_size = qp->gen_data.bf->buf_size;
+	tmpl->txq.bf_buf_size = log2above(qp->gen_data.bf->buf_size);
 	tmpl->txq.cq_db = cq->dbrec;
 	tmpl->txq.cqes =
 		(volatile struct mlx5_cqe (*)[])
-- 
2.1.4


* [PATCH 5/6] net/mlx5: reduce Tx and Rx structure size
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
                   ` (3 preceding siblings ...)
  2016-09-07  7:09 ` [PATCH 4/6] net/mlx5: reduce Tx " Nelio Laranjeiro
@ 2016-09-07  7:09 ` Nelio Laranjeiro
  2016-09-07  7:09 ` [PATCH 6/6] net/mlx5: remove gather loop on segments Nelio Laranjeiro
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-07  7:09 UTC (permalink / raw)
  To: dev

The PMD only uses power-of-two numbers of Work Queue Elements; storing the
element count in log2 form reduces the size of the field that holds it.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 23 ++++++++++++-----------
 drivers/net/mlx5/mlx5_rxtx.h |  2 +-
 drivers/net/mlx5/mlx5_txq.c  |  4 ++--
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 214922b..9d00ddc 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -199,9 +199,10 @@ txq_complete(struct txq *txq)
 	} while (1);
 	if (unlikely(cqe == NULL))
 		return;
-	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)].hdr;
+	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) &
+			    ((1 << txq->wqe_n) - 1)].hdr;
 	elts_tail = wqe->ctrl[3];
-	assert(elts_tail < txq->wqe_n);
+	assert(elts_tail < (1 << txq->wqe_n));
 	/* Free buffers. */
 	while (elts_free != elts_tail) {
 		struct rte_mbuf *elt = (*txq->elts)[elts_free];
@@ -335,7 +336,7 @@ mlx5_wqe_write(struct txq *txq, volatile struct mlx5_wqe *wqe,
 	}
 	/* Inline if enough room. */
 	if (txq->max_inline != 0) {
-		uintptr_t end = (uintptr_t)&(*txq->wqes)[txq->wqe_n];
+		uintptr_t end = (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
 		uint16_t max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
 		uint16_t room;
 
@@ -446,7 +447,7 @@ tx_prefetch_wqe(struct txq *txq, uint16_t ci)
 {
 	volatile struct mlx5_wqe64 *wqe;
 
-	wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
+	wqe = &(*txq->wqes)[ci & ((1 << txq->wqe_n) - 1)];
 	rte_prefetch0(wqe);
 }
 
@@ -504,7 +505,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		max -= segs_n;
 		--pkts_n;
 		elts_head_next = (elts_head + 1) & (elts_n - 1);
-		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)].hdr;
+		wqe = &(*txq->wqes)[txq->wqe_ci & ((1 << txq->wqe_n) - 1)].hdr;
 		tx_prefetch_wqe(txq, txq->wqe_ci);
 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
 		if (pkts_n)
@@ -540,7 +541,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE)))
 				dseg = (volatile void *)
 					&(*txq->wqes)[txq->wqe_ci++ &
-						      (txq->wqe_n - 1)];
+						      ((1 << txq->wqe_n) - 1)];
 			else
 				++dseg;
 			++ds;
@@ -607,10 +608,10 @@ skip_segs:
 static inline void
 mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 {
-	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
 		(volatile struct mlx5_wqe_data_seg (*)[])
-		(uintptr_t)&(*txq->wqes)[(idx + 1) & (txq->wqe_n - 1)];
+		(uintptr_t)&(*txq->wqes)[(idx + 1) & ((1 << txq->wqe_n) - 1)];
 
 	mpw->state = MLX5_MPW_STATE_OPENED;
 	mpw->pkts_n = 0;
@@ -815,7 +816,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 static inline void
 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 {
-	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
 	struct mlx5_wqe_inl_small *inl;
 
 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
@@ -1000,7 +1001,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			(*txq->elts)[elts_head] = buf;
 			/* Maximum number of bytes before wrapping. */
-			max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
+			max = ((uintptr_t)&(*txq->wqes)[1 << txq->wqe_n] -
 			       (uintptr_t)mpw.data.raw);
 			if (length > max) {
 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
@@ -1019,7 +1020,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 				mpw.data.raw += length;
 			}
 			if ((uintptr_t)mpw.data.raw ==
-			    (uintptr_t)&(*txq->wqes)[txq->wqe_n])
+			    (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n])
 				mpw.data.raw =
 					(volatile void *)&(*txq->wqes)[0];
 			++mpw.pkts_n;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3dca8ca..9828aef 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -246,9 +246,9 @@ struct txq {
 	uint16_t elts_comp; /* Counter since last completion request. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
-	uint16_t wqe_n; /* Number of WQ elements. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
+	uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */
 	uint16_t bf_buf_size:4; /* Log2 Blueflame size. */
 	uint16_t bf_offset; /* Blueflame offset. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9919e37..3d2d132 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -81,7 +81,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 
 	for (i = 0; (i != elts_n); ++i)
 		(*txq_ctrl->txq.elts)[i] = NULL;
-	for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) {
+	for (i = 0; (i != (1u << txq_ctrl->txq.wqe_n)); ++i) {
 		volatile struct mlx5_wqe64 *wqe = &(*txq_ctrl->txq.wqes)[i];
 
 		memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
@@ -217,7 +217,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 	tmpl->txq.wqes =
 		(volatile struct mlx5_wqe64 (*)[])
 		(uintptr_t)qp->gen_data.sqstart;
-	tmpl->txq.wqe_n = qp->sq.wqe_cnt;
+	tmpl->txq.wqe_n = log2above(qp->sq.wqe_cnt);
 	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
 	tmpl->txq.bf_reg = qp->gen_data.bf->reg;
 	tmpl->txq.bf_offset = qp->gen_data.bf->offset;
-- 
2.1.4


* [PATCH 6/6] net/mlx5: remove gather loop on segments
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
                   ` (4 preceding siblings ...)
  2016-09-07  7:09 ` [PATCH 5/6] net/mlx5: reduce Tx and Rx " Nelio Laranjeiro
@ 2016-09-07  7:09 ` Nelio Laranjeiro
  2016-09-14 12:18 ` [PATCH v2 0/6] net/mlx5: performance improvement Nelio Laranjeiro
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-07  7:09 UTC (permalink / raw)
  To: dev; +Cc: Vasily Philipov

The Tx function was handling a double loop to send segmented packets; it can
be done in a single one.
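
For intuition only, a simplified, self-contained sketch of the single-loop
control flow (the mbuf structure and emit_seg() below are hypothetical
stand-ins, not the rte_mbuf API or the mlx5 burst code):

struct mbuf {
	struct mbuf *next;    /* next segment, NULL on the last one */
	unsigned int nb_segs; /* number of segments in the chain */
};

static void emit_seg(struct mbuf *seg) { (void)seg; /* write one dseg */ }

static unsigned int
tx_burst_single_loop(struct mbuf **pkts, unsigned int pkts_n)
{
	unsigned int i = 0;         /* packets completed */
	unsigned int segs_left = 0; /* segments left in current packet */
	struct mbuf *seg = NULL;

	while (pkts_n || segs_left) {
		if (!segs_left) {          /* start the next packet */
			seg = pkts[i];
			segs_left = seg->nb_segs;
		}
		emit_seg(seg);             /* one data segment per pass */
		seg = seg->next;
		if (--segs_left == 0) {    /* packet finished */
			++i;
			--pkts_n;
		}
	}
	return i;
}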

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Vasily Philipov <vasilyf@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 312 ++++++++++++++++++++++---------------------
 1 file changed, 158 insertions(+), 154 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 9d00ddc..b91b644 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -288,112 +288,6 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
 }
 
 /**
- * Write a regular WQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param wqe
- *   Pointer to the WQE to fill.
- * @param buf
- *   Buffer.
- * @param length
- *   Packet length.
- *
- * @return ds
- *   Number of DS elements consumed.
- */
-static inline unsigned int
-mlx5_wqe_write(struct txq *txq, volatile struct mlx5_wqe *wqe,
-	       struct rte_mbuf *buf, uint32_t length)
-{
-	uint8_t *raw = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
-	uint16_t ds;
-	uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
-	uintptr_t addr = rte_pktmbuf_mtod(buf, uintptr_t);
-	struct mlx5_wqe_data_seg *dseg = NULL;
-
-	assert(length >= MLX5_WQE_DWORD_SIZE);
-	/* Start the know and common part of the WQE structure. */
-	wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
-	wqe->ctrl[2] = 0;
-	wqe->ctrl[3] = 0;
-	wqe->eseg.rsvd0 = 0;
-	wqe->eseg.rsvd1 = 0;
-	wqe->eseg.mss = 0;
-	wqe->eseg.rsvd2 = 0;
-	/* Start by copying the Ethernet Header. */
-	rte_mov16((uint8_t *)raw, (uint8_t *)addr);
-	length -= MLX5_WQE_DWORD_SIZE;
-	addr += MLX5_WQE_DWORD_SIZE;
-	/* Replace the Ethernet type by the VLAN if necessary. */
-	if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-		uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
-
-		memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE - sizeof(vlan)),
-		       &vlan, sizeof(vlan));
-		addr -= sizeof(vlan);
-		length += sizeof(vlan);
-	}
-	/* Inline if enough room. */
-	if (txq->max_inline != 0) {
-		uintptr_t end = (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
-		uint16_t max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
-		uint16_t room;
-
-		raw += MLX5_WQE_DWORD_SIZE;
-		room = end - (uintptr_t)raw;
-		if (room > max_inline) {
-			uintptr_t addr_end = (addr + max_inline) &
-				~(RTE_CACHE_LINE_SIZE - 1);
-			uint16_t copy_b = ((addr_end - addr) > length) ?
-					  length :
-					  (addr_end - addr);
-
-			rte_memcpy((void *)raw, (void *)addr, copy_b);
-			addr += copy_b;
-			length -= copy_b;
-			pkt_inline_sz += copy_b;
-			/* Sanity check. */
-			assert(addr <= addr_end);
-		}
-		/* Store the inlined packet size in the WQE. */
-		wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
-		/*
-		 * 2 DWORDs consumed by the WQE header + 1 DSEG +
-		 * the size of the inline part of the packet.
-		 */
-		ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
-		if (length > 0) {
-			dseg = (struct mlx5_wqe_data_seg *)
-				((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
-			if ((uintptr_t)dseg >= end)
-				dseg = (struct mlx5_wqe_data_seg *)
-					((uintptr_t)&(*txq->wqes)[0]);
-			goto use_dseg;
-		}
-	} else {
-		/* Add the remaining packet as a simple ds. */
-		ds = 3;
-		/*
-		 * No inline has been done in the packet, only the Ethernet
-		 * Header as been stored.
-		 */
-		wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
-		dseg = (struct mlx5_wqe_data_seg *)
-			((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
-use_dseg:
-		*dseg = (struct mlx5_wqe_data_seg) {
-			.addr = htonll(addr),
-			.byte_count = htonl(length),
-			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
-		};
-		++ds;
-	}
-	wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
-	return ds;
-}
-
-/**
  * Ring TX queue doorbell.
  *
  * @param txq
@@ -475,6 +369,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int max;
 	unsigned int comp;
 	volatile struct mlx5_wqe *wqe = NULL;
+	unsigned int segs_n = 0;
+	struct rte_mbuf *buf = NULL;
+	uint8_t *raw;
 
 	if (unlikely(!pkts_n))
 		return 0;
@@ -488,13 +385,17 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (max > elts_n)
 		max -= elts_n;
 	do {
-		struct rte_mbuf *buf = *(pkts++);
-		unsigned int elts_head_next;
+		volatile struct mlx5_wqe_data_seg *dseg = NULL;
 		uint32_t length;
-		unsigned int segs_n = buf->nb_segs;
-		volatile struct mlx5_wqe_data_seg *dseg;
 		unsigned int ds = 0;
+		uintptr_t addr;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		uint32_t total_length = 0;
+#endif
 
+		/* first_seg */
+		buf = *(pkts++);
+		segs_n = buf->nb_segs;
 		/*
 		 * Make sure there is enough room to store this packet and
 		 * that one ring entry remains unused.
@@ -503,20 +404,30 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (max < segs_n + 1)
 			break;
 		max -= segs_n;
-		--pkts_n;
-		elts_head_next = (elts_head + 1) & (elts_n - 1);
-		wqe = &(*txq->wqes)[txq->wqe_ci & ((1 << txq->wqe_n) - 1)].hdr;
-		tx_prefetch_wqe(txq, txq->wqe_ci);
+		--segs_n;
+		if (!segs_n)
+			--pkts_n;
+		wqe = &(*txq->wqes)[txq->wqe_ci &
+				    ((1 << txq->wqe_n) - 1)].hdr;
 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
-		if (pkts_n)
+		if (pkts_n > 1)
 			rte_prefetch0(*pkts);
+		addr = rte_pktmbuf_mtod(buf, uintptr_t);
 		length = DATA_LEN(buf);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		total_length = length;
+#endif
+		assert(length >= MLX5_WQE_DWORD_SIZE);
 		/* Update element. */
 		(*txq->elts)[elts_head] = buf;
+		elts_head = (elts_head + 1) & (elts_n - 1);
 		/* Prefetch next buffer data. */
-		if (pkts_n)
-			rte_prefetch0(rte_pktmbuf_mtod(*pkts,
-						       volatile void *));
+		if (pkts_n > 1) {
+			volatile void *pkt_addr;
+
+			pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *);
+			rte_prefetch0(pkt_addr);
+		}
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -526,50 +437,143 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		} else {
 			wqe->eseg.cs_flags = 0;
 		}
-		ds = mlx5_wqe_write(txq, wqe, buf, length);
-		if (segs_n == 1)
-			goto skip_segs;
-		dseg = (volatile struct mlx5_wqe_data_seg *)
-			(((uintptr_t)wqe) + ds * MLX5_WQE_DWORD_SIZE);
-		while (--segs_n) {
+		raw  = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
+		/* Start the know and common part of the WQE structure. */
+		wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+		wqe->ctrl[2] = 0;
+		wqe->ctrl[3] = 0;
+		wqe->eseg.rsvd0 = 0;
+		wqe->eseg.rsvd1 = 0;
+		wqe->eseg.mss = 0;
+		wqe->eseg.rsvd2 = 0;
+		/* Start by copying the Ethernet Header. */
+		rte_mov16((uint8_t *)raw, (uint8_t *)addr);
+		length -= MLX5_WQE_DWORD_SIZE;
+		addr += MLX5_WQE_DWORD_SIZE;
+		/* Replace the Ethernet type by the VLAN if necessary. */
+		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
+			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
+
+			memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE -
+					   sizeof(vlan)),
+			       &vlan, sizeof(vlan));
+			addr -= sizeof(vlan);
+			length += sizeof(vlan);
+		}
+		/* Inline if enough room. */
+		if (txq->max_inline != 0) {
+			uintptr_t end =
+				(uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
+			uint16_t max_inline =
+				txq->max_inline * RTE_CACHE_LINE_SIZE;
+			uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
+			uint16_t room;
+
+			raw += MLX5_WQE_DWORD_SIZE;
+			room = end - (uintptr_t)raw;
+			if (room > max_inline) {
+				uintptr_t addr_end = (addr + max_inline) &
+					~(RTE_CACHE_LINE_SIZE - 1);
+				uint16_t copy_b = ((addr_end - addr) > length) ?
+						  length :
+						  (addr_end - addr);
+
+				rte_memcpy((void *)raw, (void *)addr, copy_b);
+				addr += copy_b;
+				length -= copy_b;
+				pkt_inline_sz += copy_b;
+				/* Sanity check. */
+				assert(addr <= addr_end);
+			}
+			/* Store the inlined packet size in the WQE. */
+			wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
+			/*
+			 * 2 DWORDs consumed by the WQE header + 1 DSEG +
+			 * the size of the inline part of the packet.
+			 */
+			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
+			if (length > 0) {
+				dseg = (struct mlx5_wqe_data_seg *)
+					((uintptr_t)wqe +
+					 (ds * MLX5_WQE_DWORD_SIZE));
+				if ((uintptr_t)dseg >= end)
+					dseg = (struct mlx5_wqe_data_seg *)
+						((uintptr_t)&(*txq->wqes)[0]);
+				goto use_dseg;
+			} else if (!segs_n) {
+				goto next_pkt;
+			} else {
+				goto next_seg;
+			}
+		} else {
 			/*
-			 * Spill on next WQE when the current one does not have
-			 * enough room left. Size of WQE must a be a multiple
-			 * of data segment size.
+			 * No inline has been done in the packet, only the
+			 * Ethernet Header as been stored.
 			 */
-			assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
-			if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE)))
-				dseg = (volatile void *)
-					&(*txq->wqes)[txq->wqe_ci++ &
-						      ((1 << txq->wqe_n) - 1)];
-			else
-				++dseg;
+			wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
+			dseg = (struct mlx5_wqe_data_seg *)
+				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
+			ds = 3;
+use_dseg:
+			/* Add the remaining packet as a simple ds. */
+			*dseg = (struct mlx5_wqe_data_seg) {
+				.addr = htonll(addr),
+				.byte_count = htonl(length),
+				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+			};
 			++ds;
-			buf = buf->next;
-			assert(buf);
-			/* Store segment information. */
-			dseg->byte_count = htonl(DATA_LEN(buf));
-			dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-			dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
-			(*txq->elts)[elts_head_next] = buf;
-			elts_head_next = (elts_head_next + 1) & (elts_n - 1);
+			if (!segs_n)
+				goto next_pkt;
+		}
+next_seg:
+		assert(buf);
+		assert(ds);
+		assert(wqe);
+		/*
+		 * Spill on next WQE when the current one does not have
+		 * enough room left. Size of WQE must a be a multiple
+		 * of data segment size.
+		 */
+		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
+		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
+			unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
+				((1 << txq->wqe_n) - 1);
+
+			dseg = (struct mlx5_wqe_data_seg *)
+				((uintptr_t)&(*txq->wqes)[n]);
+			tx_prefetch_wqe(txq, n + 1);
+		} else {
+			++dseg;
+		}
+		++ds;
+		buf = buf->next;
+		assert(buf);
+		length = DATA_LEN(buf);
 #ifdef MLX5_PMD_SOFT_COUNTERS
-			length += DATA_LEN(buf);
+		total_length += length;
 #endif
-			++j;
-		}
-		/* Update DS field in WQE. */
-		wqe->ctrl[1] &= htonl(0xffffffc0);
-		wqe->ctrl[1] |= htonl(ds & 0x3f);
-skip_segs:
+		/* Store segment information. */
+		*dseg = (struct mlx5_wqe_data_seg) {
+			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+			.byte_count = htonl(length),
+			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+		};
+		(*txq->elts)[elts_head] = buf;
+		elts_head = (elts_head + 1) & (elts_n - 1);
+		++j;
+		--segs_n;
+		if (segs_n)
+			goto next_seg;
+		else
+			--pkts_n;
+next_pkt:
+		++i;
+		wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
+		txq->wqe_ci += (ds + 3) / 4;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
+		txq->stats.obytes += total_length;
 #endif
-		/* Increment consumer index. */
-		txq->wqe_ci += (ds + 3) / 4;
-		elts_head = elts_head_next;
-		++i;
 	} while (pkts_n);
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
-- 
2.1.4


* [PATCH v2 0/6] net/mlx5: performance improvement
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
                   ` (5 preceding siblings ...)
  2016-09-07  7:09 ` [PATCH 6/6] net/mlx5: remove gather loop on segments Nelio Laranjeiro
@ 2016-09-14 12:18 ` Nelio Laranjeiro
  2016-09-19 16:17   ` Bruce Richardson
  2016-09-20  8:53   ` [PATCH v3 " Nelio Laranjeiro
  2016-09-14 12:18 ` [PATCH v2 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
                   ` (5 subsequent siblings)
  12 siblings, 2 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-14 12:18 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil

 - Rework structure elements to reduce their size.
 - Remove a useless second loop in the Tx burst function.

This series should be applied on top of "net/mlx5: various fixes".

Changes in v2:

 - Rework the series to apply it on top of "net/mlx5: various fixes".

Nelio Laranjeiro (6):
  net/mlx5: rework hardware structures
  net/mlx5: reduce Tx and Rx structure size
  net/mlx5: reduce Tx and Rx structure size
  net/mlx5: reduce Tx structure size
  net/mlx5: reduce Tx and Rx structure size
  net/mlx5: remove gather loop on segments

 drivers/net/mlx5/mlx5_ethdev.c |   4 +-
 drivers/net/mlx5/mlx5_prm.h    |  70 ++-----
 drivers/net/mlx5/mlx5_rxq.c    |  12 +-
 drivers/net/mlx5/mlx5_rxtx.c   | 460 +++++++++++++++++++++--------------------
 drivers/net/mlx5/mlx5_rxtx.h   |  22 +-
 drivers/net/mlx5/mlx5_txq.c    |  18 +-
 6 files changed, 286 insertions(+), 300 deletions(-)

-- 
2.1.4

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 1/6] net/mlx5: rework hardware structures
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
                   ` (6 preceding siblings ...)
  2016-09-14 12:18 ` [PATCH v2 0/6] net/mlx5: performance improvement Nelio Laranjeiro
@ 2016-09-14 12:18 ` Nelio Laranjeiro
  2016-09-19 16:14   ` Bruce Richardson
  2016-09-14 12:18 ` [PATCH v2 2/6] net/mlx5: reduce Tx and Rx structure size Nelio Laranjeiro
                   ` (4 subsequent siblings)
  12 siblings, 1 reply; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-14 12:18 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil

Rework the Work Queue Element (aka WQE) structures to fit the PMD's needs.
A WQE is an aggregation of 16-byte elements known as "data segments"
(aka dseg).
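
As a purely illustrative aside (not part of the patch), the arithmetic
behind the MLX5_WQE_DS() macro introduced below can be checked with a
tiny standalone program reusing the same definitions; everything outside
those three macros is made up for the example:

#include <stdio.h>

/* Illustrative program, not PMD code; macros copied from mlx5_prm.h. */
#define MLX5_WQE_DWORD_SIZE 16 /* one "data segment" (dseg) */
#define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE) /* one 64-byte WQE */
/* Number of 16-byte data segments needed to hold n bytes (rounded up). */
#define MLX5_WQE_DS(n) \
        (((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)

int
main(void)
{
        /* A 60-byte inlined chunk spans 4 dsegs, i.e. exactly one WQE. */
        printf("dsegs for 60 bytes: %d\n", MLX5_WQE_DS(60));
        printf("dsegs per WQE: %d\n", MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE);
        return 0;
}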

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_prm.h  |  70 ++++++------------
 drivers/net/mlx5/mlx5_rxtx.c | 167 ++++++++++++++++++++++---------------------
 drivers/net/mlx5/mlx5_rxtx.h |   2 +-
 drivers/net/mlx5/mlx5_txq.c  |   4 +-
 4 files changed, 111 insertions(+), 132 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 5db219b..042562c 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -65,8 +65,15 @@
 /* Maximum number of packets a multi-packet WQE can handle. */
 #define MLX5_MPW_DSEG_MAX 5
 
-/* Room for inline data in regular work queue element. */
-#define MLX5_WQE64_INL_DATA 12
+/* WQE DWORD size */
+#define MLX5_WQE_DWORD_SIZE 16
+
+/* WQE size */
+#define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
+
+/* Compute the number of DS. */
+#define MLX5_WQE_DS(n) \
+	(((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)
 
 /* Room for inline data in multi-packet WQE. */
 #define MLX5_MWQE64_INL_DATA 28
@@ -79,59 +86,26 @@ struct mlx5_wqe_eth_seg_small {
 	uint16_t mss;
 	uint32_t rsvd2;
 	uint16_t inline_hdr_sz;
+	uint8_t inline_hdr[2];
 };
 
-/* Regular WQE. */
-struct mlx5_wqe_regular {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
-	struct mlx5_wqe_eth_seg eseg;
-	struct mlx5_wqe_data_seg dseg;
-} __rte_aligned(64);
-
-/* Inline WQE. */
-struct mlx5_wqe_inl {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
-	struct mlx5_wqe_eth_seg eseg;
+struct mlx5_wqe_inl_small {
 	uint32_t byte_cnt;
-	uint8_t data[MLX5_WQE64_INL_DATA];
-} __rte_aligned(64);
+	uint8_t raw;
+};
 
-/* Multi-packet WQE. */
-struct mlx5_wqe_mpw {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
+/* Small common part of the WQE. */
+struct mlx5_wqe {
+	uint32_t ctrl[4];
 	struct mlx5_wqe_eth_seg_small eseg;
-	struct mlx5_wqe_data_seg dseg[2];
-} __rte_aligned(64);
+};
 
-/* Multi-packet WQE with inline. */
-struct mlx5_wqe_mpw_inl {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
-	struct mlx5_wqe_eth_seg_small eseg;
-	uint32_t byte_cnt;
-	uint8_t data[MLX5_MWQE64_INL_DATA];
+/* WQE. */
+struct mlx5_wqe64 {
+	struct mlx5_wqe hdr;
+	uint8_t raw[32];
 } __rte_aligned(64);
 
-/* Union of all WQE types. */
-union mlx5_wqe {
-	struct mlx5_wqe_regular wqe;
-	struct mlx5_wqe_inl inl;
-	struct mlx5_wqe_mpw mpw;
-	struct mlx5_wqe_mpw_inl mpw_inl;
-	uint8_t data[64];
-};
-
 /* MPW session status. */
 enum mlx5_mpw_state {
 	MLX5_MPW_STATE_OPENED,
@@ -145,7 +119,7 @@ struct mlx5_mpw {
 	unsigned int pkts_n;
 	unsigned int len;
 	unsigned int total_len;
-	volatile union mlx5_wqe *wqe;
+	volatile struct mlx5_wqe *wqe;
 	union {
 		volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX];
 		volatile uint8_t *raw;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index ecc76ad..5feeb3f 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -173,7 +173,7 @@ txq_complete(struct txq *txq)
 	uint16_t elts_tail;
 	uint16_t cq_ci = txq->cq_ci;
 	volatile struct mlx5_cqe64 *cqe = NULL;
-	volatile union mlx5_wqe *wqe;
+	volatile struct mlx5_wqe *wqe;
 
 	do {
 		volatile struct mlx5_cqe64 *tmp;
@@ -199,8 +199,8 @@ txq_complete(struct txq *txq)
 	} while (1);
 	if (unlikely(cqe == NULL))
 		return;
-	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)];
-	elts_tail = wqe->wqe.ctrl.data[3];
+	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)].hdr;
+	elts_tail = wqe->ctrl[3];
 	assert(elts_tail < txq->wqe_n);
 	/* Free buffers. */
 	while (elts_free != elts_tail) {
@@ -302,33 +302,33 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
  *   Number of DS elements consumed.
  */
 static inline unsigned int
-mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
+mlx5_wqe_write(struct txq *txq, volatile struct mlx5_wqe *wqe,
 	       struct rte_mbuf *buf, uint32_t length)
 {
-	uintptr_t raw = (uintptr_t)&wqe->wqe.eseg.inline_hdr_start;
+	uint8_t *raw = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
 	uint16_t ds;
-	uint16_t pkt_inline_sz = 16;
+	uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
 	uintptr_t addr = rte_pktmbuf_mtod(buf, uintptr_t);
 	struct mlx5_wqe_data_seg *dseg = NULL;
 
-	assert(length >= 16);
+	assert(length >= MLX5_WQE_DWORD_SIZE);
 	/* Start the know and common part of the WQE structure. */
-	wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
-	wqe->wqe.ctrl.data[2] = 0;
-	wqe->wqe.ctrl.data[3] = 0;
-	wqe->wqe.eseg.rsvd0 = 0;
-	wqe->wqe.eseg.rsvd1 = 0;
-	wqe->wqe.eseg.mss = 0;
-	wqe->wqe.eseg.rsvd2 = 0;
+	wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+	wqe->ctrl[2] = 0;
+	wqe->ctrl[3] = 0;
+	wqe->eseg.rsvd0 = 0;
+	wqe->eseg.rsvd1 = 0;
+	wqe->eseg.mss = 0;
+	wqe->eseg.rsvd2 = 0;
 	/* Start by copying the Ethernet Header. */
 	rte_mov16((uint8_t *)raw, (uint8_t *)addr);
-	length -= 16;
-	addr += 16;
+	length -= MLX5_WQE_DWORD_SIZE;
+	addr += MLX5_WQE_DWORD_SIZE;
 	/* Replace the Ethernet type by the VLAN if necessary. */
 	if (buf->ol_flags & PKT_TX_VLAN_PKT) {
 		uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
 
-		memcpy((uint8_t *)(raw + 16 - sizeof(vlan)),
+		memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE - sizeof(vlan)),
 		       &vlan, sizeof(vlan));
 		addr -= sizeof(vlan);
 		length += sizeof(vlan);
@@ -339,7 +339,7 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
 		uint16_t max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
 		uint16_t room;
 
-		raw += 16;
+		raw += MLX5_WQE_DWORD_SIZE;
 		room = end - (uintptr_t)raw;
 		if (room > max_inline) {
 			uintptr_t addr_end = (addr + max_inline) &
@@ -356,15 +356,15 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
 			assert(addr <= addr_end);
 		}
 		/* Store the inlined packet size in the WQE. */
-		wqe->wqe.eseg.inline_hdr_sz = htons(pkt_inline_sz);
+		wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
 		/*
 		 * 2 DWORDs consumed by the WQE header + 1 DSEG +
 		 * the size of the inline part of the packet.
 		 */
-		ds = 2 + ((pkt_inline_sz - 2 + 15) / 16);
+		ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
 		if (length > 0) {
 			dseg = (struct mlx5_wqe_data_seg *)
-				((uintptr_t)wqe + (ds * 16));
+				((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
 			if ((uintptr_t)dseg >= end)
 				dseg = (struct mlx5_wqe_data_seg *)
 					((uintptr_t)&(*txq->wqes)[0]);
@@ -377,9 +377,9 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
 		 * No inline has been done in the packet, only the Ethernet
 		 * Header as been stored.
 		 */
-		wqe->wqe.eseg.inline_hdr_sz = htons(16);
+		wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
 		dseg = (struct mlx5_wqe_data_seg *)
-			((uintptr_t)wqe + (ds * 16));
+			((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
 use_dseg:
 		*dseg = (struct mlx5_wqe_data_seg) {
 			.addr = htonll(addr),
@@ -388,7 +388,7 @@ use_dseg:
 		};
 		++ds;
 	}
-	wqe->wqe.ctrl.data[1] = htonl(txq->qp_num_8s | ds);
+	wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
 	return ds;
 }
 
@@ -444,7 +444,7 @@ tx_prefetch_cqe(struct txq *txq, uint16_t ci)
 static inline void
 tx_prefetch_wqe(struct txq *txq, uint16_t ci)
 {
-	volatile union mlx5_wqe *wqe;
+	volatile struct mlx5_wqe64 *wqe;
 
 	wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
 	rte_prefetch0(wqe);
@@ -473,7 +473,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int j = 0;
 	unsigned int max;
 	unsigned int comp;
-	volatile union mlx5_wqe *wqe = NULL;
+	volatile struct mlx5_wqe *wqe = NULL;
 
 	if (unlikely(!pkts_n))
 		return 0;
@@ -492,7 +492,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		uint32_t length;
 		unsigned int segs_n = buf->nb_segs;
 		volatile struct mlx5_wqe_data_seg *dseg;
-		unsigned int ds = sizeof(*wqe) / 16;
+		unsigned int ds = 0;
 
 		/*
 		 * Make sure there is enough room to store this packet and
@@ -504,7 +504,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		max -= segs_n;
 		--pkts_n;
 		elts_head_next = (elts_head + 1) & (elts_n - 1);
-		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)].hdr;
 		tx_prefetch_wqe(txq, txq->wqe_ci);
 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
 		if (pkts_n)
@@ -519,25 +519,25 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
-			wqe->wqe.eseg.cs_flags =
+			wqe->eseg.cs_flags =
 				MLX5_ETH_WQE_L3_CSUM |
 				MLX5_ETH_WQE_L4_CSUM;
 		} else {
-			wqe->wqe.eseg.cs_flags = 0;
+			wqe->eseg.cs_flags = 0;
 		}
 		ds = mlx5_wqe_write(txq, wqe, buf, length);
 		if (segs_n == 1)
 			goto skip_segs;
 		dseg = (volatile struct mlx5_wqe_data_seg *)
-			(((uintptr_t)wqe) + ds * 16);
+			(((uintptr_t)wqe) + ds * MLX5_WQE_DWORD_SIZE);
 		while (--segs_n) {
 			/*
 			 * Spill on next WQE when the current one does not have
 			 * enough room left. Size of WQE must a be a multiple
 			 * of data segment size.
 			 */
-			assert(!(sizeof(*wqe) % sizeof(*dseg)));
-			if (!(ds % (sizeof(*wqe) / 16)))
+			assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
+			if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE)))
 				dseg = (volatile void *)
 					&(*txq->wqes)[txq->wqe_ci++ &
 						      (txq->wqe_n - 1)];
@@ -558,8 +558,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			++j;
 		}
 		/* Update DS field in WQE. */
-		wqe->wqe.ctrl.data[1] &= htonl(0xffffffc0);
-		wqe->wqe.ctrl.data[1] |= htonl(ds & 0x3f);
+		wqe->ctrl[1] &= htonl(0xffffffc0);
+		wqe->ctrl[1] |= htonl(ds & 0x3f);
 skip_segs:
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
@@ -577,9 +577,9 @@ skip_segs:
 	comp = txq->elts_comp + i + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
 		/* Request completion on last WQE. */
-		wqe->wqe.ctrl.data[2] = htonl(8);
+		wqe->ctrl[2] = htonl(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->wqe.ctrl.data[3] = elts_head;
+		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
 	} else {
 		txq->elts_comp = comp;
@@ -616,19 +616,20 @@ mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 	mpw->pkts_n = 0;
 	mpw->len = length;
 	mpw->total_len = 0;
-	mpw->wqe = &(*txq->wqes)[idx];
-	mpw->wqe->mpw.eseg.mss = htons(length);
-	mpw->wqe->mpw.eseg.inline_hdr_sz = 0;
-	mpw->wqe->mpw.eseg.rsvd0 = 0;
-	mpw->wqe->mpw.eseg.rsvd1 = 0;
-	mpw->wqe->mpw.eseg.rsvd2 = 0;
-	mpw->wqe->mpw.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
-					   (txq->wqe_ci << 8) |
-					   MLX5_OPCODE_LSO_MPW);
-	mpw->wqe->mpw.ctrl.data[2] = 0;
-	mpw->wqe->mpw.ctrl.data[3] = 0;
-	mpw->data.dseg[0] = &mpw->wqe->mpw.dseg[0];
-	mpw->data.dseg[1] = &mpw->wqe->mpw.dseg[1];
+	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
+	mpw->wqe->eseg.mss = htons(length);
+	mpw->wqe->eseg.inline_hdr_sz = 0;
+	mpw->wqe->eseg.rsvd0 = 0;
+	mpw->wqe->eseg.rsvd1 = 0;
+	mpw->wqe->eseg.rsvd2 = 0;
+	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+				  (txq->wqe_ci << 8) | MLX5_OPCODE_LSO_MPW);
+	mpw->wqe->ctrl[2] = 0;
+	mpw->wqe->ctrl[3] = 0;
+	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
+		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
+	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
+		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
 	mpw->data.dseg[2] = &(*dseg)[0];
 	mpw->data.dseg[3] = &(*dseg)[1];
 	mpw->data.dseg[4] = &(*dseg)[2];
@@ -651,7 +652,7 @@ mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
 	 * count as 2.
 	 */
-	mpw->wqe->mpw.ctrl.data[1] = htonl(txq->qp_num_8s | (2 + num));
+	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
 	mpw->state = MLX5_MPW_STATE_CLOSED;
 	if (num < 3)
 		++txq->wqe_ci;
@@ -729,11 +730,11 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
 		    ((mpw.len != length) ||
 		     (segs_n != 1) ||
-		     (mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
+		     (mpw.wqe->eseg.cs_flags != cs_flags)))
 			mlx5_mpw_close(txq, &mpw);
 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
 			mlx5_mpw_new(txq, &mpw, length);
-			mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+			mpw.wqe->eseg.cs_flags = cs_flags;
 		}
 		/* Multi-segment packets must be alone in their MPW. */
 		assert((segs_n == 1) || (mpw.pkts_n == 0));
@@ -779,12 +780,12 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	/* "j" includes both packets and segments. */
 	comp = txq->elts_comp + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile union mlx5_wqe *wqe = mpw.wqe;
+		volatile struct mlx5_wqe *wqe = mpw.wqe;
 
 		/* Request completion on last WQE. */
-		wqe->mpw.ctrl.data[2] = htonl(8);
+		wqe->ctrl[2] = htonl(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->mpw.ctrl.data[3] = elts_head;
+		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
 	} else {
 		txq->elts_comp = comp;
@@ -815,24 +816,27 @@ static inline void
 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 {
 	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+	struct mlx5_wqe_inl_small *inl;
 
 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
 	mpw->pkts_n = 0;
 	mpw->len = length;
 	mpw->total_len = 0;
-	mpw->wqe = &(*txq->wqes)[idx];
-	mpw->wqe->mpw_inl.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
-					       (txq->wqe_ci << 8) |
-					       MLX5_OPCODE_LSO_MPW);
-	mpw->wqe->mpw_inl.ctrl.data[2] = 0;
-	mpw->wqe->mpw_inl.ctrl.data[3] = 0;
-	mpw->wqe->mpw_inl.eseg.mss = htons(length);
-	mpw->wqe->mpw_inl.eseg.inline_hdr_sz = 0;
-	mpw->wqe->mpw_inl.eseg.cs_flags = 0;
-	mpw->wqe->mpw_inl.eseg.rsvd0 = 0;
-	mpw->wqe->mpw_inl.eseg.rsvd1 = 0;
-	mpw->wqe->mpw_inl.eseg.rsvd2 = 0;
-	mpw->data.raw = &mpw->wqe->mpw_inl.data[0];
+	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
+	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+				  (txq->wqe_ci << 8) |
+				  MLX5_OPCODE_LSO_MPW);
+	mpw->wqe->ctrl[2] = 0;
+	mpw->wqe->ctrl[3] = 0;
+	mpw->wqe->eseg.mss = htons(length);
+	mpw->wqe->eseg.inline_hdr_sz = 0;
+	mpw->wqe->eseg.cs_flags = 0;
+	mpw->wqe->eseg.rsvd0 = 0;
+	mpw->wqe->eseg.rsvd1 = 0;
+	mpw->wqe->eseg.rsvd2 = 0;
+	inl = (struct mlx5_wqe_inl_small *)
+		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
+	mpw->data.raw = (uint8_t *)&inl->raw;
 }
 
 /**
@@ -847,17 +851,18 @@ static inline void
 mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
 {
 	unsigned int size;
+	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
+		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
 
-	size = sizeof(*mpw->wqe) - MLX5_MWQE64_INL_DATA + mpw->total_len;
+	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
 	/*
 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
 	 * count as 2.
 	 */
-	mpw->wqe->mpw_inl.ctrl.data[1] =
-		htonl(txq->qp_num_8s | ((size + 15) / 16));
+	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
 	mpw->state = MLX5_MPW_STATE_CLOSED;
-	mpw->wqe->mpw_inl.byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
-	txq->wqe_ci += (size + (sizeof(*mpw->wqe) - 1)) / sizeof(*mpw->wqe);
+	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
+	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
 }
 
 /**
@@ -930,13 +935,13 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
 			if ((mpw.len != length) ||
 			    (segs_n != 1) ||
-			    (mpw.wqe->mpw.eseg.cs_flags != cs_flags))
+			    (mpw.wqe->eseg.cs_flags != cs_flags))
 				mlx5_mpw_close(txq, &mpw);
 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
 			if ((mpw.len != length) ||
 			    (segs_n != 1) ||
 			    (length > inline_room) ||
-			    (mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
+			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
 				mlx5_mpw_inline_close(txq, &mpw);
 				inline_room =
 					txq->max_inline * RTE_CACHE_LINE_SIZE;
@@ -946,10 +951,10 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 			if ((segs_n != 1) ||
 			    (length > inline_room)) {
 				mlx5_mpw_new(txq, &mpw, length);
-				mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+				mpw.wqe->eseg.cs_flags = cs_flags;
 			} else {
 				mlx5_mpw_inline_new(txq, &mpw, length);
-				mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
+				mpw.wqe->eseg.cs_flags = cs_flags;
 			}
 		}
 		/* Multi-segment packets must be alone in their MPW. */
@@ -1042,12 +1047,12 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 	/* "j" includes both packets and segments. */
 	comp = txq->elts_comp + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile union mlx5_wqe *wqe = mpw.wqe;
+		volatile struct mlx5_wqe *wqe = mpw.wqe;
 
 		/* Request completion on last WQE. */
-		wqe->mpw_inl.ctrl.data[2] = htonl(8);
+		wqe->ctrl[2] = htonl(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->mpw_inl.ctrl.data[3] = elts_head;
+		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
 	} else {
 		txq->elts_comp = comp;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 8c568ad..fbc2a78 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -252,7 +252,7 @@ struct txq {
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
-	volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
+	volatile struct mlx5_wqe64 (*wqes)[]; /* Work queue. */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
 	volatile void *bf_reg; /* Blueflame register. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 5ddd2fb..e8ebbbe 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -82,7 +82,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 	for (i = 0; (i != elts_n); ++i)
 		(*txq_ctrl->txq.elts)[i] = NULL;
 	for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) {
-		volatile union mlx5_wqe *wqe = &(*txq_ctrl->txq.wqes)[i];
+		volatile struct mlx5_wqe64 *wqe = &(*txq_ctrl->txq.wqes)[i];
 
 		memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
 	}
@@ -215,7 +215,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 	tmpl->txq.cqe_n = ibcq->cqe + 1;
 	tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
 	tmpl->txq.wqes =
-		(volatile union mlx5_wqe (*)[])
+		(volatile struct mlx5_wqe64 (*)[])
 		(uintptr_t)qp->gen_data.sqstart;
 	tmpl->txq.wqe_n = qp->sq.wqe_cnt;
 	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 2/6] net/mlx5: reduce Tx and Rx structure size
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
                   ` (7 preceding siblings ...)
  2016-09-14 12:18 ` [PATCH v2 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
@ 2016-09-14 12:18 ` Nelio Laranjeiro
  2016-09-14 12:18 ` [PATCH v2 3/6] " Nelio Laranjeiro
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-14 12:18 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil

The PMD only uses power-of-two numbers of descriptors. Storing this
number as its log2 reduces the size of the field needed to hold it.
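
As a rough standalone sketch of the idea (the struct and variable names
here are invented for illustration; the real change is the elts_n
bit-field added to struct rxq and struct txq below):

#include <stdint.h>
#include <stdio.h>

/* Illustration only: a 4-bit field covers log2 values up to 15,
 * i.e. up to 32768 descriptors.
 */
struct ring {
        uint16_t elts_n:4; /* number of descriptors, stored as log2 */
};

int
main(void)
{
        struct ring r = { .elts_n = 9 }; /* 2^9 = 512 descriptors */
        unsigned int count = 1u << r.elts_n; /* recover the real count */
        unsigned int mask = count - 1; /* wrap-around index mask */

        printf("count=%u mask=%#x size=%zu\n", count, mask, sizeof(r));
        return 0;
}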

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_ethdev.c |  4 ++--
 drivers/net/mlx5/mlx5_rxq.c    | 10 +++++-----
 drivers/net/mlx5/mlx5_rxtx.c   | 10 +++++-----
 drivers/net/mlx5/mlx5_rxtx.h   | 12 +++++++-----
 drivers/net/mlx5/mlx5_txq.c    |  6 +++---
 5 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 1ae80e5..137ea66 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -808,7 +808,7 @@ recover:
 		if (rehash)
 			ret = rxq_rehash(dev, rxq_ctrl);
 		else
-			ret = rxq_ctrl_setup(dev, rxq_ctrl, rxq->elts_n,
+			ret = rxq_ctrl_setup(dev, rxq_ctrl, 1 << rxq->elts_n,
 					     rxq_ctrl->socket, NULL, rxq->mp);
 		if (!ret)
 			continue;
@@ -1314,7 +1314,7 @@ mlx5_secondary_data_setup(struct priv *priv)
 		if (txq_ctrl != NULL) {
 			if (txq_ctrl_setup(priv->dev,
 					   primary_txq_ctrl,
-					   primary_txq->elts_n,
+					   1 << primary_txq->elts_n,
 					   primary_txq_ctrl->socket,
 					   NULL) == 0) {
 				txq_ctrl->txq.stats.idx =
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 44889d1..d9db368 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -723,7 +723,7 @@ rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
 	if (rxq_ctrl->rxq.elts == NULL)
 		return;
 
-	for (i = 0; (i != rxq_ctrl->rxq.elts_n); ++i) {
+	for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
 		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
 			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
 		(*rxq_ctrl->rxq.elts)[i] = NULL;
@@ -807,7 +807,7 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
 int
 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
 {
-	unsigned int elts_n = rxq_ctrl->rxq.elts_n;
+	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
 	unsigned int i;
 	struct ibv_exp_wq_attr mod;
 	int err;
@@ -870,7 +870,7 @@ rxq_setup(struct rxq_ctrl *tmpl)
 	struct ibv_cq *ibcq = tmpl->cq;
 	struct mlx5_cq *cq = to_mxxx(cq, cq);
 	struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
-	struct rte_mbuf *(*elts)[tmpl->rxq.elts_n] =
+	struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
 		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
 
 	if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
@@ -924,7 +924,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 		.priv = priv,
 		.socket = socket,
 		.rxq = {
-			.elts_n = desc,
+			.elts_n = log2above(desc),
 			.mp = mp,
 		},
 	};
@@ -1148,7 +1148,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 	}
 	/* Reuse buffers from original queue if possible. */
 	if (rxq_ctrl->rxq.elts_n) {
-		assert(rxq_ctrl->rxq.elts_n == desc);
+		assert(1 << rxq_ctrl->rxq.elts_n == desc);
 		assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
 		ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
 	} else
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 5feeb3f..e132727 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -166,8 +166,8 @@ txq_complete(struct txq *txq) __attribute__((always_inline));
 static inline void
 txq_complete(struct txq *txq)
 {
-	const unsigned int elts_n = txq->elts_n;
 	const unsigned int cqe_n = txq->cqe_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	const unsigned int cqe_cnt = cqe_n - 1;
 	uint16_t elts_free = txq->elts_tail;
 	uint16_t elts_tail;
@@ -468,7 +468,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
@@ -680,7 +680,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
@@ -884,7 +884,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
@@ -1272,8 +1272,8 @@ uint16_t
 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct rxq *rxq = dpdk_rxq;
-	const unsigned int wqe_cnt = rxq->elts_n - 1;
 	const unsigned int cqe_cnt = rxq->cqe_n - 1;
+	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
 	const unsigned int sges_n = rxq->sges_n;
 	struct rte_mbuf *pkt = NULL;
 	struct rte_mbuf *seg = NULL;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index fbc2a78..3ba3913 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -54,6 +54,7 @@
 #endif
 #include <rte_mbuf.h>
 #include <rte_mempool.h>
+#include <rte_common.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -109,16 +110,16 @@ struct rxq {
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
 	unsigned int crc_present:1; /* CRC must be subtracted. */
 	unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */
+	unsigned int elts_n:4; /* Log 2 of Mbufs. */
+	unsigned int port_id:8;
+	volatile uint32_t *rq_db;
+	volatile uint32_t *cq_db;
 	uint16_t rq_ci;
 	uint16_t cq_ci;
-	uint16_t elts_n;
 	uint16_t cqe_n; /* Number of CQ elements. */
-	uint16_t port_id;
 	volatile struct mlx5_wqe_data_seg(*wqes)[];
 	volatile struct mlx5_cqe(*cqes)[];
 	struct rxq_zip zip; /* Compressed context. */
-	volatile uint32_t *rq_db;
-	volatile uint32_t *cq_db;
 	struct rte_mbuf *(*elts)[];
 	struct rte_mempool *mp;
 	struct mlx5_rxq_stats stats;
@@ -238,15 +239,16 @@ struct hash_rxq {
 };
 
 /* TX queue descriptor. */
+RTE_STD_C11
 struct txq {
 	uint16_t elts_head; /* Current index in (*elts)[]. */
 	uint16_t elts_tail; /* First element awaiting completion. */
 	uint16_t elts_comp; /* Counter since last completion request. */
-	uint16_t elts_n; /* (*elts)[] length. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
 	uint16_t cqe_n; /* Number of CQ elements. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
 	uint16_t wqe_n; /* Number of WQ elements. */
+	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t bf_offset; /* Blueflame offset. */
 	uint16_t bf_buf_size; /* Blueflame size. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index e8ebbbe..9055016 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -101,7 +101,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 static void
 txq_free_elts(struct txq_ctrl *txq_ctrl)
 {
-	unsigned int elts_n = txq_ctrl->txq.elts_n;
+	unsigned int elts_n = 1 << txq_ctrl->txq.elts_n;
 	unsigned int elts_head = txq_ctrl->txq.elts_head;
 	unsigned int elts_tail = txq_ctrl->txq.elts_tail;
 	struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;
@@ -227,7 +227,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 		(volatile struct mlx5_cqe (*)[])
 		(uintptr_t)cq->active_buf->buf;
 	tmpl->txq.elts =
-		(struct rte_mbuf *(*)[tmpl->txq.elts_n])
+		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])
 		((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
 	return 0;
 }
@@ -277,7 +277,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 	}
 	(void)conf; /* Thresholds configuration (ignored). */
 	assert(desc > MLX5_TX_COMP_THRESH);
-	tmpl.txq.elts_n = desc;
+	tmpl.txq.elts_n = log2above(desc);
 	/* MRs will be registered in mp2mr[] later. */
 	attr.rd = (struct ibv_exp_res_domain_init_attr){
 		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 3/6] net/mlx5: reduce Tx and Rx structure size
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
                   ` (8 preceding siblings ...)
  2016-09-14 12:18 ` [PATCH v2 2/6] net/mlx5: reduce Tx and Rx structure size Nelio Laranjeiro
@ 2016-09-14 12:18 ` Nelio Laranjeiro
  2016-09-14 12:18 ` [PATCH v2 4/6] net/mlx5: reduce Tx " Nelio Laranjeiro
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-14 12:18 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil

The PMD only uses power-of-two numbers of Completion Queue Elements.
Storing this number as its log2 reduces the size of the field needed to
hold it.
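
As a hedged aside, the conversion this relies on looks roughly like the
sketch below; log2above() is the existing mlx5 helper used in the diff,
while ilog2_ceil() and the numbers here are simplified stand-ins for
illustration only:

#include <stdio.h>

/* Simplified stand-in for log2above(): smallest n such that 2^n >= v. */
static unsigned int
ilog2_ceil(unsigned int v)
{
        unsigned int n = 0;

        while ((1u << n) < v)
                ++n;
        return n;
}

int
main(void)
{
        unsigned int cqe = 4096; /* example CQ size */
        unsigned int cqe_n = ilog2_ceil(cqe); /* 12, fits in a 4-bit field */

        /* The old "cqe_n - 1" mask becomes "(1 << cqe_n) - 1". */
        printf("cqe_n=%u cqe_cnt=%#x\n", cqe_n, (1u << cqe_n) - 1);
        return 0;
}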

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxq.c  | 2 +-
 drivers/net/mlx5/mlx5_rxtx.c | 8 ++++----
 drivers/net/mlx5/mlx5_rxtx.h | 4 ++--
 drivers/net/mlx5/mlx5_txq.c  | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index d9db368..f6f4315 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -881,7 +881,7 @@ rxq_setup(struct rxq_ctrl *tmpl)
 	if (elts == NULL)
 		return ENOMEM;
 	tmpl->rxq.rq_db = rwq->rq.db;
-	tmpl->rxq.cqe_n = ibcq->cqe + 1;
+	tmpl->rxq.cqe_n = log2above(ibcq->cqe);
 	tmpl->rxq.cq_ci = 0;
 	tmpl->rxq.rq_ci = 0;
 	tmpl->rxq.cq_db = cq->dbrec;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index e132727..4f28aa9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -166,8 +166,8 @@ txq_complete(struct txq *txq) __attribute__((always_inline));
 static inline void
 txq_complete(struct txq *txq)
 {
-	const unsigned int cqe_n = txq->cqe_n;
 	const unsigned int elts_n = 1 << txq->elts_n;
+	const unsigned int cqe_n = 1 << txq->cqe_n;
 	const unsigned int cqe_cnt = cqe_n - 1;
 	uint16_t elts_free = txq->elts_tail;
 	uint16_t elts_tail;
@@ -427,9 +427,9 @@ mlx5_tx_dbrec(struct txq *txq)
 static inline void
 tx_prefetch_cqe(struct txq *txq, uint16_t ci)
 {
-	volatile struct mlx5_cqe64 *cqe;
+	volatile struct mlx5_cqe *cqe;
 
-	cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)].cqe64;
+	cqe = &(*txq->cqes)[ci & ((1 << txq->cqe_n) - 1)];
 	rte_prefetch0(cqe);
 }
 
@@ -1272,8 +1272,8 @@ uint16_t
 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct rxq *rxq = dpdk_rxq;
-	const unsigned int cqe_cnt = rxq->cqe_n - 1;
 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
+	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
 	const unsigned int sges_n = rxq->sges_n;
 	struct rte_mbuf *pkt = NULL;
 	struct rte_mbuf *seg = NULL;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3ba3913..224614e 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -110,13 +110,13 @@ struct rxq {
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
 	unsigned int crc_present:1; /* CRC must be subtracted. */
 	unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */
+	unsigned int cqe_n:4; /* Log 2 of CQ elements. */
 	unsigned int elts_n:4; /* Log 2 of Mbufs. */
 	unsigned int port_id:8;
 	volatile uint32_t *rq_db;
 	volatile uint32_t *cq_db;
 	uint16_t rq_ci;
 	uint16_t cq_ci;
-	uint16_t cqe_n; /* Number of CQ elements. */
 	volatile struct mlx5_wqe_data_seg(*wqes)[];
 	volatile struct mlx5_cqe(*cqes)[];
 	struct rxq_zip zip; /* Compressed context. */
@@ -245,10 +245,10 @@ struct txq {
 	uint16_t elts_tail; /* First element awaiting completion. */
 	uint16_t elts_comp; /* Counter since last completion request. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
-	uint16_t cqe_n; /* Number of CQ elements. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
 	uint16_t wqe_n; /* Number of WQ elements. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
+	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
 	uint16_t bf_offset; /* Blueflame offset. */
 	uint16_t bf_buf_size; /* Blueflame size. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9055016..6145b69 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -212,7 +212,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
 		return EINVAL;
 	}
-	tmpl->txq.cqe_n = ibcq->cqe + 1;
+	tmpl->txq.cqe_n = log2above(ibcq->cqe);
 	tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
 	tmpl->txq.wqes =
 		(volatile struct mlx5_wqe64 (*)[])
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 4/6] net/mlx5: reduce Tx structure size
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
                   ` (9 preceding siblings ...)
  2016-09-14 12:18 ` [PATCH v2 3/6] " Nelio Laranjeiro
@ 2016-09-14 12:18 ` Nelio Laranjeiro
  2016-09-14 12:18 ` [PATCH v2 5/6] net/mlx5: reduce Tx and Rx " Nelio Laranjeiro
  2016-09-14 12:18 ` [PATCH v2 6/6] net/mlx5: remove gather loop on segments Nelio Laranjeiro
  12 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-14 12:18 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil

Blue Flame is a buffer allocated with a power-of-two size; the size
returned by Verbs can therefore be stored as a log2 value, reducing the
size of the field that holds it.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 2 +-
 drivers/net/mlx5/mlx5_rxtx.h | 2 +-
 drivers/net/mlx5/mlx5_txq.c  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 4f28aa9..214922b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -413,7 +413,7 @@ mlx5_tx_dbrec(struct txq *txq)
 	/* Ensure ordering between DB record and BF copy. */
 	rte_wmb();
 	rte_mov16(dst, (uint8_t *)data);
-	txq->bf_offset ^= txq->bf_buf_size;
+	txq->bf_offset ^= (1 << txq->bf_buf_size);
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 224614e..3dca8ca 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -249,8 +249,8 @@ struct txq {
 	uint16_t wqe_n; /* Number of WQ elements. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
+	uint16_t bf_buf_size:4; /* Log2 Blueflame size. */
 	uint16_t bf_offset; /* Blueflame offset. */
-	uint16_t bf_buf_size; /* Blueflame size. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 6145b69..9919e37 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -221,7 +221,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
 	tmpl->txq.bf_reg = qp->gen_data.bf->reg;
 	tmpl->txq.bf_offset = qp->gen_data.bf->offset;
-	tmpl->txq.bf_buf_size = qp->gen_data.bf->buf_size;
+	tmpl->txq.bf_buf_size = log2above(qp->gen_data.bf->buf_size);
 	tmpl->txq.cq_db = cq->dbrec;
 	tmpl->txq.cqes =
 		(volatile struct mlx5_cqe (*)[])
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 5/6] net/mlx5: reduce Tx and Rx structure size
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
                   ` (10 preceding siblings ...)
  2016-09-14 12:18 ` [PATCH v2 4/6] net/mlx5: reduce Tx " Nelio Laranjeiro
@ 2016-09-14 12:18 ` Nelio Laranjeiro
  2016-09-14 12:18 ` [PATCH v2 6/6] net/mlx5: remove gather loop on segments Nelio Laranjeiro
  12 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-14 12:18 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil

The PMD only uses power-of-two numbers of Work Queue Elements. Storing
this number as its log2 reduces the size of the field needed to hold it.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 23 ++++++++++++-----------
 drivers/net/mlx5/mlx5_rxtx.h |  2 +-
 drivers/net/mlx5/mlx5_txq.c  |  4 ++--
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 214922b..9d00ddc 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -199,9 +199,10 @@ txq_complete(struct txq *txq)
 	} while (1);
 	if (unlikely(cqe == NULL))
 		return;
-	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)].hdr;
+	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) &
+			    ((1 << txq->wqe_n) - 1)].hdr;
 	elts_tail = wqe->ctrl[3];
-	assert(elts_tail < txq->wqe_n);
+	assert(elts_tail < (1 << txq->wqe_n));
 	/* Free buffers. */
 	while (elts_free != elts_tail) {
 		struct rte_mbuf *elt = (*txq->elts)[elts_free];
@@ -335,7 +336,7 @@ mlx5_wqe_write(struct txq *txq, volatile struct mlx5_wqe *wqe,
 	}
 	/* Inline if enough room. */
 	if (txq->max_inline != 0) {
-		uintptr_t end = (uintptr_t)&(*txq->wqes)[txq->wqe_n];
+		uintptr_t end = (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
 		uint16_t max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
 		uint16_t room;
 
@@ -446,7 +447,7 @@ tx_prefetch_wqe(struct txq *txq, uint16_t ci)
 {
 	volatile struct mlx5_wqe64 *wqe;
 
-	wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
+	wqe = &(*txq->wqes)[ci & ((1 << txq->wqe_n) - 1)];
 	rte_prefetch0(wqe);
 }
 
@@ -504,7 +505,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		max -= segs_n;
 		--pkts_n;
 		elts_head_next = (elts_head + 1) & (elts_n - 1);
-		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)].hdr;
+		wqe = &(*txq->wqes)[txq->wqe_ci & ((1 << txq->wqe_n) - 1)].hdr;
 		tx_prefetch_wqe(txq, txq->wqe_ci);
 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
 		if (pkts_n)
@@ -540,7 +541,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE)))
 				dseg = (volatile void *)
 					&(*txq->wqes)[txq->wqe_ci++ &
-						      (txq->wqe_n - 1)];
+						      ((1 << txq->wqe_n) - 1)];
 			else
 				++dseg;
 			++ds;
@@ -607,10 +608,10 @@ skip_segs:
 static inline void
 mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 {
-	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
 		(volatile struct mlx5_wqe_data_seg (*)[])
-		(uintptr_t)&(*txq->wqes)[(idx + 1) & (txq->wqe_n - 1)];
+		(uintptr_t)&(*txq->wqes)[(idx + 1) & ((1 << txq->wqe_n) - 1)];
 
 	mpw->state = MLX5_MPW_STATE_OPENED;
 	mpw->pkts_n = 0;
@@ -815,7 +816,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 static inline void
 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 {
-	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
 	struct mlx5_wqe_inl_small *inl;
 
 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
@@ -1000,7 +1001,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			(*txq->elts)[elts_head] = buf;
 			/* Maximum number of bytes before wrapping. */
-			max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
+			max = ((uintptr_t)&(*txq->wqes)[1 << txq->wqe_n] -
 			       (uintptr_t)mpw.data.raw);
 			if (length > max) {
 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
@@ -1019,7 +1020,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 				mpw.data.raw += length;
 			}
 			if ((uintptr_t)mpw.data.raw ==
-			    (uintptr_t)&(*txq->wqes)[txq->wqe_n])
+			    (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n])
 				mpw.data.raw =
 					(volatile void *)&(*txq->wqes)[0];
 			++mpw.pkts_n;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3dca8ca..9828aef 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -246,9 +246,9 @@ struct txq {
 	uint16_t elts_comp; /* Counter since last completion request. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
-	uint16_t wqe_n; /* Number of WQ elements. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
+	uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */
 	uint16_t bf_buf_size:4; /* Log2 Blueflame size. */
 	uint16_t bf_offset; /* Blueflame offset. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9919e37..3d2d132 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -81,7 +81,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 
 	for (i = 0; (i != elts_n); ++i)
 		(*txq_ctrl->txq.elts)[i] = NULL;
-	for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) {
+	for (i = 0; (i != (1u << txq_ctrl->txq.wqe_n)); ++i) {
 		volatile struct mlx5_wqe64 *wqe = &(*txq_ctrl->txq.wqes)[i];
 
 		memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
@@ -217,7 +217,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 	tmpl->txq.wqes =
 		(volatile struct mlx5_wqe64 (*)[])
 		(uintptr_t)qp->gen_data.sqstart;
-	tmpl->txq.wqe_n = qp->sq.wqe_cnt;
+	tmpl->txq.wqe_n = log2above(qp->sq.wqe_cnt);
 	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
 	tmpl->txq.bf_reg = qp->gen_data.bf->reg;
 	tmpl->txq.bf_offset = qp->gen_data.bf->offset;
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 6/6] net/mlx5: remove gather loop on segments
  2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
                   ` (11 preceding siblings ...)
  2016-09-14 12:18 ` [PATCH v2 5/6] net/mlx5: reduce Tx and Rx " Nelio Laranjeiro
@ 2016-09-14 12:18 ` Nelio Laranjeiro
  12 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-14 12:18 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil, Vasily Philipov

The Tx function used a double loop to send segmented packets; the same
work can be done in a single loop.
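
As a toy illustration of the reworked control flow only (the struct and
the DS accounting below are stand-ins, not the real rte_mbuf or WQE
handling; the actual code is in mlx5_tx_burst() in the diff):

#include <stdio.h>

/* Toy mbuf chain: only the fields the gather loop needs. */
struct mbuf {
        struct mbuf *next;
        unsigned int nb_segs;
};

int
main(void)
{
        struct mbuf seg1 = { NULL, 1 };
        struct mbuf seg0 = { &seg1, 2 }; /* one 2-segment packet */
        struct mbuf *pkts[] = { &seg0 };
        struct mbuf **p = pkts;
        unsigned int pkts_n = 1;
        unsigned int ds;

        /* One loop: extra segments re-enter it through next_seg. */
        do {
                struct mbuf *buf = *p++;
                unsigned int segs_n = buf->nb_segs - 1;

                /* WQE header + first data segment (simplified count). */
                ds = 3;
                if (!segs_n)
                        goto next_pkt;
next_seg:
                buf = buf->next;
                ++ds; /* one extra data segment per extra mbuf segment */
                if (--segs_n)
                        goto next_seg;
next_pkt:
                printf("packet consumed %u DS\n", ds);
        } while (--pkts_n);
        return 0;
}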

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Vasily Philipov <vasilyf@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 312 ++++++++++++++++++++++---------------------
 1 file changed, 158 insertions(+), 154 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 9d00ddc..b91b644 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -288,112 +288,6 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
 }
 
 /**
- * Write a regular WQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param wqe
- *   Pointer to the WQE to fill.
- * @param buf
- *   Buffer.
- * @param length
- *   Packet length.
- *
- * @return ds
- *   Number of DS elements consumed.
- */
-static inline unsigned int
-mlx5_wqe_write(struct txq *txq, volatile struct mlx5_wqe *wqe,
-	       struct rte_mbuf *buf, uint32_t length)
-{
-	uint8_t *raw = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
-	uint16_t ds;
-	uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
-	uintptr_t addr = rte_pktmbuf_mtod(buf, uintptr_t);
-	struct mlx5_wqe_data_seg *dseg = NULL;
-
-	assert(length >= MLX5_WQE_DWORD_SIZE);
-	/* Start the know and common part of the WQE structure. */
-	wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
-	wqe->ctrl[2] = 0;
-	wqe->ctrl[3] = 0;
-	wqe->eseg.rsvd0 = 0;
-	wqe->eseg.rsvd1 = 0;
-	wqe->eseg.mss = 0;
-	wqe->eseg.rsvd2 = 0;
-	/* Start by copying the Ethernet Header. */
-	rte_mov16((uint8_t *)raw, (uint8_t *)addr);
-	length -= MLX5_WQE_DWORD_SIZE;
-	addr += MLX5_WQE_DWORD_SIZE;
-	/* Replace the Ethernet type by the VLAN if necessary. */
-	if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-		uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
-
-		memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE - sizeof(vlan)),
-		       &vlan, sizeof(vlan));
-		addr -= sizeof(vlan);
-		length += sizeof(vlan);
-	}
-	/* Inline if enough room. */
-	if (txq->max_inline != 0) {
-		uintptr_t end = (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
-		uint16_t max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
-		uint16_t room;
-
-		raw += MLX5_WQE_DWORD_SIZE;
-		room = end - (uintptr_t)raw;
-		if (room > max_inline) {
-			uintptr_t addr_end = (addr + max_inline) &
-				~(RTE_CACHE_LINE_SIZE - 1);
-			uint16_t copy_b = ((addr_end - addr) > length) ?
-					  length :
-					  (addr_end - addr);
-
-			rte_memcpy((void *)raw, (void *)addr, copy_b);
-			addr += copy_b;
-			length -= copy_b;
-			pkt_inline_sz += copy_b;
-			/* Sanity check. */
-			assert(addr <= addr_end);
-		}
-		/* Store the inlined packet size in the WQE. */
-		wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
-		/*
-		 * 2 DWORDs consumed by the WQE header + 1 DSEG +
-		 * the size of the inline part of the packet.
-		 */
-		ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
-		if (length > 0) {
-			dseg = (struct mlx5_wqe_data_seg *)
-				((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
-			if ((uintptr_t)dseg >= end)
-				dseg = (struct mlx5_wqe_data_seg *)
-					((uintptr_t)&(*txq->wqes)[0]);
-			goto use_dseg;
-		}
-	} else {
-		/* Add the remaining packet as a simple ds. */
-		ds = 3;
-		/*
-		 * No inline has been done in the packet, only the Ethernet
-		 * Header as been stored.
-		 */
-		wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
-		dseg = (struct mlx5_wqe_data_seg *)
-			((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
-use_dseg:
-		*dseg = (struct mlx5_wqe_data_seg) {
-			.addr = htonll(addr),
-			.byte_count = htonl(length),
-			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
-		};
-		++ds;
-	}
-	wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
-	return ds;
-}
-
-/**
  * Ring TX queue doorbell.
  *
  * @param txq
@@ -475,6 +369,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int max;
 	unsigned int comp;
 	volatile struct mlx5_wqe *wqe = NULL;
+	unsigned int segs_n = 0;
+	struct rte_mbuf *buf = NULL;
+	uint8_t *raw;
 
 	if (unlikely(!pkts_n))
 		return 0;
@@ -488,13 +385,17 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (max > elts_n)
 		max -= elts_n;
 	do {
-		struct rte_mbuf *buf = *(pkts++);
-		unsigned int elts_head_next;
+		volatile struct mlx5_wqe_data_seg *dseg = NULL;
 		uint32_t length;
-		unsigned int segs_n = buf->nb_segs;
-		volatile struct mlx5_wqe_data_seg *dseg;
 		unsigned int ds = 0;
+		uintptr_t addr;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		uint32_t total_length = 0;
+#endif
 
+		/* first_seg */
+		buf = *(pkts++);
+		segs_n = buf->nb_segs;
 		/*
 		 * Make sure there is enough room to store this packet and
 		 * that one ring entry remains unused.
@@ -503,20 +404,30 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (max < segs_n + 1)
 			break;
 		max -= segs_n;
-		--pkts_n;
-		elts_head_next = (elts_head + 1) & (elts_n - 1);
-		wqe = &(*txq->wqes)[txq->wqe_ci & ((1 << txq->wqe_n) - 1)].hdr;
-		tx_prefetch_wqe(txq, txq->wqe_ci);
+		--segs_n;
+		if (!segs_n)
+			--pkts_n;
+		wqe = &(*txq->wqes)[txq->wqe_ci &
+				    ((1 << txq->wqe_n) - 1)].hdr;
 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
-		if (pkts_n)
+		if (pkts_n > 1)
 			rte_prefetch0(*pkts);
+		addr = rte_pktmbuf_mtod(buf, uintptr_t);
 		length = DATA_LEN(buf);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		total_length = length;
+#endif
+		assert(length >= MLX5_WQE_DWORD_SIZE);
 		/* Update element. */
 		(*txq->elts)[elts_head] = buf;
+		elts_head = (elts_head + 1) & (elts_n - 1);
 		/* Prefetch next buffer data. */
-		if (pkts_n)
-			rte_prefetch0(rte_pktmbuf_mtod(*pkts,
-						       volatile void *));
+		if (pkts_n > 1) {
+			volatile void *pkt_addr;
+
+			pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *);
+			rte_prefetch0(pkt_addr);
+		}
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -526,50 +437,143 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		} else {
 			wqe->eseg.cs_flags = 0;
 		}
-		ds = mlx5_wqe_write(txq, wqe, buf, length);
-		if (segs_n == 1)
-			goto skip_segs;
-		dseg = (volatile struct mlx5_wqe_data_seg *)
-			(((uintptr_t)wqe) + ds * MLX5_WQE_DWORD_SIZE);
-		while (--segs_n) {
+		raw  = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
+		/* Start the know and common part of the WQE structure. */
+		wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+		wqe->ctrl[2] = 0;
+		wqe->ctrl[3] = 0;
+		wqe->eseg.rsvd0 = 0;
+		wqe->eseg.rsvd1 = 0;
+		wqe->eseg.mss = 0;
+		wqe->eseg.rsvd2 = 0;
+		/* Start by copying the Ethernet Header. */
+		rte_mov16((uint8_t *)raw, (uint8_t *)addr);
+		length -= MLX5_WQE_DWORD_SIZE;
+		addr += MLX5_WQE_DWORD_SIZE;
+		/* Replace the Ethernet type by the VLAN if necessary. */
+		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
+			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
+
+			memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE -
+					   sizeof(vlan)),
+			       &vlan, sizeof(vlan));
+			addr -= sizeof(vlan);
+			length += sizeof(vlan);
+		}
+		/* Inline if enough room. */
+		if (txq->max_inline != 0) {
+			uintptr_t end =
+				(uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
+			uint16_t max_inline =
+				txq->max_inline * RTE_CACHE_LINE_SIZE;
+			uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
+			uint16_t room;
+
+			raw += MLX5_WQE_DWORD_SIZE;
+			room = end - (uintptr_t)raw;
+			if (room > max_inline) {
+				uintptr_t addr_end = (addr + max_inline) &
+					~(RTE_CACHE_LINE_SIZE - 1);
+				uint16_t copy_b = ((addr_end - addr) > length) ?
+						  length :
+						  (addr_end - addr);
+
+				rte_memcpy((void *)raw, (void *)addr, copy_b);
+				addr += copy_b;
+				length -= copy_b;
+				pkt_inline_sz += copy_b;
+				/* Sanity check. */
+				assert(addr <= addr_end);
+			}
+			/* Store the inlined packet size in the WQE. */
+			wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
+			/*
+			 * 2 DWORDs consumed by the WQE header + 1 DSEG +
+			 * the size of the inline part of the packet.
+			 */
+			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
+			if (length > 0) {
+				dseg = (struct mlx5_wqe_data_seg *)
+					((uintptr_t)wqe +
+					 (ds * MLX5_WQE_DWORD_SIZE));
+				if ((uintptr_t)dseg >= end)
+					dseg = (struct mlx5_wqe_data_seg *)
+						((uintptr_t)&(*txq->wqes)[0]);
+				goto use_dseg;
+			} else if (!segs_n) {
+				goto next_pkt;
+			} else {
+				goto next_seg;
+			}
+		} else {
 			/*
-			 * Spill on next WQE when the current one does not have
-			 * enough room left. Size of WQE must a be a multiple
-			 * of data segment size.
+			 * No inline has been done in the packet, only the
+			 * Ethernet Header as been stored.
 			 */
-			assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
-			if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE)))
-				dseg = (volatile void *)
-					&(*txq->wqes)[txq->wqe_ci++ &
-						      ((1 << txq->wqe_n) - 1)];
-			else
-				++dseg;
+			wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
+			dseg = (struct mlx5_wqe_data_seg *)
+				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
+			ds = 3;
+use_dseg:
+			/* Add the remaining packet as a simple ds. */
+			*dseg = (struct mlx5_wqe_data_seg) {
+				.addr = htonll(addr),
+				.byte_count = htonl(length),
+				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+			};
 			++ds;
-			buf = buf->next;
-			assert(buf);
-			/* Store segment information. */
-			dseg->byte_count = htonl(DATA_LEN(buf));
-			dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-			dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
-			(*txq->elts)[elts_head_next] = buf;
-			elts_head_next = (elts_head_next + 1) & (elts_n - 1);
+			if (!segs_n)
+				goto next_pkt;
+		}
+next_seg:
+		assert(buf);
+		assert(ds);
+		assert(wqe);
+		/*
+		 * Spill on next WQE when the current one does not have
+		 * enough room left. Size of WQE must a be a multiple
+		 * of data segment size.
+		 */
+		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
+		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
+			unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
+				((1 << txq->wqe_n) - 1);
+
+			dseg = (struct mlx5_wqe_data_seg *)
+				((uintptr_t)&(*txq->wqes)[n]);
+			tx_prefetch_wqe(txq, n + 1);
+		} else {
+			++dseg;
+		}
+		++ds;
+		buf = buf->next;
+		assert(buf);
+		length = DATA_LEN(buf);
 #ifdef MLX5_PMD_SOFT_COUNTERS
-			length += DATA_LEN(buf);
+		total_length += length;
 #endif
-			++j;
-		}
-		/* Update DS field in WQE. */
-		wqe->ctrl[1] &= htonl(0xffffffc0);
-		wqe->ctrl[1] |= htonl(ds & 0x3f);
-skip_segs:
+		/* Store segment information. */
+		*dseg = (struct mlx5_wqe_data_seg) {
+			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+			.byte_count = htonl(length),
+			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+		};
+		(*txq->elts)[elts_head] = buf;
+		elts_head = (elts_head + 1) & (elts_n - 1);
+		++j;
+		--segs_n;
+		if (segs_n)
+			goto next_seg;
+		else
+			--pkts_n;
+next_pkt:
+		++i;
+		wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
+		txq->wqe_ci += (ds + 3) / 4;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
+		txq->stats.obytes += total_length;
 #endif
-		/* Increment consumer index. */
-		txq->wqe_ci += (ds + 3) / 4;
-		elts_head = elts_head_next;
-		++i;
 	} while (pkts_n);
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/6] net/mlx5: rework hardware structures
  2016-09-14 12:18 ` [PATCH v2 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
@ 2016-09-19 16:14   ` Bruce Richardson
  2016-09-20  7:09     ` Nélio Laranjeiro
  0 siblings, 1 reply; 26+ messages in thread
From: Bruce Richardson @ 2016-09-19 16:14 UTC (permalink / raw)
  To: Nelio Laranjeiro; +Cc: dev, Adrien Mazarguil

On Wed, Sep 14, 2016 at 02:18:02PM +0200, Nelio Laranjeiro wrote:
> Rework Work Queue Element (aka WQE) structures to fit PMD needs.
> A WQE is an aggregation of 16 bytes elements known as "data segments"
> (aka dseg).
> 
> Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Hi Nelio,

can you give a bit more detail in the commit message here. What are the "PMD
needs" that must be met, and how is the WQE reworked to meet them?

Thanks,
/Bruce

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 0/6] net/mlx5: performance improvement
  2016-09-14 12:18 ` [PATCH v2 0/6] net/mlx5: performance improvement Nelio Laranjeiro
@ 2016-09-19 16:17   ` Bruce Richardson
  2016-09-20  7:25     ` Nélio Laranjeiro
  2016-09-20  8:53   ` [PATCH v3 " Nelio Laranjeiro
  1 sibling, 1 reply; 26+ messages in thread
From: Bruce Richardson @ 2016-09-19 16:17 UTC (permalink / raw)
  To: Nelio Laranjeiro; +Cc: dev, Adrien Mazarguil

On Wed, Sep 14, 2016 at 02:18:01PM +0200, Nelio Laranjeiro wrote:
>  - Rework structure elements to reduce their size.
>  - Removes a second useless loop in Tx burst function.
> 
> This series should be applied on top of "net/mlx5: various fixes".
> 
> Changes in v2:
> 
>  - rework serie to apply it on top of "net/mlx5: various fixes".
> 
> Nelio Laranjeiro (6):
>   net/mlx5: rework hardware structures
>   net/mlx5: reduce Tx and Rx structure size
>   net/mlx5: reduce Tx and Rx structure size
>   net/mlx5: reduce Tx structure size
>   net/mlx5: reduce Tx and Rx structure size
>   net/mlx5: remove gather loop on segments
> 
Three of the patches in this set have the same title, which seems a lot.
Can you expand the titles in some way to make them a little different and help
those browsing the commit short-logs?

/Bruce

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/6] net/mlx5: rework hardware structures
  2016-09-19 16:14   ` Bruce Richardson
@ 2016-09-20  7:09     ` Nélio Laranjeiro
  0 siblings, 0 replies; 26+ messages in thread
From: Nélio Laranjeiro @ 2016-09-20  7:09 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev, Adrien Mazarguil

On Mon, Sep 19, 2016 at 05:14:26PM +0100, Bruce Richardson wrote:
> On Wed, Sep 14, 2016 at 02:18:02PM +0200, Nelio Laranjeiro wrote:
> > Rework Work Queue Element (aka WQE) structures to fit PMD needs.
> > A WQE is an aggregation of 16 bytes elements known as "data segments"
> > (aka dseg).
> > 
> > Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> 
> Hi Nelio,
> 
> can you give a bit more detail in the commit message here. What are the "PMD
> needs" that must be met, and how is the WQE reworked to meet them?
> 
> Thanks,
> /Bruce

Sure, I will try to explain it better in v3.

Regards,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 0/6] net/mlx5: performance improvement
  2016-09-19 16:17   ` Bruce Richardson
@ 2016-09-20  7:25     ` Nélio Laranjeiro
  0 siblings, 0 replies; 26+ messages in thread
From: Nélio Laranjeiro @ 2016-09-20  7:25 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev, Adrien Mazarguil

On Mon, Sep 19, 2016 at 05:17:34PM +0100, Bruce Richardson wrote:
> On Wed, Sep 14, 2016 at 02:18:01PM +0200, Nelio Laranjeiro wrote:
> >  - Rework structure elements to reduce their size.
> >  - Removes a second useless loop in Tx burst function.
> > 
> > This series should be applied on top of "net/mlx5: various fixes".
> > 
> > Changes in v2:
> > 
> >  - rework serie to apply it on top of "net/mlx5: various fixes".
> > 
> > Nelio Laranjeiro (6):
> >   net/mlx5: rework hardware structures
> >   net/mlx5: reduce Tx and Rx structure size
> >   net/mlx5: reduce Tx and Rx structure size
> >   net/mlx5: reduce Tx structure size
> >   net/mlx5: reduce Tx and Rx structure size
> >   net/mlx5: remove gather loop on segments
> > 
> Three of the patches in this set have the same title, which seems a lot.
> Can you expand the titles in some way to make them a little different and help
> those browsing the commit short-logs?
> 
> /Bruce

Sure, it will be updated in v3.

Regards,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 0/6] net/mlx5: performance improvement
  2016-09-14 12:18 ` [PATCH v2 0/6] net/mlx5: performance improvement Nelio Laranjeiro
  2016-09-19 16:17   ` Bruce Richardson
@ 2016-09-20  8:53   ` Nelio Laranjeiro
  2016-09-20  8:53     ` [PATCH v3 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
                       ` (6 more replies)
  1 sibling, 7 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-20  8:53 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil, Bruce Richardson

 - Rework structure elements to reduce their size.
 - Remove a second useless loop from the Tx burst function.

This series should be applied on top of "net/mlx5: various fixes".

Changes in v2:

 - Rework the series to apply it on top of "net/mlx5: various fixes".

Changes in v3:

 - Rework some commit logs.

Nelio Laranjeiro (6):
  net/mlx5: rework hardware structures
  net/mlx5: reduce memory overhead of Rx/Tx descriptors
  net/mlx5: reduce memory overhead for CQE handling
  net/mlx5: reduce memory overhead for BF handling
  net/mlx5: reduce memory overhead for WQE handling
  net/mlx5: remove gather loop on segments

 drivers/net/mlx5/mlx5_ethdev.c |   4 +-
 drivers/net/mlx5/mlx5_prm.h    |  70 ++-----
 drivers/net/mlx5/mlx5_rxq.c    |  12 +-
 drivers/net/mlx5/mlx5_rxtx.c   | 460 +++++++++++++++++++++--------------------
 drivers/net/mlx5/mlx5_rxtx.h   |  22 +-
 drivers/net/mlx5/mlx5_txq.c    |  18 +-
 6 files changed, 286 insertions(+), 300 deletions(-)

-- 
2.1.4

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 1/6] net/mlx5: rework hardware structures
  2016-09-20  8:53   ` [PATCH v3 " Nelio Laranjeiro
@ 2016-09-20  8:53     ` Nelio Laranjeiro
  2016-09-20  8:53     ` [PATCH v3 2/6] net/mlx5: reduce memory overhead of Rx/Tx descriptors Nelio Laranjeiro
                       ` (5 subsequent siblings)
  6 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-20  8:53 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil, Bruce Richardson

Rework Work Queue Element (aka WQE) structures to fit PMD needs.
A WQE is an aggregation of 16-byte elements known as "data segments"
(aka dseg).

The only common part is the first two elements, i.e. the control segment
which defines the job type, and the Ethernet segment which embeds offload
requests along with other information. After that, a WQE can contain:
  - a raw data packet,
  - a data pointer to the packet itself,
  - or both, as sketched below.
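
A minimal stand-alone C sketch of this layout (simplified stand-ins,
not the actual driver definitions; names, fields and sizes are only
indicative):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the driver structures. */
struct ctrl_seg {
    uint32_t data[4];            /* job type, DS count, completion flags */
};

struct eth_seg {
    uint8_t data[16];            /* offload requests + start of inlined header */
};

struct data_seg {
    uint32_t byte_count;         /* length of the pointed-to data */
    uint32_t lkey;               /* memory region key */
    uint64_t addr;               /* address of the packet data */
};

/* A 64-byte WQE: the common part first, then raw inline bytes and/or
 * pointers to the packet. */
struct wqe64 {
    struct ctrl_seg ctrl;
    struct eth_seg eseg;
    union {
        uint8_t raw[32];         /* inlined packet bytes */
        struct data_seg dseg[2]; /* scatter/gather entries */
    } u;
};

int main(void)
{
    printf("sizeof(struct wqe64) = %zu\n", sizeof(struct wqe64));
    return 0;
}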

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_prm.h  |  70 ++++++------------
 drivers/net/mlx5/mlx5_rxtx.c | 167 ++++++++++++++++++++++---------------------
 drivers/net/mlx5/mlx5_rxtx.h |   2 +-
 drivers/net/mlx5/mlx5_txq.c  |   4 +-
 4 files changed, 111 insertions(+), 132 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 5db219b..042562c 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -65,8 +65,15 @@
 /* Maximum number of packets a multi-packet WQE can handle. */
 #define MLX5_MPW_DSEG_MAX 5
 
-/* Room for inline data in regular work queue element. */
-#define MLX5_WQE64_INL_DATA 12
+/* WQE DWORD size */
+#define MLX5_WQE_DWORD_SIZE 16
+
+/* WQE size */
+#define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
+
+/* Compute the number of DS. */
+#define MLX5_WQE_DS(n) \
+	(((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)
 
 /* Room for inline data in multi-packet WQE. */
 #define MLX5_MWQE64_INL_DATA 28
@@ -79,59 +86,26 @@ struct mlx5_wqe_eth_seg_small {
 	uint16_t mss;
 	uint32_t rsvd2;
 	uint16_t inline_hdr_sz;
+	uint8_t inline_hdr[2];
 };
 
-/* Regular WQE. */
-struct mlx5_wqe_regular {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
-	struct mlx5_wqe_eth_seg eseg;
-	struct mlx5_wqe_data_seg dseg;
-} __rte_aligned(64);
-
-/* Inline WQE. */
-struct mlx5_wqe_inl {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
-	struct mlx5_wqe_eth_seg eseg;
+struct mlx5_wqe_inl_small {
 	uint32_t byte_cnt;
-	uint8_t data[MLX5_WQE64_INL_DATA];
-} __rte_aligned(64);
+	uint8_t raw;
+};
 
-/* Multi-packet WQE. */
-struct mlx5_wqe_mpw {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
+/* Small common part of the WQE. */
+struct mlx5_wqe {
+	uint32_t ctrl[4];
 	struct mlx5_wqe_eth_seg_small eseg;
-	struct mlx5_wqe_data_seg dseg[2];
-} __rte_aligned(64);
+};
 
-/* Multi-packet WQE with inline. */
-struct mlx5_wqe_mpw_inl {
-	union {
-		struct mlx5_wqe_ctrl_seg ctrl;
-		uint32_t data[4];
-	} ctrl;
-	struct mlx5_wqe_eth_seg_small eseg;
-	uint32_t byte_cnt;
-	uint8_t data[MLX5_MWQE64_INL_DATA];
+/* WQE. */
+struct mlx5_wqe64 {
+	struct mlx5_wqe hdr;
+	uint8_t raw[32];
 } __rte_aligned(64);
 
-/* Union of all WQE types. */
-union mlx5_wqe {
-	struct mlx5_wqe_regular wqe;
-	struct mlx5_wqe_inl inl;
-	struct mlx5_wqe_mpw mpw;
-	struct mlx5_wqe_mpw_inl mpw_inl;
-	uint8_t data[64];
-};
-
 /* MPW session status. */
 enum mlx5_mpw_state {
 	MLX5_MPW_STATE_OPENED,
@@ -145,7 +119,7 @@ struct mlx5_mpw {
 	unsigned int pkts_n;
 	unsigned int len;
 	unsigned int total_len;
-	volatile union mlx5_wqe *wqe;
+	volatile struct mlx5_wqe *wqe;
 	union {
 		volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX];
 		volatile uint8_t *raw;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index ecc76ad..5feeb3f 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -173,7 +173,7 @@ txq_complete(struct txq *txq)
 	uint16_t elts_tail;
 	uint16_t cq_ci = txq->cq_ci;
 	volatile struct mlx5_cqe64 *cqe = NULL;
-	volatile union mlx5_wqe *wqe;
+	volatile struct mlx5_wqe *wqe;
 
 	do {
 		volatile struct mlx5_cqe64 *tmp;
@@ -199,8 +199,8 @@ txq_complete(struct txq *txq)
 	} while (1);
 	if (unlikely(cqe == NULL))
 		return;
-	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)];
-	elts_tail = wqe->wqe.ctrl.data[3];
+	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)].hdr;
+	elts_tail = wqe->ctrl[3];
 	assert(elts_tail < txq->wqe_n);
 	/* Free buffers. */
 	while (elts_free != elts_tail) {
@@ -302,33 +302,33 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
  *   Number of DS elements consumed.
  */
 static inline unsigned int
-mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
+mlx5_wqe_write(struct txq *txq, volatile struct mlx5_wqe *wqe,
 	       struct rte_mbuf *buf, uint32_t length)
 {
-	uintptr_t raw = (uintptr_t)&wqe->wqe.eseg.inline_hdr_start;
+	uint8_t *raw = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
 	uint16_t ds;
-	uint16_t pkt_inline_sz = 16;
+	uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
 	uintptr_t addr = rte_pktmbuf_mtod(buf, uintptr_t);
 	struct mlx5_wqe_data_seg *dseg = NULL;
 
-	assert(length >= 16);
+	assert(length >= MLX5_WQE_DWORD_SIZE);
 	/* Start the know and common part of the WQE structure. */
-	wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
-	wqe->wqe.ctrl.data[2] = 0;
-	wqe->wqe.ctrl.data[3] = 0;
-	wqe->wqe.eseg.rsvd0 = 0;
-	wqe->wqe.eseg.rsvd1 = 0;
-	wqe->wqe.eseg.mss = 0;
-	wqe->wqe.eseg.rsvd2 = 0;
+	wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+	wqe->ctrl[2] = 0;
+	wqe->ctrl[3] = 0;
+	wqe->eseg.rsvd0 = 0;
+	wqe->eseg.rsvd1 = 0;
+	wqe->eseg.mss = 0;
+	wqe->eseg.rsvd2 = 0;
 	/* Start by copying the Ethernet Header. */
 	rte_mov16((uint8_t *)raw, (uint8_t *)addr);
-	length -= 16;
-	addr += 16;
+	length -= MLX5_WQE_DWORD_SIZE;
+	addr += MLX5_WQE_DWORD_SIZE;
 	/* Replace the Ethernet type by the VLAN if necessary. */
 	if (buf->ol_flags & PKT_TX_VLAN_PKT) {
 		uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
 
-		memcpy((uint8_t *)(raw + 16 - sizeof(vlan)),
+		memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE - sizeof(vlan)),
 		       &vlan, sizeof(vlan));
 		addr -= sizeof(vlan);
 		length += sizeof(vlan);
@@ -339,7 +339,7 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
 		uint16_t max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
 		uint16_t room;
 
-		raw += 16;
+		raw += MLX5_WQE_DWORD_SIZE;
 		room = end - (uintptr_t)raw;
 		if (room > max_inline) {
 			uintptr_t addr_end = (addr + max_inline) &
@@ -356,15 +356,15 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
 			assert(addr <= addr_end);
 		}
 		/* Store the inlined packet size in the WQE. */
-		wqe->wqe.eseg.inline_hdr_sz = htons(pkt_inline_sz);
+		wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
 		/*
 		 * 2 DWORDs consumed by the WQE header + 1 DSEG +
 		 * the size of the inline part of the packet.
 		 */
-		ds = 2 + ((pkt_inline_sz - 2 + 15) / 16);
+		ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
 		if (length > 0) {
 			dseg = (struct mlx5_wqe_data_seg *)
-				((uintptr_t)wqe + (ds * 16));
+				((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
 			if ((uintptr_t)dseg >= end)
 				dseg = (struct mlx5_wqe_data_seg *)
 					((uintptr_t)&(*txq->wqes)[0]);
@@ -377,9 +377,9 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
 		 * No inline has been done in the packet, only the Ethernet
 		 * Header as been stored.
 		 */
-		wqe->wqe.eseg.inline_hdr_sz = htons(16);
+		wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
 		dseg = (struct mlx5_wqe_data_seg *)
-			((uintptr_t)wqe + (ds * 16));
+			((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
 use_dseg:
 		*dseg = (struct mlx5_wqe_data_seg) {
 			.addr = htonll(addr),
@@ -388,7 +388,7 @@ use_dseg:
 		};
 		++ds;
 	}
-	wqe->wqe.ctrl.data[1] = htonl(txq->qp_num_8s | ds);
+	wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
 	return ds;
 }
 
@@ -444,7 +444,7 @@ tx_prefetch_cqe(struct txq *txq, uint16_t ci)
 static inline void
 tx_prefetch_wqe(struct txq *txq, uint16_t ci)
 {
-	volatile union mlx5_wqe *wqe;
+	volatile struct mlx5_wqe64 *wqe;
 
 	wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
 	rte_prefetch0(wqe);
@@ -473,7 +473,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int j = 0;
 	unsigned int max;
 	unsigned int comp;
-	volatile union mlx5_wqe *wqe = NULL;
+	volatile struct mlx5_wqe *wqe = NULL;
 
 	if (unlikely(!pkts_n))
 		return 0;
@@ -492,7 +492,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		uint32_t length;
 		unsigned int segs_n = buf->nb_segs;
 		volatile struct mlx5_wqe_data_seg *dseg;
-		unsigned int ds = sizeof(*wqe) / 16;
+		unsigned int ds = 0;
 
 		/*
 		 * Make sure there is enough room to store this packet and
@@ -504,7 +504,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		max -= segs_n;
 		--pkts_n;
 		elts_head_next = (elts_head + 1) & (elts_n - 1);
-		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)].hdr;
 		tx_prefetch_wqe(txq, txq->wqe_ci);
 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
 		if (pkts_n)
@@ -519,25 +519,25 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
-			wqe->wqe.eseg.cs_flags =
+			wqe->eseg.cs_flags =
 				MLX5_ETH_WQE_L3_CSUM |
 				MLX5_ETH_WQE_L4_CSUM;
 		} else {
-			wqe->wqe.eseg.cs_flags = 0;
+			wqe->eseg.cs_flags = 0;
 		}
 		ds = mlx5_wqe_write(txq, wqe, buf, length);
 		if (segs_n == 1)
 			goto skip_segs;
 		dseg = (volatile struct mlx5_wqe_data_seg *)
-			(((uintptr_t)wqe) + ds * 16);
+			(((uintptr_t)wqe) + ds * MLX5_WQE_DWORD_SIZE);
 		while (--segs_n) {
 			/*
 			 * Spill on next WQE when the current one does not have
 			 * enough room left. Size of WQE must a be a multiple
 			 * of data segment size.
 			 */
-			assert(!(sizeof(*wqe) % sizeof(*dseg)));
-			if (!(ds % (sizeof(*wqe) / 16)))
+			assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
+			if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE)))
 				dseg = (volatile void *)
 					&(*txq->wqes)[txq->wqe_ci++ &
 						      (txq->wqe_n - 1)];
@@ -558,8 +558,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			++j;
 		}
 		/* Update DS field in WQE. */
-		wqe->wqe.ctrl.data[1] &= htonl(0xffffffc0);
-		wqe->wqe.ctrl.data[1] |= htonl(ds & 0x3f);
+		wqe->ctrl[1] &= htonl(0xffffffc0);
+		wqe->ctrl[1] |= htonl(ds & 0x3f);
 skip_segs:
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
@@ -577,9 +577,9 @@ skip_segs:
 	comp = txq->elts_comp + i + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
 		/* Request completion on last WQE. */
-		wqe->wqe.ctrl.data[2] = htonl(8);
+		wqe->ctrl[2] = htonl(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->wqe.ctrl.data[3] = elts_head;
+		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
 	} else {
 		txq->elts_comp = comp;
@@ -616,19 +616,20 @@ mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 	mpw->pkts_n = 0;
 	mpw->len = length;
 	mpw->total_len = 0;
-	mpw->wqe = &(*txq->wqes)[idx];
-	mpw->wqe->mpw.eseg.mss = htons(length);
-	mpw->wqe->mpw.eseg.inline_hdr_sz = 0;
-	mpw->wqe->mpw.eseg.rsvd0 = 0;
-	mpw->wqe->mpw.eseg.rsvd1 = 0;
-	mpw->wqe->mpw.eseg.rsvd2 = 0;
-	mpw->wqe->mpw.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
-					   (txq->wqe_ci << 8) |
-					   MLX5_OPCODE_LSO_MPW);
-	mpw->wqe->mpw.ctrl.data[2] = 0;
-	mpw->wqe->mpw.ctrl.data[3] = 0;
-	mpw->data.dseg[0] = &mpw->wqe->mpw.dseg[0];
-	mpw->data.dseg[1] = &mpw->wqe->mpw.dseg[1];
+	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
+	mpw->wqe->eseg.mss = htons(length);
+	mpw->wqe->eseg.inline_hdr_sz = 0;
+	mpw->wqe->eseg.rsvd0 = 0;
+	mpw->wqe->eseg.rsvd1 = 0;
+	mpw->wqe->eseg.rsvd2 = 0;
+	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+				  (txq->wqe_ci << 8) | MLX5_OPCODE_LSO_MPW);
+	mpw->wqe->ctrl[2] = 0;
+	mpw->wqe->ctrl[3] = 0;
+	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
+		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
+	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
+		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
 	mpw->data.dseg[2] = &(*dseg)[0];
 	mpw->data.dseg[3] = &(*dseg)[1];
 	mpw->data.dseg[4] = &(*dseg)[2];
@@ -651,7 +652,7 @@ mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
 	 * count as 2.
 	 */
-	mpw->wqe->mpw.ctrl.data[1] = htonl(txq->qp_num_8s | (2 + num));
+	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
 	mpw->state = MLX5_MPW_STATE_CLOSED;
 	if (num < 3)
 		++txq->wqe_ci;
@@ -729,11 +730,11 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
 		    ((mpw.len != length) ||
 		     (segs_n != 1) ||
-		     (mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
+		     (mpw.wqe->eseg.cs_flags != cs_flags)))
 			mlx5_mpw_close(txq, &mpw);
 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
 			mlx5_mpw_new(txq, &mpw, length);
-			mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+			mpw.wqe->eseg.cs_flags = cs_flags;
 		}
 		/* Multi-segment packets must be alone in their MPW. */
 		assert((segs_n == 1) || (mpw.pkts_n == 0));
@@ -779,12 +780,12 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	/* "j" includes both packets and segments. */
 	comp = txq->elts_comp + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile union mlx5_wqe *wqe = mpw.wqe;
+		volatile struct mlx5_wqe *wqe = mpw.wqe;
 
 		/* Request completion on last WQE. */
-		wqe->mpw.ctrl.data[2] = htonl(8);
+		wqe->ctrl[2] = htonl(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->mpw.ctrl.data[3] = elts_head;
+		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
 	} else {
 		txq->elts_comp = comp;
@@ -815,24 +816,27 @@ static inline void
 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 {
 	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+	struct mlx5_wqe_inl_small *inl;
 
 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
 	mpw->pkts_n = 0;
 	mpw->len = length;
 	mpw->total_len = 0;
-	mpw->wqe = &(*txq->wqes)[idx];
-	mpw->wqe->mpw_inl.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
-					       (txq->wqe_ci << 8) |
-					       MLX5_OPCODE_LSO_MPW);
-	mpw->wqe->mpw_inl.ctrl.data[2] = 0;
-	mpw->wqe->mpw_inl.ctrl.data[3] = 0;
-	mpw->wqe->mpw_inl.eseg.mss = htons(length);
-	mpw->wqe->mpw_inl.eseg.inline_hdr_sz = 0;
-	mpw->wqe->mpw_inl.eseg.cs_flags = 0;
-	mpw->wqe->mpw_inl.eseg.rsvd0 = 0;
-	mpw->wqe->mpw_inl.eseg.rsvd1 = 0;
-	mpw->wqe->mpw_inl.eseg.rsvd2 = 0;
-	mpw->data.raw = &mpw->wqe->mpw_inl.data[0];
+	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
+	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+				  (txq->wqe_ci << 8) |
+				  MLX5_OPCODE_LSO_MPW);
+	mpw->wqe->ctrl[2] = 0;
+	mpw->wqe->ctrl[3] = 0;
+	mpw->wqe->eseg.mss = htons(length);
+	mpw->wqe->eseg.inline_hdr_sz = 0;
+	mpw->wqe->eseg.cs_flags = 0;
+	mpw->wqe->eseg.rsvd0 = 0;
+	mpw->wqe->eseg.rsvd1 = 0;
+	mpw->wqe->eseg.rsvd2 = 0;
+	inl = (struct mlx5_wqe_inl_small *)
+		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
+	mpw->data.raw = (uint8_t *)&inl->raw;
 }
 
 /**
@@ -847,17 +851,18 @@ static inline void
 mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
 {
 	unsigned int size;
+	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
+		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
 
-	size = sizeof(*mpw->wqe) - MLX5_MWQE64_INL_DATA + mpw->total_len;
+	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
 	/*
 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
 	 * count as 2.
 	 */
-	mpw->wqe->mpw_inl.ctrl.data[1] =
-		htonl(txq->qp_num_8s | ((size + 15) / 16));
+	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
 	mpw->state = MLX5_MPW_STATE_CLOSED;
-	mpw->wqe->mpw_inl.byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
-	txq->wqe_ci += (size + (sizeof(*mpw->wqe) - 1)) / sizeof(*mpw->wqe);
+	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
+	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
 }
 
 /**
@@ -930,13 +935,13 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
 			if ((mpw.len != length) ||
 			    (segs_n != 1) ||
-			    (mpw.wqe->mpw.eseg.cs_flags != cs_flags))
+			    (mpw.wqe->eseg.cs_flags != cs_flags))
 				mlx5_mpw_close(txq, &mpw);
 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
 			if ((mpw.len != length) ||
 			    (segs_n != 1) ||
 			    (length > inline_room) ||
-			    (mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
+			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
 				mlx5_mpw_inline_close(txq, &mpw);
 				inline_room =
 					txq->max_inline * RTE_CACHE_LINE_SIZE;
@@ -946,10 +951,10 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 			if ((segs_n != 1) ||
 			    (length > inline_room)) {
 				mlx5_mpw_new(txq, &mpw, length);
-				mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+				mpw.wqe->eseg.cs_flags = cs_flags;
 			} else {
 				mlx5_mpw_inline_new(txq, &mpw, length);
-				mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
+				mpw.wqe->eseg.cs_flags = cs_flags;
 			}
 		}
 		/* Multi-segment packets must be alone in their MPW. */
@@ -1042,12 +1047,12 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 	/* "j" includes both packets and segments. */
 	comp = txq->elts_comp + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile union mlx5_wqe *wqe = mpw.wqe;
+		volatile struct mlx5_wqe *wqe = mpw.wqe;
 
 		/* Request completion on last WQE. */
-		wqe->mpw_inl.ctrl.data[2] = htonl(8);
+		wqe->ctrl[2] = htonl(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->mpw_inl.ctrl.data[3] = elts_head;
+		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
 	} else {
 		txq->elts_comp = comp;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 8c568ad..fbc2a78 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -252,7 +252,7 @@ struct txq {
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
-	volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
+	volatile struct mlx5_wqe64 (*wqes)[]; /* Work queue. */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
 	volatile void *bf_reg; /* Blueflame register. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 5ddd2fb..e8ebbbe 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -82,7 +82,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 	for (i = 0; (i != elts_n); ++i)
 		(*txq_ctrl->txq.elts)[i] = NULL;
 	for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) {
-		volatile union mlx5_wqe *wqe = &(*txq_ctrl->txq.wqes)[i];
+		volatile struct mlx5_wqe64 *wqe = &(*txq_ctrl->txq.wqes)[i];
 
 		memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
 	}
@@ -215,7 +215,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 	tmpl->txq.cqe_n = ibcq->cqe + 1;
 	tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
 	tmpl->txq.wqes =
-		(volatile union mlx5_wqe (*)[])
+		(volatile struct mlx5_wqe64 (*)[])
 		(uintptr_t)qp->gen_data.sqstart;
 	tmpl->txq.wqe_n = qp->sq.wqe_cnt;
 	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v3 2/6] net/mlx5: reduce memory overhead of Rx/Tx descriptors
  2016-09-20  8:53   ` [PATCH v3 " Nelio Laranjeiro
  2016-09-20  8:53     ` [PATCH v3 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
@ 2016-09-20  8:53     ` Nelio Laranjeiro
  2016-09-20  8:53     ` [PATCH v3 3/6] net/mlx5: reduce memory overhead for CQE handling Nelio Laranjeiro
                       ` (4 subsequent siblings)
  6 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-20  8:53 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil, Bruce Richardson

The PMD only uses power-of-two numbers of descriptors, so storing the
element count in log2 reduces the size of the field needed to hold it.
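
A minimal stand-alone sketch of the idea (illustrative only, not the
driver code; the 4-bit width and the 512-descriptor example are
assumptions):

#include <stdint.h>
#include <stdio.h>

/* A 4-bit field is enough to hold log2 of the ring size (up to 2^15). */
struct ring {
    uint16_t elts_n:4;           /* log2 of the number of descriptors */
};

int main(void)
{
    struct ring r = { .elts_n = 9 };      /* 2^9 = 512 descriptors */
    unsigned int n = 1u << r.elts_n;      /* recover the real count */
    unsigned int mask = n - 1;            /* wrap mask for ring indexes */

    printf("%u descriptors, index 513 wraps to %u\n", n, 513u & mask);
    return 0;
}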

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_ethdev.c |  4 ++--
 drivers/net/mlx5/mlx5_rxq.c    | 10 +++++-----
 drivers/net/mlx5/mlx5_rxtx.c   | 10 +++++-----
 drivers/net/mlx5/mlx5_rxtx.h   | 12 +++++++-----
 drivers/net/mlx5/mlx5_txq.c    |  6 +++---
 5 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 1ae80e5..137ea66 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -808,7 +808,7 @@ recover:
 		if (rehash)
 			ret = rxq_rehash(dev, rxq_ctrl);
 		else
-			ret = rxq_ctrl_setup(dev, rxq_ctrl, rxq->elts_n,
+			ret = rxq_ctrl_setup(dev, rxq_ctrl, 1 << rxq->elts_n,
 					     rxq_ctrl->socket, NULL, rxq->mp);
 		if (!ret)
 			continue;
@@ -1314,7 +1314,7 @@ mlx5_secondary_data_setup(struct priv *priv)
 		if (txq_ctrl != NULL) {
 			if (txq_ctrl_setup(priv->dev,
 					   primary_txq_ctrl,
-					   primary_txq->elts_n,
+					   1 << primary_txq->elts_n,
 					   primary_txq_ctrl->socket,
 					   NULL) == 0) {
 				txq_ctrl->txq.stats.idx =
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 44889d1..d9db368 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -723,7 +723,7 @@ rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
 	if (rxq_ctrl->rxq.elts == NULL)
 		return;
 
-	for (i = 0; (i != rxq_ctrl->rxq.elts_n); ++i) {
+	for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
 		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
 			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
 		(*rxq_ctrl->rxq.elts)[i] = NULL;
@@ -807,7 +807,7 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
 int
 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
 {
-	unsigned int elts_n = rxq_ctrl->rxq.elts_n;
+	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
 	unsigned int i;
 	struct ibv_exp_wq_attr mod;
 	int err;
@@ -870,7 +870,7 @@ rxq_setup(struct rxq_ctrl *tmpl)
 	struct ibv_cq *ibcq = tmpl->cq;
 	struct mlx5_cq *cq = to_mxxx(cq, cq);
 	struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
-	struct rte_mbuf *(*elts)[tmpl->rxq.elts_n] =
+	struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
 		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
 
 	if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
@@ -924,7 +924,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 		.priv = priv,
 		.socket = socket,
 		.rxq = {
-			.elts_n = desc,
+			.elts_n = log2above(desc),
 			.mp = mp,
 		},
 	};
@@ -1148,7 +1148,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 	}
 	/* Reuse buffers from original queue if possible. */
 	if (rxq_ctrl->rxq.elts_n) {
-		assert(rxq_ctrl->rxq.elts_n == desc);
+		assert(1 << rxq_ctrl->rxq.elts_n == desc);
 		assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
 		ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
 	} else
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 5feeb3f..e132727 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -166,8 +166,8 @@ txq_complete(struct txq *txq) __attribute__((always_inline));
 static inline void
 txq_complete(struct txq *txq)
 {
-	const unsigned int elts_n = txq->elts_n;
 	const unsigned int cqe_n = txq->cqe_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	const unsigned int cqe_cnt = cqe_n - 1;
 	uint16_t elts_free = txq->elts_tail;
 	uint16_t elts_tail;
@@ -468,7 +468,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
@@ -680,7 +680,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
@@ -884,7 +884,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
+	const unsigned int elts_n = 1 << txq->elts_n;
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
@@ -1272,8 +1272,8 @@ uint16_t
 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct rxq *rxq = dpdk_rxq;
-	const unsigned int wqe_cnt = rxq->elts_n - 1;
 	const unsigned int cqe_cnt = rxq->cqe_n - 1;
+	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
 	const unsigned int sges_n = rxq->sges_n;
 	struct rte_mbuf *pkt = NULL;
 	struct rte_mbuf *seg = NULL;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index fbc2a78..3ba3913 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -54,6 +54,7 @@
 #endif
 #include <rte_mbuf.h>
 #include <rte_mempool.h>
+#include <rte_common.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -109,16 +110,16 @@ struct rxq {
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
 	unsigned int crc_present:1; /* CRC must be subtracted. */
 	unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */
+	unsigned int elts_n:4; /* Log 2 of Mbufs. */
+	unsigned int port_id:8;
+	volatile uint32_t *rq_db;
+	volatile uint32_t *cq_db;
 	uint16_t rq_ci;
 	uint16_t cq_ci;
-	uint16_t elts_n;
 	uint16_t cqe_n; /* Number of CQ elements. */
-	uint16_t port_id;
 	volatile struct mlx5_wqe_data_seg(*wqes)[];
 	volatile struct mlx5_cqe(*cqes)[];
 	struct rxq_zip zip; /* Compressed context. */
-	volatile uint32_t *rq_db;
-	volatile uint32_t *cq_db;
 	struct rte_mbuf *(*elts)[];
 	struct rte_mempool *mp;
 	struct mlx5_rxq_stats stats;
@@ -238,15 +239,16 @@ struct hash_rxq {
 };
 
 /* TX queue descriptor. */
+RTE_STD_C11
 struct txq {
 	uint16_t elts_head; /* Current index in (*elts)[]. */
 	uint16_t elts_tail; /* First element awaiting completion. */
 	uint16_t elts_comp; /* Counter since last completion request. */
-	uint16_t elts_n; /* (*elts)[] length. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
 	uint16_t cqe_n; /* Number of CQ elements. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
 	uint16_t wqe_n; /* Number of WQ elements. */
+	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t bf_offset; /* Blueflame offset. */
 	uint16_t bf_buf_size; /* Blueflame size. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index e8ebbbe..9055016 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -101,7 +101,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 static void
 txq_free_elts(struct txq_ctrl *txq_ctrl)
 {
-	unsigned int elts_n = txq_ctrl->txq.elts_n;
+	unsigned int elts_n = 1 << txq_ctrl->txq.elts_n;
 	unsigned int elts_head = txq_ctrl->txq.elts_head;
 	unsigned int elts_tail = txq_ctrl->txq.elts_tail;
 	struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;
@@ -227,7 +227,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 		(volatile struct mlx5_cqe (*)[])
 		(uintptr_t)cq->active_buf->buf;
 	tmpl->txq.elts =
-		(struct rte_mbuf *(*)[tmpl->txq.elts_n])
+		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])
 		((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
 	return 0;
 }
@@ -277,7 +277,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 	}
 	(void)conf; /* Thresholds configuration (ignored). */
 	assert(desc > MLX5_TX_COMP_THRESH);
-	tmpl.txq.elts_n = desc;
+	tmpl.txq.elts_n = log2above(desc);
 	/* MRs will be registered in mp2mr[] later. */
 	attr.rd = (struct ibv_exp_res_domain_init_attr){
 		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v3 3/6] net/mlx5: reduce memory overhead for CQE handling
  2016-09-20  8:53   ` [PATCH v3 " Nelio Laranjeiro
  2016-09-20  8:53     ` [PATCH v3 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
  2016-09-20  8:53     ` [PATCH v3 2/6] net/mlx5: reduce memory overhead of Rx/Tx descriptors Nelio Laranjeiro
@ 2016-09-20  8:53     ` Nelio Laranjeiro
  2016-09-20  8:53     ` [PATCH v3 4/6] net/mlx5: reduce memory overhead for BF handling Nelio Laranjeiro
                       ` (3 subsequent siblings)
  6 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-20  8:53 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil, Bruce Richardson

The PMD only uses power-of-two numbers of Completion Queue Elements (aka
CQE), so storing the element count in log2 reduces the size of the field
needed to hold it.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxq.c  | 2 +-
 drivers/net/mlx5/mlx5_rxtx.c | 8 ++++----
 drivers/net/mlx5/mlx5_rxtx.h | 4 ++--
 drivers/net/mlx5/mlx5_txq.c  | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index d9db368..f6f4315 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -881,7 +881,7 @@ rxq_setup(struct rxq_ctrl *tmpl)
 	if (elts == NULL)
 		return ENOMEM;
 	tmpl->rxq.rq_db = rwq->rq.db;
-	tmpl->rxq.cqe_n = ibcq->cqe + 1;
+	tmpl->rxq.cqe_n = log2above(ibcq->cqe);
 	tmpl->rxq.cq_ci = 0;
 	tmpl->rxq.rq_ci = 0;
 	tmpl->rxq.cq_db = cq->dbrec;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index e132727..4f28aa9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -166,8 +166,8 @@ txq_complete(struct txq *txq) __attribute__((always_inline));
 static inline void
 txq_complete(struct txq *txq)
 {
-	const unsigned int cqe_n = txq->cqe_n;
 	const unsigned int elts_n = 1 << txq->elts_n;
+	const unsigned int cqe_n = 1 << txq->cqe_n;
 	const unsigned int cqe_cnt = cqe_n - 1;
 	uint16_t elts_free = txq->elts_tail;
 	uint16_t elts_tail;
@@ -427,9 +427,9 @@ mlx5_tx_dbrec(struct txq *txq)
 static inline void
 tx_prefetch_cqe(struct txq *txq, uint16_t ci)
 {
-	volatile struct mlx5_cqe64 *cqe;
+	volatile struct mlx5_cqe *cqe;
 
-	cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)].cqe64;
+	cqe = &(*txq->cqes)[ci & ((1 << txq->cqe_n) - 1)];
 	rte_prefetch0(cqe);
 }
 
@@ -1272,8 +1272,8 @@ uint16_t
 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct rxq *rxq = dpdk_rxq;
-	const unsigned int cqe_cnt = rxq->cqe_n - 1;
 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
+	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
 	const unsigned int sges_n = rxq->sges_n;
 	struct rte_mbuf *pkt = NULL;
 	struct rte_mbuf *seg = NULL;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3ba3913..224614e 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -110,13 +110,13 @@ struct rxq {
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
 	unsigned int crc_present:1; /* CRC must be subtracted. */
 	unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */
+	unsigned int cqe_n:4; /* Log 2 of CQ elements. */
 	unsigned int elts_n:4; /* Log 2 of Mbufs. */
 	unsigned int port_id:8;
 	volatile uint32_t *rq_db;
 	volatile uint32_t *cq_db;
 	uint16_t rq_ci;
 	uint16_t cq_ci;
-	uint16_t cqe_n; /* Number of CQ elements. */
 	volatile struct mlx5_wqe_data_seg(*wqes)[];
 	volatile struct mlx5_cqe(*cqes)[];
 	struct rxq_zip zip; /* Compressed context. */
@@ -245,10 +245,10 @@ struct txq {
 	uint16_t elts_tail; /* First element awaiting completion. */
 	uint16_t elts_comp; /* Counter since last completion request. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
-	uint16_t cqe_n; /* Number of CQ elements. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
 	uint16_t wqe_n; /* Number of WQ elements. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
+	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
 	uint16_t bf_offset; /* Blueflame offset. */
 	uint16_t bf_buf_size; /* Blueflame size. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9055016..6145b69 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -212,7 +212,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
 		return EINVAL;
 	}
-	tmpl->txq.cqe_n = ibcq->cqe + 1;
+	tmpl->txq.cqe_n = log2above(ibcq->cqe);
 	tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
 	tmpl->txq.wqes =
 		(volatile struct mlx5_wqe64 (*)[])
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v3 4/6] net/mlx5: reduce memory overhead for BF handling
  2016-09-20  8:53   ` [PATCH v3 " Nelio Laranjeiro
                       ` (2 preceding siblings ...)
  2016-09-20  8:53     ` [PATCH v3 3/6] net/mlx5: reduce memory overhead for CQE handling Nelio Laranjeiro
@ 2016-09-20  8:53     ` Nelio Laranjeiro
  2016-09-20  8:53     ` [PATCH v3 5/6] net/mlx5: reduce memory overhead for WQE handling Nelio Laranjeiro
                       ` (2 subsequent siblings)
  6 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-20  8:53 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil, Bruce Richardson

Blue Flame (aka BF) is a buffer allocated with a power-of-two size, so the
size reported by Verbs can be stored in log2 to shrink the field that holds
it.
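
A minimal stand-alone sketch of how the stored log2 value is then used to
alternate between the two doorbell offsets (the 512-byte size is only an
assumed example, not the actual hardware value):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t bf_buf_size = 9;    /* log2 of the BF buffer size: 512 bytes */
    uint16_t bf_offset = 0;
    int i;

    /* Each doorbell write toggles the offset between 0 and the buffer
     * size, alternating between the two Blue Flame buffers. */
    for (i = 0; i < 4; ++i) {
        printf("doorbell %d written at offset %u\n", i, bf_offset);
        bf_offset ^= (uint16_t)(1u << bf_buf_size);
    }
    return 0;
}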

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 2 +-
 drivers/net/mlx5/mlx5_rxtx.h | 2 +-
 drivers/net/mlx5/mlx5_txq.c  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 4f28aa9..214922b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -413,7 +413,7 @@ mlx5_tx_dbrec(struct txq *txq)
 	/* Ensure ordering between DB record and BF copy. */
 	rte_wmb();
 	rte_mov16(dst, (uint8_t *)data);
-	txq->bf_offset ^= txq->bf_buf_size;
+	txq->bf_offset ^= (1 << txq->bf_buf_size);
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 224614e..3dca8ca 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -249,8 +249,8 @@ struct txq {
 	uint16_t wqe_n; /* Number of WQ elements. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
+	uint16_t bf_buf_size:4; /* Log2 Blueflame size. */
 	uint16_t bf_offset; /* Blueflame offset. */
-	uint16_t bf_buf_size; /* Blueflame size. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 6145b69..9919e37 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -221,7 +221,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
 	tmpl->txq.bf_reg = qp->gen_data.bf->reg;
 	tmpl->txq.bf_offset = qp->gen_data.bf->offset;
-	tmpl->txq.bf_buf_size = qp->gen_data.bf->buf_size;
+	tmpl->txq.bf_buf_size = log2above(qp->gen_data.bf->buf_size);
 	tmpl->txq.cq_db = cq->dbrec;
 	tmpl->txq.cqes =
 		(volatile struct mlx5_cqe (*)[])
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v3 5/6] net/mlx5: reduce memory overhead for WQE handling
  2016-09-20  8:53   ` [PATCH v3 " Nelio Laranjeiro
                       ` (3 preceding siblings ...)
  2016-09-20  8:53     ` [PATCH v3 4/6] net/mlx5: reduce memory overhead for BF handling Nelio Laranjeiro
@ 2016-09-20  8:53     ` Nelio Laranjeiro
  2016-09-20  8:53     ` [PATCH v3 6/6] net/mlx5: remove gather loop on segments Nelio Laranjeiro
  2016-09-21 10:22     ` [PATCH v3 0/6] net/mlx5: performance improvement Bruce Richardson
  6 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-20  8:53 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil, Bruce Richardson

The PMD only uses power-of-two numbers of Work Queue Elements (aka WQE), so
storing the element count in log2 reduces the size of the field needed to
hold it.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 23 ++++++++++++-----------
 drivers/net/mlx5/mlx5_rxtx.h |  2 +-
 drivers/net/mlx5/mlx5_txq.c  |  4 ++--
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 214922b..9d00ddc 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -199,9 +199,10 @@ txq_complete(struct txq *txq)
 	} while (1);
 	if (unlikely(cqe == NULL))
 		return;
-	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)].hdr;
+	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) &
+			    ((1 << txq->wqe_n) - 1)].hdr;
 	elts_tail = wqe->ctrl[3];
-	assert(elts_tail < txq->wqe_n);
+	assert(elts_tail < (1 << txq->wqe_n));
 	/* Free buffers. */
 	while (elts_free != elts_tail) {
 		struct rte_mbuf *elt = (*txq->elts)[elts_free];
@@ -335,7 +336,7 @@ mlx5_wqe_write(struct txq *txq, volatile struct mlx5_wqe *wqe,
 	}
 	/* Inline if enough room. */
 	if (txq->max_inline != 0) {
-		uintptr_t end = (uintptr_t)&(*txq->wqes)[txq->wqe_n];
+		uintptr_t end = (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
 		uint16_t max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
 		uint16_t room;
 
@@ -446,7 +447,7 @@ tx_prefetch_wqe(struct txq *txq, uint16_t ci)
 {
 	volatile struct mlx5_wqe64 *wqe;
 
-	wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
+	wqe = &(*txq->wqes)[ci & ((1 << txq->wqe_n) - 1)];
 	rte_prefetch0(wqe);
 }
 
@@ -504,7 +505,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		max -= segs_n;
 		--pkts_n;
 		elts_head_next = (elts_head + 1) & (elts_n - 1);
-		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)].hdr;
+		wqe = &(*txq->wqes)[txq->wqe_ci & ((1 << txq->wqe_n) - 1)].hdr;
 		tx_prefetch_wqe(txq, txq->wqe_ci);
 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
 		if (pkts_n)
@@ -540,7 +541,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE)))
 				dseg = (volatile void *)
 					&(*txq->wqes)[txq->wqe_ci++ &
-						      (txq->wqe_n - 1)];
+						      ((1 << txq->wqe_n) - 1)];
 			else
 				++dseg;
 			++ds;
@@ -607,10 +608,10 @@ skip_segs:
 static inline void
 mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 {
-	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
 		(volatile struct mlx5_wqe_data_seg (*)[])
-		(uintptr_t)&(*txq->wqes)[(idx + 1) & (txq->wqe_n - 1)];
+		(uintptr_t)&(*txq->wqes)[(idx + 1) & ((1 << txq->wqe_n) - 1)];
 
 	mpw->state = MLX5_MPW_STATE_OPENED;
 	mpw->pkts_n = 0;
@@ -815,7 +816,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 static inline void
 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 {
-	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
 	struct mlx5_wqe_inl_small *inl;
 
 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
@@ -1000,7 +1001,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			(*txq->elts)[elts_head] = buf;
 			/* Maximum number of bytes before wrapping. */
-			max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
+			max = ((uintptr_t)&(*txq->wqes)[1 << txq->wqe_n] -
 			       (uintptr_t)mpw.data.raw);
 			if (length > max) {
 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
@@ -1019,7 +1020,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 				mpw.data.raw += length;
 			}
 			if ((uintptr_t)mpw.data.raw ==
-			    (uintptr_t)&(*txq->wqes)[txq->wqe_n])
+			    (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n])
 				mpw.data.raw =
 					(volatile void *)&(*txq->wqes)[0];
 			++mpw.pkts_n;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3dca8ca..9828aef 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -246,9 +246,9 @@ struct txq {
 	uint16_t elts_comp; /* Counter since last completion request. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
-	uint16_t wqe_n; /* Number of WQ elements. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
+	uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */
 	uint16_t bf_buf_size:4; /* Log2 Blueflame size. */
 	uint16_t bf_offset; /* Blueflame offset. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9919e37..3d2d132 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -81,7 +81,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 
 	for (i = 0; (i != elts_n); ++i)
 		(*txq_ctrl->txq.elts)[i] = NULL;
-	for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) {
+	for (i = 0; (i != (1u << txq_ctrl->txq.wqe_n)); ++i) {
 		volatile struct mlx5_wqe64 *wqe = &(*txq_ctrl->txq.wqes)[i];
 
 		memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
@@ -217,7 +217,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
 	tmpl->txq.wqes =
 		(volatile struct mlx5_wqe64 (*)[])
 		(uintptr_t)qp->gen_data.sqstart;
-	tmpl->txq.wqe_n = qp->sq.wqe_cnt;
+	tmpl->txq.wqe_n = log2above(qp->sq.wqe_cnt);
 	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
 	tmpl->txq.bf_reg = qp->gen_data.bf->reg;
 	tmpl->txq.bf_offset = qp->gen_data.bf->offset;
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v3 6/6] net/mlx5: remove gather loop on segments
  2016-09-20  8:53   ` [PATCH v3 " Nelio Laranjeiro
                       ` (4 preceding siblings ...)
  2016-09-20  8:53     ` [PATCH v3 5/6] net/mlx5: reduce memory overhead for WQE handling Nelio Laranjeiro
@ 2016-09-20  8:53     ` Nelio Laranjeiro
  2016-09-21 10:22     ` [PATCH v3 0/6] net/mlx5: performance improvement Bruce Richardson
  6 siblings, 0 replies; 26+ messages in thread
From: Nelio Laranjeiro @ 2016-09-20  8:53 UTC (permalink / raw)
  To: dev; +Cc: Adrien Mazarguil, Bruce Richardson, Vasily Philipov

The Tx function used a double loop to send segmented packets; it can be
done in a single one.
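
A toy stand-alone model of the reworked control flow (not the PMD code
itself; structure names are hypothetical): a single loop walks both the
packet array and each packet's segment chain:

#include <stddef.h>
#include <stdio.h>

struct seg {
    struct seg *next;
    int len;
};

struct pkt {
    struct seg *head;
    int nb_segs;
};

/* One loop handles packets and their chained segments instead of nesting
 * a per-segment loop inside the per-packet loop. */
static void send_burst(struct pkt *pkts, int pkts_n)
{
    int i = 0;

    while (pkts_n) {
        struct pkt *p = &pkts[i];
        struct seg *s = p->head;
        int segs_n = p->nb_segs;

next_seg:
        printf("pkt %d: segment of %d bytes\n", i, s->len);
        if (--segs_n) {
            s = s->next;
            goto next_seg;
        }
        ++i;
        --pkts_n;
    }
}

int main(void)
{
    struct seg s2 = { NULL, 60 };
    struct seg s1 = { &s2, 1400 };
    struct seg s3 = { NULL, 64 };
    struct pkt pkts[2] = { { &s1, 2 }, { &s3, 1 } };

    send_burst(pkts, 2);
    return 0;
}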

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Vasily Philipov <vasilyf@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 312 ++++++++++++++++++++++---------------------
 1 file changed, 158 insertions(+), 154 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 9d00ddc..b91b644 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -288,112 +288,6 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
 }
 
 /**
- * Write a regular WQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param wqe
- *   Pointer to the WQE to fill.
- * @param buf
- *   Buffer.
- * @param length
- *   Packet length.
- *
- * @return ds
- *   Number of DS elements consumed.
- */
-static inline unsigned int
-mlx5_wqe_write(struct txq *txq, volatile struct mlx5_wqe *wqe,
-	       struct rte_mbuf *buf, uint32_t length)
-{
-	uint8_t *raw = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
-	uint16_t ds;
-	uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
-	uintptr_t addr = rte_pktmbuf_mtod(buf, uintptr_t);
-	struct mlx5_wqe_data_seg *dseg = NULL;
-
-	assert(length >= MLX5_WQE_DWORD_SIZE);
-	/* Start the know and common part of the WQE structure. */
-	wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
-	wqe->ctrl[2] = 0;
-	wqe->ctrl[3] = 0;
-	wqe->eseg.rsvd0 = 0;
-	wqe->eseg.rsvd1 = 0;
-	wqe->eseg.mss = 0;
-	wqe->eseg.rsvd2 = 0;
-	/* Start by copying the Ethernet Header. */
-	rte_mov16((uint8_t *)raw, (uint8_t *)addr);
-	length -= MLX5_WQE_DWORD_SIZE;
-	addr += MLX5_WQE_DWORD_SIZE;
-	/* Replace the Ethernet type by the VLAN if necessary. */
-	if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-		uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
-
-		memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE - sizeof(vlan)),
-		       &vlan, sizeof(vlan));
-		addr -= sizeof(vlan);
-		length += sizeof(vlan);
-	}
-	/* Inline if enough room. */
-	if (txq->max_inline != 0) {
-		uintptr_t end = (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
-		uint16_t max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
-		uint16_t room;
-
-		raw += MLX5_WQE_DWORD_SIZE;
-		room = end - (uintptr_t)raw;
-		if (room > max_inline) {
-			uintptr_t addr_end = (addr + max_inline) &
-				~(RTE_CACHE_LINE_SIZE - 1);
-			uint16_t copy_b = ((addr_end - addr) > length) ?
-					  length :
-					  (addr_end - addr);
-
-			rte_memcpy((void *)raw, (void *)addr, copy_b);
-			addr += copy_b;
-			length -= copy_b;
-			pkt_inline_sz += copy_b;
-			/* Sanity check. */
-			assert(addr <= addr_end);
-		}
-		/* Store the inlined packet size in the WQE. */
-		wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
-		/*
-		 * 2 DWORDs consumed by the WQE header + 1 DSEG +
-		 * the size of the inline part of the packet.
-		 */
-		ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
-		if (length > 0) {
-			dseg = (struct mlx5_wqe_data_seg *)
-				((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
-			if ((uintptr_t)dseg >= end)
-				dseg = (struct mlx5_wqe_data_seg *)
-					((uintptr_t)&(*txq->wqes)[0]);
-			goto use_dseg;
-		}
-	} else {
-		/* Add the remaining packet as a simple ds. */
-		ds = 3;
-		/*
-		 * No inline has been done in the packet, only the Ethernet
-		 * Header as been stored.
-		 */
-		wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
-		dseg = (struct mlx5_wqe_data_seg *)
-			((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE));
-use_dseg:
-		*dseg = (struct mlx5_wqe_data_seg) {
-			.addr = htonll(addr),
-			.byte_count = htonl(length),
-			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
-		};
-		++ds;
-	}
-	wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
-	return ds;
-}
-
-/**
  * Ring TX queue doorbell.
  *
  * @param txq
@@ -475,6 +369,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int max;
 	unsigned int comp;
 	volatile struct mlx5_wqe *wqe = NULL;
+	unsigned int segs_n = 0;
+	struct rte_mbuf *buf = NULL;
+	uint8_t *raw;
 
 	if (unlikely(!pkts_n))
 		return 0;
@@ -488,13 +385,17 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (max > elts_n)
 		max -= elts_n;
 	do {
-		struct rte_mbuf *buf = *(pkts++);
-		unsigned int elts_head_next;
+		volatile struct mlx5_wqe_data_seg *dseg = NULL;
 		uint32_t length;
-		unsigned int segs_n = buf->nb_segs;
-		volatile struct mlx5_wqe_data_seg *dseg;
 		unsigned int ds = 0;
+		uintptr_t addr;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		uint32_t total_length = 0;
+#endif
 
+		/* first_seg */
+		buf = *(pkts++);
+		segs_n = buf->nb_segs;
 		/*
 		 * Make sure there is enough room to store this packet and
 		 * that one ring entry remains unused.
@@ -503,20 +404,30 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (max < segs_n + 1)
 			break;
 		max -= segs_n;
-		--pkts_n;
-		elts_head_next = (elts_head + 1) & (elts_n - 1);
-		wqe = &(*txq->wqes)[txq->wqe_ci & ((1 << txq->wqe_n) - 1)].hdr;
-		tx_prefetch_wqe(txq, txq->wqe_ci);
+		--segs_n;
+		if (!segs_n)
+			--pkts_n;
+		wqe = &(*txq->wqes)[txq->wqe_ci &
+				    ((1 << txq->wqe_n) - 1)].hdr;
 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
-		if (pkts_n)
+		if (pkts_n > 1)
 			rte_prefetch0(*pkts);
+		addr = rte_pktmbuf_mtod(buf, uintptr_t);
 		length = DATA_LEN(buf);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		total_length = length;
+#endif
+		assert(length >= MLX5_WQE_DWORD_SIZE);
 		/* Update element. */
 		(*txq->elts)[elts_head] = buf;
+		elts_head = (elts_head + 1) & (elts_n - 1);
 		/* Prefetch next buffer data. */
-		if (pkts_n)
-			rte_prefetch0(rte_pktmbuf_mtod(*pkts,
-						       volatile void *));
+		if (pkts_n > 1) {
+			volatile void *pkt_addr;
+
+			pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *);
+			rte_prefetch0(pkt_addr);
+		}
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -526,50 +437,143 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		} else {
 			wqe->eseg.cs_flags = 0;
 		}
-		ds = mlx5_wqe_write(txq, wqe, buf, length);
-		if (segs_n == 1)
-			goto skip_segs;
-		dseg = (volatile struct mlx5_wqe_data_seg *)
-			(((uintptr_t)wqe) + ds * MLX5_WQE_DWORD_SIZE);
-		while (--segs_n) {
+		raw  = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
+		/* Start the know and common part of the WQE structure. */
+		wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+		wqe->ctrl[2] = 0;
+		wqe->ctrl[3] = 0;
+		wqe->eseg.rsvd0 = 0;
+		wqe->eseg.rsvd1 = 0;
+		wqe->eseg.mss = 0;
+		wqe->eseg.rsvd2 = 0;
+		/* Start by copying the Ethernet Header. */
+		rte_mov16((uint8_t *)raw, (uint8_t *)addr);
+		length -= MLX5_WQE_DWORD_SIZE;
+		addr += MLX5_WQE_DWORD_SIZE;
+		/* Replace the Ethernet type by the VLAN if necessary. */
+		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
+			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
+
+			memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE -
+					   sizeof(vlan)),
+			       &vlan, sizeof(vlan));
+			addr -= sizeof(vlan);
+			length += sizeof(vlan);
+		}
+		/* Inline if enough room. */
+		if (txq->max_inline != 0) {
+			uintptr_t end =
+				(uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
+			uint16_t max_inline =
+				txq->max_inline * RTE_CACHE_LINE_SIZE;
+			uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
+			uint16_t room;
+
+			raw += MLX5_WQE_DWORD_SIZE;
+			room = end - (uintptr_t)raw;
+			if (room > max_inline) {
+				uintptr_t addr_end = (addr + max_inline) &
+					~(RTE_CACHE_LINE_SIZE - 1);
+				uint16_t copy_b = ((addr_end - addr) > length) ?
+						  length :
+						  (addr_end - addr);
+
+				rte_memcpy((void *)raw, (void *)addr, copy_b);
+				addr += copy_b;
+				length -= copy_b;
+				pkt_inline_sz += copy_b;
+				/* Sanity check. */
+				assert(addr <= addr_end);
+			}
+			/* Store the inlined packet size in the WQE. */
+			wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
+			/*
+			 * 2 DWORDs consumed by the WQE header + 1 DSEG +
+			 * the size of the inline part of the packet.
+			 */
+			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
+			if (length > 0) {
+				dseg = (struct mlx5_wqe_data_seg *)
+					((uintptr_t)wqe +
+					 (ds * MLX5_WQE_DWORD_SIZE));
+				if ((uintptr_t)dseg >= end)
+					dseg = (struct mlx5_wqe_data_seg *)
+						((uintptr_t)&(*txq->wqes)[0]);
+				goto use_dseg;
+			} else if (!segs_n) {
+				goto next_pkt;
+			} else {
+				goto next_seg;
+			}
+		} else {
 			/*
-			 * Spill on next WQE when the current one does not have
-			 * enough room left. Size of WQE must a be a multiple
-			 * of data segment size.
+			 * No inline has been done in the packet, only the
+			 * Ethernet Header as been stored.
 			 */
-			assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
-			if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE)))
-				dseg = (volatile void *)
-					&(*txq->wqes)[txq->wqe_ci++ &
-						      ((1 << txq->wqe_n) - 1)];
-			else
-				++dseg;
+			wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
+			dseg = (struct mlx5_wqe_data_seg *)
+				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
+			ds = 3;
+use_dseg:
+			/* Add the remaining packet as a simple ds. */
+			*dseg = (struct mlx5_wqe_data_seg) {
+				.addr = htonll(addr),
+				.byte_count = htonl(length),
+				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+			};
 			++ds;
-			buf = buf->next;
-			assert(buf);
-			/* Store segment information. */
-			dseg->byte_count = htonl(DATA_LEN(buf));
-			dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-			dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
-			(*txq->elts)[elts_head_next] = buf;
-			elts_head_next = (elts_head_next + 1) & (elts_n - 1);
+			if (!segs_n)
+				goto next_pkt;
+		}
+next_seg:
+		assert(buf);
+		assert(ds);
+		assert(wqe);
+		/*
+		 * Spill on next WQE when the current one does not have
+		 * enough room left. Size of WQE must a be a multiple
+		 * of data segment size.
+		 */
+		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
+		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
+			unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
+				((1 << txq->wqe_n) - 1);
+
+			dseg = (struct mlx5_wqe_data_seg *)
+				((uintptr_t)&(*txq->wqes)[n]);
+			tx_prefetch_wqe(txq, n + 1);
+		} else {
+			++dseg;
+		}
+		++ds;
+		buf = buf->next;
+		assert(buf);
+		length = DATA_LEN(buf);
 #ifdef MLX5_PMD_SOFT_COUNTERS
-			length += DATA_LEN(buf);
+		total_length += length;
 #endif
-			++j;
-		}
-		/* Update DS field in WQE. */
-		wqe->ctrl[1] &= htonl(0xffffffc0);
-		wqe->ctrl[1] |= htonl(ds & 0x3f);
-skip_segs:
+		/* Store segment information. */
+		*dseg = (struct mlx5_wqe_data_seg) {
+			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+			.byte_count = htonl(length),
+			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+		};
+		(*txq->elts)[elts_head] = buf;
+		elts_head = (elts_head + 1) & (elts_n - 1);
+		++j;
+		--segs_n;
+		if (segs_n)
+			goto next_seg;
+		else
+			--pkts_n;
+next_pkt:
+		++i;
+		wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
+		txq->wqe_ci += (ds + 3) / 4;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
+		txq->stats.obytes += total_length;
 #endif
-		/* Increment consumer index. */
-		txq->wqe_ci += (ds + 3) / 4;
-		elts_head = elts_head_next;
-		++i;
 	} while (pkts_n);
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread
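
For readers less familiar with the WQE layout, the DS accounting used by
the new single-pass path above can be checked with a small standalone
sketch. This is not part of the patch: the WQE_* names below only mirror
the MLX5_WQE_* constants from mlx5_prm.h, and the inline sizes are
arbitrary example values.

/*
 * Sketch only -- illustrative names, assuming the 16-byte DWORD and
 * 64-byte WQE layout described above.
 */
#include <assert.h>

#define WQE_DWORD_SIZE 16			/* one data segment (DS) */
#define WQE_SIZE (4 * WQE_DWORD_SIZE)		/* one WQE slot holds 4 DS */
#define WQE_DS(n) (((n) + WQE_DWORD_SIZE - 1) / WQE_DWORD_SIZE)

static unsigned int
wqe_slots(unsigned int pkt_inline_sz, unsigned int nb_dseg)
{
	/*
	 * One DS for the control segment, one for the Ethernet segment
	 * (whose last two bytes already hold the start of the inlined
	 * header), plus whatever the remaining inline bytes need.
	 */
	unsigned int ds = 2 + WQE_DS(pkt_inline_sz - 2);

	/* Data segments pointing at the non-inlined mbuf data. */
	ds += nb_dseg;
	/* The consumer index advances in whole 64-byte WQE slots. */
	return (ds + 3) / 4;
}

int
main(void)
{
	/* 18 inlined bytes (Ethernet header + VLAN) and 1 DS: one slot. */
	assert(wqe_slots(18, 1) == 1);
	/* 64 inlined bytes and 1 DS spill into a second slot. */
	assert(wqe_slots(64, 1) == 2);
	return 0;
}

A packet with only its L2 header inlined and a single data segment thus
fits in one 64-byte WQE: control segment, Ethernet segment, the inline
overflow DWORD and one data segment.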

* Re: [PATCH v3 0/6] net/mlx5: performance improvement
  2016-09-20  8:53   ` [PATCH v3 " Nelio Laranjeiro
                       ` (5 preceding siblings ...)
  2016-09-20  8:53     ` [PATCH v3 6/6] net/mlx5: remove gather loop on segments Nelio Laranjeiro
@ 2016-09-21 10:22     ` Bruce Richardson
  6 siblings, 0 replies; 26+ messages in thread
From: Bruce Richardson @ 2016-09-21 10:22 UTC (permalink / raw)
  To: Nelio Laranjeiro; +Cc: dev, Adrien Mazarguil

On Tue, Sep 20, 2016 at 10:53:45AM +0200, Nelio Laranjeiro wrote:
>  - Rework structure elements to reduce their size.
>  - Removes a second useless loop in Tx burst function.
> 
> This series should be applied on top of "net/mlx5: various fixes".
> 
> Changes in v2:
> 
>  - rework series to apply it on top of "net/mlx5: various fixes".
> 
> Changes in v3:
> 
>  - Rework some commit logs.
> 
> Nelio Laranjeiro (6):
>   net/mlx5: rework hardware structures
>   net/mlx5: reduce memory overhead of Rx/Tx descriptors
>   net/mlx5: reduce memory overhead for CQE handling
>   net/mlx5: reduce memory overhead for BF handling
>   net/mlx5: reduce memory overhead for WQE handling
>   net/mlx5: remove gather loop on segments
> 
Applied to dpdk-next-net/rel_16_11

/Bruce

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2016-09-21 10:22 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-07  7:09 [PATCH 0/6] net/mlx5: performance improvement Nelio Laranjeiro
2016-09-07  7:09 ` [PATCH 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
2016-09-07  7:09 ` [PATCH 2/6] net/mlx5: reduce Tx and Rx structure size Nelio Laranjeiro
2016-09-07  7:09 ` [PATCH 3/6] " Nelio Laranjeiro
2016-09-07  7:09 ` [PATCH 4/6] net/mlx5: reduce Tx " Nelio Laranjeiro
2016-09-07  7:09 ` [PATCH 5/6] net/mlx5: reduce Tx and Rx " Nelio Laranjeiro
2016-09-07  7:09 ` [PATCH 6/6] net/mlx5: remove gather loop on segments Nelio Laranjeiro
2016-09-14 12:18 ` [PATCH v2 0/6] net/mlx5: performance improvement Nelio Laranjeiro
2016-09-19 16:17   ` Bruce Richardson
2016-09-20  7:25     ` Nélio Laranjeiro
2016-09-20  8:53   ` [PATCH v3 " Nelio Laranjeiro
2016-09-20  8:53     ` [PATCH v3 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
2016-09-20  8:53     ` [PATCH v3 2/6] net/mlx5: reduce memory overhead of Rx/Tx descriptors Nelio Laranjeiro
2016-09-20  8:53     ` [PATCH v3 3/6] net/mlx5: reduce memory overhead for CQE handling Nelio Laranjeiro
2016-09-20  8:53     ` [PATCH v3 4/6] net/mlx5: reduce memory overhead for BF handling Nelio Laranjeiro
2016-09-20  8:53     ` [PATCH v3 5/6] net/mlx5: reduce memory overhead for WQE handling Nelio Laranjeiro
2016-09-20  8:53     ` [PATCH v3 6/6] net/mlx5: remove gather loop on segments Nelio Laranjeiro
2016-09-21 10:22     ` [PATCH v3 0/6] net/mlx5: performance improvement Bruce Richardson
2016-09-14 12:18 ` [PATCH v2 1/6] net/mlx5: rework hardware structures Nelio Laranjeiro
2016-09-19 16:14   ` Bruce Richardson
2016-09-20  7:09     ` Nélio Laranjeiro
2016-09-14 12:18 ` [PATCH v2 2/6] net/mlx5: reduce Tx and Rx structure size Nelio Laranjeiro
2016-09-14 12:18 ` [PATCH v2 3/6] " Nelio Laranjeiro
2016-09-14 12:18 ` [PATCH v2 4/6] net/mlx5: reduce Tx " Nelio Laranjeiro
2016-09-14 12:18 ` [PATCH v2 5/6] net/mlx5: reduce Tx and Rx " Nelio Laranjeiro
2016-09-14 12:18 ` [PATCH v2 6/6] net/mlx5: remove gather loop on segments Nelio Laranjeiro
