* [PATCH v2 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1
       [not found] <1508752838-30408-1-git-send-email-ophirmu@mellanox.com>
@ 2017-10-23 14:21 ` Ophir Munk
  2017-10-23 14:21   ` [PATCH v2 1/7] net/mlx4: remove error flows from Tx fast path Ophir Munk
                     ` (7 more replies)
  0 siblings, 8 replies; 84+ messages in thread
From: Ophir Munk @ 2017-10-23 14:21 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad, Ophir Munk

v1:
This series is a follow-up to the new datapath implementation introduced in RC1.
It is dedicated to Tx and improves performance.

v2:
Change uint32_t be_lkey to rte_be32_t lkey

Matan Azrad (4):
  net/mlx4: remove error flows from Tx fast path
  net/mlx4: merge Tx path functions
  net/mlx4: remove unnecessary variables in Tx burst
  net/mlx4: improve performance of one Tx segment

Ophir Munk (3):
  net/mlx4: inline more Tx functions
  net/mlx4: save lkey in big-endian format
  net/mlx4: separate Tx for multi-segments

 drivers/net/mlx4/mlx4_rxtx.c | 343 +++++++++++++++++++++++--------------------
 drivers/net/mlx4/mlx4_rxtx.h |  56 ++++++-
 2 files changed, 234 insertions(+), 165 deletions(-)

-- 
2.7.4


* [PATCH v2 1/7] net/mlx4: remove error flows from Tx fast path
  2017-10-23 14:21 ` [PATCH v2 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
@ 2017-10-23 14:21   ` Ophir Munk
  2017-10-25 16:49     ` Adrien Mazarguil
  2017-10-23 14:21   ` [PATCH v2 2/7] net/mlx4: inline more Tx functions Ophir Munk
                     ` (6 subsequent siblings)
  7 siblings, 1 reply; 84+ messages in thread
From: Ophir Munk @ 2017-10-23 14:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

From: Matan Azrad <matan@mellanox.com>

Move unnecessary error flows under DEBUG-only compilation in order to
improve performance.
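
As a minimal standalone sketch of the resulting pattern (the names below are
illustrative, not the driver's actual symbols), the error check is compiled in
only when NDEBUG is undefined, so release builds skip it entirely:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical lookup that returns (uint32_t)-1 on failure. */
static uint32_t lkey_lookup(int ok)
{
	return ok ? 0x1234 : (uint32_t)-1;
}

static int post_one(int ok)
{
	uint32_t lkey = lkey_lookup(ok);

#ifndef NDEBUG
	/* Error flow kept only in debug builds. */
	if (lkey == (uint32_t)-1) {
		fprintf(stderr, "no MP <-> MR association\n");
		return -1;
	}
#endif /* NDEBUG */
	(void)lkey;
	return 0;
}

int main(void)
{
	return post_one(1);
}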

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 36173ad..011ea79 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -169,6 +169,7 @@ mlx4_txq_complete(struct txq *txq)
 		 * Make sure we read the CQE after we read the ownership bit.
 		 */
 		rte_rmb();
+#ifndef NDEBUG
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 			     MLX4_CQE_OPCODE_ERROR)) {
 			struct mlx4_err_cqe *cqe_err =
@@ -178,6 +179,7 @@ mlx4_txq_complete(struct txq *txq)
 			      (void *)txq, cqe_err->vendor_err,
 			      cqe_err->syndrome);
 		}
+#endif /* NDEBUG */
 		/* Get WQE index reported in the CQE. */
 		new_index =
 			rte_be_to_cpu_16(cqe->wqe_index) & sq->txbb_cnt_mask;
@@ -322,7 +324,6 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 	uint32_t byte_count;
 	int wqe_real_size;
 	int nr_txbbs;
-	int rc;
 	struct pv *pv = (struct pv *)txq->bounce_buf;
 	int pv_counter = 0;
 
@@ -337,8 +338,7 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 	if (((sq->head - sq->tail) + nr_txbbs +
 	     sq->headroom_txbbs) >= sq->txbb_cnt ||
 	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-		rc = ENOSPC;
-		goto err;
+		return -ENOSPC;
 	}
 	/* Get the control and data entries of the WQE. */
 	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
@@ -354,6 +354,7 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 		dseg->addr = rte_cpu_to_be_64(addr);
 		/* Memory region key for this memory pool. */
 		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+#ifndef NDEBUG
 		if (unlikely(lkey == (uint32_t)-1)) {
 			/* MR does not exist. */
 			DEBUG("%p: unable to get MP <-> MR association",
@@ -366,9 +367,9 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
 			mlx4_txq_stamp_freed_wqe(sq, head_idx,
 				     (sq->head & sq->txbb_cnt) ? 0 : 1);
-			rc = EFAULT;
-			goto err;
+			return -EFAULT;
 		}
+#endif /* NDEBUG */
 		dseg->lkey = rte_cpu_to_be_32(lkey);
 		if (likely(buf->data_len)) {
 			byte_count = rte_cpu_to_be_32(buf->data_len);
@@ -471,9 +472,6 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 					       MLX4_BIT_WQE_OWN : 0));
 	sq->head += nr_txbbs;
 	return 0;
-err:
-	rte_errno = rc;
-	return -rc;
 }
 
 /**
@@ -510,8 +508,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	assert(max <= elts_n);
 	/* Always leave one free entry in the ring. */
 	--max;
-	if (max == 0)
-		return 0;
 	if (max > pkts_n)
 		max = pkts_n;
 	for (i = 0; (i != max); ++i) {
-- 
2.7.4


* [PATCH v2 2/7] net/mlx4: inline more Tx functions
  2017-10-23 14:21 ` [PATCH v2 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
  2017-10-23 14:21   ` [PATCH v2 1/7] net/mlx4: remove error flows from Tx fast path Ophir Munk
@ 2017-10-23 14:21   ` Ophir Munk
  2017-10-25 16:49     ` Adrien Mazarguil
  2017-10-23 14:21   ` [PATCH v2 3/7] net/mlx4: save lkey in big-endian format Ophir Munk
                     ` (5 subsequent siblings)
  7 siblings, 1 reply; 84+ messages in thread
From: Ophir Munk @ 2017-10-23 14:21 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad, Ophir Munk

Make functions on the Tx fast path inline to improve performance.

From within the inlined functions, call other (non-inlined) functions to
handle the "unlikely" cases, so that the inlined code footprint remains small.
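
As a rough standalone sketch of the pattern (illustrative names only, not the
actual mlx4 code; the driver itself uses __rte_always_inline), the lookup
stays small and inline while the rare registration work lives in a separate,
non-inlined function:

#include <stddef.h>
#include <stdint.h>

#define CACHE_SIZE 8

struct kv_cache {
	const void *key[CACHE_SIZE];
	uint32_t val[CACHE_SIZE];
};

/* Rare path, deliberately kept out of line (like mlx4_txq_add_mr()). */
static uint32_t cache_add(struct kv_cache *c, const void *key, unsigned int i)
{
	uint32_t val = (uint32_t)(uintptr_t)key; /* stand-in for registration */

	if (i == CACHE_SIZE)
		i = 0; /* a real implementation would evict an entry first */
	c->key[i] = key;
	c->val[i] = val;
	return val;
}

/* Hot path, small enough to be inlined at every call site. */
static inline uint32_t cache_lookup(struct kv_cache *c, const void *key)
{
	unsigned int i;

	for (i = 0; i != CACHE_SIZE; ++i) {
		if (c->key[i] == NULL)
			break;
		if (c->key[i] == key)
			return c->val[i];
	}
	return cache_add(c, key, i); /* unlikely case handled elsewhere */
}

int main(void)
{
	struct kv_cache c = { { NULL }, { 0 } };
	int obj;
	uint32_t first = cache_lookup(&c, &obj); /* adds the entry */
	uint32_t again = cache_lookup(&c, &obj); /* hits the cache */

	return first == again ? 0 : 1;
}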

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 43 ++++++------------------------------
 drivers/net/mlx4/mlx4_rxtx.h | 52 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 58 insertions(+), 37 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 011ea79..ae37f9b 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
 	return 0;
 }
 
-/**
- * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
- * the cloned mbuf is allocated is returned instead.
- *
- * @param buf
- *   Pointer to mbuf.
- *
- * @return
- *   Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-mlx4_txq_mb2mp(struct rte_mbuf *buf)
-{
-	if (unlikely(RTE_MBUF_INDIRECT(buf)))
-		return rte_mbuf_from_indirect(buf)->pool;
-	return buf->pool;
-}
 
 /**
- * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
+ * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
+ * If mp2mr[] is full, remove an entry first.
  *
  * @param txq
  *   Pointer to Tx queue structure.
  * @param[in] mp
- *   Memory pool for which a memory region lkey must be returned.
+ *   Memory pool for which a memory region lkey must be added
+ * @param[in] i
+ *   Index in memory pool (MP) where to add memory region (MR)
  *
  * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
+ *   Added mr->lkey on success, (uint32_t)-1 on failure.
  */
-uint32_t
-mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 {
-	unsigned int i;
 	struct ibv_mr *mr;
 
-	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mp == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-			return txq->mp2mr[i].lkey;
-		}
-	}
 	/* Add a new entry, register MR first. */
 	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
 	      (void *)txq, mp->name, (void *)mp);
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index e10bbca..719ef45 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -53,6 +53,7 @@
 
 #include "mlx4.h"
 #include "mlx4_prm.h"
+#include "mlx4_utils.h"
 
 /** Rx queue counters. */
 struct mlx4_rxq_stats {
@@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void *dpdk_rxq);
 
 /* mlx4_rxtx.c */
 
-uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
 uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
 		       uint16_t pkts_n);
 uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
@@ -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
 uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
+				unsigned int i);
 
 /* mlx4_txq.c */
 
@@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 			const struct rte_eth_txconf *conf);
 void mlx4_tx_queue_release(void *dpdk_txq);
 
+/**
+ * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
+ * the cloned mbuf is allocated is returned instead.
+ *
+ * @param buf
+ *   Pointer to mbuf.
+ *
+ * @return
+ *   Memory pool where data is located for given mbuf.
+ */
+static __rte_always_inline struct rte_mempool *
+mlx4_txq_mb2mp(struct rte_mbuf *buf)
+{
+	if (unlikely(RTE_MBUF_INDIRECT(buf)))
+		return rte_mbuf_from_indirect(buf)->pool;
+	return buf->pool;
+}
+
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Call mlx4_txq_add_mr() if MP is not registered yet.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ *   mr->lkey on success, (uint32_t)-1 on failure.
+ */
+static __rte_always_inline uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+	unsigned int i;
+
+	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+		if (unlikely(txq->mp2mr[i].mp == NULL)) {
+			/* Unknown MP, add a new MR for it. */
+			break;
+		}
+		if (txq->mp2mr[i].mp == mp) {
+			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
+			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+			return txq->mp2mr[i].lkey;
+		}
+	}
+	return mlx4_txq_add_mr(txq, mp, i);
+}
 #endif /* MLX4_RXTX_H_ */
-- 
2.7.4


* [PATCH v2 3/7] net/mlx4: save lkey in big-endian format
  2017-10-23 14:21 ` [PATCH v2 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
  2017-10-23 14:21   ` [PATCH v2 1/7] net/mlx4: remove error flows from Tx fast path Ophir Munk
  2017-10-23 14:21   ` [PATCH v2 2/7] net/mlx4: inline more Tx functions Ophir Munk
@ 2017-10-23 14:21   ` Ophir Munk
  2017-10-23 15:24     ` Nélio Laranjeiro
  2017-10-23 14:21   ` [PATCH v2 4/7] net/mlx4: merge Tx path functions Ophir Munk
                     ` (4 subsequent siblings)
  7 siblings, 1 reply; 84+ messages in thread
From: Ophir Munk @ 2017-10-23 14:21 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad, Ophir Munk

The mlx4 NIC uses the lkey in big-endian format. Save the lkey in this
format in order to avoid conversions during the Tx fast path.
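
A minimal illustration of the idea, with htonl() standing in for
rte_cpu_to_be_32() and purely illustrative structures: the byte swap is paid
once when the cache entry is filled, and the fast path then performs a plain
store:

#include <stdint.h>
#include <arpa/inet.h>

struct mr_entry {
	uint32_t lkey_be; /* already in big-endian (network) byte order */
};

/* Registration/cache-fill time: convert once. */
static void mr_entry_fill(struct mr_entry *e, uint32_t lkey_cpu)
{
	e->lkey_be = htonl(lkey_cpu);
}

/* Tx fast path: no per-packet conversion, just copy the cached value. */
static void dseg_set_lkey(uint32_t *dseg_lkey, const struct mr_entry *e)
{
	*dseg_lkey = e->lkey_be;
}

int main(void)
{
	struct mr_entry e;
	uint32_t dseg_lkey;

	mr_entry_fill(&e, 0x12345678);
	dseg_set_lkey(&dseg_lkey, &e);
	return dseg_lkey == htonl(0x12345678) ? 0 : 1;
}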

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 16 +++++++---------
 drivers/net/mlx4/mlx4_rxtx.h | 14 ++++++++------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index ae37f9b..4a77be8 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -233,9 +233,9 @@ mlx4_txq_complete(struct txq *txq)
  *   Index in memory pool (MP) where to add memory region (MR)
  *
  * @return
- *   Added mr->lkey on success, (uint32_t)-1 on failure.
+ *   Added mr->lkey (rte_be32_t) on success, (uint32_t)-1 on failure.
  */
-uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
+rte_be32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 {
 	struct ibv_mr *mr;
 
@@ -260,9 +260,9 @@ uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 	/* Store the new entry. */
 	txq->mp2mr[i].mp = mp;
 	txq->mp2mr[i].mr = mr;
-	txq->mp2mr[i].lkey = mr->lkey;
+	txq->mp2mr[i].lkey = rte_cpu_to_be_32(mr->lkey);
 	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
-	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
+	      (void *)txq, mp->name, (void *)mp, mr->lkey);
 	return txq->mp2mr[i].lkey;
 }
 
@@ -289,7 +289,6 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 		uint16_t flags16[2];
 	} srcrb;
 	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
-	uint32_t lkey;
 	uintptr_t addr;
 	uint32_t owner_opcode = MLX4_OPCODE_SEND;
 	uint32_t byte_count;
@@ -323,10 +322,10 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
 			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
 		dseg->addr = rte_cpu_to_be_64(addr);
-		/* Memory region key for this memory pool. */
-		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+		/* Memory region key (big endian) for this memory pool. */
+		dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
 #ifndef NDEBUG
-		if (unlikely(lkey == (uint32_t)-1)) {
+		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
 			/* MR does not exist. */
 			DEBUG("%p: unable to get MP <-> MR association",
 			      (void *)txq);
@@ -341,7 +340,6 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 			return -EFAULT;
 		}
 #endif /* NDEBUG */
-		dseg->lkey = rte_cpu_to_be_32(lkey);
 		if (likely(buf->data_len)) {
 			byte_count = rte_cpu_to_be_32(buf->data_len);
 		} else {
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 719ef45..b1e8ac4 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -135,7 +135,7 @@ struct txq {
 	struct {
 		const struct rte_mempool *mp; /**< Cached memory pool. */
 		struct ibv_mr *mr; /**< Memory region (for mp). */
-		uint32_t lkey; /**< mr->lkey copy. */
+		rte_be32_t lkey; /**< mr->lkey copy. (big endian) */
 	} mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */
 	struct priv *priv; /**< Back pointer to private data. */
 	unsigned int socket; /**< CPU socket ID for allocations. */
@@ -169,7 +169,7 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
 uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
-uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
+rte_be32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
 				unsigned int i);
 
 /* mlx4_txq.c */
@@ -207,9 +207,9 @@ mlx4_txq_mb2mp(struct rte_mbuf *buf)
  *   Memory pool for which a memory region lkey must be returned.
  *
  * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
+ *   mr->lkey (rte_be32_t) on success, (uint32_t)-1 on failure.
  */
-static __rte_always_inline uint32_t
+static __rte_always_inline rte_be32_t
 mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
 {
 	unsigned int i;
@@ -220,8 +220,10 @@ mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
 			break;
 		}
 		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+			assert(txq->mp2mr[i].lkey !=
+				rte_cpu_to_be_32((uint32_t)-1));
+			assert(txq->mp2mr[i].mr->lkey ==
+				rte_be_to_cpu_32(txq->mp2mr[i].lkey));
 			return txq->mp2mr[i].lkey;
 		}
 	}
-- 
2.7.4


* [PATCH v2 4/7] net/mlx4: merge Tx path functions
  2017-10-23 14:21 ` [PATCH v2 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
                     ` (2 preceding siblings ...)
  2017-10-23 14:21   ` [PATCH v2 3/7] net/mlx4: save lkey in big-endian format Ophir Munk
@ 2017-10-23 14:21   ` Ophir Munk
  2017-10-24 13:51     ` Nélio Laranjeiro
  2017-10-23 14:21   ` [PATCH v2 5/7] net/mlx4: remove unnecessary variables in Tx burst Ophir Munk
                     ` (3 subsequent siblings)
  7 siblings, 1 reply; 84+ messages in thread
From: Ophir Munk @ 2017-10-23 14:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

From: Matan Azrad <matan@mellanox.com>

Merge the tx_burst and mlx4_post_send functions to prevent asking twice
about the remaining WQ space.

This should improve performance.
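
The structural effect, reduced to a small standalone sketch (illustrative
names, not the driver's code): the ring-space check happens once per packet
directly in the burst loop, with no second check inside a posting helper:

#include <stdint.h>

struct sq_sketch {
	uint32_t head;
	uint32_t tail;
	uint32_t size;
};

static uint32_t sq_free_slots(const struct sq_sketch *sq)
{
	return sq->size - (sq->head - sq->tail);
}

/* One space check per packet, done inline in the burst loop. */
static unsigned int
burst_sketch(struct sq_sketch *sq, const uint32_t *slots_needed, unsigned int n)
{
	unsigned int i;

	for (i = 0; i != n; ++i) {
		if (sq_free_slots(sq) < slots_needed[i])
			break; /* stop here, no second check elsewhere */
		sq->head += slots_needed[i]; /* "post" the work request */
	}
	return i; /* number of packets actually posted */
}

int main(void)
{
	struct sq_sketch sq = { 0, 0, 4 };
	const uint32_t needed[3] = { 2, 2, 2 };

	return burst_sketch(&sq, needed, 3) == 2 ? 0 : 1;
}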

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 353 +++++++++++++++++++++----------------------
 1 file changed, 170 insertions(+), 183 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 4a77be8..014a6d3 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -267,183 +267,6 @@ rte_be32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 }
 
 /**
- * Posts a single work request to a send queue.
- *
- * @param txq
- *   Target Tx queue.
- * @param pkt
- *   Packet to transmit.
- *
- * @return
- *   0 on success, negative errno value otherwise and rte_errno is set.
- */
-static inline int
-mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
-{
-	struct mlx4_wqe_ctrl_seg *ctrl;
-	struct mlx4_wqe_data_seg *dseg;
-	struct mlx4_sq *sq = &txq->msq;
-	struct rte_mbuf *buf;
-	union {
-		uint32_t flags;
-		uint16_t flags16[2];
-	} srcrb;
-	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
-	uintptr_t addr;
-	uint32_t owner_opcode = MLX4_OPCODE_SEND;
-	uint32_t byte_count;
-	int wqe_real_size;
-	int nr_txbbs;
-	struct pv *pv = (struct pv *)txq->bounce_buf;
-	int pv_counter = 0;
-
-	/* Calculate the needed work queue entry size for this packet. */
-	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-			pkt->nb_segs * sizeof(struct mlx4_wqe_data_seg);
-	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-	/*
-	 * Check that there is room for this WQE in the send queue and that
-	 * the WQE size is legal.
-	 */
-	if (((sq->head - sq->tail) + nr_txbbs +
-	     sq->headroom_txbbs) >= sq->txbb_cnt ||
-	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-		return -ENOSPC;
-	}
-	/* Get the control and data entries of the WQE. */
-	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
-	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-					    sizeof(struct mlx4_wqe_ctrl_seg));
-	/* Fill the data segments with buffer information. */
-	for (buf = pkt; buf != NULL; buf = buf->next, dseg++) {
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		rte_prefetch0((volatile void *)addr);
-		/* Handle WQE wraparound. */
-		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
-			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
-		dseg->addr = rte_cpu_to_be_64(addr);
-		/* Memory region key (big endian) for this memory pool. */
-		dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
-#ifndef NDEBUG
-		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
-			/* MR does not exist. */
-			DEBUG("%p: unable to get MP <-> MR association",
-			      (void *)txq);
-			/*
-			 * Restamp entry in case of failure.
-			 * Make sure that size is written correctly
-			 * Note that we give ownership to the SW, not the HW.
-			 */
-			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-			mlx4_txq_stamp_freed_wqe(sq, head_idx,
-				     (sq->head & sq->txbb_cnt) ? 0 : 1);
-			return -EFAULT;
-		}
-#endif /* NDEBUG */
-		if (likely(buf->data_len)) {
-			byte_count = rte_cpu_to_be_32(buf->data_len);
-		} else {
-			/*
-			 * Zero length segment is treated as inline segment
-			 * with zero data.
-			 */
-			byte_count = RTE_BE32(0x80000000);
-		}
-		/*
-		 * If the data segment is not at the beginning of a
-		 * Tx basic block (TXBB) then write the byte count,
-		 * else postpone the writing to just before updating the
-		 * control segment.
-		 */
-		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-			/*
-			 * Need a barrier here before writing the byte_count
-			 * fields to make sure that all the data is visible
-			 * before the byte_count field is set.
-			 * Otherwise, if the segment begins a new cacheline,
-			 * the HCA prefetcher could grab the 64-byte chunk and
-			 * get a valid (!= 0xffffffff) byte count but stale
-			 * data, and end up sending the wrong data.
-			 */
-			rte_io_wmb();
-			dseg->byte_count = byte_count;
-		} else {
-			/*
-			 * This data segment starts at the beginning of a new
-			 * TXBB, so we need to postpone its byte_count writing
-			 * for later.
-			 */
-			pv[pv_counter].dseg = dseg;
-			pv[pv_counter++].val = byte_count;
-		}
-	}
-	/* Write the first DWORD of each TXBB save earlier. */
-	if (pv_counter) {
-		/* Need a barrier here before writing the byte_count. */
-		rte_io_wmb();
-		for (--pv_counter; pv_counter  >= 0; pv_counter--)
-			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
-	}
-	/* Fill the control parameters for this packet. */
-	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-	/*
-	 * For raw Ethernet, the SOLICIT flag is used to indicate that no ICRC
-	 * should be calculated.
-	 */
-	txq->elts_comp_cd -= nr_txbbs;
-	if (unlikely(txq->elts_comp_cd <= 0)) {
-		txq->elts_comp_cd = txq->elts_comp_cd_init;
-		srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
-				       MLX4_WQE_CTRL_CQ_UPDATE);
-	} else {
-		srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
-	}
-	/* Enable HW checksum offload if requested */
-	if (txq->csum &&
-	    (pkt->ol_flags &
-	     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
-		const uint64_t is_tunneled = (pkt->ol_flags &
-					      (PKT_TX_TUNNEL_GRE |
-					       PKT_TX_TUNNEL_VXLAN));
-
-		if (is_tunneled && txq->csum_l2tun) {
-			owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
-					MLX4_WQE_CTRL_IL4_HDR_CSUM;
-			if (pkt->ol_flags & PKT_TX_OUTER_IP_CKSUM)
-				srcrb.flags |=
-					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
-		} else {
-			srcrb.flags |= RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
-						MLX4_WQE_CTRL_TCP_UDP_CSUM);
-		}
-	}
-	if (txq->lb) {
-		/*
-		 * Copy destination MAC address to the WQE, this allows
-		 * loopback in eSwitch, so that VFs and PF can communicate
-		 * with each other.
-		 */
-		srcrb.flags16[0] = *(rte_pktmbuf_mtod(pkt, uint16_t *));
-		ctrl->imm = *(rte_pktmbuf_mtod_offset(pkt, uint32_t *,
-						      sizeof(uint16_t)));
-	} else {
-		ctrl->imm = 0;
-	}
-	ctrl->srcrb_flags = srcrb.flags;
-	/*
-	 * Make sure descriptor is fully written before
-	 * setting ownership bit (because HW can start
-	 * executing as soon as we do).
-	 */
-	rte_wmb();
-	ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
-					      ((sq->head & sq->txbb_cnt) ?
-					       MLX4_BIT_WQE_OWN : 0));
-	sq->head += nr_txbbs;
-	return 0;
-}
-
-/**
  * DPDK callback for Tx.
  *
  * @param dpdk_txq
@@ -466,7 +289,8 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
-	int err;
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
 
 	assert(txq->elts_comp_cd != 0);
 	mlx4_txq_complete(txq);
@@ -485,6 +309,20 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
+		uint32_t owner_opcode = MLX4_OPCODE_SEND;
+		struct mlx4_wqe_ctrl_seg *ctrl;
+		struct mlx4_wqe_data_seg *dseg;
+		struct rte_mbuf *sbuf;
+		union {
+			uint32_t flags;
+			uint16_t flags16[2];
+		} srcrb;
+		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+		uintptr_t addr;
+		uint32_t byte_count;
+		int wqe_real_size;
+		int nr_txbbs;
+		int pv_counter = 0;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -503,18 +341,167 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		/* Post the packet for sending. */
-		err = mlx4_post_send(txq, buf);
-		if (unlikely(err)) {
+
+		/*
+		 * Calculate the needed work queue entry size
+		 * for this packet.
+		 */
+		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+		/*
+		 * Check that there is room for this WQE in the send
+		 * queue and that the WQE size is legal.
+		 */
+		if (((sq->head - sq->tail) + nr_txbbs +
+		     sq->headroom_txbbs) >= sq->txbb_cnt ||
+		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
 			elt->buf = NULL;
-			goto stop;
+			break;
+		}
+		/* Get the control and data entries of the WQE. */
+		ctrl = (struct mlx4_wqe_ctrl_seg *)
+				mlx4_get_send_wqe(sq, head_idx);
+		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+				sizeof(struct mlx4_wqe_ctrl_seg));
+		/* Fill the data segments with buffer information. */
+		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			rte_prefetch0((volatile void *)addr);
+			/* Handle WQE wraparound. */
+			if (unlikely(dseg >=
+			    (struct mlx4_wqe_data_seg *)sq->eob))
+				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+			dseg->addr = rte_cpu_to_be_64(addr);
+			/* Memory region key (big endian). */
+			dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+	#ifndef NDEBUG
+			if (unlikely(dseg->lkey ==
+				rte_cpu_to_be_32((uint32_t)-1))) {
+				/* MR does not exist. */
+				DEBUG("%p: unable to get MP <-> MR association",
+				      (void *)txq);
+				/*
+				 * Restamp entry in case of failure.
+				 * Make sure that size is written correctly
+				 * Note that we give ownership to the SW,
+				 * not the HW.
+				 */
+				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+				mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					     (sq->head & sq->txbb_cnt) ? 0 : 1);
+				elt->buf = NULL;
+				break;
+			}
+	#endif /* NDEBUG */
+			if (likely(sbuf->data_len)) {
+				byte_count = rte_cpu_to_be_32(sbuf->data_len);
+			} else {
+				/*
+				 * Zero length segment is treated as inline
+				 * segment with zero data.
+				 */
+				byte_count = RTE_BE32(0x80000000);
+			}
+			/*
+			 * If the data segment is not at the beginning
+			 * of a Tx basic block (TXBB) then write the
+			 * byte count, else postpone the writing to
+			 * just before updating the control segment.
+			 */
+			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+				/*
+				 * Need a barrier here before writing the
+				 * byte_count fields to make sure that all the
+				 * data is visible before the byte_count field
+				 * is set. otherwise, if the segment begins a
+				 * new cacheline, the HCA prefetcher could grab
+				 * the 64-byte chunk and get a valid
+				 * (!= 0xffffffff) byte count but stale data,
+				 * and end up sending the wrong data.
+				 */
+				rte_io_wmb();
+				dseg->byte_count = byte_count;
+			} else {
+				/*
+				 * This data segment starts at the beginning of
+				 * a new TXBB, so we need to postpone its
+				 * byte_count writing for later.
+				 */
+				pv[pv_counter].dseg = dseg;
+				pv[pv_counter++].val = byte_count;
+			}
+		}
+		/* Write the first DWORD of each TXBB save earlier. */
+		if (pv_counter) {
+			/* Need a barrier before writing the byte_count. */
+			rte_io_wmb();
+			for (--pv_counter; pv_counter  >= 0; pv_counter--)
+				pv[pv_counter].dseg->byte_count =
+						pv[pv_counter].val;
+		}
+		/* Fill the control parameters for this packet. */
+		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+		/*
+		 * For raw Ethernet, the SOLICIT flag is used to indicate
+		 * that no ICRC should be calculated.
+		 */
+		txq->elts_comp_cd -= nr_txbbs;
+		if (unlikely(txq->elts_comp_cd <= 0)) {
+			txq->elts_comp_cd = txq->elts_comp_cd_init;
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
+					       MLX4_WQE_CTRL_CQ_UPDATE);
+		} else {
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
 		}
+		/* Enable HW checksum offload if requested */
+		if (txq->csum &&
+		    (buf->ol_flags &
+		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
+			const uint64_t is_tunneled = (buf->ol_flags &
+						      (PKT_TX_TUNNEL_GRE |
+						       PKT_TX_TUNNEL_VXLAN));
+
+			if (is_tunneled && txq->csum_l2tun) {
+				owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
+						MLX4_WQE_CTRL_IL4_HDR_CSUM;
+				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+					srcrb.flags |=
+					    RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
+			} else {
+				srcrb.flags |=
+					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+						MLX4_WQE_CTRL_TCP_UDP_CSUM);
+			}
+		}
+		if (txq->lb) {
+			/*
+			 * Copy destination MAC address to the WQE, this allows
+			 * loopback in eSwitch, so that VFs and PF can
+			 * communicate with each other.
+			 */
+			srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
+			ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
+					      sizeof(uint16_t)));
+		} else {
+			ctrl->imm = 0;
+		}
+		ctrl->srcrb_flags = srcrb.flags;
+		/*
+		 * Make sure descriptor is fully written before
+		 * setting ownership bit (because HW can start
+		 * executing as soon as we do).
+		 */
+		rte_wmb();
+		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
+					      ((sq->head & sq->txbb_cnt) ?
+						       MLX4_BIT_WQE_OWN : 0));
+		sq->head += nr_txbbs;
 		elt->buf = buf;
 		bytes_sent += buf->pkt_len;
 		++elts_comp;
 		elts_head = elts_head_next;
 	}
-stop:
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
 		return 0;
-- 
2.7.4


* [PATCH v2 5/7] net/mlx4: remove unnecessary variables in Tx burst
  2017-10-23 14:21 ` [PATCH v2 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
                     ` (3 preceding siblings ...)
  2017-10-23 14:21   ` [PATCH v2 4/7] net/mlx4: merge Tx path functions Ophir Munk
@ 2017-10-23 14:21   ` Ophir Munk
  2017-10-25 16:49     ` Adrien Mazarguil
  2017-10-23 14:21   ` [PATCH v2 6/7] net/mlx4: improve performance of one Tx segment Ophir Munk
                     ` (2 subsequent siblings)
  7 siblings, 1 reply; 84+ messages in thread
From: Ophir Munk @ 2017-10-23 14:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

From: Matan Azrad <matan@mellanox.com>

Remove the usage of variables which do not add new information, as a
performance improvement.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 014a6d3..e8d9a35 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -285,8 +285,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	struct txq *txq = (struct txq *)dpdk_txq;
 	unsigned int elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int elts_comp = 0;
-	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
 	struct mlx4_sq *sq = &txq->msq;
@@ -498,8 +496,7 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 						       MLX4_BIT_WQE_OWN : 0));
 		sq->head += nr_txbbs;
 		elt->buf = buf;
-		bytes_sent += buf->pkt_len;
-		++elts_comp;
+		txq->stats.obytes += buf->pkt_len;
 		elts_head = elts_head_next;
 	}
 	/* Take a shortcut if nothing must be sent. */
@@ -507,13 +504,12 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		return 0;
 	/* Increment send statistics counters. */
 	txq->stats.opackets += i;
-	txq->stats.obytes += bytes_sent;
 	/* Make sure that descriptors are written before doorbell record. */
 	rte_wmb();
 	/* Ring QP doorbell. */
 	rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
 	txq->elts_head = elts_head;
-	txq->elts_comp += elts_comp;
+	txq->elts_comp += i;
 	return i;
 }
 
-- 
2.7.4


* [PATCH v2 6/7] net/mlx4: improve performance of one Tx segment
  2017-10-23 14:21 ` [PATCH v2 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
                     ` (4 preceding siblings ...)
  2017-10-23 14:21   ` [PATCH v2 5/7] net/mlx4: remove unnecessary variables in Tx burst Ophir Munk
@ 2017-10-23 14:21   ` Ophir Munk
  2017-10-25 16:50     ` Adrien Mazarguil
  2017-10-23 14:22   ` [PATCH v2 7/7] net/mlx4: separate Tx for multi-segments Ophir Munk
  2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
  7 siblings, 1 reply; 84+ messages in thread
From: Ophir Munk @ 2017-10-23 14:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

From: Matan Azrad <matan@mellanox.com>

Since a single-segment packet does not need additional memory to save
segment byte_count values for writing them in a different order, avoid
this unnecessary memory usage in that case.
Also avoid the loop management overhead.

All for performance improvement.
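
The principle, reduced to a tiny standalone sketch (illustrative, not the
driver's code): specialize the nb_segs == 1 case so that the common
single-segment packet skips the loop and any temporary bookkeeping that only
out-of-order byte_count writes require:

#include <stddef.h>
#include <stdint.h>

struct seg {
	uint16_t data_len;
	struct seg *next;
};

static uint32_t pkt_len(const struct seg *s, unsigned int nb_segs)
{
	uint32_t len = 0;

	if (nb_segs == 1)
		return s->data_len; /* fast path: no loop, no extra state */
	for (; s != NULL; s = s->next) /* multi-segment: walk the chain */
		len += s->data_len;
	return len;
}

int main(void)
{
	struct seg single = { 64, NULL };
	struct seg b = { 10, NULL };
	struct seg a = { 32, &b };

	return (pkt_len(&single, 1) == 64 && pkt_len(&a, 2) == 42) ? 0 : 1;
}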

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 125 +++++++++++++++++++++++++++++--------------
 1 file changed, 85 insertions(+), 40 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index e8d9a35..3236552 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -310,7 +310,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		uint32_t owner_opcode = MLX4_OPCODE_SEND;
 		struct mlx4_wqe_ctrl_seg *ctrl;
 		struct mlx4_wqe_data_seg *dseg;
-		struct rte_mbuf *sbuf;
 		union {
 			uint32_t flags;
 			uint16_t flags16[2];
@@ -363,12 +362,12 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
 				sizeof(struct mlx4_wqe_ctrl_seg));
 		/* Fill the data segments with buffer information. */
-		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
-			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+		if (likely(buf->nb_segs == 1)) {
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			rte_prefetch0((volatile void *)addr);
 			/* Handle WQE wraparound. */
-			if (unlikely(dseg >=
-			    (struct mlx4_wqe_data_seg *)sq->eob))
+			if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)
+					sq->eob))
 				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
 			dseg->addr = rte_cpu_to_be_64(addr);
 			/* Memory region key (big endian). */
@@ -392,44 +391,90 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				break;
 			}
 	#endif /* NDEBUG */
-			if (likely(sbuf->data_len)) {
-				byte_count = rte_cpu_to_be_32(sbuf->data_len);
-			} else {
-				/*
-				 * Zero length segment is treated as inline
-				 * segment with zero data.
-				 */
-				byte_count = RTE_BE32(0x80000000);
-			}
-			/*
-			 * If the data segment is not at the beginning
-			 * of a Tx basic block (TXBB) then write the
-			 * byte count, else postpone the writing to
-			 * just before updating the control segment.
-			 */
-			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-				/*
-				 * Need a barrier here before writing the
-				 * byte_count fields to make sure that all the
-				 * data is visible before the byte_count field
-				 * is set. otherwise, if the segment begins a
-				 * new cacheline, the HCA prefetcher could grab
-				 * the 64-byte chunk and get a valid
-				 * (!= 0xffffffff) byte count but stale data,
-				 * and end up sending the wrong data.
-				 */
-				rte_io_wmb();
-				dseg->byte_count = byte_count;
-			} else {
+			/* Need a barrier here before writing the byte_count. */
+			rte_io_wmb();
+			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
+		} else {
+			/* Fill the data segments with buffer information. */
+			struct rte_mbuf *sbuf;
+
+			for (sbuf = buf;
+				 sbuf != NULL;
+				 sbuf = sbuf->next, dseg++) {
+				addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+				rte_prefetch0((volatile void *)addr);
+				/* Handle WQE wraparound. */
+				if (unlikely(dseg >=
+					(struct mlx4_wqe_data_seg *)sq->eob))
+					dseg = (struct mlx4_wqe_data_seg *)
+							sq->buf;
+				dseg->addr = rte_cpu_to_be_64(addr);
+				/* Memory region key (big endian). */
+				dseg->lkey = mlx4_txq_mp2mr(txq,
+						mlx4_txq_mb2mp(sbuf));
+		#ifndef NDEBUG
+				if (unlikely(dseg->lkey ==
+					rte_cpu_to_be_32((uint32_t)-1))) {
+					/* MR does not exist. */
+					DEBUG("%p: unable to get MP <-> MR association",
+						  (void *)txq);
+					/*
+					 * Restamp entry in case of failure.
+					 * Make sure that size is written
+					 * correctly, note that we give
+					 * ownership to the SW, not the HW.
+					 */
+					ctrl->fence_size =
+						(wqe_real_size >> 4) & 0x3f;
+					mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					    (sq->head & sq->txbb_cnt) ? 0 : 1);
+					elt->buf = NULL;
+					break;
+				}
+		#endif /* NDEBUG */
+				if (likely(sbuf->data_len)) {
+					byte_count =
+					  rte_cpu_to_be_32(sbuf->data_len);
+				} else {
+					/*
+					 * Zero length segment is treated as
+					 * inline segment with zero data.
+					 */
+					byte_count = RTE_BE32(0x80000000);
+				}
 				/*
-				 * This data segment starts at the beginning of
-				 * a new TXBB, so we need to postpone its
-				 * byte_count writing for later.
+				 * If the data segment is not at the beginning
+				 * of a Tx basic block (TXBB) then write the
+				 * byte count, else postpone the writing to
+				 * just before updating the control segment.
 				 */
-				pv[pv_counter].dseg = dseg;
-				pv[pv_counter++].val = byte_count;
+				if ((uintptr_t)dseg &
+					(uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+					/*
+					 * Need a barrier here before writing
+					 * the byte_count fields to make sure
+					 * that all the data is visible before
+					 * the byte_count field is set.
+					 * Otherwise, if the segment begins a
+					 * new cacheline, the HCA prefetcher
+					 * could grab the 64-byte chunk and get
+					 * a valid (!= 0xffffffff) byte count
+					 * but stale data, and end up sending
+					 * the wrong data.
+					 */
+					rte_io_wmb();
+					dseg->byte_count = byte_count;
+				} else {
+					/*
+					 * This data segment starts at the
+					 * beginning of a new TXBB, so we
+					 * need to postpone its byte_count
+					 * writing for later.
+					 */
+					pv[pv_counter].dseg = dseg;
+					pv[pv_counter++].val = byte_count;
+				}
 			}
-		}
 		/* Write the first DWORD of each TXBB save earlier. */
 		if (pv_counter) {
 			/* Need a barrier before writing the byte_count. */
-- 
2.7.4


* [PATCH v2 7/7] net/mlx4: separate Tx for multi-segments
  2017-10-23 14:21 ` [PATCH v2 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
                     ` (5 preceding siblings ...)
  2017-10-23 14:21   ` [PATCH v2 6/7] net/mlx4: improve performance of one Tx segment Ophir Munk
@ 2017-10-23 14:22   ` Ophir Munk
  2017-10-25 16:50     ` Adrien Mazarguil
  2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
  7 siblings, 1 reply; 84+ messages in thread
From: Ophir Munk @ 2017-10-23 14:22 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad, Ophir Munk

This commit optimizes the handling of single-segment packets and calls a
dedicated function for handling multi-segment packets.

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 284 +++++++++++++++++++++++--------------------
 1 file changed, 154 insertions(+), 130 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 3236552..9596859 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -62,6 +62,9 @@
 #include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
+#define WQE_ONE_DATA_SEG_SIZE \
+	(sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
+
 /**
  * Pointer-value pair structure used in tx_post_send for saving the first
  * DWORD (32 byte) of a TXBB.
@@ -140,22 +143,19 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner)
  * @return
  *   0 on success, -1 on failure.
  */
-static int
-mlx4_txq_complete(struct txq *txq)
+static inline int __attribute__((always_inline))
+mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
+				struct mlx4_sq *sq)
 {
 	unsigned int elts_comp = txq->elts_comp;
 	unsigned int elts_tail = txq->elts_tail;
-	const unsigned int elts_n = txq->elts_n;
 	struct mlx4_cq *cq = &txq->mcq;
-	struct mlx4_sq *sq = &txq->msq;
 	struct mlx4_cqe *cqe;
 	uint32_t cons_index = cq->cons_index;
 	uint16_t new_index;
 	uint16_t nr_txbbs = 0;
 	int pkts = 0;
 
-	if (unlikely(elts_comp == 0))
-		return 0;
 	/*
 	 * Traverse over all CQ entries reported and handle each WQ entry
 	 * reported by them.
@@ -266,6 +266,120 @@ rte_be32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 	return txq->mp2mr[i].lkey;
 }
 
+static int handle_multi_segs(struct rte_mbuf *buf,
+			    struct txq *txq,
+			    struct mlx4_wqe_ctrl_seg **pctrl)
+{
+	int wqe_real_size;
+	int nr_txbbs;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
+	struct mlx4_sq *sq = &txq->msq;
+	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_data_seg *dseg;
+	uintptr_t addr;
+	uint32_t byte_count;
+	int pv_counter = 0;
+
+	/* Calculate the needed work queue entry size for this packet. */
+	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+		buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+	/*
+	 * Check that there is room for this WQE in the send queue and that
+	 * the WQE size is legal.
+	 */
+	if (((sq->head - sq->tail) + nr_txbbs +
+				sq->headroom_txbbs) >= sq->txbb_cnt ||
+			nr_txbbs > MLX4_MAX_WQE_TXBBS) {
+		return -1;
+	}
+
+	/* Get the control and data entries of the WQE. */
+	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
+	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+			sizeof(struct mlx4_wqe_ctrl_seg));
+	*pctrl = ctrl;
+	/* Fill the data segments with buffer information. */
+	struct rte_mbuf *sbuf;
+
+	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+		rte_prefetch0((volatile void *)addr);
+		/* Handle WQE wraparound. */
+		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
+			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+		dseg->addr = rte_cpu_to_be_64(addr);
+		/* Memory region key (big endian) for this memory pool. */
+		dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+#ifndef NDEBUG
+		/* Calculate the needed work queue entry size for this packet */
+		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
+			/* MR does not exist. */
+			DEBUG("%p: unable to get MP <-> MR association",
+					(void *)txq);
+			/*
+			 * Restamp entry in case of failure.
+			 * Make sure that size is written correctly
+			 * Note that we give ownership to the SW, not the HW.
+			 */
+			wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+			mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					(sq->head & sq->txbb_cnt) ? 0 : 1);
+			return -1;
+		}
+#endif /* NDEBUG */
+		if (likely(sbuf->data_len)) {
+			byte_count = rte_cpu_to_be_32(sbuf->data_len);
+		} else {
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			byte_count = RTE_BE32(0x80000000);
+		}
+		/*
+		 * If the data segment is not at the beginning of a
+		 * Tx basic block (TXBB) then write the byte count,
+		 * else postpone the writing to just before updating the
+		 * control segment.
+		 */
+		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+			/*
+			 * Need a barrier here before writing the byte_count
+			 * fields to make sure that all the data is visible
+			 * before the byte_count field is set.
+			 * Otherwise, if the segment begins a new cacheline,
+			 * the HCA prefetcher could grab the 64-byte chunk and
+			 * get a valid (!= 0xffffffff) byte count but stale
+			 * data, and end up sending the wrong data.
+			 */
+			rte_io_wmb();
+			dseg->byte_count = byte_count;
+		} else {
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[pv_counter].dseg = dseg;
+			pv[pv_counter++].val = byte_count;
+		}
+	}
+	/* Write the first DWORD of each TXBB save earlier. */
+	if (pv_counter) {
+		/* Need a barrier here before writing the byte_count. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
+	}
+	/* Fill the control parameters for this packet. */
+	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+
+	return nr_txbbs;
+}
 /**
  * DPDK callback for Tx.
  *
@@ -288,10 +402,11 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int i;
 	unsigned int max;
 	struct mlx4_sq *sq = &txq->msq;
-	struct pv *pv = (struct pv *)txq->bounce_buf;
+	int nr_txbbs;
 
 	assert(txq->elts_comp_cd != 0);
-	mlx4_txq_complete(txq);
+	if (likely(txq->elts_comp != 0))
+		mlx4_txq_complete(txq, elts_n, sq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
@@ -316,10 +431,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		} srcrb;
 		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
 		uintptr_t addr;
-		uint32_t byte_count;
-		int wqe_real_size;
-		int nr_txbbs;
-		int pv_counter = 0;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -338,31 +449,22 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-
-		/*
-		 * Calculate the needed work queue entry size
-		 * for this packet.
-		 */
-		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
-		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-		/*
-		 * Check that there is room for this WQE in the send
-		 * queue and that the WQE size is legal.
-		 */
-		if (((sq->head - sq->tail) + nr_txbbs +
-		     sq->headroom_txbbs) >= sq->txbb_cnt ||
-		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-			elt->buf = NULL;
-			break;
-		}
-		/* Get the control and data entries of the WQE. */
-		ctrl = (struct mlx4_wqe_ctrl_seg *)
-				mlx4_get_send_wqe(sq, head_idx);
-		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-				sizeof(struct mlx4_wqe_ctrl_seg));
-		/* Fill the data segments with buffer information. */
 		if (likely(buf->nb_segs == 1)) {
+			/*
+			 * Check that there is room for this WQE in the send
+			 * queue and that the WQE size is legal
+			 */
+			if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs)
+						>= sq->txbb_cnt ||
+						1 > MLX4_MAX_WQE_TXBBS) {
+				elt->buf = NULL;
+				break;
+			}
+			/* Get the control and data entries of the WQE. */
+			ctrl = (struct mlx4_wqe_ctrl_seg *)
+					mlx4_get_send_wqe(sq, head_idx);
+			dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+					sizeof(struct mlx4_wqe_ctrl_seg));
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			rte_prefetch0((volatile void *)addr);
 			/* Handle WQE wraparound. */
@@ -371,120 +473,42 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
 			dseg->addr = rte_cpu_to_be_64(addr);
 			/* Memory region key (big endian). */
-			dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
-	#ifndef NDEBUG
+			dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+#ifndef NDEBUG
 			if (unlikely(dseg->lkey ==
 				rte_cpu_to_be_32((uint32_t)-1))) {
 				/* MR does not exist. */
 				DEBUG("%p: unable to get MP <-> MR association",
-				      (void *)txq);
+						(void *)txq);
 				/*
 				 * Restamp entry in case of failure.
 				 * Make sure that size is written correctly
 				 * Note that we give ownership to the SW,
 				 * not the HW.
 				 */
-				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+				ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4)
+							& 0x3f;
 				mlx4_txq_stamp_freed_wqe(sq, head_idx,
-					     (sq->head & sq->txbb_cnt) ? 0 : 1);
+					(sq->head & sq->txbb_cnt) ? 0 : 1);
 				elt->buf = NULL;
 				break;
 			}
-	#endif /* NDEBUG */
+#endif /* NDEBUG */
 			/* Need a barrier here before writing the byte_count. */
 			rte_io_wmb();
 			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
+
+			/* Fill the control parameters for this packet. */
+			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+			nr_txbbs = 1;
 		} else {
-			/* Fill the data segments with buffer information. */
-			struct rte_mbuf *sbuf;
-
-			for (sbuf = buf;
-				 sbuf != NULL;
-				 sbuf = sbuf->next, dseg++) {
-				addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
-				rte_prefetch0((volatile void *)addr);
-				/* Handle WQE wraparound. */
-				if (unlikely(dseg >=
-					(struct mlx4_wqe_data_seg *)sq->eob))
-					dseg = (struct mlx4_wqe_data_seg *)
-							sq->buf;
-				dseg->addr = rte_cpu_to_be_64(addr);
-				/* Memory region key (big endian). */
-				dseg->lkey = mlx4_txq_mp2mr(txq,
-						mlx4_txq_mb2mp(sbuf));
-		#ifndef NDEBUG
-				if (unlikely(dseg->lkey ==
-					rte_cpu_to_be_32((uint32_t)-1))) {
-					/* MR does not exist. */
-					DEBUG("%p: unable to get MP <-> MR association",
-						  (void *)txq);
-					/*
-					 * Restamp entry in case of failure.
-					 * Make sure that size is written
-					 * correctly, note that we give
-					 * ownership to the SW, not the HW.
-					 */
-					ctrl->fence_size =
-						(wqe_real_size >> 4) & 0x3f;
-					mlx4_txq_stamp_freed_wqe(sq, head_idx,
-					    (sq->head & sq->txbb_cnt) ? 0 : 1);
-					elt->buf = NULL;
-					break;
-				}
-		#endif /* NDEBUG */
-				if (likely(sbuf->data_len)) {
-					byte_count =
-					  rte_cpu_to_be_32(sbuf->data_len);
-				} else {
-					/*
-					 * Zero length segment is treated as
-					 * inline segment with zero data.
-					 */
-					byte_count = RTE_BE32(0x80000000);
-				}
-				/*
-				 * If the data segment is not at the beginning
-				 * of a Tx basic block (TXBB) then write the
-				 * byte count, else postpone the writing to
-				 * just before updating the control segment.
-				 */
-				if ((uintptr_t)dseg &
-					(uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-					/*
-					 * Need a barrier here before writing
-					 * the byte_count fields to make sure
-					 * that all the data is visible before
-					 * the byte_count field is set.
-					 * Otherwise, if the segment begins a
-					 * new cacheline, the HCA prefetcher
-					 * could grab the 64-byte chunk and get
-					 * a valid (!= 0xffffffff) byte count
-					 * but stale data, and end up sending
-					 * the wrong data.
-					 */
-					rte_io_wmb();
-					dseg->byte_count = byte_count;
-				} else {
-					/*
-					 * This data segment starts at the
-					 * beginning of a new TXBB, so we
-					 * need to postpone its byte_count
-					 * writing for later.
-					 */
-					pv[pv_counter].dseg = dseg;
-					pv[pv_counter++].val = byte_count;
-				}
+			nr_txbbs = handle_multi_segs(buf, txq, &ctrl);
+			if (nr_txbbs < 0) {
+				elt->buf = NULL;
+				break;
 			}
-		/* Write the first DWORD of each TXBB save earlier. */
-		if (pv_counter) {
-			/* Need a barrier before writing the byte_count. */
-			rte_io_wmb();
-			for (--pv_counter; pv_counter  >= 0; pv_counter--)
-				pv[pv_counter].dseg->byte_count =
-						pv[pv_counter].val;
 		}
-		/* Fill the control parameters for this packet. */
-		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+
 		/*
 		 * For raw Ethernet, the SOLICIT flag is used to indicate
 		 * that no ICRC should be calculated.
-- 
2.7.4


* Re: [PATCH v2 3/7] net/mlx4: save lkey in big-endian format
  2017-10-23 14:21   ` [PATCH v2 3/7] net/mlx4: save lkey in big-endian format Ophir Munk
@ 2017-10-23 15:24     ` Nélio Laranjeiro
  0 siblings, 0 replies; 84+ messages in thread
From: Nélio Laranjeiro @ 2017-10-23 15:24 UTC (permalink / raw)
  To: Ophir Munk
  Cc: Adrien Mazarguil, dev, Thomas Monjalon, Olga Shern, Matan Azrad

Hi Ophir,

Some small comments, please see below.

On Mon, Oct 23, 2017 at 02:21:56PM +0000, Ophir Munk wrote:
> The mlx4 NIC uses the lkey in big-endian format. Save the lkey in this
> format in order to avoid conversions during the Tx fast path.
> 
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 16 +++++++---------
>  drivers/net/mlx4/mlx4_rxtx.h | 14 ++++++++------
>  2 files changed, 15 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index ae37f9b..4a77be8 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -233,9 +233,9 @@ mlx4_txq_complete(struct txq *txq)
>   *   Index in memory pool (MP) where to add memory region (MR)
>   *
>   * @return
> - *   Added mr->lkey on success, (uint32_t)-1 on failure.
> + *   Added mr->lkey (rte_be32_t) on success, (uint32_t)-1 on failure.

This comment is strange; it always returns an rte_be32_t no matter what.

>   */
> -uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
> +rte_be32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
>  {
>  	struct ibv_mr *mr;
>  
> @@ -260,9 +260,9 @@ uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
>  	/* Store the new entry. */
>  	txq->mp2mr[i].mp = mp;
>  	txq->mp2mr[i].mr = mr;
> -	txq->mp2mr[i].lkey = mr->lkey;
> +	txq->mp2mr[i].lkey = rte_cpu_to_be_32(mr->lkey);
>  	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
> -	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
> +	      (void *)txq, mp->name, (void *)mp, mr->lkey);
>  	return txq->mp2mr[i].lkey;
>  }
>  
> @@ -289,7 +289,6 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
>  		uint16_t flags16[2];
>  	} srcrb;
>  	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
> -	uint32_t lkey;
>  	uintptr_t addr;
>  	uint32_t owner_opcode = MLX4_OPCODE_SEND;
>  	uint32_t byte_count;
> @@ -323,10 +322,10 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
>  		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
>  			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
>  		dseg->addr = rte_cpu_to_be_64(addr);
> -		/* Memory region key for this memory pool. */
> -		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
> +		/* Memory region key (big endian) for this memory pool. */
> +		dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));

Is not the comment redundant with the return type of the function?

>  #ifndef NDEBUG
> -		if (unlikely(lkey == (uint32_t)-1)) {
> +		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {

((uint32_t)-1) == ((rte_be32_t)-1) like (uint32_t)0 == (rte_be32_t)0.
Why do you need this extra conversion?
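
To illustrate the first point, with htonl() standing in for
rte_cpu_to_be_32():

#include <assert.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	/* All-ones and all-zeros are invariant under byte swapping. */
	assert(htonl((uint32_t)-1) == (uint32_t)-1);
	assert(htonl(UINT32_C(0)) == 0);
	return 0;
}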

>  			/* MR does not exist. */
>  			DEBUG("%p: unable to get MP <-> MR association",
>  			      (void *)txq);
> @@ -341,7 +340,6 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
>  			return -EFAULT;
>  		}
>  #endif /* NDEBUG */
> -		dseg->lkey = rte_cpu_to_be_32(lkey);
>  		if (likely(buf->data_len)) {
>  			byte_count = rte_cpu_to_be_32(buf->data_len);
>  		} else {
> diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> index 719ef45..b1e8ac4 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.h
> +++ b/drivers/net/mlx4/mlx4_rxtx.h
> @@ -135,7 +135,7 @@ struct txq {
>  	struct {
>  		const struct rte_mempool *mp; /**< Cached memory pool. */
>  		struct ibv_mr *mr; /**< Memory region (for mp). */
> -		uint32_t lkey; /**< mr->lkey copy. */
> +		rte_be32_t lkey; /**< mr->lkey copy. (big endian) */

The "big endian" in the comment is redundant.

>  	} mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */
>  	struct priv *priv; /**< Back pointer to private data. */
>  	unsigned int socket; /**< CPU socket ID for allocations. */
> @@ -169,7 +169,7 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
>  			       uint16_t pkts_n);
>  uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
>  			       uint16_t pkts_n);
> -uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> +rte_be32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
>  				unsigned int i);
>  
>  /* mlx4_txq.c */
> @@ -207,9 +207,9 @@ mlx4_txq_mb2mp(struct rte_mbuf *buf)
>   *   Memory pool for which a memory region lkey must be returned.
>   *
>   * @return
> - *   mr->lkey on success, (uint32_t)-1 on failure.
> + *   mr->lkey (rte_be32_t) on success, (uint32_t)-1 on failure.

The comment is also strange; the return type does not change between the success and failure cases.

>   */
> -static __rte_always_inline uint32_t
> +static __rte_always_inline rte_be32_t
>  mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
>  {
>  	unsigned int i;
> @@ -220,8 +220,10 @@ mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
>  			break;
>  		}
>  		if (txq->mp2mr[i].mp == mp) {
> -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> -			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
> +			assert(txq->mp2mr[i].lkey !=
> +				rte_cpu_to_be_32((uint32_t)-1));

Thanks,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 4/7] net/mlx4: merge Tx path functions
  2017-10-23 14:21   ` [PATCH v2 4/7] net/mlx4: merge Tx path functions Ophir Munk
@ 2017-10-24 13:51     ` Nélio Laranjeiro
  2017-10-24 20:36       ` Ophir Munk
  0 siblings, 1 reply; 84+ messages in thread
From: Nélio Laranjeiro @ 2017-10-24 13:51 UTC (permalink / raw)
  To: Ophir Munk
  Cc: Adrien Mazarguil, dev, Thomas Monjalon, Olga Shern, Matan Azrad

On Mon, Oct 23, 2017 at 02:21:57PM +0000, Ophir Munk wrote:
> From: Matan Azrad <matan@mellanox.com>
> 
> Merge tx_burst and mlx4_post_send functions to prevent
> double asking about WQ remain space.
> 
> This should improve performance.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>
> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 353 +++++++++++++++++++++----------------------
>  1 file changed, 170 insertions(+), 183 deletions(-)

What are the real expectation you have on the remaining patches of the
series?

According to this commit log's comment, "This should improve performance",
there are too many barriers at each packet/segment level for it to improve
anything.

The point is, mlx4_burst_tx() should write all the WQEs without any
barrier as it is processing a burst of packets (whereas Verbs functions
may only process a single packet).  The lonely barrier which should be
present is the one to ensure that all the host memory is flushed before
triggering the Tx doorbell.

There are also too many handled cases which are useless in a burst
situation; this function needs to be re-written down to its minimal use
case, i.e. processing a valid burst of packets/segments and triggering
the Tx doorbell at the end of the burst.

Regards,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 4/7] net/mlx4: merge Tx path functions
  2017-10-24 13:51     ` Nélio Laranjeiro
@ 2017-10-24 20:36       ` Ophir Munk
  2017-10-25  7:50         ` Nélio Laranjeiro
  0 siblings, 1 reply; 84+ messages in thread
From: Ophir Munk @ 2017-10-24 20:36 UTC (permalink / raw)
  To: Nélio Laranjeiro
  Cc: Adrien Mazarguil, dev, Thomas Monjalon, Olga Shern, Matan Azrad

Hi,

On Tuesday, October 24, 2017 4:52 PM, Nélio Laranjeiro wrote:
> 
> On Mon, Oct 23, 2017 at 02:21:57PM +0000, Ophir Munk wrote:
> > From: Matan Azrad <matan@mellanox.com>
> >
> > Merge tx_burst and mlx4_post_send functions to prevent double asking
> > about WQ remain space.
> >
> > This should improve performance.
> >
> > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > ---
> >  drivers/net/mlx4/mlx4_rxtx.c | 353
> > +++++++++++++++++++++----------------------
> >  1 file changed, 170 insertions(+), 183 deletions(-)
> 
> What are the real expectation you have on the remaining patches of the
> series?
> 
> According to the comment of this commit log "This should improve
> performance" there are too many barriers at each packet/segment level to
> improve something.
> 
> The point is, mlx4_burst_tx() should write all the WQE without any barrier as
> it is processing a burst of packets (whereas Verbs functions which may only
> process a single packet).  

> The lonely barrier which should be present is the
> one to ensure that all the host memory is flushed before triggering the Tx
> doorbell.
> 

There is a known ConnectX-3 HW limitation: the first 4 bytes of every TXWBB (64-byte chunk) must be
written in reversed order (from the last TXWBB to the first TXWBB).
The last 60 bytes of any TXWBB can be written in any order (before writing the first 4 bytes).
Is your last statement (using a lonely barrier) in accordance with this limitation? Please explain.

> There is also too many cases handled which are useless in bursts situation,
> this function needs to be re-written to its minimal use case i.e.  processing a
> valid burst of packets/segments and triggering at the end of the burst the Tx
> doorbell.
> 
> Regards,
> 
> --
> Nélio Laranjeiro
> 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 4/7] net/mlx4: merge Tx path functions
  2017-10-24 20:36       ` Ophir Munk
@ 2017-10-25  7:50         ` Nélio Laranjeiro
  2017-10-26 10:31           ` Matan Azrad
  0 siblings, 1 reply; 84+ messages in thread
From: Nélio Laranjeiro @ 2017-10-25  7:50 UTC (permalink / raw)
  To: Ophir Munk
  Cc: Adrien Mazarguil, dev, Thomas Monjalon, Olga Shern, Matan Azrad

On Tue, Oct 24, 2017 at 08:36:52PM +0000, Ophir Munk wrote:
> Hi,
> 
> On Tuesday, October 24, 2017 4:52 PM, Nélio Laranjeiro wrote:
> > 
> > On Mon, Oct 23, 2017 at 02:21:57PM +0000, Ophir Munk wrote:
> > > From: Matan Azrad <matan@mellanox.com>
> > >
> > > Merge tx_burst and mlx4_post_send functions to prevent double asking
> > > about WQ remain space.
> > >
> > > This should improve performance.
> > >
> > > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > > ---
> > >  drivers/net/mlx4/mlx4_rxtx.c | 353
> > > +++++++++++++++++++++----------------------
> > >  1 file changed, 170 insertions(+), 183 deletions(-)
> > 
> > What are the real expectation you have on the remaining patches of the
> > series?
> > 
> > According to the comment of this commit log "This should improve
> > performance" there are too many barriers at each packet/segment level to
> > improve something.
> > 
> > The point is, mlx4_burst_tx() should write all the WQE without any barrier as
> > it is processing a burst of packets (whereas Verbs functions which may only
> > process a single packet).  
> 
> > The lonely barrier which should be present is the
> > one to ensure that all the host memory is flushed before triggering the Tx
> > doorbell.
> > 
> 
> There is a known ConnectX-3 HW limitation: the first 4 bytes of every
> TXWBB (64 bytes chunks) should be 
> written in a reversed order (from last TXWBB to first TXWBB).

This means the first WQE filled by the burst function is the doorbell.
In such a situation, its first four bytes can be written just before
leaving the burst function, after a write memory barrier.

As long as this first WQE is not complete, the NIC won't start processing
the packets.  Per-packet memory barriers become useless.

It gives something like:

 uint32_t tx_bb_db = 0;
 void *first_wqe = NULL;

 /*
  * Prepare all packets by writing the WQEs, without the first 4 bytes of
  * the first WQE.
  */
 for (...) {
 	if (!first_wqe) {
		first_wqe = wqe;
		tx_bb_db = foo; /* first DWORD of the first WQE, saved for later */
	}
 }
 /* Leaving. */
 rte_wmb();
 *(uint32_t *)first_wqe = tx_bb_db;
 return n;

> The last 60 bytes of any TXWBB can be written in any order (before
> writing the first 4 bytes).
> Is your last statement (using lonely barrier) is in accordance with
> this limitation? Please explain.
> 
> > There is also too many cases handled which are useless in bursts situation,
> > this function needs to be re-written to its minimal use case i.e.  processing a
> > valid burst of packets/segments and triggering at the end of the burst the Tx
> > doorbell.
> > 

Regards,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 1/7] net/mlx4: remove error flows from Tx fast path
  2017-10-23 14:21   ` [PATCH v2 1/7] net/mlx4: remove error flows from Tx fast path Ophir Munk
@ 2017-10-25 16:49     ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-25 16:49 UTC (permalink / raw)
  To: Ophir Munk; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

Hi Ophir, Matan,

On Mon, Oct 23, 2017 at 02:21:54PM +0000, Ophir Munk wrote:
> From: Matan Azrad <matan@mellanox.com>
> 
> Move unnecessary error flows to DEBUG mode for
> performance improvements.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>

One comment before going on with the review of the remaining patches in this
series, ideally, even obvious patches such as this one (ifdef'ing some code
away) should come with a couple of sentences describing their performance
impact.

If it's negligible or just a gut feeling, then say so. If it's significant,
then by all means you should provide hard numbers and a test scenario (no
need to go in details though). Performance being the main purpose of DPDK
and all, it's *that* important since such commits may be considered for
back-ports later (the lack of performance can be considered a bug).

Some patches labelled as performance improvements are actually code cleanups
not worth validating on their own, but possibly leading to subsequent
patches with actual improvements. I think that's the case here; just avoid
mentioning "performance improvement" without proof.

Other than that:

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 2/7] net/mlx4: inline more Tx functions
  2017-10-23 14:21   ` [PATCH v2 2/7] net/mlx4: inline more Tx functions Ophir Munk
@ 2017-10-25 16:49     ` Adrien Mazarguil
  2017-10-25 21:42       ` Ophir Munk
  0 siblings, 1 reply; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-25 16:49 UTC (permalink / raw)
  To: Ophir Munk; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

Hi Ophir,

On Mon, Oct 23, 2017 at 02:21:55PM +0000, Ophir Munk wrote:
> Change functions to inline on Tx fast path to improve performance
> 
> Inside the inline function call other functions to handle "unlikely"
> cases such that the inline function code footprint is small.
> 
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>

Reading this, it sounds as if adding __rte_always_inline by itself improves
performance, which I doubt unless you can show proof through performance
results.

When in doubt, leave it to the compiler; the static keyword is usually
enough of a hint. Too much forced inlining may actually be harmful.

What this patch really does is split the heavy lookup/registration function
into two halves, with one small static inline function for the lookup part
that calls the separate registration part in the unlikely event the MR is
not already registered.

Thankfully the compiler doesn't inline the large registration function back,
which results in the perceived performance improvement for the time being;
however, there is no guarantee it won't happen in the future (you didn't use
the noinline keyword on the registration function for that).
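
To illustrate the split (untested sketch; the __attribute__((noinline)) on
the registration half is the part the patch does not have):

  /* Registration half, kept away from the fast path. */
  __attribute__((noinline)) uint32_t
  mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i);

  /* Lookup half, small enough for the compiler to inline on its own. */
  static uint32_t
  mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
  {
  	unsigned int i;

  	for (i = 0; i != RTE_DIM(txq->mp2mr); ++i) {
  		if (txq->mp2mr[i].mp == NULL)
  			break;
  		if (txq->mp2mr[i].mp == mp)
  			return txq->mp2mr[i].lkey;
  	}
  	/* Unknown MP, take the slow registration path. */
  	return mlx4_txq_add_mr(txq, mp, i);
  }

That said, keeping the registration half in a separate compilation unit
(see below) achieves the same effect without relying on the attribute.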

Therefore I have a bunch of comments and suggestions, see below.

> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 43 ++++++------------------------------
>  drivers/net/mlx4/mlx4_rxtx.h | 52 +++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 58 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 011ea79..ae37f9b 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
>  	return 0;
>  }
>  
> -/**
> - * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
> - * the cloned mbuf is allocated is returned instead.
> - *
> - * @param buf
> - *   Pointer to mbuf.
> - *
> - * @return
> - *   Memory pool where data is located for given mbuf.
> - */
> -static struct rte_mempool *
> -mlx4_txq_mb2mp(struct rte_mbuf *buf)
> -{
> -	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> -		return rte_mbuf_from_indirect(buf)->pool;
> -	return buf->pool;
> -}
>  
>  /**
> - * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
> - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
> - * remove an entry first.
> + * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
> + * If mp2mr[] is full, remove an entry first.
>   *
>   * @param txq
>   *   Pointer to Tx queue structure.
>   * @param[in] mp
> - *   Memory pool for which a memory region lkey must be returned.
> + *   Memory pool for which a memory region lkey must be added
> + * @param[in] i
> + *   Index in memory pool (MP) where to add memory region (MR)
>   *
>   * @return
> - *   mr->lkey on success, (uint32_t)-1 on failure.
> + *   Added mr->lkey on success, (uint32_t)-1 on failure.
>   */
> -uint32_t
> -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
>  {
> -	unsigned int i;
>  	struct ibv_mr *mr;
>  
> -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> -			/* Unknown MP, add a new MR for it. */
> -			break;
> -		}
> -		if (txq->mp2mr[i].mp == mp) {
> -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> -			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
> -			return txq->mp2mr[i].lkey;
> -		}
> -	}
>  	/* Add a new entry, register MR first. */
>  	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
>  	      (void *)txq, mp->name, (void *)mp);
> diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> index e10bbca..719ef45 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.h
> +++ b/drivers/net/mlx4/mlx4_rxtx.h
> @@ -53,6 +53,7 @@
>  
>  #include "mlx4.h"
>  #include "mlx4_prm.h"
> +#include "mlx4_utils.h"

Why?

>  
>  /** Rx queue counters. */
>  struct mlx4_rxq_stats {
> @@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void *dpdk_rxq);
>  
>  /* mlx4_rxtx.c */
>  
> -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
>  uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
>  		       uint16_t pkts_n);
>  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
> @@ -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
>  			       uint16_t pkts_n);
>  uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
>  			       uint16_t pkts_n);
> +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> +				unsigned int i);
>  
>  /* mlx4_txq.c */
>  
> @@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
>  			const struct rte_eth_txconf *conf);
>  void mlx4_tx_queue_release(void *dpdk_txq);
>  
> +/**
> + * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
> + * the cloned mbuf is allocated is returned instead.
> + *
> + * @param buf
> + *   Pointer to mbuf.
> + *
> + * @return
> + *   Memory pool where data is located for given mbuf.
> + */
> +static __rte_always_inline struct rte_mempool *
> +mlx4_txq_mb2mp(struct rte_mbuf *buf)
> +{
> +	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> +		return rte_mbuf_from_indirect(buf)->pool;
> +	return buf->pool;
> +}
> +
> +/**
> + * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
> + * Call mlx4_txq_add_mr() if MP is not registered yet.
> + *
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param[in] mp
> + *   Memory pool for which a memory region lkey must be returned.
> + *
> + * @return
> + *   mr->lkey on success, (uint32_t)-1 on failure.
> + */
> +static __rte_always_inline uint32_t

Note __rte_always_inline is defined in rte_common.h and should be explicitly
included (however don't do that, see below).

> +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> +			/* Unknown MP, add a new MR for it. */
> +			break;
> +		}
> +		if (txq->mp2mr[i].mp == mp) {
> +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> +			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);

assert() requires assert.h (but don't include it, see subsequent
suggestion).

> +			return txq->mp2mr[i].lkey;
> +		}
> +	}
> +	return mlx4_txq_add_mr(txq, mp, i);
> +}
>  #endif /* MLX4_RXTX_H_ */

So as described above, these functions do not need the __rte_always_inline,
please remove it. They also do not need to be located in a header file; the
reason it's the case for their mlx5 counterparts is that they have to be
shared between vectorized/non-vectorized code. No such requirement here, you
should move them back to their original spot.

My suggestion for this performance improvement is to move mlx4_txq_add_mr()
to a different file, mlx4_mr.c looks like a good candidate. This fact will
ensure it's never inlined and far away from the data path.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 5/7] net/mlx4: remove unnecessary variables in Tx burst
  2017-10-23 14:21   ` [PATCH v2 5/7] net/mlx4: remove unnecessary variables in Tx burst Ophir Munk
@ 2017-10-25 16:49     ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-25 16:49 UTC (permalink / raw)
  To: Ophir Munk; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

Hi Ophir/Matan,

On Mon, Oct 23, 2017 at 02:21:58PM +0000, Ophir Munk wrote:
> From: Matan Azrad <matan@mellanox.com>
> 
> Remove usage of variables which doesn't add new information for
> performance improvement.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>

I'm almost 100% sure this commit wasn't validated for performance on its
own. Don't mention "performance improvement" in that case.

If you're removing a couple of local variables for readability, just say
so.

More below.

> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 8 ++------
>  1 file changed, 2 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 014a6d3..e8d9a35 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -285,8 +285,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  	struct txq *txq = (struct txq *)dpdk_txq;
>  	unsigned int elts_head = txq->elts_head;
>  	const unsigned int elts_n = txq->elts_n;
> -	unsigned int elts_comp = 0;
> -	unsigned int bytes_sent = 0;
>  	unsigned int i;
>  	unsigned int max;
>  	struct mlx4_sq *sq = &txq->msq;
> @@ -498,8 +496,7 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  						       MLX4_BIT_WQE_OWN : 0));
>  		sq->head += nr_txbbs;
>  		elt->buf = buf;
> -		bytes_sent += buf->pkt_len;
> -		++elts_comp;
> +		txq->stats.obytes += buf->pkt_len;
>  		elts_head = elts_head_next;
>  	}
>  	/* Take a shortcut if nothing must be sent. */
> @@ -507,13 +504,12 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  		return 0;
>  	/* Increment send statistics counters. */
>  	txq->stats.opackets += i;
> -	txq->stats.obytes += bytes_sent;
>  	/* Make sure that descriptors are written before doorbell record. */
>  	rte_wmb();
>  	/* Ring QP doorbell. */
>  	rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
>  	txq->elts_head = elts_head;
> -	txq->elts_comp += elts_comp;
> +	txq->elts_comp += i;
>  	return i;
>  }

For bytes_sent, reading these changes and assuming -O0 with the compiler
attempting to convert the code without reordering/improving things, this
replaces register variables used in a loop with memory operations on a large
structure through a pointer (txq->stats update for every iteration instead
of once at the end).

Are you really sure this is more optimized that way? Although the compiler
likely does it already with -O3, helping it avoid unnecessary memory writes
is good in my opinion.
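
In other words, I would rather keep something like the original pattern
(rough sketch):

  unsigned int bytes_sent = 0;

  for (i = 0; i != pkts_n; ++i) {
  	/* ... build the WQE for pkts[i] ... */
  	bytes_sent += buf->pkt_len; /* accumulates in a register */
  }
  /* Single memory update once the loop is done. */
  txq->stats.obytes += bytes_sent;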

OK for the removal of redundant elts_comp though. Although I'm pretty sure
once again the compiler didn't wait for this patch to optimize it away.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 6/7] net/mlx4: improve performance of one Tx segment
  2017-10-23 14:21   ` [PATCH v2 6/7] net/mlx4: improve performance of one Tx segment Ophir Munk
@ 2017-10-25 16:50     ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-25 16:50 UTC (permalink / raw)
  To: Ophir Munk; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

On Mon, Oct 23, 2017 at 02:21:59PM +0000, Ophir Munk wrote:
> From: Matan Azrad <matan@mellanox.com>
> 
> Since one segment shouldn't use additional memory to save segments
> byte_count for writing them in different order we can prevent
> additional memory unnecessary usage in this case.
> By the way, prevent loop management.
> 
> All for performance improvement.

...of the single-segment scenario? In my opinion the TX burst function doesn't
know the likeliest use case of the application unless it first checks some
user-provided configuration, e.g. some flag telling it TX gather is a rare
occurrence.

Multiple segment TX is actually quite common even for small packet
sizes. Applications may find it easier to prepend a cloned mbuf segment to
all packets in order to perform some encapsulation than to memcpy() its
contents inside the headroom of each packet to send. It's much more
efficient CPU-wise and a better use of HW capabilities.
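
For instance (rough sketch, all names made up for illustration), a shared
encapsulation header can be prepended by chaining a clone rather than
copying it into each packet's headroom:

  /* hdr_template holds the encapsulation header, clone_pool provides
   * the indirect mbufs. */
  struct rte_mbuf *hdr = rte_pktmbuf_clone(hdr_template, clone_pool);

  if (hdr != NULL) {
  	hdr->next = pkt;
  	hdr->nb_segs = pkt->nb_segs + 1;
  	hdr->pkt_len += pkt->pkt_len;
  	/* hdr is now the multi-segment packet handed to tx_burst(). */
  }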

likely() and unlikely() must be very carefully used in order to not wreck
the performance of the non-ideal (real-world, non-benchmarking, however you
want to call it) scenario, so when in doubt, keep them for exceptions and
error checks.
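
To illustrate (only a sketch of the idea, using this datapath's own checks):

  /* Genuine error check, fine to annotate. */
  if (unlikely(nr_txbbs > MLX4_MAX_WQE_TXBBS))
  	break;
  /* Ring wraparound fires on every pass around the queue,
   * better left unannotated. */
  if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
  	dseg = (struct mlx4_wqe_data_seg *)sq->buf;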

I can't accept this patch without performance results for single and
multiple-segments use cases which show they both benefit from it.

A few more comments below.

> Signed-off-by: Matan Azrad <matan@mellanox.com>
> 
> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 125 +++++++++++++++++++++++++++++--------------
>  1 file changed, 85 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index e8d9a35..3236552 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -310,7 +310,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  		uint32_t owner_opcode = MLX4_OPCODE_SEND;
>  		struct mlx4_wqe_ctrl_seg *ctrl;
>  		struct mlx4_wqe_data_seg *dseg;
> -		struct rte_mbuf *sbuf;
>  		union {
>  			uint32_t flags;
>  			uint16_t flags16[2];
> @@ -363,12 +362,12 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
>  				sizeof(struct mlx4_wqe_ctrl_seg));
>  		/* Fill the data segments with buffer information. */
> -		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
> -			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> +		if (likely(buf->nb_segs == 1)) {
> +			addr = rte_pktmbuf_mtod(buf, uintptr_t);
>  			rte_prefetch0((volatile void *)addr);
>  			/* Handle WQE wraparound. */
> -			if (unlikely(dseg >=
> -			    (struct mlx4_wqe_data_seg *)sq->eob))
> +			if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)
> +					sq->eob))

Besides the fact this coding style change is unrelated to this commit, this
is one example of unlikely() that should not be unlikely(). While it only
occurs every time the index wraps at the end of the ring, it's still
extremely likely and expected given the number of packets processed per
second.

>  				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
>  			dseg->addr = rte_cpu_to_be_64(addr);
>  			/* Memory region key (big endian). */
> @@ -392,44 +391,90 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  				break;
>  			}
>  	#endif /* NDEBUG */
> -			if (likely(sbuf->data_len)) {
> -				byte_count = rte_cpu_to_be_32(sbuf->data_len);
> -			} else {
> -				/*
> -				 * Zero length segment is treated as inline
> -				 * segment with zero data.
> -				 */
> -				byte_count = RTE_BE32(0x80000000);
> -			}
> -			/*
> -			 * If the data segment is not at the beginning
> -			 * of a Tx basic block (TXBB) then write the
> -			 * byte count, else postpone the writing to
> -			 * just before updating the control segment.
> -			 */
> -			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> -				/*
> -				 * Need a barrier here before writing the
> -				 * byte_count fields to make sure that all the
> -				 * data is visible before the byte_count field
> -				 * is set. otherwise, if the segment begins a
> -				 * new cacheline, the HCA prefetcher could grab
> -				 * the 64-byte chunk and get a valid
> -				 * (!= 0xffffffff) byte count but stale data,
> -				 * and end up sending the wrong data.
> -				 */
> -				rte_io_wmb();
> -				dseg->byte_count = byte_count;
> -			} else {
> +			/* Need a barrier here before writing the byte_count. */
> +			rte_io_wmb();
> +			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
> +		} else {
> +			/* Fill the data segments with buffer information. */
> +			struct rte_mbuf *sbuf;
> +
> +			for (sbuf = buf;
> +				 sbuf != NULL;
> +				 sbuf = sbuf->next, dseg++) {
> +				addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> +				rte_prefetch0((volatile void *)addr);
> +				/* Handle WQE wraparound. */
> +				if (unlikely(dseg >=
> +					(struct mlx4_wqe_data_seg *)sq->eob))
> +					dseg = (struct mlx4_wqe_data_seg *)
> +							sq->buf;
> +				dseg->addr = rte_cpu_to_be_64(addr);
> +				/* Memory region key (big endian). */
> +				dseg->lkey = mlx4_txq_mp2mr(txq,
> +						mlx4_txq_mb2mp(sbuf));
> +		#ifndef NDEBUG

I didn't catch this in the original review: coding rules prohibit indented
preprocessor directives. You must remove the extra indent if you're
modifying them.

> +				if (unlikely(dseg->lkey ==
> +					rte_cpu_to_be_32((uint32_t)-1))) {
> +					/* MR does not exist. */
> +					DEBUG("%p: unable to get MP <-> MR association",
> +						  (void *)txq);
> +					/*
> +					 * Restamp entry in case of failure.
> +					 * Make sure that size is written
> +					 * correctly, note that we give
> +					 * ownership to the SW, not the HW.
> +					 */
> +					ctrl->fence_size =
> +						(wqe_real_size >> 4) & 0x3f;
> +					mlx4_txq_stamp_freed_wqe(sq, head_idx,
> +					    (sq->head & sq->txbb_cnt) ? 0 : 1);
> +					elt->buf = NULL;
> +					break;
> +				}
> +		#endif /* NDEBUG */
> +				if (likely(sbuf->data_len)) {
> +					byte_count =
> +					  rte_cpu_to_be_32(sbuf->data_len);
> +				} else {
> +					/*
> +					 * Zero length segment is treated as
> +					 * inline segment with zero data.
> +					 */
> +					byte_count = RTE_BE32(0x80000000);
> +				}
>  				/*
> -				 * This data segment starts at the beginning of
> -				 * a new TXBB, so we need to postpone its
> -				 * byte_count writing for later.
> +				 * If the data segment is not at the beginning
> +				 * of a Tx basic block (TXBB) then write the
> +				 * byte count, else postpone the writing to
> +				 * just before updating the control segment.
>  				 */
> -				pv[pv_counter].dseg = dseg;
> -				pv[pv_counter++].val = byte_count;
> +				if ((uintptr_t)dseg &
> +					(uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> +					/*
> +					 * Need a barrier here before writing
> +					 * the byte_count fields to make sure
> +					 * that all the data is visible before
> +					 * the byte_count field is set.
> +					 * Otherwise, if the segment begins a
> +					 * new cacheline, the HCA prefetcher
> +					 * could grab the 64-byte chunk and get
> +					 * a valid (!= 0xffffffff) byte count
> +					 * but stale data, and end up sending
> +					 * the wrong data.
> +					 */
> +					rte_io_wmb();
> +					dseg->byte_count = byte_count;
> +				} else {
> +					/*
> +					 * This data segment starts at the
> +					 * beginning of a new TXBB, so we
> +					 * need to postpone its byte_count
> +					 * writing for later.
> +					 */
> +					pv[pv_counter].dseg = dseg;
> +					pv[pv_counter++].val = byte_count;
> +				}
>  			}
> -		}

Where did that block go? Isn't there an unnecessary indentation level here?

>  		/* Write the first DWORD of each TXBB save earlier. */
>  		if (pv_counter) {
>  			/* Need a barrier before writing the byte_count. */
> -- 
> 2.7.4
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 7/7] net/mlx4: separate Tx for multi-segments
  2017-10-23 14:22   ` [PATCH v2 7/7] net/mlx4: separate Tx for multi-segments Ophir Munk
@ 2017-10-25 16:50     ` Adrien Mazarguil
  2017-10-30  8:15       ` Ophir Munk
  0 siblings, 1 reply; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-25 16:50 UTC (permalink / raw)
  To: Ophir Munk; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

Hi Ophir,

On Mon, Oct 23, 2017 at 02:22:00PM +0000, Ophir Munk wrote:
> This commit optimizes handling of one segment and calls a
> dedicated function for handling multi segments
> 
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>

While it indeed moves the code to a separate function, I'm not sure by how
much it improves performance.

Is it noticeably better? Can you provide a short performance summary with
and without this patch? Is that the case for both single and multi-segment
scenarios, or was this improvement at the cost of a degradation in the
latter case?

If it splits a large function into two smaller ones for readability and no
performance validation was done on this specific patch alone, please do not
label it as a performance improvement. I'm fine with readability
improvements when properly identified as such.

A few additional comments below.

> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 284 +++++++++++++++++++++++--------------------
>  1 file changed, 154 insertions(+), 130 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 3236552..9596859 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -62,6 +62,9 @@
>  #include "mlx4_rxtx.h"
>  #include "mlx4_utils.h"
>  
> +#define WQE_ONE_DATA_SEG_SIZE \
> +	(sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
> +
>  /**
>   * Pointer-value pair structure used in tx_post_send for saving the first
>   * DWORD (32 byte) of a TXBB.
> @@ -140,22 +143,19 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner)
>   * @return
>   *   0 on success, -1 on failure.
>   */
> -static int
> -mlx4_txq_complete(struct txq *txq)
> +static inline int __attribute__((always_inline))

Should be static only, leave the rest to the compiler. This function is
large enough that it shouldn't make much of a difference anyway (unless
proved otherwise).

> +mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
> +				struct mlx4_sq *sq)
>  {
>  	unsigned int elts_comp = txq->elts_comp;
>  	unsigned int elts_tail = txq->elts_tail;
> -	const unsigned int elts_n = txq->elts_n;
>  	struct mlx4_cq *cq = &txq->mcq;
> -	struct mlx4_sq *sq = &txq->msq;
>  	struct mlx4_cqe *cqe;
>  	uint32_t cons_index = cq->cons_index;
>  	uint16_t new_index;
>  	uint16_t nr_txbbs = 0;
>  	int pkts = 0;
>  
> -	if (unlikely(elts_comp == 0))
> -		return 0;
>  	/*
>  	 * Traverse over all CQ entries reported and handle each WQ entry
>  	 * reported by them.
> @@ -266,6 +266,120 @@ rte_be32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
>  	return txq->mp2mr[i].lkey;
>  }
>  
> +static int handle_multi_segs(struct rte_mbuf *buf,
> +			    struct txq *txq,
> +			    struct mlx4_wqe_ctrl_seg **pctrl)
> +{
> +	int wqe_real_size;
> +	int nr_txbbs;
> +	struct pv *pv = (struct pv *)txq->bounce_buf;
> +	struct mlx4_sq *sq = &txq->msq;
> +	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
> +	struct mlx4_wqe_ctrl_seg *ctrl;
> +	struct mlx4_wqe_data_seg *dseg;
> +	uintptr_t addr;
> +	uint32_t byte_count;
> +	int pv_counter = 0;
> +
> +	/* Calculate the needed work queue entry size for this packet. */
> +	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> +		buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> +	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> +	/*
> +	 * Check that there is room for this WQE in the send queue and that
> +	 * the WQE size is legal.
> +	 */
> +	if (((sq->head - sq->tail) + nr_txbbs +
> +				sq->headroom_txbbs) >= sq->txbb_cnt ||
> +			nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> +		return -1;
> +	}
> +
> +	/* Get the control and data entries of the WQE. */
> +	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
> +	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> +			sizeof(struct mlx4_wqe_ctrl_seg));
> +	*pctrl = ctrl;
> +	/* Fill the data segments with buffer information. */
> +	struct rte_mbuf *sbuf;
> +
> +	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
> +		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> +		rte_prefetch0((volatile void *)addr);
> +		/* Handle WQE wraparound. */
> +		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
> +			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
> +		dseg->addr = rte_cpu_to_be_64(addr);
> +		/* Memory region key (big endian) for this memory pool. */
> +		dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
> +#ifndef NDEBUG
> +		/* Calculate the needed work queue entry size for this packet */
> +		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
> +			/* MR does not exist. */
> +			DEBUG("%p: unable to get MP <-> MR association",
> +					(void *)txq);
> +			/*
> +			 * Restamp entry in case of failure.
> +			 * Make sure that size is written correctly
> +			 * Note that we give ownership to the SW, not the HW.
> +			 */
> +			wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> +				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> +			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> +			mlx4_txq_stamp_freed_wqe(sq, head_idx,
> +					(sq->head & sq->txbb_cnt) ? 0 : 1);
> +			return -1;
> +		}
> +#endif /* NDEBUG */
> +		if (likely(sbuf->data_len)) {
> +			byte_count = rte_cpu_to_be_32(sbuf->data_len);
> +		} else {
> +			/*
> +			 * Zero length segment is treated as inline segment
> +			 * with zero data.
> +			 */
> +			byte_count = RTE_BE32(0x80000000);
> +		}
> +		/*
> +		 * If the data segment is not at the beginning of a
> +		 * Tx basic block (TXBB) then write the byte count,
> +		 * else postpone the writing to just before updating the
> +		 * control segment.
> +		 */
> +		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> +			/*
> +			 * Need a barrier here before writing the byte_count
> +			 * fields to make sure that all the data is visible
> +			 * before the byte_count field is set.
> +			 * Otherwise, if the segment begins a new cacheline,
> +			 * the HCA prefetcher could grab the 64-byte chunk and
> +			 * get a valid (!= 0xffffffff) byte count but stale
> +			 * data, and end up sending the wrong data.
> +			 */
> +			rte_io_wmb();
> +			dseg->byte_count = byte_count;
> +		} else {
> +			/*
> +			 * This data segment starts at the beginning of a new
> +			 * TXBB, so we need to postpone its byte_count writing
> +			 * for later.
> +			 */
> +			pv[pv_counter].dseg = dseg;
> +			pv[pv_counter++].val = byte_count;
> +		}
> +	}
> +	/* Write the first DWORD of each TXBB save earlier. */
> +	if (pv_counter) {
> +		/* Need a barrier here before writing the byte_count. */
> +		rte_io_wmb();
> +		for (--pv_counter; pv_counter  >= 0; pv_counter--)
> +			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
> +	}
> +	/* Fill the control parameters for this packet. */
> +	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> +
> +	return nr_txbbs;
> +}
>  /**
>   * DPDK callback for Tx.
>   *
> @@ -288,10 +402,11 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  	unsigned int i;
>  	unsigned int max;
>  	struct mlx4_sq *sq = &txq->msq;
> -	struct pv *pv = (struct pv *)txq->bounce_buf;
> +	int nr_txbbs;
>  
>  	assert(txq->elts_comp_cd != 0);
> -	mlx4_txq_complete(txq);
> +	if (likely(txq->elts_comp != 0))
> +		mlx4_txq_complete(txq, elts_n, sq);
>  	max = (elts_n - (elts_head - txq->elts_tail));
>  	if (max > elts_n)
>  		max -= elts_n;
> @@ -316,10 +431,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  		} srcrb;
>  		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
>  		uintptr_t addr;
> -		uint32_t byte_count;
> -		int wqe_real_size;
> -		int nr_txbbs;
> -		int pv_counter = 0;
>  
>  		/* Clean up old buffer. */
>  		if (likely(elt->buf != NULL)) {
> @@ -338,31 +449,22 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  			} while (tmp != NULL);
>  		}
>  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> -
> -		/*
> -		 * Calculate the needed work queue entry size
> -		 * for this packet.
> -		 */
> -		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> -				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> -		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> -		/*
> -		 * Check that there is room for this WQE in the send
> -		 * queue and that the WQE size is legal.
> -		 */
> -		if (((sq->head - sq->tail) + nr_txbbs +
> -		     sq->headroom_txbbs) >= sq->txbb_cnt ||
> -		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> -			elt->buf = NULL;
> -			break;
> -		}
> -		/* Get the control and data entries of the WQE. */
> -		ctrl = (struct mlx4_wqe_ctrl_seg *)
> -				mlx4_get_send_wqe(sq, head_idx);
> -		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> -				sizeof(struct mlx4_wqe_ctrl_seg));
> -		/* Fill the data segments with buffer information. */
>  		if (likely(buf->nb_segs == 1)) {
> +			/*
> +			 * Check that there is room for this WQE in the send
> +			 * queue and that the WQE size is legal
> +			 */
> +			if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs)
> +						>= sq->txbb_cnt ||
> +						1 > MLX4_MAX_WQE_TXBBS) {
> +				elt->buf = NULL;
> +				break;
> +			}
> +			/* Get the control and data entries of the WQE. */
> +			ctrl = (struct mlx4_wqe_ctrl_seg *)
> +					mlx4_get_send_wqe(sq, head_idx);
> +			dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> +					sizeof(struct mlx4_wqe_ctrl_seg));
>  			addr = rte_pktmbuf_mtod(buf, uintptr_t);
>  			rte_prefetch0((volatile void *)addr);
>  			/* Handle WQE wraparound. */
> @@ -371,120 +473,42 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
>  			dseg->addr = rte_cpu_to_be_64(addr);
>  			/* Memory region key (big endian). */
> -			dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
> -	#ifndef NDEBUG
> +			dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
> +#ifndef NDEBUG
>  			if (unlikely(dseg->lkey ==
>  				rte_cpu_to_be_32((uint32_t)-1))) {
>  				/* MR does not exist. */
>  				DEBUG("%p: unable to get MP <-> MR association",
> -				      (void *)txq);
> +						(void *)txq);
>  				/*
>  				 * Restamp entry in case of failure.
>  				 * Make sure that size is written correctly
>  				 * Note that we give ownership to the SW,
>  				 * not the HW.
>  				 */
> -				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> +				ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4)
> +							& 0x3f;
>  				mlx4_txq_stamp_freed_wqe(sq, head_idx,
> -					     (sq->head & sq->txbb_cnt) ? 0 : 1);
> +					(sq->head & sq->txbb_cnt) ? 0 : 1);
>  				elt->buf = NULL;
>  				break;
>  			}
> -	#endif /* NDEBUG */
> +#endif /* NDEBUG */
>  			/* Need a barrier here before writing the byte_count. */
>  			rte_io_wmb();
>  			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
> +
> +			/* Fill the control parameters for this packet. */
> +			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
> +			nr_txbbs = 1;
>  		} else {
> -			/* Fill the data segments with buffer information. */
> -			struct rte_mbuf *sbuf;
> -
> -			for (sbuf = buf;
> -				 sbuf != NULL;
> -				 sbuf = sbuf->next, dseg++) {
> -				addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> -				rte_prefetch0((volatile void *)addr);
> -				/* Handle WQE wraparound. */
> -				if (unlikely(dseg >=
> -					(struct mlx4_wqe_data_seg *)sq->eob))
> -					dseg = (struct mlx4_wqe_data_seg *)
> -							sq->buf;
> -				dseg->addr = rte_cpu_to_be_64(addr);
> -				/* Memory region key (big endian). */
> -				dseg->lkey = mlx4_txq_mp2mr(txq,
> -						mlx4_txq_mb2mp(sbuf));
> -		#ifndef NDEBUG
> -				if (unlikely(dseg->lkey ==
> -					rte_cpu_to_be_32((uint32_t)-1))) {
> -					/* MR does not exist. */
> -					DEBUG("%p: unable to get MP <-> MR association",
> -						  (void *)txq);
> -					/*
> -					 * Restamp entry in case of failure.
> -					 * Make sure that size is written
> -					 * correctly, note that we give
> -					 * ownership to the SW, not the HW.
> -					 */
> -					ctrl->fence_size =
> -						(wqe_real_size >> 4) & 0x3f;
> -					mlx4_txq_stamp_freed_wqe(sq, head_idx,
> -					    (sq->head & sq->txbb_cnt) ? 0 : 1);
> -					elt->buf = NULL;
> -					break;
> -				}
> -		#endif /* NDEBUG */
> -				if (likely(sbuf->data_len)) {
> -					byte_count =
> -					  rte_cpu_to_be_32(sbuf->data_len);
> -				} else {
> -					/*
> -					 * Zero length segment is treated as
> -					 * inline segment with zero data.
> -					 */
> -					byte_count = RTE_BE32(0x80000000);
> -				}
> -				/*
> -				 * If the data segment is not at the beginning
> -				 * of a Tx basic block (TXBB) then write the
> -				 * byte count, else postpone the writing to
> -				 * just before updating the control segment.
> -				 */
> -				if ((uintptr_t)dseg &
> -					(uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> -					/*
> -					 * Need a barrier here before writing
> -					 * the byte_count fields to make sure
> -					 * that all the data is visible before
> -					 * the byte_count field is set.
> -					 * Otherwise, if the segment begins a
> -					 * new cacheline, the HCA prefetcher
> -					 * could grab the 64-byte chunk and get
> -					 * a valid (!= 0xffffffff) byte count
> -					 * but stale data, and end up sending
> -					 * the wrong data.
> -					 */
> -					rte_io_wmb();
> -					dseg->byte_count = byte_count;
> -				} else {
> -					/*
> -					 * This data segment starts at the
> -					 * beginning of a new TXBB, so we
> -					 * need to postpone its byte_count
> -					 * writing for later.
> -					 */
> -					pv[pv_counter].dseg = dseg;
> -					pv[pv_counter++].val = byte_count;
> -				}
> +			nr_txbbs = handle_multi_segs(buf, txq, &ctrl);

Having all this part non-inline could degrade multi-segment performance; is
that OK?

> +			if (nr_txbbs < 0) {
> +				elt->buf = NULL;
> +				break;
>  			}
> -		/* Write the first DWORD of each TXBB save earlier. */
> -		if (pv_counter) {
> -			/* Need a barrier before writing the byte_count. */
> -			rte_io_wmb();
> -			for (--pv_counter; pv_counter  >= 0; pv_counter--)
> -				pv[pv_counter].dseg->byte_count =
> -						pv[pv_counter].val;
>  		}
> -		/* Fill the control parameters for this packet. */
> -		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> +
>  		/*
>  		 * For raw Ethernet, the SOLICIT flag is used to indicate
>  		 * that no ICRC should be calculated.
> -- 
> 2.7.4
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 2/7] net/mlx4: inline more Tx functions
  2017-10-25 16:49     ` Adrien Mazarguil
@ 2017-10-25 21:42       ` Ophir Munk
  2017-10-26  7:48         ` Adrien Mazarguil
  0 siblings, 1 reply; 84+ messages in thread
From: Ophir Munk @ 2017-10-25 21:42 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

Hi Adrien,

On Wednesday, October 25, 2017 7:50 PM, Adrien Mazarguil wrote:
> 
> Hi Ophir,
> 
> On Mon, Oct 23, 2017 at 02:21:55PM +0000, Ophir Munk wrote:
> > Change functions to inline on Tx fast path to improve performance
> >
> > Inside the inline function call other functions to handle "unlikely"
> > cases such that the inline function code footprint is small.
> >
> > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> 
> Reading this, it's like adding __rte_always_inline improves performance at
> all, which I doubt unless you can show proof through performance results.
> 
> When in doubt, leave it to the compiler, the static keyword is usually enough
> of a hint. Too much forced inlining may actually be harmful.
> 
> What this patch really does is splitting the heavy lookup/registration function
> in two halves with one small static inline function for the lookup part that
> calls the separate registration part in the unlikely event MR is not already
> registered.
> 
> Thankfully the compiler doesn't inline the large registration function back,
> which results in the perceived performance improvement for the time being,
> however there is no guarantee it won't happen in the future (you didn't use
> the noinline keyword on the registration function for that).
> 
> Therefore I have a bunch of comments and suggestions, see below.
> 
> > ---
> >  drivers/net/mlx4/mlx4_rxtx.c | 43
> > ++++++------------------------------
> >  drivers/net/mlx4/mlx4_rxtx.h | 52
> > +++++++++++++++++++++++++++++++++++++++++++-
> >  2 files changed, 58 insertions(+), 37 deletions(-)
> >
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > b/drivers/net/mlx4/mlx4_rxtx.c index 011ea79..ae37f9b 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > @@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
> >  	return 0;
> >  }
> >
> > -/**
> > - * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from
> > which
> > - * the cloned mbuf is allocated is returned instead.
> > - *
> > - * @param buf
> > - *   Pointer to mbuf.
> > - *
> > - * @return
> > - *   Memory pool where data is located for given mbuf.
> > - */
> > -static struct rte_mempool *
> > -mlx4_txq_mb2mp(struct rte_mbuf *buf)
> > -{
> > -	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > -		return rte_mbuf_from_indirect(buf)->pool;
> > -	return buf->pool;
> > -}
> >
> >  /**
> > - * Get memory region (MR) <-> memory pool (MP) association from txq-
> >mp2mr[].
> > - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is
> > full,
> > - * remove an entry first.
> > + * Add memory region (MR) <-> memory pool (MP) association to txq-
> >mp2mr[].
> > + * If mp2mr[] is full, remove an entry first.
> >   *
> >   * @param txq
> >   *   Pointer to Tx queue structure.
> >   * @param[in] mp
> > - *   Memory pool for which a memory region lkey must be returned.
> > + *   Memory pool for which a memory region lkey must be added
> > + * @param[in] i
> > + *   Index in memory pool (MP) where to add memory region (MR)
> >   *
> >   * @return
> > - *   mr->lkey on success, (uint32_t)-1 on failure.
> > + *   Added mr->lkey on success, (uint32_t)-1 on failure.
> >   */
> > -uint32_t
> > -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > +uint32_t i)
> >  {
> > -	unsigned int i;
> >  	struct ibv_mr *mr;
> >
> > -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > -			/* Unknown MP, add a new MR for it. */
> > -			break;
> > -		}
> > -		if (txq->mp2mr[i].mp == mp) {
> > -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > -			assert(txq->mp2mr[i].mr->lkey == txq-
> >mp2mr[i].lkey);
> > -			return txq->mp2mr[i].lkey;
> > -		}
> > -	}
> >  	/* Add a new entry, register MR first. */
> >  	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> >  	      (void *)txq, mp->name, (void *)mp); diff --git
> > a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h index
> > e10bbca..719ef45 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > @@ -53,6 +53,7 @@
> >
> >  #include "mlx4.h"
> >  #include "mlx4_prm.h"
> > +#include "mlx4_utils.h"
> 
> Why?
> 
> >
> >  /** Rx queue counters. */
> >  struct mlx4_rxq_stats {
> > @@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void *dpdk_rxq);
> >
> >  /* mlx4_rxtx.c */
> >
> > -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
> > uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
> >  		       uint16_t pkts_n);
> >  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, @@
> > -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq,
> struct rte_mbuf **pkts,
> >  			       uint16_t pkts_n);
> >  uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
> >  			       uint16_t pkts_n);
> > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > +				unsigned int i);
> >
> >  /* mlx4_txq.c */
> >
> > @@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev
> *dev, uint16_t idx,
> >  			const struct rte_eth_txconf *conf);  void
> > mlx4_tx_queue_release(void *dpdk_txq);
> >
> > +/**
> > + * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from
> > +which
> > + * the cloned mbuf is allocated is returned instead.
> > + *
> > + * @param buf
> > + *   Pointer to mbuf.
> > + *
> > + * @return
> > + *   Memory pool where data is located for given mbuf.
> > + */
> > +static __rte_always_inline struct rte_mempool * mlx4_txq_mb2mp(struct
> > +rte_mbuf *buf) {
> > +	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > +		return rte_mbuf_from_indirect(buf)->pool;
> > +	return buf->pool;
> > +}
> > +
> > +/**
> > + * Get memory region (MR) <-> memory pool (MP) association from txq-
> >mp2mr[].
> > + * Call mlx4_txq_add_mr() if MP is not registered yet.
> > + *
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param[in] mp
> > + *   Memory pool for which a memory region lkey must be returned.
> > + *
> > + * @return
> > + *   mr->lkey on success, (uint32_t)-1 on failure.
> > + */
> > +static __rte_always_inline uint32_t
> 
> Note __rte_always_inline is defined in rte_common.h and should be
> explicitly included (however don't do that, see below).
> 
> > +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp) {
> > +	unsigned int i;
> > +
> > +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > +			/* Unknown MP, add a new MR for it. */
> > +			break;
> > +		}
> > +		if (txq->mp2mr[i].mp == mp) {
> > +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > +			assert(txq->mp2mr[i].mr->lkey == txq-
> >mp2mr[i].lkey);
> 
> assert() requires assert.h (but don't include it, see subsequent suggestion).
> 
> > +			return txq->mp2mr[i].lkey;
> > +		}
> > +	}
> > +	return mlx4_txq_add_mr(txq, mp, i);
> > +}
> >  #endif /* MLX4_RXTX_H_ */
> 
> So as described above, these functions do not need the __rte_always_inline,
> please remove it. They also do not need to be located in a header file; the
> reason it's the case for their mlx5 counterparts is that they have to be shared
> between vectorized/non-vectorized code. No such requirement here, you
> should move them back to their original spot.
> 

Static function mlx4_txq_mp2mr() must be in a header file because it is shared by 2 files: mlx4_txq.c and mlx4_rxtx.c.
It is not related to vectorized/non-vectorized code in mlx5.
Having said that, __rte_always_inline is required as well, otherwise compilation fails with
drivers/net/mlx4/mlx4_rxtx.h:200:1: error: 'mlx4_txq_mp2mr' defined but not used [-Werror=unused-function]
for files which include mlx4_rxtx.h.

> My suggestion for this performance improvement is to move
> mlx4_txq_add_mr() to a different file, mlx4_mr.c looks like a good
> candidate. This fact will ensure it's never inlined and far away from the data
> path.
> 

Function mlx4_txq_add_mr() is relatively small. 
What do you say about preceding it with __attribute((noinline)) instead of creating a new file?

> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 2/7] net/mlx4: inline more Tx functions
  2017-10-25 21:42       ` Ophir Munk
@ 2017-10-26  7:48         ` Adrien Mazarguil
  2017-10-26 14:27           ` Ophir Munk
  0 siblings, 1 reply; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-26  7:48 UTC (permalink / raw)
  To: Ophir Munk; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

Hi Ophir,

Please see below.

On Wed, Oct 25, 2017 at 09:42:46PM +0000, Ophir Munk wrote:
> Hi Adrien,
> 
> On Wednesday, October 25, 2017 7:50 PM, Adrien Mazarguil wrote:
> > 
> > Hi Ophir,
> > 
> > On Mon, Oct 23, 2017 at 02:21:55PM +0000, Ophir Munk wrote:
> > > Change functions to inline on Tx fast path to improve performance
> > >
> > > Inside the inline function call other functions to handle "unlikely"
> > > cases such that the inline function code footprint is small.
> > >
> > > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> > 
> > Reading this, it's like adding __rte_always_inline improves performance at
> > all, which I doubt unless you can show proof through performance results.
> > 
> > When in doubt, leave it to the compiler, the static keyword is usually enough
> > of a hint. Too much forced inlining may actually be harmful.
> > 
> > What this patch really does is splitting the heavy lookup/registration function
> > in two halves with one small static inline function for the lookup part that
> > calls the separate registration part in the unlikely event MR is not already
> > registered.
> > 
> > Thankfully the compiler doesn't inline the large registration function back,
> > which results in the perceived performance improvement for the time being,
> > however there is no guarantee it won't happen in the future (you didn't use
> > the noinline keyword on the registration function for that).
> > 
> > Therefore I have a bunch of comments and suggestions, see below.
> > 
> > > ---
> > >  drivers/net/mlx4/mlx4_rxtx.c | 43
> > > ++++++------------------------------
> > >  drivers/net/mlx4/mlx4_rxtx.h | 52
> > > +++++++++++++++++++++++++++++++++++++++++++-
> > >  2 files changed, 58 insertions(+), 37 deletions(-)
> > >
> > > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > > b/drivers/net/mlx4/mlx4_rxtx.c index 011ea79..ae37f9b 100644
> > > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > > @@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
> > >  	return 0;
> > >  }
> > >
> > > -/**
> > > - * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from
> > > which
> > > - * the cloned mbuf is allocated is returned instead.
> > > - *
> > > - * @param buf
> > > - *   Pointer to mbuf.
> > > - *
> > > - * @return
> > > - *   Memory pool where data is located for given mbuf.
> > > - */
> > > -static struct rte_mempool *
> > > -mlx4_txq_mb2mp(struct rte_mbuf *buf)
> > > -{
> > > -	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > -		return rte_mbuf_from_indirect(buf)->pool;
> > > -	return buf->pool;
> > > -}
> > >
> > >  /**
> > > - * Get memory region (MR) <-> memory pool (MP) association from txq-
> > >mp2mr[].
> > > - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is
> > > full,
> > > - * remove an entry first.
> > > + * Add memory region (MR) <-> memory pool (MP) association to txq-
> > >mp2mr[].
> > > + * If mp2mr[] is full, remove an entry first.
> > >   *
> > >   * @param txq
> > >   *   Pointer to Tx queue structure.
> > >   * @param[in] mp
> > > - *   Memory pool for which a memory region lkey must be returned.
> > > + *   Memory pool for which a memory region lkey must be added
> > > + * @param[in] i
> > > + *   Index in memory pool (MP) where to add memory region (MR)
> > >   *
> > >   * @return
> > > - *   mr->lkey on success, (uint32_t)-1 on failure.
> > > + *   Added mr->lkey on success, (uint32_t)-1 on failure.
> > >   */
> > > -uint32_t
> > > -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > > +uint32_t i)
> > >  {
> > > -	unsigned int i;
> > >  	struct ibv_mr *mr;
> > >
> > > -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > -			/* Unknown MP, add a new MR for it. */
> > > -			break;
> > > -		}
> > > -		if (txq->mp2mr[i].mp == mp) {
> > > -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > -			assert(txq->mp2mr[i].mr->lkey == txq-
> > >mp2mr[i].lkey);
> > > -			return txq->mp2mr[i].lkey;
> > > -		}
> > > -	}
> > >  	/* Add a new entry, register MR first. */
> > >  	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> > >  	      (void *)txq, mp->name, (void *)mp); diff --git
> > > a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h index
> > > e10bbca..719ef45 100644
> > > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > > @@ -53,6 +53,7 @@
> > >
> > >  #include "mlx4.h"
> > >  #include "mlx4_prm.h"
> > > +#include "mlx4_utils.h"
> > 
> > Why?
> > 
> > >
> > >  /** Rx queue counters. */
> > >  struct mlx4_rxq_stats {
> > > @@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void *dpdk_rxq);
> > >
> > >  /* mlx4_rxtx.c */
> > >
> > > -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
> > > uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
> > >  		       uint16_t pkts_n);
> > >  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, @@
> > > -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq,
> > struct rte_mbuf **pkts,
> > >  			       uint16_t pkts_n);
> > >  uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
> > >  			       uint16_t pkts_n);
> > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > > +				unsigned int i);
> > >
> > >  /* mlx4_txq.c */
> > >
> > > @@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev
> > *dev, uint16_t idx,
> > >  			const struct rte_eth_txconf *conf);  void
> > > mlx4_tx_queue_release(void *dpdk_txq);
> > >
> > > +/**
> > > + * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from
> > > +which
> > > + * the cloned mbuf is allocated is returned instead.
> > > + *
> > > + * @param buf
> > > + *   Pointer to mbuf.
> > > + *
> > > + * @return
> > > + *   Memory pool where data is located for given mbuf.
> > > + */
> > > +static __rte_always_inline struct rte_mempool * mlx4_txq_mb2mp(struct
> > > +rte_mbuf *buf) {
> > > +	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > +		return rte_mbuf_from_indirect(buf)->pool;
> > > +	return buf->pool;
> > > +}
> > > +
> > > +/**
> > > + * Get memory region (MR) <-> memory pool (MP) association from txq-
> > >mp2mr[].
> > > + * Call mlx4_txq_add_mr() if MP is not registered yet.
> > > + *
> > > + * @param txq
> > > + *   Pointer to Tx queue structure.
> > > + * @param[in] mp
> > > + *   Memory pool for which a memory region lkey must be returned.
> > > + *
> > > + * @return
> > > + *   mr->lkey on success, (uint32_t)-1 on failure.
> > > + */
> > > +static __rte_always_inline uint32_t
> > 
> > Note __rte_always_inline is defined in rte_common.h and should be
> > explicitly included (however don't do that, see below).
> > 
> > > +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp) {
> > > +	unsigned int i;
> > > +
> > > +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > +			/* Unknown MP, add a new MR for it. */
> > > +			break;
> > > +		}
> > > +		if (txq->mp2mr[i].mp == mp) {
> > > +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > +			assert(txq->mp2mr[i].mr->lkey == txq-
> > >mp2mr[i].lkey);
> > 
> > assert() requires assert.h (but don't include it, see subsequent suggestion).
> > 
> > > +			return txq->mp2mr[i].lkey;
> > > +		}
> > > +	}
> > > +	return mlx4_txq_add_mr(txq, mp, i);
> > > +}
> > >  #endif /* MLX4_RXTX_H_ */
> > 
> > So as described above, these functions do not need the __rte_always_inline,
> > please remove it. They also do not need to be located in a header file; the
> > reason it's the case for their mlx5 counterparts is that they have to be shared
> > between vectorized/non-vectorized code. No such requirement here, you
> > should move them back to their original spot.
> > 
> 
> Static function mlx4_txq_mp2mr() must be in a header file because it is shared by 2 files: mlx4_txq.c and mlx4_rxtx.c.
> It is not related to vectorized/non-vectorized code in mlx5.
> Having said that -__rte_always_inline is required as well otherwise compilation fails with 
> drivers/net/mlx4/mlx4_rxtx.h:200:1: error: 'mlx4_txq_mp2mr' defined but not used [-Werror=unused-function]
> for files which include mlx4_rxtx.h

All right, then what you were looking for was static inline, not *force*
inline. The former is a hint; the latter doesn't leave much of a choice to
the compiler and means you are sure this way brings the most performance.
However, for this patch I really think inlining plays a minor part (if it
changes anything at all) compared to dividing this function, which is the
real performance improvement.
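
In other words, a minimal sketch of the difference (declarations only, not
part of the patch):

  /* A hint: the compiler remains free not to inline, and an unused static
   * inline function in a header does not trigger -Werror=unused-function. */
  static inline uint32_t
  mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);

  /* A constraint: __rte_always_inline (inline + the always_inline attribute,
   * from rte_common.h) forces inlining even where the compiler would
   * rather not. */
  static __rte_always_inline uint32_t
  mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);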

> > My suggestion for this performance improvement is to move
> > mlx4_txq_add_mr() to a different file, mlx4_mr.c looks like a good
> > candidate. This fact will ensure it's never inlined and far away from the data
> > path.
> > 
> 
> Function mlx4_txq_add_mr() is relatively small. 
> What do you say about preceding it with __attribute((noinline)) instead of creating a new file?

What I mean is you should define mlx4_txq_add_mr(), which does the heavy
lifting, inside mlx4_mr.c and provide its declaration in mlx4.h instead of
mlx4_rxtx.h.

Then, mlx4_txq_mp2mr() can remain defined in mlx4_rxtx.c in its original
spot as a non-static function with its public declaration remaining in
mlx4_rxtx.h for users outside of this file.

The fact mlx4_txq_mp2mr() remains defined in that file *before*
mlx4_post_send()'s definition where it's needed allows the compiler to
optimize it away as if it was static inline thanks to -O3, that is, unless
it thinks doing so would hurt performance, but as a (now) small function
this shouldn't be an issue.
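
In other words, a rough sketch of the proposed layout (same function names as
in the patch, assertions and error handling trimmed):

  /* mlx4.h: declaration only, the heavy registration lives in mlx4_mr.c. */
  uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i);

  /* mlx4_rxtx.c: non-static lookup, defined before mlx4_post_send() so that
   * -O3 can still inline it in the Tx path. */
  uint32_t
  mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
  {
          unsigned int i;

          for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
                  if (unlikely(txq->mp2mr[i].mp == NULL))
                          break; /* Unknown MP, register a new MR for it. */
                  if (txq->mp2mr[i].mp == mp)
                          return txq->mp2mr[i].lkey;
          }
          return mlx4_txq_add_mr(txq, mp, i);
  }

  /* mlx4_rxtx.h: public declaration for users outside mlx4_rxtx.c. */
  uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);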

Other reasons include that doing so would make a smaller diff that focuses
on the performance improvement itself. The extra performance brought by a
statically inlined version of mlx4_txq_mp2mr() is not needed in mlx4_txq.c,
whose only purpose is to set up queues.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 4/7] net/mlx4: merge Tx path functions
  2017-10-25  7:50         ` Nélio Laranjeiro
@ 2017-10-26 10:31           ` Matan Azrad
  2017-10-26 12:12             ` Nélio Laranjeiro
  0 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-26 10:31 UTC (permalink / raw)
  To: Nélio Laranjeiro, Ophir Munk
  Cc: Adrien Mazarguil, dev, Thomas Monjalon, Olga Shern, Mordechay Haimovsky

Hi Nelio

I think the memory barrier discussion is not relevant for this patch (if it becomes relevant I will create a new one).
Please see my comments inline.

Regarding this specific patch, I didn't see any comment from you. Do you agree with it?
 
> -----Original Message-----
> From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> Sent: Wednesday, October 25, 2017 10:50 AM
> To: Ophir Munk <ophirmu@mellanox.com>
> Cc: Adrien Mazarguil <adrien.mazarguil@6wind.com>; dev@dpdk.org;
> Thomas Monjalon <thomas@monjalon.net>; Olga Shern
> <olgas@mellanox.com>; Matan Azrad <matan@mellanox.com>
> Subject: Re: [dpdk-dev] [PATCH v2 4/7] net/mlx4: merge Tx path functions
> 
> On Tue, Oct 24, 2017 at 08:36:52PM +0000, Ophir Munk wrote:
> > Hi,
> >
> > On Tuesday, October 24, 2017 4:52 PM, Nélio Laranjeiro wrote:
> > >
> > > On Mon, Oct 23, 2017 at 02:21:57PM +0000, Ophir Munk wrote:
> > > > From: Matan Azrad <matan@mellanox.com>
> > > >
> > > > Merge tx_burst and mlx4_post_send functions to prevent double
> > > > asking about WQ remain space.
> > > >
> > > > This should improve performance.
> > > >
> > > > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > > > ---
> > > >  drivers/net/mlx4/mlx4_rxtx.c | 353
> > > > +++++++++++++++++++++----------------------
> > > >  1 file changed, 170 insertions(+), 183 deletions(-)
> > >
> > > What are the real expectation you have on the remaining patches of
> > > the series?
> > >
> > > According to the comment of this commit log "This should improve
> > > performance" there are too many barriers at each packet/segment
> > > level to improve something.
> > >
> > > The point is, mlx4_burst_tx() should write all the WQE without any
> > > barrier as it is processing a burst of packets (whereas Verbs
> > > functions which may only process a single packet).
> >
> > > The lonely barrier which should be present is the one to ensure that
> > > all the host memory is flushed before triggering the Tx doorbell.
> > >
> >
> > There is a known ConnectX-3 HW limitation: the first 4 bytes of every
> > TXWBB (64 bytes chunks) should be
> > written in a reversed order (from last TXWBB to first TXWBB).
> 
> This means the first WQE filled by the burst function is the doorbell.
> In such situation, the first four bytes of it can be written before
> leaving the burst function and after a write memory barrier.
> 
> Until this first WQE is not complete, the NIC won't start processing the
> packets.  Memory barriers per packets becomes useless.

I think this is not true, since the mlx4 HW can prefetch TXbbs further ahead if their first 4
bytes are valid, even while the first WQE is still not valid (please read the spec).

> 
> It gives something like:
> 
>  uint32_t tx_bb_db = 0;
>  void *first_wqe = NULL;
> 
>  /*
>   * Prepare all Packets by writing the WQEs without the 4 first bytes of
>   * the first WQE.
>   */
>  for () {
>  	if (!wqe) {
> 		first_wqe = wqe;
> 		tx_bb_db = foo;
> 	}
>  }
>  /* Leaving. */
>  rte_wmb();
>  *(uin32_t*)wqe = tx_bb_db;
>  return n;
>

I will take care to check if we can do 2 loops (see the sketch below):
Write all the last 60 B of each TXbb.
Memory barrier.
Write all the first 4 B of each TXbb.
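
A rough sketch of that two-loop idea (nb_txbb, sq and the write_txbb_*()
helpers are hypothetical, to be validated against the ConnectX-3 spec):

  unsigned int i;

  /* Pass 1: write everything but the first 4 bytes of each TXbb. */
  for (i = 0; i < nb_txbb; i++)
          write_txbb_tail(sq, i);      /* last 60 B of TXbb i */
  /* Make the tails globally visible before any TXbb is marked valid. */
  rte_wmb();
  /* Pass 2: write the first 4 bytes (ownership word), last TXbb first,
   * as required by the HW limitation described above. */
  for (i = nb_txbb; i-- > 0; )
          write_txbb_head(sq, i);      /* first 4 B of TXbb i */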

> > The last 60 bytes of any TXWBB can be written in any order (before
> > writing the first 4 bytes).
> > Is your last statement (using lonely barrier) is in accordance with
> > this limitation? Please explain.
> >
> > > There is also too many cases handled which are useless in bursts
> situation,
> > > this function needs to be re-written to its minimal use case i.e.
> processing a
> > > valid burst of packets/segments and triggering at the end of the burst the
> Tx
> > > doorbell.
> > >
> 
> Regards,
> 
> --
> Nélio Laranjeiro
> 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 4/7] net/mlx4: merge Tx path functions
  2017-10-26 10:31           ` Matan Azrad
@ 2017-10-26 12:12             ` Nélio Laranjeiro
  2017-10-26 12:30               ` Matan Azrad
  0 siblings, 1 reply; 84+ messages in thread
From: Nélio Laranjeiro @ 2017-10-26 12:12 UTC (permalink / raw)
  To: Matan Azrad
  Cc: Ophir Munk, Adrien Mazarguil, dev, Thomas Monjalon, Olga Shern,
	Mordechay Haimovsky

On Thu, Oct 26, 2017 at 10:31:06AM +0000, Matan Azrad wrote:
> Hi Nelio
> 
> I think the memory barrier discussion is not relevant for this patch
> (if it will be relevant I will create new one).
> Please see my comments inline.

It was not my only comment.  There is also useless code, like handling
null segments in the packets, which is not allowed in DPDK.

> Regarding this specific patch, I didn't see any comment from you, Are
> you agree with it? 
>  
> > -----Original Message-----
> > From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> > Sent: Wednesday, October 25, 2017 10:50 AM
> > To: Ophir Munk <ophirmu@mellanox.com>
> > Cc: Adrien Mazarguil <adrien.mazarguil@6wind.com>; dev@dpdk.org;
> > Thomas Monjalon <thomas@monjalon.net>; Olga Shern
> > <olgas@mellanox.com>; Matan Azrad <matan@mellanox.com>
> > Subject: Re: [dpdk-dev] [PATCH v2 4/7] net/mlx4: merge Tx path functions
> > 
> > On Tue, Oct 24, 2017 at 08:36:52PM +0000, Ophir Munk wrote:
> > > Hi,
> > >
> > > On Tuesday, October 24, 2017 4:52 PM, Nélio Laranjeiro wrote:
> > > >
> > > > On Mon, Oct 23, 2017 at 02:21:57PM +0000, Ophir Munk wrote:
> > > > > From: Matan Azrad <matan@mellanox.com>
> > > > >
> > > > > Merge tx_burst and mlx4_post_send functions to prevent double
> > > > > asking about WQ remain space.
> > > > >
> > > > > This should improve performance.
> > > > >
> > > > > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > > > > ---
> > > > >  drivers/net/mlx4/mlx4_rxtx.c | 353
> > > > > +++++++++++++++++++++----------------------
> > > > >  1 file changed, 170 insertions(+), 183 deletions(-)
> > > >
> > > > What are the real expectation you have on the remaining patches of
> > > > the series?
> > > >
> > > > According to the comment of this commit log "This should improve
> > > > performance" there are too many barriers at each packet/segment
> > > > level to improve something.
> > > >
> > > > The point is, mlx4_burst_tx() should write all the WQE without any
> > > > barrier as it is processing a burst of packets (whereas Verbs
> > > > functions which may only process a single packet).
> > >
> > > > The lonely barrier which should be present is the one to ensure that
> > > > all the host memory is flushed before triggering the Tx doorbell.
> > > >
> > >
> > > There is a known ConnectX-3 HW limitation: the first 4 bytes of every
> > > TXWBB (64 bytes chunks) should be
> > > written in a reversed order (from last TXWBB to first TXWBB).
> > 
> > This means the first WQE filled by the burst function is the doorbell.
> > In such situation, the first four bytes of it can be written before
> > leaving the burst function and after a write memory barrier.
> > 
> > Until this first WQE is not complete, the NIC won't start processing the
> > packets.  Memory barriers per packets becomes useless.
> 
> I think this is not true, Since mlx4 HW can prefetch advanced TXbbs if their first 4
> bytes are valid in spite of the first WQE is still not valid (please read the spec).

A compiler barrier is enough on x86 to forbid re-ordering of the stores;
on ARM you need a memory barrier. There is a macro in DPDK
for that, rte_io_wmb().

Before triggering the doorbell you must flush the cache; this is the only
place where rte_wmb() should be used.
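
Put differently, a minimal sketch (doorbell address and value are placeholder
names):

  /* Between WQE stores: compiler barrier on x86, memory barrier on ARM. */
  rte_io_wmb();
  /* ... */
  /* Single flush of all WQE writes, then ring the doorbell once per burst. */
  rte_wmb();
  *(volatile uint32_t *)doorbell_addr = doorbell_value;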

> > It gives something like:
> > 
> >  uint32_t tx_bb_db = 0;
> >  void *first_wqe = NULL;
> > 
> >  /*
> >   * Prepare all Packets by writing the WQEs without the 4 first bytes of
> >   * the first WQE.
> >   */
> >  for () {
> >  	if (!wqe) {
> > 		first_wqe = wqe;
> > 		tx_bb_db = foo;
> > 	}
> >  }
> >  /* Leaving. */
> >  rte_wmb();
> >  *(uin32_t*)wqe = tx_bb_db;
> >  return n;
> >
> 
> I will take care to check if we can do 2 loops:
> Write all  last 60B per TXbb.
> Memory barrier.
> Write all first 4B per TXbbs.
> 
> > > The last 60 bytes of any TXWBB can be written in any order (before
> > > writing the first 4 bytes).
> > > Is your last statement (using lonely barrier) is in accordance with
> > > this limitation? Please explain.
> > >
> > > > There is also too many cases handled which are useless in bursts
> > situation,
> > > > this function needs to be re-written to its minimal use case i.e.
> > processing a
> > > > valid burst of packets/segments and triggering at the end of the burst the
> > Tx
> > > > doorbell.
> > > >
> > 
> > Regards,
> > 
> > --
> > Nélio Laranjeiro
> > 6WIND

Regards,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 4/7] net/mlx4: merge Tx path functions
  2017-10-26 12:12             ` Nélio Laranjeiro
@ 2017-10-26 12:30               ` Matan Azrad
  2017-10-26 13:44                 ` Nélio Laranjeiro
  0 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-26 12:30 UTC (permalink / raw)
  To: Nélio Laranjeiro
  Cc: Ophir Munk, Adrien Mazarguil, dev, Thomas Monjalon, Olga Shern,
	Mordechay Haimovsky

Hi Nelio
Please see my comments below (3).


> -----Original Message-----
> From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> Sent: Thursday, October 26, 2017 3:12 PM
> To: Matan Azrad <matan@mellanox.com>
> Cc: Ophir Munk <ophirmu@mellanox.com>; Adrien Mazarguil
> <adrien.mazarguil@6wind.com>; dev@dpdk.org; Thomas Monjalon
> <thomas@monjalon.net>; Olga Shern <olgas@mellanox.com>; Mordechay
> Haimovsky <motih@mellanox.com>
> Subject: Re: [dpdk-dev] [PATCH v2 4/7] net/mlx4: merge Tx path functions
> 
> On Thu, Oct 26, 2017 at 10:31:06AM +0000, Matan Azrad wrote:
> > Hi Nelio
> >
> > I think the memory barrier discussion is not relevant for this patch
> > (if it will be relevant I will create new one).
> > Please see my comments inline.
> 
> It was not my single comment.  There is also useless code like having null
> segments in the packets which is not allowed on DPDK.

Sorry, but I can't find these comments in the previous mails.
Moreover, this comment (the first time I see it) is not relevant to this patch and asks about something else.
All this patch does is merge 2 functions to prevent asking twice
about the remaining WQ space...
Removing memory/compiler barriers or dealing with null segments is not in the scope here.

> 
> > Regarding this specific patch, I didn't see any comment from you, Are
> > you agree with it?
> >
> > > -----Original Message-----
> > > From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> > > Sent: Wednesday, October 25, 2017 10:50 AM
> > > To: Ophir Munk <ophirmu@mellanox.com>
> > > Cc: Adrien Mazarguil <adrien.mazarguil@6wind.com>; dev@dpdk.org;
> > > Thomas Monjalon <thomas@monjalon.net>; Olga Shern
> > > <olgas@mellanox.com>; Matan Azrad <matan@mellanox.com>
> > > Subject: Re: [dpdk-dev] [PATCH v2 4/7] net/mlx4: merge Tx path
> > > functions
> > >
> > > On Tue, Oct 24, 2017 at 08:36:52PM +0000, Ophir Munk wrote:
> > > > Hi,
> > > >
> > > > On Tuesday, October 24, 2017 4:52 PM, Nélio Laranjeiro wrote:
> > > > >
> > > > > On Mon, Oct 23, 2017 at 02:21:57PM +0000, Ophir Munk wrote:
> > > > > > From: Matan Azrad <matan@mellanox.com>
> > > > > >
> > > > > > Merge tx_burst and mlx4_post_send functions to prevent double
> > > > > > asking about WQ remain space.
> > > > > >
> > > > > > This should improve performance.
> > > > > >
> > > > > > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > > > > > ---
> > > > > >  drivers/net/mlx4/mlx4_rxtx.c | 353
> > > > > > +++++++++++++++++++++----------------------
> > > > > >  1 file changed, 170 insertions(+), 183 deletions(-)
> > > > >
> > > > > What are the real expectation you have on the remaining patches
> > > > > of the series?
> > > > >
> > > > > According to the comment of this commit log "This should improve
> > > > > performance" there are too many barriers at each packet/segment
> > > > > level to improve something.
> > > > >
> > > > > The point is, mlx4_burst_tx() should write all the WQE without
> > > > > any barrier as it is processing a burst of packets (whereas
> > > > > Verbs functions which may only process a single packet).
> > > >
> > > > > The lonely barrier which should be present is the one to ensure
> > > > > that all the host memory is flushed before triggering the Tx doorbell.
> > > > >
> > > >
> > > > There is a known ConnectX-3 HW limitation: the first 4 bytes of
> > > > every TXWBB (64 bytes chunks) should be written in a reversed
> > > > order (from last TXWBB to first TXWBB).
> > >
> > > This means the first WQE filled by the burst function is the doorbell.
> > > In such situation, the first four bytes of it can be written before
> > > leaving the burst function and after a write memory barrier.
> > >
> > > Until this first WQE is not complete, the NIC won't start processing
> > > the packets.  Memory barriers per packets becomes useless.
> >
> > I think this is not true, Since mlx4 HW can prefetch advanced TXbbs if
> > their first 4 bytes are valid in spite of the first WQE is still not valid (please
> read the spec).
> 
> A compiler barrier is enough on x86 to forbid the CPU to re-order the
> instructions, on arm you need a memory barrier, there is a macro in DPDK for
> that, rte_io_wmb().
> 

We are also using a compiler barrier here.

> Before triggering the doorbell you must flush the case, this is the only place
> where the rte_wmb() should be used.
> 

We are also using the memory barrier only for this reason.

> > > It gives something like:
> > >
> > >  uint32_t tx_bb_db = 0;
> > >  void *first_wqe = NULL;
> > >
> > >  /*
> > >   * Prepare all Packets by writing the WQEs without the 4 first bytes of
> > >   * the first WQE.
> > >   */
> > >  for () {
> > >  	if (!wqe) {
> > > 		first_wqe = wqe;
> > > 		tx_bb_db = foo;
> > > 	}
> > >  }
> > >  /* Leaving. */
> > >  rte_wmb();
> > >  *(uin32_t*)wqe = tx_bb_db;
> > >  return n;
> > >
> >
> > I will take care to check if we can do 2 loops:
> > Write all  last 60B per TXbb.
> > Memory barrier.
> > Write all first 4B per TXbbs.
> >
> > > > The last 60 bytes of any TXWBB can be written in any order (before
> > > > writing the first 4 bytes).
> > > > Is your last statement (using lonely barrier) is in accordance
> > > > with this limitation? Please explain.
> > > >
> > > > > There is also too many cases handled which are useless in bursts
> > > situation,
> > > > > this function needs to be re-written to its minimal use case i.e.
> > > processing a
> > > > > valid burst of packets/segments and triggering at the end of the
> > > > > burst the
> > > Tx
> > > > > doorbell.
> > > > >
> > >
> > > Regards,
> > >
> > > --
> > > Nélio Laranjeiro
> > > 6WIND
> 
> Regards,
> 
> --
> Nélio Laranjeiro
> 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 4/7] net/mlx4: merge Tx path functions
  2017-10-26 12:30               ` Matan Azrad
@ 2017-10-26 13:44                 ` Nélio Laranjeiro
  2017-10-26 16:21                   ` Matan Azrad
  0 siblings, 1 reply; 84+ messages in thread
From: Nélio Laranjeiro @ 2017-10-26 13:44 UTC (permalink / raw)
  To: Matan Azrad
  Cc: Ophir Munk, Adrien Mazarguil, dev, Thomas Monjalon, Olga Shern,
	Mordechay Haimovsky

On Thu, Oct 26, 2017 at 12:30:54PM +0000, Matan Azrad wrote:
> Hi Nelio
> Please see my comments below (3).
> 
> 
> > -----Original Message-----
> > From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> > Sent: Thursday, October 26, 2017 3:12 PM
> > To: Matan Azrad <matan@mellanox.com>
> > Cc: Ophir Munk <ophirmu@mellanox.com>; Adrien Mazarguil
> > <adrien.mazarguil@6wind.com>; dev@dpdk.org; Thomas Monjalon
> > <thomas@monjalon.net>; Olga Shern <olgas@mellanox.com>; Mordechay
> > Haimovsky <motih@mellanox.com>
> > Subject: Re: [dpdk-dev] [PATCH v2 4/7] net/mlx4: merge Tx path functions
> > 
> > On Thu, Oct 26, 2017 at 10:31:06AM +0000, Matan Azrad wrote:
> > > Hi Nelio
> > >
> > > I think the memory barrier discussion is not relevant for this patch
> > > (if it will be relevant I will create new one).
> > > Please see my comments inline.
> > 
> > It was not my single comment.  There is also useless code like having null
> > segments in the packets which is not allowed on DPDK.
> 
> Sorry, but I can't find comments in the previous mails.

You should search in the series,

> Moreover  this comment(first time I see it) is not relevant to this patch and asking something else.
> All what this patch does is to merge 2 functions to prevent double
> asking about WQ remain space...

Again in the series itself.

The point is, this series embeds 7 patches for "performance improvement",
whereas the single improvement is avoiding a call to an outside function by
copy/pasting it into the PMD.
In fact it will save a few cycles, but this improvement could have been
much greater if it were not a bare copy/paste.

The real question is: what is the improvement?  If the improvement is
significant, it is worth having this series; otherwise it is not, as it
may also bring some bugs which may be resolved in their original source
whereas this copy will keep them.

> Remove memory\compiler barriers or dealing with null segments are not in the scope here. 
> 
> > 
> > > Regarding this specific patch, I didn't see any comment from you, Are
> > > you agree with it?
> > >
> > > > -----Original Message-----
> > > > From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> > > > Sent: Wednesday, October 25, 2017 10:50 AM
> > > > To: Ophir Munk <ophirmu@mellanox.com>
> > > > Cc: Adrien Mazarguil <adrien.mazarguil@6wind.com>; dev@dpdk.org;
> > > > Thomas Monjalon <thomas@monjalon.net>; Olga Shern
> > > > <olgas@mellanox.com>; Matan Azrad <matan@mellanox.com>
> > > > Subject: Re: [dpdk-dev] [PATCH v2 4/7] net/mlx4: merge Tx path
> > > > functions
> > > >
> > > > On Tue, Oct 24, 2017 at 08:36:52PM +0000, Ophir Munk wrote:
> > > > > Hi,
> > > > >
> > > > > On Tuesday, October 24, 2017 4:52 PM, Nélio Laranjeiro wrote:
> > > > > >
> > > > > > On Mon, Oct 23, 2017 at 02:21:57PM +0000, Ophir Munk wrote:
> > > > > > > From: Matan Azrad <matan@mellanox.com>
> > > > > > >
> > > > > > > Merge tx_burst and mlx4_post_send functions to prevent double
> > > > > > > asking about WQ remain space.
> > > > > > >
> > > > > > > This should improve performance.
> > > > > > >
> > > > > > > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > > > > > > ---
> > > > > > >  drivers/net/mlx4/mlx4_rxtx.c | 353
> > > > > > > +++++++++++++++++++++----------------------
> > > > > > >  1 file changed, 170 insertions(+), 183 deletions(-)
> > > > > >
> > > > > > What are the real expectation you have on the remaining patches
> > > > > > of the series?
> > > > > >
> > > > > > According to the comment of this commit log "This should improve
> > > > > > performance" there are too many barriers at each packet/segment
> > > > > > level to improve something.
> > > > > >
> > > > > > The point is, mlx4_burst_tx() should write all the WQE without
> > > > > > any barrier as it is processing a burst of packets (whereas
> > > > > > Verbs functions which may only process a single packet).
> > > > >
> > > > > > The lonely barrier which should be present is the one to ensure
> > > > > > that all the host memory is flushed before triggering the Tx doorbell.
> > > > > >
> > > > >
> > > > > There is a known ConnectX-3 HW limitation: the first 4 bytes of
> > > > > every TXWBB (64 bytes chunks) should be written in a reversed
> > > > > order (from last TXWBB to first TXWBB).
> > > >
> > > > This means the first WQE filled by the burst function is the doorbell.
> > > > In such situation, the first four bytes of it can be written before
> > > > leaving the burst function and after a write memory barrier.
> > > >
> > > > Until this first WQE is not complete, the NIC won't start processing
> > > > the packets.  Memory barriers per packets becomes useless.
> > >
> > > I think this is not true, Since mlx4 HW can prefetch advanced TXbbs if
> > > their first 4 bytes are valid in spite of the first WQE is still not valid (please
> > read the spec).
> > 
> > A compiler barrier is enough on x86 to forbid the CPU to re-order the
> > instructions, on arm you need a memory barrier, there is a macro in DPDK for
> > that, rte_io_wmb().
> > 
> We are also using compiler barrier here.
> 
> > Before triggering the doorbell you must flush the case, this is the only place
> > where the rte_wmb() should be used.
> > 
> 
> We are also using memory barrier only for this reason.
> 
> > > > It gives something like:
> > > >
> > > >  uint32_t tx_bb_db = 0;
> > > >  void *first_wqe = NULL;
> > > >
> > > >  /*
> > > >   * Prepare all Packets by writing the WQEs without the 4 first bytes of
> > > >   * the first WQE.
> > > >   */
> > > >  for () {
> > > >  	if (!wqe) {
> > > > 		first_wqe = wqe;
> > > > 		tx_bb_db = foo;
> > > > 	}
> > > >  }
> > > >  /* Leaving. */
> > > >  rte_wmb();
> > > >  *(uin32_t*)wqe = tx_bb_db;
> > > >  return n;
> > > >
> > >
> > > I will take care to check if we can do 2 loops:
> > > Write all  last 60B per TXbb.
> > > Memory barrier.
> > > Write all first 4B per TXbbs.
> > >
> > > > > The last 60 bytes of any TXWBB can be written in any order (before
> > > > > writing the first 4 bytes).
> > > > > Is your last statement (using lonely barrier) is in accordance
> > > > > with this limitation? Please explain.
> > > > >
> > > > > > There is also too many cases handled which are useless in bursts
> > > > situation,
> > > > > > this function needs to be re-written to its minimal use case i.e.
> > > > processing a
> > > > > > valid burst of packets/segments and triggering at the end of the
> > > > > > burst the
> > > > Tx
> > > > > > doorbell.
> > > > > >
> > > >
> > > > Regards,
> > > >
> > > > --
> > > > Nélio Laranjeiro
> > > > 6WIND
> > 
> > Regards,
> > 
> > --
> > Nélio Laranjeiro
> > 6WIND

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 2/7] net/mlx4: inline more Tx functions
  2017-10-26  7:48         ` Adrien Mazarguil
@ 2017-10-26 14:27           ` Ophir Munk
  2017-10-29 19:30             ` Ophir Munk
  0 siblings, 1 reply; 84+ messages in thread
From: Ophir Munk @ 2017-10-26 14:27 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

Hi,
Please see inside

On Thursday, October 26, 2017 10:49 AM Adrien Mazarguil wrote:
> To: Ophir Munk <ophirmu@mellanox.com>
> Cc: dev@dpdk.org; Thomas Monjalon <thomas@monjalon.net>; Olga Shern
> <olgas@mellanox.com>; Matan Azrad <matan@mellanox.com>
> Subject: Re: [PATCH v2 2/7] net/mlx4: inline more Tx functions
> 
> Hi Ophir,
> 
> Please see below.
> 
> On Wed, Oct 25, 2017 at 09:42:46PM +0000, Ophir Munk wrote:
> > Hi Adrien,
> >
> > On Wednesday, October 25, 2017 7:50 PM, Adrien Mazarguil wrote:
> > >
> > > Hi Ophir,
> > >
> > > On Mon, Oct 23, 2017 at 02:21:55PM +0000, Ophir Munk wrote:
> > > > Change functions to inline on Tx fast path to improve performance
> > > >
> > > > Inside the inline function call other functions to handle "unlikely"
> > > > cases such that the inline function code footprint is small.
> > > >
> > > > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> > >
> > > Reading this, it's like adding __rte_always_inline improves
> > > performance at all, which I doubt unless you can show proof through
> performance results.
> > >
> > > When in doubt, leave it to the compiler, the static keyword is
> > > usually enough of a hint. Too much forced inlining may actually be
> harmful.
> > >
> > > What this patch really does is splitting the heavy
> > > lookup/registration function in two halves with one small static
> > > inline function for the lookup part that calls the separate
> > > registration part in the unlikely event MR is not already registered.
> > >
> > > Thankfully the compiler doesn't inline the large registration
> > > function back, which results in the perceived performance
> > > improvement for the time being, however there is no guarantee it
> > > won't happen in the future (you didn't use the noinline keyword on the
> registration function for that).
> > >
> > > Therefore I have a bunch of comments and suggestions, see below.
> > >
> > > > ---
> > > >  drivers/net/mlx4/mlx4_rxtx.c | 43
> > > > ++++++------------------------------
> > > >  drivers/net/mlx4/mlx4_rxtx.h | 52
> > > > +++++++++++++++++++++++++++++++++++++++++++-
> > > >  2 files changed, 58 insertions(+), 37 deletions(-)
> > > >
> > > > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > > > b/drivers/net/mlx4/mlx4_rxtx.c index 011ea79..ae37f9b 100644
> > > > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > > > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > > > @@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
> > > >  	return 0;
> > > >  }
> > > >
> > > > -/**
> > > > - * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool
> > > > from which
> > > > - * the cloned mbuf is allocated is returned instead.
> > > > - *
> > > > - * @param buf
> > > > - *   Pointer to mbuf.
> > > > - *
> > > > - * @return
> > > > - *   Memory pool where data is located for given mbuf.
> > > > - */
> > > > -static struct rte_mempool *
> > > > -mlx4_txq_mb2mp(struct rte_mbuf *buf) -{
> > > > -	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > > -		return rte_mbuf_from_indirect(buf)->pool;
> > > > -	return buf->pool;
> > > > -}
> > > >
> > > >  /**
> > > > - * Get memory region (MR) <-> memory pool (MP) association from
> > > >txq- mp2mr[].
> > > > - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[]
> > > >is  full,
> > > > - * remove an entry first.
> > > > + * Add memory region (MR) <-> memory pool (MP) association to
> > > > + txq-
> > > >mp2mr[].
> > > > + * If mp2mr[] is full, remove an entry first.
> > > >   *
> > > >   * @param txq
> > > >   *   Pointer to Tx queue structure.
> > > >   * @param[in] mp
> > > > - *   Memory pool for which a memory region lkey must be returned.
> > > > + *   Memory pool for which a memory region lkey must be added
> > > > + * @param[in] i
> > > > + *   Index in memory pool (MP) where to add memory region (MR)
> > > >   *
> > > >   * @return
> > > > - *   mr->lkey on success, (uint32_t)-1 on failure.
> > > > + *   Added mr->lkey on success, (uint32_t)-1 on failure.
> > > >   */
> > > > -uint32_t
> > > > -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> > > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > > > +uint32_t i)
> > > >  {
> > > > -	unsigned int i;
> > > >  	struct ibv_mr *mr;
> > > >
> > > > -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > > -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > > -			/* Unknown MP, add a new MR for it. */
> > > > -			break;
> > > > -		}
> > > > -		if (txq->mp2mr[i].mp == mp) {
> > > > -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > > -			assert(txq->mp2mr[i].mr->lkey == txq-
> > > >mp2mr[i].lkey);
> > > > -			return txq->mp2mr[i].lkey;
> > > > -		}
> > > > -	}
> > > >  	/* Add a new entry, register MR first. */
> > > >  	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> > > >  	      (void *)txq, mp->name, (void *)mp); diff --git
> > > >a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h index
> > > > e10bbca..719ef45 100644
> > > > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > > > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > > > @@ -53,6 +53,7 @@
> > > >
> > > >  #include "mlx4.h"
> > > >  #include "mlx4_prm.h"
> > > > +#include "mlx4_utils.h"
> > >
> > > Why?
> > >
> > > >
> > > >  /** Rx queue counters. */
> > > >  struct mlx4_rxq_stats {
> > > > @@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void *dpdk_rxq);
> > > >
> > > >  /* mlx4_rxtx.c */
> > > >
> > > > -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
> > > > uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
> > > >  		       uint16_t pkts_n);
> > > >  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, @@
> > > > -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq,
> > > struct rte_mbuf **pkts,
> > > >  			       uint16_t pkts_n);
> > > >  uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf
> **pkts,
> > > >  			       uint16_t pkts_n);
> > > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > > > +				unsigned int i);
> > > >
> > > >  /* mlx4_txq.c */
> > > >
> > > > @@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev
> > > *dev, uint16_t idx,
> > > >  			const struct rte_eth_txconf *conf);  void
> > > > mlx4_tx_queue_release(void *dpdk_txq);
> > > >
> > > > +/**
> > > > + * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool
> > > > +from which
> > > > + * the cloned mbuf is allocated is returned instead.
> > > > + *
> > > > + * @param buf
> > > > + *   Pointer to mbuf.
> > > > + *
> > > > + * @return
> > > > + *   Memory pool where data is located for given mbuf.
> > > > + */
> > > > +static __rte_always_inline struct rte_mempool *
> > > > +mlx4_txq_mb2mp(struct rte_mbuf *buf) {
> > > > +	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > > +		return rte_mbuf_from_indirect(buf)->pool;
> > > > +	return buf->pool;
> > > > +}
> > > > +
> > > > +/**
> > > > + * Get memory region (MR) <-> memory pool (MP) association from
> > > > +txq-
> > > >mp2mr[].
> > > > + * Call mlx4_txq_add_mr() if MP is not registered yet.
> > > > + *
> > > > + * @param txq
> > > > + *   Pointer to Tx queue structure.
> > > > + * @param[in] mp
> > > > + *   Memory pool for which a memory region lkey must be returned.
> > > > + *
> > > > + * @return
> > > > + *   mr->lkey on success, (uint32_t)-1 on failure.
> > > > + */
> > > > +static __rte_always_inline uint32_t
> > >
> > > Note __rte_always_inline is defined in rte_common.h and should be
> > > explicitly included (however don't do that, see below).
> > >
> > > > +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp) {
> > > > +	unsigned int i;
> > > > +
> > > > +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > > +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > > +			/* Unknown MP, add a new MR for it. */
> > > > +			break;
> > > > +		}
> > > > +		if (txq->mp2mr[i].mp == mp) {
> > > > +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > > +			assert(txq->mp2mr[i].mr->lkey == txq-
> > > >mp2mr[i].lkey);
> > >
> > > assert() requires assert.h (but don't include it, see subsequent
> suggestion).
> > >
> > > > +			return txq->mp2mr[i].lkey;
> > > > +		}
> > > > +	}
> > > > +	return mlx4_txq_add_mr(txq, mp, i); }
> > > >  #endif /* MLX4_RXTX_H_ */
> > >
> > > So as described above, these functions do not need the
> > > __rte_always_inline, please remove it. They also do not need to be
> > > located in a header file; the reason it's the case for their mlx5
> > > counterparts is that they have to be shared between
> > > vectorized/non-vectorized code. No such requirement here, you should
> move them back to their original spot.
> > >
> >
> > Static function mlx4_txq_mp2mr() must be in a header file because it is
> shared by 2 files: mlx4_txq.c and mlx4_rxtx.c.
> > It is not related to vectorized/non-vectorized code in mlx5.
> > Having said that -__rte_always_inline is required as well otherwise
> > compilation fails with
> > drivers/net/mlx4/mlx4_rxtx.h:200:1: error: 'mlx4_txq_mp2mr' defined
> > but not used [-Werror=unused-function] for files which include
> > mlx4_rxtx.h
> 
> All right, then what you were looking or was static inline, not *force* inline.
> The former is a hint, the latter doesn't leave much of a choice to the
> compiler, it means you're sure this way brings the most performance,
> however for this patch I really think inlining plays a really minor part (even
> changes anything at all) compared to dividing this function, which is the real
> performance improvement.

Without inline I get a ~0.2 Mpps degradation on my setup, therefore I suggest keeping the inline.
The final call is yours. Please let me know whether to keep the inline or remove it.

> 
> > > My suggestion for this performance improvement is to move
> > > mlx4_txq_add_mr() to a different file, mlx4_mr.c looks like a good
> > > candidate. This fact will ensure it's never inlined and far away
> > > from the data path.
> > >
> >
> > Function mlx4_txq_add_mr() is relatively small.
> > What do you say about preceding it with __attribute((noinline)) instead of
> creating a new file?
> 
> What I mean is you should declare mlx4_txq_add_mr() which does the heavy
> lifting inside mlx4_mr.c and provide its definition in mlx4.h instead of
> mlx4_rxtx.h.
> 
> Then, mlx4_txq_mp2mr() can remain defined in mlx4_rxtx.c in its original
> spot as a non-static function with its public declaration remaining in
> mlx4_rxtx.h for users outside of this file.
> 

That has already been done: mlx4_txq_mp2mr() is defined in mlx4_rxtx.c as described.

> The fact mlx4_txq_mp2mr() remains defined in that file *before*
> mlx4_post_send()'s definition where it's needed allows the compiler to
> optimize it away as if it was static inline thanks to -O3, that is, unless it thinks
> doing so would hurt performance, but as a (now) small function this
> shouldn't be an issue.
> 
> Other reasons includes that doing so would make a smaller diff that focuses
> on the performance improvement itself. The extra performance brought by a
> statically inlined version of mlx4_txq_mp2mr() is not needed in mlx4_txq.c,
> whose only purpose is to set up queues.

I have moved mlx4_txq_add_mr() to mlx4_mr.c. One of the disadvantages of moving it is that
it now requires adding #include "mlx4_rxtx.h" in that C file in order to know the layout of the txq struct.
I liked mlx4_mr.c for being decoupled from any Tx/Rx specific structures and handling only MR issues.

> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 4/7] net/mlx4: merge Tx path functions
  2017-10-26 13:44                 ` Nélio Laranjeiro
@ 2017-10-26 16:21                   ` Matan Azrad
  0 siblings, 0 replies; 84+ messages in thread
From: Matan Azrad @ 2017-10-26 16:21 UTC (permalink / raw)
  To: Nélio Laranjeiro
  Cc: Ophir Munk, Adrien Mazarguil, dev, Thomas Monjalon, Olga Shern,
	Mordechay Haimovsky

Hi Nelio

> -----Original Message-----
> From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> Sent: Thursday, October 26, 2017 4:44 PM
> To: Matan Azrad <matan@mellanox.com>
> Cc: Ophir Munk <ophirmu@mellanox.com>; Adrien Mazarguil
> <adrien.mazarguil@6wind.com>; dev@dpdk.org; Thomas Monjalon
> <thomas@monjalon.net>; Olga Shern <olgas@mellanox.com>; Mordechay
> Haimovsky <motih@mellanox.com>
> Subject: Re: [dpdk-dev] [PATCH v2 4/7] net/mlx4: merge Tx path functions
> 
> On Thu, Oct 26, 2017 at 12:30:54PM +0000, Matan Azrad wrote:
> > Hi Nelio
> > Please see my comments below (3).
> >
> >
> > > -----Original Message-----
> > > From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> > > Sent: Thursday, October 26, 2017 3:12 PM
> > > To: Matan Azrad <matan@mellanox.com>
> > > Cc: Ophir Munk <ophirmu@mellanox.com>; Adrien Mazarguil
> > > <adrien.mazarguil@6wind.com>; dev@dpdk.org; Thomas Monjalon
> > > <thomas@monjalon.net>; Olga Shern <olgas@mellanox.com>;
> Mordechay
> > > Haimovsky <motih@mellanox.com>
> > > Subject: Re: [dpdk-dev] [PATCH v2 4/7] net/mlx4: merge Tx path
> > > functions
> > >
> > > On Thu, Oct 26, 2017 at 10:31:06AM +0000, Matan Azrad wrote:
> > > > Hi Nelio
> > > >
> > > > I think the memory barrier discussion is not relevant for this
> > > > patch (if it will be relevant I will create new one).
> > > > Please see my comments inline.
> > >
> > > It was not my single comment.  There is also useless code like
> > > having null segments in the packets which is not allowed on DPDK.
> >
> > Sorry, but I can't find comments in the previous mails.
> 
> You should search in the series,
> 
> > Moreover  this comment(first time I see it) is not relevant to this patch and
> asking something else.
> > All what this patch does is to merge 2 functions to prevent double
> > asking about WQ remain space...
> 
> Again in the series itself.
> 
> The point, this series embed 7 patches for "performance improvement",
> whereas the single improvement is avoiding to call an outside function by
> copy/pasting it into the PMD.
> In fact it will save few cycles, but this improvements could have been much
> more if the it was not a bare copy/paste.
> 

This simple merge improves performance by 0.2 Mpps in my setup.
If you have more improvements (other than reducing the if statements) regarding this merge, please suggest them.

> The real question is what is the improvement?  If the improvement is
> significant, it worse having this series, otherwise it does not as it may also
> bring some bugs which may be resolve from its original source whereas this
> one will remain.
> 

Each commit in this series improves performance significantly, and together they brought us to our target.

By the way, I think the series-wide discussion should take place on patch 0 :)

> > Remove memory\compiler barriers or dealing with null segments are not in
> the scope here.
> >
> > >
> > > > Regarding this specific patch, I didn't see any comment from you,
> > > > Are you agree with it?
> > > >
> > > > > -----Original Message-----
> > > > > From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> > > > > Sent: Wednesday, October 25, 2017 10:50 AM
> > > > > To: Ophir Munk <ophirmu@mellanox.com>
> > > > > Cc: Adrien Mazarguil <adrien.mazarguil@6wind.com>; dev@dpdk.org;
> > > > > Thomas Monjalon <thomas@monjalon.net>; Olga Shern
> > > > > <olgas@mellanox.com>; Matan Azrad <matan@mellanox.com>
> > > > > Subject: Re: [dpdk-dev] [PATCH v2 4/7] net/mlx4: merge Tx path
> > > > > functions
> > > > >
> > > > > On Tue, Oct 24, 2017 at 08:36:52PM +0000, Ophir Munk wrote:
> > > > > > Hi,
> > > > > >
> > > > > > On Tuesday, October 24, 2017 4:52 PM, Nélio Laranjeiro wrote:
> > > > > > >
> > > > > > > On Mon, Oct 23, 2017 at 02:21:57PM +0000, Ophir Munk wrote:
> > > > > > > > From: Matan Azrad <matan@mellanox.com>
> > > > > > > >
> > > > > > > > Merge tx_burst and mlx4_post_send functions to prevent
> > > > > > > > double asking about WQ remain space.
> > > > > > > >
> > > > > > > > This should improve performance.
> > > > > > > >
> > > > > > > > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > > > > > > > ---
> > > > > > > >  drivers/net/mlx4/mlx4_rxtx.c | 353
> > > > > > > > +++++++++++++++++++++----------------------
> > > > > > > >  1 file changed, 170 insertions(+), 183 deletions(-)
> > > > > > >
> > > > > > > What are the real expectation you have on the remaining
> > > > > > > patches of the series?
> > > > > > >
> > > > > > > According to the comment of this commit log "This should
> > > > > > > improve performance" there are too many barriers at each
> > > > > > > packet/segment level to improve something.
> > > > > > >
> > > > > > > The point is, mlx4_burst_tx() should write all the WQE
> > > > > > > without any barrier as it is processing a burst of packets
> > > > > > > (whereas Verbs functions which may only process a single
> packet).
> > > > > >
> > > > > > > The lonely barrier which should be present is the one to
> > > > > > > ensure that all the host memory is flushed before triggering the Tx
> doorbell.
> > > > > > >
> > > > > >
> > > > > > There is a known ConnectX-3 HW limitation: the first 4 bytes
> > > > > > of every TXWBB (64 bytes chunks) should be written in a
> > > > > > reversed order (from last TXWBB to first TXWBB).
> > > > >
> > > > > This means the first WQE filled by the burst function is the doorbell.
> > > > > In such situation, the first four bytes of it can be written
> > > > > before leaving the burst function and after a write memory barrier.
> > > > >
> > > > > Until this first WQE is not complete, the NIC won't start
> > > > > processing the packets.  Memory barriers per packets becomes
> useless.
> > > >
> > > > I think this is not true, Since mlx4 HW can prefetch advanced
> > > > TXbbs if their first 4 bytes are valid in spite of the first WQE
> > > > is still not valid (please
> > > read the spec).
> > >
> > > A compiler barrier is enough on x86 to forbid the CPU to re-order
> > > the instructions, on arm you need a memory barrier, there is a macro
> > > in DPDK for that, rte_io_wmb().
> > >
> > We are also using compiler barrier here.
> >
> > > Before triggering the doorbell you must flush the case, this is the
> > > only place where the rte_wmb() should be used.
> > >
> >
> > We are also using memory barrier only for this reason.
> >
> > > > > It gives something like:
> > > > >
> > > > >  uint32_t tx_bb_db = 0;
> > > > >  void *first_wqe = NULL;
> > > > >
> > > > >  /*
> > > > >   * Prepare all Packets by writing the WQEs without the 4 first bytes of
> > > > >   * the first WQE.
> > > > >   */
> > > > >  for () {
> > > > >  	if (!wqe) {
> > > > > 		first_wqe = wqe;
> > > > > 		tx_bb_db = foo;
> > > > > 	}
> > > > >  }
> > > > >  /* Leaving. */
> > > > >  rte_wmb();
> > > > >  *(uin32_t*)wqe = tx_bb_db;
> > > > >  return n;
> > > > >
> > > >
> > > > I will take care to check if we can do 2 loops:
> > > > Write all  last 60B per TXbb.
> > > > Memory barrier.
> > > > Write all first 4B per TXbbs.
> > > >
> > > > > > The last 60 bytes of any TXWBB can be written in any order
> > > > > > (before writing the first 4 bytes).
> > > > > > Is your last statement (using lonely barrier) is in accordance
> > > > > > with this limitation? Please explain.
> > > > > >
> > > > > > > There is also too many cases handled which are useless in
> > > > > > > bursts
> > > > > situation,
> > > > > > > this function needs to be re-written to its minimal use case i.e.
> > > > > processing a
> > > > > > > valid burst of packets/segments and triggering at the end of
> > > > > > > the burst the
> > > > > Tx
> > > > > > > doorbell.
> > > > > > >
> > > > >
> > > > > Regards,
> > > > >
> > > > > --
> > > > > Nélio Laranjeiro
> > > > > 6WIND
> > >
> > > Regards,
> > >
> > > --
> > > Nélio Laranjeiro
> > > 6WIND
> 
> --
> Nélio Laranjeiro
> 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 2/7] net/mlx4: inline more Tx functions
  2017-10-26 14:27           ` Ophir Munk
@ 2017-10-29 19:30             ` Ophir Munk
  0 siblings, 0 replies; 84+ messages in thread
From: Ophir Munk @ 2017-10-29 19:30 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

On Thursday, October 26, 2017 5:28 PM, Ophir Munk wrote:

> 
> Hi,
> Please see inside
> 
> On Thursday, October 26, 2017 10:49 AM Adrien Mazarguil wrote:
> > To: Ophir Munk <ophirmu@mellanox.com>
> > Cc: dev@dpdk.org; Thomas Monjalon <thomas@monjalon.net>; Olga
> Shern
> > <olgas@mellanox.com>; Matan Azrad <matan@mellanox.com>
> > Subject: Re: [PATCH v2 2/7] net/mlx4: inline more Tx functions
> >
> > Hi Ophir,
> >
> > Please see below.
> >
> > On Wed, Oct 25, 2017 at 09:42:46PM +0000, Ophir Munk wrote:
> > > Hi Adrien,
> > >
> > > On Wednesday, October 25, 2017 7:50 PM, Adrien Mazarguil wrote:
> > > >
> > > > Hi Ophir,
> > > >
> > > > On Mon, Oct 23, 2017 at 02:21:55PM +0000, Ophir Munk wrote:
> > > > > Change functions to inline on Tx fast path to improve
> > > > > performance
> > > > >
> > > > > Inside the inline function call other functions to handle "unlikely"
> > > > > cases such that the inline function code footprint is small.
> > > > >
> > > > > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> > > >
> > > > Reading this, it's like adding __rte_always_inline improves
> > > > performance at all, which I doubt unless you can show proof
> > > > through
> > performance results.
> > > >
> > > > When in doubt, leave it to the compiler, the static keyword is
> > > > usually enough of a hint. Too much forced inlining may actually be
> > harmful.
> > > >
> > > > What this patch really does is splitting the heavy
> > > > lookup/registration function in two halves with one small static
> > > > inline function for the lookup part that calls the separate
> > > > registration part in the unlikely event MR is not already registered.
> > > >
> > > > Thankfully the compiler doesn't inline the large registration
> > > > function back, which results in the perceived performance
> > > > improvement for the time being, however there is no guarantee it
> > > > won't happen in the future (you didn't use the noinline keyword on
> > > > the
> > registration function for that).
> > > >
> > > > Therefore I have a bunch of comments and suggestions, see below.
> > > >
> > > > > ---
> > > > >  drivers/net/mlx4/mlx4_rxtx.c | 43
> > > > > ++++++------------------------------
> > > > >  drivers/net/mlx4/mlx4_rxtx.h | 52
> > > > > +++++++++++++++++++++++++++++++++++++++++++-
> > > > >  2 files changed, 58 insertions(+), 37 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > > > > b/drivers/net/mlx4/mlx4_rxtx.c index 011ea79..ae37f9b 100644
> > > > > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > > > > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > > > > @@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
> > > > >  	return 0;
> > > > >  }
> > > > >
> > > > > -/**
> > > > > - * Get memory pool (MP) from mbuf. If mbuf is indirect, the
> > > > > pool from which
> > > > > - * the cloned mbuf is allocated is returned instead.
> > > > > - *
> > > > > - * @param buf
> > > > > - *   Pointer to mbuf.
> > > > > - *
> > > > > - * @return
> > > > > - *   Memory pool where data is located for given mbuf.
> > > > > - */
> > > > > -static struct rte_mempool *
> > > > > -mlx4_txq_mb2mp(struct rte_mbuf *buf) -{
> > > > > -	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > > > -		return rte_mbuf_from_indirect(buf)->pool;
> > > > > -	return buf->pool;
> > > > > -}
> > > > >
> > > > >  /**
> > > > > - * Get memory region (MR) <-> memory pool (MP) association from
> > > > >txq- mp2mr[].
> > > > > - * Add MP to txq->mp2mr[] if it's not registered yet. If
> > > > >mp2mr[] is  full,
> > > > > - * remove an entry first.
> > > > > + * Add memory region (MR) <-> memory pool (MP) association to
> > > > > + txq-
> > > > >mp2mr[].
> > > > > + * If mp2mr[] is full, remove an entry first.
> > > > >   *
> > > > >   * @param txq
> > > > >   *   Pointer to Tx queue structure.
> > > > >   * @param[in] mp
> > > > > - *   Memory pool for which a memory region lkey must be returned.
> > > > > + *   Memory pool for which a memory region lkey must be added
> > > > > + * @param[in] i
> > > > > + *   Index in memory pool (MP) where to add memory region (MR)
> > > > >   *
> > > > >   * @return
> > > > > - *   mr->lkey on success, (uint32_t)-1 on failure.
> > > > > + *   Added mr->lkey on success, (uint32_t)-1 on failure.
> > > > >   */
> > > > > -uint32_t
> > > > > -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> > > > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool
> > > > > +*mp, uint32_t i)
> > > > >  {
> > > > > -	unsigned int i;
> > > > >  	struct ibv_mr *mr;
> > > > >
> > > > > -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > > > -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > > > -			/* Unknown MP, add a new MR for it. */
> > > > > -			break;
> > > > > -		}
> > > > > -		if (txq->mp2mr[i].mp == mp) {
> > > > > -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > > > -			assert(txq->mp2mr[i].mr->lkey == txq-
> > > > >mp2mr[i].lkey);
> > > > > -			return txq->mp2mr[i].lkey;
> > > > > -		}
> > > > > -	}
> > > > >  	/* Add a new entry, register MR first. */
> > > > >  	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> > > > >  	      (void *)txq, mp->name, (void *)mp); diff --git
> > > > >a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> > > > >index
> > > > > e10bbca..719ef45 100644
> > > > > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > > > > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > > > > @@ -53,6 +53,7 @@
> > > > >
> > > > >  #include "mlx4.h"
> > > > >  #include "mlx4_prm.h"
> > > > > +#include "mlx4_utils.h"
> > > >
> > > > Why?
> > > >
> > > > >
> > > > >  /** Rx queue counters. */
> > > > >  struct mlx4_rxq_stats {
> > > > > @@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void
> *dpdk_rxq);
> > > > >
> > > > >  /* mlx4_rxtx.c */
> > > > >
> > > > > -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool
> > > > > *mp); uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
> > > > >  		       uint16_t pkts_n);
> > > > >  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
> > > > > @@
> > > > > -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq,
> > > > struct rte_mbuf **pkts,
> > > > >  			       uint16_t pkts_n);
> > > > >  uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf
> > **pkts,
> > > > >  			       uint16_t pkts_n);
> > > > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool
> *mp,
> > > > > +				unsigned int i);
> > > > >
> > > > >  /* mlx4_txq.c */
> > > > >
> > > > > @@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev
> > > > *dev, uint16_t idx,
> > > > >  			const struct rte_eth_txconf *conf);  void
> > > > > mlx4_tx_queue_release(void *dpdk_txq);
> > > > >
> > > > > +/**
> > > > > + * Get memory pool (MP) from mbuf. If mbuf is indirect, the
> > > > > +pool from which
> > > > > + * the cloned mbuf is allocated is returned instead.
> > > > > + *
> > > > > + * @param buf
> > > > > + *   Pointer to mbuf.
> > > > > + *
> > > > > + * @return
> > > > > + *   Memory pool where data is located for given mbuf.
> > > > > + */
> > > > > +static __rte_always_inline struct rte_mempool *
> > > > > +mlx4_txq_mb2mp(struct rte_mbuf *buf) {
> > > > > +	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > > > +		return rte_mbuf_from_indirect(buf)->pool;
> > > > > +	return buf->pool;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * Get memory region (MR) <-> memory pool (MP) association from
> > > > > +txq-
> > > > >mp2mr[].
> > > > > + * Call mlx4_txq_add_mr() if MP is not registered yet.
> > > > > + *
> > > > > + * @param txq
> > > > > + *   Pointer to Tx queue structure.
> > > > > + * @param[in] mp
> > > > > + *   Memory pool for which a memory region lkey must be returned.
> > > > > + *
> > > > > + * @return
> > > > > + *   mr->lkey on success, (uint32_t)-1 on failure.
> > > > > + */
> > > > > +static __rte_always_inline uint32_t
> > > >
> > > > Note __rte_always_inline is defined in rte_common.h and should be
> > > > explicitly included (however don't do that, see below).
> > > >
> > > > > +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp) {
> > > > > +	unsigned int i;
> > > > > +
> > > > > +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > > > +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > > > +			/* Unknown MP, add a new MR for it. */
> > > > > +			break;
> > > > > +		}
> > > > > +		if (txq->mp2mr[i].mp == mp) {
> > > > > +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > > > +			assert(txq->mp2mr[i].mr->lkey == txq-
> > > > >mp2mr[i].lkey);
> > > >
> > > > assert() requires assert.h (but don't include it, see subsequent
> > suggestion).
> > > >
> > > > > +			return txq->mp2mr[i].lkey;
> > > > > +		}
> > > > > +	}
> > > > > +	return mlx4_txq_add_mr(txq, mp, i); }
> > > > >  #endif /* MLX4_RXTX_H_ */
> > > >
> > > > So as described above, these functions do not need the
> > > > __rte_always_inline, please remove it. They also do not need to be
> > > > located in a header file; the reason it's the case for their mlx5
> > > > counterparts is that they have to be shared between
> > > > vectorized/non-vectorized code. No such requirement here, you
> > > > should
> > move them back to their original spot.
> > > >
> > >
> > > Static function mlx4_txq_mp2mr() must be in a header file because it
> > > is
> > shared by 2 files: mlx4_txq.c and mlx4_rxtx.c.
> > > It is not related to vectorized/non-vectorized code in mlx5.
> > > Having said that -__rte_always_inline is required as well otherwise
> > > compilation fails with
> > > drivers/net/mlx4/mlx4_rxtx.h:200:1: error: 'mlx4_txq_mp2mr' defined
> > > but not used [-Werror=unused-function] for files which include
> > > mlx4_rxtx.h
> >
> > All right, then what you were looking or was static inline, not *force*
> inline.
> > The former is a hint, the latter doesn't leave much of a choice to the
> > compiler, it means you're sure this way brings the most performance,
> > however for this patch I really think inlining plays a really minor
> > part (even changes anything at all) compared to dividing this
> > function, which is the real performance improvement.
> 
> Without inline I get ~0.2Mpps degradation on my setup, therefore I suggest
> keeping inline.
> The final call is yours. Please let me know whether to leave it inline or remove it.
> 

Until further indication, function mlx4_txq_mp2mr() is left static in the next v3
of this series; see the standalone sketch below for the unused-function issue.
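
A minimal standalone sketch (file and function names are made up, not taken
from this series) of the -Werror=unused-function behaviour discussed above: a
plain static function defined in a header warns in every translation unit that
never calls it, while a static inline one does not.

	/* example.h -- illustrative only */
	static int plain_lookup(int key) { return key * 2; }          /* warns if unused */
	static inline int inline_lookup(int key) { return key * 2; }  /* no warning */

	/* example.c -- build with: gcc -O3 -Wall -Werror=unused-function -c example.c */
	#include "example.h"

	int example_entry(void)
	{
		return inline_lookup(21); /* plain_lookup() is never called here */
	}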

> >
> > > > My suggestion for this performance improvement is to move
> > > > mlx4_txq_add_mr() to a different file, mlx4_mr.c looks like a good
> > > > candidate. This fact will ensure it's never inlined and far away
> > > > from the data path.
> > > >
> > >
> > > Function mlx4_txq_add_mr() is relatively small.
> > > What do you say about preceding it with __attribute((noinline))
> > > instead of
> > creating a new file?
> >
> > What I mean is you should declare mlx4_txq_add_mr() which does the
> > heavy lifting inside mlx4_mr.c and provide its definition in mlx4.h
> > instead of mlx4_rxtx.h.
> >
> > Then, mlx4_txq_mp2mr() can remain defined in mlx4_rxtx.c in its
> > original spot as a non-static function with its public declaration
> > remaining in mlx4_rxtx.h for users outside of this file.
> >
> 
> That's done before. mlx4_txq_mp2mr() is defined in mlx4_rxtx.c as
> described.
> 
> > The fact mlx4_txq_mp2mr() remains defined in that file *before*
> > mlx4_post_send()'s definition where it's needed allows the compiler to
> > optimize it away as if it was static inline thanks to -O3, that is,
> > unless it thinks doing so would hurt performance, but as a (now) small
> > function this shouldn't be an issue.
> >
> > Other reasons includes that doing so would make a smaller diff that
> > focuses on the performance improvement itself. The extra performance
> > brought by a statically inlined version of mlx4_txq_mp2mr() is not
> > needed in mlx4_txq.c, whose only purpose is to set up queues.
> 
> I have moved mlx4_txq_add_mr() to mlx4_mr.c. One of the disadvantages of
> moving it is that now it requires adding #include "mlx4_rxtx.h" in the C file in
> order to know the details of txq struct.
> I liked mlx4_mr.c for being encapsulated from any Tx/Rx specific structures
> and just handling MR issues.
> 
> >
> > --
> > Adrien Mazarguil
> > 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 7/7] net/mlx4: separate Tx for multi-segments
  2017-10-25 16:50     ` Adrien Mazarguil
@ 2017-10-30  8:15       ` Ophir Munk
  0 siblings, 0 replies; 84+ messages in thread
From: Ophir Munk @ 2017-10-30  8:15 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

Hi,
Please see inline.

On Wednesday, October 25, 2017 7:50 PM Adrien Mazarguil wrote:
> 
> Hi Ophir,
> 
> On Mon, Oct 23, 2017 at 02:22:00PM +0000, Ophir Munk wrote:
> > This commit optimizes handling of one segment and calls a dedicated
> > function for handling multi segments
> >
> > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> 
> While it indeed moves the code to a separate function I'm not sure by how
> much it improves performance.
> 
> Is it noticeably better, can you provide a short performance summary with
> and without this patch? Is that the case for both single and multi-segment
> scenarios, or was this improvement at the cost of a degradation in the latter
> case?
> 

In v3 this commit is squashed into the previous commit "net/mlx4: improve performance of one Tx segment",
as both commits represent one logical unit.
On Matan's setup the performance improvement with these two commits occurs for both single and multi-segment scenarios.
On my setup the performance improvement occurs for single-segment packets only.
With patch versus without patch (per-packet segment sizes in bytes):
64	  	+0.2 mpps
64,64		-0.2 mpps
64,64,64,64	-0.07 mpps

> If it splits a large function in two smaller ones for readability and no
> performance validation was done on this specific patch alone, please not
> label it as a performance improvement. I'm fine with readability
> improvements when properly identified as such.
> 

The performance improvement indication was removed from the commit message.

> A few additional comments below.
> 
> > ---
> >  drivers/net/mlx4/mlx4_rxtx.c | 284
> > +++++++++++++++++++++++--------------------
> >  1 file changed, 154 insertions(+), 130 deletions(-)
> >
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > b/drivers/net/mlx4/mlx4_rxtx.c index 3236552..9596859 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > @@ -62,6 +62,9 @@
> >  #include "mlx4_rxtx.h"
> >  #include "mlx4_utils.h"
> >
> > +#define WQE_ONE_DATA_SEG_SIZE \
> > +	(sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct
> > +mlx4_wqe_data_seg))
> > +
> >  /**
> >   * Pointer-value pair structure used in tx_post_send for saving the first
> >   * DWORD (32 byte) of a TXBB.
> > @@ -140,22 +143,19 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq,
> uint16_t index, uint8_t owner)
> >   * @return
> >   *   0 on success, -1 on failure.
> >   */
> > -static int
> > -mlx4_txq_complete(struct txq *txq)
> > +static inline int __attribute__((always_inline))
> 
> Should be static only, leave the rest to the compiler. This function is large
> enough that it shouldn't make much of a difference anyway (unless proved
> otherwise).
> 

Done.
__attribute__((always_inline)) was removed.

> > +mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
> > +				struct mlx4_sq *sq)
> >  {
> >  	unsigned int elts_comp = txq->elts_comp;
> >  	unsigned int elts_tail = txq->elts_tail;
> > -	const unsigned int elts_n = txq->elts_n;
> >  	struct mlx4_cq *cq = &txq->mcq;
> > -	struct mlx4_sq *sq = &txq->msq;
> >  	struct mlx4_cqe *cqe;
> >  	uint32_t cons_index = cq->cons_index;
> >  	uint16_t new_index;
> >  	uint16_t nr_txbbs = 0;
> >  	int pkts = 0;
> >
> > -	if (unlikely(elts_comp == 0))
> > -		return 0;
> >  	/*
> >  	 * Traverse over all CQ entries reported and handle each WQ entry
> >  	 * reported by them.
> > @@ -266,6 +266,120 @@ rte_be32_t mlx4_txq_add_mr(struct txq *txq,
> struct rte_mempool *mp, uint32_t i)
> >  	return txq->mp2mr[i].lkey;
> >  }
> >
> > +static int handle_multi_segs(struct rte_mbuf *buf,
> > +			    struct txq *txq,
> > +			    struct mlx4_wqe_ctrl_seg **pctrl) {
> > +	int wqe_real_size;
> > +	int nr_txbbs;
> > +	struct pv *pv = (struct pv *)txq->bounce_buf;
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
> > +	struct mlx4_wqe_ctrl_seg *ctrl;
> > +	struct mlx4_wqe_data_seg *dseg;
> > +	uintptr_t addr;
> > +	uint32_t byte_count;
> > +	int pv_counter = 0;
> > +
> > +	/* Calculate the needed work queue entry size for this packet. */
> > +	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> > +		buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> > +	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> > +	/*
> > +	 * Check that there is room for this WQE in the send queue and that
> > +	 * the WQE size is legal.
> > +	 */
> > +	if (((sq->head - sq->tail) + nr_txbbs +
> > +				sq->headroom_txbbs) >= sq->txbb_cnt ||
> > +			nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> > +		return -1;
> > +	}
> > +
> > +	/* Get the control and data entries of the WQE. */
> > +	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq,
> head_idx);
> > +	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> > +			sizeof(struct mlx4_wqe_ctrl_seg));
> > +	*pctrl = ctrl;
> > +	/* Fill the data segments with buffer information. */
> > +	struct rte_mbuf *sbuf;
> > +
> > +	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
> > +		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> > +		rte_prefetch0((volatile void *)addr);
> > +		/* Handle WQE wraparound. */
> > +		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
> > +			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
> > +		dseg->addr = rte_cpu_to_be_64(addr);
> > +		/* Memory region key (big endian) for this memory pool. */
> > +		dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
> #ifndef
> > +NDEBUG
> > +		/* Calculate the needed work queue entry size for this packet
> */
> > +		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
> > +			/* MR does not exist. */
> > +			DEBUG("%p: unable to get MP <-> MR association",
> > +					(void *)txq);
> > +			/*
> > +			 * Restamp entry in case of failure.
> > +			 * Make sure that size is written correctly
> > +			 * Note that we give ownership to the SW, not the
> HW.
> > +			 */
> > +			wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> > +				buf->nb_segs * sizeof(struct
> mlx4_wqe_data_seg);
> > +			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> > +			mlx4_txq_stamp_freed_wqe(sq, head_idx,
> > +					(sq->head & sq->txbb_cnt) ? 0 : 1);
> > +			return -1;
> > +		}
> > +#endif /* NDEBUG */
> > +		if (likely(sbuf->data_len)) {
> > +			byte_count = rte_cpu_to_be_32(sbuf->data_len);
> > +		} else {
> > +			/*
> > +			 * Zero length segment is treated as inline segment
> > +			 * with zero data.
> > +			 */
> > +			byte_count = RTE_BE32(0x80000000);
> > +		}
> > +		/*
> > +		 * If the data segment is not at the beginning of a
> > +		 * Tx basic block (TXBB) then write the byte count,
> > +		 * else postpone the writing to just before updating the
> > +		 * control segment.
> > +		 */
> > +		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> > +			/*
> > +			 * Need a barrier here before writing the byte_count
> > +			 * fields to make sure that all the data is visible
> > +			 * before the byte_count field is set.
> > +			 * Otherwise, if the segment begins a new cacheline,
> > +			 * the HCA prefetcher could grab the 64-byte chunk
> and
> > +			 * get a valid (!= 0xffffffff) byte count but stale
> > +			 * data, and end up sending the wrong data.
> > +			 */
> > +			rte_io_wmb();
> > +			dseg->byte_count = byte_count;
> > +		} else {
> > +			/*
> > +			 * This data segment starts at the beginning of a new
> > +			 * TXBB, so we need to postpone its byte_count
> writing
> > +			 * for later.
> > +			 */
> > +			pv[pv_counter].dseg = dseg;
> > +			pv[pv_counter++].val = byte_count;
> > +		}
> > +	}
> > +	/* Write the first DWORD of each TXBB save earlier. */
> > +	if (pv_counter) {
> > +		/* Need a barrier here before writing the byte_count. */
> > +		rte_io_wmb();
> > +		for (--pv_counter; pv_counter  >= 0; pv_counter--)
> > +			pv[pv_counter].dseg->byte_count =
> pv[pv_counter].val;
> > +	}
> > +	/* Fill the control parameters for this packet. */
> > +	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> > +
> > +	return nr_txbbs;
> > +}
> >  /**
> >   * DPDK callback for Tx.
> >   *
> > @@ -288,10 +402,11 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf
> **pkts, uint16_t pkts_n)
> >  	unsigned int i;
> >  	unsigned int max;
> >  	struct mlx4_sq *sq = &txq->msq;
> > -	struct pv *pv = (struct pv *)txq->bounce_buf;
> > +	int nr_txbbs;
> >
> >  	assert(txq->elts_comp_cd != 0);
> > -	mlx4_txq_complete(txq);
> > +	if (likely(txq->elts_comp != 0))
> > +		mlx4_txq_complete(txq, elts_n, sq);
> >  	max = (elts_n - (elts_head - txq->elts_tail));
> >  	if (max > elts_n)
> >  		max -= elts_n;
> > @@ -316,10 +431,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf
> **pkts, uint16_t pkts_n)
> >  		} srcrb;
> >  		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
> >  		uintptr_t addr;
> > -		uint32_t byte_count;
> > -		int wqe_real_size;
> > -		int nr_txbbs;
> > -		int pv_counter = 0;
> >
> >  		/* Clean up old buffer. */
> >  		if (likely(elt->buf != NULL)) {
> > @@ -338,31 +449,22 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf
> **pkts, uint16_t pkts_n)
> >  			} while (tmp != NULL);
> >  		}
> >  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> > -
> > -		/*
> > -		 * Calculate the needed work queue entry size
> > -		 * for this packet.
> > -		 */
> > -		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> > -				buf->nb_segs * sizeof(struct
> mlx4_wqe_data_seg);
> > -		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> > -		/*
> > -		 * Check that there is room for this WQE in the send
> > -		 * queue and that the WQE size is legal.
> > -		 */
> > -		if (((sq->head - sq->tail) + nr_txbbs +
> > -		     sq->headroom_txbbs) >= sq->txbb_cnt ||
> > -		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> > -			elt->buf = NULL;
> > -			break;
> > -		}
> > -		/* Get the control and data entries of the WQE. */
> > -		ctrl = (struct mlx4_wqe_ctrl_seg *)
> > -				mlx4_get_send_wqe(sq, head_idx);
> > -		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> > -				sizeof(struct mlx4_wqe_ctrl_seg));
> > -		/* Fill the data segments with buffer information. */
> >  		if (likely(buf->nb_segs == 1)) {
> > +			/*
> > +			 * Check that there is room for this WQE in the send
> > +			 * queue and that the WQE size is legal
> > +			 */
> > +			if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs)
> > +						>= sq->txbb_cnt ||
> > +						1 >
> MLX4_MAX_WQE_TXBBS) {
> > +				elt->buf = NULL;
> > +				break;
> > +			}
> > +			/* Get the control and data entries of the WQE. */
> > +			ctrl = (struct mlx4_wqe_ctrl_seg *)
> > +					mlx4_get_send_wqe(sq, head_idx);
> > +			dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> > +					sizeof(struct mlx4_wqe_ctrl_seg));
> >  			addr = rte_pktmbuf_mtod(buf, uintptr_t);
> >  			rte_prefetch0((volatile void *)addr);
> >  			/* Handle WQE wraparound. */
> > @@ -371,120 +473,42 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf
> **pkts, uint16_t pkts_n)
> >  				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
> >  			dseg->addr = rte_cpu_to_be_64(addr);
> >  			/* Memory region key (big endian). */
> > -			dseg->lkey = mlx4_txq_mp2mr(txq,
> mlx4_txq_mb2mp(sbuf));
> > -	#ifndef NDEBUG
> > +			dseg->lkey = mlx4_txq_mp2mr(txq,
> mlx4_txq_mb2mp(buf)); #ifndef
> > +NDEBUG
> >  			if (unlikely(dseg->lkey ==
> >  				rte_cpu_to_be_32((uint32_t)-1))) {
> >  				/* MR does not exist. */
> >  				DEBUG("%p: unable to get MP <-> MR
> association",
> > -				      (void *)txq);
> > +						(void *)txq);
> >  				/*
> >  				 * Restamp entry in case of failure.
> >  				 * Make sure that size is written correctly
> >  				 * Note that we give ownership to the SW,
> >  				 * not the HW.
> >  				 */
> > -				ctrl->fence_size = (wqe_real_size >> 4) &
> 0x3f;
> > +				ctrl->fence_size =
> (WQE_ONE_DATA_SEG_SIZE >> 4)
> > +							& 0x3f;
> >  				mlx4_txq_stamp_freed_wqe(sq, head_idx,
> > -					     (sq->head & sq->txbb_cnt) ? 0 : 1);
> > +					(sq->head & sq->txbb_cnt) ? 0 : 1);
> >  				elt->buf = NULL;
> >  				break;
> >  			}
> > -	#endif /* NDEBUG */
> > +#endif /* NDEBUG */
> >  			/* Need a barrier here before writing the
> byte_count. */
> >  			rte_io_wmb();
> >  			dseg->byte_count = rte_cpu_to_be_32(buf-
> >data_len);
> > +
> > +			/* Fill the control parameters for this packet. */
> > +			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4)
> & 0x3f;
> > +			nr_txbbs = 1;
> >  		} else {
> > -			/* Fill the data segments with buffer information. */
> > -			struct rte_mbuf *sbuf;
> > -
> > -			for (sbuf = buf;
> > -				 sbuf != NULL;
> > -				 sbuf = sbuf->next, dseg++) {
> > -				addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> > -				rte_prefetch0((volatile void *)addr);
> > -				/* Handle WQE wraparound. */
> > -				if (unlikely(dseg >=
> > -					(struct mlx4_wqe_data_seg *)sq-
> >eob))
> > -					dseg = (struct mlx4_wqe_data_seg *)
> > -							sq->buf;
> > -				dseg->addr = rte_cpu_to_be_64(addr);
> > -				/* Memory region key (big endian). */
> > -				dseg->lkey = mlx4_txq_mp2mr(txq,
> > -						mlx4_txq_mb2mp(sbuf));
> > -		#ifndef NDEBUG
> > -				if (unlikely(dseg->lkey ==
> > -					rte_cpu_to_be_32((uint32_t)-1))) {
> > -					/* MR does not exist. */
> > -					DEBUG("%p: unable to get MP <->
> MR association",
> > -						  (void *)txq);
> > -					/*
> > -					 * Restamp entry in case of failure.
> > -					 * Make sure that size is written
> > -					 * correctly, note that we give
> > -					 * ownership to the SW, not the HW.
> > -					 */
> > -					ctrl->fence_size =
> > -						(wqe_real_size >> 4) & 0x3f;
> > -					mlx4_txq_stamp_freed_wqe(sq,
> head_idx,
> > -					    (sq->head & sq->txbb_cnt) ? 0 : 1);
> > -					elt->buf = NULL;
> > -					break;
> > -				}
> > -		#endif /* NDEBUG */
> > -				if (likely(sbuf->data_len)) {
> > -					byte_count =
> > -					  rte_cpu_to_be_32(sbuf->data_len);
> > -				} else {
> > -					/*
> > -					 * Zero length segment is treated as
> > -					 * inline segment with zero data.
> > -					 */
> > -					byte_count =
> RTE_BE32(0x80000000);
> > -				}
> > -				/*
> > -				 * If the data segment is not at the beginning
> > -				 * of a Tx basic block (TXBB) then write the
> > -				 * byte count, else postpone the writing to
> > -				 * just before updating the control segment.
> > -				 */
> > -				if ((uintptr_t)dseg &
> > -					(uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> > -					/*
> > -					 * Need a barrier here before writing
> > -					 * the byte_count fields to make sure
> > -					 * that all the data is visible before
> > -					 * the byte_count field is set.
> > -					 * Otherwise, if the segment begins a
> > -					 * new cacheline, the HCA prefetcher
> > -					 * could grab the 64-byte chunk and
> get
> > -					 * a valid (!= 0xffffffff) byte count
> > -					 * but stale data, and end up sending
> > -					 * the wrong data.
> > -					 */
> > -					rte_io_wmb();
> > -					dseg->byte_count = byte_count;
> > -				} else {
> > -					/*
> > -					 * This data segment starts at the
> > -					 * beginning of a new TXBB, so we
> > -					 * need to postpone its byte_count
> > -					 * writing for later.
> > -					 */
> > -					pv[pv_counter].dseg = dseg;
> > -					pv[pv_counter++].val = byte_count;
> > -				}
> > +			nr_txbbs = handle_multi_segs(buf, txq, &ctrl);
> 
> Having all this part non-inline could degrade multi-segment performance, is
> that OK?

It can be inlined because it is defined as a static function in the same file, so the compiler is free to inline it. Performance is not degraded in this case.

> 
> > +			if (nr_txbbs < 0) {
> > +				elt->buf = NULL;
> > +				break;
> >  			}
> > -		/* Write the first DWORD of each TXBB save earlier. */
> > -		if (pv_counter) {
> > -			/* Need a barrier before writing the byte_count. */
> > -			rte_io_wmb();
> > -			for (--pv_counter; pv_counter  >= 0; pv_counter--)
> > -				pv[pv_counter].dseg->byte_count =
> > -						pv[pv_counter].val;
> >  		}
> > -		/* Fill the control parameters for this packet. */
> > -		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> > +
> >  		/*
> >  		 * For raw Ethernet, the SOLICIT flag is used to indicate
> >  		 * that no ICRC should be calculated.
> > --
> > 2.7.4
> >
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* [PATCH v3 0/7] Tx path improvements
  2017-10-23 14:21 ` [PATCH v2 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
                     ` (6 preceding siblings ...)
  2017-10-23 14:22   ` [PATCH v2 7/7] net/mlx4: separate Tx for multi-segments Ophir Munk
@ 2017-10-30 10:07   ` Matan Azrad
  2017-10-30 10:07     ` [PATCH v3 1/7] net/mlx4: remove error flows from Tx fast path Matan Azrad
                       ` (8 more replies)
  7 siblings, 9 replies; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 10:07 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

v1:
This series is a follow-up of the new datapath implementation introduced in RC1.
It is dedicated to Tx and improves the performance.

v2:
Change uint32_t be_lkey to rte_be32_t lkey

v3:
Rebase to 17.11-rc2.
Remove "net/mlx4: save lkey in big-endian format" patch which degrade performance.
Merge separate segment cases patches "net/mlx4: improve performance of one Tx segment" and "net/mlx4: separate Tx for multi-segments".
"net/mlx4: inline more Tx functions" renamed to "net/mlx4: associate MR to MP in a short function". 
Leave bytes counter as old version.
Add memory barrier improvement patch.
Remove empty segment support.

Matan Azrad (6):
  net/mlx4: remove error flows from Tx fast path
  net/mlx4: merge Tx path functions
  net/mlx4: remove completion counter in Tx burst
  net/mlx4: separate Tx segment cases
  net/mlx4: mitigate Tx path memory barriers
  net/mlx4: remove empty Tx segment support

Ophir Munk (1):
  net/mlx4: associate MR to MP in a short function

 drivers/net/mlx4/mlx4.h      |   2 +
 drivers/net/mlx4/mlx4_mr.c   |  46 ++++++
 drivers/net/mlx4/mlx4_rxtx.c | 361 +++++++++++++++++++++----------------------
 drivers/net/mlx4/mlx4_rxtx.h |  33 +++-
 4 files changed, 259 insertions(+), 183 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 84+ messages in thread

* [PATCH v3 1/7] net/mlx4: remove error flows from Tx fast path
  2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
@ 2017-10-30 10:07     ` Matan Azrad
  2017-10-30 14:23       ` Adrien Mazarguil
  2017-10-30 10:07     ` [PATCH v3 2/7] net/mlx4: associate MR to MP in a short function Matan Azrad
                       ` (7 subsequent siblings)
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 10:07 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Move unnecessary error flows to DEBUG mode.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 67dc712..4f899ff 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -169,6 +169,7 @@ struct pv {
 		 * Make sure we read the CQE after we read the ownership bit.
 		 */
 		rte_rmb();
+#ifndef NDEBUG
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 			     MLX4_CQE_OPCODE_ERROR)) {
 			struct mlx4_err_cqe *cqe_err =
@@ -178,6 +179,7 @@ struct pv {
 			      (void *)txq, cqe_err->vendor_err,
 			      cqe_err->syndrome);
 		}
+#endif /* NDEBUG */
 		/* Get WQE index reported in the CQE. */
 		new_index =
 			rte_be_to_cpu_16(cqe->wqe_index) & sq->txbb_cnt_mask;
@@ -322,7 +324,6 @@ struct pv {
 	uint32_t byte_count;
 	int wqe_real_size;
 	int nr_txbbs;
-	int rc;
 	struct pv *pv = (struct pv *)txq->bounce_buf;
 	int pv_counter = 0;
 
@@ -337,8 +338,7 @@ struct pv {
 	if (((sq->head - sq->tail) + nr_txbbs +
 	     sq->headroom_txbbs) >= sq->txbb_cnt ||
 	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-		rc = ENOSPC;
-		goto err;
+		return -ENOSPC;
 	}
 	/* Get the control and data entries of the WQE. */
 	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
@@ -354,6 +354,7 @@ struct pv {
 		dseg->addr = rte_cpu_to_be_64(addr);
 		/* Memory region key for this memory pool. */
 		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+#ifndef NDEBUG
 		if (unlikely(lkey == (uint32_t)-1)) {
 			/* MR does not exist. */
 			DEBUG("%p: unable to get MP <-> MR association",
@@ -366,9 +367,9 @@ struct pv {
 			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
 			mlx4_txq_stamp_freed_wqe(sq, head_idx,
 				     (sq->head & sq->txbb_cnt) ? 0 : 1);
-			rc = EFAULT;
-			goto err;
+			return -EFAULT;
 		}
+#endif /* NDEBUG */
 		dseg->lkey = rte_cpu_to_be_32(lkey);
 		if (likely(buf->data_len)) {
 			byte_count = rte_cpu_to_be_32(buf->data_len);
@@ -471,9 +472,6 @@ struct pv {
 					       MLX4_BIT_WQE_OWN : 0));
 	sq->head += nr_txbbs;
 	return 0;
-err:
-	rte_errno = rc;
-	return -rc;
 }
 
 /**
@@ -510,8 +508,6 @@ struct pv {
 	assert(max <= elts_n);
 	/* Always leave one free entry in the ring. */
 	--max;
-	if (max == 0)
-		return 0;
 	if (max > pkts_n)
 		max = pkts_n;
 	for (i = 0; (i != max); ++i) {
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v3 2/7] net/mlx4: associate MR to MP in a short function
  2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
  2017-10-30 10:07     ` [PATCH v3 1/7] net/mlx4: remove error flows from Tx fast path Matan Azrad
@ 2017-10-30 10:07     ` Matan Azrad
  2017-10-30 14:23       ` Adrien Mazarguil
  2017-10-30 10:07     ` [PATCH v3 3/7] net/mlx4: merge Tx path functions Matan Azrad
                       ` (6 subsequent siblings)
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 10:07 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

From: Ophir Munk <ophirmu@mellanox.com>

Associate a memory region (MR) with a mempool (MP) on the data path
in a short function.
Handle the less common case of adding a new memory region for a
mempool in a separate function.

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/mlx4/mlx4.h      |  2 ++
 drivers/net/mlx4/mlx4_mr.c   | 46 +++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_rxtx.c | 57 --------------------------------------------
 drivers/net/mlx4/mlx4_rxtx.h | 33 ++++++++++++++++++++++++-
 4 files changed, 80 insertions(+), 58 deletions(-)

diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index e0a9853..70cf453 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -160,5 +160,7 @@ int mlx4_flow_ctrl_set(struct rte_eth_dev *dev,
 /* mlx4_mr.c */
 
 struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
+			 uint32_t i);
 
 #endif /* RTE_PMD_MLX4_H_ */
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index 9700884..ebf6547 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -56,6 +56,7 @@
 #include <rte_mempool.h>
 
 #include "mlx4_utils.h"
+#include "mlx4_rxtx.h"
 
 struct mlx4_check_mempool_data {
 	int ret;
@@ -181,3 +182,48 @@ struct ibv_mr *
 		rte_errno = errno ? errno : EINVAL;
 	return mr;
 }
+
+/**
+ * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
+ * If mp2mr[] is full, remove an entry first.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be added
+ * @param[in] i
+ *   Index in memory pool (MP) where to add memory region (MR)
+ *
+ * @return
+ *   Added mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
+{
+	struct ibv_mr *mr;
+
+	/* Add a new entry, register MR first. */
+	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+	      (void *)txq, mp->name, (void *)mp);
+	mr = mlx4_mp2mr(txq->priv->pd, mp);
+	if (unlikely(mr == NULL)) {
+		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+		      (void *)txq);
+		return (uint32_t)-1;
+	}
+	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
+		/* Table is full, remove oldest entry. */
+		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+		      (void *)txq);
+		--i;
+		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+	}
+	/* Store the new entry. */
+	txq->mp2mr[i].mp = mp;
+	txq->mp2mr[i].mr = mr;
+	txq->mp2mr[i].lkey = mr->lkey;
+	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIU32,
+	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
+	return txq->mp2mr[i].lkey;
+}
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 4f899ff..f89df46 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -239,63 +239,6 @@ struct pv {
 }
 
 /**
- * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
- *
- * @param txq
- *   Pointer to Tx queue structure.
- * @param[in] mp
- *   Memory pool for which a memory region lkey must be returned.
- *
- * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
- */
-uint32_t
-mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
-{
-	unsigned int i;
-	struct ibv_mr *mr;
-
-	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mp == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-			return txq->mp2mr[i].lkey;
-		}
-	}
-	/* Add a new entry, register MR first. */
-	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
-	      (void *)txq, mp->name, (void *)mp);
-	mr = mlx4_mp2mr(txq->priv->pd, mp);
-	if (unlikely(mr == NULL)) {
-		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
-		      (void *)txq);
-		return (uint32_t)-1;
-	}
-	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
-		/* Table is full, remove oldest entry. */
-		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
-		      (void *)txq);
-		--i;
-		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
-		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
-			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
-	}
-	/* Store the new entry. */
-	txq->mp2mr[i].mp = mp;
-	txq->mp2mr[i].mr = mr;
-	txq->mp2mr[i].lkey = mr->lkey;
-	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
-	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
-	return txq->mp2mr[i].lkey;
-}
-
-/**
  * Posts a single work request to a send queue.
  *
  * @param txq
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 7d67748..b32fcc9 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -36,6 +36,7 @@
 
 #include <stdint.h>
 #include <sys/queue.h>
+#include <assert.h>
 
 /* Verbs headers do not support -pedantic. */
 #ifdef PEDANTIC
@@ -165,7 +166,6 @@ int mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 
 /* mlx4_rxtx.c */
 
-uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
 uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
 		       uint16_t pkts_n);
 uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
@@ -182,4 +182,35 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 			const struct rte_eth_txconf *conf);
 void mlx4_tx_queue_release(void *dpdk_txq);
 
+
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Call mlx4_txq_add_mr() if MP is not registered yet.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ *   mr->lkey on success, (uint32_t)-1 on failure.
+ */
+static inline uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+	unsigned int i;
+
+	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+		if (unlikely(txq->mp2mr[i].mp == NULL)) {
+			/* Unknown MP, add a new MR for it. */
+			break;
+		}
+		if (txq->mp2mr[i].mp == mp) {
+			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
+			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+			return txq->mp2mr[i].lkey;
+		}
+	}
+	return mlx4_txq_add_mr(txq, mp, i);
+}
 #endif /* MLX4_RXTX_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v3 3/7] net/mlx4: merge Tx path functions
  2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
  2017-10-30 10:07     ` [PATCH v3 1/7] net/mlx4: remove error flows from Tx fast path Matan Azrad
  2017-10-30 10:07     ` [PATCH v3 2/7] net/mlx4: associate MR to MP in a short function Matan Azrad
@ 2017-10-30 10:07     ` Matan Azrad
  2017-10-30 14:23       ` Adrien Mazarguil
  2017-10-30 10:07     ` [PATCH v3 4/7] net/mlx4: remove completion counter in Tx burst Matan Azrad
                       ` (5 subsequent siblings)
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 10:07 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Merge the tx_burst and mlx4_post_send functions to avoid checking the
remaining WQ space twice.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 357 +++++++++++++++++++++----------------------
 1 file changed, 172 insertions(+), 185 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index f89df46..a8d8e81 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -239,185 +239,6 @@ struct pv {
 }
 
 /**
- * Posts a single work request to a send queue.
- *
- * @param txq
- *   Target Tx queue.
- * @param pkt
- *   Packet to transmit.
- *
- * @return
- *   0 on success, negative errno value otherwise and rte_errno is set.
- */
-static inline int
-mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
-{
-	struct mlx4_wqe_ctrl_seg *ctrl;
-	struct mlx4_wqe_data_seg *dseg;
-	struct mlx4_sq *sq = &txq->msq;
-	struct rte_mbuf *buf;
-	union {
-		uint32_t flags;
-		uint16_t flags16[2];
-	} srcrb;
-	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
-	uint32_t lkey;
-	uintptr_t addr;
-	uint32_t owner_opcode = MLX4_OPCODE_SEND;
-	uint32_t byte_count;
-	int wqe_real_size;
-	int nr_txbbs;
-	struct pv *pv = (struct pv *)txq->bounce_buf;
-	int pv_counter = 0;
-
-	/* Calculate the needed work queue entry size for this packet. */
-	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-			pkt->nb_segs * sizeof(struct mlx4_wqe_data_seg);
-	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-	/*
-	 * Check that there is room for this WQE in the send queue and that
-	 * the WQE size is legal.
-	 */
-	if (((sq->head - sq->tail) + nr_txbbs +
-	     sq->headroom_txbbs) >= sq->txbb_cnt ||
-	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-		return -ENOSPC;
-	}
-	/* Get the control and data entries of the WQE. */
-	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
-	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-					    sizeof(struct mlx4_wqe_ctrl_seg));
-	/* Fill the data segments with buffer information. */
-	for (buf = pkt; buf != NULL; buf = buf->next, dseg++) {
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		rte_prefetch0((volatile void *)addr);
-		/* Handle WQE wraparound. */
-		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
-			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
-		dseg->addr = rte_cpu_to_be_64(addr);
-		/* Memory region key for this memory pool. */
-		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
-#ifndef NDEBUG
-		if (unlikely(lkey == (uint32_t)-1)) {
-			/* MR does not exist. */
-			DEBUG("%p: unable to get MP <-> MR association",
-			      (void *)txq);
-			/*
-			 * Restamp entry in case of failure.
-			 * Make sure that size is written correctly
-			 * Note that we give ownership to the SW, not the HW.
-			 */
-			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-			mlx4_txq_stamp_freed_wqe(sq, head_idx,
-				     (sq->head & sq->txbb_cnt) ? 0 : 1);
-			return -EFAULT;
-		}
-#endif /* NDEBUG */
-		dseg->lkey = rte_cpu_to_be_32(lkey);
-		if (likely(buf->data_len)) {
-			byte_count = rte_cpu_to_be_32(buf->data_len);
-		} else {
-			/*
-			 * Zero length segment is treated as inline segment
-			 * with zero data.
-			 */
-			byte_count = RTE_BE32(0x80000000);
-		}
-		/*
-		 * If the data segment is not at the beginning of a
-		 * Tx basic block (TXBB) then write the byte count,
-		 * else postpone the writing to just before updating the
-		 * control segment.
-		 */
-		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-			/*
-			 * Need a barrier here before writing the byte_count
-			 * fields to make sure that all the data is visible
-			 * before the byte_count field is set.
-			 * Otherwise, if the segment begins a new cacheline,
-			 * the HCA prefetcher could grab the 64-byte chunk and
-			 * get a valid (!= 0xffffffff) byte count but stale
-			 * data, and end up sending the wrong data.
-			 */
-			rte_io_wmb();
-			dseg->byte_count = byte_count;
-		} else {
-			/*
-			 * This data segment starts at the beginning of a new
-			 * TXBB, so we need to postpone its byte_count writing
-			 * for later.
-			 */
-			pv[pv_counter].dseg = dseg;
-			pv[pv_counter++].val = byte_count;
-		}
-	}
-	/* Write the first DWORD of each TXBB save earlier. */
-	if (pv_counter) {
-		/* Need a barrier here before writing the byte_count. */
-		rte_io_wmb();
-		for (--pv_counter; pv_counter  >= 0; pv_counter--)
-			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
-	}
-	/* Fill the control parameters for this packet. */
-	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-	/*
-	 * For raw Ethernet, the SOLICIT flag is used to indicate that no ICRC
-	 * should be calculated.
-	 */
-	txq->elts_comp_cd -= nr_txbbs;
-	if (unlikely(txq->elts_comp_cd <= 0)) {
-		txq->elts_comp_cd = txq->elts_comp_cd_init;
-		srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
-				       MLX4_WQE_CTRL_CQ_UPDATE);
-	} else {
-		srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
-	}
-	/* Enable HW checksum offload if requested */
-	if (txq->csum &&
-	    (pkt->ol_flags &
-	     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
-		const uint64_t is_tunneled = (pkt->ol_flags &
-					      (PKT_TX_TUNNEL_GRE |
-					       PKT_TX_TUNNEL_VXLAN));
-
-		if (is_tunneled && txq->csum_l2tun) {
-			owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
-					MLX4_WQE_CTRL_IL4_HDR_CSUM;
-			if (pkt->ol_flags & PKT_TX_OUTER_IP_CKSUM)
-				srcrb.flags |=
-					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
-		} else {
-			srcrb.flags |= RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
-						MLX4_WQE_CTRL_TCP_UDP_CSUM);
-		}
-	}
-	if (txq->lb) {
-		/*
-		 * Copy destination MAC address to the WQE, this allows
-		 * loopback in eSwitch, so that VFs and PF can communicate
-		 * with each other.
-		 */
-		srcrb.flags16[0] = *(rte_pktmbuf_mtod(pkt, uint16_t *));
-		ctrl->imm = *(rte_pktmbuf_mtod_offset(pkt, uint32_t *,
-						      sizeof(uint16_t)));
-	} else {
-		ctrl->imm = 0;
-	}
-	ctrl->srcrb_flags = srcrb.flags;
-	/*
-	 * Make sure descriptor is fully written before
-	 * setting ownership bit (because HW can start
-	 * executing as soon as we do).
-	 */
-	rte_wmb();
-	ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
-					      ((sq->head & sq->txbb_cnt) ?
-					       MLX4_BIT_WQE_OWN : 0));
-	sq->head += nr_txbbs;
-	return 0;
-}
-
-/**
  * DPDK callback for Tx.
  *
  * @param dpdk_txq
@@ -440,7 +261,8 @@ struct pv {
 	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
-	int err;
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
 
 	assert(txq->elts_comp_cd != 0);
 	mlx4_txq_complete(txq);
@@ -459,6 +281,21 @@ struct pv {
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
+		uint32_t owner_opcode = MLX4_OPCODE_SEND;
+		struct mlx4_wqe_ctrl_seg *ctrl;
+		struct mlx4_wqe_data_seg *dseg;
+		struct rte_mbuf *sbuf;
+		union {
+			uint32_t flags;
+			uint16_t flags16[2];
+		} srcrb;
+		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+		uint32_t lkey;
+		uintptr_t addr;
+		uint32_t byte_count;
+		int wqe_real_size;
+		int nr_txbbs;
+		int pv_counter = 0;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -477,18 +314,168 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		/* Post the packet for sending. */
-		err = mlx4_post_send(txq, buf);
-		if (unlikely(err)) {
+
+		/*
+		 * Calculate the needed work queue entry size
+		 * for this packet.
+		 */
+		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+		/*
+		 * Check that there is room for this WQE in the send
+		 * queue and that the WQE size is legal.
+		 */
+		if (((sq->head - sq->tail) + nr_txbbs +
+		     sq->headroom_txbbs) >= sq->txbb_cnt ||
+		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
 			elt->buf = NULL;
-			goto stop;
+			break;
 		}
+		/* Get the control and data entries of the WQE. */
+		ctrl = (struct mlx4_wqe_ctrl_seg *)
+				mlx4_get_send_wqe(sq, head_idx);
+		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+				sizeof(struct mlx4_wqe_ctrl_seg));
+		/* Fill the data segments with buffer information. */
+		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			rte_prefetch0((volatile void *)addr);
+			/* Handle WQE wraparound. */
+			if (unlikely(dseg >=
+			    (struct mlx4_wqe_data_seg *)sq->eob))
+				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+			dseg->addr = rte_cpu_to_be_64(addr);
+			/* Memory region key (big endian). */
+			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+			dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+			if (unlikely(dseg->lkey ==
+				rte_cpu_to_be_32((uint32_t)-1))) {
+				/* MR does not exist. */
+				DEBUG("%p: unable to get MP <-> MR association",
+				      (void *)txq);
+				/*
+				 * Restamp entry in case of failure.
+				 * Make sure that size is written correctly
+				 * Note that we give ownership to the SW,
+				 * not the HW.
+				 */
+				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+				mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					     (sq->head & sq->txbb_cnt) ? 0 : 1);
+				elt->buf = NULL;
+				break;
+			}
+#endif /* NDEBUG */
+			if (likely(sbuf->data_len)) {
+				byte_count = rte_cpu_to_be_32(sbuf->data_len);
+			} else {
+				/*
+				 * Zero length segment is treated as inline
+				 * segment with zero data.
+				 */
+				byte_count = RTE_BE32(0x80000000);
+			}
+			/*
+			 * If the data segment is not at the beginning
+			 * of a Tx basic block (TXBB) then write the
+			 * byte count, else postpone the writing to
+			 * just before updating the control segment.
+			 */
+			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+				/*
+				 * Need a barrier here before writing the
+				 * byte_count fields to make sure that all the
+				 * data is visible before the byte_count field
+				 * is set. otherwise, if the segment begins a
+				 * new cacheline, the HCA prefetcher could grab
+				 * the 64-byte chunk and get a valid
+				 * (!= 0xffffffff) byte count but stale data,
+				 * and end up sending the wrong data.
+				 */
+				rte_io_wmb();
+				dseg->byte_count = byte_count;
+			} else {
+				/*
+				 * This data segment starts at the beginning of
+				 * a new TXBB, so we need to postpone its
+				 * byte_count writing for later.
+				 */
+				pv[pv_counter].dseg = dseg;
+				pv[pv_counter++].val = byte_count;
+			}
+		}
+		/* Write the first DWORD of each TXBB save earlier. */
+		if (pv_counter) {
+			/* Need a barrier before writing the byte_count. */
+			rte_io_wmb();
+			for (--pv_counter; pv_counter  >= 0; pv_counter--)
+				pv[pv_counter].dseg->byte_count =
+						pv[pv_counter].val;
+		}
+		/* Fill the control parameters for this packet. */
+		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+		/*
+		 * For raw Ethernet, the SOLICIT flag is used to indicate
+		 * that no ICRC should be calculated.
+		 */
+		txq->elts_comp_cd -= nr_txbbs;
+		if (unlikely(txq->elts_comp_cd <= 0)) {
+			txq->elts_comp_cd = txq->elts_comp_cd_init;
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
+					       MLX4_WQE_CTRL_CQ_UPDATE);
+		} else {
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
+		}
+		/* Enable HW checksum offload if requested */
+		if (txq->csum &&
+		    (buf->ol_flags &
+		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
+			const uint64_t is_tunneled = (buf->ol_flags &
+						      (PKT_TX_TUNNEL_GRE |
+						       PKT_TX_TUNNEL_VXLAN));
+
+			if (is_tunneled && txq->csum_l2tun) {
+				owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
+						MLX4_WQE_CTRL_IL4_HDR_CSUM;
+				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+					srcrb.flags |=
+					    RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
+			} else {
+				srcrb.flags |=
+					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+						MLX4_WQE_CTRL_TCP_UDP_CSUM);
+			}
+		}
+		if (txq->lb) {
+			/*
+			 * Copy destination MAC address to the WQE, this allows
+			 * loopback in eSwitch, so that VFs and PF can
+			 * communicate with each other.
+			 */
+			srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
+			ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
+					      sizeof(uint16_t)));
+		} else {
+			ctrl->imm = 0;
+		}
+		ctrl->srcrb_flags = srcrb.flags;
+		/*
+		 * Make sure descriptor is fully written before
+		 * setting ownership bit (because HW can start
+		 * executing as soon as we do).
+		 */
+		rte_wmb();
+		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
+					      ((sq->head & sq->txbb_cnt) ?
+						       MLX4_BIT_WQE_OWN : 0));
+		sq->head += nr_txbbs;
 		elt->buf = buf;
 		bytes_sent += buf->pkt_len;
 		++elts_comp;
 		elts_head = elts_head_next;
 	}
-stop:
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
 		return 0;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v3 4/7] net/mlx4: remove completion counter in Tx burst
  2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
                       ` (2 preceding siblings ...)
  2017-10-30 10:07     ` [PATCH v3 3/7] net/mlx4: merge Tx path functions Matan Azrad
@ 2017-10-30 10:07     ` Matan Azrad
  2017-10-30 14:23       ` Adrien Mazarguil
  2017-10-30 10:07     ` [PATCH v3 5/7] net/mlx4: separate Tx segment cases Matan Azrad
                       ` (4 subsequent siblings)
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 10:07 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Remove the usage of a variable that counts the packets for completion
and does not add more information than the packets counter.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index a8d8e81..8ce70d6 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -257,7 +257,6 @@ struct pv {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	unsigned int elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int elts_comp = 0;
 	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
@@ -473,7 +472,6 @@ struct pv {
 		sq->head += nr_txbbs;
 		elt->buf = buf;
 		bytes_sent += buf->pkt_len;
-		++elts_comp;
 		elts_head = elts_head_next;
 	}
 	/* Take a shortcut if nothing must be sent. */
@@ -487,7 +485,7 @@ struct pv {
 	/* Ring QP doorbell. */
 	rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
 	txq->elts_head = elts_head;
-	txq->elts_comp += elts_comp;
+	txq->elts_comp += i;
 	return i;
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v3 5/7] net/mlx4: separate Tx segment cases
  2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
                       ` (3 preceding siblings ...)
  2017-10-30 10:07     ` [PATCH v3 4/7] net/mlx4: remove completion counter in Tx burst Matan Azrad
@ 2017-10-30 10:07     ` Matan Azrad
  2017-10-30 14:23       ` Adrien Mazarguil
  2017-10-30 10:07     ` [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers Matan Azrad
                       ` (3 subsequent siblings)
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 10:07 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Since single-segment packets do not need additional memory to save
segment byte counts, avoid this unnecessary memory usage and the loop
management in that case.

Call a dedicated function to handle the multi-segment case.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 247 +++++++++++++++++++++++++++----------------
 1 file changed, 158 insertions(+), 89 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 8ce70d6..8ea8851 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -62,6 +62,9 @@
 #include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
+#define WQE_ONE_DATA_SEG_SIZE \
+	(sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
+
 /**
  * Pointer-value pair structure used in tx_post_send for saving the first
  * DWORD (32 byte) of a TXBB.
@@ -140,22 +143,19 @@ struct pv {
  * @return
  *   0 on success, -1 on failure.
  */
-static int
-mlx4_txq_complete(struct txq *txq)
+static inline int
+mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
+				struct mlx4_sq *sq)
 {
 	unsigned int elts_comp = txq->elts_comp;
 	unsigned int elts_tail = txq->elts_tail;
-	const unsigned int elts_n = txq->elts_n;
 	struct mlx4_cq *cq = &txq->mcq;
-	struct mlx4_sq *sq = &txq->msq;
 	struct mlx4_cqe *cqe;
 	uint32_t cons_index = cq->cons_index;
 	uint16_t new_index;
 	uint16_t nr_txbbs = 0;
 	int pkts = 0;
 
-	if (unlikely(elts_comp == 0))
-		return 0;
 	/*
 	 * Traverse over all CQ entries reported and handle each WQ entry
 	 * reported by them.
@@ -238,6 +238,122 @@ struct pv {
 	return buf->pool;
 }
 
+static int handle_multi_segs(struct rte_mbuf *buf,
+			    struct txq *txq,
+			    struct mlx4_wqe_ctrl_seg **pctrl)
+{
+	int wqe_real_size;
+	int nr_txbbs;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
+	struct mlx4_sq *sq = &txq->msq;
+	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_data_seg *dseg;
+	uint32_t lkey;
+	uintptr_t addr;
+	uint32_t byte_count;
+	int pv_counter = 0;
+
+	/* Calculate the needed work queue entry size for this packet. */
+	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+		buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+	/*
+	 * Check that there is room for this WQE in the send queue and that
+	 * the WQE size is legal.
+	 */
+	if (((sq->head - sq->tail) + nr_txbbs +
+				sq->headroom_txbbs) >= sq->txbb_cnt ||
+			nr_txbbs > MLX4_MAX_WQE_TXBBS) {
+		return -1;
+	}
+
+	/* Get the control and data entries of the WQE. */
+	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
+	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+			sizeof(struct mlx4_wqe_ctrl_seg));
+	*pctrl = ctrl;
+	/* Fill the data segments with buffer information. */
+	struct rte_mbuf *sbuf;
+
+	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+		rte_prefetch0((volatile void *)addr);
+		/* Handle WQE wraparound. */
+		if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
+			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+		dseg->addr = rte_cpu_to_be_64(addr);
+		/* Memory region key (big endian) for this memory pool. */
+		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+		dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+		/* Calculate the needed work queue entry size for this packet */
+		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
+			/* MR does not exist. */
+			DEBUG("%p: unable to get MP <-> MR association",
+					(void *)txq);
+			/*
+			 * Restamp entry in case of failure.
+			 * Make sure that size is written correctly
+			 * Note that we give ownership to the SW, not the HW.
+			 */
+			wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+			mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					(sq->head & sq->txbb_cnt) ? 0 : 1);
+			return -1;
+		}
+#endif /* NDEBUG */
+		if (likely(sbuf->data_len)) {
+			byte_count = rte_cpu_to_be_32(sbuf->data_len);
+		} else {
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			byte_count = RTE_BE32(0x80000000);
+		}
+		/*
+		 * If the data segment is not at the beginning of a
+		 * Tx basic block (TXBB) then write the byte count,
+		 * else postpone the writing to just before updating the
+		 * control segment.
+		 */
+		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+			/*
+			 * Need a barrier here before writing the byte_count
+			 * fields to make sure that all the data is visible
+			 * before the byte_count field is set.
+			 * Otherwise, if the segment begins a new cacheline,
+			 * the HCA prefetcher could grab the 64-byte chunk and
+			 * get a valid (!= 0xffffffff) byte count but stale
+			 * data, and end up sending the wrong data.
+			 */
+			rte_io_wmb();
+			dseg->byte_count = byte_count;
+		} else {
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[pv_counter].dseg = dseg;
+			pv[pv_counter++].val = byte_count;
+		}
+	}
+	/* Write the first DWORD of each TXBB save earlier. */
+	if (pv_counter) {
+		/* Need a barrier here before writing the byte_count. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
+	}
+	/* Fill the control parameters for this packet. */
+	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+
+	return nr_txbbs;
+}
 /**
  * DPDK callback for Tx.
  *
@@ -261,10 +377,11 @@ struct pv {
 	unsigned int i;
 	unsigned int max;
 	struct mlx4_sq *sq = &txq->msq;
-	struct pv *pv = (struct pv *)txq->bounce_buf;
+	int nr_txbbs;
 
 	assert(txq->elts_comp_cd != 0);
-	mlx4_txq_complete(txq);
+	if (likely(txq->elts_comp != 0))
+		mlx4_txq_complete(txq, elts_n, sq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
@@ -283,7 +400,6 @@ struct pv {
 		uint32_t owner_opcode = MLX4_OPCODE_SEND;
 		struct mlx4_wqe_ctrl_seg *ctrl;
 		struct mlx4_wqe_data_seg *dseg;
-		struct rte_mbuf *sbuf;
 		union {
 			uint32_t flags;
 			uint16_t flags16[2];
@@ -291,10 +407,6 @@ struct pv {
 		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
 		uint32_t lkey;
 		uintptr_t addr;
-		uint32_t byte_count;
-		int wqe_real_size;
-		int nr_txbbs;
-		int pv_counter = 0;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -313,40 +425,29 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-
-		/*
-		 * Calculate the needed work queue entry size
-		 * for this packet.
-		 */
-		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
-		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-		/*
-		 * Check that there is room for this WQE in the send
-		 * queue and that the WQE size is legal.
-		 */
-		if (((sq->head - sq->tail) + nr_txbbs +
-		     sq->headroom_txbbs) >= sq->txbb_cnt ||
-		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-			elt->buf = NULL;
-			break;
-		}
-		/* Get the control and data entries of the WQE. */
-		ctrl = (struct mlx4_wqe_ctrl_seg *)
-				mlx4_get_send_wqe(sq, head_idx);
-		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-				sizeof(struct mlx4_wqe_ctrl_seg));
-		/* Fill the data segments with buffer information. */
-		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
-			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+		if (buf->nb_segs == 1) {
+			/*
+			 * Check that there is room for this WQE in the send
+			 * queue and that the WQE size is legal
+			 */
+			if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs) >=
+			     sq->txbb_cnt || 1 > MLX4_MAX_WQE_TXBBS) {
+				elt->buf = NULL;
+				break;
+			}
+			/* Get the control and data entries of the WQE. */
+			ctrl = (struct mlx4_wqe_ctrl_seg *)
+					mlx4_get_send_wqe(sq, head_idx);
+			dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+					sizeof(struct mlx4_wqe_ctrl_seg));
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			rte_prefetch0((volatile void *)addr);
 			/* Handle WQE wraparound. */
-			if (unlikely(dseg >=
-			    (struct mlx4_wqe_data_seg *)sq->eob))
+			if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
 				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
 			dseg->addr = rte_cpu_to_be_64(addr);
 			/* Memory region key (big endian). */
-			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
 			dseg->lkey = rte_cpu_to_be_32(lkey);
 #ifndef NDEBUG
 			if (unlikely(dseg->lkey ==
@@ -360,61 +461,28 @@ struct pv {
 				 * Note that we give ownership to the SW,
 				 * not the HW.
 				 */
-				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+				ctrl->fence_size =
+					(WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
 				mlx4_txq_stamp_freed_wqe(sq, head_idx,
 					     (sq->head & sq->txbb_cnt) ? 0 : 1);
 				elt->buf = NULL;
 				break;
 			}
 #endif /* NDEBUG */
-			if (likely(sbuf->data_len)) {
-				byte_count = rte_cpu_to_be_32(sbuf->data_len);
-			} else {
-				/*
-				 * Zero length segment is treated as inline
-				 * segment with zero data.
-				 */
-				byte_count = RTE_BE32(0x80000000);
-			}
-			/*
-			 * If the data segment is not at the beginning
-			 * of a Tx basic block (TXBB) then write the
-			 * byte count, else postpone the writing to
-			 * just before updating the control segment.
-			 */
-			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-				/*
-				 * Need a barrier here before writing the
-				 * byte_count fields to make sure that all the
-				 * data is visible before the byte_count field
-				 * is set. otherwise, if the segment begins a
-				 * new cacheline, the HCA prefetcher could grab
-				 * the 64-byte chunk and get a valid
-				 * (!= 0xffffffff) byte count but stale data,
-				 * and end up sending the wrong data.
-				 */
-				rte_io_wmb();
-				dseg->byte_count = byte_count;
-			} else {
-				/*
-				 * This data segment starts at the beginning of
-				 * a new TXBB, so we need to postpone its
-				 * byte_count writing for later.
-				 */
-				pv[pv_counter].dseg = dseg;
-				pv[pv_counter++].val = byte_count;
-			}
-		}
-		/* Write the first DWORD of each TXBB save earlier. */
-		if (pv_counter) {
-			/* Need a barrier before writing the byte_count. */
+			/* Need a barrier here before byte count store. */
 			rte_io_wmb();
-			for (--pv_counter; pv_counter  >= 0; pv_counter--)
-				pv[pv_counter].dseg->byte_count =
-						pv[pv_counter].val;
+			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
+
+			/* Fill the control parameters for this packet. */
+			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+			nr_txbbs = 1;
+		} else {
+			nr_txbbs = handle_multi_segs(buf, txq, &ctrl);
+			if (nr_txbbs < 0) {
+				elt->buf = NULL;
+				break;
+			}
 		}
-		/* Fill the control parameters for this packet. */
-		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
 		/*
 		 * For raw Ethernet, the SOLICIT flag is used to indicate
 		 * that no ICRC should be calculated.
@@ -469,6 +537,7 @@ struct pv {
 		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
 					      ((sq->head & sq->txbb_cnt) ?
 						       MLX4_BIT_WQE_OWN : 0));
+
 		sq->head += nr_txbbs;
 		elt->buf = buf;
 		bytes_sent += buf->pkt_len;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers
  2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
                       ` (4 preceding siblings ...)
  2017-10-30 10:07     ` [PATCH v3 5/7] net/mlx4: separate Tx segment cases Matan Azrad
@ 2017-10-30 10:07     ` Matan Azrad
  2017-10-30 14:23       ` Adrien Mazarguil
  2017-10-30 10:07     ` [PATCH v3 7/7] net/mlx4: remove empty Tx segment support Matan Azrad
                       ` (2 subsequent siblings)
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 10:07 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Replace most of the memory barriers with compiler barriers, since they all
target DRAM; this improves code efficiency on systems that already enforce
store ordering between different addresses.

Only the doorbell record store should be protected by a memory barrier,
since it targets the PCI memory domain.

Limit the compiler barrier before the byte count store to systems whose
cache line size is smaller than 64B (the TXBB size).
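
For illustration only (not part of the patch), here is a minimal sketch of
the intended barrier usage, assuming DPDK's rte_io_wmb()/rte_wmb()
semantics; "desc" and "doorbell" are hypothetical placeholders, not mlx4
names:

    #include <stdint.h>
    #include <rte_atomic.h>

    /* Post a descriptor living in coherent (DRAM) memory, then write a
     * doorbell targeting the PCI memory domain. */
    static inline void
    post_and_ring(volatile uint32_t *desc, volatile uint32_t *doorbell,
                  uint32_t val)
    {
            desc[1] = val;          /* payload word, plain DRAM store */
            rte_io_wmb();           /* order DRAM stores among themselves */
            desc[0] = val | 0x1;    /* e.g. hand ownership to HW */
            rte_wmb();              /* full barrier before the PCI-domain store */
            *doorbell = 1;          /* doorbell write */
    }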

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 8ea8851..482c399 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -168,7 +168,7 @@ struct pv {
 		/*
 		 * Make sure we read the CQE after we read the ownership bit.
 		 */
-		rte_rmb();
+		rte_io_rmb();
 #ifndef NDEBUG
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 			     MLX4_CQE_OPCODE_ERROR)) {
@@ -203,7 +203,7 @@ struct pv {
 	 */
 	cq->cons_index = cons_index;
 	*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & MLX4_CQ_DB_CI_MASK);
-	rte_wmb();
+	rte_io_wmb();
 	sq->tail = sq->tail + nr_txbbs;
 	/* Update the list of packets posted for transmission. */
 	elts_comp -= pkts;
@@ -321,6 +321,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
 		 * control segment.
 		 */
 		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+#if RTE_CACHE_LINE_SIZE < 64
 			/*
 			 * Need a barrier here before writing the byte_count
 			 * fields to make sure that all the data is visible
@@ -331,6 +332,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
 			 * data, and end up sending the wrong data.
 			 */
 			rte_io_wmb();
+#endif /* RTE_CACHE_LINE_SIZE */
 			dseg->byte_count = byte_count;
 		} else {
 			/*
@@ -469,8 +471,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
 				break;
 			}
 #endif /* NDEBUG */
-			/* Need a barrier here before byte count store. */
-			rte_io_wmb();
+			/* Never be TXBB aligned, no need compiler barrier. */
 			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
 
 			/* Fill the control parameters for this packet. */
@@ -533,7 +534,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
 		 * setting ownership bit (because HW can start
 		 * executing as soon as we do).
 		 */
-		rte_wmb();
+		rte_io_wmb();
 		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
 					      ((sq->head & sq->txbb_cnt) ?
 						       MLX4_BIT_WQE_OWN : 0));
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v3 7/7] net/mlx4: remove empty Tx segment support
  2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
                       ` (5 preceding siblings ...)
  2017-10-30 10:07     ` [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers Matan Azrad
@ 2017-10-30 10:07     ` Matan Azrad
  2017-10-30 14:24       ` Adrien Mazarguil
  2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 10:07 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Move empty segment case processing to debug mode.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 482c399..c005a41 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -305,15 +305,18 @@ static int handle_multi_segs(struct rte_mbuf *buf,
 			return -1;
 		}
 #endif /* NDEBUG */
-		if (likely(sbuf->data_len)) {
-			byte_count = rte_cpu_to_be_32(sbuf->data_len);
-		} else {
+		byte_count = rte_cpu_to_be_32(sbuf->data_len);
+#ifndef NDEBUG
+		if (unlikely(!sbuf->data_len)) {
+			DEBUG("%p: Empty segment is not allowed",
+					(void *)txq);
 			/*
 			 * Zero length segment is treated as inline segment
 			 * with zero data.
 			 */
 			byte_count = RTE_BE32(0x80000000);
 		}
+#endif /* NDEBUG */
 		/*
 		 * If the data segment is not at the beginning of a
 		 * Tx basic block (TXBB) then write the byte count,
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 1/7] net/mlx4: remove error flows from Tx fast path
  2017-10-30 10:07     ` [PATCH v3 1/7] net/mlx4: remove error flows from Tx fast path Matan Azrad
@ 2017-10-30 14:23       ` Adrien Mazarguil
  2017-10-30 18:11         ` Matan Azrad
  0 siblings, 1 reply; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-30 14:23 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Mon, Oct 30, 2017 at 10:07:23AM +0000, Matan Azrad wrote:
> Move unnecessary error flows to DEBUG mode.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>
> Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

I missed a couple of details while reviewing the original version. The first
one is that mlx4_post_send()'s return value is still documented as updating
rte_errno in case of error, which is no longer the case after this patch.
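
One way to keep the documented behavior (a quick sketch reusing names from
the patch, assuming rte_errno.h/errno.h are included; updating the
documentation instead would be just as valid) is to keep setting rte_errno
before bailing out:

    if (((sq->head - sq->tail) + nr_txbbs +
         sq->headroom_txbbs) >= sq->txbb_cnt ||
        nr_txbbs > MLX4_MAX_WQE_TXBBS) {
            rte_errno = ENOSPC;
            return -rte_errno;
    }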

Please see below for the other one:

> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 16 ++++++----------
>  1 file changed, 6 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
<snip>
>  /**
> @@ -510,8 +508,6 @@ struct pv {
>  	assert(max <= elts_n);
>  	/* Always leave one free entry in the ring. */
>  	--max;
> -	if (max == 0)
> -		return 0;
>  	if (max > pkts_n)
>  		max = pkts_n;
>  	for (i = 0; (i != max); ++i) {

While minor, this change has nothing to do with this patch, right?

I think it can slightly degrade application performance as it removes the
guarantee that subsequent code only needs to be run if there is at least one
packet to process in case the TX ring is constantly full (SW faster than
HW).

Can you remove it?

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 2/7] net/mlx4: associate MR to MP in a short function
  2017-10-30 10:07     ` [PATCH v3 2/7] net/mlx4: associate MR to MP in a short function Matan Azrad
@ 2017-10-30 14:23       ` Adrien Mazarguil
  2017-10-31 13:25         ` Ophir Munk
  0 siblings, 1 reply; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-30 14:23 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Mon, Oct 30, 2017 at 10:07:24AM +0000, Matan Azrad wrote:
> From: Ophir Munk <ophirmu@mellanox.com>
> 
> Associate memory region to mempool (on data path)
> in a short function.
> Handle the less common case of adding a new memory
> region to mempool in a separate function.
> 
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>

Thanks for moving that function out of the way, a few more comments below.

By the way, commit log lines can be up to 75 characters wide; only the title
line is limited to 50.

> ---
>  drivers/net/mlx4/mlx4.h      |  2 ++
>  drivers/net/mlx4/mlx4_mr.c   | 46 +++++++++++++++++++++++++++++++++++
>  drivers/net/mlx4/mlx4_rxtx.c | 57 --------------------------------------------
>  drivers/net/mlx4/mlx4_rxtx.h | 33 ++++++++++++++++++++++++-
>  4 files changed, 80 insertions(+), 58 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
> index e0a9853..70cf453 100644
> --- a/drivers/net/mlx4/mlx4.h
> +++ b/drivers/net/mlx4/mlx4.h
> @@ -160,5 +160,7 @@ int mlx4_flow_ctrl_set(struct rte_eth_dev *dev,
>  /* mlx4_mr.c */
>  
>  struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
> +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> +			 uint32_t i);
>  
>  #endif /* RTE_PMD_MLX4_H_ */
> diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
> index 9700884..ebf6547 100644
> --- a/drivers/net/mlx4/mlx4_mr.c
> +++ b/drivers/net/mlx4/mlx4_mr.c
> @@ -56,6 +56,7 @@
>  #include <rte_mempool.h>
>  
>  #include "mlx4_utils.h"
> +#include "mlx4_rxtx.h"

Alphabetical order is preferred, or so I thought. Nothing seems to enforce
this in the coding rules, so it's only a suggestion.

>  
>  struct mlx4_check_mempool_data {
>  	int ret;
> @@ -181,3 +182,48 @@ struct ibv_mr *
>  		rte_errno = errno ? errno : EINVAL;
>  	return mr;
>  }
> +
> +/**
> + * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
> + * If mp2mr[] is full, remove an entry first.
> + *
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param[in] mp
> + *   Memory pool for which a memory region lkey must be added

Missing "."

> + * @param[in] i
> + *   Index in memory pool (MP) where to add memory region (MR)

Same here.

> + *
> + * @return
> + *   Added mr->lkey on success, (uint32_t)-1 on failure.
> + */
> +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)

Coding style again; return type must be on a separate line for definitions
(not declarations).

> +{
> +	struct ibv_mr *mr;
> +
> +	/* Add a new entry, register MR first. */
> +	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> +	      (void *)txq, mp->name, (void *)mp);
> +	mr = mlx4_mp2mr(txq->priv->pd, mp);
> +	if (unlikely(mr == NULL)) {
> +		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
> +		      (void *)txq);
> +		return (uint32_t)-1;
> +	}
> +	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
> +		/* Table is full, remove oldest entry. */
> +		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
> +		      (void *)txq);
> +		--i;
> +		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
> +		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
> +			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
> +	}
> +	/* Store the new entry. */
> +	txq->mp2mr[i].mp = mp;
> +	txq->mp2mr[i].mr = mr;
> +	txq->mp2mr[i].lkey = mr->lkey;
> +	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIU32,
> +	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
> +	return txq->mp2mr[i].lkey;
> +}
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 4f899ff..f89df46 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -239,63 +239,6 @@ struct pv {
>  }
>  
>  /**
> - * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
> - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
> - * remove an entry first.
> - *
> - * @param txq
> - *   Pointer to Tx queue structure.
> - * @param[in] mp
> - *   Memory pool for which a memory region lkey must be returned.
> - *
> - * @return
> - *   mr->lkey on success, (uint32_t)-1 on failure.
> - */
> -uint32_t
> -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> -{
> -	unsigned int i;
> -	struct ibv_mr *mr;
> -
> -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> -			/* Unknown MP, add a new MR for it. */
> -			break;
> -		}
> -		if (txq->mp2mr[i].mp == mp) {
> -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> -			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
> -			return txq->mp2mr[i].lkey;
> -		}
> -	}
> -	/* Add a new entry, register MR first. */
> -	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> -	      (void *)txq, mp->name, (void *)mp);
> -	mr = mlx4_mp2mr(txq->priv->pd, mp);
> -	if (unlikely(mr == NULL)) {
> -		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
> -		      (void *)txq);
> -		return (uint32_t)-1;
> -	}
> -	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
> -		/* Table is full, remove oldest entry. */
> -		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
> -		      (void *)txq);
> -		--i;
> -		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
> -		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
> -			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
> -	}
> -	/* Store the new entry. */
> -	txq->mp2mr[i].mp = mp;
> -	txq->mp2mr[i].mr = mr;
> -	txq->mp2mr[i].lkey = mr->lkey;
> -	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
> -	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
> -	return txq->mp2mr[i].lkey;
> -}
> -
> -/**
>   * Posts a single work request to a send queue.
>   *
>   * @param txq
> diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> index 7d67748..b32fcc9 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.h
> +++ b/drivers/net/mlx4/mlx4_rxtx.h
> @@ -36,6 +36,7 @@
>  
>  #include <stdint.h>
>  #include <sys/queue.h>
> +#include <assert.h>
>  
>  /* Verbs headers do not support -pedantic. */
>  #ifdef PEDANTIC
> @@ -165,7 +166,6 @@ int mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
>  
>  /* mlx4_rxtx.c */
>  
> -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
>  uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
>  		       uint16_t pkts_n);
>  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
> @@ -182,4 +182,35 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
>  			const struct rte_eth_txconf *conf);
>  void mlx4_tx_queue_release(void *dpdk_txq);
>  
> +

Extra empty line, however see below.

> +/**
> + * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
> + * Call mlx4_txq_add_mr() if MP is not registered yet.
> + *
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param[in] mp
> + *   Memory pool for which a memory region lkey must be returned.
> + *
> + * @return
> + *   mr->lkey on success, (uint32_t)-1 on failure.
> + */
> +static inline uint32_t
> +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> +			/* Unknown MP, add a new MR for it. */
> +			break;
> +		}
> +		if (txq->mp2mr[i].mp == mp) {
> +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> +			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
> +			return txq->mp2mr[i].lkey;
> +		}
> +	}
> +	return mlx4_txq_add_mr(txq, mp, i);
> +}

I'm OK with keeping this function as a static inline defined in a header
file; however, I think mlx4_rxtx.h is not the right place for it given that
the remaining MR-handling functions are declared by mlx4.h. You should move
it to that file as well (likely below mlx4_txq_add_mr()).

Note: RTE_DIM() and unlikely() require rte_common.h, NULL requires stddef.h.

While there, I think you should drop both assert() statements and the
reliance on assert.h because such statements shouldn't be part of a static
inline function that is not supposed to fail. Let's assume mp2mr[] is correctly
updated upstream.
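
Putting these suggestions together, the relocated helper could look roughly
like this (a sketch only; struct txq and mlx4_txq_add_mr() are assumed to be
declared earlier in mlx4.h as discussed):

    #include <stddef.h>
    #include <rte_common.h>
    #include <rte_mempool.h>

    static inline uint32_t
    mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
    {
            unsigned int i;

            for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
                    if (unlikely(txq->mp2mr[i].mp == NULL)) {
                            /* Unknown MP, add a new MR for it. */
                            break;
                    }
                    if (txq->mp2mr[i].mp == mp)
                            return txq->mp2mr[i].lkey;
            }
            return mlx4_txq_add_mr(txq, mp, i);
    }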

(Also missing empty line here.)

>  #endif /* MLX4_RXTX_H_ */
> -- 
> 1.8.3.1
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 3/7] net/mlx4: merge Tx path functions
  2017-10-30 10:07     ` [PATCH v3 3/7] net/mlx4: merge Tx path functions Matan Azrad
@ 2017-10-30 14:23       ` Adrien Mazarguil
  2017-10-30 18:12         ` Matan Azrad
  0 siblings, 1 reply; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-30 14:23 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Mon, Oct 30, 2017 at 10:07:25AM +0000, Matan Azrad wrote:
> Merge tx_burst and mlx4_post_send functions to prevent
> double asking about WQ remain space.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>

Except for one nit below,

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 357 +++++++++++++++++++++----------------------
>  1 file changed, 172 insertions(+), 185 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
<snip>
> @@ -477,18 +314,168 @@ struct pv {
>  			} while (tmp != NULL);
>  		}
>  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> -		/* Post the packet for sending. */
> -		err = mlx4_post_send(txq, buf);
> -		if (unlikely(err)) {
> +

Extra empty line.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 4/7] net/mlx4: remove completion counter in Tx burst
  2017-10-30 10:07     ` [PATCH v3 4/7] net/mlx4: remove completion counter in Tx burst Matan Azrad
@ 2017-10-30 14:23       ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-30 14:23 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Mon, Oct 30, 2017 at 10:07:26AM +0000, Matan Azrad wrote:
> Remove usage of variable which count the packets for completion and
> doesn't add more information than packets counter.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 5/7] net/mlx4: separate Tx segment cases
  2017-10-30 10:07     ` [PATCH v3 5/7] net/mlx4: separate Tx segment cases Matan Azrad
@ 2017-10-30 14:23       ` Adrien Mazarguil
  2017-10-30 18:23         ` Matan Azrad
  0 siblings, 1 reply; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-30 14:23 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Mon, Oct 30, 2017 at 10:07:27AM +0000, Matan Azrad wrote:
> Since single segment packets shouldn't use additional memory to
> save segments byte count we can prevent additional memory
> unnecessary usage in this case; Prevent loop management.
> 

Sorry for asking, but I really don't understand the above; can you
reformulate the problem addressed by this patch?

> Call a dedicated function for handling multi segments case.

This sentence is clearer; I'll base my review on what this patch does, not
the reasons behind it.

> Signed-off-by: Matan Azrad <matan@mellanox.com>
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 247 +++++++++++++++++++++++++++----------------
>  1 file changed, 158 insertions(+), 89 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 8ce70d6..8ea8851 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -62,6 +62,9 @@
>  #include "mlx4_rxtx.h"
>  #include "mlx4_utils.h"
>  
> +#define WQE_ONE_DATA_SEG_SIZE \
> +	(sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
> +
>  /**
>   * Pointer-value pair structure used in tx_post_send for saving the first
>   * DWORD (32 byte) of a TXBB.
> @@ -140,22 +143,19 @@ struct pv {
>   * @return
>   *   0 on success, -1 on failure.
>   */
> -static int
> -mlx4_txq_complete(struct txq *txq)
> +static inline int

While likely harmless, I think the addition of this inline keyword is not
related to this patch.

> +mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
> +				struct mlx4_sq *sq)

Looks like an indentation issue, you should align it to the contents of the
opening "(" to match the coding style of this file.

>  {
>  	unsigned int elts_comp = txq->elts_comp;
>  	unsigned int elts_tail = txq->elts_tail;
> -	const unsigned int elts_n = txq->elts_n;
>  	struct mlx4_cq *cq = &txq->mcq;
> -	struct mlx4_sq *sq = &txq->msq;
>  	struct mlx4_cqe *cqe;
>  	uint32_t cons_index = cq->cons_index;
>  	uint16_t new_index;
>  	uint16_t nr_txbbs = 0;
>  	int pkts = 0;
>  
> -	if (unlikely(elts_comp == 0))
> -		return 0;
>  	/*
>  	 * Traverse over all CQ entries reported and handle each WQ entry
>  	 * reported by them.
> @@ -238,6 +238,122 @@ struct pv {
>  	return buf->pool;
>  }
>  
> +static int handle_multi_segs(struct rte_mbuf *buf,
> +			    struct txq *txq,
> +			    struct mlx4_wqe_ctrl_seg **pctrl)

How about naming this function in a way that follows the mlx4_something()
convention?

Here's a suggestion based on how this function remains tied to
mlx4_tx_burst():

 mlx4_tx_burst_seg()

> +{
> +	int wqe_real_size;
> +	int nr_txbbs;
> +	struct pv *pv = (struct pv *)txq->bounce_buf;
> +	struct mlx4_sq *sq = &txq->msq;
> +	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
> +	struct mlx4_wqe_ctrl_seg *ctrl;
> +	struct mlx4_wqe_data_seg *dseg;
> +	uint32_t lkey;
> +	uintptr_t addr;
> +	uint32_t byte_count;
> +	int pv_counter = 0;
> +
> +	/* Calculate the needed work queue entry size for this packet. */
> +	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> +		buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> +	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> +	/*
> +	 * Check that there is room for this WQE in the send queue and that
> +	 * the WQE size is legal.
> +	 */
> +	if (((sq->head - sq->tail) + nr_txbbs +
> +				sq->headroom_txbbs) >= sq->txbb_cnt ||
> +			nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> +		return -1;
> +	}
> +

Extra empty line.

> +	/* Get the control and data entries of the WQE. */
> +	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
> +	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> +			sizeof(struct mlx4_wqe_ctrl_seg));
> +	*pctrl = ctrl;
> +	/* Fill the data segments with buffer information. */
> +	struct rte_mbuf *sbuf;

I'm usually fine with mixing declarations and code when there's a good
reason; however, in this case there's no point. sbuf could have been defined
with the rest at the beginning of the function.

> +

Extra empty line here as well.

> +	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
> +		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> +		rte_prefetch0((volatile void *)addr);
> +		/* Handle WQE wraparound. */
> +		if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
> +			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
> +		dseg->addr = rte_cpu_to_be_64(addr);
> +		/* Memory region key (big endian) for this memory pool. */
> +		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
> +		dseg->lkey = rte_cpu_to_be_32(lkey);
> +#ifndef NDEBUG
> +		/* Calculate the needed work queue entry size for this packet */
> +		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
> +			/* MR does not exist. */
> +			DEBUG("%p: unable to get MP <-> MR association",
> +					(void *)txq);
> +			/*
> +			 * Restamp entry in case of failure.
> +			 * Make sure that size is written correctly
> +			 * Note that we give ownership to the SW, not the HW.
> +			 */
> +			wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> +				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> +			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> +			mlx4_txq_stamp_freed_wqe(sq, head_idx,
> +					(sq->head & sq->txbb_cnt) ? 0 : 1);
> +			return -1;
> +		}
> +#endif /* NDEBUG */
> +		if (likely(sbuf->data_len)) {
> +			byte_count = rte_cpu_to_be_32(sbuf->data_len);
> +		} else {
> +			/*
> +			 * Zero length segment is treated as inline segment
> +			 * with zero data.
> +			 */
> +			byte_count = RTE_BE32(0x80000000);
> +		}
> +		/*
> +		 * If the data segment is not at the beginning of a
> +		 * Tx basic block (TXBB) then write the byte count,
> +		 * else postpone the writing to just before updating the
> +		 * control segment.
> +		 */
> +		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> +			/*
> +			 * Need a barrier here before writing the byte_count
> +			 * fields to make sure that all the data is visible
> +			 * before the byte_count field is set.
> +			 * Otherwise, if the segment begins a new cacheline,
> +			 * the HCA prefetcher could grab the 64-byte chunk and
> +			 * get a valid (!= 0xffffffff) byte count but stale
> +			 * data, and end up sending the wrong data.
> +			 */
> +			rte_io_wmb();
> +			dseg->byte_count = byte_count;
> +		} else {
> +			/*
> +			 * This data segment starts at the beginning of a new
> +			 * TXBB, so we need to postpone its byte_count writing
> +			 * for later.
> +			 */
> +			pv[pv_counter].dseg = dseg;
> +			pv[pv_counter++].val = byte_count;
> +		}
> +	}
> +	/* Write the first DWORD of each TXBB save earlier. */
> +	if (pv_counter) {
> +		/* Need a barrier here before writing the byte_count. */
> +		rte_io_wmb();
> +		for (--pv_counter; pv_counter  >= 0; pv_counter--)
> +			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
> +	}
> +	/* Fill the control parameters for this packet. */
> +	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> +

Extra empty line.

> +	return nr_txbbs;
> +}
>  /**
>   * DPDK callback for Tx.
>   *
> @@ -261,10 +377,11 @@ struct pv {
>  	unsigned int i;
>  	unsigned int max;
>  	struct mlx4_sq *sq = &txq->msq;
> -	struct pv *pv = (struct pv *)txq->bounce_buf;
> +	int nr_txbbs;
>  
>  	assert(txq->elts_comp_cd != 0);
> -	mlx4_txq_complete(txq);
> +	if (likely(txq->elts_comp != 0))
> +		mlx4_txq_complete(txq, elts_n, sq);
>  	max = (elts_n - (elts_head - txq->elts_tail));
>  	if (max > elts_n)
>  		max -= elts_n;
> @@ -283,7 +400,6 @@ struct pv {
>  		uint32_t owner_opcode = MLX4_OPCODE_SEND;
>  		struct mlx4_wqe_ctrl_seg *ctrl;
>  		struct mlx4_wqe_data_seg *dseg;
> -		struct rte_mbuf *sbuf;
>  		union {
>  			uint32_t flags;
>  			uint16_t flags16[2];
> @@ -291,10 +407,6 @@ struct pv {
>  		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
>  		uint32_t lkey;
>  		uintptr_t addr;
> -		uint32_t byte_count;
> -		int wqe_real_size;
> -		int nr_txbbs;
> -		int pv_counter = 0;
>  
>  		/* Clean up old buffer. */
>  		if (likely(elt->buf != NULL)) {
> @@ -313,40 +425,29 @@ struct pv {
>  			} while (tmp != NULL);
>  		}
>  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> -
> -		/*
> -		 * Calculate the needed work queue entry size
> -		 * for this packet.
> -		 */
> -		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> -				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> -		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> -		/*
> -		 * Check that there is room for this WQE in the send
> -		 * queue and that the WQE size is legal.
> -		 */
> -		if (((sq->head - sq->tail) + nr_txbbs +
> -		     sq->headroom_txbbs) >= sq->txbb_cnt ||
> -		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> -			elt->buf = NULL;
> -			break;
> -		}
> -		/* Get the control and data entries of the WQE. */
> -		ctrl = (struct mlx4_wqe_ctrl_seg *)
> -				mlx4_get_send_wqe(sq, head_idx);
> -		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> -				sizeof(struct mlx4_wqe_ctrl_seg));
> -		/* Fill the data segments with buffer information. */
> -		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
> -			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> +		if (buf->nb_segs == 1) {
> +			/*
> +			 * Check that there is room for this WQE in the send
> +			 * queue and that the WQE size is legal
> +			 */
> +			if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs) >=
> +			     sq->txbb_cnt || 1 > MLX4_MAX_WQE_TXBBS) {
> +				elt->buf = NULL;
> +				break;
> +			}
> +			/* Get the control and data entries of the WQE. */
> +			ctrl = (struct mlx4_wqe_ctrl_seg *)
> +					mlx4_get_send_wqe(sq, head_idx);
> +			dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> +					sizeof(struct mlx4_wqe_ctrl_seg));
> +			addr = rte_pktmbuf_mtod(buf, uintptr_t);
>  			rte_prefetch0((volatile void *)addr);
>  			/* Handle WQE wraparound. */
> -			if (unlikely(dseg >=
> -			    (struct mlx4_wqe_data_seg *)sq->eob))
> +			if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)

Ideally this change should have been on its own in a fix commit.

>  				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
>  			dseg->addr = rte_cpu_to_be_64(addr);
>  			/* Memory region key (big endian). */
> -			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
> +			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
>  			dseg->lkey = rte_cpu_to_be_32(lkey);
>  #ifndef NDEBUG
>  			if (unlikely(dseg->lkey ==
> @@ -360,61 +461,28 @@ struct pv {
>  				 * Note that we give ownership to the SW,
>  				 * not the HW.
>  				 */
> -				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> +				ctrl->fence_size =
> +					(WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
>  				mlx4_txq_stamp_freed_wqe(sq, head_idx,
>  					     (sq->head & sq->txbb_cnt) ? 0 : 1);
>  				elt->buf = NULL;
>  				break;
>  			}
>  #endif /* NDEBUG */
> -			if (likely(sbuf->data_len)) {
> -				byte_count = rte_cpu_to_be_32(sbuf->data_len);
> -			} else {
> -				/*
> -				 * Zero length segment is treated as inline
> -				 * segment with zero data.
> -				 */
> -				byte_count = RTE_BE32(0x80000000);
> -			}
> -			/*
> -			 * If the data segment is not at the beginning
> -			 * of a Tx basic block (TXBB) then write the
> -			 * byte count, else postpone the writing to
> -			 * just before updating the control segment.
> -			 */
> -			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> -				/*
> -				 * Need a barrier here before writing the
> -				 * byte_count fields to make sure that all the
> -				 * data is visible before the byte_count field
> -				 * is set. otherwise, if the segment begins a
> -				 * new cacheline, the HCA prefetcher could grab
> -				 * the 64-byte chunk and get a valid
> -				 * (!= 0xffffffff) byte count but stale data,
> -				 * and end up sending the wrong data.
> -				 */
> -				rte_io_wmb();
> -				dseg->byte_count = byte_count;
> -			} else {
> -				/*
> -				 * This data segment starts at the beginning of
> -				 * a new TXBB, so we need to postpone its
> -				 * byte_count writing for later.
> -				 */
> -				pv[pv_counter].dseg = dseg;
> -				pv[pv_counter++].val = byte_count;
> -			}
> -		}
> -		/* Write the first DWORD of each TXBB save earlier. */
> -		if (pv_counter) {
> -			/* Need a barrier before writing the byte_count. */
> +			/* Need a barrier here before byte count store. */
>  			rte_io_wmb();
> -			for (--pv_counter; pv_counter  >= 0; pv_counter--)
> -				pv[pv_counter].dseg->byte_count =
> -						pv[pv_counter].val;
> +			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
> +

Extra empty line.

> +			/* Fill the control parameters for this packet. */
> +			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
> +			nr_txbbs = 1;
> +		} else {
> +			nr_txbbs = handle_multi_segs(buf, txq, &ctrl);
> +			if (nr_txbbs < 0) {
> +				elt->buf = NULL;
> +				break;
> +			}
>  		}
> -		/* Fill the control parameters for this packet. */
> -		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
>  		/*
>  		 * For raw Ethernet, the SOLICIT flag is used to indicate
>  		 * that no ICRC should be calculated.
> @@ -469,6 +537,7 @@ struct pv {
>  		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
>  					      ((sq->head & sq->txbb_cnt) ?
>  						       MLX4_BIT_WQE_OWN : 0));
> +

Extra empty line.

>  		sq->head += nr_txbbs;
>  		elt->buf = buf;
>  		bytes_sent += buf->pkt_len;
> -- 
> 1.8.3.1
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers
  2017-10-30 10:07     ` [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers Matan Azrad
@ 2017-10-30 14:23       ` Adrien Mazarguil
  2017-10-30 19:47         ` Matan Azrad
  0 siblings, 1 reply; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-30 14:23 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Mon, Oct 30, 2017 at 10:07:28AM +0000, Matan Azrad wrote:
> Replace most of the memory barriers by compiler barriers since they are
> all targeted to the DRAM; This improves code efficiency for systems
> which force store order between different addresses.
> 
> Only the doorbell record store should be protected by memory barrier
> since it is targeted to the PCI memory domain.
> 
> Limit pre byte count store compiler barrier for systems with cache line
> size smaller than 64B (TXBB size).
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>

This sounds like an interesting performance improvement; can you share the
typical or expected gain (percentage/hard numbers) for a given use case as
part of the commit log?

More comments below.

> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 11 ++++++-----
>  1 file changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 8ea8851..482c399 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -168,7 +168,7 @@ struct pv {
>  		/*
>  		 * Make sure we read the CQE after we read the ownership bit.
>  		 */
> -		rte_rmb();
> +		rte_io_rmb();

OK for this one since the rest of the code should not be run due to the
condition (I'm not even sure a compiler barrier is necessary at all
here).

>  #ifndef NDEBUG
>  		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
>  			     MLX4_CQE_OPCODE_ERROR)) {
> @@ -203,7 +203,7 @@ struct pv {
>  	 */
>  	cq->cons_index = cons_index;
>  	*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & MLX4_CQ_DB_CI_MASK);
> -	rte_wmb();
> +	rte_io_wmb();

This one could be removed entirely as well, which is more or less what the
move to a compiler barrier does. Nothing in subsequent code depends on this
doorbell being written, so it can piggyback on any subsequent rte_wmb().

On the other hand, in my opinion, a barrier (compiler or otherwise) might be
needed before the doorbell write, to make clear it cannot somehow be moved
earlier in case something attempts to optimize it away.
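
A sketch of what that could look like (reusing names from the hunk above,
not a tested change), with the barrier moved before the doorbell store and
the one after it dropped:

    cq->cons_index = cons_index;
    /* Keep the CI doorbell store from being moved any earlier. */
    rte_io_wmb();
    *cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & MLX4_CQ_DB_CI_MASK);
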

>  	sq->tail = sq->tail + nr_txbbs;
>  	/* Update the list of packets posted for transmission. */
>  	elts_comp -= pkts;
> @@ -321,6 +321,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
>  		 * control segment.
>  		 */
>  		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> +#if RTE_CACHE_LINE_SIZE < 64
>  			/*
>  			 * Need a barrier here before writing the byte_count
>  			 * fields to make sure that all the data is visible
> @@ -331,6 +332,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
>  			 * data, and end up sending the wrong data.
>  			 */
>  			rte_io_wmb();
> +#endif /* RTE_CACHE_LINE_SIZE */

Interesting one.

>  			dseg->byte_count = byte_count;
>  		} else {
>  			/*
> @@ -469,8 +471,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
>  				break;
>  			}
>  #endif /* NDEBUG */
> -			/* Need a barrier here before byte count store. */
> -			rte_io_wmb();
> +			/* Never be TXBB aligned, no need compiler barrier. */

The reason there was a barrier here at all was unclear, so if it's really
useless, you don't even need to describe why.

>  			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
>  
>  			/* Fill the control parameters for this packet. */
> @@ -533,7 +534,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
>  		 * setting ownership bit (because HW can start
>  		 * executing as soon as we do).
>  		 */
> -		rte_wmb();
> +		rte_io_wmb();

This one looks dangerous. A compiler barrier is not strong enough to
guarantee the order in which the CPU will execute instructions; it only
makes sure that what follows the barrier doesn't appear before it in the
generated code.

Unless the comment above this barrier is wrong, this change may cause
hard-to-debug issues down the road; you should drop it.

>  		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
>  					      ((sq->head & sq->txbb_cnt) ?
>  						       MLX4_BIT_WQE_OWN : 0));
> -- 
> 1.8.3.1
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 7/7] net/mlx4: remove empty Tx segment support
  2017-10-30 10:07     ` [PATCH v3 7/7] net/mlx4: remove empty Tx segment support Matan Azrad
@ 2017-10-30 14:24       ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-30 14:24 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Mon, Oct 30, 2017 at 10:07:29AM +0000, Matan Azrad wrote:
> Move empty segment case processing to debug mode.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>

Whoa, I think there's a misunderstanding here. Nothing prevents applications
from attempting to send zero-length segments, and the PMD must survive this
somehow.

I think this commit should be dropped; more below.

> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 9 ++++++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 482c399..c005a41 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -305,15 +305,18 @@ static int handle_multi_segs(struct rte_mbuf *buf,
>  			return -1;
>  		}
>  #endif /* NDEBUG */
> -		if (likely(sbuf->data_len)) {
> -			byte_count = rte_cpu_to_be_32(sbuf->data_len);
> -		} else {
> +		byte_count = rte_cpu_to_be_32(sbuf->data_len);
> +#ifndef NDEBUG
> +		if (unlikely(!sbuf->data_len)) {
> +			DEBUG("%p: Empty segment is not allowed",
> +					(void *)txq);
>  			/*
>  			 * Zero length segment is treated as inline segment
>  			 * with zero data.
>  			 */
>  			byte_count = RTE_BE32(0x80000000);
>  		}
> +#endif /* NDEBUG */

This change means that outside of debug mode, according to the PRM, a
zero-length segment is interpreted as containing 2 GiB worth of data, which
guarantees some sort of crash.

To properly enforce such a limitation, you'd need a check (possibly
unlikely()) to reject the packet and stop the TX function at this point
anyway. Such a check negates any kind of optimization brought by this
commit, as small as it is.
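
For reference, the kind of guard that would be needed looks roughly like
this (a sketch only, reusing names from the quoted helper; returning -1
follows its existing error convention):

    if (unlikely(!sbuf->data_len)) {
            /* Reject the packet rather than let the HW read a zero
             * byte_count as a 2 GiB inline segment. */
            return -1;
    }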

You'd better leave the existing code unmodified in my opinion.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 1/7] net/mlx4: remove error flows from Tx fast path
  2017-10-30 14:23       ` Adrien Mazarguil
@ 2017-10-30 18:11         ` Matan Azrad
  2017-10-31 10:16           ` Adrien Mazarguil
  0 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 18:11 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Hi Adrien

> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Monday, October 30, 2017 4:23 PM
> To: Matan Azrad <matan@mellanox.com>
> Cc: dev@dpdk.org; Ophir Munk <ophirmu@mellanox.com>
> Subject: Re: [PATCH v3 1/7] net/mlx4: remove error flows from Tx fast path
> 
> On Mon, Oct 30, 2017 at 10:07:23AM +0000, Matan Azrad wrote:
> > Move unnecessary error flows to DEBUG mode.
> >
> > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> 
> I missed a couple of details while reviewing the original version, the first one
> being mlx4_post_send()'s return value is still documented as updating
> rte_errno in case of error, it's not the case anymore after this patch.
> 
Good catch; it will be fixed in the next version.

> Please see below for the other one:
> 
> > ---
> >  drivers/net/mlx4/mlx4_rxtx.c | 16 ++++++----------
> >  1 file changed, 6 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > b/drivers/net/mlx4/mlx4_rxtx.c
> <snip>
> >  /**
> > @@ -510,8 +508,6 @@ struct pv {
> >  	assert(max <= elts_n);
> >  	/* Always leave one free entry in the ring. */
> >  	--max;
> > -	if (max == 0)
> > -		return 0;
> >  	if (max > pkts_n)
> >  		max = pkts_n;
> >  	for (i = 0; (i != max); ++i) {
> 
> While minor, this change has nothing to do with this patch, right?
> 
Yes, you're right; maybe it can be merged into patch 4/7.
 
> I think it can slightly degrade an application performance as it removes the
> guarantee that subsequent code only needs to be run if there is at least one
> packet to process in case the TX ring is constantly full (SW faster than HW).
>

When the TX ring is full, the loop condition fails right away and the function returns 0 because the packet counter is 0 (two extra checks).
Since this case is (in my opinion) less common than having at least one free slot in the ring, we can avoid these unnecessary checks in the common case.

Are you sure the two extra checks matter for performance in this case? Won't the application simply call us again?
 
> Can you remove it?
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 3/7] net/mlx4: merge Tx path functions
  2017-10-30 14:23       ` Adrien Mazarguil
@ 2017-10-30 18:12         ` Matan Azrad
  0 siblings, 0 replies; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 18:12 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Ok, thanks!

> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Monday, October 30, 2017 4:24 PM
> To: Matan Azrad <matan@mellanox.com>
> Cc: dev@dpdk.org; Ophir Munk <ophirmu@mellanox.com>
> Subject: Re: [PATCH v3 3/7] net/mlx4: merge Tx path functions
> 
> On Mon, Oct 30, 2017 at 10:07:25AM +0000, Matan Azrad wrote:
> > Merge tx_burst and mlx4_post_send functions to prevent double asking
> > about WQ remain space.
> >
> > Signed-off-by: Matan Azrad <matan@mellanox.com>
> 
> Except for one nit below,
> 
> Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> 
> > ---
> >  drivers/net/mlx4/mlx4_rxtx.c | 357
> > +++++++++++++++++++++----------------------
> >  1 file changed, 172 insertions(+), 185 deletions(-)
> >
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > b/drivers/net/mlx4/mlx4_rxtx.c
> <snip>
> > @@ -477,18 +314,168 @@ struct pv {
> >  			} while (tmp != NULL);
> >  		}
> >  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> > -		/* Post the packet for sending. */
> > -		err = mlx4_post_send(txq, buf);
> > -		if (unlikely(err)) {
> > +
> 
> Extra empty line.
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 5/7] net/mlx4: separate Tx segment cases
  2017-10-30 14:23       ` Adrien Mazarguil
@ 2017-10-30 18:23         ` Matan Azrad
  2017-10-31 10:17           ` Adrien Mazarguil
  0 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 18:23 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Hi Adrien

> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Monday, October 30, 2017 4:24 PM
> To: Matan Azrad <matan@mellanox.com>
> Cc: dev@dpdk.org; Ophir Munk <ophirmu@mellanox.com>
> Subject: Re: [PATCH v3 5/7] net/mlx4: separate Tx segment cases
> 
> On Mon, Oct 30, 2017 at 10:07:27AM +0000, Matan Azrad wrote:
> > Since single segment packets shouldn't use additional memory to save
> > segments byte count we can prevent additional memory unnecessary
> usage
> > in this case; Prevent loop management.
> >
> 
> Sorry for asking but I really don't understand the above, can you reformulate
> the problem addressed by this patch?
> 
How about the following?
Optimize the single-segment case by processing it in dedicated code, which avoids checks and calculations that are only relevant to the multi-segment case.

> > Call a dedicated function for handling multi segments case.
> 
> This sentence is clearer, I'll base my review on what this patch does, not the
> reasons behind it.
> 
> > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> > ---
> >  drivers/net/mlx4/mlx4_rxtx.c | 247
> > +++++++++++++++++++++++++++----------------
> >  1 file changed, 158 insertions(+), 89 deletions(-)
> >
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > b/drivers/net/mlx4/mlx4_rxtx.c index 8ce70d6..8ea8851 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > @@ -62,6 +62,9 @@
> >  #include "mlx4_rxtx.h"
> >  #include "mlx4_utils.h"
> >
> > +#define WQE_ONE_DATA_SEG_SIZE \
> > +	(sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct
> > +mlx4_wqe_data_seg))
> > +
> >  /**
> >   * Pointer-value pair structure used in tx_post_send for saving the first
> >   * DWORD (32 byte) of a TXBB.
> > @@ -140,22 +143,19 @@ struct pv {
> >   * @return
> >   *   0 on success, -1 on failure.
> >   */
> > -static int
> > -mlx4_txq_complete(struct txq *txq)
> > +static inline int
> 
> While likely harmless, I think the addition of this inline keyword is not related
> to this patch.
> 
Yes, you're right; this will be fixed in the next version.

> > +mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
> > +				struct mlx4_sq *sq)
> 
> Looks like an indentation issue, you should align it to the contents of the
> opening "(" to match the coding style of this file.
> 
OK.
> >  {
> >  	unsigned int elts_comp = txq->elts_comp;
> >  	unsigned int elts_tail = txq->elts_tail;
> > -	const unsigned int elts_n = txq->elts_n;
> >  	struct mlx4_cq *cq = &txq->mcq;
> > -	struct mlx4_sq *sq = &txq->msq;
> >  	struct mlx4_cqe *cqe;
> >  	uint32_t cons_index = cq->cons_index;
> >  	uint16_t new_index;
> >  	uint16_t nr_txbbs = 0;
> >  	int pkts = 0;
> >
> > -	if (unlikely(elts_comp == 0))
> > -		return 0;
> >  	/*
> >  	 * Traverse over all CQ entries reported and handle each WQ entry
> >  	 * reported by them.
> > @@ -238,6 +238,122 @@ struct pv {
> >  	return buf->pool;
> >  }
> >
> > +static int handle_multi_segs(struct rte_mbuf *buf,
> > +			    struct txq *txq,
> > +			    struct mlx4_wqe_ctrl_seg **pctrl)
> 
> How about naming this function in a way that follows the mlx4_something()
> convention?
> 
> Here's a suggestion based on how this function remains tied to
> mlx4_tx_burst():
> 
>  mlx4_tx_burst_seg()
> 
Good, thanks!

> > +{
> > +	int wqe_real_size;
> > +	int nr_txbbs;
> > +	struct pv *pv = (struct pv *)txq->bounce_buf;
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
> > +	struct mlx4_wqe_ctrl_seg *ctrl;
> > +	struct mlx4_wqe_data_seg *dseg;
> > +	uint32_t lkey;
> > +	uintptr_t addr;
> > +	uint32_t byte_count;
> > +	int pv_counter = 0;
> > +
> > +	/* Calculate the needed work queue entry size for this packet. */
> > +	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> > +		buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> > +	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> > +	/*
> > +	 * Check that there is room for this WQE in the send queue and that
> > +	 * the WQE size is legal.
> > +	 */
> > +	if (((sq->head - sq->tail) + nr_txbbs +
> > +				sq->headroom_txbbs) >= sq->txbb_cnt ||
> > +			nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> > +		return -1;
> > +	}
> > +
> 
> Extra empty line.
> 
> > +	/* Get the control and data entries of the WQE. */
> > +	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq,
> head_idx);
> > +	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> > +			sizeof(struct mlx4_wqe_ctrl_seg));
> > +	*pctrl = ctrl;
> > +	/* Fill the data segments with buffer information. */
> > +	struct rte_mbuf *sbuf;
> 
> I'm usually fine with mixing declarations and code when there's a good
> reason, however in this case there's no point. sbuf could have been defined
> with the rest at the beginning of the function.
>

OK.
 
> > +
> 
> Extra empty line here as well.
> 
> > +	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
> > +		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> > +		rte_prefetch0((volatile void *)addr);
> > +		/* Handle WQE wraparound. */
> > +		if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
> > +			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
> > +		dseg->addr = rte_cpu_to_be_64(addr);
> > +		/* Memory region key (big endian) for this memory pool. */
> > +		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
> > +		dseg->lkey = rte_cpu_to_be_32(lkey); #ifndef NDEBUG
> > +		/* Calculate the needed work queue entry size for this
> packet */
> > +		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
> > +			/* MR does not exist. */
> > +			DEBUG("%p: unable to get MP <-> MR association",
> > +					(void *)txq);
> > +			/*
> > +			 * Restamp entry in case of failure.
> > +			 * Make sure that size is written correctly
> > +			 * Note that we give ownership to the SW, not the
> HW.
> > +			 */
> > +			wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> > +				buf->nb_segs * sizeof(struct
> mlx4_wqe_data_seg);
> > +			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> > +			mlx4_txq_stamp_freed_wqe(sq, head_idx,
> > +					(sq->head & sq->txbb_cnt) ? 0 : 1);
> > +			return -1;
> > +		}
> > +#endif /* NDEBUG */
> > +		if (likely(sbuf->data_len)) {
> > +			byte_count = rte_cpu_to_be_32(sbuf->data_len);
> > +		} else {
> > +			/*
> > +			 * Zero length segment is treated as inline segment
> > +			 * with zero data.
> > +			 */
> > +			byte_count = RTE_BE32(0x80000000);
> > +		}
> > +		/*
> > +		 * If the data segment is not at the beginning of a
> > +		 * Tx basic block (TXBB) then write the byte count,
> > +		 * else postpone the writing to just before updating the
> > +		 * control segment.
> > +		 */
> > +		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> > +			/*
> > +			 * Need a barrier here before writing the byte_count
> > +			 * fields to make sure that all the data is visible
> > +			 * before the byte_count field is set.
> > +			 * Otherwise, if the segment begins a new cacheline,
> > +			 * the HCA prefetcher could grab the 64-byte chunk
> and
> > +			 * get a valid (!= 0xffffffff) byte count but stale
> > +			 * data, and end up sending the wrong data.
> > +			 */
> > +			rte_io_wmb();
> > +			dseg->byte_count = byte_count;
> > +		} else {
> > +			/*
> > +			 * This data segment starts at the beginning of a new
> > +			 * TXBB, so we need to postpone its byte_count
> writing
> > +			 * for later.
> > +			 */
> > +			pv[pv_counter].dseg = dseg;
> > +			pv[pv_counter++].val = byte_count;
> > +		}
> > +	}
> > +	/* Write the first DWORD of each TXBB save earlier. */
> > +	if (pv_counter) {
> > +		/* Need a barrier here before writing the byte_count. */
> > +		rte_io_wmb();
> > +		for (--pv_counter; pv_counter  >= 0; pv_counter--)
> > +			pv[pv_counter].dseg->byte_count =
> pv[pv_counter].val;
> > +	}
> > +	/* Fill the control parameters for this packet. */
> > +	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> > +
> 
> Extra empty line.
> 
> > +	return nr_txbbs;
> > +}
> >  /**
> >   * DPDK callback for Tx.
> >   *
> > @@ -261,10 +377,11 @@ struct pv {
> >  	unsigned int i;
> >  	unsigned int max;
> >  	struct mlx4_sq *sq = &txq->msq;
> > -	struct pv *pv = (struct pv *)txq->bounce_buf;
> > +	int nr_txbbs;
> >
> >  	assert(txq->elts_comp_cd != 0);
> > -	mlx4_txq_complete(txq);
> > +	if (likely(txq->elts_comp != 0))
> > +		mlx4_txq_complete(txq, elts_n, sq);
> >  	max = (elts_n - (elts_head - txq->elts_tail));
> >  	if (max > elts_n)
> >  		max -= elts_n;
> > @@ -283,7 +400,6 @@ struct pv {
> >  		uint32_t owner_opcode = MLX4_OPCODE_SEND;
> >  		struct mlx4_wqe_ctrl_seg *ctrl;
> >  		struct mlx4_wqe_data_seg *dseg;
> > -		struct rte_mbuf *sbuf;
> >  		union {
> >  			uint32_t flags;
> >  			uint16_t flags16[2];
> > @@ -291,10 +407,6 @@ struct pv {
> >  		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
> >  		uint32_t lkey;
> >  		uintptr_t addr;
> > -		uint32_t byte_count;
> > -		int wqe_real_size;
> > -		int nr_txbbs;
> > -		int pv_counter = 0;
> >
> >  		/* Clean up old buffer. */
> >  		if (likely(elt->buf != NULL)) {
> > @@ -313,40 +425,29 @@ struct pv {
> >  			} while (tmp != NULL);
> >  		}
> >  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> > -
> > -		/*
> > -		 * Calculate the needed work queue entry size
> > -		 * for this packet.
> > -		 */
> > -		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> > -				buf->nb_segs * sizeof(struct
> mlx4_wqe_data_seg);
> > -		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> > -		/*
> > -		 * Check that there is room for this WQE in the send
> > -		 * queue and that the WQE size is legal.
> > -		 */
> > -		if (((sq->head - sq->tail) + nr_txbbs +
> > -		     sq->headroom_txbbs) >= sq->txbb_cnt ||
> > -		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> > -			elt->buf = NULL;
> > -			break;
> > -		}
> > -		/* Get the control and data entries of the WQE. */
> > -		ctrl = (struct mlx4_wqe_ctrl_seg *)
> > -				mlx4_get_send_wqe(sq, head_idx);
> > -		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> > -				sizeof(struct mlx4_wqe_ctrl_seg));
> > -		/* Fill the data segments with buffer information. */
> > -		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
> > -			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> > +		if (buf->nb_segs == 1) {
> > +			/*
> > +			 * Check that there is room for this WQE in the send
> > +			 * queue and that the WQE size is legal
> > +			 */
> > +			if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs)
> >=
> > +			     sq->txbb_cnt || 1 > MLX4_MAX_WQE_TXBBS) {
> > +				elt->buf = NULL;
> > +				break;
> > +			}
> > +			/* Get the control and data entries of the WQE. */
> > +			ctrl = (struct mlx4_wqe_ctrl_seg *)
> > +					mlx4_get_send_wqe(sq, head_idx);
> > +			dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl
> +
> > +					sizeof(struct mlx4_wqe_ctrl_seg));
> > +			addr = rte_pktmbuf_mtod(buf, uintptr_t);
> >  			rte_prefetch0((volatile void *)addr);
> >  			/* Handle WQE wraparound. */
> > -			if (unlikely(dseg >=
> > -			    (struct mlx4_wqe_data_seg *)sq->eob))
> > +			if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
> 
> Ideally this change should have been on its own in a fix commit.
> 
> >  				dseg = (struct mlx4_wqe_data_seg *)sq-
> >buf;
> >  			dseg->addr = rte_cpu_to_be_64(addr);
> >  			/* Memory region key (big endian). */
> > -			lkey = mlx4_txq_mp2mr(txq,
> mlx4_txq_mb2mp(sbuf));
> > +			lkey = mlx4_txq_mp2mr(txq,
> mlx4_txq_mb2mp(buf));
> >  			dseg->lkey = rte_cpu_to_be_32(lkey);  #ifndef
> NDEBUG
> >  			if (unlikely(dseg->lkey ==
> > @@ -360,61 +461,28 @@ struct pv {
> >  				 * Note that we give ownership to the SW,
> >  				 * not the HW.
> >  				 */
> > -				ctrl->fence_size = (wqe_real_size >> 4) &
> 0x3f;
> > +				ctrl->fence_size =
> > +					(WQE_ONE_DATA_SEG_SIZE >> 4) &
> 0x3f;
> >  				mlx4_txq_stamp_freed_wqe(sq, head_idx,
> >  					     (sq->head & sq->txbb_cnt) ? 0 : 1);
> >  				elt->buf = NULL;
> >  				break;
> >  			}
> >  #endif /* NDEBUG */
> > -			if (likely(sbuf->data_len)) {
> > -				byte_count = rte_cpu_to_be_32(sbuf->data_len);
> > -			} else {
> > -				/*
> > -				 * Zero length segment is treated as inline
> > -				 * segment with zero data.
> > -				 */
> > -				byte_count = RTE_BE32(0x80000000);
> > -			}
> > -			/*
> > -			 * If the data segment is not at the beginning
> > -			 * of a Tx basic block (TXBB) then write the
> > -			 * byte count, else postpone the writing to
> > -			 * just before updating the control segment.
> > -			 */
> > -			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> > -				/*
> > -				 * Need a barrier here before writing the
> > -				 * byte_count fields to make sure that all the
> > -				 * data is visible before the byte_count field
> > -				 * is set. otherwise, if the segment begins a
> > -				 * new cacheline, the HCA prefetcher could grab
> > -				 * the 64-byte chunk and get a valid
> > -				 * (!= 0xffffffff) byte count but stale data,
> > -				 * and end up sending the wrong data.
> > -				 */
> > -				rte_io_wmb();
> > -				dseg->byte_count = byte_count;
> > -			} else {
> > -				/*
> > -				 * This data segment starts at the beginning of
> > -				 * a new TXBB, so we need to postpone its
> > -				 * byte_count writing for later.
> > -				 */
> > -				pv[pv_counter].dseg = dseg;
> > -				pv[pv_counter++].val = byte_count;
> > -			}
> > -		}
> > -		/* Write the first DWORD of each TXBB save earlier. */
> > -		if (pv_counter) {
> > -			/* Need a barrier before writing the byte_count. */
> > +			/* Need a barrier here before byte count store. */
> >  			rte_io_wmb();
> > -			for (--pv_counter; pv_counter  >= 0; pv_counter--)
> > -				pv[pv_counter].dseg->byte_count =
> > -						pv[pv_counter].val;
> > +			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
> > +
> 
> Extra empty line.
> 
> > +			/* Fill the control parameters for this packet. */
> > +			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
> > +			nr_txbbs = 1;
> > +		} else {
> > +			nr_txbbs = handle_multi_segs(buf, txq, &ctrl);
> > +			if (nr_txbbs < 0) {
> > +				elt->buf = NULL;
> > +				break;
> > +			}
> >  		}
> > -		/* Fill the control parameters for this packet. */
> > -		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> >  		/*
> >  		 * For raw Ethernet, the SOLICIT flag is used to indicate
> >  		 * that no ICRC should be calculated.
> > @@ -469,6 +537,7 @@ struct pv {
> >  		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
> >  					      ((sq->head & sq->txbb_cnt) ?
> >  						       MLX4_BIT_WQE_OWN : 0));
> > +
> 
> Extra empty line.
> 
> >  		sq->head += nr_txbbs;
> >  		elt->buf = buf;
> >  		bytes_sent += buf->pkt_len;
> > --
> > 1.8.3.1
> >
> 
> --
> Adrien Mazarguil
> 6WIND

Thanks!

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers
  2017-10-30 14:23       ` Adrien Mazarguil
@ 2017-10-30 19:47         ` Matan Azrad
  2017-10-31 10:17           ` Adrien Mazarguil
  0 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-30 19:47 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Hi Adrien

> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Monday, October 30, 2017 4:24 PM
> To: Matan Azrad <matan@mellanox.com>
> Cc: dev@dpdk.org; Ophir Munk <ophirmu@mellanox.com>
> Subject: Re: [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers
> 
> On Mon, Oct 30, 2017 at 10:07:28AM +0000, Matan Azrad wrote:
> > Replace most of the memory barriers by compiler barriers since they
> > are all targeted to the DRAM; This improves code efficiency for
> > systems which force store order between different addresses.
> >
> > Only the doorbell record store should be protected by memory barrier
> > since it is targeted to the PCI memory domain.
> >
> > Limit pre byte count store compiler barrier for systems with cache
> > line size smaller than 64B (TXBB size).
> >
> > Signed-off-by: Matan Azrad <matan@mellanox.com>
> 
> This sounds like an interesting performance improvement, can you share the
> typical or expected amount (percentage/hard numbers) for a given use case
> as part of the commit log?
> 

Yes, it improves performance, I will share numbers.

> More comments below.
> 
> > ---
> >  drivers/net/mlx4/mlx4_rxtx.c | 11 ++++++-----
> >  1 file changed, 6 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > b/drivers/net/mlx4/mlx4_rxtx.c index 8ea8851..482c399 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > @@ -168,7 +168,7 @@ struct pv {
> >  		/*
> >  		 * Make sure we read the CQE after we read the ownership bit.
> >  		 */
> > -		rte_rmb();
> > +		rte_io_rmb();
> 
> OK for this one since the rest of the code should not be run due to the
> condition (I'm not even sure even a compiler barrier is necessary at all here).
> 
> >  #ifndef NDEBUG
> >  		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
> >  			     MLX4_CQE_OPCODE_ERROR)) {
> > @@ -203,7 +203,7 @@ struct pv {
> >  	 */
> >  	cq->cons_index = cons_index;
> >  	*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & MLX4_CQ_DB_CI_MASK);
> > -	rte_wmb();
> > +	rte_io_wmb();
> 
> This one could be removed entirely as well, which is more or less what the
> move to a compiler barrier does. Nothing in subsequent code depends on
> this doorbell being written, so this can piggy back on any subsequent
> rte_wmb().

Yes, you're right; this code was probably carried over from a multi-threaded implementation.
> 
> On the other hand in my opinion a barrier (compiler or otherwise) might be
> needed before the doorbell write, to make clear it cannot somehow be done
> earlier in case something attempts to optimize it away.
> 
I think we can remove it entirely (the compiler cannot move or optimize away the ci_db store, since it depends on the previously computed cons_index).

> >  	sq->tail = sq->tail + nr_txbbs;
> >  	/* Update the list of packets posted for transmission. */
> >  	elts_comp -= pkts;
> > @@ -321,6 +321,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
> >  		 * control segment.
> >  		 */
> >  		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> > +#if RTE_CACHE_LINE_SIZE < 64
> >  			/*
> >  			 * Need a barrier here before writing the byte_count
> >  			 * fields to make sure that all the data is visible
> > @@ -331,6 +332,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
> >  			 * data, and end up sending the wrong data.
> >  			 */
> >  			rte_io_wmb();
> > +#endif /* RTE_CACHE_LINE_SIZE */
> 
> Interesting one.
> 
> >  			dseg->byte_count = byte_count;
> >  		} else {
> >  			/*
> > @@ -469,8 +471,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
> >  				break;
> >  			}
> >  #endif /* NDEBUG */
> > -			/* Need a barrier here before byte count store. */
> > -			rte_io_wmb();
> > +			/* Never be TXBB aligned, no need compiler barrier. */
> 
> The reason there was a barrier here at all was unclear, so if it's really useless,
> you don't even need to describe why.

It is there because the multi-segment path has a barrier at the equivalent stage.
I think the comment can help future review.

> 
> >  			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
> >
> >  			/* Fill the control parameters for this packet. */
> > @@ -533,7 +534,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
> >  		 * setting ownership bit (because HW can start
> >  		 * executing as soon as we do).
> >  		 */
> > -		rte_wmb();
> > +		rte_io_wmb();
> 
> This one looks dangerous. A compiler barrier is not strong enough to
> guarantee the order in which CPU will execute instructions, it only makes
> sure what follows the barrier doesn't appear before it in the generated code.
> 
As far as I investigated, for CPUs which do not preserve store order between different addresses (arm, ppc), rte_io_wmb() is mapped to rte_wmb().
For those which do preserve it (x86) we only need the right order in the compiled code, because all the relevant stores target the same memory domain (DRAM), so the actual store order is guaranteed as well.
The doorbell store is different: it is directed to another memory domain (PCI).
So the only place which needs rte_wmb() is before the doorbell write; see the sketch below.

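To make the intent concrete, here is a minimal sketch of that policy (illustrative only, not the actual driver code; the function and parameter names are invented):

#include <rte_atomic.h>
#include <rte_byteorder.h>

/* Keep DRAM-targeted stores ordered with a compiler/IO barrier only and
 * reserve the full memory barrier for the PCI doorbell write. */
static inline void
post_and_ring_sketch(volatile uint32_t *wqe_owner, uint32_t owner_val,
		     volatile uint32_t *doorbell, uint32_t db_val)
{
	/* WQE fields live in DRAM: rte_io_wmb() is enough to keep the
	 * ownership store after the rest of the WQE stores. */
	rte_io_wmb();
	*wqe_owner = rte_cpu_to_be_32(owner_val);
	/* The doorbell targets the PCI memory domain: rte_wmb() makes sure
	 * HW cannot see the ring before the WQE stores are visible. */
	rte_wmb();
	*doorbell = rte_cpu_to_be_32(db_val);
}
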
> Unless the comment above this barrier is wrong, this change may cause hard-
> to-debug issues down the road, you should drop it.
> 
> >  		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
> >  					      ((sq->head & sq->txbb_cnt) ?
> >  						       MLX4_BIT_WQE_OWN : 0));
> > --
> > 1.8.3.1
> >
> 
> --
> Adrien Mazarguil
> 6WIND

Thanks!

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 1/7] net/mlx4: remove error flows from Tx fast path
  2017-10-30 18:11         ` Matan Azrad
@ 2017-10-31 10:16           ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-31 10:16 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

Hi Matan,

On Mon, Oct 30, 2017 at 06:11:31PM +0000, Matan Azrad wrote:
> Hi Adrien
> 
> > -----Original Message-----
> > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > Sent: Monday, October 30, 2017 4:23 PM
> > To: Matan Azrad <matan@mellanox.com>
> > Cc: dev@dpdk.org; Ophir Munk <ophirmu@mellanox.com>
> > Subject: Re: [PATCH v3 1/7] net/mlx4: remove error flows from Tx fast path
> > 
> > On Mon, Oct 30, 2017 at 10:07:23AM +0000, Matan Azrad wrote:
> > > Move unnecessary error flows to DEBUG mode.
> > >
> > > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > > Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > 
> > I missed a couple of details while reviewing the original version, the first one
> > being mlx4_post_send()'s return value is still documented as updating
> > rte_errno in case of error, it's not the case anymore after this patch.
> > 
> Good catch, this will be fixed in the next version.
> 
> > Please see below for the other one:
> > 
> > > ---
> > >  drivers/net/mlx4/mlx4_rxtx.c | 16 ++++++----------
> > >  1 file changed, 6 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > > b/drivers/net/mlx4/mlx4_rxtx.c
> > <snip>
> > >  /**
> > > @@ -510,8 +508,6 @@ struct pv {
> > >  	assert(max <= elts_n);
> > >  	/* Always leave one free entry in the ring. */
> > >  	--max;
> > > -	if (max == 0)
> > > -		return 0;
> > >  	if (max > pkts_n)
> > >  		max = pkts_n;
> > >  	for (i = 0; (i != max); ++i) {
> > 
> > While minor, this change has nothing to do with this patch, right?
> > 
> Yes, you're right; maybe it can be merged into patch 4/7.
>  
> > I think it can slightly degrade an application performance as it removes the
> > guarantee that subsequent code only needs to be run if there is at least one
> > packet to process in case the TX ring is constantly full (SW faster than HW).
> >
> 
> When the Tx ring is full, the loop condition fails right at the start and the function returns 0 because the packet counter is 0 (two extra checks).
> Since that case is (in my opinion) less common than having at least one free entry in the ring, removing the early return avoids the extra check in all the common cases.
> 
> Are you sure those two extra checks matter for performance in the full-ring case? Won't the application simply call us again?

No, I don't think they're important to performance, like the changes from
patch 4/7, I'm not certain they actually make any difference. My suggestion
was mainly to leave it alone because of that. It's OK if you want to keep
and move it to 4/7.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 5/7] net/mlx4: separate Tx segment cases
  2017-10-30 18:23         ` Matan Azrad
@ 2017-10-31 10:17           ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-31 10:17 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Mon, Oct 30, 2017 at 06:23:31PM +0000, Matan Azrad wrote:
> Hi Adrien
> 
> > -----Original Message-----
> > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > Sent: Monday, October 30, 2017 4:24 PM
> > To: Matan Azrad <matan@mellanox.com>
> > Cc: dev@dpdk.org; Ophir Munk <ophirmu@mellanox.com>
> > Subject: Re: [PATCH v3 5/7] net/mlx4: separate Tx segment cases
> > 
> > On Mon, Oct 30, 2017 at 10:07:27AM +0000, Matan Azrad wrote:
> > > Since single segment packets shouldn't use additional memory to save
> > > segments byte count we can prevent additional memory unnecessary
> > usage
> > > in this case; Prevent loop management.
> > >
> > 
> > Sorry for asking but I really don't understand the above, can you reformulate
> > the problem addressed by this patch?
> > 
> How about the following?
> Optimize the single-segment case by handling it in dedicated code, which avoids checks and calculations relevant only to the multi-segment case.

All right, just add "Tx" somewhere and it should be OK.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers
  2017-10-30 19:47         ` Matan Azrad
@ 2017-10-31 10:17           ` Adrien Mazarguil
  2017-10-31 11:35             ` Matan Azrad
  0 siblings, 1 reply; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-31 10:17 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

Hi Matan,

On Mon, Oct 30, 2017 at 07:47:20PM +0000, Matan Azrad wrote:
> Hi Adrien
> 
> > -----Original Message-----
> > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > Sent: Monday, October 30, 2017 4:24 PM
> > To: Matan Azrad <matan@mellanox.com>
> > Cc: dev@dpdk.org; Ophir Munk <ophirmu@mellanox.com>
> > Subject: Re: [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers
> > 
> > On Mon, Oct 30, 2017 at 10:07:28AM +0000, Matan Azrad wrote:
> > > Replace most of the memory barriers by compiler barriers since they
> > > are all targeted to the DRAM; This improves code efficiency for
> > > systems which force store order between different addresses.
> > >
> > > Only the doorbell record store should be protected by memory barrier
> > > since it is targeted to the PCI memory domain.
> > >
> > > Limit pre byte count store compiler barrier for systems with cache
> > > line size smaller than 64B (TXBB size).
> > >
> > > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > 
> > This sounds like an interesting performance improvement, can you share the
> > typical or expected amount (percentage/hard numbers) for a given use case
> > as part of the commit log?
> > 
> 
> Yes, it improves performance, I will share numbers.

First I must add I thought rte_io_[rw]mb() was really only a renamed
compiler barrier, I better understand its purpose now, thanks.

(more below.)

> > More comments below.
> > 
> > > ---
> > >  drivers/net/mlx4/mlx4_rxtx.c | 11 ++++++-----
> > >  1 file changed, 6 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > > b/drivers/net/mlx4/mlx4_rxtx.c index 8ea8851..482c399 100644
> > > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > > @@ -168,7 +168,7 @@ struct pv {
> > >  		/*
> > >  		 * Make sure we read the CQE after we read the ownership
> > bit.
> > >  		 */
> > > -		rte_rmb();
> > > +		rte_io_rmb();
> > 
> > OK for this one since the rest of the code should not be run due to the
> > condition (I'm not even sure even a compiler barrier is necessary at all here).
> > 
> > >  #ifndef NDEBUG
> > >  		if (unlikely((cqe->owner_sr_opcode &
> > MLX4_CQE_OPCODE_MASK) ==
> > >  			     MLX4_CQE_OPCODE_ERROR)) {
> > > @@ -203,7 +203,7 @@ struct pv {
> > >  	 */
> > >  	cq->cons_index = cons_index;
> > >  	*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index &
> > MLX4_CQ_DB_CI_MASK);
> > > -	rte_wmb();
> > > +	rte_io_wmb();
> > 
> > This one could be removed entirely as well, which is more or less what the
> > move to a compiler barrier does. Nothing in subsequent code depends on
> > this doorbell being written, so this can piggy back on any subsequent
> > rte_wmb().
> 
> Yes, you right, probably this code was taken from multi thread implementation.
> > 
> > On the other hand in my opinion a barrier (compiler or otherwise) might be
> > needed before the doorbell write, to make clear it cannot somehow be done
> > earlier in case something attempts to optimize it away.
> > 
> I think we can remove it entirely (compiler can't optimize the ci_db store since in depends in previous code (cons_index).

Right, however you may still run into issues if the compiler determines the
final cons_index value by looking at the loop and decides to store it before
entering/leaving it. That's the kind of problematic optimization I was
thinking of.

The barrier in that sense is just to assert the order of seemingly unrelated
load/stores.

> > >  	sq->tail = sq->tail + nr_txbbs;
> > >  	/* Update the list of packets posted for transmission. */
> > >  	elts_comp -= pkts;
> > > @@ -321,6 +321,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
> > >  		 * control segment.
> > >  		 */
> > >  		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> > > +#if RTE_CACHE_LINE_SIZE < 64
> > >  			/*
> > >  			 * Need a barrier here before writing the byte_count
> > >  			 * fields to make sure that all the data is visible @@ -
> > 331,6
> > > +332,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
> > >  			 * data, and end up sending the wrong data.
> > >  			 */
> > >  			rte_io_wmb();
> > > +#endif /* RTE_CACHE_LINE_SIZE */
> > 
> > Interesting one.
> > 
> > >  			dseg->byte_count = byte_count;
> > >  		} else {
> > >  			/*
> > > @@ -469,8 +471,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
> > >  				break;
> > >  			}
> > >  #endif /* NDEBUG */
> > > -			/* Need a barrier here before byte count store. */
> > > -			rte_io_wmb();
> > > +			/* Never be TXBB aligned, no need compiler barrier.
> > */
> > 
> > The reason there was a barrier here at all was unclear, so if it's really useless,
> > you don't even need to describe why.
> 
> It is because there is a barrier in multi segment similar stage.
> I think it can help for future review.

OK.

> > 
> > >  			dseg->byte_count = rte_cpu_to_be_32(buf-
> > >data_len);
> > >
> > >  			/* Fill the control parameters for this packet. */ @@ -
> > 533,7
> > > +534,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
> > >  		 * setting ownership bit (because HW can start
> > >  		 * executing as soon as we do).
> > >  		 */
> > > -		rte_wmb();
> > > +		rte_io_wmb();
> > 
> > This one looks dangerous. A compiler barrier is not strong enough to
> > guarantee the order in which CPU will execute instructions, it only makes
> > sure what follows the barrier doesn't appear before it in the generated code.
> > 
> As I investigated, I understood that for CPUs which don't save store order between different addresses(arm,ppc), the rte_io_wmb is converted to rte_wmb.
> So for thus who save it(x86) we just need the right order in compiler code because all the relevant stores are targeted to same memory domain(DRAM) and therefore also the actual store is guaranteed.
> Unlike doorbell store which directed to different memory domain (PCI).
> So the only place which need rte_wmb() is before doorbell write.

Fair enough, although after re-reading the code I think there's another
issue present since the beginning: both ctrl and dseg pointers are not
volatile, this fact doesn't guarantee intermediate writes will occur in the
expected order or even at all, even in the presence of a barrier.

The volatile attribute should be inherited from both struct mlx4_cq and
struct mlx4_sq (buf, db and most if not all other pointers). I think a
separate fixes commit should add it for safety.

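For what it's worth, a minimal sketch of what that fix could look like (field names trimmed down for illustration; the real structures in mlx4_prm.h have more members):

#include <stdint.h>

/* Qualify HW-visible rings and doorbells as volatile so the compiler
 * cannot elide or reorder the individual loads/stores through them. */
struct mlx4_sq_sketch {
	volatile uint8_t *buf;        /* SQ ring buffer, WQEs written by SW. */
	volatile uint32_t *db;        /* Doorbell record. */
	/* ...remaining fields unchanged... */
};

struct mlx4_cq_sketch {
	volatile uint8_t *buf;        /* CQE ring written by HW. */
	volatile uint32_t *set_ci_db; /* Consumer index doorbell. */
	/* ...remaining fields unchanged... */
};
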
> > Unless the comment above this barrier is wrong, this change may cause hard-
> > to-debug issues down the road, you should drop it.
> > 
> > >  		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
> > >  					      ((sq->head & sq->txbb_cnt) ?
> > >  						       MLX4_BIT_WQE_OWN :
> > 0));
> > > --
> > > 1.8.3.1
> > >
> > 
> > --
> > Adrien Mazarguil
> > 6WIND
> 
> Thanks!

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers
  2017-10-31 10:17           ` Adrien Mazarguil
@ 2017-10-31 11:35             ` Matan Azrad
  2017-10-31 13:21               ` Adrien Mazarguil
  0 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-31 11:35 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Hi Adrien

> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Tuesday, October 31, 2017 12:17 PM
> To: Matan Azrad <matan@mellanox.com>
> Cc: dev@dpdk.org; Ophir Munk <ophirmu@mellanox.com>
> Subject: Re: [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers
> 
> Hi Matan,
> 
> On Mon, Oct 30, 2017 at 07:47:20PM +0000, Matan Azrad wrote:
> > Hi Adrien
> >
> > > -----Original Message-----
> > > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > > Sent: Monday, October 30, 2017 4:24 PM
> > > To: Matan Azrad <matan@mellanox.com>
> > > Cc: dev@dpdk.org; Ophir Munk <ophirmu@mellanox.com>
> > > Subject: Re: [PATCH v3 6/7] net/mlx4: mitigate Tx path memory
> > > barriers
> > >
> > > On Mon, Oct 30, 2017 at 10:07:28AM +0000, Matan Azrad wrote:
> > > > Replace most of the memory barriers by compiler barriers since
> > > > they are all targeted to the DRAM; This improves code efficiency
> > > > for systems which force store order between different addresses.
> > > >
> > > > Only the doorbell record store should be protected by memory
> > > > barrier since it is targeted to the PCI memory domain.
> > > >
> > > > Limit pre byte count store compiler barrier for systems with cache
> > > > line size smaller than 64B (TXBB size).
> > > >
> > > > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > >
> > > This sounds like an interesting performance improvement, can you
> > > share the typical or expected amount (percentage/hard numbers) for a
> > > given use case as part of the commit log?
> > >
> >
> > Yes, it improves performance, I will share numbers.
> 
> First I must add I thought rte_io_[rw]mb() was really only a renamed
> compiler barrier, I better understand its purpose now, thanks.
> 
> (more below.)
> 
> > > More comments below.
> > >
> > > > ---
> > > >  drivers/net/mlx4/mlx4_rxtx.c | 11 ++++++-----
> > > >  1 file changed, 6 insertions(+), 5 deletions(-)
> > > >
> > > > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > > > b/drivers/net/mlx4/mlx4_rxtx.c index 8ea8851..482c399 100644
> > > > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > > > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > > > @@ -168,7 +168,7 @@ struct pv {
> > > >  		/*
> > > >  		 * Make sure we read the CQE after we read the ownership
> > > bit.
> > > >  		 */
> > > > -		rte_rmb();
> > > > +		rte_io_rmb();
> > >
> > > OK for this one since the rest of the code should not be run due to
> > > the condition (I'm not even sure even a compiler barrier is necessary at all
> here).
> > >
> > > >  #ifndef NDEBUG
> > > >  		if (unlikely((cqe->owner_sr_opcode &
> > > MLX4_CQE_OPCODE_MASK) ==
> > > >  			     MLX4_CQE_OPCODE_ERROR)) { @@ -203,7 +203,7
> @@ struct pv {
> > > >  	 */
> > > >  	cq->cons_index = cons_index;
> > > >  	*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index &
> > > MLX4_CQ_DB_CI_MASK);
> > > > -	rte_wmb();
> > > > +	rte_io_wmb();
> > >
> > > This one could be removed entirely as well, which is more or less
> > > what the move to a compiler barrier does. Nothing in subsequent code
> > > depends on this doorbell being written, so this can piggy back on
> > > any subsequent rte_wmb().
> >
> > Yes, you right, probably this code was taken from multi thread
> implementation.
> > >
> > > On the other hand in my opinion a barrier (compiler or otherwise)
> > > might be needed before the doorbell write, to make clear it cannot
> > > somehow be done earlier in case something attempts to optimize it
> away.
> > >
> > I think we can remove it entirely (compiler can't optimize the ci_db store
> since in depends in previous code (cons_index).
> 
> Right, however you may still run into issues if the compiler determines the
> final cons_index value by looking at the loop and decides to store it before
> entering/leaving it. That's the kind of problematic optimization I was thinking
> of.
> 
> The barrier in that sense is just to assert the order of seemingly unrelated
> load/stores.

I think that if I keep the rte_io_rmb() after the CQE owner check we address both concerns:
1. The read-ordering concern: the rest of the CQE is only read after the ownership bit.
2. The ci_db concern: the compiler must read the last CQE (the one that is no longer valid, so there is nothing left to stamp) before it can know the final value of cons_index.
So we can remove this rte_io_wmb() from the completion function entirely; a sketch follows below.
What do you think?
 
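A rough sketch of the resulting completion flow (simplified, not the actual mlx4_txq_complete() code; it assumes the driver's mlx4_prm.h and rte_atomic.h/rte_byteorder.h, and sw_owns_cqe() plus the stamping step are placeholders):

static void
txq_complete_sketch(struct mlx4_cq *cq)
{
	uint32_t cons_index = cq->cons_index;

	while (sw_owns_cqe(cq, cons_index)) {	/* placeholder helper */
		/*
		 * Read the CQE body only after the ownership bit; this also
		 * ties the final cons_index to the last CQE actually read.
		 */
		rte_io_rmb();
		/* ...fetch wqe_index from the CQE, stamp freed WQEs... */
		cons_index++;
	}
	cq->cons_index = cons_index;
	/* No extra barrier: this store depends on cons_index from above. */
	*cq->set_ci_db = rte_cpu_to_be_32(cons_index & MLX4_CQ_DB_CI_MASK);
}
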
> > > >  			/* Fill the control parameters for this packet. */ @@ -
> > > 533,7
> > > > +534,7 @@ static int handle_multi_segs(struct rte_mbuf *buf,
> > > >  		 * setting ownership bit (because HW can start
> > > >  		 * executing as soon as we do).
> > > >  		 */
> > > > -		rte_wmb();
> > > > +		rte_io_wmb();
> > >
> > > This one looks dangerous. A compiler barrier is not strong enough to
> > > guarantee the order in which CPU will execute instructions, it only
> > > makes sure what follows the barrier doesn't appear before it in the
> generated code.
> > >
> > As I investigated, I understood that for CPUs which don't save store order
> between different addresses(arm,ppc), the rte_io_wmb is converted to
> rte_wmb.
> > So for thus who save it(x86) we just need the right order in compiler code
> because all the relevant stores are targeted to same memory domain(DRAM)
> and therefore also the actual store is guaranteed.
> > Unlike doorbell store which directed to different memory domain (PCI).
> > So the only place which need rte_wmb() is before doorbell write.
> 
> Fair enough, although after re-reading the code I think there's another issue
> present since the beginning: both ctrl and dseg pointers are not volatile, this
> fact doesn't guarantee intermediate writes will occur in the expected order
> or even at all, even in the presence of a barrier.
> 
> The volatile attribute should be inherited from both struct mlx4_cq and struct
> mlx4_sq (buf, db and most if not all other pointers). I think a separate fixes
> commit should add it for safety.

Good point, I will add it, thanks!
> 
> > > Unless the comment above this barrier is wrong, this change may
> > > cause hard- to-debug issues down the road, you should drop it.
> > >
> > > >  		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
> > > >  					      ((sq->head & sq->txbb_cnt) ?
> > > >  						       MLX4_BIT_WQE_OWN :
> > > 0));
> > > > --
> > > > 1.8.3.1
> > > >
> > >
> > > --
> > > Adrien Mazarguil
> > > 6WIND
> >
> > Thanks!
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers
  2017-10-31 11:35             ` Matan Azrad
@ 2017-10-31 13:21               ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-10-31 13:21 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

Hi Matan,

On Tue, Oct 31, 2017 at 11:35:29AM +0000, Matan Azrad wrote:
<snip>
> > -----Original Message-----
> > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > Sent: Tuesday, October 31, 2017 12:17 PM
> > To: Matan Azrad <matan@mellanox.com>
> > Cc: dev@dpdk.org; Ophir Munk <ophirmu@mellanox.com>
> > Subject: Re: [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers
> > 
> > Hi Matan,
> > 
> > On Mon, Oct 30, 2017 at 07:47:20PM +0000, Matan Azrad wrote:
> > > Hi Adrien
> > >
> > > > -----Original Message-----
> > > > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > > > Sent: Monday, October 30, 2017 4:24 PM
> > > > To: Matan Azrad <matan@mellanox.com>
> > > > Cc: dev@dpdk.org; Ophir Munk <ophirmu@mellanox.com>
> > > > Subject: Re: [PATCH v3 6/7] net/mlx4: mitigate Tx path memory
> > > > barriers
> > > >
> > > > On Mon, Oct 30, 2017 at 10:07:28AM +0000, Matan Azrad wrote:
> > > > > Replace most of the memory barriers by compiler barriers since
> > > > > they are all targeted to the DRAM; This improves code efficiency
> > > > > for systems which force store order between different addresses.
> > > > >
> > > > > Only the doorbell record store should be protected by memory
> > > > > barrier since it is targeted to the PCI memory domain.
> > > > >
> > > > > Limit pre byte count store compiler barrier for systems with cache
> > > > > line size smaller than 64B (TXBB size).
> > > > >
> > > > > Signed-off-by: Matan Azrad <matan@mellanox.com>
<snip>
> > > > >  	cq->cons_index = cons_index;
> > > > >  	*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index &
> > > > MLX4_CQ_DB_CI_MASK);
> > > > > -	rte_wmb();
> > > > > +	rte_io_wmb();
> > > >
> > > > This one could be removed entirely as well, which is more or less
> > > > what the move to a compiler barrier does. Nothing in subsequent code
> > > > depends on this doorbell being written, so this can piggy back on
> > > > any subsequent rte_wmb().
> > >
> > > Yes, you right, probably this code was taken from multi thread
> > implementation.
> > > >
> > > > On the other hand in my opinion a barrier (compiler or otherwise)
> > > > might be needed before the doorbell write, to make clear it cannot
> > > > somehow be done earlier in case something attempts to optimize it
> > away.
> > > >
> > > I think we can remove it entirely (compiler can't optimize the ci_db store
> > since in depends in previous code (cons_index).
> > 
> > Right, however you may still run into issues if the compiler determines the
> > final cons_index value by looking at the loop and decides to store it before
> > entering/leaving it. That's the kind of problematic optimization I was thinking
> > of.
> > 
> > The barrier in that sense is just to assert the order of seemingly unrelated
> > load/stores.
> 
> I think that If I left the rte_io_rmb after CQE owner check we can earn both:
> 1. The concern of read ordering while reading the owner before using CQE.
> 2. The ci_db concern: the compiler must read the last CQE(which is not valid and we have no more stamp to do) before it knows the last value of cons_index. 
> So we can remove entirely this rte_io_wmb in completion function.
> What do you think? 

That's right, this means there's a barrier before the doorbell write in any
case, OK then.

Just make sure cq->set_ci_db is volatile in a prior "fix" commit as
described in my previous suggestion, otherwise the remaining barriers won't
guarantee much. Thanks.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v3 2/7] net/mlx4: associate MR to MP in a short function
  2017-10-30 14:23       ` Adrien Mazarguil
@ 2017-10-31 13:25         ` Ophir Munk
  0 siblings, 0 replies; 84+ messages in thread
From: Ophir Munk @ 2017-10-31 13:25 UTC (permalink / raw)
  To: Adrien Mazarguil, Matan Azrad
  Cc: dev, Olga Shern, Thomas Monjalon, Ophir Munk

Hi,
Please see inline

On Monday, October 30, 2017 4:23 PM, Adrien Mazarguil wrote:
> 
> On Mon, Oct 30, 2017 at 10:07:24AM +0000, Matan Azrad wrote:
> > From: Ophir Munk <ophirmu@mellanox.com>
> >
> > Associate memory region to mempool (on data path) in a short function.
> > Handle the less common case of adding a new memory region to mempool
> > in a separate function.
> >
> > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> 
> Thanks for moving that function out of the way, a few more comments
> below.
> 
> By the way, commit log lines can be up to 75 characters wide, only the title
> line is limited to 50.
> 

Commit message updated to use longer lines.

> > ---
> >  drivers/net/mlx4/mlx4.h      |  2 ++
> >  drivers/net/mlx4/mlx4_mr.c   | 46 +++++++++++++++++++++++++++++++++++
> >  drivers/net/mlx4/mlx4_rxtx.c | 57 --------------------------------------------
> >  drivers/net/mlx4/mlx4_rxtx.h | 33 ++++++++++++++++++++++++-
> >  4 files changed, 80 insertions(+), 58 deletions(-)
> >
> > diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index
> > e0a9853..70cf453 100644
> > --- a/drivers/net/mlx4/mlx4.h
> > +++ b/drivers/net/mlx4/mlx4.h
> > @@ -160,5 +160,7 @@ int mlx4_flow_ctrl_set(struct rte_eth_dev *dev,
> >  /* mlx4_mr.c */
> >
> >  struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
> > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > +			 uint32_t i);
> >
> >  #endif /* RTE_PMD_MLX4_H_ */
> > diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
> > index 9700884..ebf6547 100644
> > --- a/drivers/net/mlx4/mlx4_mr.c
> > +++ b/drivers/net/mlx4/mlx4_mr.c
> > @@ -56,6 +56,7 @@
> >  #include <rte_mempool.h>
> >
> >  #include "mlx4_utils.h"
> > +#include "mlx4_rxtx.h"
> 
> Alphabetical order is preferred, or so I thought. Nothing seems to enforce
> this in the coding rules, it's only a suggestion.

Suggestion accepted

> 
> >
> >  struct mlx4_check_mempool_data {
> >  	int ret;
> > @@ -181,3 +182,48 @@ struct ibv_mr *
> >  		rte_errno = errno ? errno : EINVAL;
> >  	return mr;
> >  }
> > +
> > +/**
> > + * Add memory region (MR) <-> memory pool (MP) association to txq-
> >mp2mr[].
> > + * If mp2mr[] is full, remove an entry first.
> > + *
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param[in] mp
> > + *   Memory pool for which a memory region lkey must be added
> 
> Missing "."

Added "."

> 
> > + * @param[in] i
> > + *   Index in memory pool (MP) where to add memory region (MR)
> 
> Same here.

Done

> 
> > + *
> > + * @return
> > + *   Added mr->lkey on success, (uint32_t)-1 on failure.
> > + */
> > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > +uint32_t i)
> 
> Coding style again; return type must be on a separate line for definitions (not
> declarations).
> 
> > +{
> > +	struct ibv_mr *mr;
> > +
> > +	/* Add a new entry, register MR first. */
> > +	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> > +	      (void *)txq, mp->name, (void *)mp);
> > +	mr = mlx4_mp2mr(txq->priv->pd, mp);
> > +	if (unlikely(mr == NULL)) {
> > +		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
> > +		      (void *)txq);
> > +		return (uint32_t)-1;
> > +	}
> > +	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
> > +		/* Table is full, remove oldest entry. */
> > +		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
> > +		      (void *)txq);
> > +		--i;
> > +		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
> > +		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
> > +			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
> > +	}
> > +	/* Store the new entry. */
> > +	txq->mp2mr[i].mp = mp;
> > +	txq->mp2mr[i].mr = mr;
> > +	txq->mp2mr[i].lkey = mr->lkey;
> > +	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIU32,
> > +	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
> > +	return txq->mp2mr[i].lkey;
> > +}
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > b/drivers/net/mlx4/mlx4_rxtx.c index 4f899ff..f89df46 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > @@ -239,63 +239,6 @@ struct pv {
> >  }
> >
> >  /**
> > - * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
> > - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
> > - * remove an entry first.
> > - *
> > - * @param txq
> > - *   Pointer to Tx queue structure.
> > - * @param[in] mp
> > - *   Memory pool for which a memory region lkey must be returned.
> > - *
> > - * @return
> > - *   mr->lkey on success, (uint32_t)-1 on failure.
> > - */
> > -uint32_t
> > -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> > -{
> > -	unsigned int i;
> > -	struct ibv_mr *mr;
> > -
> > -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > -			/* Unknown MP, add a new MR for it. */
> > -			break;
> > -		}
> > -		if (txq->mp2mr[i].mp == mp) {
> > -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > -			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
> > -			return txq->mp2mr[i].lkey;
> > -		}
> > -	}
> > -	/* Add a new entry, register MR first. */
> > -	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> > -	      (void *)txq, mp->name, (void *)mp);
> > -	mr = mlx4_mp2mr(txq->priv->pd, mp);
> > -	if (unlikely(mr == NULL)) {
> > -		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
> > -		      (void *)txq);
> > -		return (uint32_t)-1;
> > -	}
> > -	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
> > -		/* Table is full, remove oldest entry. */
> > -		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
> > -		      (void *)txq);
> > -		--i;
> > -		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
> > -		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
> > -			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
> > -	}
> > -	/* Store the new entry. */
> > -	txq->mp2mr[i].mp = mp;
> > -	txq->mp2mr[i].mr = mr;
> > -	txq->mp2mr[i].lkey = mr->lkey;
> > -	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
> > -	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
> > -	return txq->mp2mr[i].lkey;
> > -}
> > -
> > -/**
> >   * Posts a single work request to a send queue.
> >   *
> >   * @param txq
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.h
> > b/drivers/net/mlx4/mlx4_rxtx.h index 7d67748..b32fcc9 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > @@ -36,6 +36,7 @@
> >
> >  #include <stdint.h>
> >  #include <sys/queue.h>
> > +#include <assert.h>
> >
> >  /* Verbs headers do not support -pedantic. */
> >  #ifdef PEDANTIC
> > @@ -165,7 +166,6 @@ int mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
> >
> >  /* mlx4_rxtx.c */
> >
> > -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
> > uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
> >  		       uint16_t pkts_n);
> >  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
> > @@ -182,4 +182,35 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
> >  			const struct rte_eth_txconf *conf);  void
> > mlx4_tx_queue_release(void *dpdk_txq);
> >
> > +
> 
> Extra empty line, however see below.
> 

Extra line removed

> > +/**
> > + * Get memory region (MR) <-> memory pool (MP) association from txq-
> >mp2mr[].
> > + * Call mlx4_txq_add_mr() if MP is not registered yet.
> > + *
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param[in] mp
> > + *   Memory pool for which a memory region lkey must be returned.
> > + *
> > + * @return
> > + *   mr->lkey on success, (uint32_t)-1 on failure.
> > + */
> > +static inline uint32_t
> > +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> > +{
> > +	unsigned int i;
> > +
> > +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > +			/* Unknown MP, add a new MR for it. */
> > +			break;
> > +		}
> > +		if (txq->mp2mr[i].mp == mp) {
> > +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > +			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
> > +			return txq->mp2mr[i].lkey;
> > +		}
> > +	}
> > +	return mlx4_txq_add_mr(txq, mp, i);
> > +}
> 
> I'm OK with keeping this function as a static inline defined in a header file,
> however I think mlx4_rxtx.h is not the right place for it given the remaining
> MR-handling functions are declared by mlx4.h. You should move it to that file
> as well (likely below mlx4_txq_add_mr()).
> 
> Note: RTE_DIM() and unlikely() require rte_common.h, NULL requires
> stddef.h.
>

Please note that moving mlx4_txq_mp2mr() to mlx4.h requires mlx4.h to #include "mlx4_rxtx.h"
(because the inline body accesses struct txq fields).
At the same time mlx4_rxtx.h must #include "mlx4.h" because it uses definitions such as
MLX4_RSS_HASH_KEY_SIZE.
The end result is that mlx4.h and mlx4_rxtx.h would include each other.
It does not compile without rewriting code (a domino effect), which I don't think is a desirable
result of this review (unless you specify otherwise).
Therefore, for v4 I have left mlx4_txq_mp2mr() in mlx4_rxtx.h.
 
> While there, I think you should drop both assert() statements and the reliance
> on assert.h because such statements shouldn't be part of a static inline
> function not supposed to fail. Let's assume mp2mr[] is correctly updated
> upstream.
> 

assert() - removed

> (Also missing empty line here.)
> 

Empty line added

> >  #endif /* MLX4_RXTX_H_ */
> > --
> > 1.8.3.1
> >
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* [PATCH v4 0/8] net/mlx4: Tx path improvements
  2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
                       ` (6 preceding siblings ...)
  2017-10-30 10:07     ` [PATCH v3 7/7] net/mlx4: remove empty Tx segment support Matan Azrad
@ 2017-10-31 18:21     ` Matan Azrad
  2017-10-31 18:21       ` [PATCH v4 1/8] net/mlx4: remove error flows from Tx fast path Matan Azrad
                         ` (8 more replies)
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
  8 siblings, 9 replies; 84+ messages in thread
From: Matan Azrad @ 2017-10-31 18:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

v1:
This series is a follow-up of the new datapath implementation introduced in RC1.
It is dedicated to Tx and improves the performance.

v2:
Change uint32_t be_lkey to rte_be32_t lkey

v3:
Rebase to 17.11-rc2.
Remove "net/mlx4: save lkey in big-endian format" patch which degrade performance.
Merge separate segment cases patches "net/mlx4: improve performance of one Tx segment" and "net/mlx4: separate Tx for multi-segments".
"net/mlx4: inline more Tx functions" renamed to "net/mlx4: associate MR to MP in a short function".
Leave bytes counter as old version.
Add memory barrier improvement patch.
Remove empty segment support.

v4:
Remove "net/mlx4: remove empty Tx segment support" patch.
Add "fix ring wraparound compiler hint" patch.
Add "fix HW memory optimizations careless" patch.
Remove unnecessary IO memory barrier in completion function.
Remove inline declaration of completion function.
Fix unnecessary empty lines.
Fix indentation.
Move max=0 check removal to patch "remove duplicate handling in Tx burst" and rename the patch.
Add performance improvement to patch "mitigate Tx path memory barriers".

Matan Azrad (7):
  net/mlx4: remove error flows from Tx fast path
  net/mlx4: fix ring wraparound compiler hint
  net/mlx4: merge Tx path functions
  net/mlx4: remove duplicate handling in Tx burst
  net/mlx4: separate Tx segment cases
  net/mlx4: fix HW memory optimizations careless
  net/mlx4: mitigate Tx path memory barriers

Ophir Munk (1):
  net/mlx4: associate MR to MP in a short function

 drivers/net/mlx4/mlx4.h      |   2 +
 drivers/net/mlx4/mlx4_mr.c   |  47 +++++
 drivers/net/mlx4/mlx4_prm.h  |  18 +-
 drivers/net/mlx4/mlx4_rxtx.c | 397 +++++++++++++++++++++----------------------
 drivers/net/mlx4/mlx4_rxtx.h |  31 +++-
 5 files changed, 281 insertions(+), 214 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 84+ messages in thread

* [PATCH v4 1/8] net/mlx4: remove error flows from Tx fast path
  2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
@ 2017-10-31 18:21       ` Matan Azrad
  2017-10-31 18:21       ` [PATCH v4 2/8] net/mlx4: associate MR to MP in a short function Matan Azrad
                         ` (7 subsequent siblings)
  8 siblings, 0 replies; 84+ messages in thread
From: Matan Azrad @ 2017-10-31 18:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Move unnecessary error flows to DEBUG mode.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 67dc712..79c7fa2 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -169,6 +169,7 @@ struct pv {
 		 * Make sure we read the CQE after we read the ownership bit.
 		 */
 		rte_rmb();
+#ifndef NDEBUG
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 			     MLX4_CQE_OPCODE_ERROR)) {
 			struct mlx4_err_cqe *cqe_err =
@@ -178,6 +179,7 @@ struct pv {
 			      (void *)txq, cqe_err->vendor_err,
 			      cqe_err->syndrome);
 		}
+#endif /* NDEBUG */
 		/* Get WQE index reported in the CQE. */
 		new_index =
 			rte_be_to_cpu_16(cqe->wqe_index) & sq->txbb_cnt_mask;
@@ -302,7 +304,7 @@ struct pv {
  *   Packet to transmit.
  *
  * @return
- *   0 on success, negative errno value otherwise and rte_errno is set.
+ *   0 on success, negative errno value otherwise.
  */
 static inline int
 mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
@@ -322,7 +324,6 @@ struct pv {
 	uint32_t byte_count;
 	int wqe_real_size;
 	int nr_txbbs;
-	int rc;
 	struct pv *pv = (struct pv *)txq->bounce_buf;
 	int pv_counter = 0;
 
@@ -337,8 +338,7 @@ struct pv {
 	if (((sq->head - sq->tail) + nr_txbbs +
 	     sq->headroom_txbbs) >= sq->txbb_cnt ||
 	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-		rc = ENOSPC;
-		goto err;
+		return -ENOSPC;
 	}
 	/* Get the control and data entries of the WQE. */
 	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
@@ -354,6 +354,7 @@ struct pv {
 		dseg->addr = rte_cpu_to_be_64(addr);
 		/* Memory region key for this memory pool. */
 		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+#ifndef NDEBUG
 		if (unlikely(lkey == (uint32_t)-1)) {
 			/* MR does not exist. */
 			DEBUG("%p: unable to get MP <-> MR association",
@@ -366,9 +367,9 @@ struct pv {
 			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
 			mlx4_txq_stamp_freed_wqe(sq, head_idx,
 				     (sq->head & sq->txbb_cnt) ? 0 : 1);
-			rc = EFAULT;
-			goto err;
+			return -EFAULT;
 		}
+#endif /* NDEBUG */
 		dseg->lkey = rte_cpu_to_be_32(lkey);
 		if (likely(buf->data_len)) {
 			byte_count = rte_cpu_to_be_32(buf->data_len);
@@ -471,9 +472,6 @@ struct pv {
 					       MLX4_BIT_WQE_OWN : 0));
 	sq->head += nr_txbbs;
 	return 0;
-err:
-	rte_errno = rc;
-	return -rc;
 }
 
 /**
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v4 2/8] net/mlx4: associate MR to MP in a short function
  2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
  2017-10-31 18:21       ` [PATCH v4 1/8] net/mlx4: remove error flows from Tx fast path Matan Azrad
@ 2017-10-31 18:21       ` Matan Azrad
  2017-11-02 13:42         ` Adrien Mazarguil
  2017-10-31 18:21       ` [PATCH v4 3/8] net/mlx4: fix ring wraparound compiler hint Matan Azrad
                         ` (6 subsequent siblings)
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-31 18:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

From: Ophir Munk <ophirmu@mellanox.com>

Associate memory region to mempool (on data path) in a short function.
Handle the less common case of adding a new memory region to mempool
in a separate function.

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/mlx4/mlx4.h      |  2 ++
 drivers/net/mlx4/mlx4_mr.c   | 47 ++++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_rxtx.c | 57 --------------------------------------------
 drivers/net/mlx4/mlx4_rxtx.h | 31 +++++++++++++++++++++++-
 4 files changed, 79 insertions(+), 58 deletions(-)

diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index e0a9853..70cf453 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -160,5 +160,7 @@ int mlx4_flow_ctrl_set(struct rte_eth_dev *dev,
 /* mlx4_mr.c */
 
 struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
+			 uint32_t i);
 
 #endif /* RTE_PMD_MLX4_H_ */
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index 9700884..75ee32f 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -55,6 +55,7 @@
 #include <rte_memory.h>
 #include <rte_mempool.h>
 
+#include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
 struct mlx4_check_mempool_data {
@@ -181,3 +182,49 @@ struct ibv_mr *
 		rte_errno = errno ? errno : EINVAL;
 	return mr;
 }
+
+/**
+ * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
+ * If mp2mr[] is full, remove an entry first.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be added.
+ * @param[in] i
+ *   Index in memory pool (MP) where to add memory region (MR).
+ *
+ * @return
+ *   Added mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
+{
+	struct ibv_mr *mr;
+
+	/* Add a new entry, register MR first. */
+	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+	      (void *)txq, mp->name, (void *)mp);
+	mr = mlx4_mp2mr(txq->priv->pd, mp);
+	if (unlikely(mr == NULL)) {
+		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+		      (void *)txq);
+		return (uint32_t)-1;
+	}
+	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
+		/* Table is full, remove oldest entry. */
+		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+		      (void *)txq);
+		--i;
+		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+	}
+	/* Store the new entry. */
+	txq->mp2mr[i].mp = mp;
+	txq->mp2mr[i].mr = mr;
+	txq->mp2mr[i].lkey = mr->lkey;
+	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
+	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
+	return txq->mp2mr[i].lkey;
+}
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 79c7fa2..3b5faa1 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -239,63 +239,6 @@ struct pv {
 }
 
 /**
- * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
- *
- * @param txq
- *   Pointer to Tx queue structure.
- * @param[in] mp
- *   Memory pool for which a memory region lkey must be returned.
- *
- * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
- */
-uint32_t
-mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
-{
-	unsigned int i;
-	struct ibv_mr *mr;
-
-	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mp == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-			return txq->mp2mr[i].lkey;
-		}
-	}
-	/* Add a new entry, register MR first. */
-	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
-	      (void *)txq, mp->name, (void *)mp);
-	mr = mlx4_mp2mr(txq->priv->pd, mp);
-	if (unlikely(mr == NULL)) {
-		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
-		      (void *)txq);
-		return (uint32_t)-1;
-	}
-	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
-		/* Table is full, remove oldest entry. */
-		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
-		      (void *)txq);
-		--i;
-		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
-		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
-			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
-	}
-	/* Store the new entry. */
-	txq->mp2mr[i].mp = mp;
-	txq->mp2mr[i].mr = mr;
-	txq->mp2mr[i].lkey = mr->lkey;
-	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
-	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
-	return txq->mp2mr[i].lkey;
-}
-
-/**
  * Posts a single work request to a send queue.
  *
  * @param txq
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 7d67748..185dbdf 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -165,7 +165,6 @@ int mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 
 /* mlx4_rxtx.c */
 
-uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
 uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
 		       uint16_t pkts_n);
 uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
@@ -182,4 +181,34 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 			const struct rte_eth_txconf *conf);
 void mlx4_tx_queue_release(void *dpdk_txq);
 
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Call mlx4_txq_add_mr() if MP is not registered yet.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ *   mr->lkey on success, (uint32_t)-1 on failure.
+ */
+static inline uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+	unsigned int i;
+
+	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+		if (unlikely(txq->mp2mr[i].mp == NULL)) {
+			/* Unknown MP, add a new MR for it. */
+			break;
+		}
+		if (txq->mp2mr[i].mp == mp) {
+			/* MP found. */
+			return txq->mp2mr[i].lkey;
+		}
+	}
+	return mlx4_txq_add_mr(txq, mp, i);
+}
+
 #endif /* MLX4_RXTX_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v4 3/8] net/mlx4: fix ring wraparound compiler hint
  2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
  2017-10-31 18:21       ` [PATCH v4 1/8] net/mlx4: remove error flows from Tx fast path Matan Azrad
  2017-10-31 18:21       ` [PATCH v4 2/8] net/mlx4: associate MR to MP in a short function Matan Azrad
@ 2017-10-31 18:21       ` Matan Azrad
  2017-11-02 13:42         ` Adrien Mazarguil
  2017-10-31 18:21       ` [PATCH v4 4/8] net/mlx4: merge Tx path functions Matan Azrad
                         ` (5 subsequent siblings)
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-31 18:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Remove the unlikely hint from the WQ wraparound check because it is
an expected case.

Fixes: c3c977bbecbd ("net/mlx4: add Tx bypassing Verbs")

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 3b5faa1..eddce82 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -292,7 +292,7 @@ struct pv {
 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
 		rte_prefetch0((volatile void *)addr);
 		/* Handle WQE wraparound. */
-		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
+		if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
 			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
 		dseg->addr = rte_cpu_to_be_64(addr);
 		/* Memory region key for this memory pool. */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v4 4/8] net/mlx4: merge Tx path functions
  2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (2 preceding siblings ...)
  2017-10-31 18:21       ` [PATCH v4 3/8] net/mlx4: fix ring wraparound compiler hint Matan Azrad
@ 2017-10-31 18:21       ` Matan Azrad
  2017-11-02 13:42         ` Adrien Mazarguil
  2017-10-31 18:21       ` [PATCH v4 5/8] net/mlx4: remove duplicate handling in Tx burst Matan Azrad
                         ` (4 subsequent siblings)
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-31 18:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Merge the tx_burst and mlx4_post_send functions to avoid checking the
remaining WQ space twice.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 355 +++++++++++++++++++++----------------------
 1 file changed, 170 insertions(+), 185 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index eddce82..0759e51 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -239,185 +239,6 @@ struct pv {
 }
 
 /**
- * Posts a single work request to a send queue.
- *
- * @param txq
- *   Target Tx queue.
- * @param pkt
- *   Packet to transmit.
- *
- * @return
- *   0 on success, negative errno value otherwise.
- */
-static inline int
-mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
-{
-	struct mlx4_wqe_ctrl_seg *ctrl;
-	struct mlx4_wqe_data_seg *dseg;
-	struct mlx4_sq *sq = &txq->msq;
-	struct rte_mbuf *buf;
-	union {
-		uint32_t flags;
-		uint16_t flags16[2];
-	} srcrb;
-	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
-	uint32_t lkey;
-	uintptr_t addr;
-	uint32_t owner_opcode = MLX4_OPCODE_SEND;
-	uint32_t byte_count;
-	int wqe_real_size;
-	int nr_txbbs;
-	struct pv *pv = (struct pv *)txq->bounce_buf;
-	int pv_counter = 0;
-
-	/* Calculate the needed work queue entry size for this packet. */
-	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-			pkt->nb_segs * sizeof(struct mlx4_wqe_data_seg);
-	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-	/*
-	 * Check that there is room for this WQE in the send queue and that
-	 * the WQE size is legal.
-	 */
-	if (((sq->head - sq->tail) + nr_txbbs +
-	     sq->headroom_txbbs) >= sq->txbb_cnt ||
-	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-		return -ENOSPC;
-	}
-	/* Get the control and data entries of the WQE. */
-	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
-	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-					    sizeof(struct mlx4_wqe_ctrl_seg));
-	/* Fill the data segments with buffer information. */
-	for (buf = pkt; buf != NULL; buf = buf->next, dseg++) {
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		rte_prefetch0((volatile void *)addr);
-		/* Handle WQE wraparound. */
-		if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
-			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
-		dseg->addr = rte_cpu_to_be_64(addr);
-		/* Memory region key for this memory pool. */
-		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
-#ifndef NDEBUG
-		if (unlikely(lkey == (uint32_t)-1)) {
-			/* MR does not exist. */
-			DEBUG("%p: unable to get MP <-> MR association",
-			      (void *)txq);
-			/*
-			 * Restamp entry in case of failure.
-			 * Make sure that size is written correctly
-			 * Note that we give ownership to the SW, not the HW.
-			 */
-			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-			mlx4_txq_stamp_freed_wqe(sq, head_idx,
-				     (sq->head & sq->txbb_cnt) ? 0 : 1);
-			return -EFAULT;
-		}
-#endif /* NDEBUG */
-		dseg->lkey = rte_cpu_to_be_32(lkey);
-		if (likely(buf->data_len)) {
-			byte_count = rte_cpu_to_be_32(buf->data_len);
-		} else {
-			/*
-			 * Zero length segment is treated as inline segment
-			 * with zero data.
-			 */
-			byte_count = RTE_BE32(0x80000000);
-		}
-		/*
-		 * If the data segment is not at the beginning of a
-		 * Tx basic block (TXBB) then write the byte count,
-		 * else postpone the writing to just before updating the
-		 * control segment.
-		 */
-		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-			/*
-			 * Need a barrier here before writing the byte_count
-			 * fields to make sure that all the data is visible
-			 * before the byte_count field is set.
-			 * Otherwise, if the segment begins a new cacheline,
-			 * the HCA prefetcher could grab the 64-byte chunk and
-			 * get a valid (!= 0xffffffff) byte count but stale
-			 * data, and end up sending the wrong data.
-			 */
-			rte_io_wmb();
-			dseg->byte_count = byte_count;
-		} else {
-			/*
-			 * This data segment starts at the beginning of a new
-			 * TXBB, so we need to postpone its byte_count writing
-			 * for later.
-			 */
-			pv[pv_counter].dseg = dseg;
-			pv[pv_counter++].val = byte_count;
-		}
-	}
-	/* Write the first DWORD of each TXBB save earlier. */
-	if (pv_counter) {
-		/* Need a barrier here before writing the byte_count. */
-		rte_io_wmb();
-		for (--pv_counter; pv_counter  >= 0; pv_counter--)
-			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
-	}
-	/* Fill the control parameters for this packet. */
-	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-	/*
-	 * For raw Ethernet, the SOLICIT flag is used to indicate that no ICRC
-	 * should be calculated.
-	 */
-	txq->elts_comp_cd -= nr_txbbs;
-	if (unlikely(txq->elts_comp_cd <= 0)) {
-		txq->elts_comp_cd = txq->elts_comp_cd_init;
-		srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
-				       MLX4_WQE_CTRL_CQ_UPDATE);
-	} else {
-		srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
-	}
-	/* Enable HW checksum offload if requested */
-	if (txq->csum &&
-	    (pkt->ol_flags &
-	     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
-		const uint64_t is_tunneled = (pkt->ol_flags &
-					      (PKT_TX_TUNNEL_GRE |
-					       PKT_TX_TUNNEL_VXLAN));
-
-		if (is_tunneled && txq->csum_l2tun) {
-			owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
-					MLX4_WQE_CTRL_IL4_HDR_CSUM;
-			if (pkt->ol_flags & PKT_TX_OUTER_IP_CKSUM)
-				srcrb.flags |=
-					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
-		} else {
-			srcrb.flags |= RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
-						MLX4_WQE_CTRL_TCP_UDP_CSUM);
-		}
-	}
-	if (txq->lb) {
-		/*
-		 * Copy destination MAC address to the WQE, this allows
-		 * loopback in eSwitch, so that VFs and PF can communicate
-		 * with each other.
-		 */
-		srcrb.flags16[0] = *(rte_pktmbuf_mtod(pkt, uint16_t *));
-		ctrl->imm = *(rte_pktmbuf_mtod_offset(pkt, uint32_t *,
-						      sizeof(uint16_t)));
-	} else {
-		ctrl->imm = 0;
-	}
-	ctrl->srcrb_flags = srcrb.flags;
-	/*
-	 * Make sure descriptor is fully written before
-	 * setting ownership bit (because HW can start
-	 * executing as soon as we do).
-	 */
-	rte_wmb();
-	ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
-					      ((sq->head & sq->txbb_cnt) ?
-					       MLX4_BIT_WQE_OWN : 0));
-	sq->head += nr_txbbs;
-	return 0;
-}
-
-/**
  * DPDK callback for Tx.
  *
  * @param dpdk_txq
@@ -440,7 +261,8 @@ struct pv {
 	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
-	int err;
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
 
 	assert(txq->elts_comp_cd != 0);
 	mlx4_txq_complete(txq);
@@ -461,6 +283,21 @@ struct pv {
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
+		uint32_t owner_opcode = MLX4_OPCODE_SEND;
+		struct mlx4_wqe_ctrl_seg *ctrl;
+		struct mlx4_wqe_data_seg *dseg;
+		struct rte_mbuf *sbuf;
+		union {
+			uint32_t flags;
+			uint16_t flags16[2];
+		} srcrb;
+		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+		uint32_t lkey;
+		uintptr_t addr;
+		uint32_t byte_count;
+		int wqe_real_size;
+		int nr_txbbs;
+		int pv_counter = 0;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -479,18 +316,166 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		/* Post the packet for sending. */
-		err = mlx4_post_send(txq, buf);
-		if (unlikely(err)) {
+		/*
+		 * Calculate the needed work queue entry size
+		 * for this packet.
+		 */
+		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+		/*
+		 * Check that there is room for this WQE in the send
+		 * queue and that the WQE size is legal.
+		 */
+		if (((sq->head - sq->tail) + nr_txbbs +
+		     sq->headroom_txbbs) >= sq->txbb_cnt ||
+		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
 			elt->buf = NULL;
-			goto stop;
+			break;
 		}
+		/* Get the control and data entries of the WQE. */
+		ctrl = (struct mlx4_wqe_ctrl_seg *)
+				mlx4_get_send_wqe(sq, head_idx);
+		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+				sizeof(struct mlx4_wqe_ctrl_seg));
+		/* Fill the data segments with buffer information. */
+		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			rte_prefetch0((volatile void *)addr);
+			/* Handle WQE wraparound. */
+			if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
+				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+			dseg->addr = rte_cpu_to_be_64(addr);
+			/* Memory region key (big endian). */
+			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+			dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+			if (unlikely(dseg->lkey ==
+				rte_cpu_to_be_32((uint32_t)-1))) {
+				/* MR does not exist. */
+				DEBUG("%p: unable to get MP <-> MR association",
+				      (void *)txq);
+				/*
+				 * Restamp entry in case of failure.
+				 * Make sure that size is written correctly
+				 * Note that we give ownership to the SW,
+				 * not the HW.
+				 */
+				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+				mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					     (sq->head & sq->txbb_cnt) ? 0 : 1);
+				elt->buf = NULL;
+				break;
+			}
+#endif /* NDEBUG */
+			if (likely(sbuf->data_len)) {
+				byte_count = rte_cpu_to_be_32(sbuf->data_len);
+			} else {
+				/*
+				 * Zero length segment is treated as inline
+				 * segment with zero data.
+				 */
+				byte_count = RTE_BE32(0x80000000);
+			}
+			/*
+			 * If the data segment is not at the beginning
+			 * of a Tx basic block (TXBB) then write the
+			 * byte count, else postpone the writing to
+			 * just before updating the control segment.
+			 */
+			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+				/*
+				 * Need a barrier here before writing the
+				 * byte_count fields to make sure that all the
+				 * data is visible before the byte_count field
+				 * is set. otherwise, if the segment begins a
+				 * new cacheline, the HCA prefetcher could grab
+				 * the 64-byte chunk and get a valid
+				 * (!= 0xffffffff) byte count but stale data,
+				 * and end up sending the wrong data.
+				 */
+				rte_io_wmb();
+				dseg->byte_count = byte_count;
+			} else {
+				/*
+				 * This data segment starts at the beginning of
+				 * a new TXBB, so we need to postpone its
+				 * byte_count writing for later.
+				 */
+				pv[pv_counter].dseg = dseg;
+				pv[pv_counter++].val = byte_count;
+			}
+		}
+		/* Write the first DWORD of each TXBB save earlier. */
+		if (pv_counter) {
+			/* Need a barrier before writing the byte_count. */
+			rte_io_wmb();
+			for (--pv_counter; pv_counter  >= 0; pv_counter--)
+				pv[pv_counter].dseg->byte_count =
+						pv[pv_counter].val;
+		}
+		/* Fill the control parameters for this packet. */
+		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+		/*
+		 * For raw Ethernet, the SOLICIT flag is used to indicate
+		 * that no ICRC should be calculated.
+		 */
+		txq->elts_comp_cd -= nr_txbbs;
+		if (unlikely(txq->elts_comp_cd <= 0)) {
+			txq->elts_comp_cd = txq->elts_comp_cd_init;
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
+					       MLX4_WQE_CTRL_CQ_UPDATE);
+		} else {
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
+		}
+		/* Enable HW checksum offload if requested */
+		if (txq->csum &&
+		    (buf->ol_flags &
+		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
+			const uint64_t is_tunneled = (buf->ol_flags &
+						      (PKT_TX_TUNNEL_GRE |
+						       PKT_TX_TUNNEL_VXLAN));
+
+			if (is_tunneled && txq->csum_l2tun) {
+				owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
+						MLX4_WQE_CTRL_IL4_HDR_CSUM;
+				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+					srcrb.flags |=
+					    RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
+			} else {
+				srcrb.flags |=
+					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+						MLX4_WQE_CTRL_TCP_UDP_CSUM);
+			}
+		}
+		if (txq->lb) {
+			/*
+			 * Copy destination MAC address to the WQE, this allows
+			 * loopback in eSwitch, so that VFs and PF can
+			 * communicate with each other.
+			 */
+			srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
+			ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
+					      sizeof(uint16_t)));
+		} else {
+			ctrl->imm = 0;
+		}
+		ctrl->srcrb_flags = srcrb.flags;
+		/*
+		 * Make sure descriptor is fully written before
+		 * setting ownership bit (because HW can start
+		 * executing as soon as we do).
+		 */
+		rte_wmb();
+		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
+					      ((sq->head & sq->txbb_cnt) ?
+						       MLX4_BIT_WQE_OWN : 0));
+		sq->head += nr_txbbs;
 		elt->buf = buf;
 		bytes_sent += buf->pkt_len;
 		++elts_comp;
 		elts_head = elts_head_next;
 	}
-stop:
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
 		return 0;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v4 5/8] net/mlx4: remove duplicate handling in Tx burst
  2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (3 preceding siblings ...)
  2017-10-31 18:21       ` [PATCH v4 4/8] net/mlx4: merge Tx path functions Matan Azrad
@ 2017-10-31 18:21       ` Matan Azrad
  2017-11-02 13:42         ` Adrien Mazarguil
  2017-10-31 18:21       ` [PATCH v4 6/8] net/mlx4: separate Tx segment cases Matan Azrad
                         ` (3 subsequent siblings)
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-31 18:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Remove usage of a variable that counts packets for completion, as it
adds no information beyond the packet counter.

Remove the check for no space in the elements ring, which is already
covered by the regular Tx flow.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 0759e51..0a6cbf1 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -257,7 +257,6 @@ struct pv {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	unsigned int elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int elts_comp = 0;
 	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
@@ -273,8 +272,6 @@ struct pv {
 	assert(max <= elts_n);
 	/* Always leave one free entry in the ring. */
 	--max;
-	if (max == 0)
-		return 0;
 	if (max > pkts_n)
 		max = pkts_n;
 	for (i = 0; (i != max); ++i) {
@@ -473,7 +470,6 @@ struct pv {
 		sq->head += nr_txbbs;
 		elt->buf = buf;
 		bytes_sent += buf->pkt_len;
-		++elts_comp;
 		elts_head = elts_head_next;
 	}
 	/* Take a shortcut if nothing must be sent. */
@@ -487,7 +483,7 @@ struct pv {
 	/* Ring QP doorbell. */
 	rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
 	txq->elts_head = elts_head;
-	txq->elts_comp += elts_comp;
+	txq->elts_comp += i;
 	return i;
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v4 6/8] net/mlx4: separate Tx segment cases
  2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (4 preceding siblings ...)
  2017-10-31 18:21       ` [PATCH v4 5/8] net/mlx4: remove duplicate handling in Tx burst Matan Azrad
@ 2017-10-31 18:21       ` Matan Azrad
  2017-11-02 13:43         ` Adrien Mazarguil
  2017-10-31 18:21       ` [PATCH v4 7/8] net/mlx4: fix HW memory optimizations careless Matan Azrad
                         ` (2 subsequent siblings)
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-31 18:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Optimize the single-segment case by processing it in a separate block,
which avoids checks, calculations and barriers relevant only to the
multi-segment case.

Call a dedicated function to handle the multi-segment case.
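
Schematically, the per-packet dispatch in the burst loop becomes
(simplified sketch; see the full diff below):

  if (buf->nb_segs == 1) {
          /* Single data segment built inline: no bounce buffer and a
           * single byte_count store. */
          nr_txbbs = 1;
  } else {
          /* Multi-segment WQEs are built by a dedicated function. */
          nr_txbbs = mlx4_tx_burst_segs(buf, txq, &ctrl);
          if (nr_txbbs < 0) {
                  elt->buf = NULL;
                  break;
          }
  }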

Signed-off-by: Matan Azrad <matan@mellanox.com>
Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 236 +++++++++++++++++++++++++++----------------
 1 file changed, 151 insertions(+), 85 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 0a6cbf1..176000f 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -62,6 +62,9 @@
 #include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
+#define WQE_ONE_DATA_SEG_SIZE \
+	(sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
+
 /**
  * Pointer-value pair structure used in tx_post_send for saving the first
  * DWORD (32 byte) of a TXBB.
@@ -141,21 +144,18 @@ struct pv {
  *   0 on success, -1 on failure.
  */
 static int
-mlx4_txq_complete(struct txq *txq)
+mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
+				  struct mlx4_sq *sq)
 {
 	unsigned int elts_comp = txq->elts_comp;
 	unsigned int elts_tail = txq->elts_tail;
-	const unsigned int elts_n = txq->elts_n;
 	struct mlx4_cq *cq = &txq->mcq;
-	struct mlx4_sq *sq = &txq->msq;
 	struct mlx4_cqe *cqe;
 	uint32_t cons_index = cq->cons_index;
 	uint16_t new_index;
 	uint16_t nr_txbbs = 0;
 	int pkts = 0;
 
-	if (unlikely(elts_comp == 0))
-		return 0;
 	/*
 	 * Traverse over all CQ entries reported and handle each WQ entry
 	 * reported by them.
@@ -238,6 +238,119 @@ struct pv {
 	return buf->pool;
 }
 
+static int
+mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
+			       struct mlx4_wqe_ctrl_seg **pctrl)
+{
+	int wqe_real_size;
+	int nr_txbbs;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
+	struct mlx4_sq *sq = &txq->msq;
+	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_data_seg *dseg;
+	struct rte_mbuf *sbuf;
+	uint32_t lkey;
+	uintptr_t addr;
+	uint32_t byte_count;
+	int pv_counter = 0;
+
+	/* Calculate the needed work queue entry size for this packet. */
+	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+		buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+	/*
+	 * Check that there is room for this WQE in the send queue and that
+	 * the WQE size is legal.
+	 */
+	if (((sq->head - sq->tail) + nr_txbbs +
+				sq->headroom_txbbs) >= sq->txbb_cnt ||
+			nr_txbbs > MLX4_MAX_WQE_TXBBS) {
+		return -1;
+	}
+	/* Get the control and data entries of the WQE. */
+	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
+	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+			sizeof(struct mlx4_wqe_ctrl_seg));
+	*pctrl = ctrl;
+	/* Fill the data segments with buffer information. */
+	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+		rte_prefetch0((volatile void *)addr);
+		/* Handle WQE wraparound. */
+		if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
+			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+		dseg->addr = rte_cpu_to_be_64(addr);
+		/* Memory region key (big endian) for this memory pool. */
+		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+		dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+		/* Calculate the needed work queue entry size for this packet */
+		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
+			/* MR does not exist. */
+			DEBUG("%p: unable to get MP <-> MR association",
+					(void *)txq);
+			/*
+			 * Restamp entry in case of failure.
+			 * Make sure that size is written correctly
+			 * Note that we give ownership to the SW, not the HW.
+			 */
+			wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+			mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					(sq->head & sq->txbb_cnt) ? 0 : 1);
+			return -1;
+		}
+#endif /* NDEBUG */
+		if (likely(sbuf->data_len)) {
+			byte_count = rte_cpu_to_be_32(sbuf->data_len);
+		} else {
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			byte_count = RTE_BE32(0x80000000);
+		}
+		/*
+		 * If the data segment is not at the beginning of a
+		 * Tx basic block (TXBB) then write the byte count,
+		 * else postpone the writing to just before updating the
+		 * control segment.
+		 */
+		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+			/*
+			 * Need a barrier here before writing the byte_count
+			 * fields to make sure that all the data is visible
+			 * before the byte_count field is set.
+			 * Otherwise, if the segment begins a new cacheline,
+			 * the HCA prefetcher could grab the 64-byte chunk and
+			 * get a valid (!= 0xffffffff) byte count but stale
+			 * data, and end up sending the wrong data.
+			 */
+			rte_io_wmb();
+			dseg->byte_count = byte_count;
+		} else {
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[pv_counter].dseg = dseg;
+			pv[pv_counter++].val = byte_count;
+		}
+	}
+	/* Write the first DWORD of each TXBB save earlier. */
+	if (pv_counter) {
+		/* Need a barrier here before writing the byte_count. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
+	}
+	/* Fill the control parameters for this packet. */
+	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+	return nr_txbbs;
+}
 /**
  * DPDK callback for Tx.
  *
@@ -261,10 +374,11 @@ struct pv {
 	unsigned int i;
 	unsigned int max;
 	struct mlx4_sq *sq = &txq->msq;
-	struct pv *pv = (struct pv *)txq->bounce_buf;
+	int nr_txbbs;
 
 	assert(txq->elts_comp_cd != 0);
-	mlx4_txq_complete(txq);
+	if (likely(txq->elts_comp != 0))
+		mlx4_txq_complete(txq, elts_n, sq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
@@ -283,7 +397,6 @@ struct pv {
 		uint32_t owner_opcode = MLX4_OPCODE_SEND;
 		struct mlx4_wqe_ctrl_seg *ctrl;
 		struct mlx4_wqe_data_seg *dseg;
-		struct rte_mbuf *sbuf;
 		union {
 			uint32_t flags;
 			uint16_t flags16[2];
@@ -291,10 +404,6 @@ struct pv {
 		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
 		uint32_t lkey;
 		uintptr_t addr;
-		uint32_t byte_count;
-		int wqe_real_size;
-		int nr_txbbs;
-		int pv_counter = 0;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -313,38 +422,29 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		/*
-		 * Calculate the needed work queue entry size
-		 * for this packet.
-		 */
-		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
-		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-		/*
-		 * Check that there is room for this WQE in the send
-		 * queue and that the WQE size is legal.
-		 */
-		if (((sq->head - sq->tail) + nr_txbbs +
-		     sq->headroom_txbbs) >= sq->txbb_cnt ||
-		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-			elt->buf = NULL;
-			break;
-		}
-		/* Get the control and data entries of the WQE. */
-		ctrl = (struct mlx4_wqe_ctrl_seg *)
-				mlx4_get_send_wqe(sq, head_idx);
-		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-				sizeof(struct mlx4_wqe_ctrl_seg));
-		/* Fill the data segments with buffer information. */
-		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
-			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+		if (buf->nb_segs == 1) {
+			/*
+			 * Check that there is room for this WQE in the send
+			 * queue and that the WQE size is legal
+			 */
+			if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs) >=
+			     sq->txbb_cnt || 1 > MLX4_MAX_WQE_TXBBS) {
+				elt->buf = NULL;
+				break;
+			}
+			/* Get the control and data entries of the WQE. */
+			ctrl = (struct mlx4_wqe_ctrl_seg *)
+					mlx4_get_send_wqe(sq, head_idx);
+			dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+					sizeof(struct mlx4_wqe_ctrl_seg));
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			rte_prefetch0((volatile void *)addr);
 			/* Handle WQE wraparound. */
 			if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
 				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
 			dseg->addr = rte_cpu_to_be_64(addr);
 			/* Memory region key (big endian). */
-			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
 			dseg->lkey = rte_cpu_to_be_32(lkey);
 #ifndef NDEBUG
 			if (unlikely(dseg->lkey ==
@@ -358,61 +458,27 @@ struct pv {
 				 * Note that we give ownership to the SW,
 				 * not the HW.
 				 */
-				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+				ctrl->fence_size =
+					(WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
 				mlx4_txq_stamp_freed_wqe(sq, head_idx,
 					     (sq->head & sq->txbb_cnt) ? 0 : 1);
 				elt->buf = NULL;
 				break;
 			}
 #endif /* NDEBUG */
-			if (likely(sbuf->data_len)) {
-				byte_count = rte_cpu_to_be_32(sbuf->data_len);
-			} else {
-				/*
-				 * Zero length segment is treated as inline
-				 * segment with zero data.
-				 */
-				byte_count = RTE_BE32(0x80000000);
-			}
-			/*
-			 * If the data segment is not at the beginning
-			 * of a Tx basic block (TXBB) then write the
-			 * byte count, else postpone the writing to
-			 * just before updating the control segment.
-			 */
-			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-				/*
-				 * Need a barrier here before writing the
-				 * byte_count fields to make sure that all the
-				 * data is visible before the byte_count field
-				 * is set. otherwise, if the segment begins a
-				 * new cacheline, the HCA prefetcher could grab
-				 * the 64-byte chunk and get a valid
-				 * (!= 0xffffffff) byte count but stale data,
-				 * and end up sending the wrong data.
-				 */
-				rte_io_wmb();
-				dseg->byte_count = byte_count;
-			} else {
-				/*
-				 * This data segment starts at the beginning of
-				 * a new TXBB, so we need to postpone its
-				 * byte_count writing for later.
-				 */
-				pv[pv_counter].dseg = dseg;
-				pv[pv_counter++].val = byte_count;
-			}
-		}
-		/* Write the first DWORD of each TXBB save earlier. */
-		if (pv_counter) {
-			/* Need a barrier before writing the byte_count. */
+			/* Need a barrier here before byte count store. */
 			rte_io_wmb();
-			for (--pv_counter; pv_counter  >= 0; pv_counter--)
-				pv[pv_counter].dseg->byte_count =
-						pv[pv_counter].val;
+			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
+			/* Fill the control parameters for this packet. */
+			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+			nr_txbbs = 1;
+		} else {
+			nr_txbbs = mlx4_tx_burst_segs(buf, txq, &ctrl);
+			if (nr_txbbs < 0) {
+				elt->buf = NULL;
+				break;
+			}
 		}
-		/* Fill the control parameters for this packet. */
-		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
 		/*
 		 * For raw Ethernet, the SOLICIT flag is used to indicate
 		 * that no ICRC should be calculated.
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v4 7/8] net/mlx4: fix HW memory optimizations careless
  2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (5 preceding siblings ...)
  2017-10-31 18:21       ` [PATCH v4 6/8] net/mlx4: separate Tx segment cases Matan Azrad
@ 2017-10-31 18:21       ` Matan Azrad
  2017-11-02 13:43         ` Adrien Mazarguil
  2017-10-31 18:21       ` [PATCH v4 8/8] net/mlx4: mitigate Tx path memory barriers Matan Azrad
  2017-11-02 13:41       ` [PATCH] net/mlx4: fix missing include Adrien Mazarguil
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-31 18:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Make all Rx/Tx memories used for HW negotiation volatile to ensure that
no compiler optimization suppresses or reorders load and store commands.
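
As a minimal illustration of the hazard (not the driver code), consider
polling a word that only the device writes:

  #include <stdint.h>

  /* Without volatile the compiler may load *done once and spin on a
   * cached value, never observing the DMA write from the HW. */
  static int
  wait_for_hw(volatile const uint32_t *done)
  {
          while (*done == 0)
                  ; /* re-read from memory on every iteration */
          return 0;
  }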

Fixes: c3c977bbecbd ("net/mlx4: add Tx bypassing Verbs")
Fixes: 9f57340a8087 ("net/mlx4: restore Rx offloads")
Fixes: 6681b845034c ("net/mlx4: add Rx bypassing Verbs")
Fixes: 62e96ffb93ad ("net/mlx4: fix no Rx interrupts")

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_prm.h  | 18 ++++++------
 drivers/net/mlx4/mlx4_rxtx.c | 67 ++++++++++++++++++++++++--------------------
 2 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index b0fd982..6d10b4e 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -80,14 +80,14 @@ enum {
 
 /* Send queue information. */
 struct mlx4_sq {
-	uint8_t *buf; /**< SQ buffer. */
-	uint8_t *eob; /**< End of SQ buffer */
+	volatile uint8_t *buf; /**< SQ buffer. */
+	volatile uint8_t *eob; /**< End of SQ buffer */
 	uint32_t head; /**< SQ head counter in units of TXBBS. */
 	uint32_t tail; /**< SQ tail counter in units of TXBBS. */
 	uint32_t txbb_cnt; /**< Num of WQEBB in the Q (should be ^2). */
 	uint32_t txbb_cnt_mask; /**< txbbs_cnt mask (txbb_cnt is ^2). */
 	uint32_t headroom_txbbs; /**< Num of txbbs that should be kept free. */
-	uint32_t *db; /**< Pointer to the doorbell. */
+	volatile uint32_t *db; /**< Pointer to the doorbell. */
 	uint32_t doorbell_qpn; /**< qp number to write to the doorbell. */
 };
 
@@ -101,10 +101,10 @@ struct mlx4_sq {
 /* Completion queue information. */
 struct mlx4_cq {
 	void *cq_uar; /**< CQ user access region. */
-	void *cq_db_reg; /**< CQ doorbell register. */
-	uint32_t *set_ci_db; /**< Pointer to the completion queue doorbell. */
-	uint32_t *arm_db; /**< Pointer to doorbell for arming Rx events. */
-	uint8_t *buf; /**< Pointer to the completion queue buffer. */
+	volatile void *cq_db_reg; /**< CQ doorbell register. */
+	volatile uint32_t *set_ci_db; /**< Pointer to the CQ doorbell. */
+	volatile uint32_t *arm_db; /**< Arming Rx events doorbell. */
+	volatile uint8_t *buf; /**< Pointer to the completion queue buffer. */
 	uint32_t cqe_cnt; /**< Number of entries in the queue. */
 	uint32_t cqe_64:1; /**< CQ entry size is 64 bytes. */
 	uint32_t cons_index; /**< Last queue entry that was handled. */
@@ -128,10 +128,10 @@ struct mlx4_cq {
  * @return
  *   Pointer to CQE entry.
  */
-static inline struct mlx4_cqe *
+static inline volatile struct mlx4_cqe *
 mlx4_get_cqe(struct mlx4_cq *cq, uint32_t index)
 {
-	return (struct mlx4_cqe *)(cq->buf +
+	return (volatile struct mlx4_cqe *)(cq->buf +
 				   ((index & (cq->cqe_cnt - 1)) <<
 				    (5 + cq->cqe_64)) +
 				   (cq->cqe_64 << 5));
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 176000f..bd6d888 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -70,7 +70,7 @@
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-	struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_data_seg *dseg;
 	uint32_t val;
 };
 
@@ -98,14 +98,15 @@ struct pv {
 {
 	uint32_t stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
 					  (!!owner << MLX4_SQ_STAMP_SHIFT));
-	uint8_t *wqe = mlx4_get_send_wqe(sq, (index & sq->txbb_cnt_mask));
-	uint32_t *ptr = (uint32_t *)wqe;
+	volatile uint8_t *wqe = mlx4_get_send_wqe(sq,
+						(index & sq->txbb_cnt_mask));
+	volatile uint32_t *ptr = (volatile uint32_t *)wqe;
 	int i;
 	int txbbs_size;
 	int num_txbbs;
 
 	/* Extract the size from the control segment of the WQE. */
-	num_txbbs = MLX4_SIZE_TO_TXBBS((((struct mlx4_wqe_ctrl_seg *)
+	num_txbbs = MLX4_SIZE_TO_TXBBS((((volatile struct mlx4_wqe_ctrl_seg *)
 					 wqe)->fence_size & 0x3f) << 4);
 	txbbs_size = num_txbbs * MLX4_TXBB_SIZE;
 	/* Optimize the common case when there is no wrap-around. */
@@ -120,8 +121,8 @@ struct pv {
 		for (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) {
 			*ptr = stamp;
 			ptr += MLX4_SQ_STAMP_DWORDS;
-			if ((uint8_t *)ptr >= sq->eob) {
-				ptr = (uint32_t *)sq->buf;
+			if ((volatile uint8_t *)ptr >= sq->eob) {
+				ptr = (volatile uint32_t *)sq->buf;
 				stamp ^= RTE_BE32(0x80000000);
 			}
 		}
@@ -150,7 +151,7 @@ struct pv {
 	unsigned int elts_comp = txq->elts_comp;
 	unsigned int elts_tail = txq->elts_tail;
 	struct mlx4_cq *cq = &txq->mcq;
-	struct mlx4_cqe *cqe;
+	volatile struct mlx4_cqe *cqe;
 	uint32_t cons_index = cq->cons_index;
 	uint16_t new_index;
 	uint16_t nr_txbbs = 0;
@@ -161,7 +162,7 @@ struct pv {
 	 * reported by them.
 	 */
 	do {
-		cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
+		cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
 		if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
 		    !!(cons_index & cq->cqe_cnt)))
 			break;
@@ -172,8 +173,8 @@ struct pv {
 #ifndef NDEBUG
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 			     MLX4_CQE_OPCODE_ERROR)) {
-			struct mlx4_err_cqe *cqe_err =
-				(struct mlx4_err_cqe *)cqe;
+			volatile struct mlx4_err_cqe *cqe_err =
+				(volatile struct mlx4_err_cqe *)cqe;
 			ERROR("%p CQE error - vendor syndrome: 0x%x"
 			      " syndrome: 0x%x\n",
 			      (void *)txq, cqe_err->vendor_err,
@@ -240,15 +241,15 @@ struct pv {
 
 static int
 mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
-			       struct mlx4_wqe_ctrl_seg **pctrl)
+				   volatile struct mlx4_wqe_ctrl_seg **pctrl)
 {
 	int wqe_real_size;
 	int nr_txbbs;
 	struct pv *pv = (struct pv *)txq->bounce_buf;
 	struct mlx4_sq *sq = &txq->msq;
 	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
-	struct mlx4_wqe_ctrl_seg *ctrl;
-	struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl;
+	volatile struct mlx4_wqe_data_seg *dseg;
 	struct rte_mbuf *sbuf;
 	uint32_t lkey;
 	uintptr_t addr;
@@ -256,8 +257,8 @@ struct pv {
 	int pv_counter = 0;
 
 	/* Calculate the needed work queue entry size for this packet. */
-	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-		buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+	wqe_real_size = sizeof(volatile struct mlx4_wqe_ctrl_seg) +
+		buf->nb_segs * sizeof(volatile struct mlx4_wqe_data_seg);
 	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
 	/*
 	 * Check that there is room for this WQE in the send queue and that
@@ -269,17 +270,18 @@ struct pv {
 		return -1;
 	}
 	/* Get the control and data entries of the WQE. */
-	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
-	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-			sizeof(struct mlx4_wqe_ctrl_seg));
+	ctrl = (volatile struct mlx4_wqe_ctrl_seg *)
+			mlx4_get_send_wqe(sq, head_idx);
+	dseg = (volatile struct mlx4_wqe_data_seg *)
+			((uintptr_t)ctrl + sizeof(struct mlx4_wqe_ctrl_seg));
 	*pctrl = ctrl;
 	/* Fill the data segments with buffer information. */
 	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
 		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
 		rte_prefetch0((volatile void *)addr);
 		/* Handle WQE wraparound. */
-		if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
-			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+		if (dseg >= (volatile struct mlx4_wqe_data_seg *)sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)sq->buf;
 		dseg->addr = rte_cpu_to_be_64(addr);
 		/* Memory region key (big endian) for this memory pool. */
 		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
@@ -395,8 +397,8 @@ struct pv {
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
 		uint32_t owner_opcode = MLX4_OPCODE_SEND;
-		struct mlx4_wqe_ctrl_seg *ctrl;
-		struct mlx4_wqe_data_seg *dseg;
+		volatile struct mlx4_wqe_ctrl_seg *ctrl;
+		volatile struct mlx4_wqe_data_seg *dseg;
 		union {
 			uint32_t flags;
 			uint16_t flags16[2];
@@ -433,15 +435,18 @@ struct pv {
 				break;
 			}
 			/* Get the control and data entries of the WQE. */
-			ctrl = (struct mlx4_wqe_ctrl_seg *)
+			ctrl = (volatile struct mlx4_wqe_ctrl_seg *)
 					mlx4_get_send_wqe(sq, head_idx);
-			dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)ctrl +
 					sizeof(struct mlx4_wqe_ctrl_seg));
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			rte_prefetch0((volatile void *)addr);
 			/* Handle WQE wraparound. */
-			if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
-				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+			if (dseg >=
+				(volatile struct mlx4_wqe_data_seg *)sq->eob)
+				dseg = (volatile struct mlx4_wqe_data_seg *)
+						sq->buf;
 			dseg->addr = rte_cpu_to_be_64(addr);
 			/* Memory region key (big endian). */
 			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
@@ -633,7 +638,7 @@ struct pv {
  *   CQE checksum information.
  */
 static inline uint32_t
-mlx4_cqe_flags(struct mlx4_cqe *cqe, int csum, int csum_l2tun)
+mlx4_cqe_flags(volatile struct mlx4_cqe *cqe, int csum, int csum_l2tun)
 {
 	uint32_t flags = 0;
 
@@ -666,13 +671,13 @@ struct pv {
  *   Number of bytes of the CQE, 0 in case there is no completion.
  */
 static unsigned int
-mlx4_cq_poll_one(struct rxq *rxq, struct mlx4_cqe **out)
+mlx4_cq_poll_one(struct rxq *rxq, volatile struct mlx4_cqe **out)
 {
 	int ret = 0;
-	struct mlx4_cqe *cqe = NULL;
+	volatile struct mlx4_cqe *cqe = NULL;
 	struct mlx4_cq *cq = &rxq->mcq;
 
-	cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
+	cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
 	if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
 	    !!(cq->cons_index & cq->cqe_cnt))
 		goto out;
@@ -717,7 +722,7 @@ struct pv {
 	int len = 0;
 
 	while (pkts_n) {
-		struct mlx4_cqe *cqe;
+		volatile struct mlx4_cqe *cqe;
 		uint32_t idx = rq_ci & wr_cnt;
 		struct rte_mbuf *rep = (*rxq->elts)[idx];
 		volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v4 8/8] net/mlx4: mitigate Tx path memory barriers
  2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (6 preceding siblings ...)
  2017-10-31 18:21       ` [PATCH v4 7/8] net/mlx4: fix HW memory optimizations careless Matan Azrad
@ 2017-10-31 18:21       ` Matan Azrad
  2017-11-02 13:43         ` Adrien Mazarguil
  2017-11-02 13:41       ` [PATCH] net/mlx4: fix missing include Adrien Mazarguil
  8 siblings, 1 reply; 84+ messages in thread
From: Matan Azrad @ 2017-10-31 18:21 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Replace most of the memory barriers with IO memory barriers since they
all target DRAM; this improves code efficiency on systems that force
store ordering between different addresses.

Only the doorbell register store should be protected by a full memory
barrier since it targets the PCI memory domain.

Limit the IO memory barrier before the byte count store to systems
with a cache line size smaller than 64B (the TXBB size).

This patch improves Tx performance by 0.2 Mpps for single-segment 64B
packets over 1 queue with 1 core.
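
The resulting ordering discipline looks roughly as follows (simplified
sketch, not the actual code):

  /* Descriptor stores target coherent DRAM: an IO barrier is enough to
   * order them before the ownership bit that hands the WQE to HW. */
  dseg->byte_count = byte_count;
  rte_io_wmb();
  ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
                                        ((sq->head & sq->txbb_cnt) ?
                                         MLX4_BIT_WQE_OWN : 0));
  /* The doorbell targets the PCI memory domain: keep a full write
   * barrier before ringing it. */
  rte_wmb();
  rte_write32(txq->msq.doorbell_qpn, txq->msq.db);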

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index bd6d888..fc7a922 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -169,7 +169,7 @@ struct pv {
 		/*
 		 * Make sure we read the CQE after we read the ownership bit.
 		 */
-		rte_rmb();
+		rte_io_rmb();
 #ifndef NDEBUG
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 			     MLX4_CQE_OPCODE_ERROR)) {
@@ -197,14 +197,9 @@ struct pv {
 	} while (1);
 	if (unlikely(pkts == 0))
 		return 0;
-	/*
-	 * Update CQ.
-	 * To prevent CQ overflow we first update CQ consumer and only then
-	 * the ring consumer.
-	 */
+	/* Update CQ. */
 	cq->cons_index = cons_index;
 	*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & MLX4_CQ_DB_CI_MASK);
-	rte_wmb();
 	sq->tail = sq->tail + nr_txbbs;
 	/* Update the list of packets posted for transmission. */
 	elts_comp -= pkts;
@@ -321,6 +316,7 @@ struct pv {
 		 * control segment.
 		 */
 		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+#if RTE_CACHE_LINE_SIZE < 64
 			/*
 			 * Need a barrier here before writing the byte_count
 			 * fields to make sure that all the data is visible
@@ -331,6 +327,7 @@ struct pv {
 			 * data, and end up sending the wrong data.
 			 */
 			rte_io_wmb();
+#endif /* RTE_CACHE_LINE_SIZE */
 			dseg->byte_count = byte_count;
 		} else {
 			/*
@@ -471,8 +468,7 @@ struct pv {
 				break;
 			}
 #endif /* NDEBUG */
-			/* Need a barrier here before byte count store. */
-			rte_io_wmb();
+			/* Never TXBB-aligned, so no compiler barrier needed. */
 			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
 			/* Fill the control parameters for this packet. */
 			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
@@ -534,7 +530,7 @@ struct pv {
 		 * setting ownership bit (because HW can start
 		 * executing as soon as we do).
 		 */
-		rte_wmb();
+		rte_io_wmb();
 		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
 					      ((sq->head & sq->txbb_cnt) ?
 						       MLX4_BIT_WQE_OWN : 0));
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH] net/mlx4: fix missing include
  2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (7 preceding siblings ...)
  2017-10-31 18:21       ` [PATCH v4 8/8] net/mlx4: mitigate Tx path memory barriers Matan Azrad
@ 2017-11-02 13:41       ` Adrien Mazarguil
  2017-11-02 20:35         ` Ferruh Yigit
  8 siblings, 1 reply; 84+ messages in thread
From: Adrien Mazarguil @ 2017-11-02 13:41 UTC (permalink / raw)
  To: Ferruh Yigit, Matan Azrad, Ophir Munk; +Cc: dev

Fixes: 76df01ff622f ("net/mlx4: separate debugging macros")

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4_utils.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
index bebd4ae..dc529c9 100644
--- a/drivers/net/mlx4/mlx4_utils.h
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -34,6 +34,7 @@
 #ifndef MLX4_UTILS_H_
 #define MLX4_UTILS_H_
 
+#include <assert.h>
 #include <stddef.h>
 #include <stdio.h>
 
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* Re: [PATCH v4 2/8] net/mlx4: associate MR to MP in a short function
  2017-10-31 18:21       ` [PATCH v4 2/8] net/mlx4: associate MR to MP in a short function Matan Azrad
@ 2017-11-02 13:42         ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-11-02 13:42 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

Hi Matan/Ophir,

On Tue, Oct 31, 2017 at 06:21:27PM +0000, Matan Azrad wrote:
> From: Ophir Munk <ophirmu@mellanox.com>
> 
> Associate memory region to mempool (on data path) in a short function.
> Handle the less common case of adding a new memory region to mempool
> in a separate function.
> 
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>

Thanks, a few minor comments below to address compilation issues, besides
that:

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

There's an issue with a missing inclusion of assert.h. It should be
addressed before applying this series by the patch I submitted separately:

 "net/mlx4: fix missing include"

Can you include it in front of v5 to make things easier for Ferruh?

> ---
>  drivers/net/mlx4/mlx4.h      |  2 ++
>  drivers/net/mlx4/mlx4_mr.c   | 47 ++++++++++++++++++++++++++++++++++++
>  drivers/net/mlx4/mlx4_rxtx.c | 57 --------------------------------------------
>  drivers/net/mlx4/mlx4_rxtx.h | 31 +++++++++++++++++++++++-
>  4 files changed, 79 insertions(+), 58 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
> index e0a9853..70cf453 100644
> --- a/drivers/net/mlx4/mlx4.h
> +++ b/drivers/net/mlx4/mlx4.h
> @@ -160,5 +160,7 @@ int mlx4_flow_ctrl_set(struct rte_eth_dev *dev,
>  /* mlx4_mr.c */
>  
>  struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
> +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> +			 uint32_t i);
>  
>  #endif /* RTE_PMD_MLX4_H_ */
> diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
> index 9700884..75ee32f 100644
> --- a/drivers/net/mlx4/mlx4_mr.c
> +++ b/drivers/net/mlx4/mlx4_mr.c
> @@ -55,6 +55,7 @@

You need to include inttypes.h for PRIu32 definition.

Even though it doesn't cause a compilation failure, rte_branch_prediction.h
must also be included here for unlikely().
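
i.e. something along these lines at the top of the include block
(sketch):

  #include <inttypes.h>              /* PRIu32 */
  #include <rte_branch_prediction.h> /* unlikely() */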

>  #include <rte_memory.h>
>  #include <rte_mempool.h>
>  
> +#include "mlx4_rxtx.h"
>  #include "mlx4_utils.h"
>  
>  struct mlx4_check_mempool_data {
> @@ -181,3 +182,49 @@ struct ibv_mr *
>  		rte_errno = errno ? errno : EINVAL;
>  	return mr;
>  }
> +
> +/**
> + * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
> + * If mp2mr[] is full, remove an entry first.
> + *
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param[in] mp
> + *   Memory pool for which a memory region lkey must be added.
> + * @param[in] i
> + *   Index in memory pool (MP) where to add memory region (MR).
> + *
> + * @return
> + *   Added mr->lkey on success, (uint32_t)-1 on failure.
> + */
> +uint32_t
> +mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
> +{
> +	struct ibv_mr *mr;
> +
> +	/* Add a new entry, register MR first. */
> +	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> +	      (void *)txq, mp->name, (void *)mp);
> +	mr = mlx4_mp2mr(txq->priv->pd, mp);
> +	if (unlikely(mr == NULL)) {
> +		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
> +		      (void *)txq);
> +		return (uint32_t)-1;
> +	}
> +	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
> +		/* Table is full, remove oldest entry. */
> +		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
> +		      (void *)txq);
> +		--i;
> +		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
> +		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
> +			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
> +	}
> +	/* Store the new entry. */
> +	txq->mp2mr[i].mp = mp;
> +	txq->mp2mr[i].mr = mr;
> +	txq->mp2mr[i].lkey = mr->lkey;
> +	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIU32,
> +	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);

PRIU32 -> PRIu32

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v4 3/8] net/mlx4: fix ring wraparound compiler hint
  2017-10-31 18:21       ` [PATCH v4 3/8] net/mlx4: fix ring wraparound compiler hint Matan Azrad
@ 2017-11-02 13:42         ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-11-02 13:42 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Tue, Oct 31, 2017 at 06:21:28PM +0000, Matan Azrad wrote:
> Remove the unlikely hint from the WQ wraparound check because
> wraparound is an expected case.
> 
> Fixes: c3c977bbecbd ("net/mlx4: add Tx bypassing Verbs")
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v4 4/8] net/mlx4: merge Tx path functions
  2017-10-31 18:21       ` [PATCH v4 4/8] net/mlx4: merge Tx path functions Matan Azrad
@ 2017-11-02 13:42         ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-11-02 13:42 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Tue, Oct 31, 2017 at 06:21:29PM +0000, Matan Azrad wrote:
> Merge the tx_burst and mlx4_post_send functions to avoid checking the
> remaining WQ space twice.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>

Ideally this patch and subsequent ones should have come after:

 "net/mlx4: fix HW memory optimizations careless"

The volatile keyword fix should have been added before Tx path
optimizations. Since there's not much time left for that:

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v4 5/8] net/mlx4: remove duplicate handling in Tx burst
  2017-10-31 18:21       ` [PATCH v4 5/8] net/mlx4: remove duplicate handling in Tx burst Matan Azrad
@ 2017-11-02 13:42         ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-11-02 13:42 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Tue, Oct 31, 2017 at 06:21:30PM +0000, Matan Azrad wrote:
> Remove usage of a variable that counts packets for completion, as it
> adds no information beyond the packet counter.
>
> Remove the check for no space in the elements ring, which is already
> covered by the regular Tx flow.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v4 6/8] net/mlx4: separate Tx segment cases
  2017-10-31 18:21       ` [PATCH v4 6/8] net/mlx4: separate Tx segment cases Matan Azrad
@ 2017-11-02 13:43         ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-11-02 13:43 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Tue, Oct 31, 2017 at 06:21:31PM +0000, Matan Azrad wrote:
> Optimize the single-segment case by processing it in a separate block,
> which avoids checks, calculations and barriers relevant only to the
> multi-segment case.
>
> Call a dedicated function to handle the multi-segment case.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>

One remaining comment below, besides that:

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 236 +++++++++++++++++++++++++++----------------
>  1 file changed, 151 insertions(+), 85 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
<snip>
> +	/* Write the first DWORD of each TXBB save earlier. */
> +	if (pv_counter) {
> +		/* Need a barrier here before writing the byte_count. */
> +		rte_io_wmb();
> +		for (--pv_counter; pv_counter  >= 0; pv_counter--)
> +			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
> +	}
> +	/* Fill the control parameters for this packet. */
> +	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> +	return nr_txbbs;
> +}

Missing empty line here.

>  /**
>   * DPDK callback for Tx.
>   *
> @@ -261,10 +374,11 @@ struct pv {
>  	unsigned int i;
>  	unsigned int max;
>  	struct mlx4_sq *sq = &txq->msq;
> -	struct pv *pv = (struct pv *)txq->bounce_buf;
> +	int nr_txbbs;
<snip>

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v4 7/8] net/mlx4: fix HW memory optimizations careless
  2017-10-31 18:21       ` [PATCH v4 7/8] net/mlx4: fix HW memory optimizations careless Matan Azrad
@ 2017-11-02 13:43         ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-11-02 13:43 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Tue, Oct 31, 2017 at 06:21:32PM +0000, Matan Azrad wrote:
> Make all Rx/Tx memories used for HW negotiation volatile to ensure that
> no compiler optimization suppresses or reorders load and store commands.
> 
> Fixes: c3c977bbecbd ("net/mlx4: add Tx bypassing Verbs")
> Fixes: 9f57340a8087 ("net/mlx4: restore Rx offloads")
> Fixes: 6681b845034c ("net/mlx4: add Rx bypassing Verbs")
> Fixes: 62e96ffb93ad ("net/mlx4: fix no Rx interrupts")
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>

Since this should fix all remaining concerns:

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

A few minor comments below.

> ---
>  drivers/net/mlx4/mlx4_prm.h  | 18 ++++++------
>  drivers/net/mlx4/mlx4_rxtx.c | 67 ++++++++++++++++++++++++--------------------
>  2 files changed, 45 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
> index b0fd982..6d10b4e 100644
> --- a/drivers/net/mlx4/mlx4_prm.h
> +++ b/drivers/net/mlx4/mlx4_prm.h
> @@ -80,14 +80,14 @@ enum {
>  
>  /* Send queue information. */
>  struct mlx4_sq {
> -	uint8_t *buf; /**< SQ buffer. */
> -	uint8_t *eob; /**< End of SQ buffer */
> +	volatile uint8_t *buf; /**< SQ buffer. */
> +	volatile uint8_t *eob; /**< End of SQ buffer */
>  	uint32_t head; /**< SQ head counter in units of TXBBS. */
>  	uint32_t tail; /**< SQ tail counter in units of TXBBS. */
>  	uint32_t txbb_cnt; /**< Num of WQEBB in the Q (should be ^2). */
>  	uint32_t txbb_cnt_mask; /**< txbbs_cnt mask (txbb_cnt is ^2). */
>  	uint32_t headroom_txbbs; /**< Num of txbbs that should be kept free. */
> -	uint32_t *db; /**< Pointer to the doorbell. */
> +	volatile uint32_t *db; /**< Pointer to the doorbell. */
>  	uint32_t doorbell_qpn; /**< qp number to write to the doorbell. */
>  };
>  
> @@ -101,10 +101,10 @@ struct mlx4_sq {
>  /* Completion queue information. */
>  struct mlx4_cq {
>  	void *cq_uar; /**< CQ user access region. */

I'm curious why UAR isn't volatile as well?

> -	void *cq_db_reg; /**< CQ doorbell register. */
> -	uint32_t *set_ci_db; /**< Pointer to the completion queue doorbell. */
> -	uint32_t *arm_db; /**< Pointer to doorbell for arming Rx events. */
> -	uint8_t *buf; /**< Pointer to the completion queue buffer. */
> +	volatile void *cq_db_reg; /**< CQ doorbell register. */
> +	volatile uint32_t *set_ci_db; /**< Pointer to the CQ doorbell. */
> +	volatile uint32_t *arm_db; /**< Arming Rx events doorbell. */
> +	volatile uint8_t *buf; /**< Pointer to the completion queue buffer. */
>  	uint32_t cqe_cnt; /**< Number of entries in the queue. */
>  	uint32_t cqe_64:1; /**< CQ entry size is 64 bytes. */
>  	uint32_t cons_index; /**< Last queue entry that was handled. */
> @@ -128,10 +128,10 @@ struct mlx4_cq {
>   * @return
>   *   Pointer to CQE entry.
>   */
> -static inline struct mlx4_cqe *
> +static inline volatile struct mlx4_cqe *
>  mlx4_get_cqe(struct mlx4_cq *cq, uint32_t index)
>  {
> -	return (struct mlx4_cqe *)(cq->buf +
> +	return (volatile struct mlx4_cqe *)(cq->buf +
>  				   ((index & (cq->cqe_cnt - 1)) <<
>  				    (5 + cq->cqe_64)) +
>  				   (cq->cqe_64 << 5));
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 176000f..bd6d888 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -70,7 +70,7 @@
>   * DWORD (32 byte) of a TXBB.
>   */
>  struct pv {
> -	struct mlx4_wqe_data_seg *dseg;
> +	volatile struct mlx4_wqe_data_seg *dseg;
>  	uint32_t val;
>  };
>  
> @@ -98,14 +98,15 @@ struct pv {
>  {
>  	uint32_t stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
>  					  (!!owner << MLX4_SQ_STAMP_SHIFT));
> -	uint8_t *wqe = mlx4_get_send_wqe(sq, (index & sq->txbb_cnt_mask));
> -	uint32_t *ptr = (uint32_t *)wqe;
> +	volatile uint8_t *wqe = mlx4_get_send_wqe(sq,
> +						(index & sq->txbb_cnt_mask));
> +	volatile uint32_t *ptr = (volatile uint32_t *)wqe;
>  	int i;
>  	int txbbs_size;
>  	int num_txbbs;
>  
>  	/* Extract the size from the control segment of the WQE. */
> -	num_txbbs = MLX4_SIZE_TO_TXBBS((((struct mlx4_wqe_ctrl_seg *)
> +	num_txbbs = MLX4_SIZE_TO_TXBBS((((volatile struct mlx4_wqe_ctrl_seg *)
>  					 wqe)->fence_size & 0x3f) << 4);
>  	txbbs_size = num_txbbs * MLX4_TXBB_SIZE;
>  	/* Optimize the common case when there is no wrap-around. */
> @@ -120,8 +121,8 @@ struct pv {
>  		for (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) {
>  			*ptr = stamp;
>  			ptr += MLX4_SQ_STAMP_DWORDS;
> -			if ((uint8_t *)ptr >= sq->eob) {
> -				ptr = (uint32_t *)sq->buf;
> +			if ((volatile uint8_t *)ptr >= sq->eob) {
> +				ptr = (volatile uint32_t *)sq->buf;
>  				stamp ^= RTE_BE32(0x80000000);
>  			}
>  		}
> @@ -150,7 +151,7 @@ struct pv {
>  	unsigned int elts_comp = txq->elts_comp;
>  	unsigned int elts_tail = txq->elts_tail;
>  	struct mlx4_cq *cq = &txq->mcq;
> -	struct mlx4_cqe *cqe;
> +	volatile struct mlx4_cqe *cqe;
>  	uint32_t cons_index = cq->cons_index;
>  	uint16_t new_index;
>  	uint16_t nr_txbbs = 0;
> @@ -161,7 +162,7 @@ struct pv {
>  	 * reported by them.
>  	 */
>  	do {
> -		cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
> +		cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
>  		if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
>  		    !!(cons_index & cq->cqe_cnt)))
>  			break;
> @@ -172,8 +173,8 @@ struct pv {
>  #ifndef NDEBUG
>  		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
>  			     MLX4_CQE_OPCODE_ERROR)) {
> -			struct mlx4_err_cqe *cqe_err =
> -				(struct mlx4_err_cqe *)cqe;
> +			volatile struct mlx4_err_cqe *cqe_err =
> +				(volatile struct mlx4_err_cqe *)cqe;
>  			ERROR("%p CQE error - vendor syndrome: 0x%x"
>  			      " syndrome: 0x%x\n",
>  			      (void *)txq, cqe_err->vendor_err,
> @@ -240,15 +241,15 @@ struct pv {
>  
>  static int
>  mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
> -			       struct mlx4_wqe_ctrl_seg **pctrl)
> +				   volatile struct mlx4_wqe_ctrl_seg **pctrl)

Looks like an indentation issue here.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v4 8/8] net/mlx4: mitigate Tx path memory barriers
  2017-10-31 18:21       ` [PATCH v4 8/8] net/mlx4: mitigate Tx path memory barriers Matan Azrad
@ 2017-11-02 13:43         ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-11-02 13:43 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Ophir Munk

On Tue, Oct 31, 2017 at 06:21:33PM +0000, Matan Azrad wrote:
> Replace most of the memory barriers with IO memory barriers since they
> all target DRAM; this improves code efficiency on systems that force
> store ordering between different addresses.
>
> Only the doorbell register store should be protected by a full memory
> barrier since it targets the PCI memory domain.
>
> Limit the IO memory barrier before the byte count store to systems
> with a cache line size smaller than 64B (the TXBB size).
>
> This patch improves Tx performance by 0.2 Mpps for single-segment 64B
> packets over 1 queue with 1 core.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* [PATCH v5 0/8] net/mlx4: Tx path improvements
  2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
                       ` (7 preceding siblings ...)
  2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
@ 2017-11-02 16:42     ` Matan Azrad
  2017-11-02 16:42       ` [PATCH v5 1/8] net/mlx4: remove error flows from Tx fast path Matan Azrad
                         ` (10 more replies)
  8 siblings, 11 replies; 84+ messages in thread
From: Matan Azrad @ 2017-11-02 16:42 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

v1:
This series is a follow-up of the new datapath implementation introduced in RC1.
It is dedicated to Tx and improves the performance.

v2:
Change uint32_t be_lkey to rte_be32_t lkey

v3:
Rebase to 17.11-rc2.
Remove "net/mlx4: save lkey in big-endian format" patch which degrade performance.
Merge separate segment cases patches "net/mlx4: improve performance of one Tx segment" and "net/mlx4: separate Tx for multi-segments".
"net/mlx4: inline more Tx functions" renamed to "net/mlx4: associate MR to MP in a short function".
Leave bytes counter as old version.
Add memory barrier improvement patch.
Remove empty segment support.

v4:
Remove "net/mlx4: remove empty Tx segment support" patch.
Add "fix ring wraparound compiler hint" patch.
Add "fix HW memory optimizations careless" patch.
Remove unnecessary IO memory barrier in completion function.
Remove inline declaration of completion function.
Fix unnecessary empty lines.
Fix indentations.
Move the max=0 check removal to patch "remove duplicate handling in Tx burst" and rename the patch.
Add performance improvement in patch "mitigate Tx path memory barriers".

v5:
Fix compilation issue in debug mode for patch "net/mlx4: associate MR to MP in a short function".
Add missing includes in "net/mlx4: associate MR to MP in a short function".
Add cq_uar as volatile too in "net/mlx4: fix HW memory optimizations careless".
Fix indentation issues in "net/mlx4: separate Tx segment cases".


Matan Azrad (7):
  net/mlx4: remove error flows from Tx fast path
  net/mlx4: fix ring wraparound compiler hint
  net/mlx4: merge Tx path functions
  net/mlx4: remove duplicate handling in Tx burst
  net/mlx4: separate Tx segment cases
  net/mlx4: fix HW memory optimizations careless
  net/mlx4: mitigate Tx path memory barriers

Ophir Munk (1):
  net/mlx4: associate MR to MP in a short function

 drivers/net/mlx4/mlx4.h      |   2 +
 drivers/net/mlx4/mlx4_mr.c   |  50 ++++++
 drivers/net/mlx4/mlx4_prm.h  |  20 +--
 drivers/net/mlx4/mlx4_rxtx.c | 397 +++++++++++++++++++++----------------------
 drivers/net/mlx4/mlx4_rxtx.h |  31 +++-
 5 files changed, 285 insertions(+), 215 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 84+ messages in thread

* [PATCH v5 1/8] net/mlx4: remove error flows from Tx fast path
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
@ 2017-11-02 16:42       ` Matan Azrad
  2017-11-02 16:42       ` [PATCH v5 2/8] net/mlx4: associate MR to MP in a short function Matan Azrad
                         ` (9 subsequent siblings)
  10 siblings, 0 replies; 84+ messages in thread
From: Matan Azrad @ 2017-11-02 16:42 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Move unnecessary error flows to DEBUG mode.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 67dc712..79c7fa2 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -169,6 +169,7 @@ struct pv {
 		 * Make sure we read the CQE after we read the ownership bit.
 		 */
 		rte_rmb();
+#ifndef NDEBUG
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 			     MLX4_CQE_OPCODE_ERROR)) {
 			struct mlx4_err_cqe *cqe_err =
@@ -178,6 +179,7 @@ struct pv {
 			      (void *)txq, cqe_err->vendor_err,
 			      cqe_err->syndrome);
 		}
+#endif /* NDEBUG */
 		/* Get WQE index reported in the CQE. */
 		new_index =
 			rte_be_to_cpu_16(cqe->wqe_index) & sq->txbb_cnt_mask;
@@ -302,7 +304,7 @@ struct pv {
  *   Packet to transmit.
  *
  * @return
- *   0 on success, negative errno value otherwise and rte_errno is set.
+ *   0 on success, negative errno value otherwise.
  */
 static inline int
 mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
@@ -322,7 +324,6 @@ struct pv {
 	uint32_t byte_count;
 	int wqe_real_size;
 	int nr_txbbs;
-	int rc;
 	struct pv *pv = (struct pv *)txq->bounce_buf;
 	int pv_counter = 0;
 
@@ -337,8 +338,7 @@ struct pv {
 	if (((sq->head - sq->tail) + nr_txbbs +
 	     sq->headroom_txbbs) >= sq->txbb_cnt ||
 	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-		rc = ENOSPC;
-		goto err;
+		return -ENOSPC;
 	}
 	/* Get the control and data entries of the WQE. */
 	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
@@ -354,6 +354,7 @@ struct pv {
 		dseg->addr = rte_cpu_to_be_64(addr);
 		/* Memory region key for this memory pool. */
 		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+#ifndef NDEBUG
 		if (unlikely(lkey == (uint32_t)-1)) {
 			/* MR does not exist. */
 			DEBUG("%p: unable to get MP <-> MR association",
@@ -366,9 +367,9 @@ struct pv {
 			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
 			mlx4_txq_stamp_freed_wqe(sq, head_idx,
 				     (sq->head & sq->txbb_cnt) ? 0 : 1);
-			rc = EFAULT;
-			goto err;
+			return -EFAULT;
 		}
+#endif /* NDEBUG */
 		dseg->lkey = rte_cpu_to_be_32(lkey);
 		if (likely(buf->data_len)) {
 			byte_count = rte_cpu_to_be_32(buf->data_len);
@@ -471,9 +472,6 @@ struct pv {
 					       MLX4_BIT_WQE_OWN : 0));
 	sq->head += nr_txbbs;
 	return 0;
-err:
-	rte_errno = rc;
-	return -rc;
 }
 
 /**
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v5 2/8] net/mlx4: associate MR to MP in a short function
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
  2017-11-02 16:42       ` [PATCH v5 1/8] net/mlx4: remove error flows from Tx fast path Matan Azrad
@ 2017-11-02 16:42       ` Matan Azrad
  2017-11-02 16:42       ` [PATCH v5 3/8] net/mlx4: fix ring wraparound compiler hint Matan Azrad
                         ` (8 subsequent siblings)
  10 siblings, 0 replies; 84+ messages in thread
From: Matan Azrad @ 2017-11-02 16:42 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

From: Ophir Munk <ophirmu@mellanox.com>

Associate a memory region to a mempool (on the data path) in a short function.
Handle the less common case of adding a new memory region to a mempool
in a separate function.
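
As an illustration of this split, here is a minimal, hypothetical sketch of
the pattern (the entry type and the slow-path helper below are made up, not
the driver's actual API): the lookup loop is small enough to stay inline on
the data path, while the rare registration work lives out of line.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical table entry and out-of-line registration helper. */
struct mr_entry { const void *mp; uint32_t lkey; };
uint32_t register_new_mr(struct mr_entry *tab, unsigned int i, const void *mp);

/* Data-path lookup: small enough to inline at every call site. */
static inline uint32_t
lookup_lkey(struct mr_entry *tab, unsigned int n, const void *mp)
{
	unsigned int i;

	for (i = 0; i != n; ++i) {
		if (tab[i].mp == NULL)
			break; /* unknown mempool, fall back to the slow path */
		if (tab[i].mp == mp)
			return tab[i].lkey; /* hit: the common case */
	}
	/* Cold path kept out of line so the hot loop stays tiny. */
	return register_new_mr(tab, i, mp);
}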

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4.h      |  2 ++
 drivers/net/mlx4/mlx4_mr.c   | 50 ++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_rxtx.c | 58 --------------------------------------------
 drivers/net/mlx4/mlx4_rxtx.h | 31 ++++++++++++++++++++++-
 4 files changed, 82 insertions(+), 59 deletions(-)

diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index e0a9853..70cf453 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -160,5 +160,7 @@ int mlx4_flow_ctrl_set(struct rte_eth_dev *dev,
 /* mlx4_mr.c */
 
 struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
+			 uint32_t i);
 
 #endif /* RTE_PMD_MLX4_H_ */
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index 9700884..8105cc5 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -36,7 +36,9 @@
  * Memory management functions for mlx4 driver.
  */
 
+#include <assert.h>
 #include <errno.h>
+#include <inttypes.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
@@ -50,11 +52,13 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
+#include <rte_branch_prediction.h>
 #include <rte_common.h>
 #include <rte_errno.h>
 #include <rte_memory.h>
 #include <rte_mempool.h>
 
+#include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
 struct mlx4_check_mempool_data {
@@ -181,3 +185,49 @@ struct ibv_mr *
 		rte_errno = errno ? errno : EINVAL;
 	return mr;
 }
+
+/**
+ * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
+ * If mp2mr[] is full, remove an entry first.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be added.
+ * @param[in] i
+ *   Index in memory pool (MP) where to add memory region (MR).
+ *
+ * @return
+ *   Added mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
+{
+	struct ibv_mr *mr;
+
+	/* Add a new entry, register MR first. */
+	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+	      (void *)txq, mp->name, (void *)mp);
+	mr = mlx4_mp2mr(txq->priv->pd, mp);
+	if (unlikely(mr == NULL)) {
+		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+		      (void *)txq);
+		return (uint32_t)-1;
+	}
+	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
+		/* Table is full, remove oldest entry. */
+		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+		      (void *)txq);
+		--i;
+		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+	}
+	/* Store the new entry. */
+	txq->mp2mr[i].mp = mp;
+	txq->mp2mr[i].mr = mr;
+	txq->mp2mr[i].lkey = mr->lkey;
+	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
+	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
+	return txq->mp2mr[i].lkey;
+}
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 79c7fa2..f1c8dae 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -37,7 +37,6 @@
  */
 
 #include <assert.h>
-#include <inttypes.h>
 #include <stdint.h>
 #include <string.h>
 
@@ -239,63 +238,6 @@ struct pv {
 }
 
 /**
- * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
- *
- * @param txq
- *   Pointer to Tx queue structure.
- * @param[in] mp
- *   Memory pool for which a memory region lkey must be returned.
- *
- * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
- */
-uint32_t
-mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
-{
-	unsigned int i;
-	struct ibv_mr *mr;
-
-	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mp == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-			return txq->mp2mr[i].lkey;
-		}
-	}
-	/* Add a new entry, register MR first. */
-	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
-	      (void *)txq, mp->name, (void *)mp);
-	mr = mlx4_mp2mr(txq->priv->pd, mp);
-	if (unlikely(mr == NULL)) {
-		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
-		      (void *)txq);
-		return (uint32_t)-1;
-	}
-	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
-		/* Table is full, remove oldest entry. */
-		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
-		      (void *)txq);
-		--i;
-		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
-		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
-			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
-	}
-	/* Store the new entry. */
-	txq->mp2mr[i].mp = mp;
-	txq->mp2mr[i].mr = mr;
-	txq->mp2mr[i].lkey = mr->lkey;
-	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
-	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
-	return txq->mp2mr[i].lkey;
-}
-
-/**
  * Posts a single work request to a send queue.
  *
  * @param txq
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 7d67748..185dbdf 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -165,7 +165,6 @@ int mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 
 /* mlx4_rxtx.c */
 
-uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
 uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
 		       uint16_t pkts_n);
 uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
@@ -182,4 +181,34 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 			const struct rte_eth_txconf *conf);
 void mlx4_tx_queue_release(void *dpdk_txq);
 
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Call mlx4_txq_add_mr() if MP is not registered yet.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ *   mr->lkey on success, (uint32_t)-1 on failure.
+ */
+static inline uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+	unsigned int i;
+
+	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+		if (unlikely(txq->mp2mr[i].mp == NULL)) {
+			/* Unknown MP, add a new MR for it. */
+			break;
+		}
+		if (txq->mp2mr[i].mp == mp) {
+			/* MP found MP. */
+			return txq->mp2mr[i].lkey;
+		}
+	}
+	return mlx4_txq_add_mr(txq, mp, i);
+}
+
 #endif /* MLX4_RXTX_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v5 3/8] net/mlx4: fix ring wraparound compiler hint
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
  2017-11-02 16:42       ` [PATCH v5 1/8] net/mlx4: remove error flows from Tx fast path Matan Azrad
  2017-11-02 16:42       ` [PATCH v5 2/8] net/mlx4: associate MR to MP in a short function Matan Azrad
@ 2017-11-02 16:42       ` Matan Azrad
  2017-11-02 16:42       ` [PATCH v5 4/8] net/mlx4: merge Tx path functions Matan Azrad
                         ` (7 subsequent siblings)
  10 siblings, 0 replies; 84+ messages in thread
From: Matan Azrad @ 2017-11-02 16:42 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Remove the unlikely hint from the WQ wraparound check because it is an
expected case.

Fixes: c3c977bbecbd ("net/mlx4: add Tx bypassing Verbs")

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index f1c8dae..3169fe5 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -291,7 +291,7 @@ struct pv {
 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
 		rte_prefetch0((volatile void *)addr);
 		/* Handle WQE wraparound. */
-		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
+		if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
 			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
 		dseg->addr = rte_cpu_to_be_64(addr);
 		/* Memory region key for this memory pool. */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v5 4/8] net/mlx4: merge Tx path functions
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (2 preceding siblings ...)
  2017-11-02 16:42       ` [PATCH v5 3/8] net/mlx4: fix ring wraparound compiler hint Matan Azrad
@ 2017-11-02 16:42       ` Matan Azrad
  2017-11-02 16:42       ` [PATCH v5 5/8] net/mlx4: remove duplicate handling in Tx burst Matan Azrad
                         ` (6 subsequent siblings)
  10 siblings, 0 replies; 84+ messages in thread
From: Matan Azrad @ 2017-11-02 16:42 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Merge the tx_burst and mlx4_post_send functions to avoid checking the
remaining WQ space twice.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 355 +++++++++++++++++++++----------------------
 1 file changed, 170 insertions(+), 185 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 3169fe5..e0afbea 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -238,185 +238,6 @@ struct pv {
 }
 
 /**
- * Posts a single work request to a send queue.
- *
- * @param txq
- *   Target Tx queue.
- * @param pkt
- *   Packet to transmit.
- *
- * @return
- *   0 on success, negative errno value otherwise.
- */
-static inline int
-mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
-{
-	struct mlx4_wqe_ctrl_seg *ctrl;
-	struct mlx4_wqe_data_seg *dseg;
-	struct mlx4_sq *sq = &txq->msq;
-	struct rte_mbuf *buf;
-	union {
-		uint32_t flags;
-		uint16_t flags16[2];
-	} srcrb;
-	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
-	uint32_t lkey;
-	uintptr_t addr;
-	uint32_t owner_opcode = MLX4_OPCODE_SEND;
-	uint32_t byte_count;
-	int wqe_real_size;
-	int nr_txbbs;
-	struct pv *pv = (struct pv *)txq->bounce_buf;
-	int pv_counter = 0;
-
-	/* Calculate the needed work queue entry size for this packet. */
-	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-			pkt->nb_segs * sizeof(struct mlx4_wqe_data_seg);
-	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-	/*
-	 * Check that there is room for this WQE in the send queue and that
-	 * the WQE size is legal.
-	 */
-	if (((sq->head - sq->tail) + nr_txbbs +
-	     sq->headroom_txbbs) >= sq->txbb_cnt ||
-	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-		return -ENOSPC;
-	}
-	/* Get the control and data entries of the WQE. */
-	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
-	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-					    sizeof(struct mlx4_wqe_ctrl_seg));
-	/* Fill the data segments with buffer information. */
-	for (buf = pkt; buf != NULL; buf = buf->next, dseg++) {
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		rte_prefetch0((volatile void *)addr);
-		/* Handle WQE wraparound. */
-		if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
-			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
-		dseg->addr = rte_cpu_to_be_64(addr);
-		/* Memory region key for this memory pool. */
-		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
-#ifndef NDEBUG
-		if (unlikely(lkey == (uint32_t)-1)) {
-			/* MR does not exist. */
-			DEBUG("%p: unable to get MP <-> MR association",
-			      (void *)txq);
-			/*
-			 * Restamp entry in case of failure.
-			 * Make sure that size is written correctly
-			 * Note that we give ownership to the SW, not the HW.
-			 */
-			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-			mlx4_txq_stamp_freed_wqe(sq, head_idx,
-				     (sq->head & sq->txbb_cnt) ? 0 : 1);
-			return -EFAULT;
-		}
-#endif /* NDEBUG */
-		dseg->lkey = rte_cpu_to_be_32(lkey);
-		if (likely(buf->data_len)) {
-			byte_count = rte_cpu_to_be_32(buf->data_len);
-		} else {
-			/*
-			 * Zero length segment is treated as inline segment
-			 * with zero data.
-			 */
-			byte_count = RTE_BE32(0x80000000);
-		}
-		/*
-		 * If the data segment is not at the beginning of a
-		 * Tx basic block (TXBB) then write the byte count,
-		 * else postpone the writing to just before updating the
-		 * control segment.
-		 */
-		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-			/*
-			 * Need a barrier here before writing the byte_count
-			 * fields to make sure that all the data is visible
-			 * before the byte_count field is set.
-			 * Otherwise, if the segment begins a new cacheline,
-			 * the HCA prefetcher could grab the 64-byte chunk and
-			 * get a valid (!= 0xffffffff) byte count but stale
-			 * data, and end up sending the wrong data.
-			 */
-			rte_io_wmb();
-			dseg->byte_count = byte_count;
-		} else {
-			/*
-			 * This data segment starts at the beginning of a new
-			 * TXBB, so we need to postpone its byte_count writing
-			 * for later.
-			 */
-			pv[pv_counter].dseg = dseg;
-			pv[pv_counter++].val = byte_count;
-		}
-	}
-	/* Write the first DWORD of each TXBB save earlier. */
-	if (pv_counter) {
-		/* Need a barrier here before writing the byte_count. */
-		rte_io_wmb();
-		for (--pv_counter; pv_counter  >= 0; pv_counter--)
-			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
-	}
-	/* Fill the control parameters for this packet. */
-	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-	/*
-	 * For raw Ethernet, the SOLICIT flag is used to indicate that no ICRC
-	 * should be calculated.
-	 */
-	txq->elts_comp_cd -= nr_txbbs;
-	if (unlikely(txq->elts_comp_cd <= 0)) {
-		txq->elts_comp_cd = txq->elts_comp_cd_init;
-		srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
-				       MLX4_WQE_CTRL_CQ_UPDATE);
-	} else {
-		srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
-	}
-	/* Enable HW checksum offload if requested */
-	if (txq->csum &&
-	    (pkt->ol_flags &
-	     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
-		const uint64_t is_tunneled = (pkt->ol_flags &
-					      (PKT_TX_TUNNEL_GRE |
-					       PKT_TX_TUNNEL_VXLAN));
-
-		if (is_tunneled && txq->csum_l2tun) {
-			owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
-					MLX4_WQE_CTRL_IL4_HDR_CSUM;
-			if (pkt->ol_flags & PKT_TX_OUTER_IP_CKSUM)
-				srcrb.flags |=
-					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
-		} else {
-			srcrb.flags |= RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
-						MLX4_WQE_CTRL_TCP_UDP_CSUM);
-		}
-	}
-	if (txq->lb) {
-		/*
-		 * Copy destination MAC address to the WQE, this allows
-		 * loopback in eSwitch, so that VFs and PF can communicate
-		 * with each other.
-		 */
-		srcrb.flags16[0] = *(rte_pktmbuf_mtod(pkt, uint16_t *));
-		ctrl->imm = *(rte_pktmbuf_mtod_offset(pkt, uint32_t *,
-						      sizeof(uint16_t)));
-	} else {
-		ctrl->imm = 0;
-	}
-	ctrl->srcrb_flags = srcrb.flags;
-	/*
-	 * Make sure descriptor is fully written before
-	 * setting ownership bit (because HW can start
-	 * executing as soon as we do).
-	 */
-	rte_wmb();
-	ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
-					      ((sq->head & sq->txbb_cnt) ?
-					       MLX4_BIT_WQE_OWN : 0));
-	sq->head += nr_txbbs;
-	return 0;
-}
-
-/**
  * DPDK callback for Tx.
  *
  * @param dpdk_txq
@@ -439,7 +260,8 @@ struct pv {
 	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
-	int err;
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
 
 	assert(txq->elts_comp_cd != 0);
 	mlx4_txq_complete(txq);
@@ -460,6 +282,21 @@ struct pv {
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
+		uint32_t owner_opcode = MLX4_OPCODE_SEND;
+		struct mlx4_wqe_ctrl_seg *ctrl;
+		struct mlx4_wqe_data_seg *dseg;
+		struct rte_mbuf *sbuf;
+		union {
+			uint32_t flags;
+			uint16_t flags16[2];
+		} srcrb;
+		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+		uint32_t lkey;
+		uintptr_t addr;
+		uint32_t byte_count;
+		int wqe_real_size;
+		int nr_txbbs;
+		int pv_counter = 0;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -478,18 +315,166 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		/* Post the packet for sending. */
-		err = mlx4_post_send(txq, buf);
-		if (unlikely(err)) {
+		/*
+		 * Calculate the needed work queue entry size
+		 * for this packet.
+		 */
+		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+		/*
+		 * Check that there is room for this WQE in the send
+		 * queue and that the WQE size is legal.
+		 */
+		if (((sq->head - sq->tail) + nr_txbbs +
+		     sq->headroom_txbbs) >= sq->txbb_cnt ||
+		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
 			elt->buf = NULL;
-			goto stop;
+			break;
 		}
+		/* Get the control and data entries of the WQE. */
+		ctrl = (struct mlx4_wqe_ctrl_seg *)
+				mlx4_get_send_wqe(sq, head_idx);
+		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+				sizeof(struct mlx4_wqe_ctrl_seg));
+		/* Fill the data segments with buffer information. */
+		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			rte_prefetch0((volatile void *)addr);
+			/* Handle WQE wraparound. */
+			if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
+				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+			dseg->addr = rte_cpu_to_be_64(addr);
+			/* Memory region key (big endian). */
+			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+			dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+			if (unlikely(dseg->lkey ==
+				rte_cpu_to_be_32((uint32_t)-1))) {
+				/* MR does not exist. */
+				DEBUG("%p: unable to get MP <-> MR association",
+				      (void *)txq);
+				/*
+				 * Restamp entry in case of failure.
+				 * Make sure that size is written correctly
+				 * Note that we give ownership to the SW,
+				 * not the HW.
+				 */
+				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+				mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					     (sq->head & sq->txbb_cnt) ? 0 : 1);
+				elt->buf = NULL;
+				break;
+			}
+#endif /* NDEBUG */
+			if (likely(sbuf->data_len)) {
+				byte_count = rte_cpu_to_be_32(sbuf->data_len);
+			} else {
+				/*
+				 * Zero length segment is treated as inline
+				 * segment with zero data.
+				 */
+				byte_count = RTE_BE32(0x80000000);
+			}
+			/*
+			 * If the data segment is not at the beginning
+			 * of a Tx basic block (TXBB) then write the
+			 * byte count, else postpone the writing to
+			 * just before updating the control segment.
+			 */
+			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+				/*
+				 * Need a barrier here before writing the
+				 * byte_count fields to make sure that all the
+				 * data is visible before the byte_count field
+				 * is set. otherwise, if the segment begins a
+				 * new cacheline, the HCA prefetcher could grab
+				 * the 64-byte chunk and get a valid
+				 * (!= 0xffffffff) byte count but stale data,
+				 * and end up sending the wrong data.
+				 */
+				rte_io_wmb();
+				dseg->byte_count = byte_count;
+			} else {
+				/*
+				 * This data segment starts at the beginning of
+				 * a new TXBB, so we need to postpone its
+				 * byte_count writing for later.
+				 */
+				pv[pv_counter].dseg = dseg;
+				pv[pv_counter++].val = byte_count;
+			}
+		}
+		/* Write the first DWORD of each TXBB save earlier. */
+		if (pv_counter) {
+			/* Need a barrier before writing the byte_count. */
+			rte_io_wmb();
+			for (--pv_counter; pv_counter  >= 0; pv_counter--)
+				pv[pv_counter].dseg->byte_count =
+						pv[pv_counter].val;
+		}
+		/* Fill the control parameters for this packet. */
+		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+		/*
+		 * For raw Ethernet, the SOLICIT flag is used to indicate
+		 * that no ICRC should be calculated.
+		 */
+		txq->elts_comp_cd -= nr_txbbs;
+		if (unlikely(txq->elts_comp_cd <= 0)) {
+			txq->elts_comp_cd = txq->elts_comp_cd_init;
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
+					       MLX4_WQE_CTRL_CQ_UPDATE);
+		} else {
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
+		}
+		/* Enable HW checksum offload if requested */
+		if (txq->csum &&
+		    (buf->ol_flags &
+		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
+			const uint64_t is_tunneled = (buf->ol_flags &
+						      (PKT_TX_TUNNEL_GRE |
+						       PKT_TX_TUNNEL_VXLAN));
+
+			if (is_tunneled && txq->csum_l2tun) {
+				owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
+						MLX4_WQE_CTRL_IL4_HDR_CSUM;
+				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+					srcrb.flags |=
+					    RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
+			} else {
+				srcrb.flags |=
+					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+						MLX4_WQE_CTRL_TCP_UDP_CSUM);
+			}
+		}
+		if (txq->lb) {
+			/*
+			 * Copy destination MAC address to the WQE, this allows
+			 * loopback in eSwitch, so that VFs and PF can
+			 * communicate with each other.
+			 */
+			srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
+			ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
+					      sizeof(uint16_t)));
+		} else {
+			ctrl->imm = 0;
+		}
+		ctrl->srcrb_flags = srcrb.flags;
+		/*
+		 * Make sure descriptor is fully written before
+		 * setting ownership bit (because HW can start
+		 * executing as soon as we do).
+		 */
+		rte_wmb();
+		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
+					      ((sq->head & sq->txbb_cnt) ?
+						       MLX4_BIT_WQE_OWN : 0));
+		sq->head += nr_txbbs;
 		elt->buf = buf;
 		bytes_sent += buf->pkt_len;
 		++elts_comp;
 		elts_head = elts_head_next;
 	}
-stop:
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
 		return 0;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v5 5/8] net/mlx4: remove duplicate handling in Tx burst
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (3 preceding siblings ...)
  2017-11-02 16:42       ` [PATCH v5 4/8] net/mlx4: merge Tx path functions Matan Azrad
@ 2017-11-02 16:42       ` Matan Azrad
  2017-11-02 16:42       ` [PATCH v5 6/8] net/mlx4: separate Tx segment cases Matan Azrad
                         ` (5 subsequent siblings)
  10 siblings, 0 replies; 84+ messages in thread
From: Matan Azrad @ 2017-11-02 16:42 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Remove usage of a variable which counts the packets for completion and
does not add more information than the packets counter.

Remove the check for no space in the elements ring, which is already
covered by the regular Tx flow.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index e0afbea..abd125f 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -256,7 +256,6 @@ struct pv {
 	struct txq *txq = (struct txq *)dpdk_txq;
 	unsigned int elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int elts_comp = 0;
 	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
@@ -272,8 +271,6 @@ struct pv {
 	assert(max <= elts_n);
 	/* Always leave one free entry in the ring. */
 	--max;
-	if (max == 0)
-		return 0;
 	if (max > pkts_n)
 		max = pkts_n;
 	for (i = 0; (i != max); ++i) {
@@ -472,7 +469,6 @@ struct pv {
 		sq->head += nr_txbbs;
 		elt->buf = buf;
 		bytes_sent += buf->pkt_len;
-		++elts_comp;
 		elts_head = elts_head_next;
 	}
 	/* Take a shortcut if nothing must be sent. */
@@ -486,7 +482,7 @@ struct pv {
 	/* Ring QP doorbell. */
 	rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
 	txq->elts_head = elts_head;
-	txq->elts_comp += elts_comp;
+	txq->elts_comp += i;
 	return i;
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v5 6/8] net/mlx4: separate Tx segment cases
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (4 preceding siblings ...)
  2017-11-02 16:42       ` [PATCH v5 5/8] net/mlx4: remove duplicate handling in Tx burst Matan Azrad
@ 2017-11-02 16:42       ` Matan Azrad
  2017-11-02 16:42       ` [PATCH v5 7/8] net/mlx4: fix HW memory optimizations careless Matan Azrad
                         ` (4 subsequent siblings)
  10 siblings, 0 replies; 84+ messages in thread
From: Matan Azrad @ 2017-11-02 16:42 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Optimize the single-segment case by processing it in a separate block,
which avoids checks, calculations and barriers relevant only to the
multi-segment case.

Call a dedicated function to handle the multi-segment case.
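
For illustration, a minimal sketch of the resulting control flow, assuming
hypothetical helpers post_single_seg() and post_multi_seg() that stand in
for the driver's real WQE builders:

#include <stdint.h>
#include <rte_mbuf.h>

/* Hypothetical WQE builders for the single- and multi-segment paths. */
int post_single_seg(void *txq, struct rte_mbuf *buf);
int post_multi_seg(void *txq, struct rte_mbuf *buf);

static uint16_t
tx_burst_sketch(void *txq, struct rte_mbuf **pkts, uint16_t n)
{
	uint16_t i;

	for (i = 0; i < n; i++) {
		struct rte_mbuf *buf = pkts[i];
		/*
		 * Common case first: a single data segment means a fixed-size
		 * WQE, one byte_count store and no bounce buffer or extra
		 * barriers; multi-segment packets go to a dedicated helper.
		 */
		int nr_txbbs = (buf->nb_segs == 1) ?
			       post_single_seg(txq, buf) :
			       post_multi_seg(txq, buf);
		if (nr_txbbs < 0)
			break; /* no room left in the send queue */
	}
	return i;
}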

Signed-off-by: Matan Azrad <matan@mellanox.com>
Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 237 +++++++++++++++++++++++++++----------------
 1 file changed, 152 insertions(+), 85 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index abd125f..d13c8d2 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -61,6 +61,9 @@
 #include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
+#define WQE_ONE_DATA_SEG_SIZE \
+	(sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
+
 /**
  * Pointer-value pair structure used in tx_post_send for saving the first
  * DWORD (32 byte) of a TXBB.
@@ -140,21 +143,18 @@ struct pv {
  *   0 on success, -1 on failure.
  */
 static int
-mlx4_txq_complete(struct txq *txq)
+mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
+				  struct mlx4_sq *sq)
 {
 	unsigned int elts_comp = txq->elts_comp;
 	unsigned int elts_tail = txq->elts_tail;
-	const unsigned int elts_n = txq->elts_n;
 	struct mlx4_cq *cq = &txq->mcq;
-	struct mlx4_sq *sq = &txq->msq;
 	struct mlx4_cqe *cqe;
 	uint32_t cons_index = cq->cons_index;
 	uint16_t new_index;
 	uint16_t nr_txbbs = 0;
 	int pkts = 0;
 
-	if (unlikely(elts_comp == 0))
-		return 0;
 	/*
 	 * Traverse over all CQ entries reported and handle each WQ entry
 	 * reported by them.
@@ -237,6 +237,120 @@ struct pv {
 	return buf->pool;
 }
 
+static int
+mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
+			       struct mlx4_wqe_ctrl_seg **pctrl)
+{
+	int wqe_real_size;
+	int nr_txbbs;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
+	struct mlx4_sq *sq = &txq->msq;
+	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_data_seg *dseg;
+	struct rte_mbuf *sbuf;
+	uint32_t lkey;
+	uintptr_t addr;
+	uint32_t byte_count;
+	int pv_counter = 0;
+
+	/* Calculate the needed work queue entry size for this packet. */
+	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+		buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+	/*
+	 * Check that there is room for this WQE in the send queue and that
+	 * the WQE size is legal.
+	 */
+	if (((sq->head - sq->tail) + nr_txbbs +
+				sq->headroom_txbbs) >= sq->txbb_cnt ||
+			nr_txbbs > MLX4_MAX_WQE_TXBBS) {
+		return -1;
+	}
+	/* Get the control and data entries of the WQE. */
+	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
+	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+			sizeof(struct mlx4_wqe_ctrl_seg));
+	*pctrl = ctrl;
+	/* Fill the data segments with buffer information. */
+	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+		rte_prefetch0((volatile void *)addr);
+		/* Handle WQE wraparound. */
+		if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
+			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+		dseg->addr = rte_cpu_to_be_64(addr);
+		/* Memory region key (big endian) for this memory pool. */
+		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+		dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+		/* Calculate the needed work queue entry size for this packet */
+		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
+			/* MR does not exist. */
+			DEBUG("%p: unable to get MP <-> MR association",
+					(void *)txq);
+			/*
+			 * Restamp entry in case of failure.
+			 * Make sure that size is written correctly
+			 * Note that we give ownership to the SW, not the HW.
+			 */
+			wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+			mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					(sq->head & sq->txbb_cnt) ? 0 : 1);
+			return -1;
+		}
+#endif /* NDEBUG */
+		if (likely(sbuf->data_len)) {
+			byte_count = rte_cpu_to_be_32(sbuf->data_len);
+		} else {
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			byte_count = RTE_BE32(0x80000000);
+		}
+		/*
+		 * If the data segment is not at the beginning of a
+		 * Tx basic block (TXBB) then write the byte count,
+		 * else postpone the writing to just before updating the
+		 * control segment.
+		 */
+		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+			/*
+			 * Need a barrier here before writing the byte_count
+			 * fields to make sure that all the data is visible
+			 * before the byte_count field is set.
+			 * Otherwise, if the segment begins a new cacheline,
+			 * the HCA prefetcher could grab the 64-byte chunk and
+			 * get a valid (!= 0xffffffff) byte count but stale
+			 * data, and end up sending the wrong data.
+			 */
+			rte_io_wmb();
+			dseg->byte_count = byte_count;
+		} else {
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[pv_counter].dseg = dseg;
+			pv[pv_counter++].val = byte_count;
+		}
+	}
+	/* Write the first DWORD of each TXBB save earlier. */
+	if (pv_counter) {
+		/* Need a barrier here before writing the byte_count. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
+	}
+	/* Fill the control parameters for this packet. */
+	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+	return nr_txbbs;
+}
+
 /**
  * DPDK callback for Tx.
  *
@@ -260,10 +374,11 @@ struct pv {
 	unsigned int i;
 	unsigned int max;
 	struct mlx4_sq *sq = &txq->msq;
-	struct pv *pv = (struct pv *)txq->bounce_buf;
+	int nr_txbbs;
 
 	assert(txq->elts_comp_cd != 0);
-	mlx4_txq_complete(txq);
+	if (likely(txq->elts_comp != 0))
+		mlx4_txq_complete(txq, elts_n, sq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
@@ -282,7 +397,6 @@ struct pv {
 		uint32_t owner_opcode = MLX4_OPCODE_SEND;
 		struct mlx4_wqe_ctrl_seg *ctrl;
 		struct mlx4_wqe_data_seg *dseg;
-		struct rte_mbuf *sbuf;
 		union {
 			uint32_t flags;
 			uint16_t flags16[2];
@@ -290,10 +404,6 @@ struct pv {
 		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
 		uint32_t lkey;
 		uintptr_t addr;
-		uint32_t byte_count;
-		int wqe_real_size;
-		int nr_txbbs;
-		int pv_counter = 0;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -312,38 +422,29 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		/*
-		 * Calculate the needed work queue entry size
-		 * for this packet.
-		 */
-		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
-		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-		/*
-		 * Check that there is room for this WQE in the send
-		 * queue and that the WQE size is legal.
-		 */
-		if (((sq->head - sq->tail) + nr_txbbs +
-		     sq->headroom_txbbs) >= sq->txbb_cnt ||
-		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-			elt->buf = NULL;
-			break;
-		}
-		/* Get the control and data entries of the WQE. */
-		ctrl = (struct mlx4_wqe_ctrl_seg *)
-				mlx4_get_send_wqe(sq, head_idx);
-		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-				sizeof(struct mlx4_wqe_ctrl_seg));
-		/* Fill the data segments with buffer information. */
-		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
-			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+		if (buf->nb_segs == 1) {
+			/*
+			 * Check that there is room for this WQE in the send
+			 * queue and that the WQE size is legal
+			 */
+			if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs) >=
+			     sq->txbb_cnt || 1 > MLX4_MAX_WQE_TXBBS) {
+				elt->buf = NULL;
+				break;
+			}
+			/* Get the control and data entries of the WQE. */
+			ctrl = (struct mlx4_wqe_ctrl_seg *)
+					mlx4_get_send_wqe(sq, head_idx);
+			dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+					sizeof(struct mlx4_wqe_ctrl_seg));
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			rte_prefetch0((volatile void *)addr);
 			/* Handle WQE wraparound. */
 			if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
 				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
 			dseg->addr = rte_cpu_to_be_64(addr);
 			/* Memory region key (big endian). */
-			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
 			dseg->lkey = rte_cpu_to_be_32(lkey);
 #ifndef NDEBUG
 			if (unlikely(dseg->lkey ==
@@ -357,61 +458,27 @@ struct pv {
 				 * Note that we give ownership to the SW,
 				 * not the HW.
 				 */
-				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+				ctrl->fence_size =
+					(WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
 				mlx4_txq_stamp_freed_wqe(sq, head_idx,
 					     (sq->head & sq->txbb_cnt) ? 0 : 1);
 				elt->buf = NULL;
 				break;
 			}
 #endif /* NDEBUG */
-			if (likely(sbuf->data_len)) {
-				byte_count = rte_cpu_to_be_32(sbuf->data_len);
-			} else {
-				/*
-				 * Zero length segment is treated as inline
-				 * segment with zero data.
-				 */
-				byte_count = RTE_BE32(0x80000000);
-			}
-			/*
-			 * If the data segment is not at the beginning
-			 * of a Tx basic block (TXBB) then write the
-			 * byte count, else postpone the writing to
-			 * just before updating the control segment.
-			 */
-			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-				/*
-				 * Need a barrier here before writing the
-				 * byte_count fields to make sure that all the
-				 * data is visible before the byte_count field
-				 * is set. otherwise, if the segment begins a
-				 * new cacheline, the HCA prefetcher could grab
-				 * the 64-byte chunk and get a valid
-				 * (!= 0xffffffff) byte count but stale data,
-				 * and end up sending the wrong data.
-				 */
-				rte_io_wmb();
-				dseg->byte_count = byte_count;
-			} else {
-				/*
-				 * This data segment starts at the beginning of
-				 * a new TXBB, so we need to postpone its
-				 * byte_count writing for later.
-				 */
-				pv[pv_counter].dseg = dseg;
-				pv[pv_counter++].val = byte_count;
-			}
-		}
-		/* Write the first DWORD of each TXBB save earlier. */
-		if (pv_counter) {
-			/* Need a barrier before writing the byte_count. */
+			/* Need a barrier here before byte count store. */
 			rte_io_wmb();
-			for (--pv_counter; pv_counter  >= 0; pv_counter--)
-				pv[pv_counter].dseg->byte_count =
-						pv[pv_counter].val;
+			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
+			/* Fill the control parameters for this packet. */
+			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+			nr_txbbs = 1;
+		} else {
+			nr_txbbs = mlx4_tx_burst_segs(buf, txq, &ctrl);
+			if (nr_txbbs < 0) {
+				elt->buf = NULL;
+				break;
+			}
 		}
-		/* Fill the control parameters for this packet. */
-		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
 		/*
 		 * For raw Ethernet, the SOLICIT flag is used to indicate
 		 * that no ICRC should be calculated.
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v5 7/8] net/mlx4: fix HW memory optimizations careless
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (5 preceding siblings ...)
  2017-11-02 16:42       ` [PATCH v5 6/8] net/mlx4: separate Tx segment cases Matan Azrad
@ 2017-11-02 16:42       ` Matan Azrad
  2017-11-02 16:42       ` [PATCH v5 8/8] net/mlx4: mitigate Tx path memory barriers Matan Azrad
                         ` (3 subsequent siblings)
  10 siblings, 0 replies; 84+ messages in thread
From: Matan Azrad @ 2017-11-02 16:42 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Make all Rx/Tx HW negotiation memories volatile to ensure no compiler
optimization elides a load or store command.
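
A minimal sketch of why the volatile qualifier matters for memory the HW
updates asynchronously; the CQE layout and function name below are
illustrative only, not the driver's actual definitions:

#include <stdint.h>
#include <rte_pause.h>

/* Illustrative CQE; only the ownership bit is relevant here. */
struct cqe_sketch { uint8_t owner_sr_opcode; uint8_t rsvd[31]; };

/*
 * The NIC updates this memory behind the compiler's back.  Without the
 * volatile qualifier, the compiler may hoist the load out of the loop
 * and spin on a stale register value instead of re-reading DRAM.
 */
static inline void
wait_for_ownership(volatile struct cqe_sketch *c, uint8_t sw_owner)
{
	while ((c->owner_sr_opcode & 0x80) != sw_owner)
		rte_pause(); /* every iteration issues a fresh load */
}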

Fixes: c3c977bbecbd ("net/mlx4: add Tx bypassing Verbs")
Fixes: 9f57340a8087 ("net/mlx4: restore Rx offloads")
Fixes: 6681b845034c ("net/mlx4: add Rx bypassing Verbs")
Fixes: 62e96ffb93ad ("net/mlx4: fix no Rx interrupts")

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4_prm.h  | 20 ++++++-------
 drivers/net/mlx4/mlx4_rxtx.c | 67 ++++++++++++++++++++++++--------------------
 2 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index b0fd982..339831a 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -80,14 +80,14 @@ enum {
 
 /* Send queue information. */
 struct mlx4_sq {
-	uint8_t *buf; /**< SQ buffer. */
-	uint8_t *eob; /**< End of SQ buffer */
+	volatile uint8_t *buf; /**< SQ buffer. */
+	volatile uint8_t *eob; /**< End of SQ buffer */
 	uint32_t head; /**< SQ head counter in units of TXBBS. */
 	uint32_t tail; /**< SQ tail counter in units of TXBBS. */
 	uint32_t txbb_cnt; /**< Num of WQEBB in the Q (should be ^2). */
 	uint32_t txbb_cnt_mask; /**< txbbs_cnt mask (txbb_cnt is ^2). */
 	uint32_t headroom_txbbs; /**< Num of txbbs that should be kept free. */
-	uint32_t *db; /**< Pointer to the doorbell. */
+	volatile uint32_t *db; /**< Pointer to the doorbell. */
 	uint32_t doorbell_qpn; /**< qp number to write to the doorbell. */
 };
 
@@ -100,11 +100,11 @@ struct mlx4_sq {
 
 /* Completion queue information. */
 struct mlx4_cq {
-	void *cq_uar; /**< CQ user access region. */
-	void *cq_db_reg; /**< CQ doorbell register. */
-	uint32_t *set_ci_db; /**< Pointer to the completion queue doorbell. */
-	uint32_t *arm_db; /**< Pointer to doorbell for arming Rx events. */
-	uint8_t *buf; /**< Pointer to the completion queue buffer. */
+	volatile void *cq_uar; /**< CQ user access region. */
+	volatile void *cq_db_reg; /**< CQ doorbell register. */
+	volatile uint32_t *set_ci_db; /**< Pointer to the CQ doorbell. */
+	volatile uint32_t *arm_db; /**< Arming Rx events doorbell. */
+	volatile uint8_t *buf; /**< Pointer to the completion queue buffer. */
 	uint32_t cqe_cnt; /**< Number of entries in the queue. */
 	uint32_t cqe_64:1; /**< CQ entry size is 64 bytes. */
 	uint32_t cons_index; /**< Last queue entry that was handled. */
@@ -128,10 +128,10 @@ struct mlx4_cq {
  * @return
  *   Pointer to CQE entry.
  */
-static inline struct mlx4_cqe *
+static inline volatile struct mlx4_cqe *
 mlx4_get_cqe(struct mlx4_cq *cq, uint32_t index)
 {
-	return (struct mlx4_cqe *)(cq->buf +
+	return (volatile struct mlx4_cqe *)(cq->buf +
 				   ((index & (cq->cqe_cnt - 1)) <<
 				    (5 + cq->cqe_64)) +
 				   (cq->cqe_64 << 5));
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index d13c8d2..20060ce 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -69,7 +69,7 @@
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-	struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_data_seg *dseg;
 	uint32_t val;
 };
 
@@ -97,14 +97,15 @@ struct pv {
 {
 	uint32_t stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
 					  (!!owner << MLX4_SQ_STAMP_SHIFT));
-	uint8_t *wqe = mlx4_get_send_wqe(sq, (index & sq->txbb_cnt_mask));
-	uint32_t *ptr = (uint32_t *)wqe;
+	volatile uint8_t *wqe = mlx4_get_send_wqe(sq,
+						(index & sq->txbb_cnt_mask));
+	volatile uint32_t *ptr = (volatile uint32_t *)wqe;
 	int i;
 	int txbbs_size;
 	int num_txbbs;
 
 	/* Extract the size from the control segment of the WQE. */
-	num_txbbs = MLX4_SIZE_TO_TXBBS((((struct mlx4_wqe_ctrl_seg *)
+	num_txbbs = MLX4_SIZE_TO_TXBBS((((volatile struct mlx4_wqe_ctrl_seg *)
 					 wqe)->fence_size & 0x3f) << 4);
 	txbbs_size = num_txbbs * MLX4_TXBB_SIZE;
 	/* Optimize the common case when there is no wrap-around. */
@@ -119,8 +120,8 @@ struct pv {
 		for (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) {
 			*ptr = stamp;
 			ptr += MLX4_SQ_STAMP_DWORDS;
-			if ((uint8_t *)ptr >= sq->eob) {
-				ptr = (uint32_t *)sq->buf;
+			if ((volatile uint8_t *)ptr >= sq->eob) {
+				ptr = (volatile uint32_t *)sq->buf;
 				stamp ^= RTE_BE32(0x80000000);
 			}
 		}
@@ -149,7 +150,7 @@ struct pv {
 	unsigned int elts_comp = txq->elts_comp;
 	unsigned int elts_tail = txq->elts_tail;
 	struct mlx4_cq *cq = &txq->mcq;
-	struct mlx4_cqe *cqe;
+	volatile struct mlx4_cqe *cqe;
 	uint32_t cons_index = cq->cons_index;
 	uint16_t new_index;
 	uint16_t nr_txbbs = 0;
@@ -160,7 +161,7 @@ struct pv {
 	 * reported by them.
 	 */
 	do {
-		cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
+		cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
 		if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
 		    !!(cons_index & cq->cqe_cnt)))
 			break;
@@ -171,8 +172,8 @@ struct pv {
 #ifndef NDEBUG
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 			     MLX4_CQE_OPCODE_ERROR)) {
-			struct mlx4_err_cqe *cqe_err =
-				(struct mlx4_err_cqe *)cqe;
+			volatile struct mlx4_err_cqe *cqe_err =
+				(volatile struct mlx4_err_cqe *)cqe;
 			ERROR("%p CQE error - vendor syndrome: 0x%x"
 			      " syndrome: 0x%x\n",
 			      (void *)txq, cqe_err->vendor_err,
@@ -239,15 +240,15 @@ struct pv {
 
 static int
 mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
-			       struct mlx4_wqe_ctrl_seg **pctrl)
+		   volatile struct mlx4_wqe_ctrl_seg **pctrl)
 {
 	int wqe_real_size;
 	int nr_txbbs;
 	struct pv *pv = (struct pv *)txq->bounce_buf;
 	struct mlx4_sq *sq = &txq->msq;
 	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
-	struct mlx4_wqe_ctrl_seg *ctrl;
-	struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl;
+	volatile struct mlx4_wqe_data_seg *dseg;
 	struct rte_mbuf *sbuf;
 	uint32_t lkey;
 	uintptr_t addr;
@@ -255,8 +256,8 @@ struct pv {
 	int pv_counter = 0;
 
 	/* Calculate the needed work queue entry size for this packet. */
-	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-		buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+	wqe_real_size = sizeof(volatile struct mlx4_wqe_ctrl_seg) +
+		buf->nb_segs * sizeof(volatile struct mlx4_wqe_data_seg);
 	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
 	/*
 	 * Check that there is room for this WQE in the send queue and that
@@ -268,17 +269,18 @@ struct pv {
 		return -1;
 	}
 	/* Get the control and data entries of the WQE. */
-	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
-	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-			sizeof(struct mlx4_wqe_ctrl_seg));
+	ctrl = (volatile struct mlx4_wqe_ctrl_seg *)
+			mlx4_get_send_wqe(sq, head_idx);
+	dseg = (volatile struct mlx4_wqe_data_seg *)
+			((uintptr_t)ctrl + sizeof(struct mlx4_wqe_ctrl_seg));
 	*pctrl = ctrl;
 	/* Fill the data segments with buffer information. */
 	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
 		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
 		rte_prefetch0((volatile void *)addr);
 		/* Handle WQE wraparound. */
-		if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
-			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+		if (dseg >= (volatile struct mlx4_wqe_data_seg *)sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)sq->buf;
 		dseg->addr = rte_cpu_to_be_64(addr);
 		/* Memory region key (big endian) for this memory pool. */
 		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
@@ -395,8 +397,8 @@ struct pv {
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
 		uint32_t owner_opcode = MLX4_OPCODE_SEND;
-		struct mlx4_wqe_ctrl_seg *ctrl;
-		struct mlx4_wqe_data_seg *dseg;
+		volatile struct mlx4_wqe_ctrl_seg *ctrl;
+		volatile struct mlx4_wqe_data_seg *dseg;
 		union {
 			uint32_t flags;
 			uint16_t flags16[2];
@@ -433,15 +435,18 @@ struct pv {
 				break;
 			}
 			/* Get the control and data entries of the WQE. */
-			ctrl = (struct mlx4_wqe_ctrl_seg *)
+			ctrl = (volatile struct mlx4_wqe_ctrl_seg *)
 					mlx4_get_send_wqe(sq, head_idx);
-			dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)ctrl +
 					sizeof(struct mlx4_wqe_ctrl_seg));
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			rte_prefetch0((volatile void *)addr);
 			/* Handle WQE wraparound. */
-			if (dseg >= (struct mlx4_wqe_data_seg *)sq->eob)
-				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+			if (dseg >=
+				(volatile struct mlx4_wqe_data_seg *)sq->eob)
+				dseg = (volatile struct mlx4_wqe_data_seg *)
+						sq->buf;
 			dseg->addr = rte_cpu_to_be_64(addr);
 			/* Memory region key (big endian). */
 			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
@@ -633,7 +638,7 @@ struct pv {
  *   CQE checksum information.
  */
 static inline uint32_t
-mlx4_cqe_flags(struct mlx4_cqe *cqe, int csum, int csum_l2tun)
+mlx4_cqe_flags(volatile struct mlx4_cqe *cqe, int csum, int csum_l2tun)
 {
 	uint32_t flags = 0;
 
@@ -666,13 +671,13 @@ struct pv {
  *   Number of bytes of the CQE, 0 in case there is no completion.
  */
 static unsigned int
-mlx4_cq_poll_one(struct rxq *rxq, struct mlx4_cqe **out)
+mlx4_cq_poll_one(struct rxq *rxq, volatile struct mlx4_cqe **out)
 {
 	int ret = 0;
-	struct mlx4_cqe *cqe = NULL;
+	volatile struct mlx4_cqe *cqe = NULL;
 	struct mlx4_cq *cq = &rxq->mcq;
 
-	cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
+	cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
 	if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
 	    !!(cq->cons_index & cq->cqe_cnt))
 		goto out;
@@ -717,7 +722,7 @@ struct pv {
 	int len = 0;
 
 	while (pkts_n) {
-		struct mlx4_cqe *cqe;
+		volatile struct mlx4_cqe *cqe;
 		uint32_t idx = rq_ci & wr_cnt;
 		struct rte_mbuf *rep = (*rxq->elts)[idx];
 		volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v5 8/8] net/mlx4: mitigate Tx path memory barriers
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (6 preceding siblings ...)
  2017-11-02 16:42       ` [PATCH v5 7/8] net/mlx4: fix HW memory optimizations careless Matan Azrad
@ 2017-11-02 16:42       ` Matan Azrad
  2017-11-02 17:07       ` [PATCH v5 0/8] net/mlx4: Tx path improvements Adrien Mazarguil
                         ` (2 subsequent siblings)
  10 siblings, 0 replies; 84+ messages in thread
From: Matan Azrad @ 2017-11-02 16:42 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ophir Munk

Replace most of the memory barriers with IO memory barriers since they
all target DRAM; this improves code efficiency on systems which enforce
store ordering between different addresses.

Only the doorbell register store should be protected by a full memory
barrier since it targets the PCI memory domain.

Limit the IO memory barrier before the byte count store to systems with
a cache line size smaller than 64B (the TXBB size).

This patch improves Tx performance by 0.2 Mpps for single-segment 64B
packets in a 1 queue, 1 core test.
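
A minimal sketch of the ordering described above, with made-up field and
function names (only the DPDK barrier and register-write helpers are real):
DRAM-only stores are ordered with rte_io_wmb(), while the final doorbell
write to PCI memory is preceded by a full rte_wmb().

#include <stdint.h>
#include <rte_atomic.h>
#include <rte_io.h>

static inline void
post_descriptor_sketch(volatile uint64_t *wqe_data, uint64_t desc,
		       volatile uint32_t *wqe_owner, uint32_t own_bit,
		       volatile uint32_t *doorbell, uint32_t db_val)
{
	*wqe_data = desc;	/* descriptor contents (DRAM) */
	rte_io_wmb();		/* order DRAM stores before the ownership flip */
	*wqe_owner = own_bit;	/* hand the WQE over to HW (still DRAM) */
	rte_wmb();		/* full barrier: DRAM stores before the PCI write */
	rte_write32(db_val, doorbell);	/* doorbell register (PCI domain) */
}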

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 20060ce..5f8adec 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -168,7 +168,7 @@ struct pv {
 		/*
 		 * Make sure we read the CQE after we read the ownership bit.
 		 */
-		rte_rmb();
+		rte_io_rmb();
 #ifndef NDEBUG
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 			     MLX4_CQE_OPCODE_ERROR)) {
@@ -196,14 +196,9 @@ struct pv {
 	} while (1);
 	if (unlikely(pkts == 0))
 		return 0;
-	/*
-	 * Update CQ.
-	 * To prevent CQ overflow we first update CQ consumer and only then
-	 * the ring consumer.
-	 */
+	/* Update CQ. */
 	cq->cons_index = cons_index;
 	*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & MLX4_CQ_DB_CI_MASK);
-	rte_wmb();
 	sq->tail = sq->tail + nr_txbbs;
 	/* Update the list of packets posted for transmission. */
 	elts_comp -= pkts;
@@ -320,6 +315,7 @@ struct pv {
 		 * control segment.
 		 */
 		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+#if RTE_CACHE_LINE_SIZE < 64
 			/*
 			 * Need a barrier here before writing the byte_count
 			 * fields to make sure that all the data is visible
@@ -330,6 +326,7 @@ struct pv {
 			 * data, and end up sending the wrong data.
 			 */
 			rte_io_wmb();
+#endif /* RTE_CACHE_LINE_SIZE */
 			dseg->byte_count = byte_count;
 		} else {
 			/*
@@ -471,8 +468,7 @@ struct pv {
 				break;
 			}
 #endif /* NDEBUG */
-			/* Need a barrier here before byte count store. */
-			rte_io_wmb();
+			/* Never be TXBB aligned, no need compiler barrier. */
 			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
 			/* Fill the control parameters for this packet. */
 			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
@@ -534,7 +530,7 @@ struct pv {
 		 * setting ownership bit (because HW can start
 		 * executing as soon as we do).
 		 */
-		rte_wmb();
+		rte_io_wmb();
 		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
 					      ((sq->head & sq->txbb_cnt) ?
 						       MLX4_BIT_WQE_OWN : 0));
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 84+ messages in thread
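A minimal sketch, assuming hypothetical names (tx_write_byte_count, tx_ring_doorbell,
dseg, db_reg, doorbell_val), of the barrier split described in the patch above:
rte_io_wmb() is enough to order WQE stores that only target DRAM, while the doorbell
register, which lives in the PCI memory domain, keeps a full rte_wmb(). This is an
illustration of the pattern, not the mlx4 PMD's actual code.

#include <stdint.h>
#include <rte_config.h>     /* RTE_CACHE_LINE_SIZE */
#include <rte_atomic.h>     /* rte_wmb(), rte_io_wmb() */
#include <rte_byteorder.h>  /* rte_cpu_to_be_32() */
#include <rte_io.h>         /* rte_write32() */

struct tx_dseg {            /* stand-in for a WQE data segment in DRAM */
        uint32_t byte_count;
};

/* Store the byte count of a data segment; only sub-64B cache lines need
 * the IO barrier, since a whole 64B TXBB is otherwise flushed at once. */
static inline void
tx_write_byte_count(struct tx_dseg *dseg, uint32_t bytes)
{
#if RTE_CACHE_LINE_SIZE < 64
        rte_io_wmb();
#endif
        dseg->byte_count = rte_cpu_to_be_32(bytes);
}

/* Hand the WQE to HW, then ring the doorbell. */
static inline void
tx_ring_doorbell(volatile uint32_t *owner_opcode, uint32_t opcode,
                 volatile void *db_reg, uint32_t doorbell_val)
{
        /* WQE stores target DRAM, so an IO write barrier is enough before
         * giving HW ownership of the descriptor. */
        rte_io_wmb();
        *owner_opcode = rte_cpu_to_be_32(opcode);
        /* The doorbell register is in the PCI memory domain and still needs
         * a full memory barrier before the write that rings it. */
        rte_wmb();
        rte_write32(doorbell_val, db_reg);
}

On architectures that already order plain memory stores (such as x86), rte_io_wmb()
typically reduces to a compiler barrier, which is where the performance gain quoted
above comes from; only the PCI-mapped doorbell write keeps the stronger barrier.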

* Re: [PATCH v5 0/8] net/mlx4: Tx path improvements
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (7 preceding siblings ...)
  2017-11-02 16:42       ` [PATCH v5 8/8] net/mlx4: mitigate Tx path memory barriers Matan Azrad
@ 2017-11-02 17:07       ` Adrien Mazarguil
  2017-11-02 20:35         ` Ferruh Yigit
  2017-11-02 20:41       ` Ferruh Yigit
  2017-11-03 19:25       ` Ferruh Yigit
  10 siblings, 1 reply; 84+ messages in thread
From: Adrien Mazarguil @ 2017-11-02 17:07 UTC (permalink / raw)
  To: Matan Azrad, Ferruh Yigit; +Cc: dev, Ophir Munk

On Thu, Nov 02, 2017 at 04:42:43PM +0000, Matan Azrad wrote:
> v1:
> This series is a follow-up of the new datapath implementation introduced in RC1.
> It is dedicated to Tx and improves the performance.
> 
> v2:
> Change uint32_t be_lkey to rte_be32_t lkey
> 
> v3:
> Rebase to 17.11-rc2.
> Remove "net/mlx4: save lkey in big-endian format" patch which degrades performance.
> Merge separate segment cases patches "net/mlx4: improve performance of one Tx segment" and "net/mlx4: separate Tx for multi-segments".
> "net/mlx4: inline more Tx functions" renamed to "net/mlx4: associate MR to MP in a short function".
> Leave bytes counter as old version.
> Add memory barrier improvement patch.
> Remove empty segment support.
> 
> v4:
> Remove "net/mlx4: remove empty Tx segment support" patch.
> Add "fix ring wraparound compiler hint" patch.
> Add "fix HW memory optimizations careless" patch.
> Remove unnecessary IO memory barrier in completion function.
> Remove inline declaration of completion function.
> Fix unnecessary empty lines.
> Fix indentations.
> Move max=0 check removal to patch "remove duplicate handling in Tx burst" and rename the patch.
> Add performance improvement in patch "mitigate Tx path memory barriers".
> 
> v5:
> Fix compilation issue in debug mode for patch "net/mlx4: associate MR to MP in a short function".
> Add missing includes in "net/mlx4: associate MR to MP in a short function".
> Add cq_uar as volatile too in "net/mlx4: fix HW memory optimizations careless".
> Fix indentation issues in "net/mlx4: separate Tx segment cases".

OK for this version, although it looks like you didn't include:

 "net/mlx4: fix missing include"  

in front of the series. mlx4_utils.h remains to be fixed, although I don't
think a v6 is needed for such a minor commit.

Ferruh, can you apply the above patch first for consistency?

Thanks.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v5 0/8] net/mlx4: Tx path improvements
  2017-11-02 17:07       ` [PATCH v5 0/8] net/mlx4: Tx path improvements Adrien Mazarguil
@ 2017-11-02 20:35         ` Ferruh Yigit
  0 siblings, 0 replies; 84+ messages in thread
From: Ferruh Yigit @ 2017-11-02 20:35 UTC (permalink / raw)
  To: Adrien Mazarguil, Matan Azrad; +Cc: dev, Ophir Munk

On 11/2/2017 10:07 AM, Adrien Mazarguil wrote:
> On Thu, Nov 02, 2017 at 04:42:43PM +0000, Matan Azrad wrote:
>> v1:
>> This series is a follow-up of the new datapath implementation introduced in RC1.
>> It is dedicated to Tx and improves the performance.
>>
>> v2:
>> Change uint32_t be_lkey to rte_be32_t lkey
>>
>> v3:
>> Rebase to 17.11-rc2.
>> Remove "net/mlx4: save lkey in big-endian format" patch which degrades performance.
>> Merge separate segment cases patches "net/mlx4: improve performance of one Tx segment" and "net/mlx4: separate Tx for multi-segments".
>> "net/mlx4: inline more Tx functions" renamed to "net/mlx4: associate MR to MP in a short function".
>> Leave bytes counter as old version.
>> Add memory barrier improvement patch.
>> Remove empty segment support.
>>
>> v4:
>> Remove "net/mlx4: remove empty Tx segment support" patch.
>> Add "fix ring wraparound compiler hint" patch.
>> Add "fix HW memory optimizations careless" patch.
>> Remove unnecessary IO memory barrier in completion function.
>> Remove inline declaration of completion function.
>> Fix unnecessary empty lines.
>> Fix indentations.
>> Move max=0 check removal to patch "remove duplicate handling in Tx burst" and rename the patch.
>> Add performance improvement in patch "mitigate Tx path memory barriers".
>>
>> v5:
>> Fix compilation issue in debug mode for patch "net/mlx4: associate MR to MP in a short function".
>> Add missing includes in "net/mlx4: associate MR to MP in a short function".
>> Add cq_uar as volatile too in "net/mlx4: fix HW memory optimizations careless".
>> Fix indentation issues in "net/mlx4: separate Tx segment cases".
> 
> OK for this version, although it looks like you didn't include:
> 
>  "net/mlx4: fix missing include"  
> 
> in front of the series. mlx4_utils.h remains to be fixed, although I don't
> think a v6 is needed for such a minor commit.
> 
> Ferruh, can you apply the above patch first for consistency?

Yes I can, no need for a new version for this.

> 
> Thanks.
> 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH] net/mlx4: fix missing include
  2017-11-02 13:41       ` [PATCH] net/mlx4: fix missing include Adrien Mazarguil
@ 2017-11-02 20:35         ` Ferruh Yigit
  0 siblings, 0 replies; 84+ messages in thread
From: Ferruh Yigit @ 2017-11-02 20:35 UTC (permalink / raw)
  To: Adrien Mazarguil, Matan Azrad, Ophir Munk; +Cc: dev

On 11/2/2017 6:41 AM, Adrien Mazarguil wrote:
> Fixes: 76df01ff622f ("net/mlx4: separate debugging macros")
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

Applied to dpdk-next-net/master, thanks.

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v5 0/8] net/mlx4: Tx path improvements
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (8 preceding siblings ...)
  2017-11-02 17:07       ` [PATCH v5 0/8] net/mlx4: Tx path improvements Adrien Mazarguil
@ 2017-11-02 20:41       ` Ferruh Yigit
  2017-11-03  9:48         ` Adrien Mazarguil
  2017-11-03 19:25       ` Ferruh Yigit
  10 siblings, 1 reply; 84+ messages in thread
From: Ferruh Yigit @ 2017-11-02 20:41 UTC (permalink / raw)
  To: Matan Azrad, Adrien Mazarguil; +Cc: dev, Ophir Munk

On 11/2/2017 9:42 AM, Matan Azrad wrote:
> v1:
> This series is a follow-up of the new datapath implementation introduced in RC1.
> It is dedicated to Tx and improves the performance.
> 
> v2:
> Change uint32_t be_lkey to rte_be32_t lkey
> 
> v3:
> Rebase to 17.11-rc2.
> Remove "net/mlx4: save lkey in big-endian format" patch which degrades performance.
> Merge separate segment cases patches "net/mlx4: improve performance of one Tx segment" and "net/mlx4: separate Tx for multi-segments".
> "net/mlx4: inline more Tx functions" renamed to "net/mlx4: associate MR to MP in a short function".
> Leave bytes counter as old version.
> Add memory barrier improvement patch.
> Remove empty segment support.
> 
> v4:
> Remove "net/mlx4: remove empty Tx segment support" patch.
> Add "fix ring wraparound compiler hint" patch.
> Add "fix HW memory optimizations careless" patch.
> Remove unnecessary IO memory barrier in completion function.
> Remove inline declaration of completion function.
> Fix unnecessary empty lines.
> Fix indentations.
> Move max=0 check removal to patch "remove duplicate handling in Tx burst" and rename the patch.
> Add performance improvement in patch "mitigate Tx path memory barriers".
> 
> v5:
> Fix compilation issue in debug mode for patch "net/mlx4: associate MR to MP in a short function".
> Add missing includes in "net/mlx4: associate MR to MP in a short function".
> Add cq_uar as volatile too in "net/mlx4: fix HW memory optimizations careless".
> Fix indentation issues in "net/mlx4: separate Tx segment cases".
> 
> 
> Matan Azrad (7):
>   net/mlx4: remove error flows from Tx fast path
>   net/mlx4: fix ring wraparound compiler hint
>   net/mlx4: merge Tx path functions
>   net/mlx4: remove duplicate handling in Tx burst
>   net/mlx4: separate Tx segment cases
>   net/mlx4: fix HW memory optimizations careless
>   net/mlx4: mitigate Tx path memory barriers
> 
> Ophir Munk (1):
>   net/mlx4: associate MR to MP in a short function

Patches have been sent and acked, so I already have my answer, but just to
double check: these are not just the fixes that rc3 normally targets; there
are data path updates and performance improvements.

Can you please confirm you want these patches in at the rc3 phase?
I will wait for your explicit approval before getting them.

Thanks,
ferruh

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v5 0/8] net/mlx4: Tx path improvements
  2017-11-02 20:41       ` Ferruh Yigit
@ 2017-11-03  9:48         ` Adrien Mazarguil
  0 siblings, 0 replies; 84+ messages in thread
From: Adrien Mazarguil @ 2017-11-03  9:48 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: Matan Azrad, dev, Ophir Munk

On Thu, Nov 02, 2017 at 01:41:39PM -0700, Ferruh Yigit wrote:
> On 11/2/2017 9:42 AM, Matan Azrad wrote:
> > v1:
> > This series is a follow-up of the new datapath implementation introduced in RC1.
> > It is dedicated to Tx and improves the performance.
> > 
> > v2:
> > Change uint32_t be_lkey to rte_be32_t lkey
> > 
> > v3:
> > Rebase to 17.11-rc2.
> > Remove "net/mlx4: save lkey in big-endian format" patch which degrades performance.
> > Merge separate segment cases patches "net/mlx4: improve performance of one Tx segment" and "net/mlx4: separate Tx for multi-segments".
> > "net/mlx4: inline more Tx functions" renamed to "net/mlx4: associate MR to MP in a short function".
> > Leave bytes counter as old version.
> > Add memory barrier improvement patch.
> > Remove empty segment support.
> > 
> > v4:
> > Remove "net/mlx4: remove empty Tx segment support" patch.
> > Add "fix ring wraparound compiler hint" patch.
> > Add "fix HW memory optimizations careless" patch.
> > Remove unnecessary IO memory barrier in completion function.
> > Remove inline declaration of completion function.
> > Fix unnecessary empty lines.
> > Fix indentations.
> > Move max=0 check removal to patch "remove duplicate handling in Tx burst" and rename the patch.
> > Add performance improvement in patch "mitigate Tx path memory barriers".
> > 
> > v5:
> > Fix compilation issue in debug mode for patch "net/mlx4: associate MR to MP in a short function".
> > Add missing includes in "net/mlx4: associate MR to MP in a short function".
> > Add cq_uar as volatile too in "net/mlx4: fix HW memory optimizations careless".
> > Fix indentation issues in "net/mlx4: separate Tx segment cases".
> > 
> > 
> > Matan Azrad (7):
> >   net/mlx4: remove error flows from Tx fast path
> >   net/mlx4: fix ring wraparound compiler hint
> >   net/mlx4: merge Tx path functions
> >   net/mlx4: remove duplicate handling in Tx burst
> >   net/mlx4: separate Tx segment cases
> >   net/mlx4: fix HW memory optimizations careless
> >   net/mlx4: mitigate Tx path memory barriers
> > 
> > Ophir Munk (1):
> >   net/mlx4: associate MR to MP in a short function
> 
> Patches have been sent and acked, so I already have my answer, but just to
> double check: these are not just the fixes that rc3 normally targets; there
> are data path updates and performance improvements.
> 
> Can you please confirm you want these patches in at the rc3 phase?
> I will wait for your explicit approval before getting them.

Yes, since they address a bunch of remaining performance issues, we'd like
them included in v17.11 (otherwise we'll have to backport them for v17.11.1
anyway).

Thanks Ferruh.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v5 0/8] net/mlx4: Tx path improvements
  2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
                         ` (9 preceding siblings ...)
  2017-11-02 20:41       ` Ferruh Yigit
@ 2017-11-03 19:25       ` Ferruh Yigit
  10 siblings, 0 replies; 84+ messages in thread
From: Ferruh Yigit @ 2017-11-03 19:25 UTC (permalink / raw)
  To: Matan Azrad, Adrien Mazarguil; +Cc: dev, Ophir Munk

On 11/2/2017 9:42 AM, Matan Azrad wrote:
> v1:
> This series is a follow-up of the new datapath implementation introduced in RC1.
> It is dedicated to Tx and improves the performance.
> 
> v2:
> Change uint32_t be_lkey to rte_be32_t lkey
> 
> v3:
> Rebase to 17.11-rc2.
> Remove "net/mlx4: save lkey in big-endian format" patch which degrades performance.
> Merge separate segment cases patches "net/mlx4: improve performance of one Tx segment" and "net/mlx4: separate Tx for multi-segments".
> "net/mlx4: inline more Tx functions" renamed to "net/mlx4: associate MR to MP in a short function".
> Leave bytes counter as old version.
> Add memory barrier improvement patch.
> Remove empty segment support.
> 
> v4:
> Remove "net/mlx4: remove empty Tx segment support" patch.
> Add "fix ring wraparound compiler hint" patch.
> Add "fix HW memory optimizations careless" patch.
> Remove unnecessary IO memory barrier in completion function.
> Remove inline declaration of completion function.
> Fix unnecessary empty lines.
> Fix indentations.
> Move max=0 check removal to patch "remove duplicate handling in Tx burst" and rename the patch.
> Add performance improvement in patch "mitigate Tx path memory barriers".
> 
> v5:
> Fix compilation issue in debug mode for patch "net/mlx4: associate MR to MP in a short function".
> Add missing includes in "net/mlx4: associate MR to MP in a short function".
> Add cq_uar as volatile too in "net/mlx4: fix HW memory optimizations careless".
> Fix indentation issues in "net/mlx4: separate Tx segment cases".
> 
> 
> Matan Azrad (7):
>   net/mlx4: remove error flows from Tx fast path
>   net/mlx4: fix ring wraparound compiler hint
>   net/mlx4: merge Tx path functions
>   net/mlx4: remove duplicate handling in Tx burst
>   net/mlx4: separate Tx segment cases
>   net/mlx4: fix HW memory optimizations careless
>   net/mlx4: mitigate Tx path memory barriers
> 
> Ophir Munk (1):
>   net/mlx4: associate MR to MP in a short function

Series applied to dpdk-next-net/master, thanks.

^ permalink raw reply	[flat|nested] 84+ messages in thread

end of thread, other threads:[~2017-11-03 19:25 UTC | newest]

Thread overview: 84+ messages
     [not found] <1508752838-30408-1-git-send-email-ophirmu@mellanox.com>
2017-10-23 14:21 ` [PATCH v2 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
2017-10-23 14:21   ` [PATCH v2 1/7] net/mlx4: remove error flows from Tx fast path Ophir Munk
2017-10-25 16:49     ` Adrien Mazarguil
2017-10-23 14:21   ` [PATCH v2 2/7] net/mlx4: inline more Tx functions Ophir Munk
2017-10-25 16:49     ` Adrien Mazarguil
2017-10-25 21:42       ` Ophir Munk
2017-10-26  7:48         ` Adrien Mazarguil
2017-10-26 14:27           ` Ophir Munk
2017-10-29 19:30             ` Ophir Munk
2017-10-23 14:21   ` [PATCH v2 3/7] net/mlx4: save lkey in big-endian format Ophir Munk
2017-10-23 15:24     ` Nélio Laranjeiro
2017-10-23 14:21   ` [PATCH v2 4/7] net/mlx4: merge Tx path functions Ophir Munk
2017-10-24 13:51     ` Nélio Laranjeiro
2017-10-24 20:36       ` Ophir Munk
2017-10-25  7:50         ` Nélio Laranjeiro
2017-10-26 10:31           ` Matan Azrad
2017-10-26 12:12             ` Nélio Laranjeiro
2017-10-26 12:30               ` Matan Azrad
2017-10-26 13:44                 ` Nélio Laranjeiro
2017-10-26 16:21                   ` Matan Azrad
2017-10-23 14:21   ` [PATCH v2 5/7] net/mlx4: remove unnecessary variables in Tx burst Ophir Munk
2017-10-25 16:49     ` Adrien Mazarguil
2017-10-23 14:21   ` [PATCH v2 6/7] net/mlx4: improve performance of one Tx segment Ophir Munk
2017-10-25 16:50     ` Adrien Mazarguil
2017-10-23 14:22   ` [PATCH v2 7/7] net/mlx4: separate Tx for multi-segments Ophir Munk
2017-10-25 16:50     ` Adrien Mazarguil
2017-10-30  8:15       ` Ophir Munk
2017-10-30 10:07   ` [PATCH v3 0/7] Tx path improvements Matan Azrad
2017-10-30 10:07     ` [PATCH v3 1/7] net/mlx4: remove error flows from Tx fast path Matan Azrad
2017-10-30 14:23       ` Adrien Mazarguil
2017-10-30 18:11         ` Matan Azrad
2017-10-31 10:16           ` Adrien Mazarguil
2017-10-30 10:07     ` [PATCH v3 2/7] net/mlx4: associate MR to MP in a short function Matan Azrad
2017-10-30 14:23       ` Adrien Mazarguil
2017-10-31 13:25         ` Ophir Munk
2017-10-30 10:07     ` [PATCH v3 3/7] net/mlx4: merge Tx path functions Matan Azrad
2017-10-30 14:23       ` Adrien Mazarguil
2017-10-30 18:12         ` Matan Azrad
2017-10-30 10:07     ` [PATCH v3 4/7] net/mlx4: remove completion counter in Tx burst Matan Azrad
2017-10-30 14:23       ` Adrien Mazarguil
2017-10-30 10:07     ` [PATCH v3 5/7] net/mlx4: separate Tx segment cases Matan Azrad
2017-10-30 14:23       ` Adrien Mazarguil
2017-10-30 18:23         ` Matan Azrad
2017-10-31 10:17           ` Adrien Mazarguil
2017-10-30 10:07     ` [PATCH v3 6/7] net/mlx4: mitigate Tx path memory barriers Matan Azrad
2017-10-30 14:23       ` Adrien Mazarguil
2017-10-30 19:47         ` Matan Azrad
2017-10-31 10:17           ` Adrien Mazarguil
2017-10-31 11:35             ` Matan Azrad
2017-10-31 13:21               ` Adrien Mazarguil
2017-10-30 10:07     ` [PATCH v3 7/7] net/mlx4: remove empty Tx segment support Matan Azrad
2017-10-30 14:24       ` Adrien Mazarguil
2017-10-31 18:21     ` [PATCH v4 0/8] net/mlx4: Tx path improvements Matan Azrad
2017-10-31 18:21       ` [PATCH v4 1/8] net/mlx4: remove error flows from Tx fast path Matan Azrad
2017-10-31 18:21       ` [PATCH v4 2/8] net/mlx4: associate MR to MP in a short function Matan Azrad
2017-11-02 13:42         ` Adrien Mazarguil
2017-10-31 18:21       ` [PATCH v4 3/8] net/mlx4: fix ring wraparound compiler hint Matan Azrad
2017-11-02 13:42         ` Adrien Mazarguil
2017-10-31 18:21       ` [PATCH v4 4/8] net/mlx4: merge Tx path functions Matan Azrad
2017-11-02 13:42         ` Adrien Mazarguil
2017-10-31 18:21       ` [PATCH v4 5/8] net/mlx4: remove duplicate handling in Tx burst Matan Azrad
2017-11-02 13:42         ` Adrien Mazarguil
2017-10-31 18:21       ` [PATCH v4 6/8] net/mlx4: separate Tx segment cases Matan Azrad
2017-11-02 13:43         ` Adrien Mazarguil
2017-10-31 18:21       ` [PATCH v4 7/8] net/mlx4: fix HW memory optimizations careless Matan Azrad
2017-11-02 13:43         ` Adrien Mazarguil
2017-10-31 18:21       ` [PATCH v4 8/8] net/mlx4: mitigate Tx path memory barriers Matan Azrad
2017-11-02 13:43         ` Adrien Mazarguil
2017-11-02 13:41       ` [PATCH] net/mlx4: fix missing include Adrien Mazarguil
2017-11-02 20:35         ` Ferruh Yigit
2017-11-02 16:42     ` [PATCH v5 0/8] net/mlx4: Tx path improvements Matan Azrad
2017-11-02 16:42       ` [PATCH v5 1/8] net/mlx4: remove error flows from Tx fast path Matan Azrad
2017-11-02 16:42       ` [PATCH v5 2/8] net/mlx4: associate MR to MP in a short function Matan Azrad
2017-11-02 16:42       ` [PATCH v5 3/8] net/mlx4: fix ring wraparound compiler hint Matan Azrad
2017-11-02 16:42       ` [PATCH v5 4/8] net/mlx4: merge Tx path functions Matan Azrad
2017-11-02 16:42       ` [PATCH v5 5/8] net/mlx4: remove duplicate handling in Tx burst Matan Azrad
2017-11-02 16:42       ` [PATCH v5 6/8] net/mlx4: separate Tx segment cases Matan Azrad
2017-11-02 16:42       ` [PATCH v5 7/8] net/mlx4: fix HW memory optimizations careless Matan Azrad
2017-11-02 16:42       ` [PATCH v5 8/8] net/mlx4: mitigate Tx path memory barriers Matan Azrad
2017-11-02 17:07       ` [PATCH v5 0/8] net/mlx4: Tx path improvements Adrien Mazarguil
2017-11-02 20:35         ` Ferruh Yigit
2017-11-02 20:41       ` Ferruh Yigit
2017-11-03  9:48         ` Adrien Mazarguil
2017-11-03 19:25       ` Ferruh Yigit
