* [PATCH v1 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1
@ 2017-10-23 10:04 Ophir Munk
  2017-10-23 10:04 ` [PATCH v1 1/7] net/mlx4: remove error flows from Tx fast path Ophir Munk
                   ` (6 more replies)
  0 siblings, 7 replies; 8+ messages in thread
From: Ophir Munk @ 2017-10-23 10:04 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad, Ophir Munk

v1:
This series is a follow-up to the new datapath implementation introduced in RC1.
It is dedicated to the Tx path and improves its performance.

Matan Azrad (4):
  net/mlx4: remove error flows from Tx fast path
  net/mlx4: merge Tx path functions
  net/mlx4: remove unnecessary variables in Tx burst
  net/mlx4: improve performance of one Tx segment

Ophir Munk (3):
  net/mlx4: inline more Tx functions
  net/mlx4: save lkey in big-endian format
  net/mlx4: separate Tx for multi-segments

 drivers/net/mlx4/mlx4_rxtx.c | 345 +++++++++++++++++++++++--------------------
 drivers/net/mlx4/mlx4_rxtx.h |  56 ++++++-
 2 files changed, 235 insertions(+), 166 deletions(-)

-- 
2.7.4

* [PATCH v1 1/7] net/mlx4: remove error flows from Tx fast path
  2017-10-23 10:04 [PATCH v1 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
@ 2017-10-23 10:04 ` Ophir Munk
  2017-10-23 10:04 ` [PATCH v1 2/7] net/mlx4: inline more Tx functions Ophir Munk
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Ophir Munk @ 2017-10-23 10:04 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

From: Matan Azrad <matan@mellanox.com>

Move unnecessary error flows into debug-only code (compiled out when
NDEBUG is defined) to improve performance.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)
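
For illustration only (not part of the applied patch), a reduced sketch of the
pattern used below: checks that cannot trigger with well-formed traffic are kept
for debug builds only, so release builds drop both the branch and the logging
from the fast path, and the remaining error paths return a negative errno
directly instead of going through a common "err:" label. The fragment reuses
the driver's DEBUG() macro and is purely illustrative.

	#ifndef NDEBUG
		/* Debug-only validation, compiled out when NDEBUG is set. */
		if (unlikely(lkey == (uint32_t)-1)) {
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			return -EFAULT;
		}
	#endif /* NDEBUG */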

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 36173ad..011ea79 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -169,6 +169,7 @@ mlx4_txq_complete(struct txq *txq)
 		 * Make sure we read the CQE after we read the ownership bit.
 		 */
 		rte_rmb();
+#ifndef NDEBUG
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 			     MLX4_CQE_OPCODE_ERROR)) {
 			struct mlx4_err_cqe *cqe_err =
@@ -178,6 +179,7 @@ mlx4_txq_complete(struct txq *txq)
 			      (void *)txq, cqe_err->vendor_err,
 			      cqe_err->syndrome);
 		}
+#endif /* NDEBUG */
 		/* Get WQE index reported in the CQE. */
 		new_index =
 			rte_be_to_cpu_16(cqe->wqe_index) & sq->txbb_cnt_mask;
@@ -322,7 +324,6 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 	uint32_t byte_count;
 	int wqe_real_size;
 	int nr_txbbs;
-	int rc;
 	struct pv *pv = (struct pv *)txq->bounce_buf;
 	int pv_counter = 0;
 
@@ -337,8 +338,7 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 	if (((sq->head - sq->tail) + nr_txbbs +
 	     sq->headroom_txbbs) >= sq->txbb_cnt ||
 	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-		rc = ENOSPC;
-		goto err;
+		return -ENOSPC;
 	}
 	/* Get the control and data entries of the WQE. */
 	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
@@ -354,6 +354,7 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 		dseg->addr = rte_cpu_to_be_64(addr);
 		/* Memory region key for this memory pool. */
 		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+#ifndef NDEBUG
 		if (unlikely(lkey == (uint32_t)-1)) {
 			/* MR does not exist. */
 			DEBUG("%p: unable to get MP <-> MR association",
@@ -366,9 +367,9 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
 			mlx4_txq_stamp_freed_wqe(sq, head_idx,
 				     (sq->head & sq->txbb_cnt) ? 0 : 1);
-			rc = EFAULT;
-			goto err;
+			return -EFAULT;
 		}
+#endif /* NDEBUG */
 		dseg->lkey = rte_cpu_to_be_32(lkey);
 		if (likely(buf->data_len)) {
 			byte_count = rte_cpu_to_be_32(buf->data_len);
@@ -471,9 +472,6 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 					       MLX4_BIT_WQE_OWN : 0));
 	sq->head += nr_txbbs;
 	return 0;
-err:
-	rte_errno = rc;
-	return -rc;
 }
 
 /**
@@ -510,8 +508,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	assert(max <= elts_n);
 	/* Always leave one free entry in the ring. */
 	--max;
-	if (max == 0)
-		return 0;
 	if (max > pkts_n)
 		max = pkts_n;
 	for (i = 0; (i != max); ++i) {
-- 
2.7.4

* [PATCH v1 2/7] net/mlx4: inline more Tx functions
  2017-10-23 10:04 [PATCH v1 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
  2017-10-23 10:04 ` [PATCH v1 1/7] net/mlx4: remove error flows from Tx fast path Ophir Munk
@ 2017-10-23 10:04 ` Ophir Munk
  2017-10-23 10:04 ` [PATCH v1 3/7] net/mlx4: save lkey in big-endian format Ophir Munk
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Ophir Munk @ 2017-10-23 10:04 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad, Ophir Munk

Inline functions on the Tx fast path to improve performance.

Inside the inlined functions, call other functions to handle the
"unlikely" cases so that the inlined code footprint remains small.

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 43 ++++++------------------------------
 drivers/net/mlx4/mlx4_rxtx.h | 52 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 58 insertions(+), 37 deletions(-)
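
For illustration only (not part of the applied patch), a reduced sketch of the
resulting split: the per-packet lookup becomes a static inline in mlx4_rxtx.h,
and only the rare "memory pool not yet registered" case leaves the inlined code
through a regular call to mlx4_txq_add_mr(), keeping the inlined footprint small
(assertions omitted here).

	static __rte_always_inline uint32_t
	mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
	{
		unsigned int i;

		/* Hot path: scan the small per-queue MP -> MR cache. */
		for (i = 0; i != RTE_DIM(txq->mp2mr); ++i) {
			if (unlikely(txq->mp2mr[i].mp == NULL))
				break; /* first free slot, MP unknown */
			if (txq->mp2mr[i].mp == mp)
				return txq->mp2mr[i].lkey;
		}
		/* Cold path: register a new MR out of line. */
		return mlx4_txq_add_mr(txq, mp, i);
	}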

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 011ea79..ae37f9b 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
 	return 0;
 }
 
-/**
- * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
- * the cloned mbuf is allocated is returned instead.
- *
- * @param buf
- *   Pointer to mbuf.
- *
- * @return
- *   Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-mlx4_txq_mb2mp(struct rte_mbuf *buf)
-{
-	if (unlikely(RTE_MBUF_INDIRECT(buf)))
-		return rte_mbuf_from_indirect(buf)->pool;
-	return buf->pool;
-}
 
 /**
- * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
+ * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
+ * If mp2mr[] is full, remove an entry first.
  *
  * @param txq
  *   Pointer to Tx queue structure.
  * @param[in] mp
- *   Memory pool for which a memory region lkey must be returned.
+ *   Memory pool for which a memory region lkey must be added
+ * @param[in] i
+ *   Index in memory pool (MP) where to add memory region (MR)
  *
  * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
+ *   Added mr->lkey on success, (uint32_t)-1 on failure.
  */
-uint32_t
-mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 {
-	unsigned int i;
 	struct ibv_mr *mr;
 
-	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mp == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-			return txq->mp2mr[i].lkey;
-		}
-	}
 	/* Add a new entry, register MR first. */
 	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
 	      (void *)txq, mp->name, (void *)mp);
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index e10bbca..719ef45 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -53,6 +53,7 @@
 
 #include "mlx4.h"
 #include "mlx4_prm.h"
+#include "mlx4_utils.h"
 
 /** Rx queue counters. */
 struct mlx4_rxq_stats {
@@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void *dpdk_rxq);
 
 /* mlx4_rxtx.c */
 
-uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
 uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
 		       uint16_t pkts_n);
 uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
@@ -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
 uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
+				unsigned int i);
 
 /* mlx4_txq.c */
 
@@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 			const struct rte_eth_txconf *conf);
 void mlx4_tx_queue_release(void *dpdk_txq);
 
+/**
+ * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
+ * the cloned mbuf is allocated is returned instead.
+ *
+ * @param buf
+ *   Pointer to mbuf.
+ *
+ * @return
+ *   Memory pool where data is located for given mbuf.
+ */
+static __rte_always_inline struct rte_mempool *
+mlx4_txq_mb2mp(struct rte_mbuf *buf)
+{
+	if (unlikely(RTE_MBUF_INDIRECT(buf)))
+		return rte_mbuf_from_indirect(buf)->pool;
+	return buf->pool;
+}
+
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Call mlx4_txq_add_mr() if MP is not registered yet.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ *   mr->lkey on success, (uint32_t)-1 on failure.
+ */
+static __rte_always_inline uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+	unsigned int i;
+
+	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+		if (unlikely(txq->mp2mr[i].mp == NULL)) {
+			/* Unknown MP, add a new MR for it. */
+			break;
+		}
+		if (txq->mp2mr[i].mp == mp) {
+			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
+			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+			return txq->mp2mr[i].lkey;
+		}
+	}
+	return mlx4_txq_add_mr(txq, mp, i);
+}
 #endif /* MLX4_RXTX_H_ */
-- 
2.7.4

* [PATCH v1 3/7] net/mlx4: save lkey in big-endian format
  2017-10-23 10:04 [PATCH v1 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
  2017-10-23 10:04 ` [PATCH v1 1/7] net/mlx4: remove error flows from Tx fast path Ophir Munk
  2017-10-23 10:04 ` [PATCH v1 2/7] net/mlx4: inline more Tx functions Ophir Munk
@ 2017-10-23 10:04 ` Ophir Munk
  2017-10-23 10:04 ` [PATCH v1 4/7] net/mlx4: merge Tx path functions Ophir Munk
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Ophir Munk @ 2017-10-23 10:04 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad, Ophir Munk

The mlx4 NIC uses the lkey in big-endian format. Save the lkey in this
format in order to avoid conversions on the Tx fast path.

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 14 ++++++--------
 drivers/net/mlx4/mlx4_rxtx.h | 10 ++++++----
 2 files changed, 12 insertions(+), 12 deletions(-)
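
For illustration only (not part of the applied patch), a reduced sketch of the
idea: the byte swap moves from the per-segment fast path to the one-time MR
registration, so the Tx path can store the cached value into the WQE data
segment as-is.

	/* Registration time (cold): convert once and cache the BE value. */
	txq->mp2mr[i].be_lkey = rte_cpu_to_be_32(mr->lkey);

	/* Tx fast path (hot): the returned lkey is already big endian. */
	dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));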

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index ae37f9b..4e06cca 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -260,10 +260,10 @@ uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 	/* Store the new entry. */
 	txq->mp2mr[i].mp = mp;
 	txq->mp2mr[i].mr = mr;
-	txq->mp2mr[i].lkey = mr->lkey;
+	txq->mp2mr[i].be_lkey = rte_cpu_to_be_32(mr->lkey);
 	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
-	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
-	return txq->mp2mr[i].lkey;
+	      (void *)txq, mp->name, (void *)mp, mr->lkey);
+	return txq->mp2mr[i].be_lkey;
 }
 
 /**
@@ -289,7 +289,6 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 		uint16_t flags16[2];
 	} srcrb;
 	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
-	uint32_t lkey;
 	uintptr_t addr;
 	uint32_t owner_opcode = MLX4_OPCODE_SEND;
 	uint32_t byte_count;
@@ -323,10 +322,10 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
 			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
 		dseg->addr = rte_cpu_to_be_64(addr);
-		/* Memory region key for this memory pool. */
-		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+		/* Memory region key (big endian) for this memory pool. */
+		dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
 #ifndef NDEBUG
-		if (unlikely(lkey == (uint32_t)-1)) {
+		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
 			/* MR does not exist. */
 			DEBUG("%p: unable to get MP <-> MR association",
 			      (void *)txq);
@@ -341,7 +340,6 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 			return -EFAULT;
 		}
 #endif /* NDEBUG */
-		dseg->lkey = rte_cpu_to_be_32(lkey);
 		if (likely(buf->data_len)) {
 			byte_count = rte_cpu_to_be_32(buf->data_len);
 		} else {
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 719ef45..8d39afb 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -135,7 +135,7 @@ struct txq {
 	struct {
 		const struct rte_mempool *mp; /**< Cached memory pool. */
 		struct ibv_mr *mr; /**< Memory region (for mp). */
-		uint32_t lkey; /**< mr->lkey copy. */
+		uint32_t be_lkey; /**< mr->lkey copy. (big endian) */
 	} mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */
 	struct priv *priv; /**< Back pointer to private data. */
 	unsigned int socket; /**< CPU socket ID for allocations. */
@@ -220,9 +220,11 @@ mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
 			break;
 		}
 		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-			return txq->mp2mr[i].lkey;
+			assert(txq->mp2mr[i].be_lkey !=
+				rte_cpu_to_be_32((uint32_t)-1));
+			assert(txq->mp2mr[i].mr->lkey ==
+				rte_be_to_cpu_32(txq->mp2mr[i].be_lkey));
+			return txq->mp2mr[i].be_lkey;
 		}
 	}
 	return mlx4_txq_add_mr(txq, mp, i);
-- 
2.7.4

* [PATCH v1 4/7] net/mlx4: merge Tx path functions
  2017-10-23 10:04 [PATCH v1 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
                   ` (2 preceding siblings ...)
  2017-10-23 10:04 ` [PATCH v1 3/7] net/mlx4: save lkey in big-endian format Ophir Munk
@ 2017-10-23 10:04 ` Ophir Munk
  2017-10-23 10:04 ` [PATCH v1 5/7] net/mlx4: remove unnecessary variables in Tx burst Ophir Munk
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Ophir Munk @ 2017-10-23 10:04 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

From: Matan Azrad <matan@mellanox.com>

Merge the tx_burst and mlx4_post_send functions to avoid checking the
remaining WQ space twice per packet.

This should improve performance.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 353 +++++++++++++++++++++----------------------
 1 file changed, 170 insertions(+), 183 deletions(-)
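
For illustration only (not part of the applied patch), an outline of the merge:
the body of mlx4_post_send() is folded into the burst loop, so the send queue
room check and the WQE construction happen once per packet without crossing a
function-call boundary. The fragment below is reduced from the diff; elided
parts are marked with comments.

	for (i = 0; i != max; ++i) {
		struct rte_mbuf *buf = *(pkts++);
		/* ... free the mbuf previously occupying this ring slot ... */
		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
		/* Single room/size check for this packet. */
		if (((sq->head - sq->tail) + nr_txbbs +
		     sq->headroom_txbbs) >= sq->txbb_cnt ||
		    nr_txbbs > MLX4_MAX_WQE_TXBBS)
			break;
		/* ... fill data segments and the control segment in place ... */
		sq->head += nr_txbbs;
	}
	/* One doorbell write after the loop, as before. */
	rte_write32(txq->msq.doorbell_qpn, txq->msq.db);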

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 4e06cca..ff15827 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -267,183 +267,6 @@ uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 }
 
 /**
- * Posts a single work request to a send queue.
- *
- * @param txq
- *   Target Tx queue.
- * @param pkt
- *   Packet to transmit.
- *
- * @return
- *   0 on success, negative errno value otherwise and rte_errno is set.
- */
-static inline int
-mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
-{
-	struct mlx4_wqe_ctrl_seg *ctrl;
-	struct mlx4_wqe_data_seg *dseg;
-	struct mlx4_sq *sq = &txq->msq;
-	struct rte_mbuf *buf;
-	union {
-		uint32_t flags;
-		uint16_t flags16[2];
-	} srcrb;
-	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
-	uintptr_t addr;
-	uint32_t owner_opcode = MLX4_OPCODE_SEND;
-	uint32_t byte_count;
-	int wqe_real_size;
-	int nr_txbbs;
-	struct pv *pv = (struct pv *)txq->bounce_buf;
-	int pv_counter = 0;
-
-	/* Calculate the needed work queue entry size for this packet. */
-	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-			pkt->nb_segs * sizeof(struct mlx4_wqe_data_seg);
-	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-	/*
-	 * Check that there is room for this WQE in the send queue and that
-	 * the WQE size is legal.
-	 */
-	if (((sq->head - sq->tail) + nr_txbbs +
-	     sq->headroom_txbbs) >= sq->txbb_cnt ||
-	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-		return -ENOSPC;
-	}
-	/* Get the control and data entries of the WQE. */
-	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
-	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-					    sizeof(struct mlx4_wqe_ctrl_seg));
-	/* Fill the data segments with buffer information. */
-	for (buf = pkt; buf != NULL; buf = buf->next, dseg++) {
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		rte_prefetch0((volatile void *)addr);
-		/* Handle WQE wraparound. */
-		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
-			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
-		dseg->addr = rte_cpu_to_be_64(addr);
-		/* Memory region key (big endian) for this memory pool. */
-		dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
-#ifndef NDEBUG
-		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
-			/* MR does not exist. */
-			DEBUG("%p: unable to get MP <-> MR association",
-			      (void *)txq);
-			/*
-			 * Restamp entry in case of failure.
-			 * Make sure that size is written correctly
-			 * Note that we give ownership to the SW, not the HW.
-			 */
-			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-			mlx4_txq_stamp_freed_wqe(sq, head_idx,
-				     (sq->head & sq->txbb_cnt) ? 0 : 1);
-			return -EFAULT;
-		}
-#endif /* NDEBUG */
-		if (likely(buf->data_len)) {
-			byte_count = rte_cpu_to_be_32(buf->data_len);
-		} else {
-			/*
-			 * Zero length segment is treated as inline segment
-			 * with zero data.
-			 */
-			byte_count = RTE_BE32(0x80000000);
-		}
-		/*
-		 * If the data segment is not at the beginning of a
-		 * Tx basic block (TXBB) then write the byte count,
-		 * else postpone the writing to just before updating the
-		 * control segment.
-		 */
-		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-			/*
-			 * Need a barrier here before writing the byte_count
-			 * fields to make sure that all the data is visible
-			 * before the byte_count field is set.
-			 * Otherwise, if the segment begins a new cacheline,
-			 * the HCA prefetcher could grab the 64-byte chunk and
-			 * get a valid (!= 0xffffffff) byte count but stale
-			 * data, and end up sending the wrong data.
-			 */
-			rte_io_wmb();
-			dseg->byte_count = byte_count;
-		} else {
-			/*
-			 * This data segment starts at the beginning of a new
-			 * TXBB, so we need to postpone its byte_count writing
-			 * for later.
-			 */
-			pv[pv_counter].dseg = dseg;
-			pv[pv_counter++].val = byte_count;
-		}
-	}
-	/* Write the first DWORD of each TXBB save earlier. */
-	if (pv_counter) {
-		/* Need a barrier here before writing the byte_count. */
-		rte_io_wmb();
-		for (--pv_counter; pv_counter  >= 0; pv_counter--)
-			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
-	}
-	/* Fill the control parameters for this packet. */
-	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-	/*
-	 * For raw Ethernet, the SOLICIT flag is used to indicate that no ICRC
-	 * should be calculated.
-	 */
-	txq->elts_comp_cd -= nr_txbbs;
-	if (unlikely(txq->elts_comp_cd <= 0)) {
-		txq->elts_comp_cd = txq->elts_comp_cd_init;
-		srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
-				       MLX4_WQE_CTRL_CQ_UPDATE);
-	} else {
-		srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
-	}
-	/* Enable HW checksum offload if requested */
-	if (txq->csum &&
-	    (pkt->ol_flags &
-	     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
-		const uint64_t is_tunneled = (pkt->ol_flags &
-					      (PKT_TX_TUNNEL_GRE |
-					       PKT_TX_TUNNEL_VXLAN));
-
-		if (is_tunneled && txq->csum_l2tun) {
-			owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
-					MLX4_WQE_CTRL_IL4_HDR_CSUM;
-			if (pkt->ol_flags & PKT_TX_OUTER_IP_CKSUM)
-				srcrb.flags |=
-					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
-		} else {
-			srcrb.flags |= RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
-						MLX4_WQE_CTRL_TCP_UDP_CSUM);
-		}
-	}
-	if (txq->lb) {
-		/*
-		 * Copy destination MAC address to the WQE, this allows
-		 * loopback in eSwitch, so that VFs and PF can communicate
-		 * with each other.
-		 */
-		srcrb.flags16[0] = *(rte_pktmbuf_mtod(pkt, uint16_t *));
-		ctrl->imm = *(rte_pktmbuf_mtod_offset(pkt, uint32_t *,
-						      sizeof(uint16_t)));
-	} else {
-		ctrl->imm = 0;
-	}
-	ctrl->srcrb_flags = srcrb.flags;
-	/*
-	 * Make sure descriptor is fully written before
-	 * setting ownership bit (because HW can start
-	 * executing as soon as we do).
-	 */
-	rte_wmb();
-	ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
-					      ((sq->head & sq->txbb_cnt) ?
-					       MLX4_BIT_WQE_OWN : 0));
-	sq->head += nr_txbbs;
-	return 0;
-}
-
-/**
  * DPDK callback for Tx.
  *
  * @param dpdk_txq
@@ -466,7 +289,8 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
-	int err;
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
 
 	assert(txq->elts_comp_cd != 0);
 	mlx4_txq_complete(txq);
@@ -485,6 +309,20 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
+		uint32_t owner_opcode = MLX4_OPCODE_SEND;
+		struct mlx4_wqe_ctrl_seg *ctrl;
+		struct mlx4_wqe_data_seg *dseg;
+		struct rte_mbuf *sbuf;
+		union {
+			uint32_t flags;
+			uint16_t flags16[2];
+		} srcrb;
+		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+		uintptr_t addr;
+		uint32_t byte_count;
+		int wqe_real_size;
+		int nr_txbbs;
+		int pv_counter = 0;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -503,18 +341,167 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		/* Post the packet for sending. */
-		err = mlx4_post_send(txq, buf);
-		if (unlikely(err)) {
+
+		/*
+		 * Calculate the needed work queue entry size
+		 * for this packet.
+		 */
+		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+		/*
+		 * Check that there is room for this WQE in the send
+		 * queue and that the WQE size is legal.
+		 */
+		if (((sq->head - sq->tail) + nr_txbbs +
+		     sq->headroom_txbbs) >= sq->txbb_cnt ||
+		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
 			elt->buf = NULL;
-			goto stop;
+			break;
+		}
+		/* Get the control and data entries of the WQE. */
+		ctrl = (struct mlx4_wqe_ctrl_seg *)
+				mlx4_get_send_wqe(sq, head_idx);
+		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+				sizeof(struct mlx4_wqe_ctrl_seg));
+		/* Fill the data segments with buffer information. */
+		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			rte_prefetch0((volatile void *)addr);
+			/* Handle WQE wraparound. */
+			if (unlikely(dseg >=
+			    (struct mlx4_wqe_data_seg *)sq->eob))
+				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+			dseg->addr = rte_cpu_to_be_64(addr);
+			/* Memory region key (big endian). */
+			dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+	#ifndef NDEBUG
+			if (unlikely(dseg->lkey ==
+				rte_cpu_to_be_32((uint32_t)-1))) {
+				/* MR does not exist. */
+				DEBUG("%p: unable to get MP <-> MR association",
+				      (void *)txq);
+				/*
+				 * Restamp entry in case of failure.
+				 * Make sure that size is written correctly
+				 * Note that we give ownership to the SW,
+				 * not the HW.
+				 */
+				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+				mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					     (sq->head & sq->txbb_cnt) ? 0 : 1);
+				elt->buf = NULL;
+				break;
+			}
+	#endif /* NDEBUG */
+			if (likely(sbuf->data_len)) {
+				byte_count = rte_cpu_to_be_32(sbuf->data_len);
+			} else {
+				/*
+				 * Zero length segment is treated as inline
+				 * segment with zero data.
+				 */
+				byte_count = RTE_BE32(0x80000000);
+			}
+			/*
+			 * If the data segment is not at the beginning
+			 * of a Tx basic block (TXBB) then write the
+			 * byte count, else postpone the writing to
+			 * just before updating the control segment.
+			 */
+			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+				/*
+				 * Need a barrier here before writing the
+				 * byte_count fields to make sure that all the
+				 * data is visible before the byte_count field
+				 * is set. otherwise, if the segment begins a
+				 * new cacheline, the HCA prefetcher could grab
+				 * the 64-byte chunk and get a valid
+				 * (!= 0xffffffff) byte count but stale data,
+				 * and end up sending the wrong data.
+				 */
+				rte_io_wmb();
+				dseg->byte_count = byte_count;
+			} else {
+				/*
+				 * This data segment starts at the beginning of
+				 * a new TXBB, so we need to postpone its
+				 * byte_count writing for later.
+				 */
+				pv[pv_counter].dseg = dseg;
+				pv[pv_counter++].val = byte_count;
+			}
+		}
+		/* Write the first DWORD of each TXBB save earlier. */
+		if (pv_counter) {
+			/* Need a barrier before writing the byte_count. */
+			rte_io_wmb();
+			for (--pv_counter; pv_counter  >= 0; pv_counter--)
+				pv[pv_counter].dseg->byte_count =
+						pv[pv_counter].val;
+		}
+		/* Fill the control parameters for this packet. */
+		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+		/*
+		 * For raw Ethernet, the SOLICIT flag is used to indicate
+		 * that no ICRC should be calculated.
+		 */
+		txq->elts_comp_cd -= nr_txbbs;
+		if (unlikely(txq->elts_comp_cd <= 0)) {
+			txq->elts_comp_cd = txq->elts_comp_cd_init;
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
+					       MLX4_WQE_CTRL_CQ_UPDATE);
+		} else {
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
 		}
+		/* Enable HW checksum offload if requested */
+		if (txq->csum &&
+		    (buf->ol_flags &
+		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
+			const uint64_t is_tunneled = (buf->ol_flags &
+						      (PKT_TX_TUNNEL_GRE |
+						       PKT_TX_TUNNEL_VXLAN));
+
+			if (is_tunneled && txq->csum_l2tun) {
+				owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
+						MLX4_WQE_CTRL_IL4_HDR_CSUM;
+				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+					srcrb.flags |=
+					    RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
+			} else {
+				srcrb.flags |=
+					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+						MLX4_WQE_CTRL_TCP_UDP_CSUM);
+			}
+		}
+		if (txq->lb) {
+			/*
+			 * Copy destination MAC address to the WQE, this allows
+			 * loopback in eSwitch, so that VFs and PF can
+			 * communicate with each other.
+			 */
+			srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
+			ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
+					      sizeof(uint16_t)));
+		} else {
+			ctrl->imm = 0;
+		}
+		ctrl->srcrb_flags = srcrb.flags;
+		/*
+		 * Make sure descriptor is fully written before
+		 * setting ownership bit (because HW can start
+		 * executing as soon as we do).
+		 */
+		rte_wmb();
+		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
+					      ((sq->head & sq->txbb_cnt) ?
+						       MLX4_BIT_WQE_OWN : 0));
+		sq->head += nr_txbbs;
 		elt->buf = buf;
 		bytes_sent += buf->pkt_len;
 		++elts_comp;
 		elts_head = elts_head_next;
 	}
-stop:
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
 		return 0;
-- 
2.7.4

* [PATCH v1 5/7] net/mlx4: remove unnecessary variables in Tx burst
  2017-10-23 10:04 [PATCH v1 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
                   ` (3 preceding siblings ...)
  2017-10-23 10:04 ` [PATCH v1 4/7] net/mlx4: merge Tx path functions Ophir Munk
@ 2017-10-23 10:04 ` Ophir Munk
  2017-10-23 10:04 ` [PATCH v1 6/7] net/mlx4: improve performance of one Tx segment Ophir Munk
  2017-10-23 10:04 ` [PATCH v1 7/7] net/mlx4: separate Tx for multi-segments Ophir Munk
  6 siblings, 0 replies; 8+ messages in thread
From: Ophir Munk @ 2017-10-23 10:04 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

From: Matan Azrad <matan@mellanox.com>

Remove variables that do not add new information, as a performance
improvement.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index ff15827..8adac0f 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -285,8 +285,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	struct txq *txq = (struct txq *)dpdk_txq;
 	unsigned int elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int elts_comp = 0;
-	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
 	struct mlx4_sq *sq = &txq->msq;
@@ -498,8 +496,7 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 						       MLX4_BIT_WQE_OWN : 0));
 		sq->head += nr_txbbs;
 		elt->buf = buf;
-		bytes_sent += buf->pkt_len;
-		++elts_comp;
+		txq->stats.obytes += buf->pkt_len;
 		elts_head = elts_head_next;
 	}
 	/* Take a shortcut if nothing must be sent. */
@@ -507,13 +504,12 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		return 0;
 	/* Increment send statistics counters. */
 	txq->stats.opackets += i;
-	txq->stats.obytes += bytes_sent;
 	/* Make sure that descriptors are written before doorbell record. */
 	rte_wmb();
 	/* Ring QP doorbell. */
 	rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
 	txq->elts_head = elts_head;
-	txq->elts_comp += elts_comp;
+	txq->elts_comp += i;
 	return i;
 }
 
-- 
2.7.4

* [PATCH v1 6/7] net/mlx4: improve performance of one Tx segment
  2017-10-23 10:04 [PATCH v1 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
                   ` (4 preceding siblings ...)
  2017-10-23 10:04 ` [PATCH v1 5/7] net/mlx4: remove unnecessary variables in Tx burst Ophir Munk
@ 2017-10-23 10:04 ` Ophir Munk
  2017-10-23 10:04 ` [PATCH v1 7/7] net/mlx4: separate Tx for multi-segments Ophir Munk
  6 siblings, 0 replies; 8+ messages in thread
From: Ophir Munk @ 2017-10-23 10:04 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad

From: Matan Azrad <matan@mellanox.com>

A single-segment packet does not need additional memory to save segment
byte_count values for writing them in a different order, so this
unnecessary memory usage can be avoided in that case. The per-segment
loop management is also avoided.

Both changes improve performance.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 125 +++++++++++++++++++++++++++++--------------
 1 file changed, 85 insertions(+), 40 deletions(-)
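
For illustration only (not part of the applied patch), a reduced sketch of the
single-segment fast path introduced below: with one segment there is nothing to
write out of order, so the pv[] bounce buffer and its deferred byte_count writes
are skipped entirely and one barrier plus one store suffice.

	if (likely(buf->nb_segs == 1)) {
		dseg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
		dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
		/* Make the data visible before exposing the byte count. */
		rte_io_wmb();
		dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
	} else {
		/* Multi-segment packets keep the postponed-write logic. */
	}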

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 8adac0f..321ffe4 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -310,7 +310,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		uint32_t owner_opcode = MLX4_OPCODE_SEND;
 		struct mlx4_wqe_ctrl_seg *ctrl;
 		struct mlx4_wqe_data_seg *dseg;
-		struct rte_mbuf *sbuf;
 		union {
 			uint32_t flags;
 			uint16_t flags16[2];
@@ -363,12 +362,12 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
 				sizeof(struct mlx4_wqe_ctrl_seg));
 		/* Fill the data segments with buffer information. */
-		for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
-			addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+		if (likely(buf->nb_segs == 1)) {
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			rte_prefetch0((volatile void *)addr);
 			/* Handle WQE wraparound. */
-			if (unlikely(dseg >=
-			    (struct mlx4_wqe_data_seg *)sq->eob))
+			if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)
+					sq->eob))
 				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
 			dseg->addr = rte_cpu_to_be_64(addr);
 			/* Memory region key (big endian). */
@@ -392,44 +391,90 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				break;
 			}
 	#endif /* NDEBUG */
-			if (likely(sbuf->data_len)) {
-				byte_count = rte_cpu_to_be_32(sbuf->data_len);
-			} else {
-				/*
-				 * Zero length segment is treated as inline
-				 * segment with zero data.
-				 */
-				byte_count = RTE_BE32(0x80000000);
-			}
-			/*
-			 * If the data segment is not at the beginning
-			 * of a Tx basic block (TXBB) then write the
-			 * byte count, else postpone the writing to
-			 * just before updating the control segment.
-			 */
-			if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-				/*
-				 * Need a barrier here before writing the
-				 * byte_count fields to make sure that all the
-				 * data is visible before the byte_count field
-				 * is set. otherwise, if the segment begins a
-				 * new cacheline, the HCA prefetcher could grab
-				 * the 64-byte chunk and get a valid
-				 * (!= 0xffffffff) byte count but stale data,
-				 * and end up sending the wrong data.
-				 */
-				rte_io_wmb();
-				dseg->byte_count = byte_count;
-			} else {
+			/* Need a barrier here before writing the byte_count. */
+			rte_io_wmb();
+			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
+		} else {
+			/* Fill the data segments with buffer information. */
+			struct rte_mbuf *sbuf;
+
+			for (sbuf = buf;
+				 sbuf != NULL;
+				 sbuf = sbuf->next, dseg++) {
+				addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+				rte_prefetch0((volatile void *)addr);
+				/* Handle WQE wraparound. */
+				if (unlikely(dseg >=
+					(struct mlx4_wqe_data_seg *)sq->eob))
+					dseg = (struct mlx4_wqe_data_seg *)
+							sq->buf;
+				dseg->addr = rte_cpu_to_be_64(addr);
+				/* Memory region key (big endian). */
+				dseg->lkey = mlx4_txq_mp2mr(txq,
+						mlx4_txq_mb2mp(sbuf));
+		#ifndef NDEBUG
+				if (unlikely(dseg->lkey ==
+					rte_cpu_to_be_32((uint32_t)-1))) {
+					/* MR does not exist. */
+					DEBUG("%p: unable to get MP <-> MR association",
+						  (void *)txq);
+					/*
+					 * Restamp entry in case of failure.
+					 * Make sure that size is written
+					 * correctly, note that we give
+					 * ownership to the SW, not the HW.
+					 */
+					ctrl->fence_size =
+						(wqe_real_size >> 4) & 0x3f;
+					mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					    (sq->head & sq->txbb_cnt) ? 0 : 1);
+					elt->buf = NULL;
+					break;
+				}
+		#endif /* NDEBUG */
+				if (likely(sbuf->data_len)) {
+					byte_count =
+					  rte_cpu_to_be_32(sbuf->data_len);
+				} else {
+					/*
+					 * Zero length segment is treated as
+					 * inline segment with zero data.
+					 */
+					byte_count = RTE_BE32(0x80000000);
+				}
 				/*
-				 * This data segment starts at the beginning of
-				 * a new TXBB, so we need to postpone its
-				 * byte_count writing for later.
+				 * If the data segment is not at the beginning
+				 * of a Tx basic block (TXBB) then write the
+				 * byte count, else postpone the writing to
+				 * just before updating the control segment.
 				 */
-				pv[pv_counter].dseg = dseg;
-				pv[pv_counter++].val = byte_count;
+				if ((uintptr_t)dseg &
+					(uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+					/*
+					 * Need a barrier here before writing
+					 * the byte_count fields to make sure
+					 * that all the data is visible before
+					 * the byte_count field is set.
+					 * Otherwise, if the segment begins a
+					 * new cacheline, the HCA prefetcher
+					 * could grab the 64-byte chunk and get
+					 * a valid (!= 0xffffffff) byte count
+					 * but stale data, and end up sending
+					 * the wrong data.
+					 */
+					rte_io_wmb();
+					dseg->byte_count = byte_count;
+				} else {
+					/*
+					 * This data segment starts at the
+					 * beginning of a new TXBB, so we
+					 * need to postpone its byte_count
+					 * writing for later.
+					 */
+					pv[pv_counter].dseg = dseg;
+					pv[pv_counter++].val = byte_count;
+				}
 			}
-		}
 		/* Write the first DWORD of each TXBB save earlier. */
 		if (pv_counter) {
 			/* Need a barrier before writing the byte_count. */
-- 
2.7.4

* [PATCH v1 7/7] net/mlx4: separate Tx for multi-segments
  2017-10-23 10:04 [PATCH v1 0/7] net/mlx4: follow-up on new TX datapath introduced in RC1 Ophir Munk
                   ` (5 preceding siblings ...)
  2017-10-23 10:04 ` [PATCH v1 6/7] net/mlx4: improve performance of one Tx segment Ophir Munk
@ 2017-10-23 10:04 ` Ophir Munk
  6 siblings, 0 replies; 8+ messages in thread
From: Ophir Munk @ 2017-10-23 10:04 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Thomas Monjalon, Olga Shern, Matan Azrad, Ophir Munk

This commit optimizes the handling of single-segment packets and calls a
dedicated function to handle multi-segment packets.

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 284 +++++++++++++++++++++++--------------------
 1 file changed, 154 insertions(+), 130 deletions(-)
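
For illustration only (not part of the applied patch), a reduced sketch of the
resulting burst-loop structure: the common single-segment case is built inline
with a WQE size known at compile time (WQE_ONE_DATA_SEG_SIZE), while
multi-segment packets go through the new handle_multi_segs() helper, which
returns the number of consumed TXBBs or a negative value on failure.

	if (likely(buf->nb_segs == 1)) {
		/* One control segment plus one data segment. */
		ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
		nr_txbbs = 1;
	} else {
		nr_txbbs = handle_multi_segs(buf, txq, &ctrl);
		if (nr_txbbs < 0) {
			elt->buf = NULL;
			break;
		}
	}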

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 321ffe4..b471064 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -62,6 +62,9 @@
 #include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
+#define WQE_ONE_DATA_SEG_SIZE \
+	(sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
+
 /**
  * Pointer-value pair structure used in tx_post_send for saving the first
  * DWORD (32 byte) of a TXBB.
@@ -140,22 +143,19 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner)
  * @return
  *   0 on success, -1 on failure.
  */
-static int
-mlx4_txq_complete(struct txq *txq)
+static inline int __attribute__((always_inline))
+mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
+				struct mlx4_sq *sq)
 {
 	unsigned int elts_comp = txq->elts_comp;
 	unsigned int elts_tail = txq->elts_tail;
-	const unsigned int elts_n = txq->elts_n;
 	struct mlx4_cq *cq = &txq->mcq;
-	struct mlx4_sq *sq = &txq->msq;
 	struct mlx4_cqe *cqe;
 	uint32_t cons_index = cq->cons_index;
 	uint16_t new_index;
 	uint16_t nr_txbbs = 0;
 	int pkts = 0;
 
-	if (unlikely(elts_comp == 0))
-		return 0;
 	/*
 	 * Traverse over all CQ entries reported and handle each WQ entry
 	 * reported by them.
@@ -266,6 +266,120 @@ uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 	return txq->mp2mr[i].be_lkey;
 }
 
+static int handle_multi_segs(struct rte_mbuf *buf,
+			    struct txq *txq,
+			    struct mlx4_wqe_ctrl_seg **pctrl)
+{
+	int wqe_real_size;
+	int nr_txbbs;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
+	struct mlx4_sq *sq = &txq->msq;
+	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_data_seg *dseg;
+	uintptr_t addr;
+	uint32_t byte_count;
+	int pv_counter = 0;
+
+	/* Calculate the needed work queue entry size for this packet. */
+	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+		buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+	/*
+	 * Check that there is room for this WQE in the send queue and that
+	 * the WQE size is legal.
+	 */
+	if (((sq->head - sq->tail) + nr_txbbs +
+				sq->headroom_txbbs) >= sq->txbb_cnt ||
+			nr_txbbs > MLX4_MAX_WQE_TXBBS) {
+		return -1;
+	}
+
+	/* Get the control and data entries of the WQE. */
+	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
+	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+			sizeof(struct mlx4_wqe_ctrl_seg));
+	*pctrl = ctrl;
+	/* Fill the data segments with buffer information. */
+	struct rte_mbuf *sbuf;
+
+	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+		rte_prefetch0((volatile void *)addr);
+		/* Handle WQE wraparound. */
+		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
+			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+		dseg->addr = rte_cpu_to_be_64(addr);
+		/* Memory region key (big endian) for this memory pool. */
+		dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+#ifndef NDEBUG
+		/* Calculate the needed work queue entry size for this packet */
+		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
+			/* MR does not exist. */
+			DEBUG("%p: unable to get MP <-> MR association",
+					(void *)txq);
+			/*
+			 * Restamp entry in case of failure.
+			 * Make sure that size is written correctly
+			 * Note that we give ownership to the SW, not the HW.
+			 */
+			wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+			mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					(sq->head & sq->txbb_cnt) ? 0 : 1);
+			return -1;
+		}
+#endif /* NDEBUG */
+		if (likely(sbuf->data_len)) {
+			byte_count = rte_cpu_to_be_32(sbuf->data_len);
+		} else {
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			byte_count = RTE_BE32(0x80000000);
+		}
+		/*
+		 * If the data segment is not at the beginning of a
+		 * Tx basic block (TXBB) then write the byte count,
+		 * else postpone the writing to just before updating the
+		 * control segment.
+		 */
+		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+			/*
+			 * Need a barrier here before writing the byte_count
+			 * fields to make sure that all the data is visible
+			 * before the byte_count field is set.
+			 * Otherwise, if the segment begins a new cacheline,
+			 * the HCA prefetcher could grab the 64-byte chunk and
+			 * get a valid (!= 0xffffffff) byte count but stale
+			 * data, and end up sending the wrong data.
+			 */
+			rte_io_wmb();
+			dseg->byte_count = byte_count;
+		} else {
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[pv_counter].dseg = dseg;
+			pv[pv_counter++].val = byte_count;
+		}
+	}
+	/* Write the first DWORD of each TXBB save earlier. */
+	if (pv_counter) {
+		/* Need a barrier here before writing the byte_count. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
+	}
+	/* Fill the control parameters for this packet. */
+	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+
+	return nr_txbbs;
+}
 /**
  * DPDK callback for Tx.
  *
@@ -288,10 +402,11 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int i;
 	unsigned int max;
 	struct mlx4_sq *sq = &txq->msq;
-	struct pv *pv = (struct pv *)txq->bounce_buf;
+	int nr_txbbs;
 
 	assert(txq->elts_comp_cd != 0);
-	mlx4_txq_complete(txq);
+	if (likely(txq->elts_comp != 0))
+		mlx4_txq_complete(txq, elts_n, sq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
@@ -316,10 +431,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		} srcrb;
 		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
 		uintptr_t addr;
-		uint32_t byte_count;
-		int wqe_real_size;
-		int nr_txbbs;
-		int pv_counter = 0;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -338,31 +449,22 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-
-		/*
-		 * Calculate the needed work queue entry size
-		 * for this packet.
-		 */
-		wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
-		nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-		/*
-		 * Check that there is room for this WQE in the send
-		 * queue and that the WQE size is legal.
-		 */
-		if (((sq->head - sq->tail) + nr_txbbs +
-		     sq->headroom_txbbs) >= sq->txbb_cnt ||
-		    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-			elt->buf = NULL;
-			break;
-		}
-		/* Get the control and data entries of the WQE. */
-		ctrl = (struct mlx4_wqe_ctrl_seg *)
-				mlx4_get_send_wqe(sq, head_idx);
-		dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-				sizeof(struct mlx4_wqe_ctrl_seg));
-		/* Fill the data segments with buffer information. */
 		if (likely(buf->nb_segs == 1)) {
+			/*
+			 * Check that there is room for this WQE in the send
+			 * queue and that the WQE size is legal
+			 */
+			if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs)
+						>= sq->txbb_cnt ||
+						1 > MLX4_MAX_WQE_TXBBS) {
+				elt->buf = NULL;
+				break;
+			}
+			/* Get the control and data entries of the WQE. */
+			ctrl = (struct mlx4_wqe_ctrl_seg *)
+					mlx4_get_send_wqe(sq, head_idx);
+			dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
+					sizeof(struct mlx4_wqe_ctrl_seg));
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			rte_prefetch0((volatile void *)addr);
 			/* Handle WQE wraparound. */
@@ -371,120 +473,42 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				dseg = (struct mlx4_wqe_data_seg *)sq->buf;
 			dseg->addr = rte_cpu_to_be_64(addr);
 			/* Memory region key (big endian). */
-			dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
-	#ifndef NDEBUG
+			dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+#ifndef NDEBUG
 			if (unlikely(dseg->lkey ==
 				rte_cpu_to_be_32((uint32_t)-1))) {
 				/* MR does not exist. */
 				DEBUG("%p: unable to get MP <-> MR association",
-				      (void *)txq);
+						(void *)txq);
 				/*
 				 * Restamp entry in case of failure.
 				 * Make sure that size is written correctly
 				 * Note that we give ownership to the SW,
 				 * not the HW.
 				 */
-				ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+				ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4)
+							& 0x3f;
 				mlx4_txq_stamp_freed_wqe(sq, head_idx,
-					     (sq->head & sq->txbb_cnt) ? 0 : 1);
+					(sq->head & sq->txbb_cnt) ? 0 : 1);
 				elt->buf = NULL;
 				break;
 			}
-	#endif /* NDEBUG */
+#endif /* NDEBUG */
 			/* Need a barrier here before writing the byte_count. */
 			rte_io_wmb();
 			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
+
+			/* Fill the control parameters for this packet. */
+			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+			nr_txbbs = 1;
 		} else {
-			/* Fill the data segments with buffer information. */
-			struct rte_mbuf *sbuf;
-
-			for (sbuf = buf;
-				 sbuf != NULL;
-				 sbuf = sbuf->next, dseg++) {
-				addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
-				rte_prefetch0((volatile void *)addr);
-				/* Handle WQE wraparound. */
-				if (unlikely(dseg >=
-					(struct mlx4_wqe_data_seg *)sq->eob))
-					dseg = (struct mlx4_wqe_data_seg *)
-							sq->buf;
-				dseg->addr = rte_cpu_to_be_64(addr);
-				/* Memory region key (big endian). */
-				dseg->lkey = mlx4_txq_mp2mr(txq,
-						mlx4_txq_mb2mp(sbuf));
-		#ifndef NDEBUG
-				if (unlikely(dseg->lkey ==
-					rte_cpu_to_be_32((uint32_t)-1))) {
-					/* MR does not exist. */
-					DEBUG("%p: unable to get MP <-> MR association",
-						  (void *)txq);
-					/*
-					 * Restamp entry in case of failure.
-					 * Make sure that size is written
-					 * correctly, note that we give
-					 * ownership to the SW, not the HW.
-					 */
-					ctrl->fence_size =
-						(wqe_real_size >> 4) & 0x3f;
-					mlx4_txq_stamp_freed_wqe(sq, head_idx,
-					    (sq->head & sq->txbb_cnt) ? 0 : 1);
-					elt->buf = NULL;
-					break;
-				}
-		#endif /* NDEBUG */
-				if (likely(sbuf->data_len)) {
-					byte_count =
-					  rte_cpu_to_be_32(sbuf->data_len);
-				} else {
-					/*
-					 * Zero length segment is treated as
-					 * inline segment with zero data.
-					 */
-					byte_count = RTE_BE32(0x80000000);
-				}
-				/*
-				 * If the data segment is not at the beginning
-				 * of a Tx basic block (TXBB) then write the
-				 * byte count, else postpone the writing to
-				 * just before updating the control segment.
-				 */
-				if ((uintptr_t)dseg &
-					(uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-					/*
-					 * Need a barrier here before writing
-					 * the byte_count fields to make sure
-					 * that all the data is visible before
-					 * the byte_count field is set.
-					 * Otherwise, if the segment begins a
-					 * new cacheline, the HCA prefetcher
-					 * could grab the 64-byte chunk and get
-					 * a valid (!= 0xffffffff) byte count
-					 * but stale data, and end up sending
-					 * the wrong data.
-					 */
-					rte_io_wmb();
-					dseg->byte_count = byte_count;
-				} else {
-					/*
-					 * This data segment starts at the
-					 * beginning of a new TXBB, so we
-					 * need to postpone its byte_count
-					 * writing for later.
-					 */
-					pv[pv_counter].dseg = dseg;
-					pv[pv_counter++].val = byte_count;
-				}
+			nr_txbbs = handle_multi_segs(buf, txq, &ctrl);
+			if (nr_txbbs < 0) {
+				elt->buf = NULL;
+				break;
 			}
-		/* Write the first DWORD of each TXBB save earlier. */
-		if (pv_counter) {
-			/* Need a barrier before writing the byte_count. */
-			rte_io_wmb();
-			for (--pv_counter; pv_counter  >= 0; pv_counter--)
-				pv[pv_counter].dseg->byte_count =
-						pv[pv_counter].val;
 		}
-		/* Fill the control parameters for this packet. */
-		ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+
 		/*
 		 * For raw Ethernet, the SOLICIT flag is used to indicate
 		 * that no ICRC should be calculated.
-- 
2.7.4
