DPDK-dev Archive on lore.kernel.org
 help / color / Atom feed
* [dpdk-dev] [PATCH 0/7] net/mlx5: consolidate Tx datapath
@ 2019-07-04 16:29 Viacheslav Ovsiienko
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 1/7] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
                   ` (6 more replies)
  0 siblings, 7 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-04 16:29 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patchset introduces a new implementation of the tx_burst
routine of the mlx5 PMD. The existing implementation is based on
several branches of tx_burst routines optimized for the most
common sets of Tx hardware offloads supported by Mellanox NICs.

It was not easy to update, support and develop this kind of
code - multiple branches impose multiple points to process.
Also, many frequently requested offload combinations are not
yet supported in the most efficient way. That leads to selecting
a tx_burst routine that does not match completely, which harms
the performance.

The new Tx datapath:
  - introduces a unified template for the tx_burst routine to generate
    at compile time the most efficient instances, tuned for the
    specified sets of hardware offloads
  - the best matching instance is chosen at run time during send queue
    configuration
  - almost all possible sets of Tx offloads are supported
  - the best possible method to send each packet is chosen at run time
  - introduces new devargs for more accurate tuning of data inlining

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

Viacheslav Ovsiienko (7):
  net/mlx5: remove Tx datapath implementation
  net/mlx5: add Tx datapath related devargs
  net/mlx5: update Tx datapath definitions
  net/mlx5: add Tx datapath configuration and setup
  net/mlx5: introduce Tx burst routine template
  net/mlx5: implement Tx burst template
  net/mlx5: add minimal required Tx data inline

 doc/guides/nics/mlx5.rst              |  168 +-
 drivers/net/mlx5/mlx5.c               |  155 +-
 drivers/net/mlx5/mlx5.h               |   13 +-
 drivers/net/mlx5/mlx5_defs.h          |   36 +-
 drivers/net/mlx5/mlx5_devx_cmds.c     |  100 +-
 drivers/net/mlx5/mlx5_ethdev.c        |   58 -
 drivers/net/mlx5/mlx5_prm.h           |  355 ++-
 drivers/net/mlx5/mlx5_rxtx.c          | 5260 ++++++++++++++++++++++-----------
 drivers/net/mlx5/mlx5_rxtx.h          |  333 +--
 drivers/net/mlx5/mlx5_rxtx_vec.c      |  175 --
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |  289 --
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  284 --
 drivers/net/mlx5/mlx5_txq.c           |  242 +-
 13 files changed, 4389 insertions(+), 3079 deletions(-)

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH 1/7] net/mlx5: remove Tx datapath implementation
  2019-07-04 16:29 [dpdk-dev] [PATCH 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
@ 2019-07-04 16:29 ` Viacheslav Ovsiienko
  2019-07-15 13:59   ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 2/7] net/mlx5: add Tx datapath related devargs Viacheslav Ovsiienko
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-04 16:29 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch removes the existing Tx datapath code
as a preparation step before introducing the new
implementation. The following entities are being
removed:

- obsolete devargs
- tx_burst() routines
- related PRM definitions
- SQ configuration code
- Tx routine selection code
- incompatible Tx completion code

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c               |   43 -
 drivers/net/mlx5/mlx5.h               |    5 -
 drivers/net/mlx5/mlx5_defs.h          |   16 -
 drivers/net/mlx5/mlx5_ethdev.c        |   58 --
 drivers/net/mlx5/mlx5_prm.h           |   77 --
 drivers/net/mlx5/mlx5_rxtx.c          | 1434 +--------------------------------
 drivers/net/mlx5/mlx5_rxtx.h          |  273 -------
 drivers/net/mlx5/mlx5_rxtx_vec.c      |  175 ----
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |  289 -------
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  284 -------
 drivers/net/mlx5/mlx5_txq.c           |  110 +--
 11 files changed, 35 insertions(+), 2729 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d93f92d..42b36a7 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -68,33 +68,15 @@
 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
 
-/* Device parameter to configure inline send. */
-#define MLX5_TXQ_INLINE "txq_inline"
-
 /*
  * Device parameter to configure the number of TX queues threshold for
  * enabling inline send.
  */
 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
 
-/*
- * Device parameter to configure the number of TX queues threshold for
- * enabling vectorized Tx.
- */
-#define MLX5_TXQS_MAX_VEC "txqs_max_vec"
-
 /* Device parameter to enable multi-packet send WQEs. */
 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
 
-/* Device parameter to include 2 dsegs in the title WQEBB. */
-#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
-
-/* Device parameter to limit the size of inlining packet. */
-#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
-
-/* Device parameter to enable hardware Tx vector. */
-#define MLX5_TX_VEC_EN "tx_vec_en"
-
 /* Device parameter to enable hardware Rx vector. */
 #define MLX5_RX_VEC_EN "rx_vec_en"
 
@@ -902,20 +884,10 @@ struct mlx5_dev_spawn_data {
 		config->mprq.max_memcpy_len = tmp;
 	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
 		config->mprq.min_rxqs_num = tmp;
-	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
-		config->txq_inline = tmp;
 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
 		config->txqs_inline = tmp;
-	} else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
-		config->txqs_vec = tmp;
 	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
 		config->mps = !!tmp;
-	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
-		config->mpw_hdr_dseg = !!tmp;
-	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
-		config->inline_max_packet_sz = tmp;
-	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
-		config->tx_vec_en = !!tmp;
 	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
 		config->rx_vec_en = !!tmp;
 	} else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
@@ -960,13 +932,8 @@ struct mlx5_dev_spawn_data {
 		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
 		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
 		MLX5_RXQS_MIN_MPRQ,
-		MLX5_TXQ_INLINE,
 		MLX5_TXQS_MIN_INLINE,
-		MLX5_TXQS_MAX_VEC,
 		MLX5_TXQ_MPW_EN,
-		MLX5_TXQ_MPW_HDR_DSEG_EN,
-		MLX5_TXQ_MAX_INLINE_LEN,
-		MLX5_TX_VEC_EN,
 		MLX5_RX_VEC_EN,
 		MLX5_L3_VXLAN_EN,
 		MLX5_VF_NL_EN,
@@ -1914,12 +1881,8 @@ struct mlx5_dev_spawn_data {
 	dev_config = (struct mlx5_dev_config){
 		.hw_padding = 0,
 		.mps = MLX5_ARG_UNSET,
-		.tx_vec_en = 1,
 		.rx_vec_en = 1,
-		.txq_inline = MLX5_ARG_UNSET,
 		.txqs_inline = MLX5_ARG_UNSET,
-		.txqs_vec = MLX5_ARG_UNSET,
-		.inline_max_packet_sz = MLX5_ARG_UNSET,
 		.vf_nl_en = 1,
 		.mr_ext_memseg_en = 1,
 		.mprq = {
@@ -1932,9 +1895,6 @@ struct mlx5_dev_spawn_data {
 	};
 	/* Device specific configuration. */
 	switch (pci_dev->id.device_id) {
-	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF:
-		dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS_BLUEFIELD;
-		break;
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
@@ -1944,9 +1904,6 @@ struct mlx5_dev_spawn_data {
 	default:
 		break;
 	}
-	/* Set architecture-dependent default value if unset. */
-	if (dev_config.txqs_vec == MLX5_ARG_UNSET)
-		dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS;
 	for (i = 0; i != ns; ++i) {
 		uint32_t restore;
 
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index e6d24d7..47c9e47 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -188,9 +188,7 @@ struct mlx5_dev_config {
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
-	unsigned int tx_vec_en:1; /* Tx vector is enabled. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
-	unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
 	unsigned int mr_ext_memseg_en:1;
 	/* Whether memseg should be extended for MR creation. */
 	unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */
@@ -214,10 +212,7 @@ struct mlx5_dev_config {
 	unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. */
 	unsigned int ind_table_max_size; /* Maximum indirection table size. */
 	unsigned int max_dump_files_num; /* Maximum dump files per queue. */
-	int txq_inline; /* Maximum packet size for inlining. */
 	int txqs_inline; /* Queue number threshold for inlining. */
-	int txqs_vec; /* Queue number threshold for vectorized Tx. */
-	int inline_max_packet_sz; /* Max packet size for inlining. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 };
 
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 13801a5..6861304 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -60,15 +60,6 @@
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
 #define MLX5_MAX_TSO_HEADER 192
 
-/* Default maximum number of Tx queues for vectorized Tx. */
-#if defined(RTE_ARCH_ARM64)
-#define MLX5_VPMD_MAX_TXQS 8
-#define MLX5_VPMD_MAX_TXQS_BLUEFIELD 16
-#else
-#define MLX5_VPMD_MAX_TXQS 4
-#define MLX5_VPMD_MAX_TXQS_BLUEFIELD MLX5_VPMD_MAX_TXQS
-#endif
-
 /* Threshold of buffer replenishment for vectorized Rx. */
 #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
 	(RTE_MIN(MLX5_VPMD_RX_MAX_BURST, (unsigned int)(n) >> 2))
@@ -76,13 +67,6 @@
 /* Maximum size of burst for vectorized Rx. */
 #define MLX5_VPMD_RX_MAX_BURST 64U
 
-/*
- * Maximum size of burst for vectorized Tx. This is related to the maximum size
- * of Enhanced MPW (eMPW) WQE as vectorized Tx is supported with eMPW.
- * Careful when changing, large value can cause WQE DS to overlap.
- */
-#define MLX5_VPMD_TX_MAX_BURST        32U
-
 /* Number of packets vectorized Rx can simultaneously process in a loop. */
 #define MLX5_VPMD_DESCS_PER_LOOP      4
 
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 9377f96..e98f1c1 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1576,64 +1576,6 @@ int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
 }
 
 /**
- * Configure the TX function to use.
- *
- * @param dev
- *   Pointer to private data structure.
- *
- * @return
- *   Pointer to selected Tx burst function.
- */
-eth_tx_burst_t
-mlx5_select_tx_function(struct rte_eth_dev *dev)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
-	struct mlx5_dev_config *config = &priv->config;
-	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
-	int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
-				    DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
-				    DEV_TX_OFFLOAD_GRE_TNL_TSO |
-				    DEV_TX_OFFLOAD_IP_TNL_TSO |
-				    DEV_TX_OFFLOAD_UDP_TNL_TSO));
-	int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
-				    DEV_TX_OFFLOAD_UDP_TNL_TSO |
-				    DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM));
-	int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);
-
-	assert(priv != NULL);
-	/* Select appropriate TX function. */
-	if (vlan_insert || tso || swp)
-		return tx_pkt_burst;
-	if (config->mps == MLX5_MPW_ENHANCED) {
-		if (mlx5_check_vec_tx_support(dev) > 0) {
-			if (mlx5_check_raw_vec_tx_support(dev) > 0)
-				tx_pkt_burst = mlx5_tx_burst_raw_vec;
-			else
-				tx_pkt_burst = mlx5_tx_burst_vec;
-			DRV_LOG(DEBUG,
-				"port %u selected enhanced MPW Tx vectorized"
-				" function",
-				dev->data->port_id);
-		} else {
-			tx_pkt_burst = mlx5_tx_burst_empw;
-			DRV_LOG(DEBUG,
-				"port %u selected enhanced MPW Tx function",
-				dev->data->port_id);
-		}
-	} else if (config->mps && (config->txq_inline > 0)) {
-		tx_pkt_burst = mlx5_tx_burst_mpw_inline;
-		DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
-			dev->data->port_id);
-	} else if (config->mps) {
-		tx_pkt_burst = mlx5_tx_burst_mpw;
-		DRV_LOG(DEBUG, "port %u selected MPW Tx function",
-			dev->data->port_id);
-	}
-	return tx_pkt_burst;
-}
-
-/**
  * Configure the RX function to use.
  *
  * @param dev
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 1a19958..3589f51 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -39,32 +39,12 @@
 /* Invalidate a CQE. */
 #define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
 
-/* Maximum number of packets a multi-packet WQE can handle. */
-#define MLX5_MPW_DSEG_MAX 5
-
 /* WQE DWORD size */
 #define MLX5_WQE_DWORD_SIZE 16
 
 /* WQE size */
 #define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
 
-/* Max size of a WQE session. */
-#define MLX5_WQE_SIZE_MAX 960U
-
-/* Compute the number of DS. */
-#define MLX5_WQE_DS(n) \
-	(((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)
-
-/* Room for inline data in multi-packet WQE. */
-#define MLX5_MWQE64_INL_DATA 28
-
-/* Default minimum number of Tx queues for inlining packets. */
-#define MLX5_EMPW_MIN_TXQS 8
-
-/* Default max packet length to be inlined. */
-#define MLX5_EMPW_MAX_INLINE_LEN (4U * MLX5_WQE_SIZE)
-
-
 #define MLX5_OPC_MOD_ENHANCED_MPSW 0
 #define MLX5_OPCODE_ENHANCED_MPSW 0x29
 
@@ -164,47 +144,11 @@ enum mlx5_completion_mode {
 	MLX5_COMP_CQE_AND_EQE = 0x3,
 };
 
-/* Subset of struct mlx5_wqe_eth_seg. */
-struct mlx5_wqe_eth_seg_small {
-	uint32_t rsvd0;
-	uint8_t	cs_flags;
-	uint8_t	rsvd1;
-	uint16_t mss;
-	uint32_t flow_table_metadata;
-	uint16_t inline_hdr_sz;
-	uint8_t inline_hdr[2];
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
-struct mlx5_wqe_inl_small {
-	uint32_t byte_cnt;
-	uint8_t raw;
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
-struct mlx5_wqe_ctrl {
-	uint32_t ctrl0;
-	uint32_t ctrl1;
-	uint32_t ctrl2;
-	uint32_t ctrl3;
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
 /* Small common part of the WQE. */
 struct mlx5_wqe {
 	uint32_t ctrl[4];
-	struct mlx5_wqe_eth_seg_small eseg;
-};
-
-/* Vectorize WQE header. */
-struct mlx5_wqe_v {
-	rte_v128u32_t ctrl;
-	rte_v128u32_t eseg;
 };
 
-/* WQE. */
-struct mlx5_wqe64 {
-	struct mlx5_wqe hdr;
-	uint8_t raw[32];
-} __rte_aligned(MLX5_WQE_SIZE);
-
 /* MPW mode. */
 enum mlx5_mpw_mode {
 	MLX5_MPW_DISABLED,
@@ -212,27 +156,6 @@ enum mlx5_mpw_mode {
 	MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2. */
 };
 
-/* MPW session status. */
-enum mlx5_mpw_state {
-	MLX5_MPW_STATE_OPENED,
-	MLX5_MPW_INL_STATE_OPENED,
-	MLX5_MPW_ENHANCED_STATE_OPENED,
-	MLX5_MPW_STATE_CLOSED,
-};
-
-/* MPW session descriptor. */
-struct mlx5_mpw {
-	enum mlx5_mpw_state state;
-	unsigned int pkts_n;
-	unsigned int len;
-	unsigned int total_len;
-	volatile struct mlx5_wqe *wqe;
-	union {
-		volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX];
-		volatile uint8_t *raw;
-	} data;
-};
-
 /* WQE for Multi-Packet RQ. */
 struct mlx5_wqe_mprq {
 	struct mlx5_wqe_srq_next_seg next_seg;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index c1dc8c4..f2d6918 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -288,140 +288,6 @@
 }
 
 /**
- * Return the size of tailroom of WQ.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param addr
- *   Pointer to tail of WQ.
- *
- * @return
- *   Size of tailroom.
- */
-static inline size_t
-tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
-{
-	size_t tailroom;
-	tailroom = (uintptr_t)(txq->wqes) +
-		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
-		   (uintptr_t)addr;
-	return tailroom;
-}
-
-/**
- * Copy data to tailroom of circular queue.
- *
- * @param dst
- *   Pointer to destination.
- * @param src
- *   Pointer to source.
- * @param n
- *   Number of bytes to copy.
- * @param base
- *   Pointer to head of queue.
- * @param tailroom
- *   Size of tailroom from dst.
- *
- * @return
- *   Pointer after copied data.
- */
-static inline void *
-mlx5_copy_to_wq(void *dst, const void *src, size_t n,
-		void *base, size_t tailroom)
-{
-	void *ret;
-
-	if (n > tailroom) {
-		rte_memcpy(dst, src, tailroom);
-		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
-			   n - tailroom);
-		ret = (uint8_t *)base + n - tailroom;
-	} else {
-		rte_memcpy(dst, src, n);
-		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
-	}
-	return ret;
-}
-
-/**
- * Inline TSO headers into WQE.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-inline_tso(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
-	   uint32_t *length,
-	   uintptr_t *addr,
-	   uint16_t *pkt_inline_sz,
-	   uint8_t **raw,
-	   uint16_t *max_wqe,
-	   uint16_t *tso_segsz,
-	   uint16_t *tso_header_sz)
-{
-	uintptr_t end = (uintptr_t)(((uintptr_t)txq->wqes) +
-				    (1 << txq->wqe_n) * MLX5_WQE_SIZE);
-	unsigned int copy_b;
-	uint8_t vlan_sz = (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
-	const uint8_t tunneled = txq->tunnel_en && (buf->ol_flags &
-				 PKT_TX_TUNNEL_MASK);
-	uint16_t n_wqe;
-
-	*tso_segsz = buf->tso_segsz;
-	*tso_header_sz = buf->l2_len + vlan_sz + buf->l3_len + buf->l4_len;
-	if (unlikely(*tso_segsz == 0 || *tso_header_sz == 0)) {
-		txq->stats.oerrors++;
-		return -EINVAL;
-	}
-	if (tunneled)
-		*tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
-	/* First seg must contain all TSO headers. */
-	if (unlikely(*tso_header_sz > MLX5_MAX_TSO_HEADER) ||
-		     *tso_header_sz > DATA_LEN(buf)) {
-		txq->stats.oerrors++;
-		return -EINVAL;
-	}
-	copy_b = *tso_header_sz - *pkt_inline_sz;
-	if (!copy_b || ((end - (uintptr_t)*raw) < copy_b))
-		return -EAGAIN;
-	n_wqe = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
-	if (unlikely(*max_wqe < n_wqe))
-		return -EINVAL;
-	*max_wqe -= n_wqe;
-	rte_memcpy((void *)*raw, (void *)*addr, copy_b);
-	*length -= copy_b;
-	*addr += copy_b;
-	copy_b = MLX5_WQE_DS(copy_b) * MLX5_WQE_DWORD_SIZE;
-	*pkt_inline_sz += copy_b;
-	*raw += copy_b;
-	return 0;
-}
-
-/**
- * DPDK callback to check the status of a tx descriptor.
- *
- * @param tx_queue
- *   The tx queue.
- * @param[in] offset
- *   The index of the descriptor in the ring.
- *
- * @return
- *   The status of the tx descriptor.
- */
-int
-mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
-{
-	struct mlx5_txq_data *txq = tx_queue;
-	uint16_t used;
-
-	mlx5_tx_complete(txq);
-	used = txq->elts_head - txq->elts_tail;
-	if (offset < used)
-		return RTE_ETH_TX_DESC_FULL;
-	return RTE_ETH_TX_DESC_DONE;
-}
-
-/**
  * Internal function to compute the number of used descriptors in an RX queue
  *
  * @param rxq
@@ -655,7 +521,7 @@
 						    (1 << txq->cqe_n));
 			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
 						    (const void *)((uintptr_t)
-						    tx_mlx5_wqe(txq, 0)),
+						    txq->wqes),
 						    MLX5_WQE_SIZE *
 						    (1 << txq->wqe_n));
 			txq_ctrl->dump_file_n++;
@@ -683,1247 +549,6 @@
 }
 
 /**
- * DPDK callback for TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	unsigned int k = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
-	unsigned int segs_n = 0;
-	const unsigned int max_inline = txq->max_inline;
-	uint64_t addr_64;
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch first packet cacheline. */
-	rte_prefetch0(*pkts);
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *pkts; /* First_seg. */
-		uint8_t *raw;
-		volatile struct mlx5_wqe_v *wqe = NULL;
-		volatile rte_v128u32_t *dseg = NULL;
-		uint32_t length;
-		unsigned int ds = 0;
-		unsigned int sg = 0; /* counter of additional segs attached. */
-		uintptr_t addr;
-		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
-		uint16_t tso_header_sz = 0;
-		uint16_t ehdr;
-		uint8_t cs_flags;
-		uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
-		uint32_t swp_offsets = 0;
-		uint8_t swp_types = 0;
-		rte_be32_t metadata;
-		uint16_t tso_segsz = 0;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		uint32_t total_length = 0;
-#endif
-		int ret;
-
-		segs_n = buf->nb_segs;
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		max_elts -= segs_n;
-		sg = --segs_n;
-		if (unlikely(--max_wqe == 0))
-			break;
-		wqe = (volatile struct mlx5_wqe_v *)
-			tx_mlx5_wqe(txq, txq->wqe_ci);
-		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-		if (pkts_n - i > 1)
-			rte_prefetch0(*(pkts + 1));
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		length = DATA_LEN(buf);
-		ehdr = (((uint8_t *)addr)[1] << 8) |
-		       ((uint8_t *)addr)[0];
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		total_length = length;
-#endif
-		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
-			txq->stats.oerrors++;
-			break;
-		}
-		/* Update element. */
-		(*txq->elts)[elts_head & elts_m] = buf;
-		/* Prefetch next buffer data. */
-		if (pkts_n - i > 1)
-			rte_prefetch0(
-			    rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types);
-		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Replace the Ethernet type by the VLAN if necessary. */
-		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
-							 buf->vlan_tci);
-			unsigned int len = 2 * RTE_ETHER_ADDR_LEN - 2;
-
-			addr += 2;
-			length -= 2;
-			/* Copy Destination and source mac address. */
-			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
-			/* Copy VLAN. */
-			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
-			/* Copy missing two bytes to end the DSeg. */
-			memcpy((uint8_t *)raw + len + sizeof(vlan),
-			       ((uint8_t *)addr) + len, 2);
-			addr += len + 2;
-			length -= (len + 2);
-		} else {
-			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
-			       MLX5_WQE_DWORD_SIZE);
-			length -= pkt_inline_sz;
-			addr += pkt_inline_sz;
-		}
-		raw += MLX5_WQE_DWORD_SIZE;
-		if (tso) {
-			ret = inline_tso(txq, buf, &length,
-					 &addr, &pkt_inline_sz,
-					 &raw, &max_wqe,
-					 &tso_segsz, &tso_header_sz);
-			if (ret == -EINVAL) {
-				break;
-			} else if (ret == -EAGAIN) {
-				/* NOP WQE. */
-				wqe->ctrl = (rte_v128u32_t){
-					rte_cpu_to_be_32(txq->wqe_ci << 8),
-					rte_cpu_to_be_32(txq->qp_num_8s | 1),
-					rte_cpu_to_be_32
-						(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-					0,
-				};
-				ds = 1;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-				total_length = 0;
-#endif
-				k++;
-				goto next_wqe;
-			}
-		}
-		/* Inline if enough room. */
-		if (max_inline || tso) {
-			uint32_t inl = 0;
-			uintptr_t end = (uintptr_t)
-				(((uintptr_t)txq->wqes) +
-				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
-			unsigned int inline_room = max_inline *
-						   RTE_CACHE_LINE_SIZE -
-						   (pkt_inline_sz - 2) -
-						   !!tso * sizeof(inl);
-			uintptr_t addr_end;
-			unsigned int copy_b;
-
-pkt_inline:
-			addr_end = RTE_ALIGN_FLOOR(addr + inline_room,
-						   RTE_CACHE_LINE_SIZE);
-			copy_b = (addr_end > addr) ?
-				 RTE_MIN((addr_end - addr), length) : 0;
-			if (copy_b && ((end - (uintptr_t)raw) >
-				       (copy_b + sizeof(inl)))) {
-				/*
-				 * One Dseg remains in the current WQE.  To
-				 * keep the computation positive, it is
-				 * removed after the bytes to Dseg conversion.
-				 */
-				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
-
-				if (unlikely(max_wqe < n))
-					break;
-				max_wqe -= n;
-				if (tso) {
-					assert(inl == 0);
-					inl = rte_cpu_to_be_32(copy_b |
-							       MLX5_INLINE_SEG);
-					rte_memcpy((void *)raw,
-						   (void *)&inl, sizeof(inl));
-					raw += sizeof(inl);
-					pkt_inline_sz += sizeof(inl);
-				}
-				rte_memcpy((void *)raw, (void *)addr, copy_b);
-				addr += copy_b;
-				length -= copy_b;
-				pkt_inline_sz += copy_b;
-			}
-			/*
-			 * 2 DWORDs consumed by the WQE header + ETH segment +
-			 * the size of the inline part of the packet.
-			 */
-			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
-			if (length > 0) {
-				if (ds % (MLX5_WQE_SIZE /
-					  MLX5_WQE_DWORD_SIZE) == 0) {
-					if (unlikely(--max_wqe == 0))
-						break;
-					dseg = (volatile rte_v128u32_t *)
-					       tx_mlx5_wqe(txq, txq->wqe_ci +
-							   ds / 4);
-				} else {
-					dseg = (volatile rte_v128u32_t *)
-						((uintptr_t)wqe +
-						 (ds * MLX5_WQE_DWORD_SIZE));
-				}
-				goto use_dseg;
-			} else if (!segs_n) {
-				goto next_pkt;
-			} else {
-				/*
-				 * Further inline the next segment only for
-				 * non-TSO packets.
-				 */
-				if (!tso) {
-					raw += copy_b;
-					inline_room -= copy_b;
-				} else {
-					inline_room = 0;
-				}
-				/* Move to the next segment. */
-				--segs_n;
-				buf = buf->next;
-				assert(buf);
-				addr = rte_pktmbuf_mtod(buf, uintptr_t);
-				length = DATA_LEN(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-				total_length += length;
-#endif
-				(*txq->elts)[++elts_head & elts_m] = buf;
-				goto pkt_inline;
-			}
-		} else {
-			/*
-			 * No inline has been done in the packet, only the
-			 * Ethernet Header as been stored.
-			 */
-			dseg = (volatile rte_v128u32_t *)
-				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
-			ds = 3;
-use_dseg:
-			/* Add the remaining packet as a simple ds. */
-			addr_64 = rte_cpu_to_be_64(addr);
-			*dseg = (rte_v128u32_t){
-				rte_cpu_to_be_32(length),
-				mlx5_tx_mb2mr(txq, buf),
-				addr_64,
-				addr_64 >> 32,
-			};
-			++ds;
-			if (!segs_n)
-				goto next_pkt;
-		}
-next_seg:
-		assert(buf);
-		assert(ds);
-		assert(wqe);
-		/*
-		 * Spill on next WQE when the current one does not have
-		 * enough room left. Size of WQE must a be a multiple
-		 * of data segment size.
-		 */
-		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
-		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
-			if (unlikely(--max_wqe == 0))
-				break;
-			dseg = (volatile rte_v128u32_t *)
-			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
-			rte_prefetch0(tx_mlx5_wqe(txq,
-						  txq->wqe_ci + ds / 4 + 1));
-		} else {
-			++dseg;
-		}
-		++ds;
-		buf = buf->next;
-		assert(buf);
-		length = DATA_LEN(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		total_length += length;
-#endif
-		/* Store segment information. */
-		addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
-		*dseg = (rte_v128u32_t){
-			rte_cpu_to_be_32(length),
-			mlx5_tx_mb2mr(txq, buf),
-			addr_64,
-			addr_64 >> 32,
-		};
-		(*txq->elts)[++elts_head & elts_m] = buf;
-		if (--segs_n)
-			goto next_seg;
-next_pkt:
-		if (ds > MLX5_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		++elts_head;
-		++pkts;
-		++i;
-		j += sg;
-		/* Initialize known and common part of the WQE structure. */
-		if (tso) {
-			wqe->ctrl = (rte_v128u32_t){
-				rte_cpu_to_be_32((txq->wqe_ci << 8) |
-						 MLX5_OPCODE_TSO),
-				rte_cpu_to_be_32(txq->qp_num_8s | ds),
-				rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-				0,
-			};
-			wqe->eseg = (rte_v128u32_t){
-				swp_offsets,
-				cs_flags | (swp_types << 8) |
-				(rte_cpu_to_be_16(tso_segsz) << 16),
-				metadata,
-				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
-			};
-		} else {
-			wqe->ctrl = (rte_v128u32_t){
-				rte_cpu_to_be_32((txq->wqe_ci << 8) |
-						 MLX5_OPCODE_SEND),
-				rte_cpu_to_be_32(txq->qp_num_8s | ds),
-				rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-				0,
-			};
-			wqe->eseg = (rte_v128u32_t){
-				swp_offsets,
-				cs_flags | (swp_types << 8),
-				metadata,
-				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
-			};
-		}
-next_wqe:
-		txq->wqe_ci += (ds + 3) / 4;
-		/* Save the last successful WQE for completion request */
-		last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += total_length;
-#endif
-	} while (i < pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely((i + k) == 0))
-		return 0;
-	txq->elts_head += (i + j);
-	/* Check whether completion threshold has been reached. */
-	comp = txq->elts_comp + i + j + k;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		last_wqe->ctrl2 = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						   MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		last_wqe->ctrl3 = txq->elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
-	return i;
-}
-
-/**
- * Open a MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param length
- *   Packet length.
- */
-static inline void
-mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
-		(volatile struct mlx5_wqe_data_seg (*)[])
-		tx_mlx5_wqe(txq, idx + 1);
-
-	mpw->state = MLX5_MPW_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->len = length;
-	mpw->total_len = 0;
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
-	mpw->wqe->eseg.inline_hdr_sz = 0;
-	mpw->wqe->eseg.rsvd0 = 0;
-	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.flow_table_metadata = 0;
-	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
-					     (txq->wqe_ci << 8) |
-					     MLX5_OPCODE_TSO);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
-		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
-	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
-		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
-	mpw->data.dseg[2] = &(*dseg)[0];
-	mpw->data.dseg[3] = &(*dseg)[1];
-	mpw->data.dseg[4] = &(*dseg)[2];
-}
-
-/**
- * Close a MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- */
-static inline void
-mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	unsigned int num = mpw->pkts_n;
-
-	/*
-	 * Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	if (num < 3)
-		++txq->wqe_ci;
-	else
-		txq->wqe_ci += 2;
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-}
-
-/**
- * DPDK callback for TX with MPW support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch first packet cacheline. */
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uint32_t length;
-		unsigned int segs_n = buf->nb_segs;
-		uint32_t cs_flags;
-		rte_be32_t metadata;
-
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		/* Do not bother with large packets MPW cannot handle. */
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		max_elts -= segs_n;
-		--pkts_n;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		assert(length);
-		/* Start new session if packet differs. */
-		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
-		    ((mpw.len != length) ||
-		     (segs_n != 1) ||
-		     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-		     (mpw.wqe->eseg.cs_flags != cs_flags)))
-			mlx5_mpw_close(txq, &mpw);
-		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
-			/*
-			 * Multi-Packet WQE consumes at most two WQE.
-			 * mlx5_mpw_new() expects to be able to use such
-			 * resources.
-			 */
-			if (unlikely(max_wqe < 2))
-				break;
-			max_wqe -= 2;
-			mlx5_mpw_new(txq, &mpw, length);
-			mpw.wqe->eseg.cs_flags = cs_flags;
-			mpw.wqe->eseg.flow_table_metadata = metadata;
-		}
-		/* Multi-segment packets must be alone in their MPW. */
-		assert((segs_n == 1) || (mpw.pkts_n == 0));
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-		length = 0;
-#endif
-		do {
-			volatile struct mlx5_wqe_data_seg *dseg;
-			uintptr_t addr;
-
-			assert(buf);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			dseg = mpw.data.dseg[mpw.pkts_n];
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			*dseg = (struct mlx5_wqe_data_seg){
-				.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
-				.lkey = mlx5_tx_mb2mr(txq, buf),
-				.addr = rte_cpu_to_be_64(addr),
-			};
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-			length += DATA_LEN(buf);
-#endif
-			buf = buf->next;
-			++mpw.pkts_n;
-			++j;
-		} while (--segs_n);
-		assert(length == mpw.len);
-		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
-			mlx5_mpw_close(txq, &mpw);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	/* "j" includes both packets and segments. */
-	comp = txq->elts_comp + j;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	if (mpw.state == MLX5_MPW_STATE_OPENED)
-		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * Open a MPW inline session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param length
- *   Packet length.
- */
-static inline void
-mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
-		    uint32_t length)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-	struct mlx5_wqe_inl_small *inl;
-
-	mpw->state = MLX5_MPW_INL_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->len = length;
-	mpw->total_len = 0;
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
-					     (txq->wqe_ci << 8) |
-					     MLX5_OPCODE_TSO);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
-	mpw->wqe->eseg.inline_hdr_sz = 0;
-	mpw->wqe->eseg.cs_flags = 0;
-	mpw->wqe->eseg.rsvd0 = 0;
-	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.flow_table_metadata = 0;
-	inl = (struct mlx5_wqe_inl_small *)
-		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
-	mpw->data.raw = (uint8_t *)&inl->raw;
-}
-
-/**
- * Close a MPW inline session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- */
-static inline void
-mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	unsigned int size;
-	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
-		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
-
-	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
-	/*
-	 * Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
-					     MLX5_WQE_DS(size));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
-	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
-}
-
-/**
- * DPDK callback for TX with MPW inline support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
-			 uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-	/*
-	 * Compute the maximum number of WQE which can be consumed by inline
-	 * code.
-	 * - 2 DSEG for:
-	 *   - 1 control segment,
-	 *   - 1 Ethernet segment,
-	 * - N Dseg from the inline request.
-	 */
-	const unsigned int wqe_inl_n =
-		((2 * MLX5_WQE_DWORD_SIZE +
-		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
-		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch first packet cacheline. */
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uintptr_t addr;
-		uint32_t length;
-		unsigned int segs_n = buf->nb_segs;
-		uint8_t cs_flags;
-		rte_be32_t metadata;
-
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		/* Do not bother with large packets MPW cannot handle. */
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		max_elts -= segs_n;
-		--pkts_n;
-		/*
-		 * Compute max_wqe in case less WQE were consumed in previous
-		 * iteration.
-		 */
-		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		/* Start new session if packet differs. */
-		if (mpw.state == MLX5_MPW_STATE_OPENED) {
-			if ((mpw.len != length) ||
-			    (segs_n != 1) ||
-			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags))
-				mlx5_mpw_close(txq, &mpw);
-		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
-			if ((mpw.len != length) ||
-			    (segs_n != 1) ||
-			    (length > inline_room) ||
-			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
-				mlx5_mpw_inline_close(txq, &mpw);
-				inline_room =
-					txq->max_inline * RTE_CACHE_LINE_SIZE;
-			}
-		}
-		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
-			if ((segs_n != 1) ||
-			    (length > inline_room)) {
-				/*
-				 * Multi-Packet WQE consumes at most two WQE.
-				 * mlx5_mpw_new() expects to be able to use
-				 * such resources.
-				 */
-				if (unlikely(max_wqe < 2))
-					break;
-				max_wqe -= 2;
-				mlx5_mpw_new(txq, &mpw, length);
-				mpw.wqe->eseg.cs_flags = cs_flags;
-				mpw.wqe->eseg.flow_table_metadata = metadata;
-			} else {
-				if (unlikely(max_wqe < wqe_inl_n))
-					break;
-				max_wqe -= wqe_inl_n;
-				mlx5_mpw_inline_new(txq, &mpw, length);
-				mpw.wqe->eseg.cs_flags = cs_flags;
-				mpw.wqe->eseg.flow_table_metadata = metadata;
-			}
-		}
-		/* Multi-segment packets must be alone in their MPW. */
-		assert((segs_n == 1) || (mpw.pkts_n == 0));
-		if (mpw.state == MLX5_MPW_STATE_OPENED) {
-			assert(inline_room ==
-			       txq->max_inline * RTE_CACHE_LINE_SIZE);
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-			length = 0;
-#endif
-			do {
-				volatile struct mlx5_wqe_data_seg *dseg;
-
-				assert(buf);
-				(*txq->elts)[elts_head++ & elts_m] = buf;
-				dseg = mpw.data.dseg[mpw.pkts_n];
-				addr = rte_pktmbuf_mtod(buf, uintptr_t);
-				*dseg = (struct mlx5_wqe_data_seg){
-					.byte_count =
-					       rte_cpu_to_be_32(DATA_LEN(buf)),
-					.lkey = mlx5_tx_mb2mr(txq, buf),
-					.addr = rte_cpu_to_be_64(addr),
-				};
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-				length += DATA_LEN(buf);
-#endif
-				buf = buf->next;
-				++mpw.pkts_n;
-				++j;
-			} while (--segs_n);
-			assert(length == mpw.len);
-			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
-				mlx5_mpw_close(txq, &mpw);
-		} else {
-			unsigned int max;
-
-			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
-			assert(length <= inline_room);
-			assert(length == DATA_LEN(buf));
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			/* Maximum number of bytes before wrapping. */
-			max = ((((uintptr_t)(txq->wqes)) +
-				(1 << txq->wqe_n) *
-				MLX5_WQE_SIZE) -
-			       (uintptr_t)mpw.data.raw);
-			if (length > max) {
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)addr,
-					   max);
-				mpw.data.raw = (volatile void *)txq->wqes;
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)(addr + max),
-					   length - max);
-				mpw.data.raw += length - max;
-			} else {
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)addr,
-					   length);
-
-				if (length == max)
-					mpw.data.raw =
-						(volatile void *)txq->wqes;
-				else
-					mpw.data.raw += length;
-			}
-			++mpw.pkts_n;
-			mpw.total_len += length;
-			++j;
-			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
-				mlx5_mpw_inline_close(txq, &mpw);
-				inline_room =
-					txq->max_inline * RTE_CACHE_LINE_SIZE;
-			} else {
-				inline_room -= length;
-			}
-		}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	/* "j" includes both packets and segments. */
-	comp = txq->elts_comp + j;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
-		mlx5_mpw_inline_close(txq, &mpw);
-	else if (mpw.state == MLX5_MPW_STATE_OPENED)
-		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * Open an Enhanced MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param length
- *   Packet length.
- */
-static inline void
-mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-
-	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->total_len = sizeof(struct mlx5_wqe);
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->ctrl[0] =
-		rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
-				 (txq->wqe_ci << 8) |
-				 MLX5_OPCODE_ENHANCED_MPSW);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
-	if (unlikely(padding)) {
-		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
-
-		/* Pad the first 2 DWORDs with zero-length inline header. */
-		*(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
-		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
-			rte_cpu_to_be_32(MLX5_INLINE_SEG);
-		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
-		/* Start from the next WQEBB. */
-		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
-	} else {
-		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
-	}
-}
-
-/**
- * Close an Enhanced MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- *
- * @return
- *   Number of consumed WQEs.
- */
-static inline uint16_t
-mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	uint16_t ret;
-
-	/* Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
-					     MLX5_WQE_DS(mpw->total_len));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
-	txq->wqe_ci += ret;
-	return ret;
-}
-
-/**
- * TX with Enhanced MPW support.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	       uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
-	unsigned int mpw_room = 0;
-	unsigned int inl_pad = 0;
-	uint32_t inl_hdr;
-	uint64_t addr_64;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uintptr_t addr;
-		unsigned int do_inline = 0; /* Whether inline is possible. */
-		uint32_t length;
-		uint8_t cs_flags;
-		rte_be32_t metadata;
-
-		/* Multi-segmented packet is handled in slow-path outside. */
-		assert(NB_SEGS(buf) == 1);
-		/* Make sure there is enough room to store this packet. */
-		if (max_elts - j == 0)
-			break;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		/* Start new session if:
-		 * - multi-segment packet
-		 * - no space left even for a dseg
-		 * - next packet can be inlined with a new WQE
-		 * - cs_flag differs
-		 */
-		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
-			if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
-			     mpw_room) ||
-			    (length <= txq->inline_max_packet_sz &&
-			     inl_pad + sizeof(inl_hdr) + length >
-			     mpw_room) ||
-			     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags))
-				max_wqe -= mlx5_empw_close(txq, &mpw);
-		}
-		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
-			/* In Enhanced MPW, inline as much as the budget is
-			 * allowed. The remaining space is to be filled with
-			 * dsegs. If the title WQEBB isn't padded, it will have
-			 * 2 dsegs there.
-			 */
-			mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
-					   (max_inline ? max_inline :
-					    pkts_n * MLX5_WQE_DWORD_SIZE) +
-					   MLX5_WQE_SIZE);
-			if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
-				break;
-			/* Don't pad the title WQEBB to not waste WQ. */
-			mlx5_empw_new(txq, &mpw, 0);
-			mpw_room -= mpw.total_len;
-			inl_pad = 0;
-			do_inline = length <= txq->inline_max_packet_sz &&
-				    sizeof(inl_hdr) + length <= mpw_room &&
-				    !txq->mpw_hdr_dseg;
-			mpw.wqe->eseg.cs_flags = cs_flags;
-			mpw.wqe->eseg.flow_table_metadata = metadata;
-		} else {
-			/* Evaluate whether the next packet can be inlined.
-			 * Inlininig is possible when:
-			 * - length is less than configured value
-			 * - length fits for remaining space
-			 * - not required to fill the title WQEBB with dsegs
-			 */
-			do_inline =
-				length <= txq->inline_max_packet_sz &&
-				inl_pad + sizeof(inl_hdr) + length <=
-				 mpw_room &&
-				(!txq->mpw_hdr_dseg ||
-				 mpw.total_len >= MLX5_WQE_SIZE);
-		}
-		if (max_inline && do_inline) {
-			/* Inline packet into WQE. */
-			unsigned int max;
-
-			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
-			assert(length == DATA_LEN(buf));
-			inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			mpw.data.raw = (volatile void *)
-				((uintptr_t)mpw.data.raw + inl_pad);
-			max = tx_mlx5_wq_tailroom(txq,
-					(void *)(uintptr_t)mpw.data.raw);
-			/* Copy inline header. */
-			mpw.data.raw = (volatile void *)
-				mlx5_copy_to_wq(
-					  (void *)(uintptr_t)mpw.data.raw,
-					  &inl_hdr,
-					  sizeof(inl_hdr),
-					  (void *)(uintptr_t)txq->wqes,
-					  max);
-			max = tx_mlx5_wq_tailroom(txq,
-					(void *)(uintptr_t)mpw.data.raw);
-			/* Copy packet data. */
-			mpw.data.raw = (volatile void *)
-				mlx5_copy_to_wq(
-					  (void *)(uintptr_t)mpw.data.raw,
-					  (void *)addr,
-					  length,
-					  (void *)(uintptr_t)txq->wqes,
-					  max);
-			++mpw.pkts_n;
-			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
-			/* No need to get completion as the entire packet is
-			 * copied to WQ. Free the buf right away.
-			 */
-			rte_pktmbuf_free_seg(buf);
-			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
-			/* Add pad in the next packet if any. */
-			inl_pad = (((uintptr_t)mpw.data.raw +
-					(MLX5_WQE_DWORD_SIZE - 1)) &
-					~(MLX5_WQE_DWORD_SIZE - 1)) -
-				  (uintptr_t)mpw.data.raw;
-		} else {
-			/* No inline. Load a dseg of packet pointer. */
-			volatile rte_v128u32_t *dseg;
-
-			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
-			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
-			assert(length == DATA_LEN(buf));
-			if (!tx_mlx5_wq_tailroom(txq,
-					(void *)((uintptr_t)mpw.data.raw
-						+ inl_pad)))
-				dseg = (volatile void *)txq->wqes;
-			else
-				dseg = (volatile void *)
-					((uintptr_t)mpw.data.raw +
-					 inl_pad);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
-								    uintptr_t));
-			*dseg = (rte_v128u32_t) {
-				rte_cpu_to_be_32(length),
-				mlx5_tx_mb2mr(txq, buf),
-				addr_64,
-				addr_64 >> 32,
-			};
-			mpw.data.raw = (volatile void *)(dseg + 1);
-			mpw.total_len += (inl_pad + sizeof(*dseg));
-			++j;
-			++mpw.pkts_n;
-			mpw_room -= (inl_pad + sizeof(*dseg));
-			inl_pad = 0;
-		}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (i < pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
-			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
-			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-		txq->mpw_comp = txq->wqe_ci;
-	} else {
-		txq->elts_comp += j;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
-		mlx5_empw_close(txq, &mpw);
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * DPDK callback for TX with Enhanced MPW support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint16_t n;
-		uint16_t ret;
-
-		n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
-		if (n) {
-			ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
-			if (!ret)
-				break;
-			nb_tx += ret;
-		}
-		n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
-		if (n) {
-			ret = txq_burst_empw(txq, &pkts[nb_tx], n);
-			if (!ret)
-				break;
-			nb_tx += ret;
-		}
-	}
-	return nb_tx;
-}
-
-/**
  * Translate RX completion flags to packet type.
  *
  * @param[in] rxq
@@ -2867,22 +1492,6 @@
  */
 
 __rte_weak uint16_t
-mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused,
-		      struct rte_mbuf **pkts __rte_unused,
-		      uint16_t pkts_n __rte_unused)
-{
-	return 0;
-}
-
-__rte_weak uint16_t
-mlx5_tx_burst_vec(void *dpdk_txq __rte_unused,
-		  struct rte_mbuf **pkts __rte_unused,
-		  uint16_t pkts_n __rte_unused)
-{
-	return 0;
-}
-
-__rte_weak uint16_t
 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
 		  struct rte_mbuf **pkts __rte_unused,
 		  uint16_t pkts_n __rte_unused)
@@ -2891,25 +1500,50 @@
 }
 
 __rte_weak int
-mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
+mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
 {
 	return -ENOTSUP;
 }
 
 __rte_weak int
-mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
+mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
 {
 	return -ENOTSUP;
 }
 
-__rte_weak int
-mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
+/**
+ * DPDK callback to check the status of a tx descriptor.
+ *
+ * @param tx_queue
+ *   The tx queue.
+ * @param[in] offset
+ *   The index of the descriptor in the ring.
+ *
+ * @return
+ *   The status of the tx descriptor.
+ */
+int
+mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
 {
-	return -ENOTSUP;
+	(void)tx_queue;
+	(void)offset;
+	return RTE_ETH_TX_DESC_FULL;
 }
 
-__rte_weak int
-mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
+/**
+ * Configure the TX function to use.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   Pointer to selected Tx burst function.
+ */
+eth_tx_burst_t
+mlx5_select_tx_function(struct rte_eth_dev *dev)
 {
-	return -ENOTSUP;
+	(void)dev;
+	return removed_tx_burst;
 }
+
+
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3d79c18..acde09d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -329,14 +329,6 @@ struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
 void mlx5_set_ptype_table(void);
 void mlx5_set_cksum_table(void);
 void mlx5_set_swp_types_table(void);
-uint16_t mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
-		       uint16_t pkts_n);
-uint16_t mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts,
-			   uint16_t pkts_n);
-uint16_t mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
-				  uint16_t pkts_n);
-uint16_t mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts,
-			    uint16_t pkts_n);
 __rte_noinline uint16_t mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
 					volatile struct mlx5_err_cqe *err_cqe);
 uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
@@ -360,14 +352,8 @@ int mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
 			const struct mlx5_mp_arg_queue_state_modify *sm);
 
 /* Vectorized version of mlx5_rxtx.c */
-int mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev);
-int mlx5_check_vec_tx_support(struct rte_eth_dev *dev);
 int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);
 int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);
-uint16_t mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-			       uint16_t pkts_n);
-uint16_t mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-			   uint16_t pkts_n);
 uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
 			   uint16_t pkts_n);
 
@@ -478,122 +464,6 @@ enum mlx5_cqe_status {
 }
 
 /**
- * Return the address of the WQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param  wqe_ci
- *   WQE consumer index.
- *
- * @return
- *   WQE address.
- */
-static inline uintptr_t *
-tx_mlx5_wqe(struct mlx5_txq_data *txq, uint16_t ci)
-{
-	ci &= ((1 << txq->wqe_n) - 1);
-	return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
-}
-
-/**
- * Handle the next CQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- *
- * @return
- *   The last Tx buffer element to free.
- */
-static __rte_always_inline uint16_t
-mlx5_tx_cqe_handle(struct mlx5_txq_data *txq)
-{
-	const unsigned int cqe_n = 1 << txq->cqe_n;
-	const unsigned int cqe_cnt = cqe_n - 1;
-	uint16_t last_elts;
-	union {
-		volatile struct mlx5_cqe *cqe;
-		volatile struct mlx5_err_cqe *err_cqe;
-	} u = {
-		.cqe =  &(*txq->cqes)[txq->cq_ci & cqe_cnt],
-	};
-	int ret = check_cqe(u.cqe, cqe_n, txq->cq_ci);
-
-	if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
-		if (unlikely(ret == MLX5_CQE_STATUS_ERR))
-			last_elts = mlx5_tx_error_cqe_handle(txq, u.err_cqe);
-		else
-			/* Do not release buffers. */
-			return txq->elts_tail;
-	} else {
-		uint16_t new_wqe_pi = rte_be_to_cpu_16(u.cqe->wqe_counter);
-		volatile struct mlx5_wqe_ctrl *ctrl =
-				(volatile struct mlx5_wqe_ctrl *)
-					tx_mlx5_wqe(txq, new_wqe_pi);
-
-		/* Release completion burst buffers. */
-		last_elts = ctrl->ctrl3;
-		txq->wqe_pi = new_wqe_pi;
-		txq->cq_ci++;
-	}
-	rte_compiler_barrier();
-	*txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
-	return last_elts;
-}
-
-/**
- * Manage TX completions.
- *
- * When sending a burst, mlx5_tx_burst() posts several WRs.
- *
- * @param txq
- *   Pointer to TX queue structure.
- */
-static __rte_always_inline void
-mlx5_tx_complete(struct mlx5_txq_data *txq)
-{
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	uint16_t elts_free = txq->elts_tail;
-	uint16_t elts_tail;
-	struct rte_mbuf *m, *free[elts_n];
-	struct rte_mempool *pool = NULL;
-	unsigned int blk_n = 0;
-
-	elts_tail = mlx5_tx_cqe_handle(txq);
-	assert((elts_tail & elts_m) < (1 << txq->wqe_n));
-	/* Free buffers. */
-	while (elts_free != elts_tail) {
-		m = rte_pktmbuf_prefree_seg((*txq->elts)[elts_free++ & elts_m]);
-		if (likely(m != NULL)) {
-			if (likely(m->pool == pool)) {
-				free[blk_n++] = m;
-			} else {
-				if (likely(pool != NULL))
-					rte_mempool_put_bulk(pool,
-							     (void *)free,
-							     blk_n);
-				free[0] = m;
-				pool = m->pool;
-				blk_n = 1;
-			}
-		}
-	}
-	if (blk_n)
-		rte_mempool_put_bulk(pool, (void *)free, blk_n);
-#ifndef NDEBUG
-	elts_free = txq->elts_tail;
-	/* Poisoning. */
-	while (elts_free != elts_tail) {
-		memset(&(*txq->elts)[elts_free & elts_m],
-		       0x66,
-		       sizeof((*txq->elts)[elts_free & elts_m]));
-		++elts_free;
-	}
-#endif
-	txq->elts_tail = elts_tail;
-}
-
-/**
  * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which the
  * cloned mbuf is allocated is returned instead.
  *
@@ -710,147 +580,4 @@ enum mlx5_cqe_status {
 	mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
 }
 
-/**
- * Convert mbuf to Verb SWP.
- *
- * @param txq_data
- *   Pointer to the Tx queue.
- * @param buf
- *   Pointer to the mbuf.
- * @param offsets
- *   Pointer to the SWP header offsets.
- * @param swp_types
- *   Pointer to the SWP header types.
- */
-static __rte_always_inline void
-txq_mbuf_to_swp(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
-		uint8_t *offsets, uint8_t *swp_types)
-{
-	const uint64_t vlan = buf->ol_flags & PKT_TX_VLAN_PKT;
-	const uint64_t tunnel = buf->ol_flags & PKT_TX_TUNNEL_MASK;
-	const uint64_t tso = buf->ol_flags & PKT_TX_TCP_SEG;
-	const uint64_t csum_flags = buf->ol_flags & PKT_TX_L4_MASK;
-	const uint64_t inner_ip =
-		buf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6);
-	const uint64_t ol_flags_mask = PKT_TX_L4_MASK | PKT_TX_IPV6 |
-				       PKT_TX_OUTER_IPV6;
-	uint16_t idx;
-	uint16_t off;
-
-	if (likely(!txq->swp_en || (tunnel != PKT_TX_TUNNEL_UDP &&
-				    tunnel != PKT_TX_TUNNEL_IP)))
-		return;
-	/*
-	 * The index should have:
-	 * bit[0:1] = PKT_TX_L4_MASK
-	 * bit[4] = PKT_TX_IPV6
-	 * bit[8] = PKT_TX_OUTER_IPV6
-	 * bit[9] = PKT_TX_OUTER_UDP
-	 */
-	idx = (buf->ol_flags & ol_flags_mask) >> 52;
-	if (tunnel == PKT_TX_TUNNEL_UDP)
-		idx |= 1 << 9;
-	*swp_types = mlx5_swp_types_table[idx];
-	/*
-	 * Set offsets for SW parser. Since ConnectX-5, SW parser just
-	 * complements HW parser. SW parser starts to engage only if HW parser
-	 * can't reach a header. For the older devices, HW parser will not kick
-	 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
-	 * should be set regardless of HW offload.
-	 */
-	off = buf->outer_l2_len + (vlan ? sizeof(struct rte_vlan_hdr) : 0);
-	offsets[1] = off >> 1; /* Outer L3 offset. */
-	off += buf->outer_l3_len;
-	if (tunnel == PKT_TX_TUNNEL_UDP)
-		offsets[0] = off >> 1; /* Outer L4 offset. */
-	if (inner_ip) {
-		off += buf->l2_len;
-		offsets[3] = off >> 1; /* Inner L3 offset. */
-		if (csum_flags == PKT_TX_TCP_CKSUM || tso ||
-		    csum_flags == PKT_TX_UDP_CKSUM) {
-			off += buf->l3_len;
-			offsets[2] = off >> 1; /* Inner L4 offset. */
-		}
-	}
-}
-
-/**
- * Convert the Checksum offloads to Verbs.
- *
- * @param buf
- *   Pointer to the mbuf.
- *
- * @return
- *   Converted checksum flags.
- */
-static __rte_always_inline uint8_t
-txq_ol_cksum_to_cs(struct rte_mbuf *buf)
-{
-	uint32_t idx;
-	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
-	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
-				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
-
-	/*
-	 * The index should have:
-	 * bit[0] = PKT_TX_TCP_SEG
-	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
-	 * bit[4] = PKT_TX_IP_CKSUM
-	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
-	 * bit[9] = tunnel
-	 */
-	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
-	return mlx5_cksum_table[idx];
-}
-
-/**
- * Count the number of contiguous single segment packets.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- *
- * @return
- *   Number of contiguous single segment packets.
- */
-static __rte_always_inline unsigned int
-txq_count_contig_single_seg(struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	unsigned int pos;
-
-	if (!pkts_n)
-		return 0;
-	/* Count the number of contiguous single segment packets. */
-	for (pos = 0; pos < pkts_n; ++pos)
-		if (NB_SEGS(pkts[pos]) > 1)
-			break;
-	return pos;
-}
-
-/**
- * Count the number of contiguous multi-segment packets.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- *
- * @return
- *   Number of contiguous multi-segment packets.
- */
-static __rte_always_inline unsigned int
-txq_count_contig_multi_seg(struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	unsigned int pos;
-
-	if (!pkts_n)
-		return 0;
-	/* Count the number of contiguous multi-segment packets. */
-	for (pos = 0; pos < pkts_n; ++pos)
-		if (NB_SEGS(pkts[pos]) == 1)
-			break;
-	return pos;
-}
-
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index 073044f..f6ec828 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -40,138 +40,6 @@
 #endif
 
 /**
- * Count the number of packets having same ol_flags and same metadata (if
- * PKT_TX_METADATA is set in ol_flags), and calculate cs_flags.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- * @param cs_flags
- *   Pointer of flags to be returned.
- * @param metadata
- *   Pointer of metadata to be returned.
- * @param txq_offloads
- *   Offloads enabled on Tx queue
- *
- * @return
- *   Number of packets having same ol_flags and metadata, if relevant.
- */
-static inline unsigned int
-txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags,
-		 rte_be32_t *metadata, const uint64_t txq_offloads)
-{
-	unsigned int pos;
-	const uint64_t cksum_ol_mask =
-		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
-		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
-		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
-	rte_be32_t p0_metadata, pn_metadata;
-
-	if (!pkts_n)
-		return 0;
-	p0_metadata = pkts[0]->ol_flags & PKT_TX_METADATA ?
-			pkts[0]->tx_metadata : 0;
-	/* Count the number of packets having same offload parameters. */
-	for (pos = 1; pos < pkts_n; ++pos) {
-		/* Check if packet has same checksum flags. */
-		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) &&
-		    ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & cksum_ol_mask))
-			break;
-		/* Check if packet has same metadata. */
-		if (txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) {
-			pn_metadata = pkts[pos]->ol_flags & PKT_TX_METADATA ?
-					pkts[pos]->tx_metadata : 0;
-			if (pn_metadata != p0_metadata)
-				break;
-		}
-	}
-	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
-	*metadata = p0_metadata;
-	return pos;
-}
-
-/**
- * DPDK callback for vectorized TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-		      uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint16_t n;
-		uint16_t ret;
-
-		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0, 0);
-		nb_tx += ret;
-		if (!ret)
-			break;
-	}
-	return nb_tx;
-}
-
-/**
- * DPDK callback for vectorized TX with multi-seg packets and offload.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint8_t cs_flags = 0;
-		uint16_t n;
-		uint16_t ret;
-		rte_be32_t metadata = 0;
-
-		/* Transmit multi-seg packets in the head of pkts list. */
-		if ((txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS) &&
-		    NB_SEGS(pkts[nb_tx]) > 1)
-			nb_tx += txq_scatter_v(txq,
-					       &pkts[nb_tx],
-					       pkts_n - nb_tx);
-		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-		if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
-			n = txq_count_contig_single_seg(&pkts[nb_tx], n);
-		if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP |
-				     DEV_TX_OFFLOAD_MATCH_METADATA))
-			n = txq_calc_offload(&pkts[nb_tx], n,
-					     &cs_flags, &metadata,
-					     txq->offloads);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags, metadata);
-		nb_tx += ret;
-		if (!ret)
-			break;
-	}
-	return nb_tx;
-}
-
-/**
  * Skip error packets.
  *
  * @param rxq
@@ -243,49 +111,6 @@
 }
 
 /**
- * Check Tx queue flags are set for raw vectorized Tx.
- *
- * @param dev
- *   Pointer to Ethernet device.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev)
-{
-	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
-
-	/* Doesn't support any offload. */
-	if (offloads)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
- * Check a device can support vectorized TX.
- *
- * @param dev
- *   Pointer to Ethernet device.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-mlx5_check_vec_tx_support(struct rte_eth_dev *dev)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
-
-	if (!priv->config.tx_vec_en ||
-	    priv->txqs_n > (unsigned int)priv->config.txqs_vec ||
-	    priv->config.mps != MLX5_MPW_ENHANCED ||
-	    offloads & ~MLX5_VEC_TX_OFFLOAD_CAP)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
  * Check a RX queue can support vectorized RX.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 1c7e3b4..9930286 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -27,295 +27,6 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 
 /**
- * Fill in buffer descriptors in a multi-packet send descriptor.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param dseg
- *   Pointer to buffer descriptor to be written.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param n
- *   Number of packets to be filled.
- */
-static inline void
-txq_wr_dseg_v(struct mlx5_txq_data *txq, uint8_t *dseg,
-	      struct rte_mbuf **pkts, unsigned int n)
-{
-	unsigned int pos;
-	uintptr_t addr;
-	const uint8x16_t dseg_shuf_m = {
-		 3,  2,  1,  0, /* length, bswap32 */
-		 4,  5,  6,  7, /* lkey */
-		15, 14, 13, 12, /* addr, bswap64 */
-		11, 10,  9,  8
-	};
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	uint32_t tx_byte = 0;
-#endif
-
-	for (pos = 0; pos < n; ++pos, dseg += MLX5_WQE_DWORD_SIZE) {
-		uint8x16_t desc;
-		struct rte_mbuf *pkt = pkts[pos];
-
-		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
-		desc = vreinterpretq_u8_u32((uint32x4_t) {
-				DATA_LEN(pkt),
-				mlx5_tx_mb2mr(txq, pkt),
-				addr,
-				addr >> 32 });
-		desc = vqtbl1q_u8(desc, dseg_shuf_m);
-		vst1q_u8(dseg, desc);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		tx_byte += DATA_LEN(pkt);
-#endif
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.obytes += tx_byte;
-#endif
-}
-
-/**
- * Send multi-segmented packets until it encounters a single segment packet in
- * the pkts list.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	      uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n;
-	volatile struct mlx5_wqe *wqe = NULL;
-	bool metadata_ol =
-		txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? true : false;
-
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	if (unlikely(!pkts_n))
-		return 0;
-	for (n = 0; n < pkts_n; ++n) {
-		struct rte_mbuf *buf = pkts[n];
-		unsigned int segs_n = buf->nb_segs;
-		unsigned int ds = nb_dword_in_hdr;
-		unsigned int len = PKT_LEN(buf);
-		uint16_t wqe_ci = txq->wqe_ci;
-		const uint8x16_t ctrl_shuf_m = {
-			3,  2,  1,  0, /* bswap32 */
-			7,  6,  5,  4, /* bswap32 */
-			11, 10,  9,  8, /* bswap32 */
-			12, 13, 14, 15
-		};
-		uint8_t cs_flags;
-		uint16_t max_elts;
-		uint16_t max_wqe;
-		uint8x16_t *t_wqe;
-		uint8_t *dseg;
-		uint8x16_t ctrl;
-		rte_be32_t metadata =
-			metadata_ol && (buf->ol_flags & PKT_TX_METADATA) ?
-			buf->tx_metadata : 0;
-
-		assert(segs_n);
-		max_elts = elts_n - (elts_head - txq->elts_tail);
-		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
-		/*
-		 * A MPW session consumes 2 WQEs at most to
-		 * include MLX5_MPW_DSEG_MAX pointers.
-		 */
-		if (segs_n == 1 ||
-		    max_elts < segs_n || max_wqe < 2)
-			break;
-		wqe = &((volatile struct mlx5_wqe64 *)
-			 txq->wqes)[wqe_ci & wq_mask].hdr;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Title WQEBB pointer. */
-		t_wqe = (uint8x16_t *)wqe;
-		dseg = (uint8_t *)(wqe + 1);
-		do {
-			if (!(ds++ % nb_dword_per_wqebb)) {
-				dseg = (uint8_t *)
-					&((volatile struct mlx5_wqe64 *)
-					   txq->wqes)[++wqe_ci & wq_mask];
-			}
-			txq_wr_dseg_v(txq, dseg, &buf, 1);
-			dseg += MLX5_WQE_DWORD_SIZE;
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			buf = buf->next;
-		} while (--segs_n);
-		++wqe_ci;
-		/* Fill CTRL in the header. */
-		ctrl = vreinterpretq_u8_u32((uint32x4_t) {
-				MLX5_OPC_MOD_MPW << 24 |
-				txq->wqe_ci << 8 | MLX5_OPCODE_TSO,
-				txq->qp_num_8s | ds, 4, 0});
-		ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
-		vst1q_u8((void *)t_wqe, ctrl);
-		/* Fill ESEG in the header. */
-		vst1q_u32((void *)(t_wqe + 1),
-			  ((uint32x4_t){ 0,
-					 rte_cpu_to_be_16(len) << 16 | cs_flags,
-					 metadata, 0 }));
-		txq->wqe_ci = wqe_ci;
-	}
-	if (!n)
-		return 0;
-	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
-	txq->elts_head = elts_head;
-	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		wqe->ctrl[3] = txq->elts_head;
-		txq->elts_comp = 0;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += n;
-#endif
-	mlx5_tx_dbrec(txq, wqe);
-	return n;
-}
-
-/**
- * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
- * it returns to make it processed by txq_scatter_v(). All the packets in
- * the pkts list should be single segment packets having same offload flags.
- * This must be checked by txq_count_contig_single_seg() and txq_calc_offload().
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
- * @param cs_flags
- *   Checksum offload flags to be written in the descriptor.
- * @param metadata
- *   Metadata value to be written in the descriptor.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint8_t cs_flags, rte_be32_t metadata)
-{
-	struct rte_mbuf **elts;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n = 0;
-	unsigned int pos;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	uint32_t comp_req;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	uint16_t wq_idx = txq->wqe_ci & wq_mask;
-	volatile struct mlx5_wqe64 *wq =
-		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
-	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
-	const uint8x16_t ctrl_shuf_m = {
-		 3,  2,  1,  0, /* bswap32 */
-		 7,  6,  5,  4, /* bswap32 */
-		11, 10,  9,  8, /* bswap32 */
-		12, 13, 14, 15
-	};
-	uint8x16_t *t_wqe;
-	uint8_t *dseg;
-	uint8x16_t ctrl;
-
-	/* Make sure all packets can fit into a single WQE. */
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
-	if (unlikely(!pkts_n))
-		return 0;
-	elts = &(*txq->elts)[elts_head & elts_m];
-	/* Loop for available tailroom first. */
-	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
-	for (pos = 0; pos < (n & -2); pos += 2)
-		vst1q_u64((void *)&elts[pos], vld1q_u64((void *)&pkts[pos]));
-	if (n & 1)
-		elts[pos] = pkts[pos];
-	/* Check if it crosses the end of the queue. */
-	if (unlikely(n < pkts_n)) {
-		elts = &(*txq->elts)[0];
-		for (pos = 0; pos < pkts_n - n; ++pos)
-			elts[pos] = pkts[n + pos];
-	}
-	txq->elts_head += pkts_n;
-	/* Save title WQEBB pointer. */
-	t_wqe = (uint8x16_t *)wqe;
-	dseg = (uint8_t *)(wqe + 1);
-	/* Calculate the number of entries to the end. */
-	n = RTE_MIN(
-		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
-		pkts_n);
-	/* Fill DSEGs. */
-	txq_wr_dseg_v(txq, dseg, pkts, n);
-	/* Check if it crosses the end of the queue. */
-	if (n < pkts_n) {
-		dseg = (uint8_t *)txq->wqes;
-		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
-	}
-	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
-		txq->elts_comp += pkts_n;
-		comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET;
-	} else {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request a completion. */
-		txq->elts_comp = 0;
-		comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET;
-	}
-	/* Fill CTRL in the header. */
-	ctrl = vreinterpretq_u8_u32((uint32x4_t) {
-			MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
-			txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW,
-			txq->qp_num_8s | (pkts_n + 2),
-			comp_req,
-			txq->elts_head });
-	ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
-	vst1q_u8((void *)t_wqe, ctrl);
-	/* Fill ESEG in the header. */
-	vst1q_u32((void *)(t_wqe + 1),
-		 ((uint32x4_t) { 0, cs_flags, metadata, 0 }));
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += pkts_n;
-#endif
-	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
-		       nb_dword_per_wqebb;
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
-	return pkts_n;
-}
-
-/**
  * Store free buffers to RX SW ring.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 503ca0f..7bd254f 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -29,290 +29,6 @@
 #endif
 
 /**
- * Fill in buffer descriptors in a multi-packet send descriptor.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param dseg
- *   Pointer to buffer descriptor to be written.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param n
- *   Number of packets to be filled.
- */
-static inline void
-txq_wr_dseg_v(struct mlx5_txq_data *txq, __m128i *dseg,
-	      struct rte_mbuf **pkts, unsigned int n)
-{
-	unsigned int pos;
-	uintptr_t addr;
-	const __m128i shuf_mask_dseg =
-		_mm_set_epi8(8,  9, 10, 11, /* addr, bswap64 */
-			    12, 13, 14, 15,
-			     7,  6,  5,  4, /* lkey */
-			     0,  1,  2,  3  /* length, bswap32 */);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	uint32_t tx_byte = 0;
-#endif
-
-	for (pos = 0; pos < n; ++pos, ++dseg) {
-		__m128i desc;
-		struct rte_mbuf *pkt = pkts[pos];
-
-		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
-		desc = _mm_set_epi32(addr >> 32,
-				     addr,
-				     mlx5_tx_mb2mr(txq, pkt),
-				     DATA_LEN(pkt));
-		desc = _mm_shuffle_epi8(desc, shuf_mask_dseg);
-		_mm_store_si128(dseg, desc);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		tx_byte += DATA_LEN(pkt);
-#endif
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.obytes += tx_byte;
-#endif
-}
-
-/**
- * Send multi-segmented packets until it encounters a single segment packet in
- * the pkts list.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	      uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n;
-	volatile struct mlx5_wqe *wqe = NULL;
-	bool metadata_ol =
-		txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? true : false;
-
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	if (unlikely(!pkts_n))
-		return 0;
-	for (n = 0; n < pkts_n; ++n) {
-		struct rte_mbuf *buf = pkts[n];
-		unsigned int segs_n = buf->nb_segs;
-		unsigned int ds = nb_dword_in_hdr;
-		unsigned int len = PKT_LEN(buf);
-		uint16_t wqe_ci = txq->wqe_ci;
-		const __m128i shuf_mask_ctrl =
-			_mm_set_epi8(15, 14, 13, 12,
-				      8,  9, 10, 11, /* bswap32 */
-				      4,  5,  6,  7, /* bswap32 */
-				      0,  1,  2,  3  /* bswap32 */);
-		uint8_t cs_flags;
-		uint16_t max_elts;
-		uint16_t max_wqe;
-		__m128i *t_wqe, *dseg;
-		__m128i ctrl;
-		rte_be32_t metadata =
-			metadata_ol && (buf->ol_flags & PKT_TX_METADATA) ?
-			buf->tx_metadata : 0;
-
-		assert(segs_n);
-		max_elts = elts_n - (elts_head - txq->elts_tail);
-		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
-		/*
-		 * A MPW session consumes 2 WQEs at most to
-		 * include MLX5_MPW_DSEG_MAX pointers.
-		 */
-		if (segs_n == 1 ||
-		    max_elts < segs_n || max_wqe < 2)
-			break;
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		wqe = &((volatile struct mlx5_wqe64 *)
-			 txq->wqes)[wqe_ci & wq_mask].hdr;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Title WQEBB pointer. */
-		t_wqe = (__m128i *)wqe;
-		dseg = (__m128i *)(wqe + 1);
-		do {
-			if (!(ds++ % nb_dword_per_wqebb)) {
-				dseg = (__m128i *)
-					&((volatile struct mlx5_wqe64 *)
-					   txq->wqes)[++wqe_ci & wq_mask];
-			}
-			txq_wr_dseg_v(txq, dseg++, &buf, 1);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			buf = buf->next;
-		} while (--segs_n);
-		++wqe_ci;
-		/* Fill CTRL in the header. */
-		ctrl = _mm_set_epi32(0, 4, txq->qp_num_8s | ds,
-				     MLX5_OPC_MOD_MPW << 24 |
-				     txq->wqe_ci << 8 | MLX5_OPCODE_TSO);
-		ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
-		_mm_store_si128(t_wqe, ctrl);
-		/* Fill ESEG in the header. */
-		_mm_store_si128(t_wqe + 1,
-				_mm_set_epi32(0, metadata,
-					      (rte_cpu_to_be_16(len) << 16) |
-					      cs_flags, 0));
-		txq->wqe_ci = wqe_ci;
-	}
-	if (!n)
-		return 0;
-	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
-	txq->elts_head = elts_head;
-	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		wqe->ctrl[3] = txq->elts_head;
-		txq->elts_comp = 0;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += n;
-#endif
-	mlx5_tx_dbrec(txq, wqe);
-	return n;
-}
-
-/**
- * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
- * it returns to make it processed by txq_scatter_v(). All the packets in
- * the pkts list should be single segment packets having same offload flags.
- * This must be checked by txq_count_contig_single_seg() and txq_calc_offload().
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
- * @param cs_flags
- *   Checksum offload flags to be written in the descriptor.
- * @param metadata
- *   Metadata value to be written in the descriptor.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint8_t cs_flags, rte_be32_t metadata)
-{
-	struct rte_mbuf **elts;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n = 0;
-	unsigned int pos;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	uint32_t comp_req;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	uint16_t wq_idx = txq->wqe_ci & wq_mask;
-	volatile struct mlx5_wqe64 *wq =
-		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
-	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
-	const __m128i shuf_mask_ctrl =
-		_mm_set_epi8(15, 14, 13, 12,
-			      8,  9, 10, 11, /* bswap32 */
-			      4,  5,  6,  7, /* bswap32 */
-			      0,  1,  2,  3  /* bswap32 */);
-	__m128i *t_wqe, *dseg;
-	__m128i ctrl;
-
-	/* Make sure all packets can fit into a single WQE. */
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
-	assert(pkts_n <= MLX5_DSEG_MAX - nb_dword_in_hdr);
-	if (unlikely(!pkts_n))
-		return 0;
-	elts = &(*txq->elts)[elts_head & elts_m];
-	/* Loop for available tailroom first. */
-	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
-	for (pos = 0; pos < (n & -2); pos += 2)
-		_mm_storeu_si128((__m128i *)&elts[pos],
-				 _mm_loadu_si128((__m128i *)&pkts[pos]));
-	if (n & 1)
-		elts[pos] = pkts[pos];
-	/* Check if it crosses the end of the queue. */
-	if (unlikely(n < pkts_n)) {
-		elts = &(*txq->elts)[0];
-		for (pos = 0; pos < pkts_n - n; ++pos)
-			elts[pos] = pkts[n + pos];
-	}
-	txq->elts_head += pkts_n;
-	/* Save title WQEBB pointer. */
-	t_wqe = (__m128i *)wqe;
-	dseg = (__m128i *)(wqe + 1);
-	/* Calculate the number of entries to the end. */
-	n = RTE_MIN(
-		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
-		pkts_n);
-	/* Fill DSEGs. */
-	txq_wr_dseg_v(txq, dseg, pkts, n);
-	/* Check if it crosses the end of the queue. */
-	if (n < pkts_n) {
-		dseg = (__m128i *)txq->wqes;
-		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
-	}
-	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
-		txq->elts_comp += pkts_n;
-		comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET;
-	} else {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request a completion. */
-		txq->elts_comp = 0;
-		comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET;
-	}
-	/* Fill CTRL in the header. */
-	ctrl = _mm_set_epi32(txq->elts_head, comp_req,
-			     txq->qp_num_8s | (pkts_n + 2),
-			     MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
-				txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW);
-	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
-	_mm_store_si128(t_wqe, ctrl);
-	/* Fill ESEG in the header. */
-	_mm_store_si128(t_wqe + 1, _mm_set_epi32(0, metadata, cs_flags, 0));
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += pkts_n;
-#endif
-	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
-		       nb_dword_per_wqebb;
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
-	return pkts_n;
-}
-
-/**
  * Store free buffers to RX SW ring.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 7741095..0d2dbfa 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -365,25 +365,6 @@
 }
 
 /**
- * Check if the burst function is using eMPW.
- *
- * @param tx_pkt_burst
- *   Tx burst function pointer.
- *
- * @return
- *   1 if the burst function is using eMPW, 0 otherwise.
- */
-static int
-is_empw_burst_func(eth_tx_burst_t tx_pkt_burst)
-{
-	if (tx_pkt_burst == mlx5_tx_burst_raw_vec ||
-	    tx_pkt_burst == mlx5_tx_burst_vec ||
-	    tx_pkt_burst == mlx5_tx_burst_empw)
-		return 1;
-	return 0;
-}
-
-/**
  * Create the Tx queue Verbs object.
  *
  * @param dev
@@ -414,7 +395,6 @@ struct mlx5_txq_ibv *
 	struct mlx5dv_cq cq_info;
 	struct mlx5dv_obj obj;
 	const int desc = 1 << txq_data->elts_n;
-	eth_tx_burst_t tx_pkt_burst = mlx5_select_tx_function(dev);
 	int ret = 0;
 
 	assert(txq_data);
@@ -432,8 +412,6 @@ struct mlx5_txq_ibv *
 		.comp_mask = 0,
 	};
 	cqe_n = desc / MLX5_TX_COMP_THRESH + 1;
-	if (is_empw_burst_func(tx_pkt_burst))
-		cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV;
 	tmpl.cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0);
 	if (tmpl.cq == NULL) {
 		DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure",
@@ -698,93 +676,7 @@ struct mlx5_txq_ibv *
 static void
 txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
 {
-	struct mlx5_priv *priv = txq_ctrl->priv;
-	struct mlx5_dev_config *config = &priv->config;
-	const unsigned int max_tso_inline =
-		((MLX5_MAX_TSO_HEADER + (RTE_CACHE_LINE_SIZE - 1)) /
-		 RTE_CACHE_LINE_SIZE);
-	unsigned int txq_inline;
-	unsigned int txqs_inline;
-	unsigned int inline_max_packet_sz;
-	eth_tx_burst_t tx_pkt_burst =
-		mlx5_select_tx_function(ETH_DEV(priv));
-	int is_empw_func = is_empw_burst_func(tx_pkt_burst);
-	int tso = !!(txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO |
-					       DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
-					       DEV_TX_OFFLOAD_GRE_TNL_TSO |
-					       DEV_TX_OFFLOAD_IP_TNL_TSO |
-					       DEV_TX_OFFLOAD_UDP_TNL_TSO));
-
-	txq_inline = (config->txq_inline == MLX5_ARG_UNSET) ?
-		0 : config->txq_inline;
-	txqs_inline = (config->txqs_inline == MLX5_ARG_UNSET) ?
-		0 : config->txqs_inline;
-	inline_max_packet_sz =
-		(config->inline_max_packet_sz == MLX5_ARG_UNSET) ?
-		0 : config->inline_max_packet_sz;
-	if (is_empw_func) {
-		if (config->txq_inline == MLX5_ARG_UNSET)
-			txq_inline = MLX5_WQE_SIZE_MAX - MLX5_WQE_SIZE;
-		if (config->txqs_inline == MLX5_ARG_UNSET)
-			txqs_inline = MLX5_EMPW_MIN_TXQS;
-		if (config->inline_max_packet_sz == MLX5_ARG_UNSET)
-			inline_max_packet_sz = MLX5_EMPW_MAX_INLINE_LEN;
-		txq_ctrl->txq.mpw_hdr_dseg = config->mpw_hdr_dseg;
-		txq_ctrl->txq.inline_max_packet_sz = inline_max_packet_sz;
-	}
-	if (txq_inline && priv->txqs_n >= txqs_inline) {
-		unsigned int ds_cnt;
-
-		txq_ctrl->txq.max_inline =
-			((txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
-			 RTE_CACHE_LINE_SIZE);
-		if (is_empw_func) {
-			/* To minimize the size of data set, avoid requesting
-			 * too large WQ.
-			 */
-			txq_ctrl->max_inline_data =
-				((RTE_MIN(txq_inline,
-					  inline_max_packet_sz) +
-				  (RTE_CACHE_LINE_SIZE - 1)) /
-				 RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
-		} else {
-			txq_ctrl->max_inline_data =
-				txq_ctrl->txq.max_inline * RTE_CACHE_LINE_SIZE;
-		}
-		/*
-		 * Check if the inline size is too large in a way which
-		 * can make the WQE DS to overflow.
-		 * Considering in calculation:
-		 *      WQE CTRL (1 DS)
-		 *      WQE ETH  (1 DS)
-		 *      Inline part (N DS)
-		 */
-		ds_cnt = 2 + (txq_ctrl->txq.max_inline / MLX5_WQE_DWORD_SIZE);
-		if (ds_cnt > MLX5_DSEG_MAX) {
-			unsigned int max_inline = (MLX5_DSEG_MAX - 2) *
-						  MLX5_WQE_DWORD_SIZE;
-
-			max_inline = max_inline - (max_inline %
-						   RTE_CACHE_LINE_SIZE);
-			DRV_LOG(WARNING,
-				"port %u txq inline is too large (%d) setting"
-				" it to the maximum possible: %d\n",
-				PORT_ID(priv), txq_inline, max_inline);
-			txq_ctrl->txq.max_inline = max_inline /
-						   RTE_CACHE_LINE_SIZE;
-		}
-	}
-	if (tso) {
-		txq_ctrl->max_tso_header = max_tso_inline * RTE_CACHE_LINE_SIZE;
-		txq_ctrl->txq.max_inline = RTE_MAX(txq_ctrl->txq.max_inline,
-						   max_tso_inline);
-		txq_ctrl->txq.tso_en = 1;
-	}
-	txq_ctrl->txq.tunnel_en = config->tunnel_en | config->swp;
-	txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO |
-				 DEV_TX_OFFLOAD_UDP_TNL_TSO |
-				 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) &
-				txq_ctrl->txq.offloads) && config->swp;
+	(void)txq_ctrl;
 }
 
 /**
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH 2/7] net/mlx5: add Tx datapath related devargs
  2019-07-04 16:29 [dpdk-dev] [PATCH 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 1/7] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
@ 2019-07-04 16:29 ` Viacheslav Ovsiienko
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 3/7] net/mlx5: update Tx datapath definitions Viacheslav Ovsiienko
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-04 16:29 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch introduces new mlx5 PMD devarg options:

- txq_inline_min - specifies minimal amount of data to be inlined into
  WQE during Tx operations. NICs may require this minimal data amount
  to operate correctly. The exact value may depend on NIC operation mode,
  requested offloads, etc.

- txq_inline_max - specifies the maximal packet length to be completely
  inlined into WQE Ethernet Segment for ordinary SEND method. If packet
  is larger than the specified value, the packet data won't be copied by the
  driver at all, data buffer is addressed with a pointer. If packet length
  is less than or equal to it, all packet data will be copied into WQE.

- txq_inline_mpw - specifies the maximal packet length to be completely
  inlined into WQE for Enhanced MPW method.

Driver documentation is also updated.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 doc/guides/nics/mlx5.rst | 168 +++++++++++++++++++++++++----------------------
 drivers/net/mlx5/mlx5.c  |  21 ++++++
 drivers/net/mlx5/mlx5.h  |   4 ++
 3 files changed, 113 insertions(+), 80 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 4cd6cf8..3f2a344 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -346,99 +346,107 @@ Run-time configuration
   greater or equal to this value. The default value is 12, valid only if
   ``mprq_en`` is set.
 
-- ``txq_inline`` parameter [int]
-
-  Amount of data to be inlined during TX operations. Improves latency.
-  Can improve PPS performance when PCI back pressure is detected and may be
-  useful for scenarios involving heavy traffic on many queues.
-
-  Because additional software logic is necessary to handle this mode, this
-  option should be used with care, as it can lower performance when back
-  pressure is not expected.
+- ``txq_inline_min`` parameter [int]
+
+  Minimal amount of data to be inlined into WQE during Tx operations. NICs
+  may require this minimal data amount to operate correctly. The exact value
+  may depend on NIC operation mode, requested offloads, etc.
+
+  If ``txq_inline_min`` key is present the specified value (may be aligned
+  by the driver in order not to exceed the limits and provide better descriptor
+  space utilization) will be used by the driver and it is guaranteed the
+  requested data bytes are inlined into the WQE beside other inline settings.
+
+  If ``txq_inline_min`` key is not present the value may be queried by the
+  driver from the NIC via DevX if this feature is available. If there is no DevX
+  enabled/supported the value 18 (supposing L2 header including VLAN) is set
+  for ConnectX-4, value 58 (supposing L2-L4 headers, required by configurations
+  over E-Switch) is set for ConnectX-4 Lx, and 0 is set by default for ConnectX-5
+  and newer NICs. If packet is shorter than the ``txq_inline_min`` value, the entire
+  packet is inlined.
+
+  Please note that this minimal data inlining disengages the eMPW feature (Enhanced
+  Multi-Packet Write), because the latter does not support partial packet inlining.
+  This is not very critical because minimal data inlining is mostly required
+  by ConnectX-4 and ConnectX-4 Lx, and these NICs do not support the eMPW feature.
 
 - ``txqs_min_inline`` parameter [int]
 
-  Enable inline send only when the number of TX queues is greater or equal
+  Enable inline data send only when the number of TX queues is greater or equal
   to this value.
 
-  This option should be used in combination with ``txq_inline`` above.
-
-  On ConnectX-4, ConnectX-4 LX, ConnectX-5, ConnectX-6 and BlueField without
-  Enhanced MPW:
-
-        - Disabled by default.
-        - In case ``txq_inline`` is set recommendation is 4.
+  This option should be used in combination with ``txq_inline_max`` and
+  ``txq_inline_mpw`` below and does not affect ``txq_inline_min`` settings above.
 
-  On ConnectX-5, ConnectX-6 and BlueField with Enhanced MPW:
+  If this option is not specified the default value 8 is used.
 
-        - Set to 8 by default.
+  The data inlining consumes the CPU cycles, so this option is intended to
+  auto enable inline data if we have enough Tx queues, which means we have
+  enough CPU cores and PCI bandwidth is getting more critical and CPU
+  is not supposed to be bottleneck anymore.
 
-- ``txqs_max_vec`` parameter [int]
+  The copying data into WQE improves latency and can improve PPS performance
+  when PCI back pressure is detected and may be useful for scenarios involving
+  heavy traffic on many queues.
 
-  Enable vectorized Tx only when the number of TX queues is less than or
-  equal to this value. Effective only when ``tx_vec_en`` is enabled.
-
-  On ConnectX-5:
-
-        - Set to 8 by default on ARMv8.
-        - Set to 4 by default otherwise.
-
-  On BlueField
+  Because additional software logic is necessary to handle this mode, this
+  option should be used with care, as it may lower performance when back
+  pressure is not expected.
 
-        - Set to 16 by default.
+- ``txq_inline_max`` parameter [int]
+
+  Specifies the maximal packet length to be completely inlined into WQE
+  Ethernet Segment for ordinary SEND method. If packet is larger than the specified
+  value, the packet data won't be copied by the driver at all, data buffer
+  is addressed with a pointer. If packet length is less than or equal to it, all packet
+  data will be copied into WQE. This may improve PCI bandwidth utilization for
+  short packets significantly but requires the extra CPU cycles.
+
+  The data inline feature is controlled by number of Tx queues, if number of Tx
+  queues is larger than ``txqs_min_inline`` key parameter, the inline feature
+  is engaged, if there are not enough Tx queues (which means not enough CPU cores
+  and CPU resources are scarce), data inline is not performed by the driver.
+  Assigning ``txqs_min_inline`` with zero always enables the data inline.
+
+  The default ``txq_inline_max`` value is 290. The specified value may be adjusted
+  by the driver in order not to exceed the limit (930 bytes) and to provide better
+  WQE space filling without gaps, the adjustment is reflected in the debug log.
+
+- ``txq_inline_mpw`` parameter [int]
+
+  Specifies the maximal packet length to be completely inlined into WQE for
+  Enhanced MPW method. If packet is larger than the specified value, the packet data
+  won't be copied, and data buffer is addressed with pointer. If packet length
+  is less than or equal to it, all packet data will be copied into WQE. This may improve PCI
+  bandwidth utilization for short packets significantly but requires the extra
+  CPU cycles.
+
+  The data inline feature is controlled by number of TX queues, if number of Tx
+  queues is larger than ``txqs_min_inline`` key parameter, the inline feature
+  is engaged, if there are not enough Tx queues (which means not enough CPU cores
+  and CPU resources are scarce), data inline is not performed by the driver.
+  Assigning ``txqs_min_inline`` with zero always enables the data inline.
+
+  The default ``txq_inline_mpw`` value is 188. The specified value may be adjusted
+  by the driver in order not to exceed the limit (930 bytes) and to provide better
+  WQE space filling without gaps, the adjustment is reflected in the debug log.
+  Because multiple packets may be included in the same WQE with the Enhanced Multi
+  Packet Write method and the overall WQE size is limited, it is not recommended to
+  specify large values for the ``txq_inline_mpw``.
 
 - ``txq_mpw_en`` parameter [int]
 
-  A nonzero value enables multi-packet send (MPS) for ConnectX-4 Lx and
-  enhanced multi-packet send (Enhanced MPS) for ConnectX-5, ConnectX-6 and BlueField.
-  MPS allows the TX burst function to pack up multiple packets in a
-  single descriptor session in order to save PCI bandwidth and improve
-  performance at the cost of a slightly higher CPU usage. When
-  ``txq_inline`` is set along with ``txq_mpw_en``, TX burst function tries
-  to copy entire packet data on to TX descriptor instead of including
-  pointer of packet only if there is enough room remained in the
-  descriptor. ``txq_inline`` sets per-descriptor space for either pointers
-  or inlined packets. In addition, Enhanced MPS supports hybrid mode -
-  mixing inlined packets and pointers in the same descriptor.
-
-  This option cannot be used with certain offloads such as ``DEV_TX_OFFLOAD_TCP_TSO,
-  DEV_TX_OFFLOAD_VXLAN_TNL_TSO, DEV_TX_OFFLOAD_GRE_TNL_TSO, DEV_TX_OFFLOAD_VLAN_INSERT``.
-  When those offloads are requested the MPS send function will not be used.
-
-  It is currently only supported on the ConnectX-4 Lx, ConnectX-5, ConnectX-6 and BlueField
-  families of adapters.
-  On ConnectX-4 Lx the MPW is considered un-secure hence disabled by default.
-  Users which enable the MPW should be aware that application which provides incorrect
-  mbuf descriptors in the Tx burst can lead to serious errors in the host including, on some cases,
-  NIC to get stuck.
-  On ConnectX-5, ConnectX-6 and BlueField the MPW is secure and enabled by default.
-
-- ``txq_mpw_hdr_dseg_en`` parameter [int]
-
-  A nonzero value enables including two pointers in the first block of TX
-  descriptor. This can be used to lessen CPU load for memory copy.
-
-  Effective only when Enhanced MPS is supported. Disabled by default.
-
-- ``txq_max_inline_len`` parameter [int]
-
-  Maximum size of packet to be inlined. This limits the size of packet to
-  be inlined. If the size of a packet is larger than configured value, the
-  packet isn't inlined even though there's enough space remained in the
-  descriptor. Instead, the packet is included with pointer.
-
-  Effective only when Enhanced MPS is supported. The default value is 256.
-
-- ``tx_vec_en`` parameter [int]
-
-  A nonzero value enables Tx vector on ConnectX-5, ConnectX-6 and BlueField NICs if the number of
-  global Tx queues on the port is less than ``txqs_max_vec``.
-
-  This option cannot be used with certain offloads such as ``DEV_TX_OFFLOAD_TCP_TSO,
-  DEV_TX_OFFLOAD_VXLAN_TNL_TSO, DEV_TX_OFFLOAD_GRE_TNL_TSO, DEV_TX_OFFLOAD_VLAN_INSERT``.
-  When those offloads are requested the MPS send function will not be used.
-
-  Enabled by default on ConnectX-5, ConnectX-6 and BlueField.
+  A nonzero value enables Enhanced Multi-Packet Write (eMPW) for ConnectX-5,
+  ConnectX-6 and BlueField. eMPW allows the TX burst function to pack up multiple
+  packets in a single descriptor session in order to save PCI bandwidth and improve
+  performance at the cost of a slightly higher CPU usage. When ``txq_inline_mpw``
+  is set along with ``txq_mpw_en``, TX burst function copies entire packet
+  data on to TX descriptor instead of including pointer of packet.
+
+  The Enhanced Multi-Packet Write feature is enabled by default if the NIC supports
+  it, and can be disabled by explicitly specifying 0 for the ``txq_mpw_en`` option.
+  Also, if minimal data inlining is requested by non-zero ``txq_inline_min``
+  option or reported by the NIC, the eMPW feature is disengaged.
 
 - ``rx_vec_en`` parameter [int]
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 42b36a7..e803f08 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -68,6 +68,15 @@
 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
 
+/* Device parameter to limit packet size to inline with ordinary SEND. */
+#define MLX5_TXQ_INLINE_MAX "txq_inline_max"
+
+/* Device parameter to configure minimal data size to inline. */
+#define MLX5_TXQ_INLINE_MIN "txq_inline_min"
+
+/* Device parameter to limit packet size to inline with Enhanced MPW. */
+#define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"
+
 /*
  * Device parameter to configure the number of TX queues threshold for
  * enabling inline send.
@@ -884,6 +893,12 @@ struct mlx5_dev_spawn_data {
 		config->mprq.max_memcpy_len = tmp;
 	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
 		config->mprq.min_rxqs_num = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
+		config->txq_inline_max = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
+		config->txq_inline_min = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
+		config->txq_inline_mpw = tmp;
 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
 		config->txqs_inline = tmp;
 	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
@@ -932,6 +947,9 @@ struct mlx5_dev_spawn_data {
 		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
 		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
 		MLX5_RXQS_MIN_MPRQ,
+		MLX5_TXQ_INLINE_MIN,
+		MLX5_TXQ_INLINE_MAX,
+		MLX5_TXQ_INLINE_MPW,
 		MLX5_TXQS_MIN_INLINE,
 		MLX5_TXQ_MPW_EN,
 		MLX5_RX_VEC_EN,
@@ -1882,6 +1900,9 @@ struct mlx5_dev_spawn_data {
 		.hw_padding = 0,
 		.mps = MLX5_ARG_UNSET,
 		.rx_vec_en = 1,
+		.txq_inline_max = MLX5_ARG_UNSET,
+		.txq_inline_min = MLX5_ARG_UNSET,
+		.txq_inline_mpw = MLX5_ARG_UNSET,
 		.txqs_inline = MLX5_ARG_UNSET,
 		.vf_nl_en = 1,
 		.mr_ext_memseg_en = 1,
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 47c9e47..ac7ea1d 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -188,6 +188,7 @@ struct mlx5_dev_config {
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
+	unsigned int tx_inline:1; /* Engage TX data inlining. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
 	unsigned int mr_ext_memseg_en:1;
 	/* Whether memseg should be extended for MR creation. */
@@ -213,6 +214,9 @@ struct mlx5_dev_config {
 	unsigned int ind_table_max_size; /* Maximum indirection table size. */
 	unsigned int max_dump_files_num; /* Maximum dump files per queue. */
 	int txqs_inline; /* Queue number threshold for inlining. */
+	int txq_inline_min; /* Minimal amount of data bytes to inline. */
+	int txq_inline_max; /* Max packet size for inlining with SEND. */
+	int txq_inline_mpw; /* Max packet size for inlining with eMPW. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 };
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH 3/7] net/mlx5: update Tx datapath definitions
  2019-07-04 16:29 [dpdk-dev] [PATCH 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 1/7] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 2/7] net/mlx5: add Tx datapath related devargs Viacheslav Ovsiienko
@ 2019-07-04 16:29 ` Viacheslav Ovsiienko
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 4/7] net/mlx5: add Tx datapath configuration and setup Viacheslav Ovsiienko
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-04 16:29 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch updates Tx datapath definitions, mostly hardware related.
The Tx descriptor structures are redefined with required fields,
size definitions are renamed to reflect the meanings in more
appropriate way. This is a preparation step before introducing
the new Tx datapath implementation.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_defs.h |   2 +-
 drivers/net/mlx5/mlx5_prm.h  | 163 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 151 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 6861304..873a595 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -58,7 +58,7 @@
 #define MLX5_MAX_XSTATS 32
 
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
-#define MLX5_MAX_TSO_HEADER 192
+#define MLX5_MAX_TSO_HEADER (128u + 34u)
 
 /* Threshold of buffer replenishment for vectorized Rx. */
 #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 3589f51..471a3e3 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -39,14 +39,84 @@
 /* Invalidate a CQE. */
 #define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
 
-/* WQE DWORD size */
-#define MLX5_WQE_DWORD_SIZE 16
-
-/* WQE size */
-#define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
+/* WQE Segment sizes in bytes. */
+#define MLX5_WSEG_SIZE 16u
+#define MLX5_WQE_CSEG_SIZE sizeof(struct mlx5_wqe_cseg)
+#define MLX5_WQE_DSEG_SIZE sizeof(struct mlx5_wqe_dseg)
+#define MLX5_WQE_ESEG_SIZE sizeof(struct mlx5_wqe_eseg)
+
+/* WQE/WQEBB size in bytes. */
+#define MLX5_WQE_SIZE sizeof(struct mlx5_wqe)
+
+/*
+ * Max size of a WQE session.
+ * Absolute maximum size is 63 (MLX5_DSEG_MAX) segments,
+ * the WQE size field in Control Segment is 6 bits wide.
+ */
+#define MLX5_WQE_SIZE_MAX (60 * MLX5_WSEG_SIZE)
+
+/*
+ * Default minimum number of Tx queues for inlining packets.
+ * If there are fewer queues than specified we assume we do
+ * not have enough CPU resources (cycles) to perform inlining,
+ * the PCIe throughput is not supposed to be the bottleneck and
+ * inlining is disabled.
+ */
+#define MLX5_EMPW_MIN_TXQS 8u
+
+/*
+ * Default packet length threshold to be inlined with
+ * enhanced MPW. If packet length exceeds the threshold
+ * the data are not inlined. Should be aligned in WQEBB
+ * boundary with accounting the title Control and Ethernet
+ * segments.
+ */
+#define MLX5_EMPW_DEF_INLINE_LEN (3U * MLX5_WQE_SIZE + \
+				  MLX5_DSEG_MIN_INLINE_SIZE - \
+				  MLX5_WQE_DSEG_SIZE)
+/*
+ * Maximal inline data length sent with enhanced MPW.
+ * Is based on maximal WQE size.
+ */
+#define MLX5_EMPW_MAX_INLINE_LEN (MLX5_WQE_SIZE_MAX - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE + \
+				  MLX5_DSEG_MIN_INLINE_SIZE)
+/*
+ * Minimal amount of packets to be sent with EMPW.
+ * This limits the minimal required size of sent EMPW.
+ * If there are not enough resources to build a minimal
+ * EMPW the sending loop exits.
+ */
+#define MLX5_EMPW_MIN_PACKETS 6
+#define MLX5_EMPW_MAX_PACKETS ((MLX5_WQE_SIZE_MAX - \
+				MLX5_WQE_CSEG_SIZE - \
+				MLX5_WQE_ESEG_SIZE) / \
+				MLX5_WSEG_SIZE)
+/*
+ * Default packet length threshold to be inlined with
+ * ordinary SEND. Inlining saves the MR key search
+ * and extra PCIe data fetch transaction, but eats the
+ * CPU cycles.
+ */
+#define MLX5_SEND_DEF_INLINE_LEN (5U * MLX5_WQE_SIZE + \
+				  MLX5_ESEG_MIN_INLINE_SIZE - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE)
+/*
+ * Maximal inline data length sent with ordinary SEND.
+ * Is based on maximal WQE size.
+ */
+#define MLX5_SEND_MAX_INLINE_LEN (MLX5_WQE_SIZE_MAX - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE + \
+				  MLX5_ESEG_MIN_INLINE_SIZE)
 
-#define MLX5_OPC_MOD_ENHANCED_MPSW 0
-#define MLX5_OPCODE_ENHANCED_MPSW 0x29
+/* Missing in mlx5dv.h, so it is defined here. */
+#define MLX5_OPCODE_ENHANCED_MPSW 0x29u
 
 /* CQE value to inform that VLAN is stripped. */
 #define MLX5_CQE_VLAN_STRIPPED (1u << 0)
@@ -114,6 +184,12 @@
 /* Inner L3 type is IPV6. */
 #define MLX5_ETH_WQE_L3_INNER_IPV6 (1u << 0)
 
+/* VLAN insertion flag. */
+#define MLX5_ETH_WQE_VLAN_INSERT (1u << 31)
+
+/* Data inline segment flag. */
+#define MLX5_ETH_WQE_DATA_INLINE (1u << 31)
+
 /* Is flow mark valid. */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 #define MLX5_FLOW_MARK_IS_VALID(val) ((val) & 0xffffff00)
@@ -130,12 +206,21 @@
 /* Default mark value used when none is provided. */
 #define MLX5_FLOW_MARK_DEFAULT 0xffffff
 
-/* Maximum number of DS in WQE. */
+/* Maximum number of DS in WQE. Limited by 6-bit field. */
 #define MLX5_DSEG_MAX 63
 
 /* The completion mode offset in the WQE control segment line 2. */
 #define MLX5_COMP_MODE_OFFSET 2
 
+/* Amount of data bytes in minimal inline data segment. */
+#define MLX5_DSEG_MIN_INLINE_SIZE 12
+
+/* Amount of data bytes in minimal inline eth segment. */
+#define MLX5_ESEG_MIN_INLINE_SIZE 18
+
+/* Amount of data bytes after eth data segment. */
+#define MLX5_ESEG_EXTRA_DATA_SIZE 32
+
 /* Completion mode. */
 enum mlx5_completion_mode {
 	MLX5_COMP_ONLY_ERR = 0x0,
@@ -144,11 +229,6 @@ enum mlx5_completion_mode {
 	MLX5_COMP_CQE_AND_EQE = 0x3,
 };
 
-/* Small common part of the WQE. */
-struct mlx5_wqe {
-	uint32_t ctrl[4];
-};
-
 /* MPW mode. */
 enum mlx5_mpw_mode {
 	MLX5_MPW_DISABLED,
@@ -156,6 +236,63 @@ enum mlx5_mpw_mode {
 	MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2. */
 };
 
+/* WQE Control segment. */
+struct mlx5_wqe_cseg {
+	uint32_t opcode;
+	uint32_t sq_ds;
+	uint32_t flags;
+	uint32_t misc;
+} __rte_packed __rte_aligned(MLX5_WSEG_SIZE);
+
+/* Header of data segment. Minimal size Data Segment */
+struct mlx5_wqe_dseg {
+	uint32_t bcount;
+	union {
+		uint8_t inline_data[MLX5_DSEG_MIN_INLINE_SIZE];
+		struct {
+			uint32_t lkey;
+			uint64_t pbuf;
+		} __rte_packed;
+	};
+} __rte_packed;
+
+/* Subset of struct WQE Ethernet Segment. */
+struct mlx5_wqe_eseg {
+	union {
+		struct {
+			uint32_t swp_offs;
+			uint8_t	cs_flags;
+			uint8_t	swp_flags;
+			uint16_t mss;
+			uint32_t metadata;
+			uint16_t inline_hdr_sz;
+			union {
+				uint16_t inline_data;
+				uint16_t vlan_tag;
+			};
+		} __rte_packed;
+		struct {
+			uint32_t offsets;
+			uint32_t flags;
+			uint32_t flow_metadata;
+			uint32_t inline_hdr;
+		} __rte_packed;
+	};
+} __rte_packed;
+
+/* The title WQEBB, header of WQE. */
+struct mlx5_wqe {
+	union {
+		struct mlx5_wqe_cseg cseg;
+		uint32_t ctrl[4];
+	};
+	struct mlx5_wqe_eseg eseg;
+	union {
+		struct mlx5_wqe_dseg dseg[2];
+		uint8_t data[MLX5_ESEG_EXTRA_DATA_SIZE];
+	};
+} __rte_packed;
+
 /* WQE for Multi-Packet RQ. */
 struct mlx5_wqe_mprq {
 	struct mlx5_wqe_srq_next_seg next_seg;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH 4/7] net/mlx5: add Tx datapath configuration and setup
  2019-07-04 16:29 [dpdk-dev] [PATCH 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                   ` (2 preceding siblings ...)
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 3/7] net/mlx5: update Tx datapath definitions Viacheslav Ovsiienko
@ 2019-07-04 16:29 ` Viacheslav Ovsiienko
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 5/7] net/mlx5: introduce Tx burst routine template Viacheslav Ovsiienko
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-04 16:29 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch updates the Tx datapath control and configuration
structures and code for managing Tx datapath settings.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c |   4 +-
 drivers/net/mlx5/mlx5_rxtx.h |  55 +++++++++----
 drivers/net/mlx5/mlx5_txq.c  | 182 ++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 212 insertions(+), 29 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index f2d6918..13f9431 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -507,7 +507,7 @@
 			MKSTR(err_str, "Unexpected CQE error syndrome "
 			      "0x%02x CQN = %u SQN = %u wqe_counter = %u "
 			      "wq_ci = %u cq_ci = %u", err_cqe->syndrome,
-			      txq_ctrl->cqn, txq->qp_num_8s >> 8,
+			      txq->cqe_s, txq->qp_num_8s >> 8,
 			      rte_be_to_cpu_16(err_cqe->wqe_counter),
 			      txq->wqe_ci, txq->cq_ci);
 			MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u",
@@ -516,7 +516,7 @@
 			mlx5_dump_debug_information(name, NULL, err_str, 0);
 			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
 						    (const void *)((uintptr_t)
-						    &(*txq->cqes)[0]),
+						    txq->cqes),
 						    sizeof(*err_cqe) *
 						    (1 << txq->cqe_n));
 			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index acde09d..d8c6f35 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -188,37 +188,60 @@ struct mlx5_hrxq {
 	uint8_t rss_key[]; /* Hash key. */
 };
 
+/* TX queue send local data. */
+__extension__
+struct mlx5_txq_local {
+	struct mlx5_wqe *wqe_last; /* last sent WQE pointer. */
+	struct rte_mbuf *mbuf; /* first mbuf to process. */
+	uint16_t pkts_copy; /* packets copied to elts. */
+	uint16_t pkts_sent; /* packets sent. */
+	uint16_t elts_free; /* available elts remain. */
+	uint16_t wqe_free; /* available wqe remain. */
+	uint16_t mbuf_off; /* data offset in current mbuf. */
+	uint16_t mbuf_nseg; /* number of remaining mbuf. */
+};
+
 /* TX queue descriptor. */
 __extension__
 struct mlx5_txq_data {
 	uint16_t elts_head; /* Current counter in (*elts)[]. */
 	uint16_t elts_tail; /* Counter of first element awaiting completion. */
-	uint16_t elts_comp; /* Counter since last completion request. */
-	uint16_t mpw_comp; /* WQ index since last completion request. */
+	uint16_t elts_comp; /* elts index since last completion request. */
+	uint16_t elts_s; /* Number of mbuf elements. */
+	uint16_t elts_m; /* Mask for mbuf elements indices. */
+	/* Fields related to elts mbuf storage. */
+	uint16_t wqe_ci; /* Consumer index for work queue. */
+	uint16_t wqe_pi; /* Producer index for work queue. */
+	uint16_t wqe_s; /* Number of WQ elements. */
+	uint16_t wqe_m; /* Mask Number for WQ elements. */
+	uint16_t wqe_comp; /* WQE index since last completion request. */
+	uint16_t wqe_thres; /* WQE threshold to request completion in CQ. */
+	/* WQ related fields. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
 #ifndef NDEBUG
-	uint16_t cq_pi; /* Producer index for completion queue. */
+	uint16_t cq_pi; /* Counter of issued CQE "always" requests. */
 #endif
-	uint16_t wqe_ci; /* Consumer index for work queue. */
-	uint16_t wqe_pi; /* Producer index for work queue. */
-	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
+	uint16_t cqe_s; /* Number of CQ elements. */
+	uint16_t cqe_m; /* Mask for CQ indices. */
+	/* CQ related fields. */
+	uint16_t elts_n:4; /* elts[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
-	uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */
+	uint16_t wqe_n:4; /* Number of WQ elements (in log2). */
 	uint16_t tso_en:1; /* When set hardware TSO is enabled. */
 	uint16_t tunnel_en:1;
 	/* When set TX offload for tunneled packets are supported. */
 	uint16_t swp_en:1; /* Whether SW parser is enabled. */
-	uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
-	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
-	uint16_t inline_max_packet_sz; /* Max packet size for inlining. */
+	uint16_t inlen_send; /* Ordinary send data inline size. */
+	uint16_t inlen_empw; /* eMPW max packet size to inline. */
+	uint16_t inlen_mode; /* Minimal data length to inline. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	uint64_t offloads; /* Offloads for Tx Queue. */
 	struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
-	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
-	volatile void *wqes; /* Work queue (use volatile to write into). */
+	struct mlx5_wqe *wqes; /* Work queue. */
+	struct mlx5_wqe *wqes_end; /* Work queue array limit. */
+	volatile struct mlx5_cqe *cqes; /* Completion queue. */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
-	struct rte_mbuf *(*elts)[]; /* TX elements. */
 	uint16_t port_id; /* Port ID of device. */
 	uint16_t idx; /* Queue index. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
@@ -226,6 +249,8 @@ struct mlx5_txq_data {
 	rte_spinlock_t *uar_lock;
 	/* UAR access lock required for 32bit implementations */
 #endif
+	struct rte_mbuf *elts[0];
+	/* Storage for queued packets, must be the last field. */
 } __rte_cache_aligned;
 
 /* Verbs Rx queue elements. */
@@ -239,7 +264,6 @@ struct mlx5_txq_ibv {
 
 /* TX queue control descriptor. */
 struct mlx5_txq_ctrl {
-	struct mlx5_txq_data txq; /* Data path structure. */
 	LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */
 	rte_atomic32_t refcnt; /* Reference counter. */
 	unsigned int socket; /* CPU socket ID for allocations. */
@@ -249,8 +273,9 @@ struct mlx5_txq_ctrl {
 	struct mlx5_priv *priv; /* Back pointer to private data. */
 	off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
 	void *bf_reg; /* BlueFlame register from Verbs. */
-	uint32_t cqn; /* CQ number. */
 	uint16_t dump_file_n; /* Number of dump files. */
+	struct mlx5_txq_data txq; /* Data path structure. */
+	/* Must be the last field in the structure, contains elts[]. */
 };
 
 #define MLX5_TX_BFREG(txq) \
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 0d2dbfa..147599b 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -47,7 +47,7 @@
 	unsigned int i;
 
 	for (i = 0; (i != elts_n); ++i)
-		(*txq_ctrl->txq.elts)[i] = NULL;
+		txq_ctrl->txq.elts[i] = NULL;
 	DRV_LOG(DEBUG, "port %u Tx queue %u allocated and configured %u WRs",
 		PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx, elts_n);
 	txq_ctrl->txq.elts_head = 0;
@@ -68,7 +68,7 @@
 	const uint16_t elts_m = elts_n - 1;
 	uint16_t elts_head = txq_ctrl->txq.elts_head;
 	uint16_t elts_tail = txq_ctrl->txq.elts_tail;
-	struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;
+	struct rte_mbuf *(*elts)[elts_n] = &txq_ctrl->txq.elts;
 
 	DRV_LOG(DEBUG, "port %u Tx queue %u freeing WRs",
 		PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx);
@@ -411,7 +411,8 @@ struct mlx5_txq_ibv *
 	attr.cq = (struct ibv_cq_init_attr_ex){
 		.comp_mask = 0,
 	};
-	cqe_n = desc / MLX5_TX_COMP_THRESH + 1;
+	cqe_n = desc / MLX5_TX_COMP_THRESH +
+		1 + MLX5_TX_COMP_THRESH_INLINE_DIV;
 	tmpl.cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0);
 	if (tmpl.cq == NULL) {
 		DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure",
@@ -449,7 +450,7 @@ struct mlx5_txq_ibv *
 		.pd = priv->sh->pd,
 		.comp_mask = IBV_QP_INIT_ATTR_PD,
 	};
-	if (txq_data->max_inline)
+	if (txq_data->inlen_send)
 		attr.init.cap.max_inline_data = txq_ctrl->max_inline_data;
 	if (txq_data->tso_en) {
 		attr.init.max_tso_header = txq_ctrl->max_tso_header;
@@ -523,25 +524,29 @@ struct mlx5_txq_ibv *
 		goto error;
 	}
 	txq_data->cqe_n = log2above(cq_info.cqe_cnt);
+	txq_data->cqe_s = 1 << txq_data->cqe_n;
+	txq_data->cqe_m = txq_data->cqe_s - 1;
 	txq_data->qp_num_8s = tmpl.qp->qp_num << 8;
 	txq_data->wqes = qp.sq.buf;
 	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
+	txq_data->wqe_s = 1 << txq_data->wqe_n;
+	txq_data->wqe_m = txq_data->wqe_s - 1;
+	txq_data->wqes_end = txq_data->wqes + txq_data->wqe_s;
 	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
 	txq_data->cq_db = cq_info.dbrec;
-	txq_data->cqes =
-		(volatile struct mlx5_cqe (*)[])
-		(uintptr_t)cq_info.buf;
+	txq_data->cqes = (volatile struct mlx5_cqe *)cq_info.buf;
 	txq_data->cq_ci = 0;
 #ifndef NDEBUG
 	txq_data->cq_pi = 0;
 #endif
 	txq_data->wqe_ci = 0;
 	txq_data->wqe_pi = 0;
+	txq_data->wqe_comp = 0;
+	txq_data->wqe_thres = txq_data->wqe_s / MLX5_TX_COMP_THRESH_INLINE_DIV;
 	txq_ibv->qp = tmpl.qp;
 	txq_ibv->cq = tmpl.cq;
 	rte_atomic32_inc(&txq_ibv->refcnt);
 	txq_ctrl->bf_reg = qp.bf.reg;
-	txq_ctrl->cqn = cq_info.cqn;
 	txq_uar_init(txq_ctrl);
 	if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
 		txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset;
@@ -663,7 +668,11 @@ struct mlx5_txq_ibv *
 	unsigned int wqe_size;
 	const unsigned int desc = 1 << txq_ctrl->txq.elts_n;
 
-	wqe_size = MLX5_WQE_SIZE + txq_ctrl->max_inline_data;
+	wqe_size = MLX5_WQE_CSEG_SIZE +
+		   MLX5_WQE_ESEG_SIZE +
+		   MLX5_WSEG_SIZE -
+		   MLX5_ESEG_MIN_INLINE_SIZE +
+		   txq_ctrl->max_inline_data;
 	return rte_align32pow2(wqe_size * desc) / MLX5_WQE_SIZE;
 }
 
@@ -676,7 +685,156 @@ struct mlx5_txq_ibv *
 static void
 txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
 {
-	(void)txq_ctrl;
+	struct mlx5_priv *priv = txq_ctrl->priv;
+	struct mlx5_dev_config *config = &priv->config;
+	unsigned int inlen_send; /* Inline data for ordinary SEND.*/
+	unsigned int inlen_empw; /* Inline data for enhanced MPW. */
+	unsigned int inlen_mode; /* Minimal required Inline data. */
+	unsigned int txqs_inline; /* Min Tx queues to enable inline. */
+	int tso = !!(txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO |
+					       DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+					       DEV_TX_OFFLOAD_GRE_TNL_TSO |
+					       DEV_TX_OFFLOAD_IP_TNL_TSO |
+					       DEV_TX_OFFLOAD_UDP_TNL_TSO));
+	unsigned int temp;
+
+	txqs_inline = (config->txqs_inline == MLX5_ARG_UNSET) ?
+		      MLX5_EMPW_MIN_TXQS :
+		      (unsigned int)config->txqs_inline;
+	inlen_send = (config->txq_inline_max == MLX5_ARG_UNSET) ?
+		     MLX5_SEND_DEF_INLINE_LEN :
+		     (unsigned int)config->txq_inline_max;
+	inlen_empw = (config->txq_inline_mpw == MLX5_ARG_UNSET) ?
+		     MLX5_EMPW_DEF_INLINE_LEN :
+		     (unsigned int)config->txq_inline_mpw;
+	inlen_mode = (config->txq_inline_min == MLX5_ARG_UNSET) ?
+		     0 : (unsigned int)config->txq_inline_min;
+
+	/*
+	 * If there is requested minimal amount of data to inline
+	 * we MUST enable inlining. This is a case for ConnectX-4
+	 * which usually requires L2 inlined for correct operating
+	 * and ConnectX-4LX which requires L2-L4 inlined to
+	 * support E-Switch Flows.
+	 */
+	if (inlen_mode) {
+		if (inlen_mode <= MLX5_ESEG_MIN_INLINE_SIZE) {
+			/*
+			 * Optimize minimal inlining for single
+			 * segment packets to fill one WQEBB
+			 * without gaps.
+			 */
+			temp = MLX5_ESEG_MIN_INLINE_SIZE;
+		} else {
+			temp = inlen_mode - MLX5_ESEG_MIN_INLINE_SIZE;
+			temp = RTE_ALIGN(temp, MLX5_WSEG_SIZE) +
+			       MLX5_ESEG_MIN_INLINE_SIZE;
+			temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
+		}
+		if (temp != inlen_mode) {
+			DRV_LOG(INFO,
+				"port %u minimal required inline setting"
+				" aligned from %u to %u",
+				PORT_ID(priv), inlen_mode, temp);
+			inlen_mode = temp;
+		}
+	}
+	/*
+	 * If there are fewer Tx queues than txqs_inline, saving CPU
+	 * cycles takes priority and data inlining is not enabled here.
+	 */
+	if (inlen_send && priv->txqs_n >= txqs_inline) {
+		/*
+		 * The data sent with ordinary MLX5_OPCODE_SEND
+		 * may be inlined in Ethernet Segment, align the
+		 * length accordingly to fit entire WQEBBs.
+		 */
+		temp = (inlen_send / MLX5_WQE_SIZE) * MLX5_WQE_SIZE +
+			MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE;
+		temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
+				     MLX5_ESEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE);
+		temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
+		temp = RTE_MAX(temp, inlen_mode);
+		if (temp != inlen_send) {
+			DRV_LOG(INFO,
+				"port %u ordinary send inline setting"
+				" aligned from %u to %u",
+				PORT_ID(priv), inlen_send, temp);
+			inlen_send = temp;
+		}
+		/*
+		 * Not aligned to cache lines, but to WQEs.
+		 * First bytes of data (initial alignment)
+		 * is going to be copied explicitly at the
+		 * beginning of inlining buffer in Ethernet
+		 * Segment.
+		 */
+		assert(inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+		assert(inlen_send <= MLX5_WQE_SIZE_MAX +
+				     MLX5_ESEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE);
+		txq_ctrl->txq.inlen_send = inlen_send;
+		txq_ctrl->txq.inlen_mode = inlen_mode;
+	} else {
+		/*
+		 * If minimal inlining is requested we must
+		 * enable inlining in general, despite the
+		 * number of configured queues.
+		 */
+		txq_ctrl->txq.inlen_send = inlen_mode;
+		txq_ctrl->txq.inlen_mode = inlen_mode;
+		txq_ctrl->txq.inlen_empw = 0;
+		inlen_send = 0;
+		inlen_empw = 0;
+	}
+	if (inlen_send && inlen_empw && priv->txqs_n >= txqs_inline) {
+		/*
+		 * The data sent with MLX5_OPCODE_ENHANCED_MPSW
+		 * may be inlined in Data Segment, align the
+		 * length accordingly to fit entire WQEBBs.
+		 */
+		temp = (inlen_empw + MLX5_WQE_SIZE - 1) / MLX5_WQE_SIZE;
+		temp = temp * MLX5_WQE_SIZE +
+		       MLX5_DSEG_MIN_INLINE_SIZE - MLX5_WQE_DSEG_SIZE;
+		temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
+				     MLX5_DSEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE);
+		temp = RTE_MIN(temp, MLX5_EMPW_MAX_INLINE_LEN);
+		if (temp != inlen_empw) {
+			DRV_LOG(INFO,
+				"port %u enhanced empw inline setting"
+				" aligned from %u to %u",
+				PORT_ID(priv), inlen_empw, temp);
+			inlen_empw = temp;
+		}
+		assert(inlen_empw >= MLX5_ESEG_MIN_INLINE_SIZE);
+		assert(inlen_empw <= MLX5_WQE_SIZE_MAX +
+				     MLX5_DSEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE);
+		txq_ctrl->txq.inlen_empw = inlen_empw;
+	}
+	txq_ctrl->max_inline_data = RTE_MAX(inlen_send, inlen_empw);
+	if (tso) {
+		txq_ctrl->max_tso_header = MLX5_MAX_TSO_HEADER;
+		txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->max_inline_data,
+						    MLX5_MAX_TSO_HEADER);
+		txq_ctrl->txq.tso_en = 1;
+	}
+	txq_ctrl->txq.tunnel_en = config->tunnel_en | config->swp;
+	txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO |
+				 DEV_TX_OFFLOAD_UDP_TNL_TSO |
+				 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) &
+				txq_ctrl->txq.offloads) && config->swp;
+	config->tx_inline = txq_ctrl->txq.inlen_send ? 1 : 0;
 }
 
 /**
@@ -724,6 +882,8 @@ struct mlx5_txq_ctrl *
 	tmpl->priv = priv;
 	tmpl->socket = socket;
 	tmpl->txq.elts_n = log2above(desc);
+	tmpl->txq.elts_s = desc;
+	tmpl->txq.elts_m = desc - 1;
 	tmpl->txq.port_id = dev->data->port_id;
 	tmpl->txq.idx = idx;
 	txq_set_params(tmpl);
@@ -737,8 +897,6 @@ struct mlx5_txq_ctrl *
 		rte_errno = ENOMEM;
 		goto error;
 	}
-	tmpl->txq.elts =
-		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])(tmpl + 1);
 	rte_atomic32_inc(&tmpl->refcnt);
 	LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
 	return tmpl;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH 5/7] net/mlx5: introduce Tx burst routine template
  2019-07-04 16:29 [dpdk-dev] [PATCH 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                   ` (3 preceding siblings ...)
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 4/7] net/mlx5: add Tx datapath configuration and setup Viacheslav Ovsiienko
@ 2019-07-04 16:29 ` Viacheslav Ovsiienko
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 6/7] net/mlx5: implement Tx burst template Viacheslav Ovsiienko
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 7/7] net/mlx5: add minimal required Tx data inline Viacheslav Ovsiienko
  6 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-04 16:29 UTC (permalink / raw)
  To: dev; +Cc: yskoh

Mellanox NICs support a wide set of Tx offloads. The supported
offloads are reported by the mlx5 PMD in the rte_eth_dev_info
tx_offload_capa field. An application may choose any combination of
supported offloads and configure the device appropriately. Some Tx
offloads may not be requested by the application, or even all of them
may be omitted. Most of the Tx offloads require dedicated code branches
in the tx_burst routine to support them. If a Tx offload is not requested,
the tx_burst routine code may be significantly simplified and consume
fewer CPU cycles.

For example, if the application does not engage the TSO offload, the TSO
code can be omitted; if multi-segment packets are not expected, tx_burst
may assume single-mbuf packets only, etc.

Currently, the mlx5 PMD implements multiple tx_burst subroutines
for the most common combinations of requested Tx offloads, and each
branch has its own dedicated implementation. Such code is not easy to
update, support and develop - multiple branches mean multiple places
to modify. Also, many frequently requested offload combinations are not
supported yet. That leads to selecting a tx_burst routine that does not
match completely, which harms the performance.

This patch introduces the new approach for tx_burst code. It is proposed
to develop the unified template for tx_burst routine, which supports
all the Tx offloads and takes the compile time defined parameter
describing the supposed set of supported offloads. On the base
of this template, the compiler is able to generate multiple tx_burst
routines highly optimized for the statically specified set of Tx offloads.
Next, in runtime, at Tx queue configuration the best matching optimized
implementation of tx_burst is chosen.

This patch intentionally omits the internal implementation of the
template and only introduces the template itself, to illustrate the
approach of multiple specially tuned tx_burst routines.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 511 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 507 insertions(+), 4 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 13f9431..af6f705 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2015 6WIND S.A.
- * Copyright 2015 Mellanox Technologies, Ltd
+ * Copyright 2015-2019 Mellanox Technologies, Ltd
  */
 
 #include <assert.h>
@@ -34,6 +34,57 @@
 #include "mlx5_defs.h"
 #include "mlx5_prm.h"
 
+/* TX burst subroutines return codes. */
+enum mlx5_txcmp_code {
+	MLX5_TXCMP_CODE_EXIT = 0,
+	MLX5_TXCMP_CODE_ERROR,
+	MLX5_TXCMP_CODE_SINGLE,
+	MLX5_TXCMP_CODE_MULTI,
+	MLX5_TXCMP_CODE_TSO,
+	MLX5_TXCMP_CODE_EMPW,
+};
+
+/*
+ * These defines are used to configure Tx burst routine option set
+ * supported at compile time. The options not specified are optimized
+ * out because the if conditions can be evaluated at compile time.
+ * The offloads with bigger runtime check (require more CPU cycles to
+ * skip) overhead should have the bigger index - this is needed to
+ * select the better matching routine function if no exact match and
+ * some offloads are not actually requested.
+ */
+#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/
+#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/
+#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/
+#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
+#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
+#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/
+#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
+#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
+
+/* The most common offloads groups. */
+#define MLX5_TXOFF_CONFIG_NONE 0
+#define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \
+				MLX5_TXOFF_CONFIG_TSO | \
+				MLX5_TXOFF_CONFIG_SWP | \
+				MLX5_TXOFF_CONFIG_CSUM | \
+				MLX5_TXOFF_CONFIG_INLINE | \
+				MLX5_TXOFF_CONFIG_VLAN | \
+				MLX5_TXOFF_CONFIG_METADATA)
+
+#define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask)
+
+#define MLX5_TXOFF_DECL(func, olx) \
+static uint16_t mlx5_tx_burst_##func(void *txq, \
+				     struct rte_mbuf **pkts, \
+				    uint16_t pkts_n) \
+{ \
+	return mlx5_tx_burst_tmpl((struct mlx5_txq_data *restrict)txq, \
+		    pkts, pkts_n, (olx)); \
+}
+
+#define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},
+
 static __rte_always_inline uint32_t
 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
 
@@ -1531,7 +1582,323 @@
 }
 
 /**
- * Configure the TX function to use.
+ * DPDK Tx callback template. This is a configurable template
+ * used to generate routines optimized for a specified offload setup.
+ * One of these generated functions is chosen at SQ configuration
+ * time.
+ *
+ * @param txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param olx
+ *   Configured offloads mask, a combination of MLX5_TXOFF_CONFIG_xxx
+ *   bits. Should be a compile-time constant to take advantage of the
+ *   static configuration.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+static __rte_always_inline uint16_t
+mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
+		   struct rte_mbuf **restrict pkts,
+		   uint16_t pkts_n,
+		   unsigned int olx)
+{
+	(void)txq;
+	(void)pkts;
+	(void)pkts_n;
+	(void)olx;
+	return 0;
+}
+
+/* Generate routines with Enhanced Multi-Packet Write support. */
+MLX5_TXOFF_DECL(full_empw,
+		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(none_empw,
+		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(md_empw,
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mt_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtsc_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mti_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtiv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sc_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sci_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(scv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sciv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(i_empw,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(v_empw,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(iv_empw,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+/* Generate routines without Enhanced Multi-Packet Write support. */
+MLX5_TXOFF_DECL(full,
+		MLX5_TXOFF_CONFIG_FULL)
+
+MLX5_TXOFF_DECL(none,
+		MLX5_TXOFF_CONFIG_NONE)
+
+MLX5_TXOFF_DECL(md,
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mt,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mtsc,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mti,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(mtv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(mtiv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(sc,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(sci,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(scv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(sciv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(i,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(v,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(iv,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+/*
+ * Array of the declared and compiled Tx burst functions and their
+ * supported offload sets. The array is used to select the Tx burst
+ * function for a specified offload set at Tx queue configuration time.
+ */
+const struct {
+	eth_tx_burst_t func;
+	unsigned int olx;
+} txoff_func[] = {
+MLX5_TXOFF_INFO(full_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(none_empw,
+		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(md_empw,
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mt_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtsc_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mti_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtiv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sc_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sci_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(scv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sciv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(i_empw,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(v_empw,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(iv_empw,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(full,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(none,
+		MLX5_TXOFF_CONFIG_NONE)
+
+MLX5_TXOFF_INFO(md,
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mt,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mtsc,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mti,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_INFO(mtv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mtiv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sc,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sci,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(scv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sciv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(i,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(v,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(iv,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+};
+
+/**
+ * Configure the Tx function to use. The routine checks configured
+ * Tx offloads for the device and selects appropriate Tx burst
+ * routine. There are multiple Tx burst routines compiled from
+ * the same template in the most optimal way for the dedicated
+ * Tx offloads set.
  *
  * @param dev
  *   Pointer to private data structure.
@@ -1542,8 +1909,144 @@
 eth_tx_burst_t
 mlx5_select_tx_function(struct rte_eth_dev *dev)
 {
-	(void)dev;
-	return removed_tx_burst;
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_config *config = &priv->config;
+	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
+	unsigned int diff = 0, olx = 0, i, m;
+
+	static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <=
+		      MLX5_DSEG_MAX, "invalid WQE max size");
+	static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE,
+		      "invalid WQE Control Segment size");
+	static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE,
+		      "invalid WQE Ethernet Segment size");
+	static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE,
+		      "invalid WQE Data Segment size");
+	static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE,
+		      "invalid WQE size");
+	assert(priv);
+	if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) {
+		/* We should support Multi-Segment Packets. */
+		olx |= MLX5_TXOFF_CONFIG_MULTI;
+	}
+	if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
+			   DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+			   DEV_TX_OFFLOAD_GRE_TNL_TSO |
+			   DEV_TX_OFFLOAD_IP_TNL_TSO |
+			   DEV_TX_OFFLOAD_UDP_TNL_TSO)) {
+		/* We should support TCP Send Offload. */
+		olx |= MLX5_TXOFF_CONFIG_TSO;
+	}
+	if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
+			   DEV_TX_OFFLOAD_UDP_TNL_TSO |
+			   DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
+		/* We should support Software Parser for Tunnels. */
+		olx |= MLX5_TXOFF_CONFIG_SWP;
+	}
+	if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
+			   DEV_TX_OFFLOAD_UDP_CKSUM |
+			   DEV_TX_OFFLOAD_TCP_CKSUM |
+			   DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
+		/* We should support IP/TCP/UDP Checksums. */
+		olx |= MLX5_TXOFF_CONFIG_CSUM;
+	}
+	if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) {
+		/* We should support VLAN insertion. */
+		olx |= MLX5_TXOFF_CONFIG_VLAN;
+	}
+	if (config->tx_inline) {
+		/*
+		 * Data inlining is enabled by default.
+		 * Required inline data are disabled by default.
+		 */
+		olx |= MLX5_TXOFF_CONFIG_INLINE;
+	}
+	if (config->mps == MLX5_MPW_ENHANCED &&
+	    config->txq_inline_min <= 0) {
+		/*
+		 * The NIC supports Enhanced Multi-Packet Write.
+		 * We do not support legacy MPW due to its
+		 * hardware related problems, so we just ignore
+		 * legacy MLX5_MPW settings. There should be no
+		 * minimal required inline data.
+		 */
+		olx |= MLX5_TXOFF_CONFIG_EMPW;
+	}
+	if (tx_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) {
+		/* We should support Flow metadata. */
+		olx |= MLX5_TXOFF_CONFIG_METADATA;
+	}
+	/*
+	 * Scan the routines table to find the minimal
+	 * satisfying routine with requested offloads.
+	 */
+	m = RTE_DIM(txoff_func);
+	for (i = 0; i < RTE_DIM(txoff_func); i++) {
+		unsigned int tmp;
+
+		tmp = txoff_func[i].olx;
+		if (tmp == olx) {
+			/* Meets requested offloads exactly.*/
+			m = i;
+			break;
+		}
+		if ((tmp & olx) != olx) {
+			/* Does not meet requested offloads at all. */
+			continue;
+		}
+		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
+			/* Do not enable eMPW if not configured. */
+			continue;
+		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
+			/* Do not enable inlining if not configured. */
+			continue;
+		/*
+		 * Some routine meets the requirements.
+		 * Check whether it has minimal amount
+		 * of not requested offloads.
+		 */
+		tmp = __builtin_popcountl(tmp & ~olx);
+		if (m >= RTE_DIM(txoff_func) || tmp < diff) {
+			/* First or better match, save and continue. */
+			m = i;
+			diff = tmp;
+			continue;
+		}
+		if (tmp == diff) {
+			tmp = txoff_func[i].olx ^ txoff_func[m].olx;
+			if (__builtin_ffsl(txoff_func[i].olx & tmp) <
+			    __builtin_ffsl(txoff_func[m].olx & tmp)) {
+				/* Lowest differing offload is lighter. */
+				m = i;
+			}
+		}
+	}
+	if (m >= RTE_DIM(txoff_func)) {
+		DRV_LOG(DEBUG, "port %u has no selected Tx function"
+			       " for requested offloads %04X",
+				dev->data->port_id, olx);
+		return NULL;
+	}
+	DRV_LOG(DEBUG, "port %u has selected Tx function"
+		       " supporting offloads %04X/%04X",
+			dev->data->port_id, olx, txoff_func[m].olx);
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI)
+		DRV_LOG(DEBUG, "\tMULTI (multi segment)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO)
+		DRV_LOG(DEBUG, "\tTSO   (TCP send offload)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP)
+		DRV_LOG(DEBUG, "\tSWP   (software parser)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM)
+		DRV_LOG(DEBUG, "\tCSUM  (checksum offload)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE)
+		DRV_LOG(DEBUG, "\tINLIN (inline data)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN)
+		DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
+		DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW)
+		DRV_LOG(DEBUG, "\tEMPW  (Enhanced MPW)");
+	return txoff_func[m].func;
 }
 
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH 6/7] net/mlx5: implement Tx burst template
  2019-07-04 16:29 [dpdk-dev] [PATCH 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                   ` (4 preceding siblings ...)
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 5/7] net/mlx5: introduce Tx burst routine template Viacheslav Ovsiienko
@ 2019-07-04 16:29 ` Viacheslav Ovsiienko
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 7/7] net/mlx5: add minimal required Tx data inline Viacheslav Ovsiienko
  6 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-04 16:29 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch adds the implementation of tx_burst routine template.
The template supports all Tx offloads and multiple optimized
tx_burst routines can be generated by compiler from this one.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 2869 +++++++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h |    5 +-
 2 files changed, 2845 insertions(+), 29 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index af6f705..115d073 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -339,6 +339,109 @@ enum mlx5_txcmp_code {
 }
 
 /**
+ * Set Software Parser flags and offsets in Ethernet Segment of WQE.
+ * Flags must be initialized to zero beforehand.
+ *
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param swp_flags
+ *   Pointer to store Software Parser flags.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Software Parser offsets packed in dword.
+ *   Software Parser flags are set by pointer.
+ */
+static __rte_always_inline uint32_t
+txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc,
+		uint8_t *swp_flags,
+		unsigned int olx)
+{
+	uint64_t ol, tunnel;
+	unsigned int idx, off;
+	uint32_t set;
+
+	if (!MLX5_TXOFF_CONFIG(SWP))
+		return 0;
+	ol = loc->mbuf->ol_flags;
+	tunnel = ol & PKT_TX_TUNNEL_MASK;
+	/*
+	 * Check whether Software Parser is required.
+	 * Only the UDP and IP tunnel types request it.
+	 */
+	if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP))
+		return 0;
+	/*
+	 * The index should have:
+	 * bit[0:1] = PKT_TX_L4_MASK
+	 * bit[4] = PKT_TX_IPV6
+	 * bit[8] = PKT_TX_OUTER_IPV6
+	 * bit[9] = outer UDP (tunnel is PKT_TX_TUNNEL_UDP)
+	 */
+	idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52;
+	idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0;
+	*swp_flags = mlx5_swp_types_table[idx];
+	/*
+	 * Set offsets for SW parser. Since ConnectX-5, SW parser just
+	 * complements HW parser. SW parser starts to engage only if HW parser
+	 * can't reach a header. For the older devices, HW parser will not kick
+	 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
+	 * should be set regardless of HW offload.
+	 */
+	off = loc->mbuf->outer_l2_len;
+	if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT)
+		off += sizeof(struct rte_vlan_hdr);
+	set = (off >> 1) << 8; /* Outer L3 offset. */
+	off += loc->mbuf->outer_l3_len;
+	if (tunnel == PKT_TX_TUNNEL_UDP)
+		set |= off >> 1; /* Outer L4 offset. */
+	if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */
+		const uint64_t csum = ol & PKT_TX_L4_MASK;
+			off += loc->mbuf->l2_len;
+		set |= (off >> 1) << 24; /* Inner L3 offset. */
+		if (csum == PKT_TX_TCP_CKSUM ||
+		    csum == PKT_TX_UDP_CKSUM ||
+		    (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) {
+			off += loc->mbuf->l3_len;
+			set |= (off >> 1) << 16; /* Inner L4 offset. */
+		}
+	}
+	set = rte_cpu_to_le_32(set);
+	return set;
+}
+
+/**
+ * Convert the mbuf checksum offload flags using mlx5_cksum_table.
+ *
+ * @param buf
+ *   Pointer to the mbuf.
+ *
+ * @return
+ *   Converted checksum flags.
+ */
+static __rte_always_inline uint8_t
+txq_ol_cksum_to_cs(struct rte_mbuf *buf)
+{
+	uint32_t idx;
+	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
+	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
+				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
+
+	/*
+	 * The index should have:
+	 * bit[0] = PKT_TX_TCP_SEG
+	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
+	 * bit[4] = PKT_TX_IP_CKSUM
+	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
+	 * bit[9] = tunnel
+	 */
+	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
+	return mlx5_cksum_table[idx];
+}
+
+/**
  * Internal function to compute the number of used descriptors in an RX queue
  *
  * @param rxq
@@ -543,7 +646,7 @@ enum mlx5_txcmp_code {
  *   The last Tx buffer element to free.
  */
 uint16_t
-mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
+mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq,
 			 volatile struct mlx5_err_cqe *err_cqe)
 {
 	if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
@@ -1563,6 +1666,296 @@ enum mlx5_txcmp_code {
 }
 
 /**
+ * Free the mbufs from the linear array of pointers.
+ *
+ * Every mbuf is passed through rte_pktmbuf_prefree_seg(). Runs of
+ * consecutive mbufs that were returned by it and share the same
+ * mempool are handed to rte_mempool_put_bulk() directly from the
+ * pkts array, one call per run.
+ *
+ * @param pkts
+ *   Pointer to array of packets to be freed.
+ * @param pkts_n
+ *   Number of packets to be freed.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts,
+		  unsigned int pkts_n,
+		  unsigned int olx __rte_unused)
+{
+	struct rte_mempool *pool = NULL;
+	struct rte_mbuf **p_free = NULL;
+	struct rte_mbuf *mbuf;
+	unsigned int n_free = 0;
+
+	/*
+	 * The implemented algorithm eliminates
+	 * copying pointers to temporary array
+	 * for rte_mempool_put_bulk() calls.
+	 */
+	assert(pkts);
+	assert(pkts_n);
+	for (;;) {
+		/*
+		 * Scan loop: extend the current run [p_free, p_free + n_free)
+		 * until the pool changes, an mbuf can not be freed, or the
+		 * array is exhausted.
+		 */
+		for (;;) {
+			/*
+			 * Decrement mbuf reference counter, detach
+			 * indirect and external buffers if needed.
+			 */
+			mbuf = rte_pktmbuf_prefree_seg(*pkts);
+			if (likely(mbuf != NULL)) {
+				assert(mbuf == *pkts);
+				if (likely(n_free != 0)) {
+					if (unlikely(pool != mbuf->pool))
+						/*
+						 * From different pool. Leave
+						 * with mbuf != NULL so the
+						 * flush loop starts a new run
+						 * with this mbuf.
+						 */
+						break;
+				} else {
+					/* Start new scan array. */
+					pool = mbuf->pool;
+					p_free = pkts;
+				}
+				++n_free;
+				++pkts;
+				--pkts_n;
+				if (unlikely(pkts_n == 0)) {
+					/* Array is done, flush and return. */
+					mbuf = NULL;
+					break;
+				}
+			} else {
+				/*
+				 * This happens if mbuf is still referenced.
+				 * We can't put it back to the pool, skip.
+				 */
+				++pkts;
+				--pkts_n;
+				if (unlikely(n_free != 0))
+					/* There is some array to free.*/
+					break;
+				if (unlikely(pkts_n == 0))
+					/* Last mbuf, nothing to free. */
+					return;
+			}
+		}
+		for (;;) {
+			/*
+			 * This loop is implemented to avoid multiple
+			 * inlining of rte_mempool_put_bulk().
+			 */
+			assert(pool);
+			assert(p_free);
+			assert(n_free);
+			/*
+			 * Free the array of pre-freed mbufs
+			 * belonging to the same memory pool.
+			 */
+			rte_mempool_put_bulk(pool, (void *)p_free, n_free);
+			if (unlikely(mbuf != NULL)) {
+				/* There is the request to start new scan. */
+				pool = mbuf->pool;
+				p_free = pkts++;
+				n_free = 1;
+				--pkts_n;
+				if (likely(pkts_n != 0))
+					break;
+				/*
+				 * This is the last mbuf to be freed.
+				 * Do one more loop iteration to complete.
+				 * This is rare case of the last unique mbuf.
+				 */
+				mbuf = NULL;
+				continue;
+			}
+			if (likely(pkts_n == 0))
+				return;
+			/* Run flushed, go back to scanning from scratch. */
+			n_free = 0;
+			break;
+		}
+	}
+}
+
+/**
+ * Release the mbufs kept in the elts ring buffer, starting
+ * from the current tail and stopping at the new tail.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tail
+ *   Index in elts to free up to, becomes new elts tail.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq,
+		  uint16_t tail,
+		  unsigned int olx __rte_unused)
+{
+	uint16_t remaining = tail - txq->elts_tail;
+
+	assert(remaining);
+	assert(remaining <= txq->elts_s);
+	/*
+	 * The range may wrap around the end of the ring. It is
+	 * processed as linear chunks inside one loop so that
+	 * mlx5_tx_free_mbuf() is inlined only once.
+	 */
+	for (;;) {
+		const unsigned int idx = txq->elts_tail & txq->elts_m;
+		unsigned int chunk = txq->elts_s - idx;
+
+		if (chunk > remaining)
+			chunk = remaining;
+		assert(chunk);
+		assert(chunk <= txq->elts_s);
+		mlx5_tx_free_mbuf(&txq->elts[idx], chunk, olx);
+		txq->elts_tail += chunk;
+		remaining -= chunk;
+		if (!remaining)
+			break;
+	}
+}
+
+/**
+ * Save the pointers of the mbufs being sent in the elts
+ * ring buffer and advance the elts head accordingly.
+ * On Tx completion these mbufs will be freed.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param pkts_n
+ *   Number of packets to be stored.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq,
+		  struct rte_mbuf **restrict pkts,
+		  unsigned int pkts_n,
+		  unsigned int olx __rte_unused)
+{
+	struct rte_mbuf **ring = (struct rte_mbuf **)txq->elts;
+	unsigned int room;
+	unsigned int first;
+
+	assert(pkts);
+	assert(pkts_n);
+	/* Number of slots left before the end of the ring. */
+	room = txq->elts_s - (txq->elts_head & txq->elts_m);
+	assert(room);
+	assert(room <= txq->elts_s);
+	first = room < pkts_n ? room : pkts_n;
+	/* This code is a good candidate for vectorizing with SIMD. */
+	rte_memcpy((void *)(ring + (txq->elts_head & txq->elts_m)),
+		   (void *)pkts,
+		   first * sizeof(struct rte_mbuf *));
+	txq->elts_head += pkts_n;
+	if (unlikely(room < pkts_n)) {
+		/* The rest of pointers goes to the ring beginning. */
+		rte_memcpy((void *)ring, (void *)(pkts + room),
+			   (pkts_n - room) * sizeof(struct rte_mbuf *));
+	}
+}
+
+/**
+ * Manage TX completions. This routine polls the CQ for
+ * arrived CQEs, takes the index of the last completed WQE
+ * from each CQE, stores it in txq->wqe_pi and frees the
+ * mbufs kept in elts up to the tail recorded in that WQE.
+ * The CQ doorbell record is updated once, before returning,
+ * if at least one CQE was processed.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * NOTE: not inlined intentionally, it makes tx_burst
+ * routine smaller, simple and faster - from experiments.
+ */
+static void
+mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
+			  unsigned int olx __rte_unused)
+{
+	bool update = false;
+	int ret;
+
+	do {
+		volatile struct mlx5_wqe_cseg *cseg;
+		volatile struct mlx5_cqe *cqe;
+		uint16_t tail;
+
+		cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
+		ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
+		if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+			if (likely(ret != MLX5_CQE_STATUS_ERR)) {
+				/* No new CQEs in completion queue. */
+				assert(ret == MLX5_CQE_STATUS_HW_OWN);
+				if (likely(update)) {
+					/* Update the consumer index. */
+					rte_compiler_barrier();
+					*txq->cq_db =
+						rte_cpu_to_be_32(txq->cq_ci);
+				}
+				return;
+			}
+			/* Some error occurred, try to restart. */
+			tail = mlx5_tx_error_cqe_handle
+				(txq, (volatile struct mlx5_err_cqe *)cqe);
+		} else {
+			/* Normal transmit completion. */
+			++txq->cq_ci;
+			rte_cio_rmb();
+			txq->wqe_pi = rte_be_to_cpu_16(cqe->wqe_counter);
+			cseg = (volatile struct mlx5_wqe_cseg *)
+				(txq->wqes + (txq->wqe_pi & txq->wqe_m));
+			/*
+			 * The misc field carries the elts head value saved
+			 * by mlx5_tx_request_completion() for this WQE.
+			 */
+			tail = cseg->misc;
+		}
+#ifndef NDEBUG
+		/*
+		 * Debug-only counter, it is incremented inside an
+		 * assert() in mlx5_tx_request_completion().
+		 */
+		if (txq->cq_pi)
+			--txq->cq_pi;
+#endif
+		if (likely(tail != txq->elts_tail)) {
+			/* Free data buffers from elts. */
+			mlx5_tx_free_elts(txq, tail, olx);
+			assert(tail == txq->elts_tail);
+		}
+		update = true;
+	} while (true);
+}
+
+/**
+ * Decide whether the last built WQE must request a completion.
+ * The number of pushed mbufs and the number of built WQEs since
+ * the previous request are both watched, the request is made
+ * as soon as one of the two thresholds is reached.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param n_mbuf
+ *   Number of mbuf not stored yet in elts array.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
+			   unsigned int n_mbuf,
+			   struct mlx5_txq_local *restrict loc,
+			   unsigned int olx __rte_unused)
+{
+	const uint16_t head = txq->elts_head + n_mbuf;
+	const uint16_t elts_pending = head - txq->elts_comp;
+	const uint16_t wqes_pending = txq->wqe_ci - txq->wqe_comp;
+	volatile struct mlx5_wqe *last;
+
+	if (elts_pending < MLX5_TX_COMP_THRESH &&
+	    wqes_pending < txq->wqe_thres)
+		return;
+	last = loc->wqe_last;
+	txq->elts_comp = head;
+	txq->wqe_comp = txq->wqe_ci;
+	/* Request unconditional completion on last WQE. */
+	last->cseg.flags = RTE_BE32(MLX5_WQE_CTRL_CQ_UPDATE);
+	/* Save elts_head in unused "immediate" field of WQE. */
+	last->cseg.misc = head;
+	/*
+	 * A CQE slot must always be available. The issued
+	 * "always" CQE requests are counted instead of using
+	 * the production index, because CQEs with errors may
+	 * make the difference with ci inconsistent. The counter
+	 * is maintained in debug builds only.
+	 */
+	assert(txq->cqe_s > ++txq->cq_pi);
+}
+
+/**
  * DPDK callback to check the status of a tx descriptor.
  *
  * @param tx_queue
@@ -1576,42 +1969,2464 @@ enum mlx5_txcmp_code {
 int
 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
 {
-	(void)tx_queue;
-	(void)offset;
-	return RTE_ETH_TX_DESC_FULL;
+	struct mlx5_txq_data *restrict txq = tx_queue;
+	uint16_t in_flight;
+
+	/* Process pending completions first to refresh elts_tail. */
+	mlx5_tx_handle_completion(txq, 0);
+	in_flight = txq->elts_head - txq->elts_tail;
+	return offset < in_flight ? RTE_ETH_TX_DESC_FULL :
+				    RTE_ETH_TX_DESC_DONE;
 }
 
 /**
- * DPDK Tx callback template. This is configured template
- * used to generate routines optimized for specified offload setup.
- * One of this generated functions is chosen at SQ configuration
- * time.
+ * Build the Control Segment with specified opcode:
+ * - MLX5_OPCODE_SEND
+ * - MLX5_OPCODE_ENHANCED_MPSW
+ * - MLX5_OPCODE_TSO
  *
  * @param txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Control Segment.
+ * @param ds
+ *   Supposed length of WQE in segments.
+ * @param opcode
+ *   SQ WQE opcode to put into Control Segment.
  * @param olx
- *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
- *   values. Should be static to take compile time static configuration
- *   advantages.
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc __rte_unused,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int ds,
+		  unsigned int opcode,
+		  unsigned int olx __rte_unused)
+{
+	struct mlx5_wqe_cseg *restrict cseg = &wqe->cseg;
+	/* WQE index shifted by 8 bits combined with the opcode. */
+	const uint32_t op_word = (txq->wqe_ci << 8) | opcode;
+	/* Queue number field combined with the WQE size in segments. */
+	const uint32_t ds_word = txq->qp_num_8s | ds;
+
+	cseg->opcode = rte_cpu_to_be_32(op_word);
+	cseg->sq_ds = rte_cpu_to_be_32(ds_word);
+	/* No flags and no misc data until completion is requested. */
+	cseg->flags = RTE_BE32(0);
+	cseg->misc = RTE_BE32(0);
+}
+
+/**
+ * Fill the Ethernet Segment of the WQE for a packet sent
+ * without inlined data. Checksum flags, Software Parser
+ * offsets, metadata and VLAN insertion request are set
+ * according to the mbuf and the configured offloads.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int olx)
+{
+	struct mlx5_wqe_eseg *restrict eseg = &wqe->eseg;
+	uint32_t csum = 0;
+
+	/*
+	 * The checksum flags must be written first, the dword
+	 * they live in may be shared with Software Parser flags.
+	 */
+	if (MLX5_TXOFF_CONFIG(CSUM))
+		csum = txq_ol_cksum_to_cs(loc->mbuf);
+	eseg->flags = rte_cpu_to_le_32(csum);
+	/*
+	 * Software Parser offsets and flags, these
+	 * are set for custom UDP and IP tunnel packets.
+	 */
+	eseg->swp_offs = txq_mbuf_to_swp(loc, &eseg->swp_flags, olx);
+	/* Metadata is taken from the mbuf only if it is flagged. */
+	if (MLX5_TXOFF_CONFIG(METADATA) &&
+	    (loc->mbuf->ol_flags & PKT_TX_METADATA))
+		eseg->metadata = loc->mbuf->tx_metadata;
+	else
+		eseg->metadata = 0;
+	/* Ask hardware to insert the VLAN tag if requested. */
+	if (MLX5_TXOFF_CONFIG(VLAN) &&
+	    (loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)) {
+		eseg->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
+						    loc->mbuf->vlan_tci);
+	} else {
+		eseg->inline_hdr = RTE_BE32(0);
+	}
+}
+
+/**
+ * Fill the Ethernet Segment of the WQE and inline exactly
+ * MLX5_ESEG_MIN_INLINE_SIZE bytes of the packet data into it.
+ * This is used to fill the gap in single WQEBB WQEs.
+ * Checksum flags, Software Parser offsets and metadata are set
+ * according to the mbuf, the VLAN tag is inserted into the
+ * inlined bytes by software if requested.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ *   Length of VLAN tag insertion if any.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int vlan,
+		  unsigned int olx)
+{
+	/* MAC addresses bytes left after the first two inlined ones. */
+	const size_t mac_rest = 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+	struct mlx5_wqe_eseg *restrict eseg = &wqe->eseg;
+	uint32_t csum = 0;
+	uint8_t *src, *dst;
+
+	/*
+	 * The checksum flags must be written first, the dword
+	 * they live in may be shared with Software Parser flags.
+	 */
+	if (MLX5_TXOFF_CONFIG(CSUM))
+		csum = txq_ol_cksum_to_cs(loc->mbuf);
+	eseg->flags = rte_cpu_to_le_32(csum);
+	/*
+	 * Software Parser offsets and flags, these
+	 * are set for custom UDP and IP tunnel packets.
+	 */
+	eseg->swp_offs = txq_mbuf_to_swp(loc, &eseg->swp_flags, olx);
+	/* Metadata is taken from the mbuf only if it is flagged. */
+	if (MLX5_TXOFF_CONFIG(METADATA) &&
+	    (loc->mbuf->ol_flags & PKT_TX_METADATA))
+		eseg->metadata = loc->mbuf->tx_metadata;
+	else
+		eseg->metadata = 0;
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(rte_v128u32_t)),
+		      "invalid Ethernet Segment data size");
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(struct rte_vlan_hdr) +
+				 2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Ethernet Segment data size");
+	src = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+	eseg->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
+	/* The first two bytes are stored inside the segment itself. */
+	eseg->inline_data = *(uint16_t *)src;
+	src += sizeof(uint16_t);
+	dst = (uint8_t *)(eseg + 1);
+	if (!(MLX5_TXOFF_CONFIG(VLAN) && vlan)) {
+		/* No VLAN, the gap is filled with 16 packet bytes. */
+		rte_mov16(dst, src);
+		return;
+	}
+	/* Software VLAN insertion: the rest of the MAC addresses, */
+	memcpy(dst, src, mac_rest);
+	dst += mac_rest;
+	src += mac_rest;
+	/* then the VLAN ethertype with the VLAN tag, */
+	*(uint32_t *)dst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
+					    loc->mbuf->vlan_tci);
+	dst += sizeof(struct rte_vlan_hdr);
+	/* and finally the next two bytes of the packet data. */
+	*(uint16_t *)dst = *(uint16_t *)src;
+}
+
+/**
+ * Build the Ethernet Segment with entire packet
+ * data inlining. Checks the boundary of WQEBB and
+ * ring buffer wrapping, supports Software Parser,
+ * Checksums and VLAN insertion Tx offload features.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ *   Length of VLAN tag insertion if any.
+ * @param inlen
+ *   Length of data to inline (VLAN included, if any).
+ * @param tso
+ *   TSO flag, set mss field from the packet.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
  *
  * @return
- *   Number of packets successfully transmitted (<= pkts_n).
+ *   Pointer to the next Data Segment (aligned and wrapped around).
  */
-static __rte_always_inline uint16_t
-mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
-		   struct rte_mbuf **restrict pkts,
-		   uint16_t pkts_n,
-		   unsigned int olx)
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int vlan,
+		  unsigned int inlen,
+		  unsigned int tso,
+		  unsigned int olx)
 {
-	(void)txq;
-	(void)pkts;
-	(void)pkts_n;
-	(void)olx;
-	return 0;
+	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+	uint32_t csum;
+	uint8_t *psrc, *pdst;
+	unsigned int part;
+
+	/*
+	 * Calculate and set check sum flags first, dword field
+	 * in segment may be shared with Software Parser flags.
+	 */
+	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+	if (tso) {
+		/*
+		 * For TSO the checksum flags go to the upper byte and
+		 * the segment size from the mbuf to the lower bits.
+		 */
+		csum <<= 24;
+		csum |= loc->mbuf->tso_segsz;
+		es->flags = rte_cpu_to_be_32(csum);
+	} else {
+		es->flags = rte_cpu_to_le_32(csum);
+	}
+	/*
+	 * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+	 */
+	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+	/* Fill metadata field if needed. */
+	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		       loc->mbuf->tx_metadata : 0 : 0;
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(rte_v128u32_t)),
+		      "invalid Ethernet Segment data size");
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(struct rte_vlan_hdr) +
+				 2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Ethernet Segment data size");
+	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+	es->inline_hdr_sz = RTE_BE16(inlen);
+	/* The first two packet bytes are stored in the segment itself. */
+	es->inline_data = *(uint16_t *)psrc;
+	psrc +=	sizeof(uint16_t);
+	pdst = (uint8_t *)(es + 1);
+	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+		/* Implement VLAN tag insertion as part inline data. */
+		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
+		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+		/* Insert VLAN ethertype + VLAN tag. */
+		*(uint32_t *)pdst = rte_cpu_to_be_32
+					((RTE_ETHER_TYPE_VLAN << 16) |
+					 loc->mbuf->vlan_tci);
+		pdst += sizeof(struct rte_vlan_hdr);
+		/* Copy the rest two bytes from packet data. */
+		*(uint16_t *)pdst = *(uint16_t *)psrc;
+		psrc += sizeof(uint16_t);
+	} else {
+		/* Fill the gap in the title WQEBB with inline data. */
+		rte_mov16(pdst, psrc);
+		psrc += sizeof(rte_v128u32_t);
+	}
+	/* The minimal inline part is done, continue right after it. */
+	pdst = (uint8_t *)(es + 2);
+	assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
+	assert(pdst < (uint8_t *)txq->wqes_end);
+	inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
+	if (!inlen) {
+		assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
+		return (struct mlx5_wqe_dseg *)pdst;
+	}
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Here we should be aware of WQE ring buffer wraparound only.
+	 */
+	part = (uint8_t *)txq->wqes_end - pdst;
+	part = RTE_MIN(part, inlen);
+	/*
+	 * The first pass copies up to the end of the WQE ring, the
+	 * second one (if any) copies the remainder from its start.
+	 */
+	do {
+		rte_memcpy(pdst, psrc, part);
+		inlen -= part;
+		if (likely(!inlen)) {
+			/*
+			 * If return value is not used by the caller
+			 * the code below will be optimized out.
+			 */
+			pdst += part;
+			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
+				pdst = (uint8_t *)txq->wqes;
+			return (struct mlx5_wqe_dseg *)pdst;
+		}
+		pdst = (uint8_t *)txq->wqes;
+		psrc += part;
+		part = inlen;
+	} while (true);
+}
+
+/**
+ * Copy data from chain of mbuf to the specified linear buffer.
+ * If data from some mbuf is copied completely this mbuf is
+ * freed and the next one in the chain becomes current. Local
+ * structure is used to keep the byte stream state (current
+ * mbuf, offset inside it and number of remaining segments).
+ *
+ * @param pdst
+ *   Pointer to the destination linear buffer.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param len
+ *   Length of data to be copied.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_mseg_memcpy(uint8_t *pdst,
+		    struct mlx5_txq_local *restrict loc,
+		    unsigned int len,
+		    unsigned int olx __rte_unused)
+{
+	struct rte_mbuf *mbuf;
+	unsigned int part, dlen;
+	uint8_t *psrc;
+
+	assert(len);
+	do {
+		/* Allow zero length packets, must check first. */
+		dlen = rte_pktmbuf_data_len(loc->mbuf);
+		if (dlen <= loc->mbuf_off) {
+			/* Exhausted packet, just free. */
+			mbuf = loc->mbuf;
+			loc->mbuf = mbuf->next;
+			rte_pktmbuf_free_seg(mbuf);
+			loc->mbuf_off = 0;
+			/* More data is requested, a next segment must exist. */
+			assert(loc->mbuf_nseg > 1);
+			assert(loc->mbuf);
+			--loc->mbuf_nseg;
+			continue;
+		}
+		/* Copy as much as the current segment can provide. */
+		dlen -= loc->mbuf_off;
+		psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+		psrc += loc->mbuf_off;
+		part = RTE_MIN(len, dlen);
+		rte_memcpy(pdst, psrc, part);
+		loc->mbuf_off += part;
+		len -= part;
+		if (!len) {
+			/*
+			 * Everything requested is copied. If the current
+			 * segment is consumed entirely it is released here,
+			 * loc->mbuf may become NULL for the last segment.
+			 */
+			if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
+				loc->mbuf_off = 0;
+				/* Exhausted packet, just free. */
+				mbuf = loc->mbuf;
+				loc->mbuf = mbuf->next;
+				rte_pktmbuf_free_seg(mbuf);
+				loc->mbuf_off = 0;
+				assert(loc->mbuf_nseg >= 1);
+				--loc->mbuf_nseg;
+			}
+			return;
+		}
+		pdst += part;
+	} while (true);
+}
+
+/**
+ * Build the Ethernet Segment with inlined data from
+ * multi-segment packet. Checks the boundary of WQEBB
+ * and ring buffer wrapping, supports Software Parser,
+ * Checksums and VLAN insertion Tx offload features.
+ *
+ * The data is fetched with mlx5_tx_mseg_memcpy(), so the
+ * mbufs inlined completely are freed and the stream state
+ * in the local context is advanced.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ *   Length of VLAN tag insertion if any.
+ * @param inlen
+ *   Length of data to inline (VLAN included, if any).
+ * @param tso
+ *   TSO flag, set mss field from the packet.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Pointer to the next Data Segment (aligned and
+ *   possibly NOT wrapped around - caller should do
+ *   wrapping check on its own).
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int vlan,
+		  unsigned int inlen,
+		  unsigned int tso,
+		  unsigned int olx)
+{
+	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+	uint32_t csum;
+	uint8_t *pdst;
+	unsigned int part;
+
+	/*
+	 * Calculate and set check sum flags first, uint32_t field
+	 * in segment may be shared with Software Parser flags.
+	 */
+	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+	if (tso) {
+		csum <<= 24;
+		csum |= loc->mbuf->tso_segsz;
+		es->flags = rte_cpu_to_be_32(csum);
+	} else {
+		es->flags = rte_cpu_to_le_32(csum);
+	}
+	/*
+	 * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+	 */
+	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+	/* Fill metadata field if needed. */
+	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		       loc->mbuf->tx_metadata : 0 : 0;
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(rte_v128u32_t)),
+		      "invalid Ethernet Segment data size");
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(struct rte_vlan_hdr) +
+				 2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Ethernet Segment data size");
+	assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
+	es->inline_hdr_sz = RTE_BE16(inlen);
+	pdst = (uint8_t *)&es->inline_data;
+	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+		/* Implement VLAN tag insertion as part inline data. */
+		mlx5_tx_mseg_memcpy(pdst, loc, 2 * RTE_ETHER_ADDR_LEN, olx);
+		/*
+		 * Step over the MAC addresses just copied, otherwise
+		 * the VLAN header would overwrite the destination MAC
+		 * and the rest of data would be placed at wrong offset.
+		 */
+		pdst += 2 * RTE_ETHER_ADDR_LEN;
+		/* Insert VLAN ethertype + VLAN tag. */
+		*(uint32_t *)pdst = rte_cpu_to_be_32
+					((RTE_ETHER_TYPE_VLAN << 16) |
+					 loc->mbuf->vlan_tci);
+		pdst += sizeof(struct rte_vlan_hdr);
+		inlen -= 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
+	}
+	assert(pdst < (uint8_t *)txq->wqes_end);
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Here we should be aware of WQE ring buffer wraparound only.
+	 */
+	part = (uint8_t *)txq->wqes_end - pdst;
+	part = RTE_MIN(part, inlen);
+	assert(part);
+	/*
+	 * The first pass copies up to the end of the WQE ring, the
+	 * second one (if any) copies the remainder from its start.
+	 */
+	do {
+		mlx5_tx_mseg_memcpy(pdst, loc, part, olx);
+		inlen -= part;
+		if (likely(!inlen)) {
+			pdst += part;
+			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			/* Note: no final wraparound check here. */
+			return (struct mlx5_wqe_dseg *)pdst;
+		}
+		pdst = (uint8_t *)txq->wqes;
+		part = inlen;
+	} while (true);
+}
+
+/**
+ * Fill the Data Segment referencing the data buffer
+ * by pointer (no data is copied into the WQE).
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param dseg
+ *   Pointer to WQE to fill with built Data Segment.
+ * @param buf
+ *   Data buffer to point.
+ * @param len
+ *   Data buffer length.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq,
+		 struct mlx5_txq_local *restrict loc,
+		 struct mlx5_wqe_dseg *restrict dseg,
+		 uint8_t *buf,
+		 unsigned int len,
+		 unsigned int olx __rte_unused)
+{
+	const uintptr_t addr = (uintptr_t)buf;
+
+	assert(len);
+	/* Byte count, memory key of the current mbuf and address. */
+	dseg->bcount = rte_cpu_to_be_32(len);
+	dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
+	dseg->pbuf = rte_cpu_to_be_64(addr);
+}
+
+/**
+ * Fill the Data Segment either with the pointer to the data
+ * buffer or, if the data is not longer than
+ * MLX5_DSEG_MIN_INLINE_SIZE bytes, with the data itself.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param dseg
+ *   Pointer to WQE to fill with built Data Segment.
+ * @param buf
+ *   Data buffer to point.
+ * @param len
+ *   Data buffer length.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe_dseg *restrict dseg,
+		  uint8_t *buf,
+		  unsigned int len,
+		  unsigned int olx __rte_unused)
+{
+	uint8_t *to;
+	const uint8_t *from;
+
+	assert(len);
+	if (len <= MLX5_DSEG_MIN_INLINE_SIZE) {
+		/* Short data, store it inside the segment. */
+		dseg->bcount = rte_cpu_to_be_32(len |
+						MLX5_ETH_WQE_DATA_INLINE);
+		to = (uint8_t *)&dseg->inline_data[0];
+		from = buf;
+		/*
+		 * Copy by the chunks of 8, 4, 2 and 1 bytes selected
+		 * by the bits of length instead of generic rte_memcpy.
+		 */
+		if (len & 0x08) {
+			*(uint64_t *)to = *(const uint64_t *)from;
+			to += sizeof(uint64_t);
+			from += sizeof(uint64_t);
+		}
+		if (len & 0x04) {
+			*(uint32_t *)to = *(const uint32_t *)from;
+			to += sizeof(uint32_t);
+			from += sizeof(uint32_t);
+		}
+		if (len & 0x02) {
+			*(uint16_t *)to = *(const uint16_t *)from;
+			to += sizeof(uint16_t);
+			from += sizeof(uint16_t);
+		}
+		if (len & 0x01)
+			*to = *from;
+		return;
+	}
+	/* Long data, reference the buffer by pointer. */
+	dseg->bcount = rte_cpu_to_be_32(len);
+	dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
+	dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
+}
+
+/**
+ * Fill the Data Segment with the data copied from the
+ * buffer of single segment packet, no VLAN insertion.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param dseg
+ *   Pointer to WQE to fill with built Data Segment.
+ * @param buf
+ *   Data buffer to point.
+ * @param len
+ *   Data buffer length.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Pointer to the next Data Segment after inlined data.
+ *   Ring buffer wraparound check is needed. We do not
+ *   do it here because it may not be needed for the
+ *   last packet in the eMPW session.
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc __rte_unused,
+		  struct mlx5_wqe_dseg *restrict dseg,
+		  uint8_t *buf,
+		  unsigned int len,
+		  unsigned int olx __rte_unused)
+{
+	uint8_t *dst = &dseg->inline_data[0];
+	unsigned int chunk;
+
+	dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Only the WQE ring buffer wraparound is handled here:
+	 * the first chunk is limited by the end of the ring.
+	 */
+	chunk = (uint8_t *)txq->wqes_end - dst;
+	if (chunk > len)
+		chunk = len;
+	rte_memcpy(dst, buf, chunk);
+	if (unlikely(chunk != len)) {
+		/* The remainder continues from the ring beginning. */
+		buf += chunk;
+		chunk = len - chunk;
+		dst = (uint8_t *)txq->wqes;
+		rte_memcpy(dst, buf, chunk);
+	}
+	dst += chunk;
+	/* Note: no final wraparound check here. */
+	return (struct mlx5_wqe_dseg *)RTE_PTR_ALIGN(dst, MLX5_WSEG_SIZE);
+}
+
+/**
+ * Build the Data Segment of inlined data from single
+ * segment packet with VLAN insertion. The MAC addresses
+ * are copied first, then the VLAN header is stored, then
+ * the rest of the packet data follows.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param dseg
+ *   Pointer to the dseg to fill with built Data Segment.
+ * @param buf
+ *   Data buffer to point.
+ * @param len
+ *   Data buffer length (VLAN header being inserted is not included).
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Pointer to the next Data Segment after inlined data.
+ *   Ring buffer wraparound check is needed.
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc __rte_unused,
+		  struct mlx5_wqe_dseg *restrict dseg,
+		  uint8_t *buf,
+		  unsigned int len,
+		  unsigned int olx __rte_unused)
+
+{
+	unsigned int part;
+	uint8_t *pdst;
+
+	assert(len > MLX5_ESEG_MIN_INLINE_SIZE);
+	static_assert(MLX5_DSEG_MIN_INLINE_SIZE ==
+				 (2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Data Segment data size");
+	dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) |
+					MLX5_ETH_WQE_DATA_INLINE);
+	pdst = &dseg->inline_data[0];
+	/* Copy the destination and source MAC addresses. */
+	memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
+	buf += MLX5_DSEG_MIN_INLINE_SIZE;
+	/*
+	 * Account the copied bytes on the destination side as well,
+	 * otherwise the VLAN header would overwrite the MAC addresses
+	 * and these bytes would be copied once again by the loop below.
+	 */
+	pdst += MLX5_DSEG_MIN_INLINE_SIZE;
+	len -= MLX5_DSEG_MIN_INLINE_SIZE;
+	/*
+	 * The pointer may have reached the end of the WQE ring,
+	 * wrap it around before storing the VLAN header.
+	 */
+	if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
+		pdst = (uint8_t *)txq->wqes;
+	/* Insert VLAN ethertype + VLAN tag. */
+	*(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
+					      loc->mbuf->vlan_tci);
+	pdst += sizeof(struct rte_vlan_hdr);
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Here we should be aware of WQE ring buffer wraparound only.
+	 */
+	part = (uint8_t *)txq->wqes_end - pdst;
+	part = RTE_MIN(part, len);
+	/*
+	 * The first pass copies up to the end of the WQE ring, the
+	 * second one (if any) copies the remainder from its start.
+	 */
+	do {
+		rte_memcpy(pdst, buf, part);
+		len -= part;
+		if (likely(!len)) {
+			pdst += part;
+			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			/* Note: no final wraparound check here. */
+			return (struct mlx5_wqe_dseg *)pdst;
+		}
+		pdst = (uint8_t *)txq->wqes;
+		buf += part;
+		part = len;
+	} while (true);
+}
+
+/**
+ * Build the Ethernet Segment (optionally with inlined data and
+ * VLAN insertion) followed by the Data Segments describing the
+ * not inlined remainder of a multi-segment packet. Shared by the
+ * ordinary SEND path and by the TSO path.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet/Data Segments.
+ * @param vlan
+ *   Length of VLAN header to insert, 0 means no VLAN insertion.
+ * @param inlen
+ *   Minimal amount of data bytes to inline, VLAN header being
+ *   inserted is included. For TSO this is the exact inline length,
+ *   for ordinary send the inlined part may be extended beyond it
+ *   to save WQE space.
+ * @param tso
+ *   Zero - ordinary send, inline length may be extended.
+ *   Non-zero - TSO, inline length is fixed.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Actual size of built WQE in segments.
+ */
+static __rte_always_inline unsigned int
+mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   struct mlx5_wqe *restrict wqe,
+		   unsigned int vlan,
+		   unsigned int inlen,
+		   unsigned int tso,
+		   unsigned int olx __rte_unused)
+{
+	struct mlx5_wqe_dseg *restrict dseg;
+	uintptr_t span;
+
+	assert((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
+	loc->mbuf_nseg = NB_SEGS(loc->mbuf);
+	loc->mbuf_off = 0;
+
+	dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx);
+	/*
+	 * If some mbufs remain after inlining, the first of them may
+	 * be inlined partially - the non-zero offset tells where its
+	 * not inlined data start.
+	 */
+	if (loc->mbuf_nseg && loc->mbuf_off) {
+		uint8_t *rest;
+		unsigned int rest_len;
+
+		/*
+		 * Exhausted segments are expected to be dropped already,
+		 * so non-zero offset implies there are data left.
+		 */
+		assert(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
+		assert(rte_pktmbuf_data_len(loc->mbuf));
+		rest = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + loc->mbuf_off;
+		rest_len = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off;
+		/* Wrap around the WQE ring before writing the segment. */
+		if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+			dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+		mlx5_tx_dseg_iptr(txq, loc, dseg, rest, rest_len, olx);
+		/* The mbuf is to be freed on completion, keep it in elts. */
+		assert(loc->elts_free);
+		txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+		--loc->elts_free;
+		++dseg;
+		if (--loc->mbuf_nseg != 0) {
+			loc->mbuf = loc->mbuf->next;
+			loc->mbuf_off = 0;
+		}
+	}
+	/* Describe all the remaining mbufs with Data Segments. */
+	while (loc->mbuf_nseg != 0) {
+		if (likely(rte_pktmbuf_data_len(loc->mbuf))) {
+			/* Wrap around the WQE ring if needed. */
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+			mlx5_tx_dseg_iptr
+				(txq, loc, dseg,
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 rte_pktmbuf_data_len(loc->mbuf), olx);
+			assert(loc->elts_free);
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+			--loc->elts_free;
+			++dseg;
+			/* The last processed mbuf stays in loc->mbuf. */
+			if (--loc->mbuf_nseg != 0)
+				loc->mbuf = loc->mbuf->next;
+		} else {
+			struct rte_mbuf *empty = loc->mbuf;
+
+			/* Zero length segment, release it and go on. */
+			loc->mbuf = empty->next;
+			rte_pktmbuf_free_seg(empty);
+			--loc->mbuf_nseg;
+		}
+	}
+	/*
+	 * Derive the used segments number from the final dseg pointer,
+	 * add the ring size in bytes if the pointer has wrapped around.
+	 */
+	span = (uintptr_t)dseg - (uintptr_t)wqe;
+	if ((uintptr_t)wqe >= (uintptr_t)dseg)
+		span += txq->wqe_s * MLX5_WQE_SIZE;
+	return span / MLX5_WSEG_SIZE;
+}
+
+/**
+ * Tx one packet function for multi-segment TSO. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
+ * sends one packet per WQE.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_MULTI - the WQE is built, packet is accepted.
+ *   MLX5_TXCMP_CODE_EXIT - not enough free WQEBBs, nothing is built.
+ *   MLX5_TXCMP_CODE_ERROR - malformed TSO request (zero header
+ *     length or segment size, headers not fitting the limits)
+ *     or the WQE would exceed the maximal size.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq,
+			struct mlx5_txq_local *restrict loc,
+			unsigned int olx)
+{
+	struct mlx5_wqe *restrict wqe;
+	unsigned int ds, dlen, inlen, ntcp, vlan = 0;
+
+	/*
+	 * Calculate data length to be inlined to estimate
+	 * the required space in WQE ring buffer.
+	 */
+	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
+	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+		vlan = sizeof(struct rte_vlan_hdr);
+	inlen = loc->mbuf->l2_len + vlan +
+		loc->mbuf->l3_len + loc->mbuf->l4_len;
+	if (unlikely((!inlen || !loc->mbuf->tso_segsz)))
+		return MLX5_TXCMP_CODE_ERROR;
+	if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK)
+		inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len;
+	/* Packet must contain all TSO headers. */
+	if (unlikely(inlen > MLX5_MAX_TSO_HEADER ||
+		     inlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
+		     inlen > (dlen + vlan)))
+		return MLX5_TXCMP_CODE_ERROR;
+	/*
+	 * Check whether there are enough free WQEBBs:
+	 * - Control Segment
+	 * - Ethernet Segment
+	 * - First Segment of inlined Ethernet data
+	 * - ... data continued ...
+	 * - Data Segments of pointer/min inline type
+	 *
+	 * ds is counted in segments of MLX5_WSEG_SIZE bytes,
+	 * (ds + 3) / 4 converts it to the number of WQEBBs.
+	 */
+	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
+				       MLX5_ESEG_MIN_INLINE_SIZE +
+				       MLX5_WSEG_SIZE +
+				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
+		return MLX5_TXCMP_CODE_EXIT;
+	/*
+	 * Check for maximal WQE size. The limit is expressed in
+	 * segments, so it must be compared with the segments
+	 * number, not with the number of WQEBBs.
+	 */
+	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds))
+		return MLX5_TXCMP_CODE_ERROR;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes/packets counters. */
+	ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) /
+		loc->mbuf->tso_segsz;
+	/*
+	 * One will be added for mbuf itself
+	 * at the end of the mlx5_tx_burst from
+	 * loc->pkts_sent field.
+	 */
+	--ntcp;
+	txq->stats.opackets += ntcp;
+	txq->stats.obytes += dlen + vlan + ntcp * inlen;
+#endif
+	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+	loc->wqe_last = wqe;
+	/* The segments number is not known yet, it is set below. */
+	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx);
+	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx);
+	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+	txq->wqe_ci += (ds + 3) / 4;
+	loc->wqe_free -= (ds + 3) / 4;
+	return MLX5_TXCMP_CODE_MULTI;
+}
+
+/**
+ * Tx one packet function for multi-segment SEND. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
+ * sends one packet per WQE, without any data inlining in
+ * Ethernet Segment.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_MULTI - the WQE is built, packet is accepted.
+ *   MLX5_TXCMP_CODE_EXIT - not enough free WQEBBs, nothing is built.
+ *   MLX5_TXCMP_CODE_ERROR - packet is too short or the WQE
+ *     would exceed the maximal size.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	struct mlx5_wqe_dseg *restrict dseg;
+	struct mlx5_wqe *restrict wqe;
+	unsigned int ds, nseg;
+
+	assert(NB_SEGS(loc->mbuf) > 1);
+	/*
+	 * No inline at all, it means the CPU cycles saving
+	 * is prioritized at configuration, we should not
+	 * copy any packet data to WQE.
+	 */
+	nseg = NB_SEGS(loc->mbuf);
+	/* Control Segment + Ethernet Segment + one pointer per mbuf. */
+	ds = 2 + nseg;
+	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
+		return MLX5_TXCMP_CODE_EXIT;
+	/*
+	 * Check for maximal WQE size. The limit is expressed in
+	 * segments, so it must be compared with the segments
+	 * number, not with the number of WQEBBs.
+	 */
+	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds))
+		return MLX5_TXCMP_CODE_ERROR;
+	/*
+	 * Some Tx offloads may cause an error if
+	 * packet is not long enough, check against
+	 * assumed minimal length.
+	 */
+	if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE)
+		return MLX5_TXCMP_CODE_ERROR;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes counter. */
+	txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf);
+	if (MLX5_TXOFF_CONFIG(VLAN) &&
+	    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+		txq->stats.obytes += sizeof(struct rte_vlan_hdr);
+#endif
+	/*
+	 * SEND WQE, one or more WQEBBs:
+	 * - Control Segment, SEND opcode
+	 * - Ethernet Segment, optional VLAN, no inline
+	 * - Data Segments, pointer only type
+	 */
+	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+	loc->wqe_last = wqe;
+	mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx);
+	mlx5_tx_eseg_none(txq, loc, wqe, olx);
+	dseg = &wqe->dseg[0];
+	do {
+		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
+			struct rte_mbuf *mbuf;
+
+			/*
+			 * Zero length segment found, have to
+			 * correct total size of WQE in segments.
+			 * It is supposed to be rare occasion, so
+			 * in normal case (no zero length segments)
+			 * we avoid extra writing to the Control
+			 * Segment.
+			 */
+			--ds;
+			wqe->cseg.sq_ds -= RTE_BE32(1);
+			mbuf = loc->mbuf;
+			loc->mbuf = mbuf->next;
+			rte_pktmbuf_free_seg(mbuf);
+			if (--nseg == 0)
+				break;
+		} else {
+			mlx5_tx_dseg_ptr
+				(txq, loc, dseg,
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 rte_pktmbuf_data_len(loc->mbuf), olx);
+			/* Store the mbuf to be freed on completion. */
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+			--loc->elts_free;
+			if (--nseg == 0)
+				break;
+			/* Advance with WQE ring buffer wraparound. */
+			++dseg;
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+			loc->mbuf = loc->mbuf->next;
+		}
+	} while (true);
+	txq->wqe_ci += (ds + 3) / 4;
+	loc->wqe_free -= (ds + 3) / 4;
+	return MLX5_TXCMP_CODE_MULTI;
+}
+
+/**
+ * Tx one packet function for multi-segment SEND. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
+ * sends one packet per WQE, with data inlining in
+ * Ethernet Segment and minimal Data Segments.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_MULTI - the WQE is built, packet is accepted.
+ *   MLX5_TXCMP_CODE_EXIT - not enough free WQEBBs, nothing is built.
+ *   MLX5_TXCMP_CODE_ERROR - packet is too short or the WQE
+ *     would exceed the maximal size.
+ *   If the packet is longer than txq->inlen_send and no minimal
+ *   inlining is required, the result of
+ *   mlx5_tx_packet_multi_send() is returned.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
+			    struct mlx5_txq_local *restrict loc,
+			    unsigned int olx)
+{
+	struct mlx5_wqe *restrict wqe;
+	unsigned int ds, inlen, dlen, vlan = 0;
+
+	assert(MLX5_TXOFF_CONFIG(INLINE));
+	assert(NB_SEGS(loc->mbuf) > 1);
+	/*
+	 * First calculate data length to be inlined
+	 * to estimate the required space for WQE.
+	 */
+	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
+	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+		vlan = sizeof(struct rte_vlan_hdr);
+	inlen = dlen + vlan;
+	/* Check against minimal length. */
+	if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
+		return MLX5_TXCMP_CODE_ERROR;
+	assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+	if (inlen > txq->inlen_send) {
+		/*
+		 * Packet length exceeds the allowed inline
+		 * data length, check whether the minimal
+		 * inlining is required. eMPW check is here
+		 * to statically exclude check - no minimal
+		 * inlining required if eMPW is enabled.
+		 */
+		if (MLX5_TXOFF_CONFIG(EMPW) || !txq->inlen_mode) {
+			/*
+			 * VLAN insertion will be done inside by HW.
+			 * It is not utmost effective - VLAN flag is
+			 * checked twice, but we should proceed the
+			 * inlining length correctly and take into
+			 * account the VLAN header being inserted.
+			 */
+			assert(!txq->inlen_mode);
+			return mlx5_tx_packet_multi_send(txq, loc, olx);
+		}
+		assert(txq->inlen_mode >= MLX5_ESEG_MIN_INLINE_SIZE);
+		assert(txq->inlen_mode <= txq->inlen_send);
+		inlen = txq->inlen_mode;
+	}
+	/*
+	 * Check whether there are enough free WQEBBs:
+	 * - Control Segment
+	 * - Ethernet Segment
+	 * - First Segment of inlined Ethernet data
+	 * - ... data continued ...
+	 * - Data Segments of pointer/min inline type
+	 *
+	 * Estimate the number of Data Segments conservatively,
+	 * supposing no any mbufs is being freed during inlining.
+	 */
+	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
+				       MLX5_ESEG_MIN_INLINE_SIZE +
+				       MLX5_WSEG_SIZE +
+				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+	/*
+	 * We may have extra space in WQE to put inline
+	 * data to fill the WQEBBs completely. If the estimated
+	 * size already ends at the WQEBB boundary there is no
+	 * spare room - extending the inline part then would
+	 * require one more WQEBB which is not covered by
+	 * the free space check below.
+	 */
+	inlen += ((4 - ds % 4) % 4) * MLX5_WSEG_SIZE;
+	inlen = RTE_MIN(inlen, dlen + vlan);
+	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
+		return MLX5_TXCMP_CODE_EXIT;
+	/*
+	 * Check for maximal WQE size. The limit is expressed in
+	 * segments, so it must be compared with the segments
+	 * number, not with the number of WQEBBs.
+	 */
+	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds))
+		return MLX5_TXCMP_CODE_ERROR;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes/packets counters. */
+	txq->stats.obytes += dlen + vlan;
+#endif
+	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+	loc->wqe_last = wqe;
+	/* The segments number is not known yet, it is set below. */
+	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
+	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
+	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+	txq->wqe_ci += (ds + 3) / 4;
+	loc->wqe_free -= (ds + 3) / 4;
+	return MLX5_TXCMP_CODE_MULTI;
+}
+
+/**
+ * Tx burst function for multi-segment packets. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
+ * sends one packet per WQE. Function stops sending if it
+ * encounters the single-segment packet.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
+ *   MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
+		   struct rte_mbuf **restrict pkts,
+		   unsigned int pkts_n,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int olx)
+{
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	/*
+	 * loc->mbuf already holds the packet at index loc->pkts_sent,
+	 * pkts is moved to the packet following it.
+	 */
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		enum mlx5_txcmp_code ret;
+
+		assert(NB_SEGS(loc->mbuf) > 1);
+		/*
+		 * Estimate the number of free elts quickly but
+		 * conservatively. Some segment may be fully inlined
+		 * and freed, ignore this here - precise estimation
+		 * is costly.
+		 */
+		if (loc->elts_free < NB_SEGS(loc->mbuf))
+			return MLX5_TXCMP_CODE_EXIT;
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) {
+			/* Proceed with multi-segment TSO. */
+			ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
+		} else if (MLX5_TXOFF_CONFIG(INLINE)) {
+			/* Proceed with multi-segment SEND with inlining. */
+			ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
+		} else {
+			/* Proceed with multi-segment SEND w/o inlining. */
+			ret = mlx5_tx_packet_multi_send(txq, loc, olx);
+		}
+		/*
+		 * Return the constants instead of the variable to let
+		 * the caller checks be resolved after inlining.
+		 */
+		if (ret == MLX5_TXCMP_CODE_EXIT)
+			return MLX5_TXCMP_CODE_EXIT;
+		if (ret == MLX5_TXCMP_CODE_ERROR)
+			return MLX5_TXCMP_CODE_ERROR;
+		/* WQE is built, go to the next packet. */
+		++loc->pkts_sent;
+		--pkts_n;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		loc->mbuf = *pkts++;
+		if (pkts_n > 1)
+			rte_prefetch0(*pkts);
+		if (likely(NB_SEGS(loc->mbuf) > 1))
+			continue;
+		/*
+		 * Here ends the series of multi-segment packets.
+		 * The single-segment packet is reported as TSO only
+		 * if segmentation is actually requested for it.
+		 */
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
+			return MLX5_TXCMP_CODE_TSO;
+		return MLX5_TXCMP_CODE_SINGLE;
+	}
+	assert(false);
+}
+
+/**
+ * Tx burst function for single-segment packets with TSO.
+ * Supports all types of Tx offloads, except multi-packets.
+ * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE.
+ * The loop is left as soon as a multi-segment packet or
+ * a packet without TSO requested is fetched.
+ *
+ * If inline offload is configured the routine stores every
+ * processed mbuf into elts ring buffer and updates elts_head
+ * by itself, because inlined mbufs may be freed early and
+ * the pkts array can not be copied to elts as a batch.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
+ *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
+		  struct rte_mbuf **restrict pkts,
+		  unsigned int pkts_n,
+		  struct mlx5_txq_local *restrict loc,
+		  unsigned int olx)
+{
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	/* loc->mbuf is the current packet, pkts points to the next one. */
+	pkts_n -= loc->pkts_sent;
+	pkts += loc->pkts_sent + 1;
+	while (true) {
+		struct mlx5_wqe_dseg *restrict pseg;
+		struct mlx5_wqe *restrict wqe;
+		unsigned int nseg, wqebbs, dlen, hlen, vlan;
+		uint8_t *payload;
+
+		assert(NB_SEGS(loc->mbuf) == 1);
+		dlen = rte_pktmbuf_data_len(loc->mbuf);
+		vlan = 0;
+		if (MLX5_TXOFF_CONFIG(VLAN) &&
+		    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+			vlan = sizeof(struct rte_vlan_hdr);
+		/*
+		 * The headers length defines the inlined part size
+		 * and hence the WQE size to be checked against
+		 * the free space in ring buffer.
+		 */
+		hlen = loc->mbuf->l2_len + vlan +
+		       loc->mbuf->l3_len + loc->mbuf->l4_len;
+		if (unlikely(!hlen || !loc->mbuf->tso_segsz))
+			return MLX5_TXCMP_CODE_ERROR;
+		if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK)
+			hlen += loc->mbuf->outer_l2_len +
+				loc->mbuf->outer_l3_len;
+		/* All TSO headers must reside in the segment. */
+		if (unlikely(hlen > MLX5_MAX_TSO_HEADER))
+			return MLX5_TXCMP_CODE_ERROR;
+		if (unlikely(hlen <= MLX5_ESEG_MIN_INLINE_SIZE))
+			return MLX5_TXCMP_CODE_ERROR;
+		if (unlikely(hlen > (dlen + vlan)))
+			return MLX5_TXCMP_CODE_ERROR;
+		/*
+		 * Segments of the WQE to be built:
+		 * - Control Segment
+		 * - Ethernet Segment
+		 * - First Segment of inlined Ethernet data
+		 * - ... data continued ...
+		 * - Finishing Data Segment of pointer type
+		 */
+		nseg = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE +
+			    MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+		wqebbs = (nseg + 3) / 4;
+		if (loc->wqe_free < wqebbs)
+			return MLX5_TXCMP_CODE_EXIT;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		{
+			unsigned int ntcp;
+
+			/* Update sent data bytes/packets counters. */
+			ntcp = (dlen + vlan - hlen +
+				loc->mbuf->tso_segsz - 1) /
+				loc->mbuf->tso_segsz;
+			/*
+			 * The mbuf itself is accounted at the end of
+			 * the mlx5_tx_burst from loc->pkts_sent field.
+			 */
+			--ntcp;
+			txq->stats.opackets += ntcp;
+			txq->stats.obytes += dlen + vlan + ntcp * hlen;
+		}
+#endif
+		/*
+		 * Build the TSO WQE:
+		 * - Control Segment
+		 * - Ethernet Segment with hlen bytes inlined
+		 * - Data Segment of pointer type
+		 */
+		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		loc->wqe_last = wqe;
+		mlx5_tx_cseg_init(txq, loc, wqe, nseg,
+				  MLX5_OPCODE_TSO, olx);
+		pseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx);
+		/* VLAN header is inserted, it is not present in mbuf data. */
+		payload = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan;
+		dlen -= hlen - vlan;
+		mlx5_tx_dseg_ptr(txq, loc, pseg, payload, dlen, olx);
+		/* WQE is done, account the consumed resources. */
+		txq->wqe_ci += wqebbs;
+		loc->wqe_free -= wqebbs;
+		if (MLX5_TXOFF_CONFIG(INLINE))
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+		--loc->elts_free;
+		++loc->pkts_sent;
+		--pkts_n;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		/* Fetch the next packet and decide how to proceed. */
+		loc->mbuf = *pkts++;
+		if (pkts_n > 1)
+			rte_prefetch0(*pkts);
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    unlikely(NB_SEGS(loc->mbuf) > 1))
+			return MLX5_TXCMP_CODE_MULTI;
+		if (unlikely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)))
+			return MLX5_TXCMP_CODE_SINGLE;
+		/* One more TSO packet, keep going. */
+	}
+	assert(false);
+}
+
+/**
+ * Inspect the current packet (loc->mbuf) and tell which
+ * sending method suits it.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ * @param newp
+ *   If true the packet is checked for being multi-segment
+ *   and for TSO request, otherwise these checks are skipped.
+ *
+ * @return
+ *  MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *  MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
+ *  MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
+ *  MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
+		     struct mlx5_txq_local *restrict loc,
+		     unsigned int olx,
+		     bool newp)
+{
+	if (newp) {
+		/* Multi-segment packets are handled elsewhere. */
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    unlikely(NB_SEGS(loc->mbuf) > 1))
+			return MLX5_TXCMP_CODE_MULTI;
+		/* So are the packets with TSO requested. */
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
+			return MLX5_TXCMP_CODE_TSO;
+	}
+	/* Without eMPW configured only the ordinary SEND remains. */
+	if (!MLX5_TXOFF_CONFIG(EMPW))
+		return MLX5_TXCMP_CODE_SINGLE;
+	if (MLX5_TXOFF_CONFIG(VLAN) &&
+	    unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)) {
+		/*
+		 * eMPW does not support VLAN insertion offload,
+		 * such a packet has to be inlined entirely. It is
+		 * not possible if inlining is not configured or
+		 * the packet with VLAN header is too long.
+		 */
+		if (!MLX5_TXOFF_CONFIG(INLINE))
+			return MLX5_TXCMP_CODE_SINGLE;
+		if (unlikely((rte_pktmbuf_data_len(loc->mbuf) +
+			      sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))
+			return MLX5_TXCMP_CODE_SINGLE;
+	}
+	return MLX5_TXCMP_CODE_EMPW;
+}
+
+/**
+ * Tell whether the current packet (loc->mbuf) may join the eMPW
+ * batch described by the given Ethernet Segment, i.e. whether
+ * its checksum flags, Software Parser settings and metadata
+ * coincide with the ones stored in the segment.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param es
+ *   Pointer to Ethernet Segment of eMPW batch.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *  true - packet match with eMPW batch attributes.
+ *  false - no match, eMPW should be restarted.
+ */
+static __rte_always_inline bool
+mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
+		   struct mlx5_wqe_eseg *restrict es,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int olx)
+{
+	/* Checksum flags must be the same, if checksums are configured. */
+	if (MLX5_TXOFF_CONFIG(CSUM)) {
+		if (txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
+			return false;
+	}
+	/* Software Parser offsets and flags must be the same. */
+	if (MLX5_TXOFF_CONFIG(SWP)) {
+		uint8_t swp_flags = 0;
+
+		if (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx))
+			return false;
+		if (es->swp_flags != swp_flags)
+			return false;
+	}
+	/* Metadata must be the same, zero stands for "no metadata". */
+	if (MLX5_TXOFF_CONFIG(METADATA)) {
+		if (loc->mbuf->ol_flags & PKT_TX_METADATA) {
+			if (es->metadata != loc->mbuf->tx_metadata)
+				return false;
+		} else if (es->metadata != 0) {
+			return false;
+		}
+	}
+	/* There must be no VLAN packets in eMPW loop. */
+	if (MLX5_TXOFF_CONFIG(VLAN))
+		assert(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
+	return true;
+}
+
+/*
+ * Complete the eMPW WQE built without data inlining: account
+ * the sent packets and consumed elts, write the final segments
+ * number into the Control Segment of loc->wqe_last and advance
+ * the WQE ring indices. The number of Data Segments is equal
+ * to the number of sent packets.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param ds
+ *   Number of packets/Data Segments.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int ds,
+		   unsigned int olx __rte_unused)
+{
+	/* Two more segments are added to the Data Segments. */
+	unsigned int nseg = ds + 2;
+	/* Four segments per WQEBB, rounded up. */
+	unsigned int wqebbs = (nseg + 3) / 4;
+
+	assert(!MLX5_TXOFF_CONFIG(INLINE));
+	loc->pkts_sent += ds;
+	loc->elts_free -= ds;
+	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | nseg);
+	loc->wqe_free -= wqebbs;
+	txq->wqe_ci += wqebbs;
+}
+
+/*
+ * Complete the eMPW WQE built with data inlining: convert
+ * the size of descriptors and data pushed to the WQE into
+ * the segments number, write it into the Control Segment of
+ * loc->wqe_last and advance the WQE ring indices.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param len
+ *   Total size of descriptor/data in bytes, must be
+ *   a multiple of MLX5_WSEG_SIZE.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int len,
+		   unsigned int olx __rte_unused)
+{
+	unsigned int nseg;
+	unsigned int wqebbs;
+
+	assert(MLX5_TXOFF_CONFIG(INLINE));
+	assert((len % MLX5_WSEG_SIZE) == 0);
+	/* Two more segments are added to the pushed ones. */
+	nseg = len / MLX5_WSEG_SIZE + 2;
+	/* Four segments per WQEBB, rounded up. */
+	wqebbs = (nseg + 3) / 4;
+	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | nseg);
+	loc->wqe_free -= wqebbs;
+	txq->wqe_ci += wqebbs;
+}
+
+/**
+ * Tx burst functions for single-segment packets without TSO
+ * and with Multi-Packet Writing feature support. Supports
+ * all types of Tx offloads, except multi-packets and TSO.
+ * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
+ * as many packet per WQE as it can. If eMPW is not configured
+ * or packet can not be sent with eMPW (VLAN insertion) the
+ * ordinary SEND opcode is used and only one packet placed
+ * in WQE.
+ *
+ * Function stops sending if it encounters the multi-segment
+ * packet or packet with TSO requested.
+ *
+ * The routines are responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head if inlining
+ * offload is requested. Otherwise the copying mbufs to elts
+ * can be postponed and completed at the end of burst routine.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
+ * Local context variables updated.
+ */
+
+/**
+ * The routine sends packets with MLX5_OPCODE_EMPW
+ * without inlining, this is dedicated optimized branch.
+ * No VLAN insertion is supported.
+ *
+ * Packets are grouped into eMPW batches, every batch is one WQE
+ * with one pointer Data Segment per packet. The batch is closed
+ * when the planned number of packets is reached, when the next
+ * packet can not be sent with eMPW or when its attributes do not
+ * match the batch Ethernet Segment. This routine does not write
+ * to txq->elts.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - too short packet encountered.
+ *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
+ *   MLX5_TXCMP_CODE_SINGLE - packet to be sent with SEND encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq,
+			  struct rte_mbuf **restrict pkts,
+			  unsigned int pkts_n,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	/*
+	 * Subroutine is the part of mlx5_tx_burst_single()
+	 * and sends single-segment packet with eMPW opcode
+	 * without data inlining.
+	 */
+	assert(!MLX5_TXOFF_CONFIG(INLINE));
+	assert(MLX5_TXOFF_CONFIG(EMPW));
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
+	/* loc->mbuf is the current packet, pkts points to the next one. */
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		struct mlx5_wqe_dseg *restrict dseg;
+		struct mlx5_wqe_eseg *restrict eseg;
+		enum mlx5_txcmp_code ret;
+		unsigned int part, loop;
+
+next_empw:
+		/*
+		 * part is the number of packets planned for this batch,
+		 * limited by the packets left, by free elts and by
+		 * free WQEBBs.
+		 */
+		part = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS);
+		if (unlikely(loc->elts_free < part)) {
+			/* We have no enough elts to save all mbufs. */
+			if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
+				return MLX5_TXCMP_CODE_EXIT;
+			/* But we still able to send at least minimal eMPW. */
+			part = loc->elts_free;
+		}
+		/* Check whether we have enough WQEs */
+		if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) {
+			if (unlikely(loc->wqe_free <
+				((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
+				return MLX5_TXCMP_CODE_EXIT;
+			/* Four segments per WQEBB, two are not for data. */
+			part = (loc->wqe_free * 4) - 2;
+		}
+		if (likely(part > 1))
+			rte_prefetch0(*pkts);
+		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		/*
+		 * Build eMPW title WQEBB:
+		 * - Control Segment, eMPW opcode
+		 * - Ethernet Segment, no inline
+		 *
+		 * The segments number is set for the complete batch,
+		 * it is rewritten by mlx5_tx_sdone_empw() if the batch
+		 * is closed earlier.
+		 */
+		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2,
+				  MLX5_OPCODE_ENHANCED_MPSW, olx);
+		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
+				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
+		eseg = &loc->wqe_last->eseg;
+		dseg = &loc->wqe_last->dseg[0];
+		/* loop counts the packets still to be put into the batch. */
+		loop = part;
+		for (;;) {
+			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
+			/*
+			 * Some Tx offloads may cause an error if
+			 * packet is not long enough, check against
+			 * assumed minimal length.
+			 */
+			if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
+				/* part becomes the number of built segments. */
+				part -= loop;
+				if (unlikely(!part))
+					return MLX5_TXCMP_CODE_ERROR;
+				/*
+				 * We have some successfully built
+				 * packet Data Segments to send.
+				 */
+				mlx5_tx_sdone_empw(txq, loc, part, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			/* Update sent data bytes counter. */
+			txq->stats.obytes += dlen;
+#endif
+			mlx5_tx_dseg_ptr
+				(txq, loc, dseg,
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 dlen, olx);
+			if (unlikely(--loop == 0))
+				break;
+			/* Fetch the next packet and check it fits the batch. */
+			loc->mbuf = *pkts++;
+			if (likely(loop > 1))
+				rte_prefetch0(*pkts);
+			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+			/*
+			 * Unroll the completion code to avoid
+			 * returning variable value - it results in
+			 * unoptimized sequent checking in caller.
+			 *
+			 * In every branch below part - loop is the number
+			 * of Data Segments already built in this batch.
+			 */
+			if (ret == MLX5_TXCMP_CODE_MULTI) {
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_MULTI;
+			}
+			if (ret == MLX5_TXCMP_CODE_TSO) {
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_TSO;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE) {
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_SINGLE;
+			}
+			if (ret != MLX5_TXCMP_CODE_EMPW) {
+				/* No other code is expected here. */
+				assert(false);
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+			/*
+			 * Check whether packet parameters coincide
+			 * within assumed eMPW batch:
+			 * - check sum settings
+			 * - metadata value
+			 * - software parser settings
+			 */
+			if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) {
+				assert(loop);
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				/* Start the new batch with the fetched packet. */
+				goto next_empw;
+			}
+			/* Packet attributes match, continue the same eMPW. */
+			++dseg;
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+		}
+		/* eMPW is built successfully, update loop parameters. */
+		assert(!loop);
+		assert(pkts_n >= part);
+		loc->elts_free -= part;
+		loc->pkts_sent += part;
+		txq->wqe_ci += (2 + part + 3) / 4;
+		loc->wqe_free -= (2 + part + 3) / 4;
+		pkts_n -= part;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		loc->mbuf = *pkts++;
+		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+		if (unlikely(ret != MLX5_TXCMP_CODE_EMPW))
+			return ret;
+		/* Continue sending eMPW batches. */
+	}
+	assert(false);
+}
+
+/**
+ * The routine sends packets with MLX5_OPCODE_EMPW
+ * with inlining, optionally supports VLAN insertion.
+ *
+ * On entry loc->mbuf is the packet to send first, the following
+ * packets are fetched from pkts starting at index loc->pkts_sent + 1.
+ * When the routine starts a new eMPW session or returns a code which
+ * redirects to another send routine, loc->mbuf references the first
+ * packet which is not built into any descriptor yet.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - packets or resources are exhausted.
+ *   MLX5_TXCMP_CODE_ERROR - too short packet or unexpected code met.
+ *   MLX5_TXCMP_CODE_MULTI, MLX5_TXCMP_CODE_TSO, MLX5_TXCMP_CODE_SINGLE -
+ *   loc->mbuf can not be sent with eMPW, try the other routine.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
+			  struct rte_mbuf **restrict pkts,
+			  unsigned int pkts_n,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	/*
+	 * Subroutine is the part of mlx5_tx_burst_single()
+	 * and sends single-segment packet with eMPW opcode
+	 * with data inlining.
+	 */
+	assert(MLX5_TXOFF_CONFIG(INLINE));
+	assert(MLX5_TXOFF_CONFIG(EMPW));
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		struct mlx5_wqe_dseg *restrict dseg;
+		struct mlx5_wqe_eseg *restrict eseg;
+		enum mlx5_txcmp_code ret;
+		unsigned int room, part;
+
+next_empw:
+		/* Check whether we have minimal amount WQEs */
+		if (unlikely(loc->wqe_free <
+			    ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
+			return MLX5_TXCMP_CODE_EXIT;
+		if (likely(pkts_n > 1))
+			rte_prefetch0(*pkts);
+		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		/*
+		 * Build eMPW title WQEBB:
+		 * - Control Segment, eMPW opcode, zero DS
+		 * - Ethernet Segment, no inline
+		 */
+		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0,
+				  MLX5_OPCODE_ENHANCED_MPSW, olx);
+		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
+				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
+		eseg = &loc->wqe_last->eseg;
+		dseg = &loc->wqe_last->dseg[0];
+		room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
+			       loc->wqe_free) * MLX5_WQE_SIZE -
+					MLX5_WQE_CSEG_SIZE -
+					MLX5_WQE_ESEG_SIZE;
+		/*
+		 * Build WQE till we have space, packets and resources.
+		 * The part variable keeps the initial room, so the
+		 * (part - room) difference is the amount of bytes
+		 * consumed by the Data Segments built so far.
+		 */
+		part = room;
+		for (;;) {
+			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
+			uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+			unsigned int tlen;
+
+			assert(room >= MLX5_WQE_DSEG_SIZE);
+			assert((room % MLX5_WQE_DSEG_SIZE) == 0);
+			assert((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
+			/*
+			 * Some Tx offloads may cause an error if
+			 * packet is not long enough, check against
+			 * assumed minimal length.
+			 */
+			if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
+				part -= room;
+				if (unlikely(!part))
+					return MLX5_TXCMP_CODE_ERROR;
+				/*
+				 * We have some successfully built
+				 * packet Data Segments to send.
+				 */
+				mlx5_tx_idone_empw(txq, loc, part, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+			/* Inline or not inline - that's the Question. */
+			if (dlen > txq->inlen_empw)
+				goto pointer_empw;
+			/* Inline entire packet, optional VLAN insertion. */
+			tlen = sizeof(dseg->bcount) + dlen;
+			if (MLX5_TXOFF_CONFIG(VLAN) &&
+			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+				/*
+				 * The packet length must be checked in
+				 * mlx5_tx_able_to_empw() and packet
+				 * fits into inline length guaranteed.
+				 */
+				assert((dlen + sizeof(struct rte_vlan_hdr)) <=
+					txq->inlen_empw);
+				tlen += sizeof(struct rte_vlan_hdr);
+				if (room < tlen)
+					break;
+				dseg = mlx5_tx_dseg_vlan(txq, loc, dseg,
+							 dptr, dlen, olx);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+				/* Update sent data bytes counter. */
+				txq->stats.obytes +=
+					sizeof(struct rte_vlan_hdr);
+#endif
+			} else {
+				if (room < tlen)
+					break;
+				dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
+							 dptr, dlen, olx);
+			}
+			tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
+			assert(room >= tlen);
+			room -= tlen;
+			/*
+			 * Packet data are completely inlined,
+			 * free the packet immediately.
+			 */
+			rte_pktmbuf_free_seg(loc->mbuf);
+			goto next_mbuf;
+pointer_empw:
+			/*
+			 * Not inlinable VLAN packets are
+			 * proceeded outside of this routine.
+			 */
+			assert(room >= MLX5_WQE_DSEG_SIZE);
+			if (MLX5_TXOFF_CONFIG(VLAN))
+				assert(!(loc->mbuf->ol_flags &
+					 PKT_TX_VLAN_PKT));
+			mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
+			/* We have to store mbuf in elts.*/
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+			room -= MLX5_WQE_DSEG_SIZE;
+			/* Ring buffer wraparound is checked at the loop end.*/
+			++dseg;
+next_mbuf:
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			/* Update sent data bytes counter. */
+			txq->stats.obytes += dlen;
+#endif
+			loc->pkts_sent++;
+			loc->elts_free--;
+			pkts_n--;
+			if (unlikely(!pkts_n || !loc->elts_free)) {
+				/*
+				 * We have no resources/packets to
+				 * continue build descriptors.
+				 */
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, olx);
+				return MLX5_TXCMP_CODE_EXIT;
+			}
+			/*
+			 * Fetch and classify the next packet before
+			 * deciding whether the session must be closed.
+			 * The loc->mbuf must never reference the packet
+			 * which is sent already (and possibly freed)
+			 * when the new eMPW session is started.
+			 */
+			loc->mbuf = *pkts++;
+			if (likely(pkts_n > 1))
+				rte_prefetch0(*pkts);
+			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+			/*
+			 * Unroll the completion code to avoid
+			 * returning variable value - it results in
+			 * unoptimized sequent checking in caller.
+			 */
+			if (ret == MLX5_TXCMP_CODE_MULTI) {
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_MULTI;
+			}
+			if (ret == MLX5_TXCMP_CODE_TSO) {
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_TSO;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE) {
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_SINGLE;
+			}
+			if (ret != MLX5_TXCMP_CODE_EMPW) {
+				assert(false);
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+			/*
+			 * Check if we have minimal room left. If not
+			 * the session is closed by the common code
+			 * below and the fetched packet opens the new one.
+			 */
+			if (room < MLX5_WQE_DSEG_SIZE)
+				break;
+			/*
+			 * Check whether packet parameters coincide
+			 * within assumed eMPW batch:
+			 * - check sum settings
+			 * - metadata value
+			 * - software parser settings
+			 */
+			if (!mlx5_tx_match_empw(txq, eseg, loc, olx))
+				break;
+			/* Packet attributes match, continue the same eMPW. */
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+		}
+		/*
+		 * We get here to close an existing eMPW
+		 * session and start the new one.
+		 */
+		assert(pkts_n);
+		part -= room;
+		if (unlikely(!part))
+			return MLX5_TXCMP_CODE_EXIT;
+		mlx5_tx_idone_empw(txq, loc, part, olx);
+		if (unlikely(!loc->elts_free ||
+			     !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		goto next_empw;
+	}
+	assert(false);
+}
+
+/**
+ * The routine sends packets with ordinary MLX5_OPCODE_SEND.
+ * Data inlining and VLAN insertion are supported.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
+			  struct rte_mbuf **restrict pkts,
+			  unsigned int pkts_n,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	/*
+	 * Subroutine is the part of mlx5_tx_burst_single()
+	 * and sends single-segment packet with SEND opcode.
+	 */
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		struct mlx5_wqe *restrict wqe;
+		enum mlx5_txcmp_code ret;
+
+		assert(NB_SEGS(loc->mbuf) == 1);
+		if (MLX5_TXOFF_CONFIG(INLINE)) {
+			unsigned int inlen, vlan = 0;
+
+			inlen = rte_pktmbuf_data_len(loc->mbuf);
+			if (MLX5_TXOFF_CONFIG(VLAN) &&
+			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+				vlan = sizeof(struct rte_vlan_hdr);
+				inlen += vlan;
+				static_assert((sizeof(struct rte_vlan_hdr) +
+					       sizeof(struct rte_ether_hdr)) ==
+					       MLX5_ESEG_MIN_INLINE_SIZE,
+					       "invalid min inline data size");
+			}
+			/*
+			 * If inlining is enabled at configuration time
+			 * the limit must be not less than minimal size.
+			 * Otherwise we would do extra check for data
+			 * size to avoid crashes due to length overflow.
+			 */
+			assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+			if (inlen <= txq->inlen_send) {
+				unsigned int seg_n, wqe_n;
+
+				rte_prefetch0(rte_pktmbuf_mtod
+						(loc->mbuf, uint8_t *));
+				/* Check against minimal length. */
+				if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
+					return MLX5_TXCMP_CODE_ERROR;
+				/*
+				 * Completely inlined packet data WQE:
+				 * - Control Segment, SEND opcode
+				 * - Ethernet Segment, no VLAN insertion
+				 * - Data inlined, VLAN optionally inserted
+				 * - Alignment to MLX5_WSEG_SIZE
+				 * Have to estimate amount of WQEBBs
+				 */
+				seg_n = (inlen + 3 * MLX5_WSEG_SIZE -
+					 MLX5_ESEG_MIN_INLINE_SIZE +
+					 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+				/* Check if there are enough WQEBBs. */
+				wqe_n = (seg_n + 3) / 4;
+				if (wqe_n > loc->wqe_free)
+					return MLX5_TXCMP_CODE_EXIT;
+				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+				loc->wqe_last = wqe;
+				mlx5_tx_cseg_init(txq, loc, wqe, seg_n,
+						  MLX5_OPCODE_SEND, olx);
+				mlx5_tx_eseg_data(txq, loc, wqe,
+						  vlan, inlen, 0, olx);
+				txq->wqe_ci += wqe_n;
+				loc->wqe_free -= wqe_n;
+				/*
+				 * Packet data are completely inlined,
+				 * free the packet immediately.
+				 */
+				rte_pktmbuf_free_seg(loc->mbuf);
+			} else if (!MLX5_TXOFF_CONFIG(EMPW) &&
+				   txq->inlen_mode) {
+				/*
+				 * If minimal inlining is requested the eMPW
+				 * feature should be disabled due to data is
+				 * inlined into Ethernet Segment, which can
+				 * not contain inlined data for eMPW due to
+				 * segment shared for all packets.
+				 */
+				struct mlx5_wqe_dseg *restrict dseg;
+				unsigned int ds;
+				uint8_t *dptr;
+
+				/*
+				 * The inline-mode settings require
+				 * to inline the specified amount of
+				 * data bytes to the Ethernet Segment.
+				 * We should check the free space in
+				 * WQE ring buffer to inline partially.
+				 */
+				assert(txq->inlen_send >= txq->inlen_mode);
+				assert(inlen > txq->inlen_mode);
+				assert(txq->inlen_mode >=
+						MLX5_ESEG_MIN_INLINE_SIZE);
+				/*
+				 * Check whether there are enough free WQEBBs:
+				 * - Control Segment
+				 * - Ethernet Segment
+				 * - First Segment of inlined Ethernet data
+				 * - ... data continued ...
+				 * - Finishing Data Segment of pointer type
+				 */
+				ds = (MLX5_WQE_CSEG_SIZE +
+				      MLX5_WQE_ESEG_SIZE +
+				      MLX5_WQE_DSEG_SIZE +
+				      txq->inlen_mode -
+				      MLX5_ESEG_MIN_INLINE_SIZE +
+				      MLX5_WQE_DSEG_SIZE +
+				      MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+				if (loc->wqe_free < ((ds + 3) / 4))
+					return MLX5_TXCMP_CODE_EXIT;
+				/*
+				 * Build the ordinary SEND WQE:
+				 * - Control Segment
+				 * - Ethernet Segment, inline inlen_mode bytes
+				 * - Data Segment of pointer type
+				 */
+				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+				loc->wqe_last = wqe;
+				mlx5_tx_cseg_init(txq, loc, wqe, ds,
+						  MLX5_OPCODE_SEND, olx);
+				dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan,
+							 txq->inlen_mode,
+							 0, olx);
+				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
+				       txq->inlen_mode - vlan;
+				mlx5_tx_dseg_ptr(txq, loc, dseg, dptr,
+						 inlen - txq->inlen_mode,
+						 olx);
+				/*
+				 * WQE is built, update the loop parameters
+				 * and go to the next packet.
+				 */
+				txq->wqe_ci += (ds + 3) / 4;
+				loc->wqe_free -= (ds + 3) / 4;
+				/* We have to store mbuf in elts.*/
+				assert(MLX5_TXOFF_CONFIG(INLINE));
+				txq->elts[txq->elts_head++ & txq->elts_m] =
+						loc->mbuf;
+				--loc->elts_free;
+			} else {
+				uint8_t *dptr;
+				unsigned int dlen;
+
+				/*
+				 * Partially inlined packet data WQE, we have
+				 * some space in title WQEBB, we can fill it
+				 * with some packet data. It takes one WQEBB,
+				 * it is available, no extra space check:
+				 * - Control Segment, SEND opcode
+				 * - Ethernet Segment, no VLAN insertion
+				 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data
+				 * - Data Segment, pointer type
+				 */
+				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+				loc->wqe_last = wqe;
+				mlx5_tx_cseg_init(txq, loc, wqe, 4,
+						  MLX5_OPCODE_SEND, olx);
+				mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx);
+				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
+				       MLX5_ESEG_MIN_INLINE_SIZE - vlan;
+				/*
+				 * The length check is performed above, by
+				 * comparing with txq->inlen_send. We should
+				 * not get overflow here.
+				 */
+				assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
+				dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
+				mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
+						 dptr, dlen, olx);
+				++txq->wqe_ci;
+				--loc->wqe_free;
+				/* We have to store mbuf in elts.*/
+				assert(MLX5_TXOFF_CONFIG(INLINE));
+				txq->elts[txq->elts_head++ & txq->elts_m] =
+						loc->mbuf;
+				--loc->elts_free;
+			}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			/* Count data and VLAN bytes from inlen, which is */
+			/* kept intact, the mbuf may be freed already. */
+			txq->stats.obytes += inlen;
+#endif
+		} else {
+			/*
+			 * No inline at all, it means the CPU cycles saving
+			 * is prioritized at configuration, we should not
+			 * copy any packet data to WQE.
+			 *
+			 * SEND WQE, one WQEBB:
+			 * - Control Segment, SEND opcode
+			 * - Ethernet Segment, optional VLAN, no inline
+			 * - Data Segment, pointer type
+			 */
+			wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+			loc->wqe_last = wqe;
+			mlx5_tx_cseg_init(txq, loc, wqe, 3,
+					  MLX5_OPCODE_SEND, olx);
+			mlx5_tx_eseg_none(txq, loc, wqe, olx);
+			/*
+			 * Some Tx offloads may cause an error if
+			 * packet is not long enough, check against
+			 * assumed minimal length.
+			 */
+			if (rte_pktmbuf_data_len(loc->mbuf) <=
+						 MLX5_ESEG_MIN_INLINE_SIZE)
+				return MLX5_TXCMP_CODE_ERROR;
+			mlx5_tx_dseg_ptr
+				(txq, loc, &wqe->dseg[0],
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 rte_pktmbuf_data_len(loc->mbuf), olx);
+			++txq->wqe_ci;
+			--loc->wqe_free;
+			/*
+			 * We should not store mbuf pointer in elts
+			 * if no inlining is configured, this is done
+			 * by calling routine in a batch copy.
+			 */
+			assert(!MLX5_TXOFF_CONFIG(INLINE));
+			--loc->elts_free;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			/* Update sent data bytes counter. */
+			txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf);
+			if (MLX5_TXOFF_CONFIG(VLAN) &&
+			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+				txq->stats.obytes +=
+					sizeof(struct rte_vlan_hdr);
+#endif
+		}
+		++loc->pkts_sent;
+		--pkts_n;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		loc->mbuf = *pkts++;
+		if (pkts_n > 1)
+			rte_prefetch0(*pkts);
+		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+		if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
+			return ret;
+	}
+	assert(false);
+}
+
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
+		     struct rte_mbuf **restrict pkts,
+		     unsigned int pkts_n,
+		     struct mlx5_txq_local *restrict loc,
+		     unsigned int olx)
+{
+	enum mlx5_txcmp_code ret;
+
+	ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
+	if (ret == MLX5_TXCMP_CODE_SINGLE)
+		goto ordinary_send;
+	assert(ret == MLX5_TXCMP_CODE_EMPW);
+	for (;;) {
+		/* Pick the eMPW builder by the INLINE configuration bit. */
+		ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
+			mlx5_tx_burst_empw_inline
+				(txq, pkts, pkts_n, loc, olx) :
+			mlx5_tx_burst_empw_simple
+				(txq, pkts, pkts_n, loc, olx);
+		if (ret != MLX5_TXCMP_CODE_SINGLE)
+			return ret;
+		/* SINGLE returned: resources for one packet should remain. */
+		assert(loc->elts_free && loc->wqe_free);
+ordinary_send:
+		ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
+		if (ret != MLX5_TXCMP_CODE_EMPW)
+			return ret;
+		/* EMPW returned: resources for one packet should remain. */
+		assert(loc->elts_free && loc->wqe_free);
+	}
+}
+
+/**
+ * DPDK Tx callback template. This is configured template
+ * used to generate routines optimized for specified offload setup.
+ * One of this generated functions is chosen at SQ configuration
+ * time.
+ *
+ * @param txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param olx
+ *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
+ *   values. Should be static to take compile time static configuration
+ *   advantages.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+static __rte_always_inline uint16_t
+mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
+		   struct rte_mbuf **restrict pkts,
+		   uint16_t pkts_n,
+		   unsigned int olx)
+{
+	struct mlx5_txq_local loc;
+	enum mlx5_txcmp_code ret;
+	unsigned int part;
+
+	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	/*
+	 * Check if there are some CQEs, if any:
+	 * - process an encountered errors
+	 * - process the completed WQEs
+	 * - free related mbufs
+	 * - doorbell the NIC about processed CQEs
+	 */
+	rte_prefetch0(*pkts);
+	mlx5_tx_handle_completion(txq, olx);
+	/*
+	 * Calculate the number of available resources - elts and WQEs.
+	 * There are two possible different scenarios:
+	 * - no data inlining into WQEs, one WQEBB may contains upto
+	 *   four packets, in this case elts become scarce resource
+	 * - data inlining into WQEs, one packet may require multiple
+	 *   WQEBBs, the WQEs become the limiting factor.
+	 */
+	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	loc.elts_free = txq->elts_s -
+				(uint16_t)(txq->elts_head - txq->elts_tail);
+	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	loc.wqe_free = txq->wqe_s -
+				(uint16_t)(txq->wqe_ci - txq->wqe_pi);
+	if (unlikely(!pkts_n || !loc.elts_free || !loc.wqe_free))
+		return 0;
+	loc.pkts_sent = 0;
+	loc.pkts_copy = 0;
+	loc.wqe_last = NULL;
+	for (;;) {
+		/*
+		 * Fetch the packet from array. Usually this is
+		 * the first packet in series of multi/single
+		 * segment packets.
+		 */
+		loc.mbuf = *(pkts + loc.pkts_sent);
+		/* Dedicated branch for multi-segment packets. */
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    unlikely(NB_SEGS(loc.mbuf) > 1)) {
+			/*
+			 * Multi-segment packet encountered.
+			 * Hardware is able to process it only
+			 * with SEND/TSO opcodes, one packet
+			 * per WQE, do it in dedicated routine.
+			 */
+enter_send_multi:
+			assert(loc.pkts_sent >= loc.pkts_copy);
+			part = loc.pkts_sent - loc.pkts_copy;
+			if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
+				/*
+				 * There are some single-segment mbufs not
+				 * stored in elts. The mbufs must be in the
+				 * same order as WQEs, so we must copy the
+				 * mbufs to elts here, before the coming
+				 * multi-segment packet mbufs is appended.
+				 */
+				mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy,
+						  part, olx);
+				loc.pkts_copy = loc.pkts_sent;
+			}
+			assert(pkts_n > loc.pkts_sent);
+			ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
+			if (!MLX5_TXOFF_CONFIG(INLINE))
+				loc.pkts_copy = loc.pkts_sent;
+			/*
+			 * These returned code checks are supposed
+			 * to be optimized out due to routine inlining.
+			 */
+			if (ret == MLX5_TXCMP_CODE_EXIT) {
+				/*
+				 * The routine returns this code when
+				 * all packets are sent or there is no
+				 * enough resources to complete request.
+				 */
+				break;
+			}
+			if (ret == MLX5_TXCMP_CODE_ERROR) {
+				/*
+				 * The routine returns this code when
+				 * some error in the incoming packets
+				 * format occurred.
+				 */
+				txq->stats.oerrors++;
+				break;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE) {
+				/*
+				 * The single-segment packet was encountered
+				 * in the array, try to send it with the
+				 * best optimized way, possible engaging eMPW.
+				 */
+				goto enter_send_single;
+			}
+			if (MLX5_TXOFF_CONFIG(TSO) &&
+			    ret == MLX5_TXCMP_CODE_TSO) {
+				/*
+				 * The single-segment TSO packet was
+				 * encountered in the array.
+				 */
+				goto enter_send_tso;
+			}
+			/* We must not get here. Something is going wrong. */
+			assert(false);
+			txq->stats.oerrors++;
+			break;
+		}
+		/* Dedicated branch for single-segment TSO packets. */
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) {
+			/*
+			 * TSO might require special way for inlining
+			 * (dedicated parameters) and is sent with
+			 * MLX5_OPCODE_TSO opcode only, provide this
+			 * in dedicated branch.
+			 */
+enter_send_tso:
+			assert(NB_SEGS(loc.mbuf) == 1);
+			assert(pkts_n > loc.pkts_sent);
+			ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
+			/*
+			 * These returned code checks are supposed
+			 * to be optimized out due to routine inlining.
+			 */
+			if (ret == MLX5_TXCMP_CODE_EXIT)
+				break;
+			if (ret == MLX5_TXCMP_CODE_ERROR) {
+				txq->stats.oerrors++;
+				break;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE)
+				goto enter_send_single;
+			if (MLX5_TXOFF_CONFIG(MULTI) &&
+			    ret == MLX5_TXCMP_CODE_MULTI) {
+				/*
+				 * The multi-segment packet was
+				 * encountered in the array.
+				 */
+				goto enter_send_multi;
+			}
+			/* We must not get here. Something is going wrong. */
+			assert(false);
+			txq->stats.oerrors++;
+			break;
+		}
+		/*
+		 * The dedicated branch for the single-segment packets
+		 * without TSO. Often these ones can be sent using
+		 * MLX5_OPCODE_EMPW with multiple packets in one WQE.
+		 * The routine builds the WQEs till it encounters
+		 * the TSO or multi-segment packet (in case if these
+		 * offloads are requested at SQ configuration time).
+		 */
+enter_send_single:
+		assert(pkts_n > loc.pkts_sent);
+		ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
+		/*
+		 * These returned code checks are supposed
+		 * to be optimized out due to routine inlining.
+		 */
+		if (ret == MLX5_TXCMP_CODE_EXIT)
+			break;
+		if (ret == MLX5_TXCMP_CODE_ERROR) {
+			txq->stats.oerrors++;
+			break;
+		}
+		if (ret == MLX5_TXCMP_CODE_SINGLE)
+			goto enter_send_single;
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    ret == MLX5_TXCMP_CODE_MULTI)
+			goto enter_send_multi;
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    ret == MLX5_TXCMP_CODE_TSO) {
+			/*
+			 * The single-segment TSO packet was
+			 * encountered in the array, the eMPW/SEND
+			 * routines hand it back with this code.
+			 */
+			goto enter_send_tso;
+		}
+		/* We must not get here. Something is going wrong. */
+		assert(false);
+		txq->stats.oerrors++;
+		break;
+	}
+	/*
+	 * Main Tx loop is completed, do the rest:
+	 * - set completion request if thresholds are reached
+	 * - doorbell the hardware
+	 * - copy the rest of mbufs to elts (if any)
+	 */
+	assert(MLX5_TXOFF_CONFIG(INLINE) || loc.pkts_sent >= loc.pkts_copy);
+	/* Take a shortcut if nothing is sent. */
+	if (unlikely(loc.pkts_sent == 0))
+		return 0;
+	/* Not all of the mbufs may be stored into elts yet. */
+	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
+	mlx5_tx_request_completion(txq, part, &loc, olx);
+	/*
+	 * Ring QP doorbell immediately after WQE building completion
+	 * to improve latencies. The pure software related data treatment
+	 * can be completed after doorbell. Tx CQEs for this SQ are
+	 * processed in this thread only by the polling.
+	 */
+	mlx5_tx_dbrec(txq, loc.wqe_last);
+	if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
+		/*
+		 * There are some single-segment mbufs not stored in elts.
+		 * It can be only if last packet was single-segment.
+		 * The copying is gathered into one place due to it is
+		 * a good opportunity to optimize that with SIMD.
+		 * Unfortunately if inlining is enabled the gaps in
+		 * pointer array may happen due to early freeing of the
+		 * inlined mbufs.
+		 */
+		mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
+	}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Increment sent packets counter. */
+	txq->stats.opackets += loc.pkts_sent;
+#endif
+	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	return loc.pkts_sent;
 }
 
 /* Generate routines with Enhanced Multi-Packet Write support. */
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index d8c6f35..4439e88 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -354,8 +354,9 @@ struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
 void mlx5_set_ptype_table(void);
 void mlx5_set_cksum_table(void);
 void mlx5_set_swp_types_table(void);
-__rte_noinline uint16_t mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
-					volatile struct mlx5_err_cqe *err_cqe);
+__rte_noinline uint16_t mlx5_tx_error_cqe_handle
+				(struct mlx5_txq_data *restrict txq,
+				 volatile struct mlx5_err_cqe *err_cqe);
 uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
 void mlx5_rxq_initialize(struct mlx5_rxq_data *rxq);
 __rte_noinline int mlx5_rx_err_handle(struct mlx5_rxq_data *rxq,
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH 7/7] net/mlx5: add minimal required Tx data inline
  2019-07-04 16:29 [dpdk-dev] [PATCH 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                   ` (5 preceding siblings ...)
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 6/7] net/mlx5: implement Tx burst template Viacheslav Ovsiienko
@ 2019-07-04 16:29 ` Viacheslav Ovsiienko
  6 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-04 16:29 UTC (permalink / raw)
  To: dev; +Cc: yskoh

Tx packet data may be inlined into the transmit descriptor.
Under some circumstances ConnectX NICs may require data to be
inlined for correct operation. The exact data amount may
depend on NIC operation mode, requested Tx offloads,
E-Switch configuration, etc.

The number of data bytes to inline may be specified with
devargs key "txq_inline_min". If this key is present the
specified value (may be aligned by the driver in order
not to exceed the limits and provide better descriptor
space utilization) will be used by the driver and it
is guaranteed the requested data bytes are inlined into
the descriptor beside other inline settings.

If "txq_inline_min" key is not present the value may
be queried by the driver from the NIC via DevX if this
feature is available. This patch provides the implementation
of this query.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c           |  93 +++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5.h           |   4 ++
 drivers/net/mlx5/mlx5_defs.h      |  18 ++++++
 drivers/net/mlx5/mlx5_devx_cmds.c | 100 ++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_prm.h       | 121 +++++++++++++++++++++++++++++++++++++-
 5 files changed, 334 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index e803f08..ce3a62b 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1045,6 +1045,97 @@ struct mlx5_dev_spawn_data {
 }
 
 /**
+ * Configures the minimal amount of data to inline into WQE
+ * while sending packets.
+ *
+ * - the txq_inline_min has the maximal priority, if this
+ *   key is specified in devargs
+ * - if DevX is enabled the inline mode is queried from the
+ *   device (HCA attributes and NIC vport context if needed).
+ * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4LX
+ *   and none (0 bytes) for other NICs
+ *
+ * The result is stored into config->txq_inline_min and reported
+ * with DEBUG log level. The value explicitly set by application
+ * is not altered. The inline modes which are not listed in the
+ * switch statements fall through to the PCI device ID check.
+ *
+ * @param spawn
+ *   Verbs device parameters (name, port, switch_info) to spawn.
+ *   The routine reads spawn->pci_dev->id.device_id only.
+ * @param config
+ *   Device configuration parameters.
+ */
+static void
+mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
+		    struct mlx5_dev_config *config)
+{
+	if (config->txq_inline_min != MLX5_ARG_UNSET) {
+		/* Application defines size of inlined data explicitly. */
+		goto exit;
+	}
+	if (config->hca_attr.eth_net_offloads) {
+		/* We have DevX enabled, inline mode queried successfully. */
+		switch (config->hca_attr.wqe_inline_mode) {
+		case MLX5_CAP_INLINE_MODE_L2:
+			/* outer L2 header must be inlined. */
+			config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
+			goto exit;
+		case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+			/* No inline data are required by NIC. */
+			config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
+			goto exit;
+		case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+			/* inline mode is defined by NIC vport context. */
+			if (!config->hca_attr.eth_virt)
+				break;
+			switch (config->hca_attr.vport_inline_mode) {
+			case MLX5_INLINE_MODE_NONE:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_NONE;
+				goto exit;
+			case MLX5_INLINE_MODE_L2:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_L2;
+				goto exit;
+			case MLX5_INLINE_MODE_IP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_L3;
+				goto exit;
+			case MLX5_INLINE_MODE_TCP_UDP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_L4;
+				goto exit;
+			case MLX5_INLINE_MODE_INNER_L2:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_INNER_L2;
+				goto exit;
+			case MLX5_INLINE_MODE_INNER_IP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_INNER_L3;
+				goto exit;
+			case MLX5_INLINE_MODE_INNER_TCP_UDP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_INNER_L4;
+				goto exit;
+			}
+		}
+	}
+	/*
+	 * We get here if we are unable to deduce
+	 * inline data size with DevX. Try PCI ID
+	 * to determine old NICs.
+	 */
+	switch (spawn->pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+		config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
+		break;
+	default:
+		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
+		break;
+	}
+exit:
+	DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
+}
+
+/**
  * Spawn an Ethernet device from Verbs information.
  *
  * @param dpdk_dev
@@ -1529,6 +1620,8 @@ struct mlx5_dev_spawn_data {
 #else
 	config.dv_esw_en = 0;
 #endif
+	/* Detect minimal data bytes to inline. */
+	mlx5_set_min_inline(spawn, &config);
 	/* Store device configuration on private structure. */
 	priv->config = config;
 	if (config.dv_flow_en) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index ac7ea1d..db462bc 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -160,6 +160,10 @@ struct mlx5_devx_counter_set {
 /* HCA attributes. */
 struct mlx5_hca_attr {
 	uint32_t eswitch_manager:1;
+	uint32_t eth_net_offloads:1;
+	uint32_t eth_virt:1;
+	uint32_t wqe_inline_mode:2;
+	uint32_t vport_inline_mode:3;
 };
 
 /* Flow list . */
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 873a595..8c118d5 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -60,6 +60,24 @@
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
 #define MLX5_MAX_TSO_HEADER (128u + 34u)
 
+/* Inline data size required by NICs. */
+#define MLX5_INLINE_HSIZE_NONE 0
+#define MLX5_INLINE_HSIZE_L2 (sizeof(struct rte_ether_hdr) + \
+			      sizeof(struct rte_vlan_hdr))
+#define MLX5_INLINE_HSIZE_L3 (MLX5_INLINE_HSIZE_L2 + \
+			      sizeof(struct rte_ipv6_hdr))
+#define MLX5_INLINE_HSIZE_L4 (MLX5_INLINE_HSIZE_L3 + \
+			      sizeof(struct rte_tcp_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L2 (MLX5_INLINE_HSIZE_L3 + \
+				    sizeof(struct rte_udp_hdr) + \
+				    sizeof(struct rte_vxlan_hdr) + \
+				    sizeof(struct rte_ether_hdr) + \
+				    sizeof(struct rte_vlan_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L3 (MLX5_INLINE_HSIZE_INNER_L2 + \
+				    sizeof(struct rte_ipv6_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L4 (MLX5_INLINE_HSIZE_INNER_L3 + \
+				    sizeof(struct rte_tcp_hdr))
+
 /* Threshold of buffer replenishment for vectorized Rx. */
 #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
 	(RTE_MIN(MLX5_VPMD_RX_MAX_BURST, (unsigned int)(n) >> 2))
diff --git a/drivers/net/mlx5/mlx5_devx_cmds.c b/drivers/net/mlx5/mlx5_devx_cmds.c
index e5776c4..de470a6 100644
--- a/drivers/net/mlx5/mlx5_devx_cmds.c
+++ b/drivers/net/mlx5/mlx5_devx_cmds.c
@@ -107,6 +107,59 @@ int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj)
 }
 
 /**
+ * Query NIC vport context.
+ * Currently fills minimal inline attribute.
+ *
+ * @param[in] ctx
+ *   ibv contexts returned from mlx5dv_open_device.
+ * @param[in] vport
+ *   vport index
+ * @param[out] attr
+ *   Attributes device values.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+static int
+mlx5_devx_cmd_query_nic_vport_context(struct ibv_context *ctx,
+				      unsigned int vport,
+				      struct mlx5_hca_attr *attr)
+{
+	uint32_t in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0};
+	uint32_t out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {0};
+	void *vctx;
+	int status, syndrome, rc;
+
+	/* Query NIC vport context to determine inline mode. */
+	MLX5_SET(query_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
+	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
+	if (vport)
+		MLX5_SET(query_nic_vport_context_in, in, other_vport, 1);
+	rc = mlx5_glue->devx_general_cmd(ctx,
+					 in, sizeof(in),
+					 out, sizeof(out));
+	if (rc)
+		goto error;
+	status = MLX5_GET(query_nic_vport_context_out, out, status);
+	syndrome = MLX5_GET(query_nic_vport_context_out, out, syndrome);
+	if (status) {
+		DRV_LOG(DEBUG, "Failed to query NIC vport context, "
+			"status %x, syndrome = %x",
+			status, syndrome);
+		return -1;
+	}
+	vctx = MLX5_ADDR_OF(query_nic_vport_context_out, out,
+			    nic_vport_context);
+	attr->vport_inline_mode = MLX5_GET(nic_vport_context, vctx,
+					   min_wqe_inline_mode);
+	return 0;
+error:
+	rc = (rc > 0) ? -rc : rc;
+	return rc;
+}
+
+/**
  * Query HCA attributes.
  * Using those attributes we can check on run time if the device
  * is having the required capabilities.
@@ -136,7 +189,7 @@ int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj)
 	rc = mlx5_glue->devx_general_cmd(ctx,
 					 in, sizeof(in), out, sizeof(out));
 	if (rc)
-		return rc;
+		goto error;
 	status = MLX5_GET(query_hca_cap_out, out, status);
 	syndrome = MLX5_GET(query_hca_cap_out, out, syndrome);
 	if (status) {
@@ -147,5 +200,50 @@ int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj)
 	}
 	hcattr = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
 	attr->eswitch_manager = MLX5_GET(cmd_hca_cap, hcattr, eswitch_manager);
+	attr->eth_net_offloads = MLX5_GET(cmd_hca_cap, hcattr,
+					  eth_net_offloads);
+	attr->eth_virt = MLX5_GET(cmd_hca_cap, hcattr, eth_virt);
+	if (!attr->eth_net_offloads)
+		return 0;
+
+	/* Query HCA offloads for Ethernet protocol. */
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+	MLX5_SET(query_hca_cap_in, in, op_mod,
+		 MLX5_GET_HCA_CAP_OP_MOD_ETHERNET |
+		 MLX5_HCA_CAP_OPMOD_GET_CUR);
+
+	rc = mlx5_glue->devx_general_cmd(ctx,
+					 in, sizeof(in),
+					 out, sizeof(out));
+	if (rc) {
+		attr->eth_net_offloads = 0;
+		goto error;
+	}
+	status = MLX5_GET(query_hca_cap_out, out, status);
+	syndrome = MLX5_GET(query_hca_cap_out, out, syndrome);
+	if (status) {
+		DRV_LOG(DEBUG, "Failed to query devx HCA capabilities, "
+			"status %x, syndrome = %x",
+			status, syndrome);
+		attr->eth_net_offloads = 0;
+		return -1;
+	}
+	hcattr = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
+	attr->wqe_inline_mode = MLX5_GET(eth_offload_cap, hcattr,
+					 wqe_inline_mode);
+	if (attr->wqe_inline_mode != MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
+		return 0;
+	if (attr->eth_virt) {
+		rc = mlx5_devx_cmd_query_nic_vport_context(ctx, 0, attr);
+		if (rc) {
+			attr->eth_virt = 0;
+			goto error;
+		}
+	}
 	return 0;
+error:
+	rc = (rc > 0) ? -rc : rc;
+	return rc;
 }
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 471a3e3..765b3f1 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -600,6 +600,7 @@ enum {
 
 enum {
 	MLX5_CMD_OP_QUERY_HCA_CAP = 0x100,
+	MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754,
 	MLX5_CMD_OP_ALLOC_FLOW_COUNTER = 0x939,
 	MLX5_CMD_OP_QUERY_FLOW_COUNTER = 0x93b,
 };
@@ -664,7 +665,8 @@ struct mlx5_ifc_query_flow_counter_in_bits {
 
 enum {
 	MLX5_GET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0 << 1,
-	MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP        = 0xc << 1,
+	MLX5_GET_HCA_CAP_OP_MOD_ETHERNET = 0x1 << 1,
+	MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP = 0xc << 1,
 };
 
 enum {
@@ -672,6 +674,23 @@ enum {
 	MLX5_HCA_CAP_OPMOD_GET_CUR   = 1,
 };
 
+enum {
+	MLX5_CAP_INLINE_MODE_L2,
+	MLX5_CAP_INLINE_MODE_VPORT_CONTEXT,
+	MLX5_CAP_INLINE_MODE_NOT_REQUIRED,
+};
+
+enum {
+	MLX5_INLINE_MODE_NONE,
+	MLX5_INLINE_MODE_L2,
+	MLX5_INLINE_MODE_IP,
+	MLX5_INLINE_MODE_TCP_UDP,
+	MLX5_INLINE_MODE_RESERVED4,
+	MLX5_INLINE_MODE_INNER_L2,
+	MLX5_INLINE_MODE_INNER_IP,
+	MLX5_INLINE_MODE_INNER_TCP_UDP,
+};
+
 struct mlx5_ifc_cmd_hca_cap_bits {
 	u8 reserved_at_0[0x30];
 	u8 vhca_id[0x10];
@@ -942,6 +961,42 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8 reserved_at_61f[0x1e1];
 };
 
+struct mlx5_ifc_eth_offload_cap_bits {
+	u8 csum_cap[0x1];
+	u8 vlan_cap[0x1];
+	u8 lro_cap[0x1];
+	u8 lro_psh_flag[0x1];
+	u8 lro_time_stamp[0x1];
+	u8 reserved_at_5[0x2];
+	u8 wqe_vlan_insert[0x1];
+	u8 self_lb_en_modifiable[0x1];
+	u8 reserved_at_9[0x2];
+	u8 max_lso_cap[0x5];
+	u8 multi_pkt_send_wqe[0x2];
+	u8 wqe_inline_mode[0x2];
+	u8 rss_ind_tbl_cap[0x4];
+	u8 reg_umr_sq[0x1];
+	u8 scatter_fcs[0x1];
+	u8 enhanced_multi_pkt_send_wqe[0x1];
+	u8 tunnel_lso_const_out_ip_id[0x1];
+	u8 reserved_at_1c[0x2];
+	u8 tunnel_stateless_gre[0x1];
+	u8 tunnel_stateless_vxlan[0x1];
+	u8 swp[0x1];
+	u8 swp_csum[0x1];
+	u8 swp_lso[0x1];
+	u8 reserved_at_23[0xd];
+	u8 max_vxlan_udp_ports[0x8];
+	u8 reserved_at_38[0x6];
+	u8 max_geneve_opt_len[0x1];
+	u8 tunnel_stateless_geneve_rx[0x1];
+	u8 reserved_at_40[0x10];
+	u8 lro_min_mss_size[0x10];
+	u8 reserved_at_60[0x120];
+	u8 lro_timer_supported_periods[4][0x20];
+	u8 reserved_at_200[0x600];
+};
+
 struct mlx5_ifc_qos_cap_bits {
 	u8 packet_pacing[0x1];
 	u8 esw_scheduling[0x1];
@@ -969,6 +1024,7 @@ struct mlx5_ifc_qos_cap_bits {
 
 union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap;
+	struct mlx5_ifc_eth_offload_cap_bits eth_offload_cap;
 	struct mlx5_ifc_qos_cap_bits qos_cap;
 	u8 reserved_at_0[0x8000];
 };
@@ -989,6 +1045,69 @@ struct mlx5_ifc_query_hca_cap_in_bits {
 	u8 reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_mac_address_layout_bits {
+	u8 reserved_at_0[0x10];
+	u8 mac_addr_47_32[0x10];
+	u8 mac_addr_31_0[0x20];
+};
+
+struct mlx5_ifc_nic_vport_context_bits {
+	u8 reserved_at_0[0x5];
+	u8 min_wqe_inline_mode[0x3];
+	u8 reserved_at_8[0x15];
+	u8 disable_mc_local_lb[0x1];
+	u8 disable_uc_local_lb[0x1];
+	u8 roce_en[0x1];
+	u8 arm_change_event[0x1];
+	u8 reserved_at_21[0x1a];
+	u8 event_on_mtu[0x1];
+	u8 event_on_promisc_change[0x1];
+	u8 event_on_vlan_change[0x1];
+	u8 event_on_mc_address_change[0x1];
+	u8 event_on_uc_address_change[0x1];
+	u8 reserved_at_40[0xc];
+	u8 affiliation_criteria[0x4];
+	u8 affiliated_vhca_id[0x10];
+	u8 reserved_at_60[0xd0];
+	u8 mtu[0x10];
+	u8 system_image_guid[0x40];
+	u8 port_guid[0x40];
+	u8 node_guid[0x40];
+	u8 reserved_at_200[0x140];
+	u8 qkey_violation_counter[0x10];
+	u8 reserved_at_350[0x430];
+	u8 promisc_uc[0x1];
+	u8 promisc_mc[0x1];
+	u8 promisc_all[0x1];
+	u8 reserved_at_783[0x2];
+	u8 allowed_list_type[0x3];
+	u8 reserved_at_788[0xc];
+	u8 allowed_list_size[0xc];
+	struct mlx5_ifc_mac_address_layout_bits permanent_address;
+	u8 reserved_at_7e0[0x20];
+};
+
+struct mlx5_ifc_query_nic_vport_context_out_bits {
+	u8 status[0x8];
+	u8 reserved_at_8[0x18];
+	u8 syndrome[0x20];
+	u8 reserved_at_40[0x40];
+	struct mlx5_ifc_nic_vport_context_bits nic_vport_context;
+};
+
+struct mlx5_ifc_query_nic_vport_context_in_bits {
+	u8 opcode[0x10];
+	u8 reserved_at_10[0x10];
+	u8 reserved_at_20[0x10];
+	u8 op_mod[0x10];
+	u8 other_vport[0x1];
+	u8 reserved_at_41[0xf];
+	u8 vport_number[0x10];
+	u8 reserved_at_60[0x5];
+	u8 allowed_list_type[0x3];
+	u8 reserved_at_68[0x18];
+};
+
 /* CQE format mask. */
 #define MLX5E_CQE_FORMAT_MASK 0xc
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v2 0/7] net/mlx5: consolidate Tx datapath
  2019-07-04 16:29 ` [dpdk-dev] [PATCH 1/7] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
@ 2019-07-15 13:59   ` Viacheslav Ovsiienko
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 1/7] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
                       ` (7 more replies)
  0 siblings, 8 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-15 13:59 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patchset introduces a new implementation of the tx_burst
routine of the mlx5 PMD. The existing implementation is based on
several branches of tx_burst routines optimized for the most
common sets of Tx hardware offloads supported by Mellanox NICs.

It was not easy to update, support and develop this kind
of code - multiple branches impose multiple points to process.
Also, many frequently requested offload combinations are not
yet supported in the most efficient way. That leads to selecting
a tx_burst routine that does not match completely, which harms performance.

The new Tx datapath:
  - introduces a unified template for the tx_burst routine to generate
    at compile time the most efficient instances, tuned for specified
    sets of hardware offloads
  - the best matching instance is chosen at run time during sending queue
    configuration
  - almost all possible sets of Tx offloads are supported
  - the best possible method to send each packet is chosen at run time
  - introduces new devargs for more accurate tuning of data inlining

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

---
v2: 
  - minor performance issues resolved
  - typos in comments and docs

v1: http://patches.dpdk.org/patch/56091/

Viacheslav Ovsiienko (7):
  net/mlx5: remove Tx datapath implementation
  net/mlx5: add Tx datapath related devargs
  net/mlx5: update Tx datapath definitions
  net/mlx5: add Tx datapath configuration and setup
  net/mlx5: introduce Tx burst routine template
  net/mlx5: implement Tx burst template
  net/mlx5: add minimal required Tx data inline

 doc/guides/nics/mlx5.rst              |  168 +-
 drivers/net/mlx5/mlx5.c               |  155 +-
 drivers/net/mlx5/mlx5.h               |   13 +-
 drivers/net/mlx5/mlx5_defs.h          |   36 +-
 drivers/net/mlx5/mlx5_devx_cmds.c     |  100 +-
 drivers/net/mlx5/mlx5_ethdev.c        |   58 -
 drivers/net/mlx5/mlx5_prm.h           |  355 ++-
 drivers/net/mlx5/mlx5_rxtx.c          | 5280 ++++++++++++++++++++++-----------
 drivers/net/mlx5/mlx5_rxtx.h          |  333 +--
 drivers/net/mlx5/mlx5_rxtx_vec.c      |  175 --
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |  289 --
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  284 --
 drivers/net/mlx5/mlx5_txq.c           |  242 +-
 13 files changed, 4413 insertions(+), 3075 deletions(-)

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v2 1/7] net/mlx5: remove Tx datapath implementation
  2019-07-15 13:59   ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
@ 2019-07-15 13:59     ` Viacheslav Ovsiienko
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 2/7] net/mlx5: add Tx datapath related devargs Viacheslav Ovsiienko
                       ` (6 subsequent siblings)
  7 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-15 13:59 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch removes the existing Tx datapath code
as a preparation step before introducing the new
implementation. The following entities are being
removed:

- obsolete devargs
- tx_burst() routines
- related PRM definitions
- SQ configuration code
- Tx routine selection code
- incompatible Tx completion code

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c               |   43 -
 drivers/net/mlx5/mlx5.h               |    5 -
 drivers/net/mlx5/mlx5_defs.h          |   16 -
 drivers/net/mlx5/mlx5_ethdev.c        |   58 --
 drivers/net/mlx5/mlx5_prm.h           |   77 --
 drivers/net/mlx5/mlx5_rxtx.c          | 1434 +--------------------------------
 drivers/net/mlx5/mlx5_rxtx.h          |  273 -------
 drivers/net/mlx5/mlx5_rxtx_vec.c      |  175 ----
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |  289 -------
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  284 -------
 drivers/net/mlx5/mlx5_txq.c           |  110 +--
 11 files changed, 35 insertions(+), 2729 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d93f92d..42b36a7 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -68,33 +68,15 @@
 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
 
-/* Device parameter to configure inline send. */
-#define MLX5_TXQ_INLINE "txq_inline"
-
 /*
  * Device parameter to configure the number of TX queues threshold for
  * enabling inline send.
  */
 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
 
-/*
- * Device parameter to configure the number of TX queues threshold for
- * enabling vectorized Tx.
- */
-#define MLX5_TXQS_MAX_VEC "txqs_max_vec"
-
 /* Device parameter to enable multi-packet send WQEs. */
 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
 
-/* Device parameter to include 2 dsegs in the title WQEBB. */
-#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
-
-/* Device parameter to limit the size of inlining packet. */
-#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
-
-/* Device parameter to enable hardware Tx vector. */
-#define MLX5_TX_VEC_EN "tx_vec_en"
-
 /* Device parameter to enable hardware Rx vector. */
 #define MLX5_RX_VEC_EN "rx_vec_en"
 
@@ -902,20 +884,10 @@ struct mlx5_dev_spawn_data {
 		config->mprq.max_memcpy_len = tmp;
 	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
 		config->mprq.min_rxqs_num = tmp;
-	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
-		config->txq_inline = tmp;
 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
 		config->txqs_inline = tmp;
-	} else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
-		config->txqs_vec = tmp;
 	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
 		config->mps = !!tmp;
-	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
-		config->mpw_hdr_dseg = !!tmp;
-	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
-		config->inline_max_packet_sz = tmp;
-	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
-		config->tx_vec_en = !!tmp;
 	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
 		config->rx_vec_en = !!tmp;
 	} else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
@@ -960,13 +932,8 @@ struct mlx5_dev_spawn_data {
 		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
 		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
 		MLX5_RXQS_MIN_MPRQ,
-		MLX5_TXQ_INLINE,
 		MLX5_TXQS_MIN_INLINE,
-		MLX5_TXQS_MAX_VEC,
 		MLX5_TXQ_MPW_EN,
-		MLX5_TXQ_MPW_HDR_DSEG_EN,
-		MLX5_TXQ_MAX_INLINE_LEN,
-		MLX5_TX_VEC_EN,
 		MLX5_RX_VEC_EN,
 		MLX5_L3_VXLAN_EN,
 		MLX5_VF_NL_EN,
@@ -1914,12 +1881,8 @@ struct mlx5_dev_spawn_data {
 	dev_config = (struct mlx5_dev_config){
 		.hw_padding = 0,
 		.mps = MLX5_ARG_UNSET,
-		.tx_vec_en = 1,
 		.rx_vec_en = 1,
-		.txq_inline = MLX5_ARG_UNSET,
 		.txqs_inline = MLX5_ARG_UNSET,
-		.txqs_vec = MLX5_ARG_UNSET,
-		.inline_max_packet_sz = MLX5_ARG_UNSET,
 		.vf_nl_en = 1,
 		.mr_ext_memseg_en = 1,
 		.mprq = {
@@ -1932,9 +1895,6 @@ struct mlx5_dev_spawn_data {
 	};
 	/* Device specific configuration. */
 	switch (pci_dev->id.device_id) {
-	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF:
-		dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS_BLUEFIELD;
-		break;
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
@@ -1944,9 +1904,6 @@ struct mlx5_dev_spawn_data {
 	default:
 		break;
 	}
-	/* Set architecture-dependent default value if unset. */
-	if (dev_config.txqs_vec == MLX5_ARG_UNSET)
-		dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS;
 	for (i = 0; i != ns; ++i) {
 		uint32_t restore;
 
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 5af3f41..b8986fc 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -189,9 +189,7 @@ struct mlx5_dev_config {
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
-	unsigned int tx_vec_en:1; /* Tx vector is enabled. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
-	unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
 	unsigned int mr_ext_memseg_en:1;
 	/* Whether memseg should be extended for MR creation. */
 	unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */
@@ -215,10 +213,7 @@ struct mlx5_dev_config {
 	unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. */
 	unsigned int ind_table_max_size; /* Maximum indirection table size. */
 	unsigned int max_dump_files_num; /* Maximum dump files per queue. */
-	int txq_inline; /* Maximum packet size for inlining. */
 	int txqs_inline; /* Queue number threshold for inlining. */
-	int txqs_vec; /* Queue number threshold for vectorized Tx. */
-	int inline_max_packet_sz; /* Max packet size for inlining. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 };
 
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 13801a5..6861304 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -60,15 +60,6 @@
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
 #define MLX5_MAX_TSO_HEADER 192
 
-/* Default maximum number of Tx queues for vectorized Tx. */
-#if defined(RTE_ARCH_ARM64)
-#define MLX5_VPMD_MAX_TXQS 8
-#define MLX5_VPMD_MAX_TXQS_BLUEFIELD 16
-#else
-#define MLX5_VPMD_MAX_TXQS 4
-#define MLX5_VPMD_MAX_TXQS_BLUEFIELD MLX5_VPMD_MAX_TXQS
-#endif
-
 /* Threshold of buffer replenishment for vectorized Rx. */
 #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
 	(RTE_MIN(MLX5_VPMD_RX_MAX_BURST, (unsigned int)(n) >> 2))
@@ -76,13 +67,6 @@
 /* Maximum size of burst for vectorized Rx. */
 #define MLX5_VPMD_RX_MAX_BURST 64U
 
-/*
- * Maximum size of burst for vectorized Tx. This is related to the maximum size
- * of Enhanced MPW (eMPW) WQE as vectorized Tx is supported with eMPW.
- * Careful when changing, large value can cause WQE DS to overlap.
- */
-#define MLX5_VPMD_TX_MAX_BURST        32U
-
 /* Number of packets vectorized Rx can simultaneously process in a loop. */
 #define MLX5_VPMD_DESCS_PER_LOOP      4
 
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index eeefe4d..3c3253d 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1577,64 +1577,6 @@ int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
 }
 
 /**
- * Configure the TX function to use.
- *
- * @param dev
- *   Pointer to private data structure.
- *
- * @return
- *   Pointer to selected Tx burst function.
- */
-eth_tx_burst_t
-mlx5_select_tx_function(struct rte_eth_dev *dev)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
-	struct mlx5_dev_config *config = &priv->config;
-	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
-	int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
-				    DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
-				    DEV_TX_OFFLOAD_GRE_TNL_TSO |
-				    DEV_TX_OFFLOAD_IP_TNL_TSO |
-				    DEV_TX_OFFLOAD_UDP_TNL_TSO));
-	int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
-				    DEV_TX_OFFLOAD_UDP_TNL_TSO |
-				    DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM));
-	int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);
-
-	assert(priv != NULL);
-	/* Select appropriate TX function. */
-	if (vlan_insert || tso || swp)
-		return tx_pkt_burst;
-	if (config->mps == MLX5_MPW_ENHANCED) {
-		if (mlx5_check_vec_tx_support(dev) > 0) {
-			if (mlx5_check_raw_vec_tx_support(dev) > 0)
-				tx_pkt_burst = mlx5_tx_burst_raw_vec;
-			else
-				tx_pkt_burst = mlx5_tx_burst_vec;
-			DRV_LOG(DEBUG,
-				"port %u selected enhanced MPW Tx vectorized"
-				" function",
-				dev->data->port_id);
-		} else {
-			tx_pkt_burst = mlx5_tx_burst_empw;
-			DRV_LOG(DEBUG,
-				"port %u selected enhanced MPW Tx function",
-				dev->data->port_id);
-		}
-	} else if (config->mps && (config->txq_inline > 0)) {
-		tx_pkt_burst = mlx5_tx_burst_mpw_inline;
-		DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
-			dev->data->port_id);
-	} else if (config->mps) {
-		tx_pkt_burst = mlx5_tx_burst_mpw;
-		DRV_LOG(DEBUG, "port %u selected MPW Tx function",
-			dev->data->port_id);
-	}
-	return tx_pkt_burst;
-}
-
-/**
  * Configure the RX function to use.
  *
  * @param dev
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index fe171f1..ff5dfbb 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -39,32 +39,12 @@
 /* Invalidate a CQE. */
 #define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
 
-/* Maximum number of packets a multi-packet WQE can handle. */
-#define MLX5_MPW_DSEG_MAX 5
-
 /* WQE DWORD size */
 #define MLX5_WQE_DWORD_SIZE 16
 
 /* WQE size */
 #define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
 
-/* Max size of a WQE session. */
-#define MLX5_WQE_SIZE_MAX 960U
-
-/* Compute the number of DS. */
-#define MLX5_WQE_DS(n) \
-	(((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)
-
-/* Room for inline data in multi-packet WQE. */
-#define MLX5_MWQE64_INL_DATA 28
-
-/* Default minimum number of Tx queues for inlining packets. */
-#define MLX5_EMPW_MIN_TXQS 8
-
-/* Default max packet length to be inlined. */
-#define MLX5_EMPW_MAX_INLINE_LEN (4U * MLX5_WQE_SIZE)
-
-
 #define MLX5_OPC_MOD_ENHANCED_MPSW 0
 #define MLX5_OPCODE_ENHANCED_MPSW 0x29
 
@@ -164,47 +144,11 @@ enum mlx5_completion_mode {
 	MLX5_COMP_CQE_AND_EQE = 0x3,
 };
 
-/* Subset of struct mlx5_wqe_eth_seg. */
-struct mlx5_wqe_eth_seg_small {
-	uint32_t rsvd0;
-	uint8_t	cs_flags;
-	uint8_t	rsvd1;
-	uint16_t mss;
-	uint32_t flow_table_metadata;
-	uint16_t inline_hdr_sz;
-	uint8_t inline_hdr[2];
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
-struct mlx5_wqe_inl_small {
-	uint32_t byte_cnt;
-	uint8_t raw;
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
-struct mlx5_wqe_ctrl {
-	uint32_t ctrl0;
-	uint32_t ctrl1;
-	uint32_t ctrl2;
-	uint32_t ctrl3;
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
 /* Small common part of the WQE. */
 struct mlx5_wqe {
 	uint32_t ctrl[4];
-	struct mlx5_wqe_eth_seg_small eseg;
-};
-
-/* Vectorize WQE header. */
-struct mlx5_wqe_v {
-	rte_v128u32_t ctrl;
-	rte_v128u32_t eseg;
 };
 
-/* WQE. */
-struct mlx5_wqe64 {
-	struct mlx5_wqe hdr;
-	uint8_t raw[32];
-} __rte_aligned(MLX5_WQE_SIZE);
-
 /* MPW mode. */
 enum mlx5_mpw_mode {
 	MLX5_MPW_DISABLED,
@@ -212,27 +156,6 @@ enum mlx5_mpw_mode {
 	MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2. */
 };
 
-/* MPW session status. */
-enum mlx5_mpw_state {
-	MLX5_MPW_STATE_OPENED,
-	MLX5_MPW_INL_STATE_OPENED,
-	MLX5_MPW_ENHANCED_STATE_OPENED,
-	MLX5_MPW_STATE_CLOSED,
-};
-
-/* MPW session descriptor. */
-struct mlx5_mpw {
-	enum mlx5_mpw_state state;
-	unsigned int pkts_n;
-	unsigned int len;
-	unsigned int total_len;
-	volatile struct mlx5_wqe *wqe;
-	union {
-		volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX];
-		volatile uint8_t *raw;
-	} data;
-};
-
 /* WQE for Multi-Packet RQ. */
 struct mlx5_wqe_mprq {
 	struct mlx5_wqe_srq_next_seg next_seg;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index c1dc8c4..f2d6918 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -288,140 +288,6 @@
 }
 
 /**
- * Return the size of tailroom of WQ.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param addr
- *   Pointer to tail of WQ.
- *
- * @return
- *   Size of tailroom.
- */
-static inline size_t
-tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
-{
-	size_t tailroom;
-	tailroom = (uintptr_t)(txq->wqes) +
-		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
-		   (uintptr_t)addr;
-	return tailroom;
-}
-
-/**
- * Copy data to tailroom of circular queue.
- *
- * @param dst
- *   Pointer to destination.
- * @param src
- *   Pointer to source.
- * @param n
- *   Number of bytes to copy.
- * @param base
- *   Pointer to head of queue.
- * @param tailroom
- *   Size of tailroom from dst.
- *
- * @return
- *   Pointer after copied data.
- */
-static inline void *
-mlx5_copy_to_wq(void *dst, const void *src, size_t n,
-		void *base, size_t tailroom)
-{
-	void *ret;
-
-	if (n > tailroom) {
-		rte_memcpy(dst, src, tailroom);
-		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
-			   n - tailroom);
-		ret = (uint8_t *)base + n - tailroom;
-	} else {
-		rte_memcpy(dst, src, n);
-		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
-	}
-	return ret;
-}
-
-/**
- * Inline TSO headers into WQE.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-inline_tso(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
-	   uint32_t *length,
-	   uintptr_t *addr,
-	   uint16_t *pkt_inline_sz,
-	   uint8_t **raw,
-	   uint16_t *max_wqe,
-	   uint16_t *tso_segsz,
-	   uint16_t *tso_header_sz)
-{
-	uintptr_t end = (uintptr_t)(((uintptr_t)txq->wqes) +
-				    (1 << txq->wqe_n) * MLX5_WQE_SIZE);
-	unsigned int copy_b;
-	uint8_t vlan_sz = (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
-	const uint8_t tunneled = txq->tunnel_en && (buf->ol_flags &
-				 PKT_TX_TUNNEL_MASK);
-	uint16_t n_wqe;
-
-	*tso_segsz = buf->tso_segsz;
-	*tso_header_sz = buf->l2_len + vlan_sz + buf->l3_len + buf->l4_len;
-	if (unlikely(*tso_segsz == 0 || *tso_header_sz == 0)) {
-		txq->stats.oerrors++;
-		return -EINVAL;
-	}
-	if (tunneled)
-		*tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
-	/* First seg must contain all TSO headers. */
-	if (unlikely(*tso_header_sz > MLX5_MAX_TSO_HEADER) ||
-		     *tso_header_sz > DATA_LEN(buf)) {
-		txq->stats.oerrors++;
-		return -EINVAL;
-	}
-	copy_b = *tso_header_sz - *pkt_inline_sz;
-	if (!copy_b || ((end - (uintptr_t)*raw) < copy_b))
-		return -EAGAIN;
-	n_wqe = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
-	if (unlikely(*max_wqe < n_wqe))
-		return -EINVAL;
-	*max_wqe -= n_wqe;
-	rte_memcpy((void *)*raw, (void *)*addr, copy_b);
-	*length -= copy_b;
-	*addr += copy_b;
-	copy_b = MLX5_WQE_DS(copy_b) * MLX5_WQE_DWORD_SIZE;
-	*pkt_inline_sz += copy_b;
-	*raw += copy_b;
-	return 0;
-}
-
-/**
- * DPDK callback to check the status of a tx descriptor.
- *
- * @param tx_queue
- *   The tx queue.
- * @param[in] offset
- *   The index of the descriptor in the ring.
- *
- * @return
- *   The status of the tx descriptor.
- */
-int
-mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
-{
-	struct mlx5_txq_data *txq = tx_queue;
-	uint16_t used;
-
-	mlx5_tx_complete(txq);
-	used = txq->elts_head - txq->elts_tail;
-	if (offset < used)
-		return RTE_ETH_TX_DESC_FULL;
-	return RTE_ETH_TX_DESC_DONE;
-}
-
-/**
  * Internal function to compute the number of used descriptors in an RX queue
  *
  * @param rxq
@@ -655,7 +521,7 @@
 						    (1 << txq->cqe_n));
 			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
 						    (const void *)((uintptr_t)
-						    tx_mlx5_wqe(txq, 0)),
+						    txq->wqes),
 						    MLX5_WQE_SIZE *
 						    (1 << txq->wqe_n));
 			txq_ctrl->dump_file_n++;
@@ -683,1247 +549,6 @@
 }
 
 /**
- * DPDK callback for TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	unsigned int k = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
-	unsigned int segs_n = 0;
-	const unsigned int max_inline = txq->max_inline;
-	uint64_t addr_64;
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch first packet cacheline. */
-	rte_prefetch0(*pkts);
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *pkts; /* First_seg. */
-		uint8_t *raw;
-		volatile struct mlx5_wqe_v *wqe = NULL;
-		volatile rte_v128u32_t *dseg = NULL;
-		uint32_t length;
-		unsigned int ds = 0;
-		unsigned int sg = 0; /* counter of additional segs attached. */
-		uintptr_t addr;
-		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
-		uint16_t tso_header_sz = 0;
-		uint16_t ehdr;
-		uint8_t cs_flags;
-		uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
-		uint32_t swp_offsets = 0;
-		uint8_t swp_types = 0;
-		rte_be32_t metadata;
-		uint16_t tso_segsz = 0;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		uint32_t total_length = 0;
-#endif
-		int ret;
-
-		segs_n = buf->nb_segs;
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		max_elts -= segs_n;
-		sg = --segs_n;
-		if (unlikely(--max_wqe == 0))
-			break;
-		wqe = (volatile struct mlx5_wqe_v *)
-			tx_mlx5_wqe(txq, txq->wqe_ci);
-		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-		if (pkts_n - i > 1)
-			rte_prefetch0(*(pkts + 1));
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		length = DATA_LEN(buf);
-		ehdr = (((uint8_t *)addr)[1] << 8) |
-		       ((uint8_t *)addr)[0];
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		total_length = length;
-#endif
-		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
-			txq->stats.oerrors++;
-			break;
-		}
-		/* Update element. */
-		(*txq->elts)[elts_head & elts_m] = buf;
-		/* Prefetch next buffer data. */
-		if (pkts_n - i > 1)
-			rte_prefetch0(
-			    rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types);
-		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Replace the Ethernet type by the VLAN if necessary. */
-		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
-							 buf->vlan_tci);
-			unsigned int len = 2 * RTE_ETHER_ADDR_LEN - 2;
-
-			addr += 2;
-			length -= 2;
-			/* Copy Destination and source mac address. */
-			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
-			/* Copy VLAN. */
-			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
-			/* Copy missing two bytes to end the DSeg. */
-			memcpy((uint8_t *)raw + len + sizeof(vlan),
-			       ((uint8_t *)addr) + len, 2);
-			addr += len + 2;
-			length -= (len + 2);
-		} else {
-			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
-			       MLX5_WQE_DWORD_SIZE);
-			length -= pkt_inline_sz;
-			addr += pkt_inline_sz;
-		}
-		raw += MLX5_WQE_DWORD_SIZE;
-		if (tso) {
-			ret = inline_tso(txq, buf, &length,
-					 &addr, &pkt_inline_sz,
-					 &raw, &max_wqe,
-					 &tso_segsz, &tso_header_sz);
-			if (ret == -EINVAL) {
-				break;
-			} else if (ret == -EAGAIN) {
-				/* NOP WQE. */
-				wqe->ctrl = (rte_v128u32_t){
-					rte_cpu_to_be_32(txq->wqe_ci << 8),
-					rte_cpu_to_be_32(txq->qp_num_8s | 1),
-					rte_cpu_to_be_32
-						(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-					0,
-				};
-				ds = 1;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-				total_length = 0;
-#endif
-				k++;
-				goto next_wqe;
-			}
-		}
-		/* Inline if enough room. */
-		if (max_inline || tso) {
-			uint32_t inl = 0;
-			uintptr_t end = (uintptr_t)
-				(((uintptr_t)txq->wqes) +
-				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
-			unsigned int inline_room = max_inline *
-						   RTE_CACHE_LINE_SIZE -
-						   (pkt_inline_sz - 2) -
-						   !!tso * sizeof(inl);
-			uintptr_t addr_end;
-			unsigned int copy_b;
-
-pkt_inline:
-			addr_end = RTE_ALIGN_FLOOR(addr + inline_room,
-						   RTE_CACHE_LINE_SIZE);
-			copy_b = (addr_end > addr) ?
-				 RTE_MIN((addr_end - addr), length) : 0;
-			if (copy_b && ((end - (uintptr_t)raw) >
-				       (copy_b + sizeof(inl)))) {
-				/*
-				 * One Dseg remains in the current WQE.  To
-				 * keep the computation positive, it is
-				 * removed after the bytes to Dseg conversion.
-				 */
-				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
-
-				if (unlikely(max_wqe < n))
-					break;
-				max_wqe -= n;
-				if (tso) {
-					assert(inl == 0);
-					inl = rte_cpu_to_be_32(copy_b |
-							       MLX5_INLINE_SEG);
-					rte_memcpy((void *)raw,
-						   (void *)&inl, sizeof(inl));
-					raw += sizeof(inl);
-					pkt_inline_sz += sizeof(inl);
-				}
-				rte_memcpy((void *)raw, (void *)addr, copy_b);
-				addr += copy_b;
-				length -= copy_b;
-				pkt_inline_sz += copy_b;
-			}
-			/*
-			 * 2 DWORDs consumed by the WQE header + ETH segment +
-			 * the size of the inline part of the packet.
-			 */
-			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
-			if (length > 0) {
-				if (ds % (MLX5_WQE_SIZE /
-					  MLX5_WQE_DWORD_SIZE) == 0) {
-					if (unlikely(--max_wqe == 0))
-						break;
-					dseg = (volatile rte_v128u32_t *)
-					       tx_mlx5_wqe(txq, txq->wqe_ci +
-							   ds / 4);
-				} else {
-					dseg = (volatile rte_v128u32_t *)
-						((uintptr_t)wqe +
-						 (ds * MLX5_WQE_DWORD_SIZE));
-				}
-				goto use_dseg;
-			} else if (!segs_n) {
-				goto next_pkt;
-			} else {
-				/*
-				 * Further inline the next segment only for
-				 * non-TSO packets.
-				 */
-				if (!tso) {
-					raw += copy_b;
-					inline_room -= copy_b;
-				} else {
-					inline_room = 0;
-				}
-				/* Move to the next segment. */
-				--segs_n;
-				buf = buf->next;
-				assert(buf);
-				addr = rte_pktmbuf_mtod(buf, uintptr_t);
-				length = DATA_LEN(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-				total_length += length;
-#endif
-				(*txq->elts)[++elts_head & elts_m] = buf;
-				goto pkt_inline;
-			}
-		} else {
-			/*
-			 * No inline has been done in the packet, only the
-			 * Ethernet Header has been stored.
-			 */
-			dseg = (volatile rte_v128u32_t *)
-				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
-			ds = 3;
-use_dseg:
-			/* Add the remaining packet as a simple ds. */
-			addr_64 = rte_cpu_to_be_64(addr);
-			*dseg = (rte_v128u32_t){
-				rte_cpu_to_be_32(length),
-				mlx5_tx_mb2mr(txq, buf),
-				addr_64,
-				addr_64 >> 32,
-			};
-			++ds;
-			if (!segs_n)
-				goto next_pkt;
-		}
-next_seg:
-		assert(buf);
-		assert(ds);
-		assert(wqe);
-		/*
-		 * Spill on next WQE when the current one does not have
-		 * enough room left. Size of WQE must be a multiple
-		 * of data segment size.
-		 */
-		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
-		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
-			if (unlikely(--max_wqe == 0))
-				break;
-			dseg = (volatile rte_v128u32_t *)
-			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
-			rte_prefetch0(tx_mlx5_wqe(txq,
-						  txq->wqe_ci + ds / 4 + 1));
-		} else {
-			++dseg;
-		}
-		++ds;
-		buf = buf->next;
-		assert(buf);
-		length = DATA_LEN(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		total_length += length;
-#endif
-		/* Store segment information. */
-		addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
-		*dseg = (rte_v128u32_t){
-			rte_cpu_to_be_32(length),
-			mlx5_tx_mb2mr(txq, buf),
-			addr_64,
-			addr_64 >> 32,
-		};
-		(*txq->elts)[++elts_head & elts_m] = buf;
-		if (--segs_n)
-			goto next_seg;
-next_pkt:
-		if (ds > MLX5_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		++elts_head;
-		++pkts;
-		++i;
-		j += sg;
-		/* Initialize known and common part of the WQE structure. */
-		if (tso) {
-			wqe->ctrl = (rte_v128u32_t){
-				rte_cpu_to_be_32((txq->wqe_ci << 8) |
-						 MLX5_OPCODE_TSO),
-				rte_cpu_to_be_32(txq->qp_num_8s | ds),
-				rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-				0,
-			};
-			wqe->eseg = (rte_v128u32_t){
-				swp_offsets,
-				cs_flags | (swp_types << 8) |
-				(rte_cpu_to_be_16(tso_segsz) << 16),
-				metadata,
-				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
-			};
-		} else {
-			wqe->ctrl = (rte_v128u32_t){
-				rte_cpu_to_be_32((txq->wqe_ci << 8) |
-						 MLX5_OPCODE_SEND),
-				rte_cpu_to_be_32(txq->qp_num_8s | ds),
-				rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-				0,
-			};
-			wqe->eseg = (rte_v128u32_t){
-				swp_offsets,
-				cs_flags | (swp_types << 8),
-				metadata,
-				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
-			};
-		}
-next_wqe:
-		txq->wqe_ci += (ds + 3) / 4;
-		/* Save the last successful WQE for completion request */
-		last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += total_length;
-#endif
-	} while (i < pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely((i + k) == 0))
-		return 0;
-	txq->elts_head += (i + j);
-	/* Check whether completion threshold has been reached. */
-	comp = txq->elts_comp + i + j + k;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		last_wqe->ctrl2 = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						   MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		last_wqe->ctrl3 = txq->elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
-	return i;
-}
-
-/**
- * Open a MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param length
- *   Packet length.
- */
-static inline void
-mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
-		(volatile struct mlx5_wqe_data_seg (*)[])
-		tx_mlx5_wqe(txq, idx + 1);
-
-	mpw->state = MLX5_MPW_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->len = length;
-	mpw->total_len = 0;
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
-	mpw->wqe->eseg.inline_hdr_sz = 0;
-	mpw->wqe->eseg.rsvd0 = 0;
-	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.flow_table_metadata = 0;
-	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
-					     (txq->wqe_ci << 8) |
-					     MLX5_OPCODE_TSO);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
-		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
-	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
-		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
-	mpw->data.dseg[2] = &(*dseg)[0];
-	mpw->data.dseg[3] = &(*dseg)[1];
-	mpw->data.dseg[4] = &(*dseg)[2];
-}
-
-/**
- * Close a MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- */
-static inline void
-mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	unsigned int num = mpw->pkts_n;
-
-	/*
-	 * Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	if (num < 3)
-		++txq->wqe_ci;
-	else
-		txq->wqe_ci += 2;
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-}
-
-/**
- * DPDK callback for TX with MPW support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch the WQEs at wqe_ci and wqe_ci + 1. */
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uint32_t length;
-		unsigned int segs_n = buf->nb_segs;
-		uint32_t cs_flags;
-		rte_be32_t metadata;
-
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		/* Do not bother with large packets MPW cannot handle. */
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		max_elts -= segs_n;
-		--pkts_n;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		assert(length);
-		/* Start new session if packet differs. */
-		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
-		    ((mpw.len != length) ||
-		     (segs_n != 1) ||
-		     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-		     (mpw.wqe->eseg.cs_flags != cs_flags)))
-			mlx5_mpw_close(txq, &mpw);
-		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
-			/*
-			 * Multi-Packet WQE consumes at most two WQE.
-			 * mlx5_mpw_new() expects to be able to use such
-			 * resources.
-			 */
-			if (unlikely(max_wqe < 2))
-				break;
-			max_wqe -= 2;
-			mlx5_mpw_new(txq, &mpw, length);
-			mpw.wqe->eseg.cs_flags = cs_flags;
-			mpw.wqe->eseg.flow_table_metadata = metadata;
-		}
-		/* Multi-segment packets must be alone in their MPW. */
-		assert((segs_n == 1) || (mpw.pkts_n == 0));
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-		length = 0;
-#endif
-		do {
-			volatile struct mlx5_wqe_data_seg *dseg;
-			uintptr_t addr;
-
-			assert(buf);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			dseg = mpw.data.dseg[mpw.pkts_n];
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			*dseg = (struct mlx5_wqe_data_seg){
-				.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
-				.lkey = mlx5_tx_mb2mr(txq, buf),
-				.addr = rte_cpu_to_be_64(addr),
-			};
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-			length += DATA_LEN(buf);
-#endif
-			buf = buf->next;
-			++mpw.pkts_n;
-			++j;
-		} while (--segs_n);
-		assert(length == mpw.len);
-		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
-			mlx5_mpw_close(txq, &mpw);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	/* "j" includes both packets and segments. */
-	comp = txq->elts_comp + j;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	if (mpw.state == MLX5_MPW_STATE_OPENED)
-		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * Open a MPW inline session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param length
- *   Packet length.
- */
-static inline void
-mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
-		    uint32_t length)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-	struct mlx5_wqe_inl_small *inl;
-
-	mpw->state = MLX5_MPW_INL_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->len = length;
-	mpw->total_len = 0;
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
-					     (txq->wqe_ci << 8) |
-					     MLX5_OPCODE_TSO);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
-	mpw->wqe->eseg.inline_hdr_sz = 0;
-	mpw->wqe->eseg.cs_flags = 0;
-	mpw->wqe->eseg.rsvd0 = 0;
-	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.flow_table_metadata = 0;
-	inl = (struct mlx5_wqe_inl_small *)
-		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
-	mpw->data.raw = (uint8_t *)&inl->raw;
-}
-
-/**
- * Close a MPW inline session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- */
-static inline void
-mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	unsigned int size;
-	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
-		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
-
-	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
-	/*
-	 * Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
-					     MLX5_WQE_DS(size));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
-	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
-}
-
-/**
- * DPDK callback for TX with MPW inline support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
-			 uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-	/*
-	 * Compute the maximum number of WQE which can be consumed by inline
-	 * code.
-	 * - 2 DSEG for:
-	 *   - 1 control segment,
-	 *   - 1 Ethernet segment,
-	 * - N Dseg from the inline request.
-	 */
-	const unsigned int wqe_inl_n =
-		((2 * MLX5_WQE_DWORD_SIZE +
-		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
-		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch the WQEs at wqe_ci and wqe_ci + 1. */
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uintptr_t addr;
-		uint32_t length;
-		unsigned int segs_n = buf->nb_segs;
-		uint8_t cs_flags;
-		rte_be32_t metadata;
-
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		/* Do not bother with large packets MPW cannot handle. */
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		max_elts -= segs_n;
-		--pkts_n;
-		/*
-		 * Compute max_wqe in case less WQE were consumed in previous
-		 * iteration.
-		 */
-		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		/* Start new session if packet differs. */
-		if (mpw.state == MLX5_MPW_STATE_OPENED) {
-			if ((mpw.len != length) ||
-			    (segs_n != 1) ||
-			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags))
-				mlx5_mpw_close(txq, &mpw);
-		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
-			if ((mpw.len != length) ||
-			    (segs_n != 1) ||
-			    (length > inline_room) ||
-			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
-				mlx5_mpw_inline_close(txq, &mpw);
-				inline_room =
-					txq->max_inline * RTE_CACHE_LINE_SIZE;
-			}
-		}
-		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
-			if ((segs_n != 1) ||
-			    (length > inline_room)) {
-				/*
-				 * Multi-Packet WQE consumes at most two WQE.
-				 * mlx5_mpw_new() expects to be able to use
-				 * such resources.
-				 */
-				if (unlikely(max_wqe < 2))
-					break;
-				max_wqe -= 2;
-				mlx5_mpw_new(txq, &mpw, length);
-				mpw.wqe->eseg.cs_flags = cs_flags;
-				mpw.wqe->eseg.flow_table_metadata = metadata;
-			} else {
-				if (unlikely(max_wqe < wqe_inl_n))
-					break;
-				max_wqe -= wqe_inl_n;
-				mlx5_mpw_inline_new(txq, &mpw, length);
-				mpw.wqe->eseg.cs_flags = cs_flags;
-				mpw.wqe->eseg.flow_table_metadata = metadata;
-			}
-		}
-		/* Multi-segment packets must be alone in their MPW. */
-		assert((segs_n == 1) || (mpw.pkts_n == 0));
-		if (mpw.state == MLX5_MPW_STATE_OPENED) {
-			assert(inline_room ==
-			       txq->max_inline * RTE_CACHE_LINE_SIZE);
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-			length = 0;
-#endif
-			do {
-				volatile struct mlx5_wqe_data_seg *dseg;
-
-				assert(buf);
-				(*txq->elts)[elts_head++ & elts_m] = buf;
-				dseg = mpw.data.dseg[mpw.pkts_n];
-				addr = rte_pktmbuf_mtod(buf, uintptr_t);
-				*dseg = (struct mlx5_wqe_data_seg){
-					.byte_count =
-					       rte_cpu_to_be_32(DATA_LEN(buf)),
-					.lkey = mlx5_tx_mb2mr(txq, buf),
-					.addr = rte_cpu_to_be_64(addr),
-				};
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-				length += DATA_LEN(buf);
-#endif
-				buf = buf->next;
-				++mpw.pkts_n;
-				++j;
-			} while (--segs_n);
-			assert(length == mpw.len);
-			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
-				mlx5_mpw_close(txq, &mpw);
-		} else {
-			unsigned int max;
-
-			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
-			assert(length <= inline_room);
-			assert(length == DATA_LEN(buf));
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			/* Maximum number of bytes before wrapping. */
-			max = ((((uintptr_t)(txq->wqes)) +
-				(1 << txq->wqe_n) *
-				MLX5_WQE_SIZE) -
-			       (uintptr_t)mpw.data.raw);
-			if (length > max) {
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)addr,
-					   max);
-				mpw.data.raw = (volatile void *)txq->wqes;
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)(addr + max),
-					   length - max);
-				mpw.data.raw += length - max;
-			} else {
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)addr,
-					   length);
-
-				if (length == max)
-					mpw.data.raw =
-						(volatile void *)txq->wqes;
-				else
-					mpw.data.raw += length;
-			}
-			++mpw.pkts_n;
-			mpw.total_len += length;
-			++j;
-			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
-				mlx5_mpw_inline_close(txq, &mpw);
-				inline_room =
-					txq->max_inline * RTE_CACHE_LINE_SIZE;
-			} else {
-				inline_room -= length;
-			}
-		}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	/* "j" includes both packets and segments. */
-	comp = txq->elts_comp + j;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
-		mlx5_mpw_inline_close(txq, &mpw);
-	else if (mpw.state == MLX5_MPW_STATE_OPENED)
-		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * Open an Enhanced MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param padding
- *   Non-zero to pad the title WQEBB with zero-length inline headers.
- */
-static inline void
-mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-
-	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->total_len = sizeof(struct mlx5_wqe);
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->ctrl[0] =
-		rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
-				 (txq->wqe_ci << 8) |
-				 MLX5_OPCODE_ENHANCED_MPSW);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
-	if (unlikely(padding)) {
-		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
-
-		/* Pad the first 2 DWORDs with zero-length inline header. */
-		*(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
-		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
-			rte_cpu_to_be_32(MLX5_INLINE_SEG);
-		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
-		/* Start from the next WQEBB. */
-		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
-	} else {
-		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
-	}
-}
-
-/**
- * Close an Enhanced MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- *
- * @return
- *   Number of consumed WQEs.
- */
-static inline uint16_t
-mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	uint16_t ret;
-
-	/* Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
-					     MLX5_WQE_DS(mpw->total_len));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
-	txq->wqe_ci += ret;
-	return ret;
-}
-
-/**
- * TX with Enhanced MPW support.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	       uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
-	unsigned int mpw_room = 0;
-	unsigned int inl_pad = 0;
-	uint32_t inl_hdr;
-	uint64_t addr_64;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uintptr_t addr;
-		unsigned int do_inline = 0; /* Whether inline is possible. */
-		uint32_t length;
-		uint8_t cs_flags;
-		rte_be32_t metadata;
-
-		/* Multi-segmented packet is handled in slow-path outside. */
-		assert(NB_SEGS(buf) == 1);
-		/* Make sure there is enough room to store this packet. */
-		if (max_elts - j == 0)
-			break;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		/* Start new session if:
-		 * - multi-segment packet
-		 * - no space left even for a dseg
-		 * - next packet can be inlined with a new WQE
-		 * - cs_flag differs
-		 */
-		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
-			if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
-			     mpw_room) ||
-			    (length <= txq->inline_max_packet_sz &&
-			     inl_pad + sizeof(inl_hdr) + length >
-			     mpw_room) ||
-			     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags))
-				max_wqe -= mlx5_empw_close(txq, &mpw);
-		}
-		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
-			/* In Enhanced MPW, inline as much as the budget is
-			 * allowed. The remaining space is to be filled with
-			 * dsegs. If the title WQEBB isn't padded, it will have
-			 * 2 dsegs there.
-			 */
-			mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
-					   (max_inline ? max_inline :
-					    pkts_n * MLX5_WQE_DWORD_SIZE) +
-					   MLX5_WQE_SIZE);
-			if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
-				break;
-			/* Don't pad the title WQEBB to not waste WQ. */
-			mlx5_empw_new(txq, &mpw, 0);
-			mpw_room -= mpw.total_len;
-			inl_pad = 0;
-			do_inline = length <= txq->inline_max_packet_sz &&
-				    sizeof(inl_hdr) + length <= mpw_room &&
-				    !txq->mpw_hdr_dseg;
-			mpw.wqe->eseg.cs_flags = cs_flags;
-			mpw.wqe->eseg.flow_table_metadata = metadata;
-		} else {
-			/* Evaluate whether the next packet can be inlined.
-			 * Inlining is possible when:
-			 * - length is less than configured value
-			 * - length fits for remaining space
-			 * - not required to fill the title WQEBB with dsegs
-			 */
-			do_inline =
-				length <= txq->inline_max_packet_sz &&
-				inl_pad + sizeof(inl_hdr) + length <=
-				 mpw_room &&
-				(!txq->mpw_hdr_dseg ||
-				 mpw.total_len >= MLX5_WQE_SIZE);
-		}
-		if (max_inline && do_inline) {
-			/* Inline packet into WQE. */
-			unsigned int max;
-
-			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
-			assert(length == DATA_LEN(buf));
-			inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			mpw.data.raw = (volatile void *)
-				((uintptr_t)mpw.data.raw + inl_pad);
-			max = tx_mlx5_wq_tailroom(txq,
-					(void *)(uintptr_t)mpw.data.raw);
-			/* Copy inline header. */
-			mpw.data.raw = (volatile void *)
-				mlx5_copy_to_wq(
-					  (void *)(uintptr_t)mpw.data.raw,
-					  &inl_hdr,
-					  sizeof(inl_hdr),
-					  (void *)(uintptr_t)txq->wqes,
-					  max);
-			max = tx_mlx5_wq_tailroom(txq,
-					(void *)(uintptr_t)mpw.data.raw);
-			/* Copy packet data. */
-			mpw.data.raw = (volatile void *)
-				mlx5_copy_to_wq(
-					  (void *)(uintptr_t)mpw.data.raw,
-					  (void *)addr,
-					  length,
-					  (void *)(uintptr_t)txq->wqes,
-					  max);
-			++mpw.pkts_n;
-			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
-			/* No need to get completion as the entire packet is
-			 * copied to WQ. Free the buf right away.
-			 */
-			rte_pktmbuf_free_seg(buf);
-			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
-			/* Add pad in the next packet if any. */
-			inl_pad = (((uintptr_t)mpw.data.raw +
-					(MLX5_WQE_DWORD_SIZE - 1)) &
-					~(MLX5_WQE_DWORD_SIZE - 1)) -
-				  (uintptr_t)mpw.data.raw;
-		} else {
-			/* No inline. Load a dseg of packet pointer. */
-			volatile rte_v128u32_t *dseg;
-
-			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
-			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
-			assert(length == DATA_LEN(buf));
-			if (!tx_mlx5_wq_tailroom(txq,
-					(void *)((uintptr_t)mpw.data.raw
-						+ inl_pad)))
-				dseg = (volatile void *)txq->wqes;
-			else
-				dseg = (volatile void *)
-					((uintptr_t)mpw.data.raw +
-					 inl_pad);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
-								    uintptr_t));
-			*dseg = (rte_v128u32_t) {
-				rte_cpu_to_be_32(length),
-				mlx5_tx_mb2mr(txq, buf),
-				addr_64,
-				addr_64 >> 32,
-			};
-			mpw.data.raw = (volatile void *)(dseg + 1);
-			mpw.total_len += (inl_pad + sizeof(*dseg));
-			++j;
-			++mpw.pkts_n;
-			mpw_room -= (inl_pad + sizeof(*dseg));
-			inl_pad = 0;
-		}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (i < pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
-			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
-			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-		txq->mpw_comp = txq->wqe_ci;
-	} else {
-		txq->elts_comp += j;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
-		mlx5_empw_close(txq, &mpw);
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * DPDK callback for TX with Enhanced MPW support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint16_t n;
-		uint16_t ret;
-
-		n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
-		if (n) {
-			ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
-			if (!ret)
-				break;
-			nb_tx += ret;
-		}
-		n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
-		if (n) {
-			ret = txq_burst_empw(txq, &pkts[nb_tx], n);
-			if (!ret)
-				break;
-			nb_tx += ret;
-		}
-	}
-	return nb_tx;
-}
-
-/**
  * Translate RX completion flags to packet type.
  *
  * @param[in] rxq
@@ -2867,22 +1492,6 @@
  */
 
 __rte_weak uint16_t
-mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused,
-		      struct rte_mbuf **pkts __rte_unused,
-		      uint16_t pkts_n __rte_unused)
-{
-	return 0;
-}
-
-__rte_weak uint16_t
-mlx5_tx_burst_vec(void *dpdk_txq __rte_unused,
-		  struct rte_mbuf **pkts __rte_unused,
-		  uint16_t pkts_n __rte_unused)
-{
-	return 0;
-}
-
-__rte_weak uint16_t
 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
 		  struct rte_mbuf **pkts __rte_unused,
 		  uint16_t pkts_n __rte_unused)
@@ -2891,25 +1500,50 @@
 }
 
 __rte_weak int
-mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
+mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
 {
 	return -ENOTSUP;
 }
 
 __rte_weak int
-mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
+mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
 {
 	return -ENOTSUP;
 }
 
-__rte_weak int
-mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
+/**
+ * DPDK callback to check the status of a tx descriptor.
+ *
+ * @param tx_queue
+ *   The tx queue.
+ * @param[in] offset
+ *   The index of the descriptor in the ring.
+ *
+ * @return
+ *   The status of the tx descriptor.
+ */
+int
+mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
 {
-	return -ENOTSUP;
+	(void)tx_queue;
+	(void)offset;
+	return RTE_ETH_TX_DESC_FULL;
 }
 
-__rte_weak int
-mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
+/**
+ * Configure the TX function to use.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ *
+ * @return
+ *   Pointer to selected Tx burst function.
+ */
+eth_tx_burst_t
+mlx5_select_tx_function(struct rte_eth_dev *dev)
 {
-	return -ENOTSUP;
+	(void)dev;
+	return removed_tx_burst;
 }
+
+
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3d79c18..acde09d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -329,14 +329,6 @@ struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
 void mlx5_set_ptype_table(void);
 void mlx5_set_cksum_table(void);
 void mlx5_set_swp_types_table(void);
-uint16_t mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
-		       uint16_t pkts_n);
-uint16_t mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts,
-			   uint16_t pkts_n);
-uint16_t mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
-				  uint16_t pkts_n);
-uint16_t mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts,
-			    uint16_t pkts_n);
 __rte_noinline uint16_t mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
 					volatile struct mlx5_err_cqe *err_cqe);
 uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
@@ -360,14 +352,8 @@ int mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
 			const struct mlx5_mp_arg_queue_state_modify *sm);
 
 /* Vectorized version of mlx5_rxtx.c */
-int mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev);
-int mlx5_check_vec_tx_support(struct rte_eth_dev *dev);
 int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);
 int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);
-uint16_t mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-			       uint16_t pkts_n);
-uint16_t mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-			   uint16_t pkts_n);
 uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
 			   uint16_t pkts_n);
 
@@ -478,122 +464,6 @@ enum mlx5_cqe_status {
 }
 
 /**
- * Return the address of the WQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param  wqe_ci
- *   WQE consumer index.
- *
- * @return
- *   WQE address.
- */
-static inline uintptr_t *
-tx_mlx5_wqe(struct mlx5_txq_data *txq, uint16_t ci)
-{
-	ci &= ((1 << txq->wqe_n) - 1);
-	return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
-}
-
-/**
- * Handle the next CQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- *
- * @return
- *   The last Tx buffer element to free.
- */
-static __rte_always_inline uint16_t
-mlx5_tx_cqe_handle(struct mlx5_txq_data *txq)
-{
-	const unsigned int cqe_n = 1 << txq->cqe_n;
-	const unsigned int cqe_cnt = cqe_n - 1;
-	uint16_t last_elts;
-	union {
-		volatile struct mlx5_cqe *cqe;
-		volatile struct mlx5_err_cqe *err_cqe;
-	} u = {
-		.cqe =  &(*txq->cqes)[txq->cq_ci & cqe_cnt],
-	};
-	int ret = check_cqe(u.cqe, cqe_n, txq->cq_ci);
-
-	if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
-		if (unlikely(ret == MLX5_CQE_STATUS_ERR))
-			last_elts = mlx5_tx_error_cqe_handle(txq, u.err_cqe);
-		else
-			/* Do not release buffers. */
-			return txq->elts_tail;
-	} else {
-		uint16_t new_wqe_pi = rte_be_to_cpu_16(u.cqe->wqe_counter);
-		volatile struct mlx5_wqe_ctrl *ctrl =
-				(volatile struct mlx5_wqe_ctrl *)
-					tx_mlx5_wqe(txq, new_wqe_pi);
-
-		/* Release completion burst buffers. */
-		last_elts = ctrl->ctrl3;
-		txq->wqe_pi = new_wqe_pi;
-		txq->cq_ci++;
-	}
-	rte_compiler_barrier();
-	*txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
-	return last_elts;
-}
-
-/**
- * Manage TX completions.
- *
- * When sending a burst, mlx5_tx_burst() posts several WRs.
- *
- * @param txq
- *   Pointer to TX queue structure.
- */
-static __rte_always_inline void
-mlx5_tx_complete(struct mlx5_txq_data *txq)
-{
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	uint16_t elts_free = txq->elts_tail;
-	uint16_t elts_tail;
-	struct rte_mbuf *m, *free[elts_n];
-	struct rte_mempool *pool = NULL;
-	unsigned int blk_n = 0;
-
-	elts_tail = mlx5_tx_cqe_handle(txq);
-	assert((elts_tail & elts_m) < (1 << txq->wqe_n));
-	/* Free buffers. */
-	while (elts_free != elts_tail) {
-		m = rte_pktmbuf_prefree_seg((*txq->elts)[elts_free++ & elts_m]);
-		if (likely(m != NULL)) {
-			if (likely(m->pool == pool)) {
-				free[blk_n++] = m;
-			} else {
-				if (likely(pool != NULL))
-					rte_mempool_put_bulk(pool,
-							     (void *)free,
-							     blk_n);
-				free[0] = m;
-				pool = m->pool;
-				blk_n = 1;
-			}
-		}
-	}
-	if (blk_n)
-		rte_mempool_put_bulk(pool, (void *)free, blk_n);
-#ifndef NDEBUG
-	elts_free = txq->elts_tail;
-	/* Poisoning. */
-	while (elts_free != elts_tail) {
-		memset(&(*txq->elts)[elts_free & elts_m],
-		       0x66,
-		       sizeof((*txq->elts)[elts_free & elts_m]));
-		++elts_free;
-	}
-#endif
-	txq->elts_tail = elts_tail;
-}
-
-/**
  * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which the
  * cloned mbuf is allocated is returned instead.
  *
@@ -710,147 +580,4 @@ enum mlx5_cqe_status {
 	mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
 }
 
-/**
- * Convert mbuf to Verb SWP.
- *
- * @param txq_data
- *   Pointer to the Tx queue.
- * @param buf
- *   Pointer to the mbuf.
- * @param offsets
- *   Pointer to the SWP header offsets.
- * @param swp_types
- *   Pointer to the SWP header types.
- */
-static __rte_always_inline void
-txq_mbuf_to_swp(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
-		uint8_t *offsets, uint8_t *swp_types)
-{
-	const uint64_t vlan = buf->ol_flags & PKT_TX_VLAN_PKT;
-	const uint64_t tunnel = buf->ol_flags & PKT_TX_TUNNEL_MASK;
-	const uint64_t tso = buf->ol_flags & PKT_TX_TCP_SEG;
-	const uint64_t csum_flags = buf->ol_flags & PKT_TX_L4_MASK;
-	const uint64_t inner_ip =
-		buf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6);
-	const uint64_t ol_flags_mask = PKT_TX_L4_MASK | PKT_TX_IPV6 |
-				       PKT_TX_OUTER_IPV6;
-	uint16_t idx;
-	uint16_t off;
-
-	if (likely(!txq->swp_en || (tunnel != PKT_TX_TUNNEL_UDP &&
-				    tunnel != PKT_TX_TUNNEL_IP)))
-		return;
-	/*
-	 * The index should have:
-	 * bit[0:1] = PKT_TX_L4_MASK
-	 * bit[4] = PKT_TX_IPV6
-	 * bit[8] = PKT_TX_OUTER_IPV6
-	 * bit[9] = PKT_TX_OUTER_UDP
-	 */
-	idx = (buf->ol_flags & ol_flags_mask) >> 52;
-	if (tunnel == PKT_TX_TUNNEL_UDP)
-		idx |= 1 << 9;
-	*swp_types = mlx5_swp_types_table[idx];
-	/*
-	 * Set offsets for SW parser. Since ConnectX-5, SW parser just
-	 * complements HW parser. SW parser starts to engage only if HW parser
-	 * can't reach a header. For the older devices, HW parser will not kick
-	 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
-	 * should be set regardless of HW offload.
-	 */
-	off = buf->outer_l2_len + (vlan ? sizeof(struct rte_vlan_hdr) : 0);
-	offsets[1] = off >> 1; /* Outer L3 offset. */
-	off += buf->outer_l3_len;
-	if (tunnel == PKT_TX_TUNNEL_UDP)
-		offsets[0] = off >> 1; /* Outer L4 offset. */
-	if (inner_ip) {
-		off += buf->l2_len;
-		offsets[3] = off >> 1; /* Inner L3 offset. */
-		if (csum_flags == PKT_TX_TCP_CKSUM || tso ||
-		    csum_flags == PKT_TX_UDP_CKSUM) {
-			off += buf->l3_len;
-			offsets[2] = off >> 1; /* Inner L4 offset. */
-		}
-	}
-}
-
-/**
- * Convert the Checksum offloads to Verbs.
- *
- * @param buf
- *   Pointer to the mbuf.
- *
- * @return
- *   Converted checksum flags.
- */
-static __rte_always_inline uint8_t
-txq_ol_cksum_to_cs(struct rte_mbuf *buf)
-{
-	uint32_t idx;
-	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
-	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
-				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
-
-	/*
-	 * The index should have:
-	 * bit[0] = PKT_TX_TCP_SEG
-	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
-	 * bit[4] = PKT_TX_IP_CKSUM
-	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
-	 * bit[9] = tunnel
-	 */
-	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
-	return mlx5_cksum_table[idx];
-}
-
-/**
- * Count the number of contiguous single segment packets.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- *
- * @return
- *   Number of contiguous single segment packets.
- */
-static __rte_always_inline unsigned int
-txq_count_contig_single_seg(struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	unsigned int pos;
-
-	if (!pkts_n)
-		return 0;
-	/* Count the number of contiguous single segment packets. */
-	for (pos = 0; pos < pkts_n; ++pos)
-		if (NB_SEGS(pkts[pos]) > 1)
-			break;
-	return pos;
-}
-
-/**
- * Count the number of contiguous multi-segment packets.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- *
- * @return
- *   Number of contiguous multi-segment packets.
- */
-static __rte_always_inline unsigned int
-txq_count_contig_multi_seg(struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	unsigned int pos;
-
-	if (!pkts_n)
-		return 0;
-	/* Count the number of contiguous multi-segment packets. */
-	for (pos = 0; pos < pkts_n; ++pos)
-		if (NB_SEGS(pkts[pos]) == 1)
-			break;
-	return pos;
-}
-
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index 073044f..f6ec828 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -40,138 +40,6 @@
 #endif
 
 /**
- * Count the number of packets having same ol_flags and same metadata (if
- * PKT_TX_METADATA is set in ol_flags), and calculate cs_flags.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- * @param cs_flags
- *   Pointer of flags to be returned.
- * @param metadata
- *   Pointer of metadata to be returned.
- * @param txq_offloads
- *   Offloads enabled on Tx queue
- *
- * @return
- *   Number of packets having same ol_flags and metadata, if relevant.
- */
-static inline unsigned int
-txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags,
-		 rte_be32_t *metadata, const uint64_t txq_offloads)
-{
-	unsigned int pos;
-	const uint64_t cksum_ol_mask =
-		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
-		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
-		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
-	rte_be32_t p0_metadata, pn_metadata;
-
-	if (!pkts_n)
-		return 0;
-	p0_metadata = pkts[0]->ol_flags & PKT_TX_METADATA ?
-			pkts[0]->tx_metadata : 0;
-	/* Count the number of packets having same offload parameters. */
-	for (pos = 1; pos < pkts_n; ++pos) {
-		/* Check if packet has same checksum flags. */
-		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) &&
-		    ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & cksum_ol_mask))
-			break;
-		/* Check if packet has same metadata. */
-		if (txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) {
-			pn_metadata = pkts[pos]->ol_flags & PKT_TX_METADATA ?
-					pkts[pos]->tx_metadata : 0;
-			if (pn_metadata != p0_metadata)
-				break;
-		}
-	}
-	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
-	*metadata = p0_metadata;
-	return pos;
-}
-
-/**
- * DPDK callback for vectorized TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-		      uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint16_t n;
-		uint16_t ret;
-
-		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0, 0);
-		nb_tx += ret;
-		if (!ret)
-			break;
-	}
-	return nb_tx;
-}
-
-/**
- * DPDK callback for vectorized TX with multi-seg packets and offload.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint8_t cs_flags = 0;
-		uint16_t n;
-		uint16_t ret;
-		rte_be32_t metadata = 0;
-
-		/* Transmit multi-seg packets in the head of pkts list. */
-		if ((txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS) &&
-		    NB_SEGS(pkts[nb_tx]) > 1)
-			nb_tx += txq_scatter_v(txq,
-					       &pkts[nb_tx],
-					       pkts_n - nb_tx);
-		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-		if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
-			n = txq_count_contig_single_seg(&pkts[nb_tx], n);
-		if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP |
-				     DEV_TX_OFFLOAD_MATCH_METADATA))
-			n = txq_calc_offload(&pkts[nb_tx], n,
-					     &cs_flags, &metadata,
-					     txq->offloads);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags, metadata);
-		nb_tx += ret;
-		if (!ret)
-			break;
-	}
-	return nb_tx;
-}
-
-/**
  * Skip error packets.
  *
  * @param rxq
@@ -243,49 +111,6 @@
 }
 
 /**
- * Check Tx queue flags are set for raw vectorized Tx.
- *
- * @param dev
- *   Pointer to Ethernet device.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev)
-{
-	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
-
-	/* Doesn't support any offload. */
-	if (offloads)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
- * Check a device can support vectorized TX.
- *
- * @param dev
- *   Pointer to Ethernet device.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-mlx5_check_vec_tx_support(struct rte_eth_dev *dev)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
-
-	if (!priv->config.tx_vec_en ||
-	    priv->txqs_n > (unsigned int)priv->config.txqs_vec ||
-	    priv->config.mps != MLX5_MPW_ENHANCED ||
-	    offloads & ~MLX5_VEC_TX_OFFLOAD_CAP)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
  * Check a RX queue can support vectorized RX.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 1c7e3b4..9930286 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -27,295 +27,6 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 
 /**
- * Fill in buffer descriptors in a multi-packet send descriptor.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param dseg
- *   Pointer to buffer descriptor to be written.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param n
- *   Number of packets to be filled.
- */
-static inline void
-txq_wr_dseg_v(struct mlx5_txq_data *txq, uint8_t *dseg,
-	      struct rte_mbuf **pkts, unsigned int n)
-{
-	unsigned int pos;
-	uintptr_t addr;
-	const uint8x16_t dseg_shuf_m = {
-		 3,  2,  1,  0, /* length, bswap32 */
-		 4,  5,  6,  7, /* lkey */
-		15, 14, 13, 12, /* addr, bswap64 */
-		11, 10,  9,  8
-	};
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	uint32_t tx_byte = 0;
-#endif
-
-	for (pos = 0; pos < n; ++pos, dseg += MLX5_WQE_DWORD_SIZE) {
-		uint8x16_t desc;
-		struct rte_mbuf *pkt = pkts[pos];
-
-		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
-		desc = vreinterpretq_u8_u32((uint32x4_t) {
-				DATA_LEN(pkt),
-				mlx5_tx_mb2mr(txq, pkt),
-				addr,
-				addr >> 32 });
-		desc = vqtbl1q_u8(desc, dseg_shuf_m);
-		vst1q_u8(dseg, desc);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		tx_byte += DATA_LEN(pkt);
-#endif
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.obytes += tx_byte;
-#endif
-}
-
-/**
- * Send multi-segmented packets until it encounters a single segment packet in
- * the pkts list.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	      uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n;
-	volatile struct mlx5_wqe *wqe = NULL;
-	bool metadata_ol =
-		txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? true : false;
-
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	if (unlikely(!pkts_n))
-		return 0;
-	for (n = 0; n < pkts_n; ++n) {
-		struct rte_mbuf *buf = pkts[n];
-		unsigned int segs_n = buf->nb_segs;
-		unsigned int ds = nb_dword_in_hdr;
-		unsigned int len = PKT_LEN(buf);
-		uint16_t wqe_ci = txq->wqe_ci;
-		const uint8x16_t ctrl_shuf_m = {
-			3,  2,  1,  0, /* bswap32 */
-			7,  6,  5,  4, /* bswap32 */
-			11, 10,  9,  8, /* bswap32 */
-			12, 13, 14, 15
-		};
-		uint8_t cs_flags;
-		uint16_t max_elts;
-		uint16_t max_wqe;
-		uint8x16_t *t_wqe;
-		uint8_t *dseg;
-		uint8x16_t ctrl;
-		rte_be32_t metadata =
-			metadata_ol && (buf->ol_flags & PKT_TX_METADATA) ?
-			buf->tx_metadata : 0;
-
-		assert(segs_n);
-		max_elts = elts_n - (elts_head - txq->elts_tail);
-		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
-		/*
-		 * A MPW session consumes 2 WQEs at most to
-		 * include MLX5_MPW_DSEG_MAX pointers.
-		 */
-		if (segs_n == 1 ||
-		    max_elts < segs_n || max_wqe < 2)
-			break;
-		wqe = &((volatile struct mlx5_wqe64 *)
-			 txq->wqes)[wqe_ci & wq_mask].hdr;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Title WQEBB pointer. */
-		t_wqe = (uint8x16_t *)wqe;
-		dseg = (uint8_t *)(wqe + 1);
-		do {
-			if (!(ds++ % nb_dword_per_wqebb)) {
-				dseg = (uint8_t *)
-					&((volatile struct mlx5_wqe64 *)
-					   txq->wqes)[++wqe_ci & wq_mask];
-			}
-			txq_wr_dseg_v(txq, dseg, &buf, 1);
-			dseg += MLX5_WQE_DWORD_SIZE;
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			buf = buf->next;
-		} while (--segs_n);
-		++wqe_ci;
-		/* Fill CTRL in the header. */
-		ctrl = vreinterpretq_u8_u32((uint32x4_t) {
-				MLX5_OPC_MOD_MPW << 24 |
-				txq->wqe_ci << 8 | MLX5_OPCODE_TSO,
-				txq->qp_num_8s | ds, 4, 0});
-		ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
-		vst1q_u8((void *)t_wqe, ctrl);
-		/* Fill ESEG in the header. */
-		vst1q_u32((void *)(t_wqe + 1),
-			  ((uint32x4_t){ 0,
-					 rte_cpu_to_be_16(len) << 16 | cs_flags,
-					 metadata, 0 }));
-		txq->wqe_ci = wqe_ci;
-	}
-	if (!n)
-		return 0;
-	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
-	txq->elts_head = elts_head;
-	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		wqe->ctrl[3] = txq->elts_head;
-		txq->elts_comp = 0;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += n;
-#endif
-	mlx5_tx_dbrec(txq, wqe);
-	return n;
-}
-
-/**
- * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
- * it returns to make it processed by txq_scatter_v(). All the packets in
- * the pkts list should be single segment packets having same offload flags.
- * This must be checked by txq_count_contig_single_seg() and txq_calc_offload().
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
- * @param cs_flags
- *   Checksum offload flags to be written in the descriptor.
- * @param metadata
- *   Metadata value to be written in the descriptor.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint8_t cs_flags, rte_be32_t metadata)
-{
-	struct rte_mbuf **elts;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n = 0;
-	unsigned int pos;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	uint32_t comp_req;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	uint16_t wq_idx = txq->wqe_ci & wq_mask;
-	volatile struct mlx5_wqe64 *wq =
-		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
-	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
-	const uint8x16_t ctrl_shuf_m = {
-		 3,  2,  1,  0, /* bswap32 */
-		 7,  6,  5,  4, /* bswap32 */
-		11, 10,  9,  8, /* bswap32 */
-		12, 13, 14, 15
-	};
-	uint8x16_t *t_wqe;
-	uint8_t *dseg;
-	uint8x16_t ctrl;
-
-	/* Make sure all packets can fit into a single WQE. */
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
-	if (unlikely(!pkts_n))
-		return 0;
-	elts = &(*txq->elts)[elts_head & elts_m];
-	/* Loop for available tailroom first. */
-	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
-	for (pos = 0; pos < (n & -2); pos += 2)
-		vst1q_u64((void *)&elts[pos], vld1q_u64((void *)&pkts[pos]));
-	if (n & 1)
-		elts[pos] = pkts[pos];
-	/* Check if it crosses the end of the queue. */
-	if (unlikely(n < pkts_n)) {
-		elts = &(*txq->elts)[0];
-		for (pos = 0; pos < pkts_n - n; ++pos)
-			elts[pos] = pkts[n + pos];
-	}
-	txq->elts_head += pkts_n;
-	/* Save title WQEBB pointer. */
-	t_wqe = (uint8x16_t *)wqe;
-	dseg = (uint8_t *)(wqe + 1);
-	/* Calculate the number of entries to the end. */
-	n = RTE_MIN(
-		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
-		pkts_n);
-	/* Fill DSEGs. */
-	txq_wr_dseg_v(txq, dseg, pkts, n);
-	/* Check if it crosses the end of the queue. */
-	if (n < pkts_n) {
-		dseg = (uint8_t *)txq->wqes;
-		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
-	}
-	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
-		txq->elts_comp += pkts_n;
-		comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET;
-	} else {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request a completion. */
-		txq->elts_comp = 0;
-		comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET;
-	}
-	/* Fill CTRL in the header. */
-	ctrl = vreinterpretq_u8_u32((uint32x4_t) {
-			MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
-			txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW,
-			txq->qp_num_8s | (pkts_n + 2),
-			comp_req,
-			txq->elts_head });
-	ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
-	vst1q_u8((void *)t_wqe, ctrl);
-	/* Fill ESEG in the header. */
-	vst1q_u32((void *)(t_wqe + 1),
-		 ((uint32x4_t) { 0, cs_flags, metadata, 0 }));
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += pkts_n;
-#endif
-	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
-		       nb_dword_per_wqebb;
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
-	return pkts_n;
-}
-
-/**
  * Store free buffers to RX SW ring.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 503ca0f..7bd254f 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -29,290 +29,6 @@
 #endif
 
 /**
- * Fill in buffer descriptors in a multi-packet send descriptor.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param dseg
- *   Pointer to buffer descriptor to be written.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param n
- *   Number of packets to be filled.
- */
-static inline void
-txq_wr_dseg_v(struct mlx5_txq_data *txq, __m128i *dseg,
-	      struct rte_mbuf **pkts, unsigned int n)
-{
-	unsigned int pos;
-	uintptr_t addr;
-	const __m128i shuf_mask_dseg =
-		_mm_set_epi8(8,  9, 10, 11, /* addr, bswap64 */
-			    12, 13, 14, 15,
-			     7,  6,  5,  4, /* lkey */
-			     0,  1,  2,  3  /* length, bswap32 */);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	uint32_t tx_byte = 0;
-#endif
-
-	for (pos = 0; pos < n; ++pos, ++dseg) {
-		__m128i desc;
-		struct rte_mbuf *pkt = pkts[pos];
-
-		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
-		desc = _mm_set_epi32(addr >> 32,
-				     addr,
-				     mlx5_tx_mb2mr(txq, pkt),
-				     DATA_LEN(pkt));
-		desc = _mm_shuffle_epi8(desc, shuf_mask_dseg);
-		_mm_store_si128(dseg, desc);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		tx_byte += DATA_LEN(pkt);
-#endif
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.obytes += tx_byte;
-#endif
-}
-
-/**
- * Send multi-segmented packets until it encounters a single segment packet in
- * the pkts list.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	      uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n;
-	volatile struct mlx5_wqe *wqe = NULL;
-	bool metadata_ol =
-		txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? true : false;
-
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	if (unlikely(!pkts_n))
-		return 0;
-	for (n = 0; n < pkts_n; ++n) {
-		struct rte_mbuf *buf = pkts[n];
-		unsigned int segs_n = buf->nb_segs;
-		unsigned int ds = nb_dword_in_hdr;
-		unsigned int len = PKT_LEN(buf);
-		uint16_t wqe_ci = txq->wqe_ci;
-		const __m128i shuf_mask_ctrl =
-			_mm_set_epi8(15, 14, 13, 12,
-				      8,  9, 10, 11, /* bswap32 */
-				      4,  5,  6,  7, /* bswap32 */
-				      0,  1,  2,  3  /* bswap32 */);
-		uint8_t cs_flags;
-		uint16_t max_elts;
-		uint16_t max_wqe;
-		__m128i *t_wqe, *dseg;
-		__m128i ctrl;
-		rte_be32_t metadata =
-			metadata_ol && (buf->ol_flags & PKT_TX_METADATA) ?
-			buf->tx_metadata : 0;
-
-		assert(segs_n);
-		max_elts = elts_n - (elts_head - txq->elts_tail);
-		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
-		/*
-		 * A MPW session consumes 2 WQEs at most to
-		 * include MLX5_MPW_DSEG_MAX pointers.
-		 */
-		if (segs_n == 1 ||
-		    max_elts < segs_n || max_wqe < 2)
-			break;
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		wqe = &((volatile struct mlx5_wqe64 *)
-			 txq->wqes)[wqe_ci & wq_mask].hdr;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Title WQEBB pointer. */
-		t_wqe = (__m128i *)wqe;
-		dseg = (__m128i *)(wqe + 1);
-		do {
-			if (!(ds++ % nb_dword_per_wqebb)) {
-				dseg = (__m128i *)
-					&((volatile struct mlx5_wqe64 *)
-					   txq->wqes)[++wqe_ci & wq_mask];
-			}
-			txq_wr_dseg_v(txq, dseg++, &buf, 1);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			buf = buf->next;
-		} while (--segs_n);
-		++wqe_ci;
-		/* Fill CTRL in the header. */
-		ctrl = _mm_set_epi32(0, 4, txq->qp_num_8s | ds,
-				     MLX5_OPC_MOD_MPW << 24 |
-				     txq->wqe_ci << 8 | MLX5_OPCODE_TSO);
-		ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
-		_mm_store_si128(t_wqe, ctrl);
-		/* Fill ESEG in the header. */
-		_mm_store_si128(t_wqe + 1,
-				_mm_set_epi32(0, metadata,
-					      (rte_cpu_to_be_16(len) << 16) |
-					      cs_flags, 0));
-		txq->wqe_ci = wqe_ci;
-	}
-	if (!n)
-		return 0;
-	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
-	txq->elts_head = elts_head;
-	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		wqe->ctrl[3] = txq->elts_head;
-		txq->elts_comp = 0;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += n;
-#endif
-	mlx5_tx_dbrec(txq, wqe);
-	return n;
-}
-
-/**
- * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
- * it returns to make it processed by txq_scatter_v(). All the packets in
- * the pkts list should be single segment packets having same offload flags.
- * This must be checked by txq_count_contig_single_seg() and txq_calc_offload().
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
- * @param cs_flags
- *   Checksum offload flags to be written in the descriptor.
- * @param metadata
- *   Metadata value to be written in the descriptor.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint8_t cs_flags, rte_be32_t metadata)
-{
-	struct rte_mbuf **elts;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n = 0;
-	unsigned int pos;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	uint32_t comp_req;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	uint16_t wq_idx = txq->wqe_ci & wq_mask;
-	volatile struct mlx5_wqe64 *wq =
-		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
-	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
-	const __m128i shuf_mask_ctrl =
-		_mm_set_epi8(15, 14, 13, 12,
-			      8,  9, 10, 11, /* bswap32 */
-			      4,  5,  6,  7, /* bswap32 */
-			      0,  1,  2,  3  /* bswap32 */);
-	__m128i *t_wqe, *dseg;
-	__m128i ctrl;
-
-	/* Make sure all packets can fit into a single WQE. */
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
-	assert(pkts_n <= MLX5_DSEG_MAX - nb_dword_in_hdr);
-	if (unlikely(!pkts_n))
-		return 0;
-	elts = &(*txq->elts)[elts_head & elts_m];
-	/* Loop for available tailroom first. */
-	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
-	for (pos = 0; pos < (n & -2); pos += 2)
-		_mm_storeu_si128((__m128i *)&elts[pos],
-				 _mm_loadu_si128((__m128i *)&pkts[pos]));
-	if (n & 1)
-		elts[pos] = pkts[pos];
-	/* Check if it crosses the end of the queue. */
-	if (unlikely(n < pkts_n)) {
-		elts = &(*txq->elts)[0];
-		for (pos = 0; pos < pkts_n - n; ++pos)
-			elts[pos] = pkts[n + pos];
-	}
-	txq->elts_head += pkts_n;
-	/* Save title WQEBB pointer. */
-	t_wqe = (__m128i *)wqe;
-	dseg = (__m128i *)(wqe + 1);
-	/* Calculate the number of entries to the end. */
-	n = RTE_MIN(
-		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
-		pkts_n);
-	/* Fill DSEGs. */
-	txq_wr_dseg_v(txq, dseg, pkts, n);
-	/* Check if it crosses the end of the queue. */
-	if (n < pkts_n) {
-		dseg = (__m128i *)txq->wqes;
-		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
-	}
-	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
-		txq->elts_comp += pkts_n;
-		comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET;
-	} else {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request a completion. */
-		txq->elts_comp = 0;
-		comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET;
-	}
-	/* Fill CTRL in the header. */
-	ctrl = _mm_set_epi32(txq->elts_head, comp_req,
-			     txq->qp_num_8s | (pkts_n + 2),
-			     MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
-				txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW);
-	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
-	_mm_store_si128(t_wqe, ctrl);
-	/* Fill ESEG in the header. */
-	_mm_store_si128(t_wqe + 1, _mm_set_epi32(0, metadata, cs_flags, 0));
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += pkts_n;
-#endif
-	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
-		       nb_dword_per_wqebb;
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
-	return pkts_n;
-}
-
-/**
  * Store free buffers to RX SW ring.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 7741095..0d2dbfa 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -365,25 +365,6 @@
 }
 
 /**
- * Check if the burst function is using eMPW.
- *
- * @param tx_pkt_burst
- *   Tx burst function pointer.
- *
- * @return
- *   1 if the burst function is using eMPW, 0 otherwise.
- */
-static int
-is_empw_burst_func(eth_tx_burst_t tx_pkt_burst)
-{
-	if (tx_pkt_burst == mlx5_tx_burst_raw_vec ||
-	    tx_pkt_burst == mlx5_tx_burst_vec ||
-	    tx_pkt_burst == mlx5_tx_burst_empw)
-		return 1;
-	return 0;
-}
-
-/**
  * Create the Tx queue Verbs object.
  *
  * @param dev
@@ -414,7 +395,6 @@ struct mlx5_txq_ibv *
 	struct mlx5dv_cq cq_info;
 	struct mlx5dv_obj obj;
 	const int desc = 1 << txq_data->elts_n;
-	eth_tx_burst_t tx_pkt_burst = mlx5_select_tx_function(dev);
 	int ret = 0;
 
 	assert(txq_data);
@@ -432,8 +412,6 @@ struct mlx5_txq_ibv *
 		.comp_mask = 0,
 	};
 	cqe_n = desc / MLX5_TX_COMP_THRESH + 1;
-	if (is_empw_burst_func(tx_pkt_burst))
-		cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV;
 	tmpl.cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0);
 	if (tmpl.cq == NULL) {
 		DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure",
@@ -698,93 +676,7 @@ struct mlx5_txq_ibv *
 static void
 txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
 {
-	struct mlx5_priv *priv = txq_ctrl->priv;
-	struct mlx5_dev_config *config = &priv->config;
-	const unsigned int max_tso_inline =
-		((MLX5_MAX_TSO_HEADER + (RTE_CACHE_LINE_SIZE - 1)) /
-		 RTE_CACHE_LINE_SIZE);
-	unsigned int txq_inline;
-	unsigned int txqs_inline;
-	unsigned int inline_max_packet_sz;
-	eth_tx_burst_t tx_pkt_burst =
-		mlx5_select_tx_function(ETH_DEV(priv));
-	int is_empw_func = is_empw_burst_func(tx_pkt_burst);
-	int tso = !!(txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO |
-					       DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
-					       DEV_TX_OFFLOAD_GRE_TNL_TSO |
-					       DEV_TX_OFFLOAD_IP_TNL_TSO |
-					       DEV_TX_OFFLOAD_UDP_TNL_TSO));
-
-	txq_inline = (config->txq_inline == MLX5_ARG_UNSET) ?
-		0 : config->txq_inline;
-	txqs_inline = (config->txqs_inline == MLX5_ARG_UNSET) ?
-		0 : config->txqs_inline;
-	inline_max_packet_sz =
-		(config->inline_max_packet_sz == MLX5_ARG_UNSET) ?
-		0 : config->inline_max_packet_sz;
-	if (is_empw_func) {
-		if (config->txq_inline == MLX5_ARG_UNSET)
-			txq_inline = MLX5_WQE_SIZE_MAX - MLX5_WQE_SIZE;
-		if (config->txqs_inline == MLX5_ARG_UNSET)
-			txqs_inline = MLX5_EMPW_MIN_TXQS;
-		if (config->inline_max_packet_sz == MLX5_ARG_UNSET)
-			inline_max_packet_sz = MLX5_EMPW_MAX_INLINE_LEN;
-		txq_ctrl->txq.mpw_hdr_dseg = config->mpw_hdr_dseg;
-		txq_ctrl->txq.inline_max_packet_sz = inline_max_packet_sz;
-	}
-	if (txq_inline && priv->txqs_n >= txqs_inline) {
-		unsigned int ds_cnt;
-
-		txq_ctrl->txq.max_inline =
-			((txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
-			 RTE_CACHE_LINE_SIZE);
-		if (is_empw_func) {
-			/* To minimize the size of data set, avoid requesting
-			 * too large WQ.
-			 */
-			txq_ctrl->max_inline_data =
-				((RTE_MIN(txq_inline,
-					  inline_max_packet_sz) +
-				  (RTE_CACHE_LINE_SIZE - 1)) /
-				 RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
-		} else {
-			txq_ctrl->max_inline_data =
-				txq_ctrl->txq.max_inline * RTE_CACHE_LINE_SIZE;
-		}
-		/*
-		 * Check if the inline size is too large in a way which
-		 * can make the WQE DS to overflow.
-		 * Considering in calculation:
-		 *      WQE CTRL (1 DS)
-		 *      WQE ETH  (1 DS)
-		 *      Inline part (N DS)
-		 */
-		ds_cnt = 2 + (txq_ctrl->txq.max_inline / MLX5_WQE_DWORD_SIZE);
-		if (ds_cnt > MLX5_DSEG_MAX) {
-			unsigned int max_inline = (MLX5_DSEG_MAX - 2) *
-						  MLX5_WQE_DWORD_SIZE;
-
-			max_inline = max_inline - (max_inline %
-						   RTE_CACHE_LINE_SIZE);
-			DRV_LOG(WARNING,
-				"port %u txq inline is too large (%d) setting"
-				" it to the maximum possible: %d\n",
-				PORT_ID(priv), txq_inline, max_inline);
-			txq_ctrl->txq.max_inline = max_inline /
-						   RTE_CACHE_LINE_SIZE;
-		}
-	}
-	if (tso) {
-		txq_ctrl->max_tso_header = max_tso_inline * RTE_CACHE_LINE_SIZE;
-		txq_ctrl->txq.max_inline = RTE_MAX(txq_ctrl->txq.max_inline,
-						   max_tso_inline);
-		txq_ctrl->txq.tso_en = 1;
-	}
-	txq_ctrl->txq.tunnel_en = config->tunnel_en | config->swp;
-	txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO |
-				 DEV_TX_OFFLOAD_UDP_TNL_TSO |
-				 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) &
-				txq_ctrl->txq.offloads) && config->swp;
+	(void)txq_ctrl;
 }
 
 /**
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v2 2/7] net/mlx5: add Tx datapath related devargs
  2019-07-15 13:59   ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 1/7] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
@ 2019-07-15 13:59     ` Viacheslav Ovsiienko
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 3/7] net/mlx5: update Tx datapath definitions Viacheslav Ovsiienko
                       ` (5 subsequent siblings)
  7 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-15 13:59 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch introduces new mlx5 PMD devarg options:

- txq_inline_min - specifies minimal amount of data to be inlined into
  WQE during Tx operations. NICs may require this minimal data amount
  to operate correctly. The exact value may depend on NIC operation mode,
  requested offloads, etc.

- txq_inline_max - specifies the maximal packet length to be completely
  inlined into WQE Ethernet Segment for ordinary SEND method. If the packet
  is larger than the specified value, the packet data won't be copied by the
  driver at all; the data buffer is addressed with a pointer. If the packet
  length is less or equal, all packet data will be copied into WQE.

- txq_inline_mpw - specifies the maximal packet length to be completely
  inlined into WQE for Enhanced MPW method.

Driver documentation is also updated.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 doc/guides/nics/mlx5.rst | 168 +++++++++++++++++++++++++----------------------
 drivers/net/mlx5/mlx5.c  |  21 ++++++
 drivers/net/mlx5/mlx5.h  |   4 ++
 3 files changed, 113 insertions(+), 80 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 16aa390..e3c2a11 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -348,99 +348,107 @@ Run-time configuration
   greater or equal to this value. The default value is 12, valid only if
   ``mprq_en`` is set.
 
-- ``txq_inline`` parameter [int]
-
-  Amount of data to be inlined during TX operations. Improves latency.
-  Can improve PPS performance when PCI back pressure is detected and may be
-  useful for scenarios involving heavy traffic on many queues.
-
-  Because additional software logic is necessary to handle this mode, this
-  option should be used with care, as it can lower performance when back
-  pressure is not expected.
+- ``txq_inline_min`` parameter [int]
+
+  Minimal amount of data to be inlined into WQE during Tx operations. NICs
+  may require this minimal data amount to operate correctly. The exact value
+  may depend on NIC operation mode, requested offloads, etc.
+
+  If ``txq_inline_min`` key is present the specified value (may be aligned
+  by the driver in order not to exceed the limits and provide better descriptor
+  space utilization) will be used by the driver and it is guaranteed the
+  requested data bytes are inlined into the WQE beside other inline settings.
+
+  If ``txq_inline_min`` key is not present, the value may be queried by the
+  driver from the NIC via DevX if this feature is available. If there is no DevX
+  enabled/supported the value 18 (supposing L2 header including VLAN) is set
+  for ConnectX-4, value 58 (supposing L2-L4 headers, required by configurations
+  over E-Switch) is set for ConnectX-4 Lx, and 0 is set by default for ConnectX-5
+  and newer NICs. If packet is shorter than the ``txq_inline_min`` value, the
+  entire packet is inlined.
+
+  Please note that this minimal data inlining disengages the eMPW feature (Enhanced
+  Multi-Packet Write), because the latter does not support partial packet inlining.
+  This is not very critical, because minimal data inlining is mostly required
+  by ConnectX-4 and ConnectX-4 Lx, and these NICs do not support the eMPW feature.
 
 - ``txqs_min_inline`` parameter [int]
 
-  Enable inline send only when the number of TX queues is greater or equal
+  Enable inline data send only when the number of TX queues is greater or equal
   to this value.
 
-  This option should be used in combination with ``txq_inline`` above.
-
-  On ConnectX-4, ConnectX-4 LX, ConnectX-5, ConnectX-6 and BlueField without
-  Enhanced MPW:
-
-        - Disabled by default.
-        - In case ``txq_inline`` is set recommendation is 4.
+  This option should be used in combination with ``txq_inline_max`` and
+  ``txq_inline_mpw`` below and does not affect ``txq_inline_min`` settings above.
 
-  On ConnectX-5, ConnectX-6 and BlueField with Enhanced MPW:
+  If this option is not specified the default value 8 is used.
 
-        - Set to 8 by default.
+  The data inlining consumes the CPU cycles, so this option is intended to
+  auto enable inline data if we have enough Tx queues, which means we have
+  enough CPU cores and PCI bandwidth is getting more critical and CPU
+  is not supposed to be bottleneck anymore.
 
-- ``txqs_max_vec`` parameter [int]
+  The copying data into WQE improves latency and can improve PPS performance
+  when PCI back pressure is detected and may be useful for scenarios involving
+  heavy traffic on many queues.
 
-  Enable vectorized Tx only when the number of TX queues is less than or
-  equal to this value. Effective only when ``tx_vec_en`` is enabled.
-
-  On ConnectX-5:
-
-        - Set to 8 by default on ARMv8.
-        - Set to 4 by default otherwise.
-
-  On BlueField
+  Because additional software logic is necessary to handle this mode, this
+  option should be used with care, as it may lower performance when back
+  pressure is not expected.
 
-        - Set to 16 by default.
+- ``txq_inline_max`` parameter [int]
+
+  Specifies the maximal packet length to be completely inlined into WQE
+  Ethernet Segment for ordinary SEND method. If packet is larger than specified
+  value, the packet data won't be copied by the driver at all, data buffer
+  is addressed with a pointer. If packet length is less or equal all packet
+  data will be copied into WQE. This may improve PCI bandwidth utilization for
+  short packets significantly but requires the extra CPU cycles.
+
+  The data inline feature is controlled by the number of Tx queues: if the number
+  of Tx queues is greater than or equal to the ``txqs_min_inline`` key parameter,
+  the inline feature is engaged; if there are not enough Tx queues (which means
+  CPU cores and CPU resources are scarce), data inline is not performed by the driver.
+  Assigning ``txqs_min_inline`` with zero always enables the data inline.
+
+  The default ``txq_inline_max`` value is 290. The specified value may be adjusted
+  by the driver in order not to exceed the limit (930 bytes) and to provide better
+  WQE space filling without gaps, the adjustment is reflected in the debug log.
+
+- ``txq_inline_mpw`` parameter [int]
+
+  Specifies the maximal packet length to be completely inlined into WQE for
+  Enhanced MPW method. If packet is larger than the specified value, the packet
+  data won't be copied, and data buffer is addressed with pointer. If packet length
+  is less or equal, all packet data will be copied into WQE. This may improve PCI
+  bandwidth utilization for short packets significantly but requires the extra
+  CPU cycles.
+
+  The data inline feature is controlled by the number of Tx queues: if the number
+  of Tx queues is greater than or equal to the ``txqs_min_inline`` key parameter,
+  the inline feature is engaged; if there are not enough Tx queues (which means
+  CPU cores and CPU resources are scarce), data inline is not performed by the driver.
+  Assigning ``txqs_min_inline`` with zero always enables the data inline.
+
+  The default ``txq_inline_mpw`` value is 188. The specified value may be adjusted
+  by the driver in order not to exceed the limit (930 bytes) and to provide better
+  WQE space filling without gaps, the adjustment is reflected in the debug log.
+  Because multiple packets may be included in the same WQE with the Enhanced
+  Multi-Packet Write method and the overall WQE size is limited, it is not
+  recommended to specify large values for ``txq_inline_mpw``.
 
 - ``txq_mpw_en`` parameter [int]
 
-  A nonzero value enables multi-packet send (MPS) for ConnectX-4 Lx and
-  enhanced multi-packet send (Enhanced MPS) for ConnectX-5, ConnectX-6 and BlueField.
-  MPS allows the TX burst function to pack up multiple packets in a
-  single descriptor session in order to save PCI bandwidth and improve
-  performance at the cost of a slightly higher CPU usage. When
-  ``txq_inline`` is set along with ``txq_mpw_en``, TX burst function tries
-  to copy entire packet data on to TX descriptor instead of including
-  pointer of packet only if there is enough room remained in the
-  descriptor. ``txq_inline`` sets per-descriptor space for either pointers
-  or inlined packets. In addition, Enhanced MPS supports hybrid mode -
-  mixing inlined packets and pointers in the same descriptor.
-
-  This option cannot be used with certain offloads such as ``DEV_TX_OFFLOAD_TCP_TSO,
-  DEV_TX_OFFLOAD_VXLAN_TNL_TSO, DEV_TX_OFFLOAD_GRE_TNL_TSO, DEV_TX_OFFLOAD_VLAN_INSERT``.
-  When those offloads are requested the MPS send function will not be used.
-
-  It is currently only supported on the ConnectX-4 Lx, ConnectX-5, ConnectX-6 and BlueField
-  families of adapters.
-  On ConnectX-4 Lx the MPW is considered un-secure hence disabled by default.
-  Users which enable the MPW should be aware that application which provides incorrect
-  mbuf descriptors in the Tx burst can lead to serious errors in the host including, on some cases,
-  NIC to get stuck.
-  On ConnectX-5, ConnectX-6 and BlueField the MPW is secure and enabled by default.
-
-- ``txq_mpw_hdr_dseg_en`` parameter [int]
-
-  A nonzero value enables including two pointers in the first block of TX
-  descriptor. This can be used to lessen CPU load for memory copy.
-
-  Effective only when Enhanced MPS is supported. Disabled by default.
-
-- ``txq_max_inline_len`` parameter [int]
-
-  Maximum size of packet to be inlined. This limits the size of packet to
-  be inlined. If the size of a packet is larger than configured value, the
-  packet isn't inlined even though there's enough space remained in the
-  descriptor. Instead, the packet is included with pointer.
-
-  Effective only when Enhanced MPS is supported. The default value is 256.
-
-- ``tx_vec_en`` parameter [int]
-
-  A nonzero value enables Tx vector on ConnectX-5, ConnectX-6 and BlueField NICs if the number of
-  global Tx queues on the port is less than ``txqs_max_vec``.
-
-  This option cannot be used with certain offloads such as ``DEV_TX_OFFLOAD_TCP_TSO,
-  DEV_TX_OFFLOAD_VXLAN_TNL_TSO, DEV_TX_OFFLOAD_GRE_TNL_TSO, DEV_TX_OFFLOAD_VLAN_INSERT``.
-  When those offloads are requested the MPS send function will not be used.
-
-  Enabled by default on ConnectX-5, ConnectX-6 and BlueField.
+  A nonzero value enables Enhanced Multi-Packet Write (eMPW) for ConnectX-5,
+  ConnectX-6 and BlueField. eMPW allows the TX burst function to pack up multiple
+  packets in a single descriptor session in order to save PCI bandwidth and improve
+  performance at the cost of a slightly higher CPU usage. When ``txq_inline_mpw``
+  is set along with ``txq_mpw_en``, TX burst function copies entire packet
+  data on to TX descriptor instead of including pointer of packet.
+
+  The Enhanced Multi-Packet Write feature is enabled by default if the NIC supports
+  it, and can be disabled by explicitly specifying 0 for the ``txq_mpw_en`` option.
+  Also, if minimal data inlining is requested by non-zero ``txq_inline_min``
+  option or reported by the NIC, the eMPW feature is disengaged.
 
 - ``rx_vec_en`` parameter [int]
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 42b36a7..e803f08 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -68,6 +68,15 @@
 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
 
+/* Device parameter to limit packet size to inline with ordinary SEND. */
+#define MLX5_TXQ_INLINE_MAX "txq_inline_max"
+
+/* Device parameter to configure minimal data size to inline. */
+#define MLX5_TXQ_INLINE_MIN "txq_inline_min"
+
+/* Device parameter to limit packet size to inline with Enhanced MPW. */
+#define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"
+
 /*
  * Device parameter to configure the number of TX queues threshold for
  * enabling inline send.
@@ -884,6 +893,12 @@ struct mlx5_dev_spawn_data {
 		config->mprq.max_memcpy_len = tmp;
 	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
 		config->mprq.min_rxqs_num = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
+		config->txq_inline_max = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
+		config->txq_inline_min = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
+		config->txq_inline_mpw = tmp;
 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
 		config->txqs_inline = tmp;
 	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
@@ -932,6 +947,9 @@ struct mlx5_dev_spawn_data {
 		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
 		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
 		MLX5_RXQS_MIN_MPRQ,
+		MLX5_TXQ_INLINE_MIN,
+		MLX5_TXQ_INLINE_MAX,
+		MLX5_TXQ_INLINE_MPW,
 		MLX5_TXQS_MIN_INLINE,
 		MLX5_TXQ_MPW_EN,
 		MLX5_RX_VEC_EN,
@@ -1882,6 +1900,9 @@ struct mlx5_dev_spawn_data {
 		.hw_padding = 0,
 		.mps = MLX5_ARG_UNSET,
 		.rx_vec_en = 1,
+		.txq_inline_max = MLX5_ARG_UNSET,
+		.txq_inline_min = MLX5_ARG_UNSET,
+		.txq_inline_mpw = MLX5_ARG_UNSET,
 		.txqs_inline = MLX5_ARG_UNSET,
 		.vf_nl_en = 1,
 		.mr_ext_memseg_en = 1,
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index b8986fc..8e2eab3 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -189,6 +189,7 @@ struct mlx5_dev_config {
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
+	unsigned int tx_inline:1; /* Engage TX data inlining. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
 	unsigned int mr_ext_memseg_en:1;
 	/* Whether memseg should be extended for MR creation. */
@@ -214,6 +215,9 @@ struct mlx5_dev_config {
 	unsigned int ind_table_max_size; /* Maximum indirection table size. */
 	unsigned int max_dump_files_num; /* Maximum dump files per queue. */
 	int txqs_inline; /* Queue number threshold for inlining. */
+	int txq_inline_min; /* Minimal amount of data bytes to inline. */
+	int txq_inline_max; /* Max packet size for inlining with SEND. */
+	int txq_inline_mpw; /* Max packet size for inlining with eMPW. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 };
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v2 3/7] net/mlx5: update Tx datapath definitions
  2019-07-15 13:59   ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 1/7] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 2/7] net/mlx5: add Tx datapath related devargs Viacheslav Ovsiienko
@ 2019-07-15 13:59     ` Viacheslav Ovsiienko
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 4/7] net/mlx5: add Tx datapath configuration and setup Viacheslav Ovsiienko
                       ` (4 subsequent siblings)
  7 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-15 13:59 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch updates Tx datapath definitions, mostly hardware related.
The Tx descriptor structures are redefined with the required fields,
size definitions are renamed to reflect their meanings in a more
appropriate way. This is a preparation step before introducing
the new Tx datapath implementation.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_defs.h |   2 +-
 drivers/net/mlx5/mlx5_prm.h  | 163 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 151 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 6861304..873a595 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -58,7 +58,7 @@
 #define MLX5_MAX_XSTATS 32
 
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
-#define MLX5_MAX_TSO_HEADER 192
+#define MLX5_MAX_TSO_HEADER (128u + 34u)
 
 /* Threshold of buffer replenishment for vectorized Rx. */
 #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index ff5dfbb..a251369 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -39,14 +39,84 @@
 /* Invalidate a CQE. */
 #define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
 
-/* WQE DWORD size */
-#define MLX5_WQE_DWORD_SIZE 16
-
-/* WQE size */
-#define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
+/* WQE Segment sizes in bytes. */
+#define MLX5_WSEG_SIZE 16u
+#define MLX5_WQE_CSEG_SIZE sizeof(struct mlx5_wqe_cseg)
+#define MLX5_WQE_DSEG_SIZE sizeof(struct mlx5_wqe_dseg)
+#define MLX5_WQE_ESEG_SIZE sizeof(struct mlx5_wqe_eseg)
+
+/* WQE/WQEBB size in bytes. */
+#define MLX5_WQE_SIZE sizeof(struct mlx5_wqe)
+
+/*
+ * Max size of a WQE session.
+ * Absolute maximum size is 63 (MLX5_DSEG_MAX) segments,
+ * the WQE size field in Control Segment is 6 bits wide.
+ */
+#define MLX5_WQE_SIZE_MAX (60 * MLX5_WSEG_SIZE)
+
+/*
+ * Default minimum number of Tx queues for inlining packets.
+ * If there are fewer queues than specified we assume we do
+ * not have enough CPU resources (cycles) to perform inlining,
+ * the PCIe throughput is not expected to be the bottleneck and
+ * inlining is disabled.
+ */
+#define MLX5_EMPW_MIN_TXQS 8u
+
+/*
+ * Default packet length threshold to be inlined with
+ * enhanced MPW. If packet length exceeds the threshold
+ * the data are not inlined. Should be aligned in WQEBB
+ * boundary with accounting the title Control and Ethernet
+ * segments.
+ */
+#define MLX5_EMPW_DEF_INLINE_LEN (3U * MLX5_WQE_SIZE + \
+				  MLX5_DSEG_MIN_INLINE_SIZE - \
+				  MLX5_WQE_DSEG_SIZE)
+/*
+ * Maximal inline data length sent with enhanced MPW.
+ * Is based on maximal WQE size.
+ */
+#define MLX5_EMPW_MAX_INLINE_LEN (MLX5_WQE_SIZE_MAX - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE + \
+				  MLX5_DSEG_MIN_INLINE_SIZE)
+/*
+ * Minimal amount of packets to be sent with EMPW.
+ * This limits the minimal required size of sent EMPW.
+ * If there are not enough resources to build a minimal
+ * EMPW the sending loop exits.
+ */
+#define MLX5_EMPW_MIN_PACKETS (2 + 3 * 4)
+#define MLX5_EMPW_MAX_PACKETS ((MLX5_WQE_SIZE_MAX - \
+				MLX5_WQE_CSEG_SIZE - \
+				MLX5_WQE_ESEG_SIZE) / \
+				MLX5_WSEG_SIZE)
+/*
+ * Default packet length threshold to be inlined with
+ * ordinary SEND. Inlining saves the MR key search
+ * and extra PCIe data fetch transaction, but eats the
+ * CPU cycles.
+ */
+#define MLX5_SEND_DEF_INLINE_LEN (5U * MLX5_WQE_SIZE + \
+				  MLX5_ESEG_MIN_INLINE_SIZE - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE)
+/*
+ * Maximal inline data length sent with ordinary SEND.
+ * Is based on maximal WQE size.
+ */
+#define MLX5_SEND_MAX_INLINE_LEN (MLX5_WQE_SIZE_MAX - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE + \
+				  MLX5_ESEG_MIN_INLINE_SIZE)
 
-#define MLX5_OPC_MOD_ENHANCED_MPSW 0
-#define MLX5_OPCODE_ENHANCED_MPSW 0x29
+/* Missing in mlx5dv.h, should be defined here. */
+#define MLX5_OPCODE_ENHANCED_MPSW 0x29u
 
 /* CQE value to inform that VLAN is stripped. */
 #define MLX5_CQE_VLAN_STRIPPED (1u << 0)
@@ -114,6 +184,12 @@
 /* Inner L3 type is IPV6. */
 #define MLX5_ETH_WQE_L3_INNER_IPV6 (1u << 0)
 
+/* VLAN insertion flag. */
+#define MLX5_ETH_WQE_VLAN_INSERT (1u << 31)
+
+/* Data inline segment flag. */
+#define MLX5_ETH_WQE_DATA_INLINE (1u << 31)
+
 /* Is flow mark valid. */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 #define MLX5_FLOW_MARK_IS_VALID(val) ((val) & 0xffffff00)
@@ -130,12 +206,21 @@
 /* Default mark value used when none is provided. */
 #define MLX5_FLOW_MARK_DEFAULT 0xffffff
 
-/* Maximum number of DS in WQE. */
+/* Maximum number of DS in WQE. Limited by 6-bit field. */
 #define MLX5_DSEG_MAX 63
 
 /* The completion mode offset in the WQE control segment line 2. */
 #define MLX5_COMP_MODE_OFFSET 2
 
+/* Amount of data bytes in minimal inline data segment. */
+#define MLX5_DSEG_MIN_INLINE_SIZE 12
+
+/* Amount of data bytes in minimal inline eth segment. */
+#define MLX5_ESEG_MIN_INLINE_SIZE 18
+
+/* Amount of data bytes after eth data segment. */
+#define MLX5_ESEG_EXTRA_DATA_SIZE 32
+
 /* Completion mode. */
 enum mlx5_completion_mode {
 	MLX5_COMP_ONLY_ERR = 0x0,
@@ -144,11 +229,6 @@ enum mlx5_completion_mode {
 	MLX5_COMP_CQE_AND_EQE = 0x3,
 };
 
-/* Small common part of the WQE. */
-struct mlx5_wqe {
-	uint32_t ctrl[4];
-};
-
 /* MPW mode. */
 enum mlx5_mpw_mode {
 	MLX5_MPW_DISABLED,
@@ -156,6 +236,63 @@ enum mlx5_mpw_mode {
 	MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2. */
 };
 
+/* WQE Control segment. */
+struct mlx5_wqe_cseg {
+	uint32_t opcode;
+	uint32_t sq_ds;
+	uint32_t flags;
+	uint32_t misc;
+} __rte_packed __rte_aligned(MLX5_WSEG_SIZE);
+
+/* Header of data segment. Minimal size Data Segment */
+struct mlx5_wqe_dseg {
+	uint32_t bcount;
+	union {
+		uint8_t inline_data[MLX5_DSEG_MIN_INLINE_SIZE];
+		struct {
+			uint32_t lkey;
+			uint64_t pbuf;
+		} __rte_packed;
+	};
+} __rte_packed;
+
+/* Subset of struct WQE Ethernet Segment. */
+struct mlx5_wqe_eseg {
+	union {
+		struct {
+			uint32_t swp_offs;
+			uint8_t	cs_flags;
+			uint8_t	swp_flags;
+			uint16_t mss;
+			uint32_t metadata;
+			uint16_t inline_hdr_sz;
+			union {
+				uint16_t inline_data;
+				uint16_t vlan_tag;
+			};
+		} __rte_packed;
+		struct {
+			uint32_t offsets;
+			uint32_t flags;
+			uint32_t flow_metadata;
+			uint32_t inline_hdr;
+		} __rte_packed;
+	};
+} __rte_packed;
+
+/* The title WQEBB, header of WQE. */
+struct mlx5_wqe {
+	union {
+		struct mlx5_wqe_cseg cseg;
+		uint32_t ctrl[4];
+	};
+	struct mlx5_wqe_eseg eseg;
+	union {
+		struct mlx5_wqe_dseg dseg[2];
+		uint8_t data[MLX5_ESEG_EXTRA_DATA_SIZE];
+	};
+} __rte_packed;
+
 /* WQE for Multi-Packet RQ. */
 struct mlx5_wqe_mprq {
 	struct mlx5_wqe_srq_next_seg next_seg;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v2 4/7] net/mlx5: add Tx datapath configuration and setup
  2019-07-15 13:59   ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                       ` (2 preceding siblings ...)
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 3/7] net/mlx5: update Tx datapath definitions Viacheslav Ovsiienko
@ 2019-07-15 13:59     ` Viacheslav Ovsiienko
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 5/7] net/mlx5: introduce Tx burst routine template Viacheslav Ovsiienko
                       ` (3 subsequent siblings)
  7 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-15 13:59 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch updates the Tx datapath control and configuration
structures and the code for managing Tx datapath settings.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c |   4 +-
 drivers/net/mlx5/mlx5_rxtx.h |  55 +++++++++----
 drivers/net/mlx5/mlx5_txq.c  | 182 ++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 212 insertions(+), 29 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index f2d6918..13f9431 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -507,7 +507,7 @@
 			MKSTR(err_str, "Unexpected CQE error syndrome "
 			      "0x%02x CQN = %u SQN = %u wqe_counter = %u "
 			      "wq_ci = %u cq_ci = %u", err_cqe->syndrome,
-			      txq_ctrl->cqn, txq->qp_num_8s >> 8,
+			      txq->cqe_s, txq->qp_num_8s >> 8,
 			      rte_be_to_cpu_16(err_cqe->wqe_counter),
 			      txq->wqe_ci, txq->cq_ci);
 			MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u",
@@ -516,7 +516,7 @@
 			mlx5_dump_debug_information(name, NULL, err_str, 0);
 			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
 						    (const void *)((uintptr_t)
-						    &(*txq->cqes)[0]),
+						    txq->cqes),
 						    sizeof(*err_cqe) *
 						    (1 << txq->cqe_n));
 			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index acde09d..d8c6f35 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -188,37 +188,60 @@ struct mlx5_hrxq {
 	uint8_t rss_key[]; /* Hash key. */
 };
 
+/* TX queue send local data. */
+__extension__
+struct mlx5_txq_local {
+	struct mlx5_wqe *wqe_last; /* last sent WQE pointer. */
+	struct rte_mbuf *mbuf; /* first mbuf to process. */
+	uint16_t pkts_copy; /* packets copied to elts. */
+	uint16_t pkts_sent; /* packets sent. */
+	uint16_t elts_free; /* available elts remain. */
+	uint16_t wqe_free; /* available wqe remain. */
+	uint16_t mbuf_off; /* data offset in current mbuf. */
+	uint16_t mbuf_nseg; /* number of remaining mbuf. */
+};
+
 /* TX queue descriptor. */
 __extension__
 struct mlx5_txq_data {
 	uint16_t elts_head; /* Current counter in (*elts)[]. */
 	uint16_t elts_tail; /* Counter of first element awaiting completion. */
-	uint16_t elts_comp; /* Counter since last completion request. */
-	uint16_t mpw_comp; /* WQ index since last completion request. */
+	uint16_t elts_comp; /* elts index since last completion request. */
+	uint16_t elts_s; /* Number of mbuf elements. */
+	uint16_t elts_m; /* Mask for mbuf elements indices. */
+	/* Fields related to elts mbuf storage. */
+	uint16_t wqe_ci; /* Consumer index for work queue. */
+	uint16_t wqe_pi; /* Producer index for work queue. */
+	uint16_t wqe_s; /* Number of WQ elements. */
+	uint16_t wqe_m; /* Mask Number for WQ elements. */
+	uint16_t wqe_comp; /* WQE index since last completion request. */
+	uint16_t wqe_thres; /* WQE threshold to request completion in CQ. */
+	/* WQ related fields. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
 #ifndef NDEBUG
-	uint16_t cq_pi; /* Producer index for completion queue. */
+	uint16_t cq_pi; /* Counter of issued CQE "always" requests. */
 #endif
-	uint16_t wqe_ci; /* Consumer index for work queue. */
-	uint16_t wqe_pi; /* Producer index for work queue. */
-	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
+	uint16_t cqe_s; /* Number of CQ elements. */
+	uint16_t cqe_m; /* Mask for CQ indices. */
+	/* CQ related fields. */
+	uint16_t elts_n:4; /* elts[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
-	uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */
+	uint16_t wqe_n:4; /* Number of WQ elements (in log2). */
 	uint16_t tso_en:1; /* When set hardware TSO is enabled. */
 	uint16_t tunnel_en:1;
 	/* When set TX offload for tunneled packets are supported. */
 	uint16_t swp_en:1; /* Whether SW parser is enabled. */
-	uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
-	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
-	uint16_t inline_max_packet_sz; /* Max packet size for inlining. */
+	uint16_t inlen_send; /* Ordinary send data inline size. */
+	uint16_t inlen_empw; /* eMPW max packet size to inline. */
+	uint16_t inlen_mode; /* Minimal data length to inline. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	uint64_t offloads; /* Offloads for Tx Queue. */
 	struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
-	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
-	volatile void *wqes; /* Work queue (use volatile to write into). */
+	struct mlx5_wqe *wqes; /* Work queue. */
+	struct mlx5_wqe *wqes_end; /* Work queue array limit. */
+	volatile struct mlx5_cqe *cqes; /* Completion queue. */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
-	struct rte_mbuf *(*elts)[]; /* TX elements. */
 	uint16_t port_id; /* Port ID of device. */
 	uint16_t idx; /* Queue index. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
@@ -226,6 +249,8 @@ struct mlx5_txq_data {
 	rte_spinlock_t *uar_lock;
 	/* UAR access lock required for 32bit implementations */
 #endif
+	struct rte_mbuf *elts[0];
+	/* Storage for queued packets, must be the last field. */
 } __rte_cache_aligned;
 
 /* Verbs Rx queue elements. */
@@ -239,7 +264,6 @@ struct mlx5_txq_ibv {
 
 /* TX queue control descriptor. */
 struct mlx5_txq_ctrl {
-	struct mlx5_txq_data txq; /* Data path structure. */
 	LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */
 	rte_atomic32_t refcnt; /* Reference counter. */
 	unsigned int socket; /* CPU socket ID for allocations. */
@@ -249,8 +273,9 @@ struct mlx5_txq_ctrl {
 	struct mlx5_priv *priv; /* Back pointer to private data. */
 	off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
 	void *bf_reg; /* BlueFlame register from Verbs. */
-	uint32_t cqn; /* CQ number. */
 	uint16_t dump_file_n; /* Number of dump files. */
+	struct mlx5_txq_data txq; /* Data path structure. */
+	/* Must be the last field in the structure, contains elts[]. */
 };
 
 #define MLX5_TX_BFREG(txq) \
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 0d2dbfa..147599b 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -47,7 +47,7 @@
 	unsigned int i;
 
 	for (i = 0; (i != elts_n); ++i)
-		(*txq_ctrl->txq.elts)[i] = NULL;
+		txq_ctrl->txq.elts[i] = NULL;
 	DRV_LOG(DEBUG, "port %u Tx queue %u allocated and configured %u WRs",
 		PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx, elts_n);
 	txq_ctrl->txq.elts_head = 0;
@@ -68,7 +68,7 @@
 	const uint16_t elts_m = elts_n - 1;
 	uint16_t elts_head = txq_ctrl->txq.elts_head;
 	uint16_t elts_tail = txq_ctrl->txq.elts_tail;
-	struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;
+	struct rte_mbuf *(*elts)[elts_n] = &txq_ctrl->txq.elts;
 
 	DRV_LOG(DEBUG, "port %u Tx queue %u freeing WRs",
 		PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx);
@@ -411,7 +411,8 @@ struct mlx5_txq_ibv *
 	attr.cq = (struct ibv_cq_init_attr_ex){
 		.comp_mask = 0,
 	};
-	cqe_n = desc / MLX5_TX_COMP_THRESH + 1;
+	cqe_n = desc / MLX5_TX_COMP_THRESH +
+		1 + MLX5_TX_COMP_THRESH_INLINE_DIV;
 	tmpl.cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0);
 	if (tmpl.cq == NULL) {
 		DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure",
@@ -449,7 +450,7 @@ struct mlx5_txq_ibv *
 		.pd = priv->sh->pd,
 		.comp_mask = IBV_QP_INIT_ATTR_PD,
 	};
-	if (txq_data->max_inline)
+	if (txq_data->inlen_send)
 		attr.init.cap.max_inline_data = txq_ctrl->max_inline_data;
 	if (txq_data->tso_en) {
 		attr.init.max_tso_header = txq_ctrl->max_tso_header;
@@ -523,25 +524,29 @@ struct mlx5_txq_ibv *
 		goto error;
 	}
 	txq_data->cqe_n = log2above(cq_info.cqe_cnt);
+	txq_data->cqe_s = 1 << txq_data->cqe_n;
+	txq_data->cqe_m = txq_data->cqe_s - 1;
 	txq_data->qp_num_8s = tmpl.qp->qp_num << 8;
 	txq_data->wqes = qp.sq.buf;
 	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
+	txq_data->wqe_s = 1 << txq_data->wqe_n;
+	txq_data->wqe_m = txq_data->wqe_s - 1;
+	txq_data->wqes_end = txq_data->wqes + txq_data->wqe_s;
 	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
 	txq_data->cq_db = cq_info.dbrec;
-	txq_data->cqes =
-		(volatile struct mlx5_cqe (*)[])
-		(uintptr_t)cq_info.buf;
+	txq_data->cqes = (volatile struct mlx5_cqe *)cq_info.buf;
 	txq_data->cq_ci = 0;
 #ifndef NDEBUG
 	txq_data->cq_pi = 0;
 #endif
 	txq_data->wqe_ci = 0;
 	txq_data->wqe_pi = 0;
+	txq_data->wqe_comp = 0;
+	txq_data->wqe_thres = txq_data->wqe_s / MLX5_TX_COMP_THRESH_INLINE_DIV;
 	txq_ibv->qp = tmpl.qp;
 	txq_ibv->cq = tmpl.cq;
 	rte_atomic32_inc(&txq_ibv->refcnt);
 	txq_ctrl->bf_reg = qp.bf.reg;
-	txq_ctrl->cqn = cq_info.cqn;
 	txq_uar_init(txq_ctrl);
 	if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
 		txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset;
@@ -663,7 +668,11 @@ struct mlx5_txq_ibv *
 	unsigned int wqe_size;
 	const unsigned int desc = 1 << txq_ctrl->txq.elts_n;
 
-	wqe_size = MLX5_WQE_SIZE + txq_ctrl->max_inline_data;
+	wqe_size = MLX5_WQE_CSEG_SIZE +
+		   MLX5_WQE_ESEG_SIZE +
+		   MLX5_WSEG_SIZE -
+		   MLX5_ESEG_MIN_INLINE_SIZE +
+		   txq_ctrl->max_inline_data;
 	return rte_align32pow2(wqe_size * desc) / MLX5_WQE_SIZE;
 }
 
@@ -676,7 +685,156 @@ struct mlx5_txq_ibv *
 static void
 txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
 {
-	(void)txq_ctrl;
+	struct mlx5_priv *priv = txq_ctrl->priv;
+	struct mlx5_dev_config *config = &priv->config;
+	unsigned int inlen_send; /* Inline data for ordinary SEND.*/
+	unsigned int inlen_empw; /* Inline data for enhanced MPW. */
+	unsigned int inlen_mode; /* Minimal required Inline data. */
+	unsigned int txqs_inline; /* Min Tx queues to enable inline. */
+	int tso = !!(txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO |
+					       DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+					       DEV_TX_OFFLOAD_GRE_TNL_TSO |
+					       DEV_TX_OFFLOAD_IP_TNL_TSO |
+					       DEV_TX_OFFLOAD_UDP_TNL_TSO));
+	unsigned int temp;
+
+	txqs_inline = (config->txqs_inline == MLX5_ARG_UNSET) ?
+		      MLX5_EMPW_MIN_TXQS :
+		      (unsigned int)config->txqs_inline;
+	inlen_send = (config->txq_inline_max == MLX5_ARG_UNSET) ?
+		     MLX5_SEND_DEF_INLINE_LEN :
+		     (unsigned int)config->txq_inline_max;
+	inlen_empw = (config->txq_inline_mpw == MLX5_ARG_UNSET) ?
+		     MLX5_EMPW_DEF_INLINE_LEN :
+		     (unsigned int)config->txq_inline_mpw;
+	inlen_mode = (config->txq_inline_min == MLX5_ARG_UNSET) ?
+		     0 : (unsigned int)config->txq_inline_min;
+
+	/*
+	 * If there is requested minimal amount of data to inline
+	 * we MUST enable inlining. This is a case for ConnectX-4
+	 * which usually requires L2 inlined for correct operating
+	 * and ConnectX-4LX which requires L2-L4 inlined to
+	 * support E-Switch Flows.
+	 */
+	if (inlen_mode) {
+		if (inlen_mode <= MLX5_ESEG_MIN_INLINE_SIZE) {
+			/*
+			 * Optimize minimal inlining for single
+			 * segment packets to fill one WQEBB
+			 * without gaps.
+			 */
+			temp = MLX5_ESEG_MIN_INLINE_SIZE;
+		} else {
+			temp = inlen_mode - MLX5_ESEG_MIN_INLINE_SIZE;
+			temp = RTE_ALIGN(temp, MLX5_WSEG_SIZE) +
+			       MLX5_ESEG_MIN_INLINE_SIZE;
+			temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
+		}
+		if (temp != inlen_mode) {
+			DRV_LOG(INFO,
+				"port %u minimal required inline setting"
+				" aligned from %u to %u",
+				PORT_ID(priv), inlen_mode, temp);
+			inlen_mode = temp;
+		}
+	}
+	/*
+	 * If there are few Tx queues, saving CPU cycles is
+	 * prioritized and data inlining is disabled altogether.
+	 */
+	if (inlen_send && priv->txqs_n >= txqs_inline) {
+		/*
+		 * The data sent with ordinary MLX5_OPCODE_SEND
+		 * may be inlined in Ethernet Segment, align the
+		 * length accordingly to fit entire WQEBBs.
+		 */
+		temp = (inlen_send / MLX5_WQE_SIZE) * MLX5_WQE_SIZE +
+			MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE;
+		temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
+				     MLX5_ESEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE);
+		temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
+		temp = RTE_MAX(temp, inlen_mode);
+		if (temp != inlen_send) {
+			DRV_LOG(INFO,
+				"port %u ordinary send inline setting"
+				" aligned from %u to %u",
+				PORT_ID(priv), inlen_send, temp);
+			inlen_send = temp;
+		}
+		/*
+		 * Not aligned to cache lines, but to WQEs.
+		 * First bytes of data (initial alignment)
+		 * is going to be copied explicitly at the
+		 * beginning of inlining buffer in Ethernet
+		 * Segment.
+		 */
+		assert(inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+		assert(inlen_send <= MLX5_WQE_SIZE_MAX +
+				     MLX5_ESEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE);
+		txq_ctrl->txq.inlen_send = inlen_send;
+		txq_ctrl->txq.inlen_mode = inlen_mode;
+	} else {
+		/*
+		 * If minimal inlining is requested we must
+		 * enable inlining in general, despite the
+		 * number of configured queues.
+		 */
+		txq_ctrl->txq.inlen_send = inlen_mode;
+		txq_ctrl->txq.inlen_mode = inlen_mode;
+		txq_ctrl->txq.inlen_empw = 0;
+		inlen_send = 0;
+		inlen_empw = 0;
+	}
+	if (inlen_send && inlen_empw && priv->txqs_n >= txqs_inline) {
+		/*
+		 * The data sent with MLX5_OPCODE_ENHANCED_MPSW
+		 * may be inlined in Data Segment, align the
+		 * length accordingly to fit entire WQEBBs.
+		 */
+		temp = (inlen_empw + MLX5_WQE_SIZE - 1) / MLX5_WQE_SIZE;
+		temp = temp * MLX5_WQE_SIZE +
+		       MLX5_DSEG_MIN_INLINE_SIZE - MLX5_WQE_DSEG_SIZE;
+		temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
+				     MLX5_DSEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE);
+		temp = RTE_MIN(temp, MLX5_EMPW_MAX_INLINE_LEN);
+		if (temp != inlen_empw) {
+			DRV_LOG(INFO,
+				"port %u enhanced empw inline setting"
+				" aligned from %u to %u",
+				PORT_ID(priv), inlen_empw, temp);
+			inlen_empw = temp;
+		}
+		assert(inlen_empw >= MLX5_ESEG_MIN_INLINE_SIZE);
+		assert(inlen_empw <= MLX5_WQE_SIZE_MAX +
+				     MLX5_DSEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE);
+		txq_ctrl->txq.inlen_empw = inlen_empw;
+	}
+	txq_ctrl->max_inline_data = RTE_MAX(inlen_send, inlen_empw);
+	if (tso) {
+		txq_ctrl->max_tso_header = MLX5_MAX_TSO_HEADER;
+		txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->max_inline_data,
+						    MLX5_MAX_TSO_HEADER);
+		txq_ctrl->txq.tso_en = 1;
+	}
+	txq_ctrl->txq.tunnel_en = config->tunnel_en | config->swp;
+	txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO |
+				 DEV_TX_OFFLOAD_UDP_TNL_TSO |
+				 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) &
+				txq_ctrl->txq.offloads) && config->swp;
+	config->tx_inline = txq_ctrl->txq.inlen_send ? 1 : 0;
 }
 
 /**
@@ -724,6 +882,8 @@ struct mlx5_txq_ctrl *
 	tmpl->priv = priv;
 	tmpl->socket = socket;
 	tmpl->txq.elts_n = log2above(desc);
+	tmpl->txq.elts_s = desc;
+	tmpl->txq.elts_m = desc - 1;
 	tmpl->txq.port_id = dev->data->port_id;
 	tmpl->txq.idx = idx;
 	txq_set_params(tmpl);
@@ -737,8 +897,6 @@ struct mlx5_txq_ctrl *
 		rte_errno = ENOMEM;
 		goto error;
 	}
-	tmpl->txq.elts =
-		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])(tmpl + 1);
 	rte_atomic32_inc(&tmpl->refcnt);
 	LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
 	return tmpl;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v2 5/7] net/mlx5: introduce Tx burst routine template
  2019-07-15 13:59   ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                       ` (3 preceding siblings ...)
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 4/7] net/mlx5: add Tx datapath configuration and setup Viacheslav Ovsiienko
@ 2019-07-15 13:59     ` Viacheslav Ovsiienko
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 6/7] net/mlx5: implement Tx burst template Viacheslav Ovsiienko
                       ` (2 subsequent siblings)
  7 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-15 13:59 UTC (permalink / raw)
  To: dev; +Cc: yskoh

Mellanox NICs support a wide set of Tx offloads. The supported
offloads are reported by the mlx5 PMD in rte_eth_dev_info tx_offload_capa
field. An application may choose any combination of supported offloads
and configure the device appropriately. Some of the Tx offloads may
not be requested by the application, or even all of them may be omitted.
Most of the Tx offloads require dedicated code branches in the tx_burst
routine to support them. If a Tx offload is not requested, the tx_burst
routine code may be significantly simplified and consume fewer CPU cycles.

For example, if the application does not engage the TSO offload, this code
can be omitted; if multi-segment packets are not expected, the tx_burst
routine may assume single-mbuf packets only, etc.

Currently, the mlx5 PMD implements multiple tx_burst subroutines
for most common combinations of requested Tx offloads, each branch
has its own dedicated implementation. It is not very easy to update,
support and develop such kind of code - multiple branches impose
the multiple points to process. Also many of frequently requested
offload combinations are not supported yet. That leads to selecting of
not completely matching tx_burst routine and harms the performance.

This patch introduces the new approach for tx_burst code. It is proposed
to develop the unified template for tx_burst routine, which supports
all the Tx offloads and takes the compile time defined parameter
describing the supposed set of supported offloads. On the base
of this template, the compiler is able to generate multiple tx_burst
routines highly optimized for the statically specified set of Tx offloads.
Next, in runtime, at Tx queue configuration the best matching optimized
implementation of tx_burst is chosen.

This patch intentionally omits the template internal implementation,
and just introduces the template itself to illustrate the approach of
the multiple specially tuned tx_burst routines.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 511 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 507 insertions(+), 4 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 13f9431..af6f705 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2015 6WIND S.A.
- * Copyright 2015 Mellanox Technologies, Ltd
+ * Copyright 2015-2019 Mellanox Technologies, Ltd
  */
 
 #include <assert.h>
@@ -34,6 +34,57 @@
 #include "mlx5_defs.h"
 #include "mlx5_prm.h"
 
+/* TX burst subroutines return codes. */
+enum mlx5_txcmp_code {
+	MLX5_TXCMP_CODE_EXIT = 0,
+	MLX5_TXCMP_CODE_ERROR,
+	MLX5_TXCMP_CODE_SINGLE,
+	MLX5_TXCMP_CODE_MULTI,
+	MLX5_TXCMP_CODE_TSO,
+	MLX5_TXCMP_CODE_EMPW,
+};
+
+/*
+ * These defines are used to configure the Tx burst routine option set
+ * supported at compile time. The options not specified are optimized
+ * out because the if conditions can be evaluated at compile time.
+ * The offloads with bigger runtime check (require more CPU cycles to
+ * skip) overhead should have the bigger index - this is needed to
+ * select the better matching routine function if no exact match and
+ * some offloads are not actually requested.
+ */
+#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/
+#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/
+#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/
+#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
+#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
+#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/
+#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
+#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
+
+/* The most common offloads groups. */
+#define MLX5_TXOFF_CONFIG_NONE 0
+#define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \
+				MLX5_TXOFF_CONFIG_TSO | \
+				MLX5_TXOFF_CONFIG_SWP | \
+				MLX5_TXOFF_CONFIG_CSUM | \
+				MLX5_TXOFF_CONFIG_INLINE | \
+				MLX5_TXOFF_CONFIG_VLAN | \
+				MLX5_TXOFF_CONFIG_METADATA)
+
+#define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask)
+
+#define MLX5_TXOFF_DECL(func, olx) \
+static uint16_t mlx5_tx_burst_##func(void *txq, \
+				     struct rte_mbuf **pkts, \
+				    uint16_t pkts_n) \
+{ \
+	return mlx5_tx_burst_tmpl((struct mlx5_txq_data *restrict)txq, \
+		    pkts, pkts_n, (olx)); \
+}
+
+#define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},
+
 static __rte_always_inline uint32_t
 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
 
@@ -1531,7 +1582,323 @@
 }
 
 /**
- * Configure the TX function to use.
+ * DPDK Tx callback template. This is a configurable template
+ * used to generate routines optimized for a specified offload setup.
+ * One of these generated functions is chosen at SQ configuration
+ * time.
+ *
+ * @param txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param olx
+ *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
+ *   values. Should be static to take compile time static configuration
+ *   advantages.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+static __rte_always_inline uint16_t
+mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
+		   struct rte_mbuf **restrict pkts,
+		   uint16_t pkts_n,
+		   unsigned int olx)
+{
+	(void)txq;
+	(void)pkts;
+	(void)pkts_n;
+	(void)olx;
+	return 0;
+}
+
+/* Generate routines with Enhanced Multi-Packet Write support. */
+MLX5_TXOFF_DECL(full_empw,
+		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(none_empw,
+		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(md_empw,
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mt_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtsc_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mti_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtiv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sc_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sci_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(scv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sciv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(i_empw,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(v_empw,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(iv_empw,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+/* Generate routines without Enhanced Multi-Packet Write support. */
+MLX5_TXOFF_DECL(full,
+		MLX5_TXOFF_CONFIG_FULL)
+
+MLX5_TXOFF_DECL(none,
+		MLX5_TXOFF_CONFIG_NONE)
+
+MLX5_TXOFF_DECL(md,
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mt,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mtsc,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mti,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(mtv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(mtiv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(sc,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(sci,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(scv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(sciv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(i,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(v,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(iv,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+/*
+ * Array of declared and compiled Tx burst function and corresponding
+ * supported offloads set. The array is used to select the Tx burst
+ * function for specified offloads set at Tx queue configuration time.
+ */
+const struct {
+	eth_tx_burst_t func;
+	unsigned int olx;
+} txoff_func[] = {
+MLX5_TXOFF_INFO(full_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(none_empw,
+		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(md_empw,
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mt_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtsc_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mti_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtiv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sc_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sci_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(scv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sciv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(i_empw,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(v_empw,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(iv_empw,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(full,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(none,
+		MLX5_TXOFF_CONFIG_NONE)
+
+MLX5_TXOFF_INFO(md,
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mt,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mtsc,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mti,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_INFO(mtv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mtiv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sc,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sci,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(scv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sciv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(i,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(v,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(iv,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+};
+
+/**
+ * Configure the Tx function to use. The routine checks configured
+ * Tx offloads for the device and selects appropriate Tx burst
+ * routine. There are multiple Tx burst routines compiled from
+ * the same template in the most optimal way for the dedicated
+ * Tx offloads set.
  *
  * @param dev
  *   Pointer to private data structure.
@@ -1542,8 +1909,144 @@
 eth_tx_burst_t
 mlx5_select_tx_function(struct rte_eth_dev *dev)
 {
-	(void)dev;
-	return removed_tx_burst;
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_config *config = &priv->config;
+	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
+	unsigned int diff = 0, olx = 0, i, m;
+
+	static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <=
+		      MLX5_DSEG_MAX, "invalid WQE max size");
+	static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE,
+		      "invalid WQE Control Segment size");
+	static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE,
+		      "invalid WQE Ethernet Segment size");
+	static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE,
+		      "invalid WQE Data Segment size");
+	static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE,
+		      "invalid WQE size");
+	assert(priv);
+	if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) {
+		/* We should support Multi-Segment Packets. */
+		olx |= MLX5_TXOFF_CONFIG_MULTI;
+	}
+	if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
+			   DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+			   DEV_TX_OFFLOAD_GRE_TNL_TSO |
+			   DEV_TX_OFFLOAD_IP_TNL_TSO |
+			   DEV_TX_OFFLOAD_UDP_TNL_TSO)) {
+		/* We should support TCP Send Offload. */
+		olx |= MLX5_TXOFF_CONFIG_TSO;
+	}
+	if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
+			   DEV_TX_OFFLOAD_UDP_TNL_TSO |
+			   DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
+		/* We should support Software Parser for Tunnels. */
+		olx |= MLX5_TXOFF_CONFIG_SWP;
+	}
+	if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
+			   DEV_TX_OFFLOAD_UDP_CKSUM |
+			   DEV_TX_OFFLOAD_TCP_CKSUM |
+			   DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
+		/* We should support IP/TCP/UDP Checksums. */
+		olx |= MLX5_TXOFF_CONFIG_CSUM;
+	}
+	if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) {
+		/* We should support VLAN insertion. */
+		olx |= MLX5_TXOFF_CONFIG_VLAN;
+	}
+	if (config->tx_inline) {
+		/*
+		 * Data inlining is enabled by default.
+		 * Required inline data are disabled by default.
+		 */
+		olx |= MLX5_TXOFF_CONFIG_INLINE;
+	}
+	if (config->mps == MLX5_MPW_ENHANCED &&
+	    config->txq_inline_min <= 0) {
+		/*
+		 * The NIC supports Enhanced Multi-Packet Write.
+		 * We do not support legacy MPW due to its
+		 * hardware related problems, so we just ignore
+		 * legacy MLX5_MPW settings. There should be no
+		 * minimal required inline data.
+		 */
+		olx |= MLX5_TXOFF_CONFIG_EMPW;
+	}
+	if (tx_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) {
+		/* We should support Flow metadata. */
+		olx |= MLX5_TXOFF_CONFIG_METADATA;
+	}
+	/*
+	 * Scan the routines table to find the minimal
+	 * satisfying routine with requested offloads.
+	 */
+	m = RTE_DIM(txoff_func);
+	for (i = 0; i < RTE_DIM(txoff_func); i++) {
+		unsigned int tmp;
+
+		tmp = txoff_func[i].olx;
+		if (tmp == olx) {
+			/* Meets requested offloads exactly.*/
+			m = i;
+			break;
+		}
+		if ((tmp & olx) != olx) {
+			/* Does not meet requested offloads at all. */
+			continue;
+		}
+		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
+			/* Do not enable eMPW if not configured. */
+			continue;
+		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
+			/* Do not enable inlining if not configured. */
+			continue;
+		/*
+		 * Some routine meets the requirements.
+		 * Check whether it has minimal amount
+		 * of not requested offloads.
+		 */
+		tmp = __builtin_popcountl(tmp & ~olx);
+		if (m >= RTE_DIM(txoff_func) || tmp < diff) {
+			/* First or better match, save and continue. */
+			m = i;
+			diff = tmp;
+			continue;
+		}
+		if (tmp == diff) {
+			tmp = txoff_func[i].olx ^ txoff_func[m].olx;
+			if (__builtin_ffsl(txoff_func[i].olx & ~tmp) <
+			    __builtin_ffsl(txoff_func[m].olx & ~tmp)) {
+				/* Lighter not requested offload. */
+				m = i;
+			}
+		}
+	}
+	if (m >= RTE_DIM(txoff_func)) {
+		DRV_LOG(DEBUG, "port %u has no selected Tx function"
+			       " for requested offloads %04X",
+				dev->data->port_id, olx);
+		return NULL;
+	}
+	DRV_LOG(DEBUG, "port %u has selected Tx function"
+		       " supporting offloads %04X/%04X",
+			dev->data->port_id, olx, txoff_func[m].olx);
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI)
+		DRV_LOG(DEBUG, "\tMULTI (multi segment)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO)
+		DRV_LOG(DEBUG, "\tTSO   (TCP send offload)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP)
+		DRV_LOG(DEBUG, "\tSWP   (software parser)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM)
+		DRV_LOG(DEBUG, "\tCSUM  (checksum offload)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE)
+		DRV_LOG(DEBUG, "\tINLIN (inline data)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN)
+		DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
+		DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW)
+		DRV_LOG(DEBUG, "\tEMPW  (Enhanced MPW)");
+	return txoff_func[m].func;
 }
 
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v2 6/7] net/mlx5: implement Tx burst template
  2019-07-15 13:59   ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                       ` (4 preceding siblings ...)
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 5/7] net/mlx5: introduce Tx burst routine template Viacheslav Ovsiienko
@ 2019-07-15 13:59     ` Viacheslav Ovsiienko
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 7/7] net/mlx5: add minimal required Tx data inline Viacheslav Ovsiienko
  2019-07-17  6:53     ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  7 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-15 13:59 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch adds the implementation of the tx_burst routine template.
The template supports all Tx offloads, and multiple optimized
tx_burst routines can be generated by the compiler from this one.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 2897 +++++++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h |    5 +-
 2 files changed, 2873 insertions(+), 29 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index af6f705..2674ea8 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -339,6 +339,109 @@ enum mlx5_txcmp_code {
 }
 
 /**
+ * Set Software Parser flags and offsets in Ethernet Segment of WQE.
+ * Flags must be preliminary initialized to zero.
+ *
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param swp_flags
+ *   Pointer to store Software Parser flags
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Software Parser offsets packed in dword.
+ *   Software Parser flags are set by pointer.
+ */
+static __rte_always_inline uint32_t
+txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc,
+		uint8_t *swp_flags,
+		unsigned int olx)
+{
+	uint64_t ol, tunnel;
+	unsigned int idx, off;
+	uint32_t set;
+
+	if (!MLX5_TXOFF_CONFIG(SWP))
+		return 0;
+	ol = loc->mbuf->ol_flags;
+	tunnel = ol & PKT_TX_TUNNEL_MASK;
+	/*
+	 * Check whether Software Parser is required.
+	 * Only customized tunnels may ask for it.
+	 */
+	if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP))
+		return 0;
+	/*
+	 * The index should have:
+	 * bit[0:1] = PKT_TX_L4_MASK
+	 * bit[4] = PKT_TX_IPV6
+	 * bit[8] = PKT_TX_OUTER_IPV6
+	 * bit[9] = PKT_TX_OUTER_UDP
+	 */
+	idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52;
+	idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0;
+	*swp_flags = mlx5_swp_types_table[idx];
+	/*
+	 * Set offsets for SW parser. Since ConnectX-5, SW parser just
+	 * complements HW parser. SW parser starts to engage only if HW parser
+	 * can't reach a header. For the older devices, HW parser will not kick
+	 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
+	 * should be set regardless of HW offload.
+	 */
+	off = loc->mbuf->outer_l2_len;
+	if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT)
+		off += sizeof(struct rte_vlan_hdr);
+	set = (off >> 1) << 8; /* Outer L3 offset. */
+	off += loc->mbuf->outer_l3_len;
+	if (tunnel == PKT_TX_TUNNEL_UDP)
+		set |= off >> 1; /* Outer L4 offset. */
+	if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */
+		const uint64_t csum = ol & PKT_TX_L4_MASK;
+			off += loc->mbuf->l2_len;
+		set |= (off >> 1) << 24; /* Inner L3 offset. */
+		if (csum == PKT_TX_TCP_CKSUM ||
+		    csum == PKT_TX_UDP_CKSUM ||
+		    (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) {
+			off += loc->mbuf->l3_len;
+			set |= (off >> 1) << 16; /* Inner L4 offset. */
+		}
+	}
+	set = rte_cpu_to_le_32(set);
+	return set;
+}
+
+/**
+ * Convert the Checksum offloads to Verbs.
+ *
+ * @param buf
+ *   Pointer to the mbuf.
+ *
+ * @return
+ *   Converted checksum flags.
+ */
+static __rte_always_inline uint8_t
+txq_ol_cksum_to_cs(struct rte_mbuf *buf)
+{
+	uint32_t idx;
+	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
+	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
+				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
+
+	/*
+	 * The index should have:
+	 * bit[0] = PKT_TX_TCP_SEG
+	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
+	 * bit[4] = PKT_TX_IP_CKSUM
+	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
+	 * bit[9] = tunnel
+	 */
+	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
+	return mlx5_cksum_table[idx];
+}
+
+/**
  * Internal function to compute the number of used descriptors in an RX queue
  *
  * @param rxq
@@ -543,7 +646,7 @@ enum mlx5_txcmp_code {
  *   The last Tx buffer element to free.
  */
 uint16_t
-mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
+mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq,
 			 volatile struct mlx5_err_cqe *err_cqe)
 {
 	if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
@@ -1563,6 +1666,298 @@ enum mlx5_txcmp_code {
 }
 
 /**
+ * Free the mbufs from the linear array of pointers.
+ *
+ * @param pkts
+ *   Pointer to array of packets to be freed.
+ * @param pkts_n
+ *   Number of packets to be freed.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts,
+		  unsigned int pkts_n,
+		  unsigned int olx __rte_unused)
+{
+	struct rte_mempool *pool = NULL;
+	struct rte_mbuf **p_free = NULL;
+	struct rte_mbuf *mbuf;
+	unsigned int n_free = 0;
+
+	/*
+	 * The implemented algorithm eliminates
+	 * copying pointers to temporary array
+	 * for rte_mempool_put_bulk() calls.
+	 */
+	assert(pkts);
+	assert(pkts_n);
+	for (;;) {
+		for (;;) {
+			/*
+			 * Decrement mbuf reference counter, detach
+			 * indirect and external buffers if needed.
+			 */
+			mbuf = rte_pktmbuf_prefree_seg(*pkts);
+			if (likely(mbuf != NULL)) {
+				assert(mbuf == *pkts);
+				if (likely(n_free != 0)) {
+					if (unlikely(pool != mbuf->pool))
+						/* From different pool. */
+						break;
+				} else {
+					/* Start new scan array. */
+					pool = mbuf->pool;
+					p_free = pkts;
+				}
+				++n_free;
+				++pkts;
+				--pkts_n;
+				if (unlikely(pkts_n == 0)) {
+					mbuf = NULL;
+					break;
+				}
+			} else {
+				/*
+				 * This happens if mbuf is still referenced.
+				 * We can't put it back to the pool, skip.
+				 */
+				++pkts;
+				--pkts_n;
+				if (unlikely(n_free != 0))
+					/* There is some array to free.*/
+					break;
+				if (unlikely(pkts_n == 0))
+					/* Last mbuf, nothing to free. */
+					return;
+			}
+		}
+		for (;;) {
+			/*
+			 * This loop is implemented to avoid multiple
+			 * inlining of rte_mempool_put_bulk().
+			 */
+			assert(pool);
+			assert(p_free);
+			assert(n_free);
+			/*
+			 * Free the array of pre-freed mbufs
+			 * belonging to the same memory pool.
+			 */
+			rte_mempool_put_bulk(pool, (void *)p_free, n_free);
+			if (unlikely(mbuf != NULL)) {
+				/* There is the request to start new scan. */
+				pool = mbuf->pool;
+				p_free = pkts++;
+				n_free = 1;
+				--pkts_n;
+				if (likely(pkts_n != 0))
+					break;
+				/*
+				 * This is the last mbuf to be freed.
+				 * Do one more loop iteration to complete.
+				 * This is rare case of the last unique mbuf.
+				 */
+				mbuf = NULL;
+				continue;
+			}
+			if (likely(pkts_n == 0))
+				return;
+			n_free = 0;
+			break;
+		}
+	}
+}
+
+/**
+ * Free the mbuf from the elts ring buffer till new tail.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tail
+ *   Index in elts to free up to, becomes new elts tail.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq,
+		  uint16_t tail,
+		  unsigned int olx __rte_unused)
+{
+	uint16_t n_elts = tail - txq->elts_tail;
+
+	assert(n_elts);
+	assert(n_elts <= txq->elts_s);
+	/*
+	 * Implement a loop to support ring buffer wraparound
+	 * with single inlining of mlx5_tx_free_mbuf().
+	 */
+	do {
+		unsigned int part;
+
+		part = txq->elts_s - (txq->elts_tail & txq->elts_m);
+		part = RTE_MIN(part, n_elts);
+		assert(part);
+		assert(part <= txq->elts_s);
+		mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m],
+				  part, olx);
+		txq->elts_tail += part;
+		n_elts -= part;
+	} while (n_elts);
+}
+
+/**
+ * Store the mbuf being sent into elts ring buffer.
+ * On Tx completion these mbufs will be freed.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param pkts_n
+ *   Number of packets to be stored.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq,
+		  struct rte_mbuf **restrict pkts,
+		  unsigned int pkts_n,
+		  unsigned int olx __rte_unused)
+{
+	unsigned int part;
+	struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;
+
+	assert(pkts);
+	assert(pkts_n);
+	part = txq->elts_s - (txq->elts_head & txq->elts_m);
+	assert(part);
+	assert(part <= txq->elts_s);
+	/* This code is a good candidate for vectorizing with SIMD. */
+	rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
+		   (void *)pkts,
+		   RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *));
+	txq->elts_head += pkts_n;
+	if (unlikely(part < pkts_n))
+		/* The copy is wrapping around the elts array. */
+		rte_memcpy((void *)elts, (void *)(pkts + part),
+			   (pkts_n - part) * sizeof(struct rte_mbuf *));
+}
+
+/**
+ * Manage TX completions. This routine checks the CQ for
+ * arrived CQEs, deduces the last accomplished WQE in SQ,
+ * updates SQ producing index and frees all completed mbufs.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * NOTE: not inlined intentionally, it makes tx_burst
+ * routine smaller, simpler and faster - from experiments.
+ */
+static void
+mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
+			  unsigned int olx __rte_unused)
+{
+	bool update = false;
+	int ret;
+
+	do {
+		volatile struct mlx5_wqe_cseg *cseg;
+		volatile struct mlx5_cqe *cqe;
+		uint16_t tail;
+
+		cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
+		ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
+		if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+			if (likely(ret != MLX5_CQE_STATUS_ERR)) {
+				/* No new CQEs in completion queue. */
+				assert(ret == MLX5_CQE_STATUS_HW_OWN);
+				if (likely(update)) {
+					/* Update the consumer index. */
+					rte_compiler_barrier();
+					*txq->cq_db =
+						rte_cpu_to_be_32(txq->cq_ci);
+				}
+				return;
+			}
+			/* Some error occurred, try to restart. */
+			rte_wmb();
+			tail = mlx5_tx_error_cqe_handle
+				(txq, (volatile struct mlx5_err_cqe *)cqe);
+		} else {
+			/* Normal transmit completion. */
+			++txq->cq_ci;
+			rte_cio_rmb();
+			txq->wqe_pi = rte_be_to_cpu_16(cqe->wqe_counter);
+			cseg = (volatile struct mlx5_wqe_cseg *)
+				(txq->wqes + (txq->wqe_pi & txq->wqe_m));
+			tail = cseg->misc;
+		}
+#ifndef NDEBUG
+		if (txq->cq_pi)
+			--txq->cq_pi;
+#endif
+		if (likely(tail != txq->elts_tail)) {
+			/* Free data buffers from elts. */
+			mlx5_tx_free_elts(txq, tail, olx);
+			assert(tail == txq->elts_tail);
+		}
+		update = true;
+	} while (true);
+}
+
+/**
+ * Check if the completion request flag should be set in the last WQE.
+ * Both pushed mbufs and WQEs are monitored and the completion request
+ * flag is set if any of thresholds is reached.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param n_mbuf
+ *   Number of mbuf not stored yet in elts array.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
+			   unsigned int n_mbuf,
+			   struct mlx5_txq_local *restrict loc,
+			   unsigned int olx __rte_unused)
+{
+	uint16_t head = txq->elts_head + n_mbuf;
+
+	if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH ||
+	    (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres) {
+		volatile struct mlx5_wqe *last = loc->wqe_last;
+
+		txq->elts_comp = head;
+		txq->wqe_comp = txq->wqe_ci;
+		/* Request unconditional completion on last WQE. */
+		last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+					    MLX5_COMP_MODE_OFFSET);
+		/* Save elts_head in unused "immediate" field of WQE. */
+		last->cseg.misc = head;
+		/*
+		 * A CQE slot must always be available. Count the
+		 * issued CQE "always" requests instead of the production
+		 * index, because there can be CQEs with errors and the
+		 * difference with ci may become inconsistent.
+		 */
+		assert(txq->cqe_s > ++txq->cq_pi);
+	}
+}
+
+/**
  * DPDK callback to check the status of a tx descriptor.
  *
  * @param tx_queue
@@ -1576,42 +1971,2490 @@ enum mlx5_txcmp_code {
 int
 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
 {
-	(void)tx_queue;
-	(void)offset;
-	return RTE_ETH_TX_DESC_FULL;
+	struct mlx5_txq_data *restrict txq = tx_queue;
+	uint16_t used;
+
+	mlx5_tx_handle_completion(txq, 0);
+	used = txq->elts_head - txq->elts_tail;
+	if (offset < used)
+		return RTE_ETH_TX_DESC_FULL;
+	return RTE_ETH_TX_DESC_DONE;
 }
 
 /**
- * DPDK Tx callback template. This is configured template
- * used to generate routines optimized for specified offload setup.
- * One of this generated functions is chosen at SQ configuration
- * time.
+ * Build the Control Segment with specified opcode:
+ * - MLX5_OPCODE_SEND
+ * - MLX5_OPCODE_ENHANCED_MPSW
+ * - MLX5_OPCODE_TSO
  *
  * @param txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Control Segment.
+ * @param ds
+ *   Supposed length of WQE in segments.
+ * @param opcode
+ *   SQ WQE opcode to put into Control Segment.
  * @param olx
- *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
- *   values. Should be static to take compile time static configuration
- *   advantages.
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc __rte_unused,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int ds,
+		  unsigned int opcode,
+		  unsigned int olx __rte_unused)
+{
+	struct mlx5_wqe_cseg *restrict cs = &wqe->cseg;
+
+	cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode);
+	cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+	cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
+			     MLX5_COMP_MODE_OFFSET);
+	cs->misc = RTE_BE32(0);
+}
+
+/**
+ * Build the Ethernet Segment without inlined data.
+ * Supports Software Parser, Checksums and VLAN
+ * insertion Tx offload features.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int olx)
+{
+	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+	uint32_t csum;
+
+	/*
+	 * Calculate and set check sum flags first, dword field
+	 * in segment may be shared with Software Parser flags.
+	 */
+	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+	es->flags = rte_cpu_to_le_32(csum);
+	/*
+	 * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+	 */
+	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+	/* Fill metadata field if needed. */
+	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		       loc->mbuf->tx_metadata : 0 : 0;
+	/* Engage VLAN tag insertion feature if requested. */
+	if (MLX5_TXOFF_CONFIG(VLAN) &&
+	    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+		es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
+						  loc->mbuf->vlan_tci);
+	else
+		es->inline_hdr = RTE_BE32(0);
+}
+
+/**
+ * Build the Ethernet Segment with minimal inlined data
+ * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is
+ * used to fill the gap in single WQEBB WQEs.
+ * Supports Software Parser, Checksums and VLAN
+ * insertion Tx offload features.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ *   Length of VLAN tag insertion if any.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int vlan,
+		  unsigned int olx)
+{
+	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+	uint32_t csum;
+	uint8_t *psrc, *pdst;
+
+	/*
+	 * Calculate and set check sum flags first, dword field
+	 * in segment may be shared with Software Parser flags.
+	 */
+	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+	es->flags = rte_cpu_to_le_32(csum);
+	/*
+	 * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+	 */
+	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+	/* Fill metadata field if needed. */
+	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		       loc->mbuf->tx_metadata : 0 : 0;
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(rte_v128u32_t)),
+		      "invalid Ethernet Segment data size");
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(struct rte_vlan_hdr) +
+				 2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Ethernet Segment data size");
+	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+	es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
+	es->inline_data = *(uint16_t *)psrc;
+	psrc +=	sizeof(uint16_t);
+	pdst = (uint8_t *)(es + 1);
+	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+		/* Implement VLAN tag insertion as part of inline data. */
+		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
+		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+		/* Insert VLAN ethertype + VLAN tag. */
+		*(uint32_t *)pdst = rte_cpu_to_be_32
+					((RTE_ETHER_TYPE_VLAN << 16) |
+					 loc->mbuf->vlan_tci);
+		pdst += sizeof(struct rte_vlan_hdr);
+		/* Copy the rest two bytes from packet data. */
+		*(uint16_t *)pdst = *(uint16_t *)psrc;
+	} else {
+		/* Fill the gap in the title WQEBB with inline data. */
+		rte_mov16(pdst, psrc);
+	}
+}
+
+/**
+ * Build the Ethernet Segment with entire packet
+ * data inlining. Checks the boundary of WQEBB and
+ * ring buffer wrapping, supports Software Parser,
+ * Checksums and VLAN insertion Tx offload features.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ *   Length of VLAN tag insertion if any.
+ * @param inlen
+ *   Length of data to inline (VLAN included, if any).
+ * @param tso
+ *   TSO flag, set mss field from the packet.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
  *
  * @return
- *   Number of packets successfully transmitted (<= pkts_n).
+ *   Pointer to the next Data Segment (aligned and wrapped around).
  */
-static __rte_always_inline uint16_t
-mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
-		   struct rte_mbuf **restrict pkts,
-		   uint16_t pkts_n,
-		   unsigned int olx)
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int vlan,
+		  unsigned int inlen,
+		  unsigned int tso,
+		  unsigned int olx)
 {
-	(void)txq;
-	(void)pkts;
-	(void)pkts_n;
-	(void)olx;
-	return 0;
+	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+	uint32_t csum;
+	uint8_t *psrc, *pdst;
+	unsigned int part;
+
+	/*
+	 * Calculate and set check sum flags first, dword field
+	 * in segment may be shared with Software Parser flags.
+	 */
+	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+	if (tso) {
+		csum <<= 24;
+		csum |= loc->mbuf->tso_segsz;
+		es->flags = rte_cpu_to_be_32(csum);
+	} else {
+		es->flags = rte_cpu_to_le_32(csum);
+	}
+	/*
+	 * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+	 */
+	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+	/* Fill metadata field if needed. */
+	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		       loc->mbuf->tx_metadata : 0 : 0;
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(rte_v128u32_t)),
+		      "invalid Ethernet Segment data size");
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(struct rte_vlan_hdr) +
+				 2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Ethernet Segment data size");
+	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+	es->inline_hdr_sz = RTE_BE16(inlen);
+	es->inline_data = *(uint16_t *)psrc;
+	psrc +=	sizeof(uint16_t);
+	pdst = (uint8_t *)(es + 1);
+	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+		/* Implement VLAN tag insertion as part of inline data. */
+		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
+		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+		/* Insert VLAN ethertype + VLAN tag. */
+		*(uint32_t *)pdst = rte_cpu_to_be_32
+					((RTE_ETHER_TYPE_VLAN << 16) |
+					 loc->mbuf->vlan_tci);
+		pdst += sizeof(struct rte_vlan_hdr);
+		/* Copy the rest two bytes from packet data. */
+		*(uint16_t *)pdst = *(uint16_t *)psrc;
+		psrc += sizeof(uint16_t);
+	} else {
+		/* Fill the gap in the title WQEBB with inline data. */
+		rte_mov16(pdst, psrc);
+		psrc += sizeof(rte_v128u32_t);
+	}
+	pdst = (uint8_t *)(es + 2);
+	assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
+	assert(pdst < (uint8_t *)txq->wqes_end);
+	inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
+	if (!inlen) {
+		assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
+		return (struct mlx5_wqe_dseg *)pdst;
+	}
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Here we should be aware of WQE ring buffer wraparound only.
+	 */
+	part = (uint8_t *)txq->wqes_end - pdst;
+	part = RTE_MIN(part, inlen);
+	do {
+		rte_memcpy(pdst, psrc, part);
+		inlen -= part;
+		if (likely(!inlen)) {
+			/*
+			 * If return value is not used by the caller
+			 * the code below will be optimized out.
+			 */
+			pdst += part;
+			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
+				pdst = (uint8_t *)txq->wqes;
+			return (struct mlx5_wqe_dseg *)pdst;
+		}
+		pdst = (uint8_t *)txq->wqes;
+		psrc += part;
+		part = inlen;
+	} while (true);
+}
+
+/**
+ * Copy data from chain of mbuf to the specified linear buffer.
+ * Checksums and VLAN insertion Tx offload features. If data
+ * from some mbuf copied completely this mbuf is freed. Local
+ * structure is used to keep the byte stream state.
+ *
+ * @param pdst
+ *   Pointer to the destination linear buffer.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param len
+ *   Length of data to be copied.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_mseg_memcpy(uint8_t *pdst,
+		    struct mlx5_txq_local *restrict loc,
+		    unsigned int len,
+		    unsigned int olx __rte_unused)
+{
+	struct rte_mbuf *mbuf;
+	unsigned int part, dlen;
+	uint8_t *psrc;
+
+	assert(len);
+	do {
+		/* Allow zero length packets, must check first. */
+		dlen = rte_pktmbuf_data_len(loc->mbuf);
+		if (dlen <= loc->mbuf_off) {
+			/* Exhausted packet, just free. */
+			mbuf = loc->mbuf;
+			loc->mbuf = mbuf->next;
+			rte_pktmbuf_free_seg(mbuf);
+			loc->mbuf_off = 0;
+			assert(loc->mbuf_nseg > 1);
+			assert(loc->mbuf);
+			--loc->mbuf_nseg;
+			continue;
+		}
+		dlen -= loc->mbuf_off;
+		psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+		psrc += loc->mbuf_off;
+		part = RTE_MIN(len, dlen);
+		rte_memcpy(pdst, psrc, part);
+		loc->mbuf_off += part;
+		len -= part;
+		if (!len) {
+			if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
+				loc->mbuf_off = 0;
+				/* Exhausted packet, just free. */
+				mbuf = loc->mbuf;
+				loc->mbuf = mbuf->next;
+				rte_pktmbuf_free_seg(mbuf);
+				loc->mbuf_off = 0;
+				assert(loc->mbuf_nseg >= 1);
+				--loc->mbuf_nseg;
+			}
+			return;
+		}
+		pdst += part;
+	} while (true);
+}
+
+/**
+ * Build the Ethernet Segment with inlined data from
+ * multi-segment packet. Checks the boundary of WQEBB
+ * and ring buffer wrapping, supports Software Parser,
+ * Checksums and VLAN insertion Tx offload features.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ *   Length of VLAN tag insertion if any.
+ * @param inlen
+ *   Length of data to inline (VLAN included, if any).
+ * @param tso
+ *   TSO flag, set mss field from the packet.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Pointer to the next Data Segment (aligned and
+ *   possibly NOT wrapped around - caller should do
+ *   wrapping check on its own).
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int vlan,
+		  unsigned int inlen,
+		  unsigned int tso,
+		  unsigned int olx)
+{
+	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+	uint32_t csum;
+	uint8_t *pdst;
+	unsigned int part;
+
+	/*
+	 * Calculate and set check sum flags first, uint32_t field
+	 * in segment may be shared with Software Parser flags.
+	 */
+	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+	if (tso) {
+		csum <<= 24;
+		csum |= loc->mbuf->tso_segsz;
+		es->flags = rte_cpu_to_be_32(csum);
+	} else {
+		es->flags = rte_cpu_to_le_32(csum);
+	}
+	/*
+	 * Calculate and set Software Parser offsets and flags.
+	 * These flags a set for custom UDP and IP tunnel packets.
+	 */
+	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+	/* Fill metadata field if needed. */
+	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		       loc->mbuf->tx_metadata : 0 : 0;
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(rte_v128u32_t)),
+		      "invalid Ethernet Segment data size");
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(struct rte_vlan_hdr) +
+				 2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Ethernet Segment data size");
+	assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
+	es->inline_hdr_sz = RTE_BE16(inlen);
+	pdst = (uint8_t *)&es->inline_data;
+	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+		/* Implement VLAN tag insertion as part inline data. */
+		mlx5_tx_mseg_memcpy(pdst, loc, 2 * RTE_ETHER_ADDR_LEN, olx);
+		*(uint32_t *)pdst = rte_cpu_to_be_32
+					((RTE_ETHER_TYPE_VLAN << 16) |
+					 loc->mbuf->vlan_tci);
+		pdst += sizeof(struct rte_vlan_hdr);
+		inlen -= 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
+	}
+	assert(pdst < (uint8_t *)txq->wqes_end);
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Here we should be aware of WQE ring buffer wraparound only.
+	 */
+	part = (uint8_t *)txq->wqes_end - pdst;
+	part = RTE_MIN(part, inlen);
+	assert(part);
+	do {
+		mlx5_tx_mseg_memcpy(pdst, loc, part, olx);
+		inlen -= part;
+		if (likely(!inlen)) {
+			pdst += part;
+			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			return (struct mlx5_wqe_dseg *)pdst;
+		}
+		pdst = (uint8_t *)txq->wqes;
+		part = inlen;
+	} while (true);
+}
+
+/**
+ * Fill one Data Segment of pointer type: the segment refers
+ * to the packet data in memory, no data is copied to WQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context, loc->mbuf is
+ *   used to look up the memory key.
+ * @param dseg
+ *   Pointer to the Data Segment to fill.
+ * @param buf
+ *   Data buffer to point.
+ * @param len
+ *   Data buffer length, must not be zero.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq,
+		 struct mlx5_txq_local *restrict loc,
+		 struct mlx5_wqe_dseg *restrict dseg,
+		 uint8_t *buf,
+		 unsigned int len,
+		 unsigned int olx __rte_unused)
+
+{
+	const uintptr_t addr = (uintptr_t)buf;
+
+	assert(len);
+	/* Byte count, memory key and buffer address, in this order. */
+	dseg->bcount = rte_cpu_to_be_32(len);
+	dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
+	dseg->pbuf = rte_cpu_to_be_64(addr);
+}
+
+/**
+ * Fill one Data Segment choosing the type by data length:
+ * data not exceeding MLX5_DSEG_MIN_INLINE_SIZE bytes are
+ * copied into the segment itself (inline type), longer data
+ * are referenced by pointer.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param dseg
+ *   Pointer to the Data Segment to fill.
+ * @param buf
+ *   Data buffer to point or to copy from.
+ * @param len
+ *   Data buffer length, must not be zero.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe_dseg *restrict dseg,
+		  uint8_t *buf,
+		  unsigned int len,
+		  unsigned int olx __rte_unused)
+
+{
+	assert(len);
+	if (len <= MLX5_DSEG_MIN_INLINE_SIZE) {
+		uintptr_t to = (uintptr_t)&dseg->inline_data[0];
+		uintptr_t from = (uintptr_t)buf;
+
+		dseg->bcount = rte_cpu_to_be_32
+					(len | MLX5_ETH_WQE_DATA_INLINE);
+		/*
+		 * Short copy driven by the bits of the length:
+		 * 8, 4, 2 and 1 byte chunks, largest first.
+		 */
+		if (len & 0x08) {
+			*(uint64_t *)to = *(uint64_t *)from;
+			to += sizeof(uint64_t);
+			from += sizeof(uint64_t);
+		}
+		if (len & 0x04) {
+			*(uint32_t *)to = *(uint32_t *)from;
+			to += sizeof(uint32_t);
+			from += sizeof(uint32_t);
+		}
+		if (len & 0x02) {
+			*(uint16_t *)to = *(uint16_t *)from;
+			to += sizeof(uint16_t);
+			from += sizeof(uint16_t);
+		}
+		if (len & 0x01)
+			*(uint8_t *)to = *(uint8_t *)from;
+		return;
+	}
+	/* Data are too long to inline, build the pointer segment. */
+	dseg->bcount = rte_cpu_to_be_32(len);
+	dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
+	dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
+}
+
+/**
+ * Fill the inline type Data Segment with the whole data
+ * of single segment packet, no VLAN insertion. The data
+ * are copied into WQE ring buffer, the copy is split in
+ * two parts if the end of ring buffer is reached.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param dseg
+ *   Pointer to the Data Segment to fill.
+ * @param buf
+ *   Data buffer to copy from.
+ * @param len
+ *   Data buffer length.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Pointer to the next Data Segment after inlined data,
+ *   aligned to MLX5_WSEG_SIZE. The pointer is NOT checked
+ *   for the ring buffer wraparound, the caller must do it
+ *   (it may be not needed for the last packet in the eMPW
+ *   session).
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc __rte_unused,
+		  struct mlx5_wqe_dseg *restrict dseg,
+		  uint8_t *buf,
+		  unsigned int len,
+		  unsigned int olx __rte_unused)
+{
+	uint8_t *dst = &dseg->inline_data[0];
+	unsigned int chunk;
+
+	dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
+	/*
+	 * The caller guarantees there is enough room in WQEBBs,
+	 * only the ring buffer wraparound is handled here: the
+	 * first chunk is limited by the end of ring buffer.
+	 */
+	chunk = (uint8_t *)txq->wqes_end - dst;
+	chunk = RTE_MIN(chunk, len);
+	for (;;) {
+		rte_memcpy(dst, buf, chunk);
+		len -= chunk;
+		if (likely(!len))
+			break;
+		/* Continue from the beginning of the ring buffer. */
+		dst = (uint8_t *)txq->wqes;
+		buf += chunk;
+		chunk = len;
+	}
+	dst += chunk;
+	dst = RTE_PTR_ALIGN(dst, MLX5_WSEG_SIZE);
+	/* Note: no final wraparound check here. */
+	return (struct mlx5_wqe_dseg *)dst;
+}
+
+/**
+ * Build the Data Segment of inlined data from single
+ * segment packet with VLAN insertion. The resulting inline
+ * data are: destination and source MAC addresses, inserted
+ * VLAN header, the rest of the packet.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context, loc->mbuf
+ *   provides the VLAN tag to insert.
+ * @param dseg
+ *   Pointer to the dseg fill with built Data Segment.
+ * @param buf
+ *   Data buffer to copy from.
+ * @param len
+ *   Data buffer length (without VLAN header being inserted),
+ *   must be greater than MLX5_ESEG_MIN_INLINE_SIZE.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Pointer to the next Data Segment after inlined data.
+ *   Ring buffer wraparound check is needed.
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe_dseg *restrict dseg,
+		  uint8_t *buf,
+		  unsigned int len,
+		  unsigned int olx __rte_unused)
+
+{
+	unsigned int part;
+	uint8_t *pdst;
+
+	assert(len > MLX5_ESEG_MIN_INLINE_SIZE);
+	static_assert(MLX5_DSEG_MIN_INLINE_SIZE ==
+				 (2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Data Segment data size");
+	/* The byte count covers the packet and the inserted header. */
+	dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) |
+					MLX5_ETH_WQE_DATA_INLINE);
+	pdst = &dseg->inline_data[0];
+	/* Copy MAC addresses and account the bytes consumed. */
+	memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
+	buf += MLX5_DSEG_MIN_INLINE_SIZE;
+	pdst += MLX5_DSEG_MIN_INLINE_SIZE;
+	len -= MLX5_DSEG_MIN_INLINE_SIZE;
+	/*
+	 * The MAC addresses may end exactly at the end of ring
+	 * buffer, wrap around before storing the VLAN header.
+	 */
+	if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
+		pdst = (uint8_t *)txq->wqes;
+	/* Insert VLAN ethertype + VLAN tag after the MAC addresses. */
+	*(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
+					      loc->mbuf->vlan_tci);
+	pdst += sizeof(struct rte_vlan_hdr);
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Here we should be aware of WQE ring buffer wraparound only.
+	 */
+	part = (uint8_t *)txq->wqes_end - pdst;
+	part = RTE_MIN(part, len);
+	do {
+		rte_memcpy(pdst, buf, part);
+		len -= part;
+		if (likely(!len)) {
+			pdst += part;
+			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			/* Note: no final wraparound check here. */
+			return (struct mlx5_wqe_dseg *)pdst;
+		}
+		pdst = (uint8_t *)txq->wqes;
+		buf += part;
+		part = len;
+	} while (true);
+}
+
+/**
+ * Build the Ethernet Segment with optionally inlined data with
+ * VLAN insertion and following Data Segments (if any) from
+ * multi-segment packet. Used by ordinary send and TSO.
+ *
+ * The routine walks the mbuf chain starting from loc->mbuf.
+ * The segments not consumed by inlining are stored into the
+ * txq->elts ring (txq->elts_head and loc->elts_free are
+ * updated), zero length segments are freed immediately.
+ * The Control Segment is not touched here.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet/Data Segments.
+ * @param vlan
+ *   Length of VLAN header to insert, 0 means no VLAN insertion.
+ * @param inlen
+ *   Data length to inline. This is minimal amount of data bytes
+ *   to be inlined. For TSO this parameter specifies exact value,
+ *   for ordinary send routine can extend beyond specified value
+ *   to provide better WQE space saving. This length includes
+ *   VLAN header being inserted.
+ * @param tso
+ *   Zero means ordinary send, inlined data can be extended,
+ *   otherwise this is TSO, inlined data length is fixed.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Actual size of built WQE in segments (MLX5_WSEG_SIZE units),
+ *   counted from the beginning of the WQE.
+ */
+static __rte_always_inline unsigned int
+mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   struct mlx5_wqe *restrict wqe,
+		   unsigned int vlan,
+		   unsigned int inlen,
+		   unsigned int tso,
+		   unsigned int olx __rte_unused)
+{
+	struct mlx5_wqe_dseg *restrict dseg;
+	unsigned int ds;
+
+	assert((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
+	/* Initialize the chain walking state used by inlining. */
+	loc->mbuf_nseg = NB_SEGS(loc->mbuf);
+	loc->mbuf_off = 0;
+
+	/*
+	 * Inline the data. On return loc->mbuf, loc->mbuf_off and
+	 * loc->mbuf_nseg describe the not inlined rest of packet.
+	 */
+	dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx);
+	/* The entire packet was inlined, no Data Segments needed. */
+	if (!loc->mbuf_nseg)
+		goto dseg_done;
+	/*
+	 * There are still some mbuf remaining, not inlined.
+	 * The first mbuf may be partially inlined and we
+	 * must process the possible non-zero data offset.
+	 */
+	if (loc->mbuf_off) {
+		unsigned int dlen;
+		uint8_t *dptr;
+
+		/*
+		 * Exhausted packets must be dropped before.
+		 * Non-zero offset means there are some data
+		 * remained in the packet.
+		 */
+		assert(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
+		assert(rte_pktmbuf_data_len(loc->mbuf));
+		dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + loc->mbuf_off;
+		dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off;
+		/*
+		 * Build the pointer/minimal data Data Segment.
+		 * Do ring buffer wrapping check in advance.
+		 */
+		if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+			dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+		mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx);
+		/* Store the mbuf to be freed on completion. */
+		assert(loc->elts_free);
+		txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+		--loc->elts_free;
+		++dseg;
+		if (--loc->mbuf_nseg == 0)
+			goto dseg_done;
+		loc->mbuf = loc->mbuf->next;
+		loc->mbuf_off = 0;
+	}
+	/* Process the remaining segments, none of them is inlined. */
+	do {
+		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
+			struct rte_mbuf *mbuf;
+
+			/* Zero length segment found, just skip. */
+			mbuf = loc->mbuf;
+			loc->mbuf = loc->mbuf->next;
+			rte_pktmbuf_free_seg(mbuf);
+			if (--loc->mbuf_nseg == 0)
+				break;
+		} else {
+			/* Wrap around before writing the next segment. */
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+			mlx5_tx_dseg_iptr
+				(txq, loc, dseg,
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 rte_pktmbuf_data_len(loc->mbuf), olx);
+			/* Store the mbuf to be freed on completion. */
+			assert(loc->elts_free);
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+			--loc->elts_free;
+			++dseg;
+			if (--loc->mbuf_nseg == 0)
+				break;
+			loc->mbuf = loc->mbuf->next;
+		}
+	} while (true);
+
+dseg_done:
+	/*
+	 * Calculate actual segments used from the dseg pointer.
+	 * If dseg is not above the WQE start the WQE wrapped
+	 * around the ring buffer end, add the ring buffer size
+	 * (txq->wqe_s entries of MLX5_WQE_SIZE bytes) to get
+	 * the correct distance.
+	 */
+	if ((uintptr_t)wqe < (uintptr_t)dseg)
+		ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE;
+	else
+		ds = (((uintptr_t)dseg - (uintptr_t)wqe) +
+		      txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE;
+	return ds;
+}
+
+/**
+ * Tx one packet function for multi-segment TSO. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
+ * sends one packet per WQE.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_MULTI - the WQE is built, packet is sent.
+ *   MLX5_TXCMP_CODE_EXIT - not enough free WQEBBs, sending
+ *     is impossible.
+ *   MLX5_TXCMP_CODE_ERROR - wrong TSO header lengths or the
+ *     packet does not fit into the WQE of maximal size.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq,
+			struct mlx5_txq_local *restrict loc,
+			unsigned int olx)
+{
+	struct mlx5_wqe *restrict wqe;
+	unsigned int ds, dlen, inlen, ntcp, vlan = 0;
+
+	/*
+	 * Calculate data length to be inlined to estimate
+	 * the required space in WQE ring buffer.
+	 */
+	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
+	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+		vlan = sizeof(struct rte_vlan_hdr);
+	/* For TSO all the packet headers must be inlined. */
+	inlen = loc->mbuf->l2_len + vlan +
+		loc->mbuf->l3_len + loc->mbuf->l4_len;
+	if (unlikely((!inlen || !loc->mbuf->tso_segsz)))
+		return MLX5_TXCMP_CODE_ERROR;
+	if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK)
+		inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len;
+	/* Packet must contain all TSO headers. */
+	if (unlikely(inlen > MLX5_MAX_TSO_HEADER ||
+		     inlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
+		     inlen > (dlen + vlan)))
+		return MLX5_TXCMP_CODE_ERROR;
+	/*
+	 * Check whether there are enough free WQEBBs:
+	 * - Control Segment
+	 * - Ethernet Segment
+	 * - First Segment of inlined Ethernet data
+	 * - ... data continued ...
+	 * - Data Segments of pointer/min inline type
+	 */
+	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
+				       MLX5_ESEG_MIN_INLINE_SIZE +
+				       MLX5_WSEG_SIZE +
+				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
+		return MLX5_TXCMP_CODE_EXIT;
+	/*
+	 * Check for maximal WQE size. Both sides of comparison
+	 * are in segments (MLX5_WSEG_SIZE units), not in WQEBBs.
+	 */
+	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds))
+		return MLX5_TXCMP_CODE_ERROR;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes/packets counters. */
+	ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) /
+		loc->mbuf->tso_segsz;
+	/*
+	 * One will be added for mbuf itself
+	 * at the end of the mlx5_tx_burst from
+	 * loc->pkts_sent field.
+	 */
+	--ntcp;
+	txq->stats.opackets += ntcp;
+	txq->stats.obytes += dlen + vlan + ntcp * inlen;
+#endif
+	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+	loc->wqe_last = wqe;
+	/*
+	 * The WQE size is not known yet (some segments may be
+	 * freed while inlining), it is written to the Control
+	 * Segment after the WQE is completely built.
+	 */
+	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx);
+	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx);
+	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+	txq->wqe_ci += (ds + 3) / 4;
+	loc->wqe_free -= (ds + 3) / 4;
+	return MLX5_TXCMP_CODE_MULTI;
+}
+
+/**
+ * Tx one packet function for multi-segment SEND. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
+ * sends one packet per WQE, without any data inlining in
+ * Ethernet Segment.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head. Zero length
+ * segments are freed immediately and not stored.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_MULTI - the WQE is built, packet is sent.
+ *   MLX5_TXCMP_CODE_EXIT - not enough free WQEBBs, sending
+ *     is impossible.
+ *   MLX5_TXCMP_CODE_ERROR - packet is too short or does not
+ *     fit into the WQE of maximal size.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	struct mlx5_wqe_dseg *restrict dseg;
+	struct mlx5_wqe *restrict wqe;
+	unsigned int ds, nseg;
+
+	assert(NB_SEGS(loc->mbuf) > 1);
+	/*
+	 * No inline at all, it means the CPU cycles saving
+	 * is prioritized at configuration, we should not
+	 * copy any packet data to WQE.
+	 */
+	nseg = NB_SEGS(loc->mbuf);
+	/* Control Segment, Ethernet Segment and pointers. */
+	ds = 2 + nseg;
+	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
+		return MLX5_TXCMP_CODE_EXIT;
+	/*
+	 * Check for maximal WQE size. Both sides of comparison
+	 * are in segments (MLX5_WSEG_SIZE units), not in WQEBBs.
+	 */
+	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds))
+		return MLX5_TXCMP_CODE_ERROR;
+	/*
+	 * Some Tx offloads may cause an error if
+	 * packet is not long enough, check against
+	 * assumed minimal length.
+	 */
+	if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE)
+		return MLX5_TXCMP_CODE_ERROR;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes counter. */
+	txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf);
+	if (MLX5_TXOFF_CONFIG(VLAN) &&
+	    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+		txq->stats.obytes += sizeof(struct rte_vlan_hdr);
+#endif
+	/*
+	 * SEND WQE:
+	 * - Control Segment, SEND opcode
+	 * - Ethernet Segment, optional VLAN, no inline
+	 * - Data Segments, pointer only type
+	 */
+	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+	loc->wqe_last = wqe;
+	mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx);
+	mlx5_tx_eseg_none(txq, loc, wqe, olx);
+	dseg = &wqe->dseg[0];
+	do {
+		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
+			struct rte_mbuf *mbuf;
+
+			/*
+			 * Zero length segment found, have to
+			 * correct total size of WQE in segments.
+			 * It is supposed to be rare occasion, so
+			 * in normal case (no zero length segments)
+			 * we avoid extra writing to the Control
+			 * Segment.
+			 */
+			--ds;
+			wqe->cseg.sq_ds -= RTE_BE32(1);
+			mbuf = loc->mbuf;
+			loc->mbuf = mbuf->next;
+			rte_pktmbuf_free_seg(mbuf);
+			if (--nseg == 0)
+				break;
+		} else {
+			mlx5_tx_dseg_ptr
+				(txq, loc, dseg,
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 rte_pktmbuf_data_len(loc->mbuf), olx);
+			/* Store the mbuf to be freed on completion. */
+			assert(loc->elts_free);
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+			--loc->elts_free;
+			if (--nseg == 0)
+				break;
+			++dseg;
+			/* Handle the ring buffer wraparound. */
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+			loc->mbuf = loc->mbuf->next;
+		}
+	} while (true);
+	txq->wqe_ci += (ds + 3) / 4;
+	loc->wqe_free -= (ds + 3) / 4;
+	return MLX5_TXCMP_CODE_MULTI;
+}
+
+/**
+ * Tx one packet function for multi-segment SEND. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
+ * sends one packet per WQE, with data inlining in
+ * Ethernet Segment and minimal Data Segments.
+ *
+ * If the packet is longer than txq->inlen_send and no minimal
+ * inlining is required (txq->inlen_mode is zero) the packet is
+ * sent by mlx5_tx_packet_multi_send() without inlining.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_MULTI - the WQE is built, packet is sent.
+ *   MLX5_TXCMP_CODE_EXIT - not enough free WQEBBs, sending
+ *     is impossible.
+ *   MLX5_TXCMP_CODE_ERROR - packet is too short or does not
+ *     fit into the WQE of maximal size.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
+			    struct mlx5_txq_local *restrict loc,
+			    unsigned int olx)
+{
+	struct mlx5_wqe *restrict wqe;
+	unsigned int ds, inlen, dlen, vlan = 0;
+	unsigned int seg_max, pad;
+
+	assert(MLX5_TXOFF_CONFIG(INLINE));
+	assert(NB_SEGS(loc->mbuf) > 1);
+	/*
+	 * First calculate data length to be inlined
+	 * to estimate the required space for WQE.
+	 */
+	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
+	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+		vlan = sizeof(struct rte_vlan_hdr);
+	inlen = dlen + vlan;
+	/* Check against minimal length. */
+	if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
+		return MLX5_TXCMP_CODE_ERROR;
+	assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+	if (inlen > txq->inlen_send) {
+		/*
+		 * Packet length exceeds the allowed inline
+		 * data length, check whether the minimal
+		 * inlining is required. eMPW check is here
+		 * to statically exclude check - no minimal
+		 * inlining required if eMPW is enabled.
+		 */
+		if (MLX5_TXOFF_CONFIG(EMPW) || !txq->inlen_mode) {
+			/*
+			 * VLAN insertion will be done inside by HW.
+			 * It is not utmost effective - VLAN flag is
+			 * checked twice, but we should proceed the
+			 * inlining length correctly and take into
+			 * account the VLAN header being inserted.
+			 */
+			assert(!txq->inlen_mode);
+			return mlx5_tx_packet_multi_send(txq, loc, olx);
+		}
+		assert(txq->inlen_mode >= MLX5_ESEG_MIN_INLINE_SIZE);
+		assert(txq->inlen_mode <= txq->inlen_send);
+		inlen = txq->inlen_mode;
+	}
+	/*
+	 * Check whether there are enough free WQEBBs:
+	 * - Control Segment
+	 * - Ethernet Segment
+	 * - First Segment of inlined Ethernet data
+	 * - ... data continued ...
+	 * - Data Segments of pointer/min inline type
+	 *
+	 * Estimate the number of Data Segments conservatively,
+	 * supposing no any mbufs is being freed during inlining.
+	 */
+	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
+				       MLX5_ESEG_MIN_INLINE_SIZE +
+				       MLX5_WSEG_SIZE +
+				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
+		return MLX5_TXCMP_CODE_EXIT;
+	/*
+	 * Check for maximal WQE size. Both sides of comparison
+	 * are in segments (MLX5_WSEG_SIZE units), not in WQEBBs.
+	 */
+	seg_max = MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE;
+	if (unlikely(seg_max < ds))
+		return MLX5_TXCMP_CODE_ERROR;
+	/*
+	 * We may have extra space in WQE to put inline
+	 * data to fill the WQEBBs completely. The number of
+	 * spare segments is zero if the estimated WQE already
+	 * ends at WQEBB boundary, the inline data must not
+	 * be extended into the next, not checked WQEBB. The
+	 * extension must not exceed the maximal WQE size too.
+	 */
+	pad = (4 - ds % 4) % 4;
+	pad = RTE_MIN(pad, seg_max - ds);
+	inlen += pad * MLX5_WSEG_SIZE;
+	inlen = RTE_MIN(inlen, dlen + vlan);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes/packets counters. */
+	txq->stats.obytes += dlen + vlan;
+#endif
+	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+	loc->wqe_last = wqe;
+	/*
+	 * The actual WQE size is known after the WQE is built
+	 * only, it is written to the Control Segment later.
+	 */
+	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
+	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
+	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+	txq->wqe_ci += (ds + 3) / 4;
+	loc->wqe_free -= (ds + 3) / 4;
+	return MLX5_TXCMP_CODE_MULTI;
+}
+
+/**
+ * Tx burst function for multi-segment packets. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
+ * sends one packet per WQE. Function stops sending if it
+ * encounters the single-segment packet.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context. On entry
+ *   loc->mbuf must be the multi-segment packet to send.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
+ *   MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
+		   struct rte_mbuf **restrict pkts,
+		   unsigned int pkts_n,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int olx)
+{
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	/* The current packet is in loc->mbuf, point to the next one. */
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		enum mlx5_txcmp_code ret;
+
+		assert(NB_SEGS(loc->mbuf) > 1);
+		/*
+		 * Estimate the number of free elts quickly but
+		 * conservatively. Some segment may be fully inlined
+		 * and freed, ignore this here - precise estimation
+		 * is costly.
+		 */
+		if (loc->elts_free < NB_SEGS(loc->mbuf))
+			return MLX5_TXCMP_CODE_EXIT;
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) {
+			/* Proceed with multi-segment TSO. */
+			ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
+		} else if (MLX5_TXOFF_CONFIG(INLINE)) {
+			/* Proceed with multi-segment SEND with inlining. */
+			ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
+		} else {
+			/* Proceed with multi-segment SEND w/o inlining. */
+			ret = mlx5_tx_packet_multi_send(txq, loc, olx);
+		}
+		if (ret == MLX5_TXCMP_CODE_EXIT)
+			return MLX5_TXCMP_CODE_EXIT;
+		if (ret == MLX5_TXCMP_CODE_ERROR)
+			return MLX5_TXCMP_CODE_ERROR;
+		/* WQE is built, go to the next packet. */
+		++loc->pkts_sent;
+		--pkts_n;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		loc->mbuf = *pkts++;
+		if (pkts_n > 1)
+			rte_prefetch0(*pkts);
+		if (likely(NB_SEGS(loc->mbuf) > 1))
+			continue;
+		/*
+		 * Here ends the series of multi-segment packets.
+		 * Single-segment packet with TSO requested must be
+		 * handled by TSO routine, all others by ordinary
+		 * single-segment send.
+		 */
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
+			return MLX5_TXCMP_CODE_TSO;
+		return MLX5_TXCMP_CODE_SINGLE;
+	}
+	assert(false);
+}
+
+/**
+ * Tx burst function for single-segment packets with TSO.
+ * Supports all types of Tx offloads, except multi-segment
+ * packets. Uses MLX5_OPCODE_TSO to build WQEs, sends one
+ * packet per WQE. Function stops sending if it encounters
+ * the multi-segment packet or packet without TSO requested.
+ *
+ * The routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head if inline
+ * offloads is requested due to possible early freeing
+ * of the inlined mbufs (can not store pkts array in elts
+ * as a batch).
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context. On entry
+ *   loc->mbuf must be the single-segment packet to send.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
+ *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
+		  struct rte_mbuf **restrict pkts,
+		  unsigned int pkts_n,
+		  struct mlx5_txq_local *restrict loc,
+		  unsigned int olx)
+{
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	/* The current packet is in loc->mbuf, point to the next one. */
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		struct mlx5_wqe_dseg *restrict dseg;
+		struct mlx5_wqe *restrict wqe;
+		unsigned int ds, dlen, hlen, ntcp, vlan = 0;
+		uint8_t *dptr;
+
+		assert(NB_SEGS(loc->mbuf) == 1);
+		dlen = rte_pktmbuf_data_len(loc->mbuf);
+		if (MLX5_TXOFF_CONFIG(VLAN) &&
+		    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+			vlan = sizeof(struct rte_vlan_hdr);
+		}
+		/*
+		 * First calculate the WQE size to check
+		 * whether we have enough space in ring buffer.
+		 * hlen is the total length of headers to inline,
+		 * including the VLAN header being inserted.
+		 */
+		hlen = loc->mbuf->l2_len + vlan +
+		       loc->mbuf->l3_len + loc->mbuf->l4_len;
+		if (unlikely((!hlen || !loc->mbuf->tso_segsz)))
+			return MLX5_TXCMP_CODE_ERROR;
+		if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK)
+			hlen += loc->mbuf->outer_l2_len +
+				loc->mbuf->outer_l3_len;
+		/* Segment must contain all TSO headers. */
+		if (unlikely(hlen > MLX5_MAX_TSO_HEADER ||
+			     hlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
+			     hlen > (dlen + vlan)))
+			return MLX5_TXCMP_CODE_ERROR;
+		/*
+		 * Check whether there are enough free WQEBBs:
+		 * - Control Segment
+		 * - Ethernet Segment
+		 * - First Segment of inlined Ethernet data
+		 * - ... data continued ...
+		 * - Finishing Data Segment of pointer type
+		 */
+		ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE +
+			  MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+		if (loc->wqe_free < ((ds + 3) / 4))
+			return MLX5_TXCMP_CODE_EXIT;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Update sent data bytes/packets counters. */
+		ntcp = (dlen + vlan - hlen +
+			loc->mbuf->tso_segsz - 1) /
+			loc->mbuf->tso_segsz;
+		/*
+		 * One will be added for mbuf itself at the end
+		 * of the mlx5_tx_burst from loc->pkts_sent field.
+		 */
+		--ntcp;
+		txq->stats.opackets += ntcp;
+		txq->stats.obytes += dlen + vlan + ntcp * hlen;
+#endif
+		/*
+		 * Build the TSO WQE:
+		 * - Control Segment
+		 * - Ethernet Segment with hlen bytes inlined
+		 * - Data Segment of pointer type
+		 */
+		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		loc->wqe_last = wqe;
+		mlx5_tx_cseg_init(txq, loc, wqe, ds,
+				  MLX5_OPCODE_TSO, olx);
+		dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx);
+		/*
+		 * The rest of packet follows the inlined headers,
+		 * the inserted VLAN header is not present in mbuf.
+		 * NOTE(review): if the packet consists of headers
+		 * only (hlen == dlen + vlan) the remaining length
+		 * is zero and mlx5_tx_dseg_ptr() asserts on it -
+		 * confirm whether such packets can get here.
+		 */
+		dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan;
+		dlen -= hlen - vlan;
+		mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
+		/*
+		 * WQE is built, update the loop parameters
+		 * and go to the next packet.
+		 */
+		txq->wqe_ci += (ds + 3) / 4;
+		loc->wqe_free -= (ds + 3) / 4;
+		/* Store the mbuf here only if inlining is configured. */
+		if (MLX5_TXOFF_CONFIG(INLINE))
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+		--loc->elts_free;
+		++loc->pkts_sent;
+		--pkts_n;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		loc->mbuf = *pkts++;
+		if (pkts_n > 1)
+			rte_prefetch0(*pkts);
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    unlikely(NB_SEGS(loc->mbuf) > 1))
+			return MLX5_TXCMP_CODE_MULTI;
+		if (unlikely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)))
+			return MLX5_TXCMP_CODE_SINGLE;
+		/* Continue with the next TSO packet. */
+	}
+	assert(false);
+}
+
+/**
+ * Classify the packet in loc->mbuf and tell the caller
+ * which sending method should be used for it.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ * @param newp
+ *   The predefined flag whether do complete check for
+ *   multi-segment packets and TSO. If false these two
+ *   checks are skipped.
+ *
+ * @return
+ *  MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *  MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
+ *  MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
+ *  MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
+		     struct mlx5_txq_local *restrict loc,
+		     unsigned int olx,
+		     bool newp)
+{
+	if (newp) {
+		/* Multi-segment packets have the priority over TSO. */
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    unlikely(NB_SEGS(loc->mbuf) > 1))
+			return MLX5_TXCMP_CODE_MULTI;
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
+			return MLX5_TXCMP_CODE_TSO;
+	}
+	/* Without eMPW configured ordinary SEND is the only option. */
+	if (!MLX5_TXOFF_CONFIG(EMPW))
+		return MLX5_TXCMP_CODE_SINGLE;
+	if (MLX5_TXOFF_CONFIG(VLAN) &&
+	    unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)) {
+		/*
+		 * eMPW does not support VLAN insertion offload,
+		 * the entire packet with VLAN header has to be
+		 * inlined. Fall back to SEND if inlining is not
+		 * configured or the packet is too long for it.
+		 */
+		if (!MLX5_TXOFF_CONFIG(INLINE))
+			return MLX5_TXCMP_CODE_SINGLE;
+		if (unlikely((rte_pktmbuf_data_len(loc->mbuf) +
+			      sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))
+			return MLX5_TXCMP_CODE_SINGLE;
+	}
+	return MLX5_TXCMP_CODE_EMPW;
+}
+
+/**
+ * Compare the offload attributes of the packet in loc->mbuf
+ * with the ones stored in the Ethernet Segment of the current
+ * eMPW batch: checksum flags, Software Parser offsets/flags
+ * and metadata (each one only if configured in olx).
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param es
+ *   Pointer to Ethernet Segment of eMPW batch.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *  true - packet match with eMPW batch attributes.
+ *  false - no match, eMPW should be restarted.
+ */
+static __rte_always_inline bool
+mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
+		   struct mlx5_wqe_eseg *restrict es,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int olx)
+{
+	/* Checksum flags must be the same for the whole batch. */
+	if (MLX5_TXOFF_CONFIG(CSUM)) {
+		if (txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
+			return false;
+	}
+	/* Software Parser offsets and flags must be the same. */
+	if (MLX5_TXOFF_CONFIG(SWP)) {
+		uint8_t pkt_swp_flags = 0;
+
+		if (es->swp_offs != txq_mbuf_to_swp(loc, &pkt_swp_flags, olx))
+			return false;
+		if (es->swp_flags != pkt_swp_flags)
+			return false;
+	}
+	/* Metadata (zero if not requested) must be the same. */
+	if (MLX5_TXOFF_CONFIG(METADATA)) {
+		if (es->metadata !=
+		    (loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		     loc->mbuf->tx_metadata : 0))
+			return false;
+	}
+	/* There must be no VLAN packets in eMPW loop. */
+	if (MLX5_TXOFF_CONFIG(VLAN))
+		assert(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
+	return true;
+}
+
+/**
+ * Update send loop variables and WQE for eMPW loop
+ * without data inlining. Number of Data Segments is
+ * equal to the number of sent packets.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param ds
+ *   Number of sent packets (equal to number of Data Segments).
+ * @param slen
+ *   Accumulated statistics, bytes sent
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *  None. The local context counters, the Control Segment of
+ *  the last WQE and the queue WQE index are updated in place.
+ */
+static __rte_always_inline void
+mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int ds,
+		   unsigned int slen,
+		   unsigned int olx __rte_unused)
+{
+	assert(!MLX5_TXOFF_CONFIG(INLINE));
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes counter. */
+	 txq->stats.obytes += slen;
+#else
+	(void)slen;
+#endif
+	loc->elts_free -= ds;
+	loc->pkts_sent += ds;
+	ds += 2;
+	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+	txq->wqe_ci += (ds + 3) / 4;
+	loc->wqe_free -= (ds + 3) / 4;
+}
+
+/**
+ * Update send loop variables and WQE for eMPW loop
+ * with data inlining. Gets the size of pushed descriptors
+ * and data to the WQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param len
+ *   Total size of descriptor/data in bytes.
+ * @param slen
+ *   Accumulated statistics, data bytes sent.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *  None. The Control Segment of the last WQE, the queue WQE
+ *  index and the free WQE counter are updated in place.
+ */
+static __rte_always_inline void
+mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int len,
+		   unsigned int slen,
+		   unsigned int olx __rte_unused)
+{
+	assert(MLX5_TXOFF_CONFIG(INLINE));
+	assert((len % MLX5_WSEG_SIZE) == 0);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes counter. */
+	 txq->stats.obytes += slen;
+#else
+	(void)slen;
+#endif
+	len = len / MLX5_WSEG_SIZE + 2;
+	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
+	txq->wqe_ci += (len + 3) / 4;
+	loc->wqe_free -= (len + 3) / 4;
+}
+
+/**
+ * Tx burst functions for single-segment packets without TSO
+ * and with Multi-Packet Writing feature support. Supports
+ * all types of Tx offloads, except multi-segment packets and TSO.
+ * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
+ * as many packet per WQE as it can. If eMPW is not configured
+ * or packet can not be sent with eMPW (VLAN insertion) the
+ * ordinary SEND opcode is used and only one packet placed
+ * in WQE.
+ *
+ * Function stops sending if it encounters the multi-segment
+ * packet or packet with TSO requested.
+ *
+ * The routines are responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head if inlining
+ * offload is requested. Otherwise the copying mbufs to elts
+ * can be postponed and completed at the end of burst routine.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
+ * Local context variables updated.
+ */
+
+/**
+ * The routine sends packets with MLX5_OPCODE_EMPW
+ * without inlining, this is a dedicated optimized branch.
+ * No VLAN insertion is supported.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq,
+			  struct rte_mbuf **restrict pkts,
+			  unsigned int pkts_n,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	/*
+	 * Subroutine is the part of mlx5_tx_burst_single()
+	 * and sends single-segment packet with eMPW opcode
+	 * without data inlining.
+	 */
+	assert(!MLX5_TXOFF_CONFIG(INLINE));
+	assert(MLX5_TXOFF_CONFIG(EMPW));
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		struct mlx5_wqe_dseg *restrict dseg;
+		struct mlx5_wqe_eseg *restrict eseg;
+		enum mlx5_txcmp_code ret;
+		unsigned int part, loop;
+		unsigned int slen = 0;
+
+next_empw:
+		part = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS);
+		if (unlikely(loc->elts_free < part)) {
+			/* We do not have enough elts to save all mbufs. */
+			if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
+				return MLX5_TXCMP_CODE_EXIT;
+			/* But we can still send at least minimal eMPW. */
+			part = loc->elts_free;
+		}
+		/* Check whether we have enough WQEs */
+		if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) {
+			if (unlikely(loc->wqe_free <
+				((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
+				return MLX5_TXCMP_CODE_EXIT;
+			part = (loc->wqe_free * 4) - 2;
+		}
+		if (likely(part > 1))
+			rte_prefetch0(*pkts);
+		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		/*
+		 * Build eMPW title WQEBB:
+		 * - Control Segment, eMPW opcode
+		 * - Ethernet Segment, no inline
+		 */
+		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2,
+				  MLX5_OPCODE_ENHANCED_MPSW, olx);
+		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
+				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
+		eseg = &loc->wqe_last->eseg;
+		dseg = &loc->wqe_last->dseg[0];
+		loop = part;
+		for (;;) {
+			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
+			/*
+			 * Some Tx offloads may cause an error if
+			 * packet is not long enough, check against
+			 * assumed minimal length.
+			 */
+			if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
+				part -= loop;
+				if (unlikely(!part))
+					return MLX5_TXCMP_CODE_ERROR;
+				/*
+				 * We have some successfully built
+				 * packet Data Segments to send.
+				 */
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			/* Update sent data bytes counter. */
+			slen += dlen;
+#endif
+			mlx5_tx_dseg_ptr
+				(txq, loc, dseg,
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 dlen, olx);
+			if (unlikely(--loop == 0))
+				break;
+			loc->mbuf = *pkts++;
+			if (likely(loop > 1))
+				rte_prefetch0(*pkts);
+			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+			/*
+			 * Unroll the completion code to avoid
+			 * returning a variable value - it results in
+			 * unoptimized sequential checking in the caller.
+			 */
+			if (ret == MLX5_TXCMP_CODE_MULTI) {
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_MULTI;
+			}
+			if (ret == MLX5_TXCMP_CODE_TSO) {
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_TSO;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE) {
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_SINGLE;
+			}
+			if (ret != MLX5_TXCMP_CODE_EMPW) {
+				assert(false);
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+			/*
+			 * Check whether packet parameters coincide
+			 * within assumed eMPW batch:
+			 * - check sum settings
+			 * - metadata value
+			 * - software parser settings
+			 */
+			if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) {
+				assert(loop);
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				goto next_empw;
+			}
+			/* Packet attributes match, continue the same eMPW. */
+			++dseg;
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+		}
+		/* eMPW is built successfully, update loop parameters. */
+		assert(!loop);
+		assert(pkts_n >= part);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Update sent data bytes counter. */
+		txq->stats.obytes += slen;
+#endif
+		loc->elts_free -= part;
+		loc->pkts_sent += part;
+		txq->wqe_ci += (2 + part + 3) / 4;
+		loc->wqe_free -= (2 + part + 3) / 4;
+		pkts_n -= part;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		loc->mbuf = *pkts++;
+		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+		if (unlikely(ret != MLX5_TXCMP_CODE_EMPW))
+			return ret;
+		/* Continue sending eMPW batches. */
+	}
+	assert(false);
+}
+
+/**
+ * The routine sends packets with MLX5_OPCODE_EMPW
+ * with inlining, optionally supports VLAN insertion.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
+			  struct rte_mbuf **restrict pkts,
+			  unsigned int pkts_n,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	/*
+	 * Subroutine is the part of mlx5_tx_burst_single()
+	 * and sends single-segment packet with eMPW opcode
+	 * with data inlining.
+	 */
+	assert(MLX5_TXOFF_CONFIG(INLINE));
+	assert(MLX5_TXOFF_CONFIG(EMPW));
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		struct mlx5_wqe_dseg *restrict dseg;
+		struct mlx5_wqe_eseg *restrict eseg;
+		enum mlx5_txcmp_code ret;
+		unsigned int room, part;
+		unsigned int slen = 0;
+
+next_empw:
+		/* Check whether we have the minimal amount of WQEs. */
+		if (unlikely(loc->wqe_free <
+			    ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
+			return MLX5_TXCMP_CODE_EXIT;
+		if (likely(pkts_n > 1))
+			rte_prefetch0(*pkts);
+		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		/*
+		 * Build eMPW title WQEBB:
+		 * - Control Segment, eMPW opcode, zero DS
+		 * - Ethernet Segment, no inline
+		 */
+		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0,
+				  MLX5_OPCODE_ENHANCED_MPSW, olx);
+		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
+				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
+		eseg = &loc->wqe_last->eseg;
+		dseg = &loc->wqe_last->dseg[0];
+		room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
+			       loc->wqe_free) * MLX5_WQE_SIZE -
+					MLX5_WQE_CSEG_SIZE -
+					MLX5_WQE_ESEG_SIZE;
+		/* Build WQE till we have space, packets and resources. */
+		part = room;
+		for (;;) {
+			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
+			uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+			unsigned int tlen;
+
+			assert(room >= MLX5_WQE_DSEG_SIZE);
+			assert((room % MLX5_WQE_DSEG_SIZE) == 0);
+			assert((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
+			/*
+			 * Some Tx offloads may cause an error if
+			 * packet is not long enough, check against
+			 * assumed minimal length.
+			 */
+			if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
+				part -= room;
+				if (unlikely(!part))
+					return MLX5_TXCMP_CODE_ERROR;
+				/*
+				 * We have some successfully built
+				 * packet Data Segments to send.
+				 */
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+			/* Inline or not inline - that's the Question. */
+			if (dlen > txq->inlen_empw)
+				goto pointer_empw;
+			/* Inline entire packet, optional VLAN insertion. */
+			tlen = sizeof(dseg->bcount) + dlen;
+			if (MLX5_TXOFF_CONFIG(VLAN) &&
+			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+				/*
+				 * The packet length must be checked in
+				 * mlx5_tx_able_to_empw() and packet
+				 * fits into inline length guaranteed.
+				 */
+				assert((dlen + sizeof(struct rte_vlan_hdr)) <=
+					txq->inlen_empw);
+				tlen += sizeof(struct rte_vlan_hdr);
+				if (room < tlen)
+					break;
+				dseg = mlx5_tx_dseg_vlan(txq, loc, dseg,
+							 dptr, dlen, olx);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+				/* Update sent data bytes counter. */
+				slen +=	sizeof(struct rte_vlan_hdr);
+#endif
+			} else {
+				if (room < tlen)
+					break;
+				dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
+							 dptr, dlen, olx);
+			}
+			tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
+			assert(room >= tlen);
+			room -= tlen;
+			/*
+			 * Packet data are completely inlined,
+			 * free the packet immediately.
+			 */
+			rte_pktmbuf_free_seg(loc->mbuf);
+			goto next_mbuf;
+pointer_empw:
+			/*
+			 * Not inlinable VLAN packets are
+			 * processed outside of this routine.
+			 */
+			assert(room >= MLX5_WQE_DSEG_SIZE);
+			if (MLX5_TXOFF_CONFIG(VLAN))
+				assert(!(loc->mbuf->ol_flags &
+					 PKT_TX_VLAN_PKT));
+			mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
+			/* We have to store mbuf in elts.*/
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+			room -= MLX5_WQE_DSEG_SIZE;
+			/* Ring buffer wraparound is checked at the loop end.*/
+			++dseg;
+next_mbuf:
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			/* Update sent data bytes counter. */
+			slen += dlen;
+#endif
+			loc->pkts_sent++;
+			loc->elts_free--;
+			pkts_n--;
+			if (unlikely(!pkts_n || !loc->elts_free)) {
+				/*
+				 * We have no resources/packets to
+				 * continue building descriptors.
+				 */
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				return MLX5_TXCMP_CODE_EXIT;
+			}
+			/* Check if we have minimal room left. */
+			if (room < MLX5_WQE_DSEG_SIZE) {
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				goto next_empw;
+			}
+			loc->mbuf = *pkts++;
+			if (likely(pkts_n > 1))
+				rte_prefetch0(*pkts);
+			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+			/*
+			 * Unroll the completion code to avoid
+			 * returning a variable value - it results in
+			 * unoptimized sequential checking in the caller.
+			 */
+			if (ret == MLX5_TXCMP_CODE_MULTI) {
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_MULTI;
+			}
+			if (ret == MLX5_TXCMP_CODE_TSO) {
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_TSO;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE) {
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_SINGLE;
+			}
+			if (ret != MLX5_TXCMP_CODE_EMPW) {
+				assert(false);
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+			/*
+			 * Check whether packet parameters coincide
+			 * within assumed eMPW batch:
+			 * - check sum settings
+			 * - metadata value
+			 * - software parser settings
+			 */
+			if (!mlx5_tx_match_empw(txq, eseg, loc, olx))
+				break;
+			/* Packet attributes match, continue the same eMPW. */
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+		}
+		/*
+		 * We get here to close an existing eMPW
+		 * session and start the new one.
+		 */
+		assert(pkts_n);
+		part -= room;
+		if (unlikely(!part))
+			return MLX5_TXCMP_CODE_EXIT;
+		mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+		if (unlikely(!loc->elts_free ||
+			     !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		goto next_empw;
+	}
+	assert(false);
+}
+
+/**
+ * The routine sends single-segment packets with ordinary
+ * MLX5_OPCODE_SEND, one packet per WQE. Data inlining and
+ * VLAN insertion are supported.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context, loc->mbuf is
+ *   the first packet to send.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - all packets are sent or there are
+ *     no free elts/WQEs left.
+ *   MLX5_TXCMP_CODE_ERROR - the packet is not longer than
+ *     MLX5_ESEG_MIN_INLINE_SIZE bytes.
+ *   Otherwise the code returned by mlx5_tx_able_to_empw() for
+ *   the next packet if it differs from MLX5_TXCMP_CODE_SINGLE.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
+			  struct rte_mbuf **restrict pkts,
+			  unsigned int pkts_n,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	/*
+	 * Subroutine is the part of mlx5_tx_burst_single()
+	 * and sends single-segment packet with SEND opcode.
+	 */
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		struct mlx5_wqe *restrict wqe;
+		enum mlx5_txcmp_code ret;
+
+		assert(NB_SEGS(loc->mbuf) == 1);
+		if (MLX5_TXOFF_CONFIG(INLINE)) {
+			unsigned int inlen, vlan = 0;
+
+			inlen = rte_pktmbuf_data_len(loc->mbuf);
+			if (MLX5_TXOFF_CONFIG(VLAN) &&
+			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+				vlan = sizeof(struct rte_vlan_hdr);
+				inlen += vlan;
+				static_assert((sizeof(struct rte_vlan_hdr) +
+					       sizeof(struct rte_ether_hdr)) ==
+					       MLX5_ESEG_MIN_INLINE_SIZE,
+					       "invalid min inline data size");
+			}
+			/*
+			 * If inlining is enabled at configuration time
+			 * the limit must be not less than minimal size.
+			 * Otherwise we would do extra check for data
+			 * size to avoid crashes due to length overflow.
+			 */
+			assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+			if (inlen <= txq->inlen_send) {
+				unsigned int seg_n, wqe_n;
+
+				rte_prefetch0(rte_pktmbuf_mtod
+						(loc->mbuf, uint8_t *));
+				/* Check against minimal length. */
+				if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
+					return MLX5_TXCMP_CODE_ERROR;
+				/*
+				 * Completely inlined packet data WQE:
+				 * - Control Segment, SEND opcode
+				 * - Ethernet Segment, no VLAN insertion
+				 * - Data inlined, VLAN optionally inserted
+				 * - Alignment to MLX5_WSEG_SIZE
+				 * Have to estimate amount of WQEBBs
+				 */
+				seg_n = (inlen + 3 * MLX5_WSEG_SIZE -
+					 MLX5_ESEG_MIN_INLINE_SIZE +
+					 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+				/* Check if there are enough WQEBBs. */
+				wqe_n = (seg_n + 3) / 4;
+				if (wqe_n > loc->wqe_free)
+					return MLX5_TXCMP_CODE_EXIT;
+				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+				loc->wqe_last = wqe;
+				mlx5_tx_cseg_init(txq, loc, wqe, seg_n,
+						  MLX5_OPCODE_SEND, olx);
+				mlx5_tx_eseg_data(txq, loc, wqe,
+						  vlan, inlen, 0, olx);
+				txq->wqe_ci += wqe_n;
+				loc->wqe_free -= wqe_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+				/*
+				 * Update sent data bytes counter. It must
+				 * be done before the mbuf is released, the
+				 * mbuf must not be accessed after free.
+				 */
+				txq->stats.obytes += vlan +
+					rte_pktmbuf_data_len(loc->mbuf);
+#endif
+				/*
+				 * Packet data are completely inlined,
+				 * free the packet immediately.
+				 */
+				rte_pktmbuf_free_seg(loc->mbuf);
+			} else if (!MLX5_TXOFF_CONFIG(EMPW) &&
+				   txq->inlen_mode) {
+				/*
+				 * If minimal inlining is requested the eMPW
+				 * feature should be disabled due to data is
+				 * inlined into Ethernet Segment, which can
+				 * not contain inlined data for eMPW due to
+				 * segment shared for all packets.
+				 */
+				struct mlx5_wqe_dseg *restrict dseg;
+				unsigned int ds;
+				uint8_t *dptr;
+
+				/*
+				 * The inline-mode settings require
+				 * to inline the specified amount of
+				 * data bytes to the Ethernet Segment.
+				 * We should check the free space in
+				 * WQE ring buffer to inline partially.
+				 */
+				assert(txq->inlen_send >= txq->inlen_mode);
+				assert(inlen > txq->inlen_mode);
+				assert(txq->inlen_mode >=
+						MLX5_ESEG_MIN_INLINE_SIZE);
+				/*
+				 * Check whether there are enough free WQEBBs:
+				 * - Control Segment
+				 * - Ethernet Segment
+				 * - First Segment of inlined Ethernet data
+				 * - ... data continued ...
+				 * - Finishing Data Segment of pointer type
+				 */
+				ds = (MLX5_WQE_CSEG_SIZE +
+				      MLX5_WQE_ESEG_SIZE +
+				      MLX5_WQE_DSEG_SIZE +
+				      txq->inlen_mode -
+				      MLX5_ESEG_MIN_INLINE_SIZE +
+				      MLX5_WQE_DSEG_SIZE +
+				      MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+				if (loc->wqe_free < ((ds + 3) / 4))
+					return MLX5_TXCMP_CODE_EXIT;
+				/*
+				 * Build the ordinary SEND WQE:
+				 * - Control Segment
+				 * - Ethernet Segment, inline inlen_mode bytes
+				 * - Data Segment of pointer type
+				 */
+				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+				loc->wqe_last = wqe;
+				mlx5_tx_cseg_init(txq, loc, wqe, ds,
+						  MLX5_OPCODE_SEND, olx);
+				dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan,
+							 txq->inlen_mode,
+							 0, olx);
+				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
+				       txq->inlen_mode - vlan;
+				inlen -= txq->inlen_mode;
+				mlx5_tx_dseg_ptr(txq, loc, dseg,
+						 dptr, inlen, olx);
+				/*
+				 * WQE is built, update the loop parameters
+				 * and go to the next packet.
+				 */
+				txq->wqe_ci += (ds + 3) / 4;
+				loc->wqe_free -= (ds + 3) / 4;
+				/* We have to store mbuf in elts.*/
+				assert(MLX5_TXOFF_CONFIG(INLINE));
+				txq->elts[txq->elts_head++ & txq->elts_m] =
+						loc->mbuf;
+				--loc->elts_free;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+				/* Update sent data bytes counter. */
+				txq->stats.obytes += vlan +
+					rte_pktmbuf_data_len(loc->mbuf);
+#endif
+			} else {
+				uint8_t *dptr;
+				unsigned int dlen;
+
+				/*
+				 * Partially inlined packet data WQE, we have
+				 * some space in title WQEBB, we can fill it
+				 * with some packet data. It takes one WQEBB,
+				 * it is available, no extra space check:
+				 * - Control Segment, SEND opcode
+				 * - Ethernet Segment, no VLAN insertion
+				 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data
+				 * - Data Segment, pointer type
+				 */
+				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+				loc->wqe_last = wqe;
+				mlx5_tx_cseg_init(txq, loc, wqe, 4,
+						  MLX5_OPCODE_SEND, olx);
+				mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx);
+				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
+				       MLX5_ESEG_MIN_INLINE_SIZE - vlan;
+				/*
+				 * The length check is performed above, by
+				 * comparing with txq->inlen_send. We should
+				 * not get overflow here.
+				 */
+				assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
+				dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
+				mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
+						 dptr, dlen, olx);
+				++txq->wqe_ci;
+				--loc->wqe_free;
+				/* We have to store mbuf in elts.*/
+				assert(MLX5_TXOFF_CONFIG(INLINE));
+				txq->elts[txq->elts_head++ & txq->elts_m] =
+						loc->mbuf;
+				--loc->elts_free;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+				/* Update sent data bytes counter. */
+				txq->stats.obytes += vlan +
+					rte_pktmbuf_data_len(loc->mbuf);
+#endif
+			}
+		} else {
+			/*
+			 * No inline at all, it means the CPU cycles saving
+			 * is prioritized at configuration, we should not
+			 * copy any packet data to WQE.
+			 *
+			 * SEND WQE, one WQEBB:
+			 * - Control Segment, SEND opcode
+			 * - Ethernet Segment, optional VLAN, no inline
+			 * - Data Segment, pointer type
+			 */
+			wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+			loc->wqe_last = wqe;
+			mlx5_tx_cseg_init(txq, loc, wqe, 3,
+					  MLX5_OPCODE_SEND, olx);
+			mlx5_tx_eseg_none(txq, loc, wqe, olx);
+			/*
+			 * Some Tx offloads may cause an error if
+			 * packet is not long enough, check against
+			 * assumed minimal length.
+			 */
+			if (rte_pktmbuf_data_len(loc->mbuf) <=
+						 MLX5_ESEG_MIN_INLINE_SIZE)
+				return MLX5_TXCMP_CODE_ERROR;
+			mlx5_tx_dseg_ptr
+				(txq, loc, &wqe->dseg[0],
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 rte_pktmbuf_data_len(loc->mbuf), olx);
+			++txq->wqe_ci;
+			--loc->wqe_free;
+			/*
+			 * We should not store mbuf pointer in elts
+			 * if no inlining is configured, this is done
+			 * by calling routine in a batch copy.
+			 */
+			assert(!MLX5_TXOFF_CONFIG(INLINE));
+			--loc->elts_free;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			/* Update sent data bytes counter. */
+			txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf);
+			if (MLX5_TXOFF_CONFIG(VLAN) &&
+			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+				txq->stats.obytes +=
+					sizeof(struct rte_vlan_hdr);
+#endif
+		}
+		++loc->pkts_sent;
+		--pkts_n;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		loc->mbuf = *pkts++;
+		if (pkts_n > 1)
+			rte_prefetch0(*pkts);
+		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+		if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
+			return ret;
+	}
+	assert(false);
+}
+
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
+		     struct rte_mbuf **restrict pkts,
+		     unsigned int pkts_n,
+		     struct mlx5_txq_local *restrict loc,
+		     unsigned int olx)
+{
+	enum mlx5_txcmp_code ret;
+
+	ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
+	if (ret == MLX5_TXCMP_CODE_SINGLE)
+		goto ordinary_send;
+	assert(ret == MLX5_TXCMP_CODE_EMPW);
+	for (;;) {
+		/* Choose the inline or no-inline eMPW routine by olx. */
+		ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
+			mlx5_tx_burst_empw_inline
+				(txq, pkts, pkts_n, loc, olx) :
+			mlx5_tx_burst_empw_simple
+				(txq, pkts, pkts_n, loc, olx);
+		if (ret != MLX5_TXCMP_CODE_SINGLE)
+			return ret;
+		/* The resources to send one packet should remain. */
+		assert(loc->elts_free && loc->wqe_free);
+ordinary_send:
+		ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
+		if (ret != MLX5_TXCMP_CODE_EMPW)
+			return ret;
+		/* The resources to send one packet should remain. */
+		assert(loc->elts_free && loc->wqe_free);
+	}
+}
+
+/**
+ * DPDK Tx callback template. This is configured template
+ * used to generate routines optimized for specified offload setup.
+ * One of this generated functions is chosen at SQ configuration
+ * time.
+ *
+ * The routine dispatches the packets between three dedicated
+ * subroutines (multi-segment, single-segment TSO and ordinary
+ * single-segment ones) according to the codes they return.
+ *
+ * @param txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param olx
+ *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
+ *   values. Should be static to take compile time static configuration
+ *   advantages.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+static __rte_always_inline uint16_t
+mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
+		   struct rte_mbuf **restrict pkts,
+		   uint16_t pkts_n,
+		   unsigned int olx)
+{
+	struct mlx5_txq_local loc;
+	enum mlx5_txcmp_code ret;
+	unsigned int part;
+
+	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	/*
+	 * Check if there are some CQEs, if any:
+	 * - process an encountered errors
+	 * - process the completed WQEs
+	 * - free related mbufs
+	 * - doorbell the NIC about processed CQEs
+	 */
+	if (unlikely(!pkts_n))
+		return 0;
+	rte_prefetch0(*pkts);
+	mlx5_tx_handle_completion(txq, olx);
+	/*
+	 * Calculate the number of available resources - elts and WQEs.
+	 * There are two possible different scenarios:
+	 * - no data inlining into WQEs, one WQEBB may contains upto
+	 *   four packets, in this case elts become scarce resource
+	 * - data inlining into WQEs, one packet may require multiple
+	 *   WQEBBs, the WQEs become the limiting factor.
+	 */
+	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	loc.elts_free = txq->elts_s -
+				(uint16_t)(txq->elts_head - txq->elts_tail);
+	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	loc.wqe_free = txq->wqe_s -
+				(uint16_t)(txq->wqe_ci - txq->wqe_pi);
+	if (unlikely(!loc.elts_free || !loc.wqe_free))
+		return 0;
+	loc.pkts_sent = 0;
+	loc.pkts_copy = 0;
+	loc.wqe_last = NULL;
+	for (;;) {
+		/*
+		 * Fetch the packet from array. Usually this is
+		 * the first packet in series of multi/single
+		 * segment packets.
+		 */
+		loc.mbuf = *(pkts + loc.pkts_sent);
+		/* Dedicated branch for multi-segment packets. */
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    unlikely(NB_SEGS(loc.mbuf) > 1)) {
+			/*
+			 * Multi-segment packet encountered.
+			 * Hardware is able to process it only
+			 * with SEND/TSO opcodes, one packet
+			 * per WQE, do it in dedicated routine.
+			 */
+enter_send_multi:
+			assert(loc.pkts_sent >= loc.pkts_copy);
+			part = loc.pkts_sent - loc.pkts_copy;
+			if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
+				/*
+				 * There are some single-segment mbufs not
+				 * stored in elts. The mbufs must be in the
+				 * same order as WQEs, so we must copy the
+				 * mbufs to elts here, before the coming
+				 * multi-segment packet mbufs is appended.
+				 */
+				mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy,
+						  part, olx);
+				loc.pkts_copy = loc.pkts_sent;
+			}
+			assert(pkts_n > loc.pkts_sent);
+			ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
+			if (!MLX5_TXOFF_CONFIG(INLINE))
+				loc.pkts_copy = loc.pkts_sent;
+			/*
+			 * These returned code checks are supposed
+			 * to be optimized out due to routine inlining.
+			 */
+			if (ret == MLX5_TXCMP_CODE_EXIT) {
+				/*
+				 * The routine returns this code when
+				 * all packets are sent or there is no
+				 * enough resources to complete request.
+				 */
+				break;
+			}
+			if (ret == MLX5_TXCMP_CODE_ERROR) {
+				/*
+				 * The routine returns this code when
+				 * some error in the incoming packets
+				 * format occurred.
+				 */
+				txq->stats.oerrors++;
+				break;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE) {
+				/*
+				 * The single-segment packet was encountered
+				 * in the array, try to send it with the
+				 * best optimized way, possible engaging eMPW.
+				 */
+				goto enter_send_single;
+			}
+			if (MLX5_TXOFF_CONFIG(TSO) &&
+			    ret == MLX5_TXCMP_CODE_TSO) {
+				/*
+				 * The single-segment TSO packet was
+				 * encountered in the array.
+				 */
+				goto enter_send_tso;
+			}
+			/* We must not get here. Something is going wrong. */
+			assert(false);
+			txq->stats.oerrors++;
+			break;
+		}
+		/* Dedicated branch for single-segment TSO packets. */
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) {
+			/*
+			 * TSO might require special way for inlining
+			 * (dedicated parameters) and is sent with
+			 * MLX5_OPCODE_TSO opcode only, provide this
+			 * in dedicated branch.
+			 */
+enter_send_tso:
+			assert(NB_SEGS(loc.mbuf) == 1);
+			assert(pkts_n > loc.pkts_sent);
+			ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
+			/*
+			 * These returned code checks are supposed
+			 * to be optimized out due to routine inlining.
+			 */
+			if (ret == MLX5_TXCMP_CODE_EXIT)
+				break;
+			if (ret == MLX5_TXCMP_CODE_ERROR) {
+				txq->stats.oerrors++;
+				break;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE)
+				goto enter_send_single;
+			if (MLX5_TXOFF_CONFIG(MULTI) &&
+			    ret == MLX5_TXCMP_CODE_MULTI) {
+				/*
+				 * The multi-segment packet was
+				 * encountered in the array.
+				 */
+				goto enter_send_multi;
+			}
+			/* We must not get here. Something is going wrong. */
+			assert(false);
+			txq->stats.oerrors++;
+			break;
+		}
+		/*
+		 * The dedicated branch for the single-segment packets
+		 * without TSO. Often these ones can be sent using
+		 * MLX5_OPCODE_EMPW with multiple packets in one WQE.
+		 * The routine builds the WQEs till it encounters
+		 * the TSO or multi-segment packet (in case if these
+		 * offloads are requested at SQ configuration time).
+		 */
+enter_send_single:
+		assert(pkts_n > loc.pkts_sent);
+		ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
+		/*
+		 * These returned code checks are supposed
+		 * to be optimized out due to routine inlining.
+		 */
+		if (ret == MLX5_TXCMP_CODE_EXIT)
+			break;
+		if (ret == MLX5_TXCMP_CODE_ERROR) {
+			txq->stats.oerrors++;
+			break;
+		}
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    ret == MLX5_TXCMP_CODE_MULTI) {
+			/*
+			 * The multi-segment packet was
+			 * encountered in the array.
+			 */
+			goto enter_send_multi;
+		}
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    ret == MLX5_TXCMP_CODE_TSO) {
+			/*
+			 * The single-segment TSO packet was encountered
+			 * after the ordinary ones, the single-segment
+			 * routine stops on it and reports this code.
+			 */
+			goto enter_send_tso;
+		}
+		/* We must not get here. Something is going wrong. */
+		assert(false);
+		txq->stats.oerrors++;
+		break;
+	}
+	/*
+	 * Main Tx loop is completed, do the rest:
+	 * - set completion request if thresholds are reached
+	 * - doorbell the hardware
+	 * - copy the rest of mbufs to elts (if any)
+	 */
+	assert(MLX5_TXOFF_CONFIG(INLINE) || loc.pkts_sent >= loc.pkts_copy);
+	/* Take a shortcut if nothing is sent. */
+	if (unlikely(loc.pkts_sent == 0))
+		return 0;
+	/* Not all of the mbufs may be stored into elts yet. */
+	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
+	mlx5_tx_request_completion(txq, part, &loc, olx);
+	/*
+	 * Ring QP doorbell immediately after WQE building completion
+	 * to improve latencies. The pure software related data treatment
+	 * can be completed after doorbell. Tx CQEs for this SQ are
+	 * processed in this thread only by the polling.
+	 */
+	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0);
+	if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
+		/*
+		 * There are some single-segment mbufs not stored in elts.
+		 * It can be only if last packet was single-segment.
+		 * The copying is gathered into one place due to it is
+		 * a good opportunity to optimize that with SIMD.
+		 * Unfortunately if inlining is enabled the gaps in
+		 * pointer array may happen due to early freeing of the
+		 * inlined mbufs.
+		 */
+		mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
+	}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Increment sent packets counter. */
+	txq->stats.opackets += loc.pkts_sent;
+#endif
+	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	return loc.pkts_sent;
 }
 
 /* Generate routines with Enhanced Multi-Packet Write support. */
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index d8c6f35..4439e88 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -354,8 +354,9 @@ struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
 void mlx5_set_ptype_table(void);
 void mlx5_set_cksum_table(void);
 void mlx5_set_swp_types_table(void);
-__rte_noinline uint16_t mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
-					volatile struct mlx5_err_cqe *err_cqe);
+__rte_noinline uint16_t mlx5_tx_error_cqe_handle
+				(struct mlx5_txq_data *restrict txq,
+				 volatile struct mlx5_err_cqe *err_cqe);
 uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
 void mlx5_rxq_initialize(struct mlx5_rxq_data *rxq);
 __rte_noinline int mlx5_rx_err_handle(struct mlx5_rxq_data *rxq,
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v2 7/7] net/mlx5: add minimal required Tx data inline
  2019-07-15 13:59   ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                       ` (5 preceding siblings ...)
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 6/7] net/mlx5: implement Tx burst template Viacheslav Ovsiienko
@ 2019-07-15 13:59     ` Viacheslav Ovsiienko
  2019-07-17  6:53     ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  7 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-15 13:59 UTC (permalink / raw)
  To: dev; +Cc: yskoh

Tx packet data may be inlined into the transmit descriptor.
Under some circumstances ConnectX NICs may require data to be
inlined for correct operation. The exact amount of data may
depend on the NIC operation mode, requested Tx offloads,
E-Switch configuration, etc.

The number of data bytes to inline may be specified with
the devargs key "txq_inline_min". If this key is present, the
specified value will be used by the driver (the driver may
align it in order not to exceed the limits and to provide
better descriptor space utilization), and it is guaranteed
that the requested data bytes are inlined into the descriptor
besides the other inline settings.

If the "txq_inline_min" key is not present, the value may
be queried by the driver from the NIC via DevX, if this
feature is available. This patch provides the implementation
of this query.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c           |  93 +++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5.h           |   4 ++
 drivers/net/mlx5/mlx5_defs.h      |  18 ++++++
 drivers/net/mlx5/mlx5_devx_cmds.c | 100 ++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_prm.h       | 121 +++++++++++++++++++++++++++++++++++++-
 5 files changed, 334 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index e803f08..ce3a62b 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1045,6 +1045,97 @@ struct mlx5_dev_spawn_data {
 }
 
 /**
+ * Configures the minimal amount of data to inline into WQE
+ * while sending packets.
+ *
+ * - the txq_inline_min has the maximal priority, if this
+ *   key is specified in devargs
+ * - if DevX is enabled the inline mode is queried from the
+ *   device (HCA attributes and NIC vport context if needed).
+ * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4LX
+ *   and none (0 bytes) for other NICs
+ *
+ * @param spawn
+ *   Verbs device parameters (name, port, switch_info) to spawn.
+ * @param config
+ *   Device configuration parameters.
+ */
+static void
+mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
+		    struct mlx5_dev_config *config)
+{
+	if (config->txq_inline_min != MLX5_ARG_UNSET) {
+		/* Application defines size of inlined data explicitly. */
+		goto exit;
+	}
+	if (config->hca_attr.eth_net_offloads) {
+		/* We have DevX enabled, inline mode queried successfully. */
+		switch (config->hca_attr.wqe_inline_mode) {
+		case MLX5_CAP_INLINE_MODE_L2:
+			/* outer L2 header must be inlined. */
+			config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
+			goto exit;
+		case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+			/* No inline data are required by NIC. */
+			config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
+			goto exit;
+		case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+			/* inline mode is defined by NIC vport context. */
+			if (!config->hca_attr.eth_virt)
+				break;
+			switch (config->hca_attr.vport_inline_mode) {
+			case MLX5_INLINE_MODE_NONE:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_NONE;
+				goto exit;
+			case MLX5_INLINE_MODE_L2:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_L2;
+				goto exit;
+			case MLX5_INLINE_MODE_IP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_L3;
+				goto exit;
+			case MLX5_INLINE_MODE_TCP_UDP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_L4;
+				goto exit;
+			case MLX5_INLINE_MODE_INNER_L2:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_INNER_L2;
+				goto exit;
+			case MLX5_INLINE_MODE_INNER_IP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_INNER_L3;
+				goto exit;
+			case MLX5_INLINE_MODE_INNER_TCP_UDP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_INNER_L4;
+				goto exit;
+			}
+		}
+	}
+	/*
+	 * We get here if we are unable to deduce
+	 * inline data size with DevX. Try PCI ID
+	 * to determine old NICs.
+	 */
+	switch (spawn->pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+		config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
+		break;
+	default:
+		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
+		break;
+	}
+exit:
+	DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
+}
+
+/**
  * Spawn an Ethernet device from Verbs information.
  *
  * @param dpdk_dev
@@ -1529,6 +1620,8 @@ struct mlx5_dev_spawn_data {
 #else
 	config.dv_esw_en = 0;
 #endif
+	/* Detect minimal data bytes to inline. */
+	mlx5_set_min_inline(spawn, &config);
 	/* Store device configuration on private structure. */
 	priv->config = config;
 	if (config.dv_flow_en) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 8e2eab3..30be381 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -161,6 +161,10 @@ struct mlx5_devx_counter_set {
 /* HCA attributes. */
 struct mlx5_hca_attr {
 	uint32_t eswitch_manager:1;
+	uint32_t eth_net_offloads:1;
+	uint32_t eth_virt:1;
+	uint32_t wqe_inline_mode:2;
+	uint32_t vport_inline_mode:3;
 };
 
 /* Flow list . */
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 873a595..8c118d5 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -60,6 +60,24 @@
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
 #define MLX5_MAX_TSO_HEADER (128u + 34u)
 
+/* Inline data size required by NICs. */
+#define MLX5_INLINE_HSIZE_NONE 0
+#define MLX5_INLINE_HSIZE_L2 (sizeof(struct rte_ether_hdr) + \
+			      sizeof(struct rte_vlan_hdr))
+#define MLX5_INLINE_HSIZE_L3 (MLX5_INLINE_HSIZE_L2 + \
+			      sizeof(struct rte_ipv6_hdr))
+#define MLX5_INLINE_HSIZE_L4 (MLX5_INLINE_HSIZE_L3 + \
+			      sizeof(struct rte_tcp_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L2 (MLX5_INLINE_HSIZE_L3 + \
+				    sizeof(struct rte_udp_hdr) + \
+				    sizeof(struct rte_vxlan_hdr) + \
+				    sizeof(struct rte_ether_hdr) + \
+				    sizeof(struct rte_vlan_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L3 (MLX5_INLINE_HSIZE_INNER_L2 + \
+				    sizeof(struct rte_ipv6_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L4 (MLX5_INLINE_HSIZE_INNER_L3 + \
+				    sizeof(struct rte_tcp_hdr))
+
 /* Threshold of buffer replenishment for vectorized Rx. */
 #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
 	(RTE_MIN(MLX5_VPMD_RX_MAX_BURST, (unsigned int)(n) >> 2))
diff --git a/drivers/net/mlx5/mlx5_devx_cmds.c b/drivers/net/mlx5/mlx5_devx_cmds.c
index e5776c4..de470a6 100644
--- a/drivers/net/mlx5/mlx5_devx_cmds.c
+++ b/drivers/net/mlx5/mlx5_devx_cmds.c
@@ -107,6 +107,59 @@ int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj)
 }
 
 /**
+ * Query NIC vport context.
+ * Currently fills the minimal inline mode attribute only.
+ *
+ * @param[in] ctx
+ *   ibv contexts returned from mlx5dv_open_device.
+ * @param[in] vport
+ *   vport index
+ * @param[out] attr
+ *   Attributes device values.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+static int
+mlx5_devx_cmd_query_nic_vport_context(struct ibv_context *ctx,
+				      unsigned int vport,
+				      struct mlx5_hca_attr *attr)
+{
+	uint32_t in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0};
+	uint32_t out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {0};
+	void *vctx;
+	int status, syndrome, rc;
+
+	/* Query NIC vport context to determine inline mode. */
+	MLX5_SET(query_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
+	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
+	if (vport)
+		MLX5_SET(query_nic_vport_context_in, in, other_vport, 1);
+	rc = mlx5_glue->devx_general_cmd(ctx,
+					 in, sizeof(in),
+					 out, sizeof(out));
+	if (rc)
+		goto error;
+	status = MLX5_GET(query_nic_vport_context_out, out, status);
+	syndrome = MLX5_GET(query_nic_vport_context_out, out, syndrome);
+	if (status) {
+		DRV_LOG(DEBUG, "Failed to query NIC vport context, "
+			"status %x, syndrome = %x",
+			status, syndrome);
+		return -1;
+	}
+	vctx = MLX5_ADDR_OF(query_nic_vport_context_out, out,
+			    nic_vport_context);
+	attr->vport_inline_mode = MLX5_GET(nic_vport_context, vctx,
+					   min_wqe_inline_mode);
+	return 0;
+error:
+	rc = (rc > 0) ? -rc : rc;
+	return rc;
+}
+
+/**
  * Query HCA attributes.
  * Using those attributes we can check on run time if the device
  * is having the required capabilities.
@@ -136,7 +189,7 @@ int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj)
 	rc = mlx5_glue->devx_general_cmd(ctx,
 					 in, sizeof(in), out, sizeof(out));
 	if (rc)
-		return rc;
+		goto error;
 	status = MLX5_GET(query_hca_cap_out, out, status);
 	syndrome = MLX5_GET(query_hca_cap_out, out, syndrome);
 	if (status) {
@@ -147,5 +200,50 @@ int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj)
 	}
 	hcattr = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
 	attr->eswitch_manager = MLX5_GET(cmd_hca_cap, hcattr, eswitch_manager);
+	attr->eth_net_offloads = MLX5_GET(cmd_hca_cap, hcattr,
+					  eth_net_offloads);
+	attr->eth_virt = MLX5_GET(cmd_hca_cap, hcattr, eth_virt);
+	if (!attr->eth_net_offloads)
+		return 0;
+
+	/* Query HCA offloads for Ethernet protocol. */
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+	MLX5_SET(query_hca_cap_in, in, op_mod,
+		 MLX5_GET_HCA_CAP_OP_MOD_ETHERNET |
+		 MLX5_HCA_CAP_OPMOD_GET_CUR);
+
+	rc = mlx5_glue->devx_general_cmd(ctx,
+					 in, sizeof(in),
+					 out, sizeof(out));
+	if (rc) {
+		attr->eth_net_offloads = 0;
+		goto error;
+	}
+	status = MLX5_GET(query_hca_cap_out, out, status);
+	syndrome = MLX5_GET(query_hca_cap_out, out, syndrome);
+	if (status) {
+		DRV_LOG(DEBUG, "Failed to query devx HCA capabilities, "
+			"status %x, syndrome = %x",
+			status, syndrome);
+		attr->eth_net_offloads = 0;
+		return -1;
+	}
+	hcattr = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
+	attr->wqe_inline_mode = MLX5_GET(eth_offload_cap, hcattr,
+					 wqe_inline_mode);
+	if (attr->wqe_inline_mode != MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
+		return 0;
+	if (attr->eth_virt) {
+		rc = mlx5_devx_cmd_query_nic_vport_context(ctx, 0, attr);
+		if (rc) {
+			attr->eth_virt = 0;
+			goto error;
+		}
+	}
 	return 0;
+error:
+	rc = (rc > 0) ? -rc : rc;
+	return rc;
 }
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index a251369..617dce4 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -616,6 +616,7 @@ enum {
 
 enum {
 	MLX5_CMD_OP_QUERY_HCA_CAP = 0x100,
+	MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754,
 	MLX5_CMD_OP_ALLOC_FLOW_COUNTER = 0x939,
 	MLX5_CMD_OP_QUERY_FLOW_COUNTER = 0x93b,
 };
@@ -680,7 +681,8 @@ struct mlx5_ifc_query_flow_counter_in_bits {
 
 enum {
 	MLX5_GET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0 << 1,
-	MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP        = 0xc << 1,
+	MLX5_GET_HCA_CAP_OP_MOD_ETHERNET = 0x1 << 1,
+	MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP = 0xc << 1,
 };
 
 enum {
@@ -688,6 +690,23 @@ enum {
 	MLX5_HCA_CAP_OPMOD_GET_CUR   = 1,
 };
 
+enum {
+	MLX5_CAP_INLINE_MODE_L2,
+	MLX5_CAP_INLINE_MODE_VPORT_CONTEXT,
+	MLX5_CAP_INLINE_MODE_NOT_REQUIRED,
+};
+
+enum {
+	MLX5_INLINE_MODE_NONE,
+	MLX5_INLINE_MODE_L2,
+	MLX5_INLINE_MODE_IP,
+	MLX5_INLINE_MODE_TCP_UDP,
+	MLX5_INLINE_MODE_RESERVED4,
+	MLX5_INLINE_MODE_INNER_L2,
+	MLX5_INLINE_MODE_INNER_IP,
+	MLX5_INLINE_MODE_INNER_TCP_UDP,
+};
+
 struct mlx5_ifc_cmd_hca_cap_bits {
 	u8 reserved_at_0[0x30];
 	u8 vhca_id[0x10];
@@ -958,6 +977,42 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8 reserved_at_61f[0x1e1];
 };
 
+struct mlx5_ifc_eth_offload_cap_bits {
+	u8 csum_cap[0x1];
+	u8 vlan_cap[0x1];
+	u8 lro_cap[0x1];
+	u8 lro_psh_flag[0x1];
+	u8 lro_time_stamp[0x1];
+	u8 reserved_at_5[0x2];
+	u8 wqe_vlan_insert[0x1];
+	u8 self_lb_en_modifiable[0x1];
+	u8 reserved_at_9[0x2];
+	u8 max_lso_cap[0x5];
+	u8 multi_pkt_send_wqe[0x2];
+	u8 wqe_inline_mode[0x2];
+	u8 rss_ind_tbl_cap[0x4];
+	u8 reg_umr_sq[0x1];
+	u8 scatter_fcs[0x1];
+	u8 enhanced_multi_pkt_send_wqe[0x1];
+	u8 tunnel_lso_const_out_ip_id[0x1];
+	u8 reserved_at_1c[0x2];
+	u8 tunnel_stateless_gre[0x1];
+	u8 tunnel_stateless_vxlan[0x1];
+	u8 swp[0x1];
+	u8 swp_csum[0x1];
+	u8 swp_lso[0x1];
+	u8 reserved_at_23[0xd];
+	u8 max_vxlan_udp_ports[0x8];
+	u8 reserved_at_38[0x6];
+	u8 max_geneve_opt_len[0x1];
+	u8 tunnel_stateless_geneve_rx[0x1];
+	u8 reserved_at_40[0x10];
+	u8 lro_min_mss_size[0x10];
+	u8 reserved_at_60[0x120];
+	u8 lro_timer_supported_periods[4][0x20];
+	u8 reserved_at_200[0x600];
+};
+
 struct mlx5_ifc_qos_cap_bits {
 	u8 packet_pacing[0x1];
 	u8 esw_scheduling[0x1];
@@ -985,6 +1040,7 @@ struct mlx5_ifc_qos_cap_bits {
 
 union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap;
+	struct mlx5_ifc_eth_offload_cap_bits eth_offload_cap;
 	struct mlx5_ifc_qos_cap_bits qos_cap;
 	u8 reserved_at_0[0x8000];
 };
@@ -1005,6 +1061,69 @@ struct mlx5_ifc_query_hca_cap_in_bits {
 	u8 reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_mac_address_layout_bits {
+	u8 reserved_at_0[0x10];
+	u8 mac_addr_47_32[0x10];
+	u8 mac_addr_31_0[0x20];
+};
+
+struct mlx5_ifc_nic_vport_context_bits {
+	u8 reserved_at_0[0x5];
+	u8 min_wqe_inline_mode[0x3];
+	u8 reserved_at_8[0x15];
+	u8 disable_mc_local_lb[0x1];
+	u8 disable_uc_local_lb[0x1];
+	u8 roce_en[0x1];
+	u8 arm_change_event[0x1];
+	u8 reserved_at_21[0x1a];
+	u8 event_on_mtu[0x1];
+	u8 event_on_promisc_change[0x1];
+	u8 event_on_vlan_change[0x1];
+	u8 event_on_mc_address_change[0x1];
+	u8 event_on_uc_address_change[0x1];
+	u8 reserved_at_40[0xc];
+	u8 affiliation_criteria[0x4];
+	u8 affiliated_vhca_id[0x10];
+	u8 reserved_at_60[0xd0];
+	u8 mtu[0x10];
+	u8 system_image_guid[0x40];
+	u8 port_guid[0x40];
+	u8 node_guid[0x40];
+	u8 reserved_at_200[0x140];
+	u8 qkey_violation_counter[0x10];
+	u8 reserved_at_350[0x430];
+	u8 promisc_uc[0x1];
+	u8 promisc_mc[0x1];
+	u8 promisc_all[0x1];
+	u8 reserved_at_783[0x2];
+	u8 allowed_list_type[0x3];
+	u8 reserved_at_788[0xc];
+	u8 allowed_list_size[0xc];
+	struct mlx5_ifc_mac_address_layout_bits permanent_address;
+	u8 reserved_at_7e0[0x20];
+};
+
+struct mlx5_ifc_query_nic_vport_context_out_bits {
+	u8 status[0x8];
+	u8 reserved_at_8[0x18];
+	u8 syndrome[0x20];
+	u8 reserved_at_40[0x40];
+	struct mlx5_ifc_nic_vport_context_bits nic_vport_context;
+};
+
+struct mlx5_ifc_query_nic_vport_context_in_bits {
+	u8 opcode[0x10];
+	u8 reserved_at_10[0x10];
+	u8 reserved_at_20[0x10];
+	u8 op_mod[0x10];
+	u8 other_vport[0x1];
+	u8 reserved_at_41[0xf];
+	u8 vport_number[0x10];
+	u8 reserved_at_60[0x5];
+	u8 allowed_list_type[0x3];
+	u8 reserved_at_68[0x18];
+};
+
 /* CQE format mask. */
 #define MLX5E_CQE_FORMAT_MASK 0xc
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath
  2019-07-15 13:59   ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                       ` (6 preceding siblings ...)
  2019-07-15 13:59     ` [dpdk-dev] [PATCH v2 7/7] net/mlx5: add minimal required Tx data inline Viacheslav Ovsiienko
@ 2019-07-17  6:53     ` Viacheslav Ovsiienko
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 1/8] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
                         ` (8 more replies)
  7 siblings, 9 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-17  6:53 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patchset introduces a new implementation of the tx_burst
routine of the mlx5 PMD. The existing implementation is based on
several branches of tx_burst routines optimized for the most
common sets of Tx hardware offloads supported by Mellanox NICs.

It was not very easy to update, support and develop this kind
of code - multiple branches mean multiple places to process.
Also, many frequently requested offload combinations are not
yet supported in the most efficient way. That leads to selecting
a tx_burst routine that does not completely match the requested
offloads, which harms performance.

The new Tx datapath:
  - introduces a unified template for the tx_burst routine to generate
    at compile time the most efficient instances, tuned for specified
    sets of hardware offloads
  - the best matching instance is chosen at run time, at sending queue
    configuration
  - almost all possible sets of Tx offloads are supported
  - the best possible method to send each packet is chosen at run time
  - introduces new devargs for more accurate tuning of data inlining

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

---
v3:
  - add reporting max number of segments in packet
  - minor fixes with inline data length setup
  - rebase on the current repo top

v2: 
  - http://patches.dpdk.org/cover/56451/
  - minor performance issues resolved
  - typos in comments and docs

v1: 
  - http://patches.dpdk.org/patch/56091/

Viacheslav Ovsiienko (8):
  net/mlx5: remove Tx datapath implementation
  net/mlx5: add Tx datapath related devargs
  net/mlx5: update Tx datapath definitions
  net/mlx5: add Tx datapath configuration and setup
  net/mlx5: introduce Tx burst routine template
  net/mlx5: implement Tx burst template
  net/mlx5: add minimal required Tx data inline
  net/mlx5: report supported max number of mbuf segments

 doc/guides/nics/mlx5.rst              |  168 +-
 drivers/net/mlx5/mlx5.c               |  155 +-
 drivers/net/mlx5/mlx5.h               |   13 +-
 drivers/net/mlx5/mlx5_defs.h          |   36 +-
 drivers/net/mlx5/mlx5_devx_cmds.c     |  100 +-
 drivers/net/mlx5/mlx5_ethdev.c        |   95 +-
 drivers/net/mlx5/mlx5_prm.h           |  356 ++-
 drivers/net/mlx5/mlx5_rxtx.c          | 5280 ++++++++++++++++++++++-----------
 drivers/net/mlx5/mlx5_rxtx.h          |  333 +--
 drivers/net/mlx5/mlx5_rxtx_vec.c      |  175 --
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |  289 --
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  284 --
 drivers/net/mlx5/mlx5_txq.c           |  242 +-
 13 files changed, 4451 insertions(+), 3075 deletions(-)

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 1/8] net/mlx5: remove Tx datapath implementation
  2019-07-17  6:53     ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
@ 2019-07-17  6:53       ` Viacheslav Ovsiienko
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 2/8] net/mlx5: add Tx datapath related devargs Viacheslav Ovsiienko
                         ` (7 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-17  6:53 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch removes the existing Tx datapath code
as a preparation step before introducing the new
implementation. The following entities are being
removed:

- obsolete devargs
- tx_burst() routines
- related PRM definitions
- SQ configuration code
- Tx routine selection code
- incompatible Tx completion code

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c               |   43 -
 drivers/net/mlx5/mlx5.h               |    5 -
 drivers/net/mlx5/mlx5_defs.h          |   16 -
 drivers/net/mlx5/mlx5_ethdev.c        |   58 --
 drivers/net/mlx5/mlx5_prm.h           |   77 --
 drivers/net/mlx5/mlx5_rxtx.c          | 1434 +--------------------------------
 drivers/net/mlx5/mlx5_rxtx.h          |  273 -------
 drivers/net/mlx5/mlx5_rxtx_vec.c      |  175 ----
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |  289 -------
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  284 -------
 drivers/net/mlx5/mlx5_txq.c           |  110 +--
 11 files changed, 35 insertions(+), 2729 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index f4ad5d2..b118ee5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -69,33 +69,15 @@
 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
 
-/* Device parameter to configure inline send. */
-#define MLX5_TXQ_INLINE "txq_inline"
-
 /*
  * Device parameter to configure the number of TX queues threshold for
  * enabling inline send.
  */
 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
 
-/*
- * Device parameter to configure the number of TX queues threshold for
- * enabling vectorized Tx.
- */
-#define MLX5_TXQS_MAX_VEC "txqs_max_vec"
-
 /* Device parameter to enable multi-packet send WQEs. */
 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
 
-/* Device parameter to include 2 dsegs in the title WQEBB. */
-#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
-
-/* Device parameter to limit the size of inlining packet. */
-#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
-
-/* Device parameter to enable hardware Tx vector. */
-#define MLX5_TX_VEC_EN "tx_vec_en"
-
 /* Device parameter to enable hardware Rx vector. */
 #define MLX5_RX_VEC_EN "rx_vec_en"
 
@@ -996,20 +978,10 @@ struct mlx5_dev_spawn_data {
 		config->mprq.max_memcpy_len = tmp;
 	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
 		config->mprq.min_rxqs_num = tmp;
-	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
-		config->txq_inline = tmp;
 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
 		config->txqs_inline = tmp;
-	} else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
-		config->txqs_vec = tmp;
 	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
 		config->mps = !!tmp;
-	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
-		config->mpw_hdr_dseg = !!tmp;
-	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
-		config->inline_max_packet_sz = tmp;
-	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
-		config->tx_vec_en = !!tmp;
 	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
 		config->rx_vec_en = !!tmp;
 	} else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
@@ -1054,13 +1026,8 @@ struct mlx5_dev_spawn_data {
 		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
 		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
 		MLX5_RXQS_MIN_MPRQ,
-		MLX5_TXQ_INLINE,
 		MLX5_TXQS_MIN_INLINE,
-		MLX5_TXQS_MAX_VEC,
 		MLX5_TXQ_MPW_EN,
-		MLX5_TXQ_MPW_HDR_DSEG_EN,
-		MLX5_TXQ_MAX_INLINE_LEN,
-		MLX5_TX_VEC_EN,
 		MLX5_RX_VEC_EN,
 		MLX5_L3_VXLAN_EN,
 		MLX5_VF_NL_EN,
@@ -2016,12 +1983,8 @@ struct mlx5_dev_spawn_data {
 	dev_config = (struct mlx5_dev_config){
 		.hw_padding = 0,
 		.mps = MLX5_ARG_UNSET,
-		.tx_vec_en = 1,
 		.rx_vec_en = 1,
-		.txq_inline = MLX5_ARG_UNSET,
 		.txqs_inline = MLX5_ARG_UNSET,
-		.txqs_vec = MLX5_ARG_UNSET,
-		.inline_max_packet_sz = MLX5_ARG_UNSET,
 		.vf_nl_en = 1,
 		.mr_ext_memseg_en = 1,
 		.mprq = {
@@ -2034,9 +1997,6 @@ struct mlx5_dev_spawn_data {
 	};
 	/* Device specific configuration. */
 	switch (pci_dev->id.device_id) {
-	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF:
-		dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS_BLUEFIELD;
-		break;
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
@@ -2046,9 +2006,6 @@ struct mlx5_dev_spawn_data {
 	default:
 		break;
 	}
-	/* Set architecture-dependent default value if unset. */
-	if (dev_config.txqs_vec == MLX5_ARG_UNSET)
-		dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS;
 	for (i = 0; i != ns; ++i) {
 		uint32_t restore;
 
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 2bd2aa6..211558e 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -198,9 +198,7 @@ struct mlx5_dev_config {
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
-	unsigned int tx_vec_en:1; /* Tx vector is enabled. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
-	unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
 	unsigned int mr_ext_memseg_en:1;
 	/* Whether memseg should be extended for MR creation. */
 	unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */
@@ -224,10 +222,7 @@ struct mlx5_dev_config {
 	unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. */
 	unsigned int ind_table_max_size; /* Maximum indirection table size. */
 	unsigned int max_dump_files_num; /* Maximum dump files per queue. */
-	int txq_inline; /* Maximum packet size for inlining. */
 	int txqs_inline; /* Queue number threshold for inlining. */
-	int txqs_vec; /* Queue number threshold for vectorized Tx. */
-	int inline_max_packet_sz; /* Max packet size for inlining. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 };
 
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 13801a5..6861304 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -60,15 +60,6 @@
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
 #define MLX5_MAX_TSO_HEADER 192
 
-/* Default maximum number of Tx queues for vectorized Tx. */
-#if defined(RTE_ARCH_ARM64)
-#define MLX5_VPMD_MAX_TXQS 8
-#define MLX5_VPMD_MAX_TXQS_BLUEFIELD 16
-#else
-#define MLX5_VPMD_MAX_TXQS 4
-#define MLX5_VPMD_MAX_TXQS_BLUEFIELD MLX5_VPMD_MAX_TXQS
-#endif
-
 /* Threshold of buffer replenishment for vectorized Rx. */
 #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
 	(RTE_MIN(MLX5_VPMD_RX_MAX_BURST, (unsigned int)(n) >> 2))
@@ -76,13 +67,6 @@
 /* Maximum size of burst for vectorized Rx. */
 #define MLX5_VPMD_RX_MAX_BURST 64U
 
-/*
- * Maximum size of burst for vectorized Tx. This is related to the maximum size
- * of Enhanced MPW (eMPW) WQE as vectorized Tx is supported with eMPW.
- * Careful when changing, large value can cause WQE DS to overlap.
- */
-#define MLX5_VPMD_TX_MAX_BURST        32U
-
 /* Number of packets vectorized Rx can simultaneously process in a loop. */
 #define MLX5_VPMD_DESCS_PER_LOOP      4
 
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 004901a..2902138 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1652,64 +1652,6 @@ int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
 }
 
 /**
- * Configure the TX function to use.
- *
- * @param dev
- *   Pointer to private data structure.
- *
- * @return
- *   Pointer to selected Tx burst function.
- */
-eth_tx_burst_t
-mlx5_select_tx_function(struct rte_eth_dev *dev)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
-	struct mlx5_dev_config *config = &priv->config;
-	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
-	int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
-				    DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
-				    DEV_TX_OFFLOAD_GRE_TNL_TSO |
-				    DEV_TX_OFFLOAD_IP_TNL_TSO |
-				    DEV_TX_OFFLOAD_UDP_TNL_TSO));
-	int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
-				    DEV_TX_OFFLOAD_UDP_TNL_TSO |
-				    DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM));
-	int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);
-
-	assert(priv != NULL);
-	/* Select appropriate TX function. */
-	if (vlan_insert || tso || swp)
-		return tx_pkt_burst;
-	if (config->mps == MLX5_MPW_ENHANCED) {
-		if (mlx5_check_vec_tx_support(dev) > 0) {
-			if (mlx5_check_raw_vec_tx_support(dev) > 0)
-				tx_pkt_burst = mlx5_tx_burst_raw_vec;
-			else
-				tx_pkt_burst = mlx5_tx_burst_vec;
-			DRV_LOG(DEBUG,
-				"port %u selected enhanced MPW Tx vectorized"
-				" function",
-				dev->data->port_id);
-		} else {
-			tx_pkt_burst = mlx5_tx_burst_empw;
-			DRV_LOG(DEBUG,
-				"port %u selected enhanced MPW Tx function",
-				dev->data->port_id);
-		}
-	} else if (config->mps && (config->txq_inline > 0)) {
-		tx_pkt_burst = mlx5_tx_burst_mpw_inline;
-		DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
-			dev->data->port_id);
-	} else if (config->mps) {
-		tx_pkt_burst = mlx5_tx_burst_mpw;
-		DRV_LOG(DEBUG, "port %u selected MPW Tx function",
-			dev->data->port_id);
-	}
-	return tx_pkt_burst;
-}
-
-/**
  * Configure the RX function to use.
  *
  * @param dev
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 95ff29a..dfd9317 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -39,32 +39,12 @@
 /* Invalidate a CQE. */
 #define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
 
-/* Maximum number of packets a multi-packet WQE can handle. */
-#define MLX5_MPW_DSEG_MAX 5
-
 /* WQE DWORD size */
 #define MLX5_WQE_DWORD_SIZE 16
 
 /* WQE size */
 #define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
 
-/* Max size of a WQE session. */
-#define MLX5_WQE_SIZE_MAX 960U
-
-/* Compute the number of DS. */
-#define MLX5_WQE_DS(n) \
-	(((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)
-
-/* Room for inline data in multi-packet WQE. */
-#define MLX5_MWQE64_INL_DATA 28
-
-/* Default minimum number of Tx queues for inlining packets. */
-#define MLX5_EMPW_MIN_TXQS 8
-
-/* Default max packet length to be inlined. */
-#define MLX5_EMPW_MAX_INLINE_LEN (4U * MLX5_WQE_SIZE)
-
-
 #define MLX5_OPC_MOD_ENHANCED_MPSW 0
 #define MLX5_OPCODE_ENHANCED_MPSW 0x29
 
@@ -164,47 +144,11 @@ enum mlx5_completion_mode {
 	MLX5_COMP_CQE_AND_EQE = 0x3,
 };
 
-/* Subset of struct mlx5_wqe_eth_seg. */
-struct mlx5_wqe_eth_seg_small {
-	uint32_t rsvd0;
-	uint8_t	cs_flags;
-	uint8_t	rsvd1;
-	uint16_t mss;
-	uint32_t flow_table_metadata;
-	uint16_t inline_hdr_sz;
-	uint8_t inline_hdr[2];
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
-struct mlx5_wqe_inl_small {
-	uint32_t byte_cnt;
-	uint8_t raw;
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
-struct mlx5_wqe_ctrl {
-	uint32_t ctrl0;
-	uint32_t ctrl1;
-	uint32_t ctrl2;
-	uint32_t ctrl3;
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
 /* Small common part of the WQE. */
 struct mlx5_wqe {
 	uint32_t ctrl[4];
-	struct mlx5_wqe_eth_seg_small eseg;
-};
-
-/* Vectorize WQE header. */
-struct mlx5_wqe_v {
-	rte_v128u32_t ctrl;
-	rte_v128u32_t eseg;
 };
 
-/* WQE. */
-struct mlx5_wqe64 {
-	struct mlx5_wqe hdr;
-	uint8_t raw[32];
-} __rte_aligned(MLX5_WQE_SIZE);
-
 /* MPW mode. */
 enum mlx5_mpw_mode {
 	MLX5_MPW_DISABLED,
@@ -212,27 +156,6 @@ enum mlx5_mpw_mode {
 	MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2. */
 };
 
-/* MPW session status. */
-enum mlx5_mpw_state {
-	MLX5_MPW_STATE_OPENED,
-	MLX5_MPW_INL_STATE_OPENED,
-	MLX5_MPW_ENHANCED_STATE_OPENED,
-	MLX5_MPW_STATE_CLOSED,
-};
-
-/* MPW session descriptor. */
-struct mlx5_mpw {
-	enum mlx5_mpw_state state;
-	unsigned int pkts_n;
-	unsigned int len;
-	unsigned int total_len;
-	volatile struct mlx5_wqe *wqe;
-	union {
-		volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX];
-		volatile uint8_t *raw;
-	} data;
-};
-
 /* WQE for Multi-Packet RQ. */
 struct mlx5_wqe_mprq {
 	struct mlx5_wqe_srq_next_seg next_seg;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index c1dc8c4..f2d6918 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -288,140 +288,6 @@
 }
 
 /**
- * Return the size of tailroom of WQ.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param addr
- *   Pointer to tail of WQ.
- *
- * @return
- *   Size of tailroom.
- */
-static inline size_t
-tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
-{
-	size_t tailroom;
-	tailroom = (uintptr_t)(txq->wqes) +
-		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
-		   (uintptr_t)addr;
-	return tailroom;
-}
-
-/**
- * Copy data to tailroom of circular queue.
- *
- * @param dst
- *   Pointer to destination.
- * @param src
- *   Pointer to source.
- * @param n
- *   Number of bytes to copy.
- * @param base
- *   Pointer to head of queue.
- * @param tailroom
- *   Size of tailroom from dst.
- *
- * @return
- *   Pointer after copied data.
- */
-static inline void *
-mlx5_copy_to_wq(void *dst, const void *src, size_t n,
-		void *base, size_t tailroom)
-{
-	void *ret;
-
-	if (n > tailroom) {
-		rte_memcpy(dst, src, tailroom);
-		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
-			   n - tailroom);
-		ret = (uint8_t *)base + n - tailroom;
-	} else {
-		rte_memcpy(dst, src, n);
-		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
-	}
-	return ret;
-}
-
-/**
- * Inline TSO headers into WQE.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-inline_tso(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
-	   uint32_t *length,
-	   uintptr_t *addr,
-	   uint16_t *pkt_inline_sz,
-	   uint8_t **raw,
-	   uint16_t *max_wqe,
-	   uint16_t *tso_segsz,
-	   uint16_t *tso_header_sz)
-{
-	uintptr_t end = (uintptr_t)(((uintptr_t)txq->wqes) +
-				    (1 << txq->wqe_n) * MLX5_WQE_SIZE);
-	unsigned int copy_b;
-	uint8_t vlan_sz = (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
-	const uint8_t tunneled = txq->tunnel_en && (buf->ol_flags &
-				 PKT_TX_TUNNEL_MASK);
-	uint16_t n_wqe;
-
-	*tso_segsz = buf->tso_segsz;
-	*tso_header_sz = buf->l2_len + vlan_sz + buf->l3_len + buf->l4_len;
-	if (unlikely(*tso_segsz == 0 || *tso_header_sz == 0)) {
-		txq->stats.oerrors++;
-		return -EINVAL;
-	}
-	if (tunneled)
-		*tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
-	/* First seg must contain all TSO headers. */
-	if (unlikely(*tso_header_sz > MLX5_MAX_TSO_HEADER) ||
-		     *tso_header_sz > DATA_LEN(buf)) {
-		txq->stats.oerrors++;
-		return -EINVAL;
-	}
-	copy_b = *tso_header_sz - *pkt_inline_sz;
-	if (!copy_b || ((end - (uintptr_t)*raw) < copy_b))
-		return -EAGAIN;
-	n_wqe = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
-	if (unlikely(*max_wqe < n_wqe))
-		return -EINVAL;
-	*max_wqe -= n_wqe;
-	rte_memcpy((void *)*raw, (void *)*addr, copy_b);
-	*length -= copy_b;
-	*addr += copy_b;
-	copy_b = MLX5_WQE_DS(copy_b) * MLX5_WQE_DWORD_SIZE;
-	*pkt_inline_sz += copy_b;
-	*raw += copy_b;
-	return 0;
-}
-
-/**
- * DPDK callback to check the status of a tx descriptor.
- *
- * @param tx_queue
- *   The tx queue.
- * @param[in] offset
- *   The index of the descriptor in the ring.
- *
- * @return
- *   The status of the tx descriptor.
- */
-int
-mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
-{
-	struct mlx5_txq_data *txq = tx_queue;
-	uint16_t used;
-
-	mlx5_tx_complete(txq);
-	used = txq->elts_head - txq->elts_tail;
-	if (offset < used)
-		return RTE_ETH_TX_DESC_FULL;
-	return RTE_ETH_TX_DESC_DONE;
-}
-
-/**
  * Internal function to compute the number of used descriptors in an RX queue
  *
  * @param rxq
@@ -655,7 +521,7 @@
 						    (1 << txq->cqe_n));
 			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
 						    (const void *)((uintptr_t)
-						    tx_mlx5_wqe(txq, 0)),
+						    txq->wqes),
 						    MLX5_WQE_SIZE *
 						    (1 << txq->wqe_n));
 			txq_ctrl->dump_file_n++;
@@ -683,1247 +549,6 @@
 }
 
 /**
- * DPDK callback for TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	unsigned int k = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
-	unsigned int segs_n = 0;
-	const unsigned int max_inline = txq->max_inline;
-	uint64_t addr_64;
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch first packet cacheline. */
-	rte_prefetch0(*pkts);
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *pkts; /* First_seg. */
-		uint8_t *raw;
-		volatile struct mlx5_wqe_v *wqe = NULL;
-		volatile rte_v128u32_t *dseg = NULL;
-		uint32_t length;
-		unsigned int ds = 0;
-		unsigned int sg = 0; /* counter of additional segs attached. */
-		uintptr_t addr;
-		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
-		uint16_t tso_header_sz = 0;
-		uint16_t ehdr;
-		uint8_t cs_flags;
-		uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
-		uint32_t swp_offsets = 0;
-		uint8_t swp_types = 0;
-		rte_be32_t metadata;
-		uint16_t tso_segsz = 0;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		uint32_t total_length = 0;
-#endif
-		int ret;
-
-		segs_n = buf->nb_segs;
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		max_elts -= segs_n;
-		sg = --segs_n;
-		if (unlikely(--max_wqe == 0))
-			break;
-		wqe = (volatile struct mlx5_wqe_v *)
-			tx_mlx5_wqe(txq, txq->wqe_ci);
-		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-		if (pkts_n - i > 1)
-			rte_prefetch0(*(pkts + 1));
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		length = DATA_LEN(buf);
-		ehdr = (((uint8_t *)addr)[1] << 8) |
-		       ((uint8_t *)addr)[0];
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		total_length = length;
-#endif
-		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
-			txq->stats.oerrors++;
-			break;
-		}
-		/* Update element. */
-		(*txq->elts)[elts_head & elts_m] = buf;
-		/* Prefetch next buffer data. */
-		if (pkts_n - i > 1)
-			rte_prefetch0(
-			    rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types);
-		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Replace the Ethernet type by the VLAN if necessary. */
-		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
-							 buf->vlan_tci);
-			unsigned int len = 2 * RTE_ETHER_ADDR_LEN - 2;
-
-			addr += 2;
-			length -= 2;
-			/* Copy Destination and source mac address. */
-			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
-			/* Copy VLAN. */
-			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
-			/* Copy missing two bytes to end the DSeg. */
-			memcpy((uint8_t *)raw + len + sizeof(vlan),
-			       ((uint8_t *)addr) + len, 2);
-			addr += len + 2;
-			length -= (len + 2);
-		} else {
-			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
-			       MLX5_WQE_DWORD_SIZE);
-			length -= pkt_inline_sz;
-			addr += pkt_inline_sz;
-		}
-		raw += MLX5_WQE_DWORD_SIZE;
-		if (tso) {
-			ret = inline_tso(txq, buf, &length,
-					 &addr, &pkt_inline_sz,
-					 &raw, &max_wqe,
-					 &tso_segsz, &tso_header_sz);
-			if (ret == -EINVAL) {
-				break;
-			} else if (ret == -EAGAIN) {
-				/* NOP WQE. */
-				wqe->ctrl = (rte_v128u32_t){
-					rte_cpu_to_be_32(txq->wqe_ci << 8),
-					rte_cpu_to_be_32(txq->qp_num_8s | 1),
-					rte_cpu_to_be_32
-						(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-					0,
-				};
-				ds = 1;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-				total_length = 0;
-#endif
-				k++;
-				goto next_wqe;
-			}
-		}
-		/* Inline if enough room. */
-		if (max_inline || tso) {
-			uint32_t inl = 0;
-			uintptr_t end = (uintptr_t)
-				(((uintptr_t)txq->wqes) +
-				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
-			unsigned int inline_room = max_inline *
-						   RTE_CACHE_LINE_SIZE -
-						   (pkt_inline_sz - 2) -
-						   !!tso * sizeof(inl);
-			uintptr_t addr_end;
-			unsigned int copy_b;
-
-pkt_inline:
-			addr_end = RTE_ALIGN_FLOOR(addr + inline_room,
-						   RTE_CACHE_LINE_SIZE);
-			copy_b = (addr_end > addr) ?
-				 RTE_MIN((addr_end - addr), length) : 0;
-			if (copy_b && ((end - (uintptr_t)raw) >
-				       (copy_b + sizeof(inl)))) {
-				/*
-				 * One Dseg remains in the current WQE.  To
-				 * keep the computation positive, it is
-				 * removed after the bytes to Dseg conversion.
-				 */
-				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
-
-				if (unlikely(max_wqe < n))
-					break;
-				max_wqe -= n;
-				if (tso) {
-					assert(inl == 0);
-					inl = rte_cpu_to_be_32(copy_b |
-							       MLX5_INLINE_SEG);
-					rte_memcpy((void *)raw,
-						   (void *)&inl, sizeof(inl));
-					raw += sizeof(inl);
-					pkt_inline_sz += sizeof(inl);
-				}
-				rte_memcpy((void *)raw, (void *)addr, copy_b);
-				addr += copy_b;
-				length -= copy_b;
-				pkt_inline_sz += copy_b;
-			}
-			/*
-			 * 2 DWORDs consumed by the WQE header + ETH segment +
-			 * the size of the inline part of the packet.
-			 */
-			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
-			if (length > 0) {
-				if (ds % (MLX5_WQE_SIZE /
-					  MLX5_WQE_DWORD_SIZE) == 0) {
-					if (unlikely(--max_wqe == 0))
-						break;
-					dseg = (volatile rte_v128u32_t *)
-					       tx_mlx5_wqe(txq, txq->wqe_ci +
-							   ds / 4);
-				} else {
-					dseg = (volatile rte_v128u32_t *)
-						((uintptr_t)wqe +
-						 (ds * MLX5_WQE_DWORD_SIZE));
-				}
-				goto use_dseg;
-			} else if (!segs_n) {
-				goto next_pkt;
-			} else {
-				/*
-				 * Further inline the next segment only for
-				 * non-TSO packets.
-				 */
-				if (!tso) {
-					raw += copy_b;
-					inline_room -= copy_b;
-				} else {
-					inline_room = 0;
-				}
-				/* Move to the next segment. */
-				--segs_n;
-				buf = buf->next;
-				assert(buf);
-				addr = rte_pktmbuf_mtod(buf, uintptr_t);
-				length = DATA_LEN(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-				total_length += length;
-#endif
-				(*txq->elts)[++elts_head & elts_m] = buf;
-				goto pkt_inline;
-			}
-		} else {
-			/*
-			 * No inline has been done in the packet, only the
-			 * Ethernet Header as been stored.
-			 */
-			dseg = (volatile rte_v128u32_t *)
-				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
-			ds = 3;
-use_dseg:
-			/* Add the remaining packet as a simple ds. */
-			addr_64 = rte_cpu_to_be_64(addr);
-			*dseg = (rte_v128u32_t){
-				rte_cpu_to_be_32(length),
-				mlx5_tx_mb2mr(txq, buf),
-				addr_64,
-				addr_64 >> 32,
-			};
-			++ds;
-			if (!segs_n)
-				goto next_pkt;
-		}
-next_seg:
-		assert(buf);
-		assert(ds);
-		assert(wqe);
-		/*
-		 * Spill on next WQE when the current one does not have
-		 * enough room left. Size of WQE must a be a multiple
-		 * of data segment size.
-		 */
-		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
-		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
-			if (unlikely(--max_wqe == 0))
-				break;
-			dseg = (volatile rte_v128u32_t *)
-			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
-			rte_prefetch0(tx_mlx5_wqe(txq,
-						  txq->wqe_ci + ds / 4 + 1));
-		} else {
-			++dseg;
-		}
-		++ds;
-		buf = buf->next;
-		assert(buf);
-		length = DATA_LEN(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		total_length += length;
-#endif
-		/* Store segment information. */
-		addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
-		*dseg = (rte_v128u32_t){
-			rte_cpu_to_be_32(length),
-			mlx5_tx_mb2mr(txq, buf),
-			addr_64,
-			addr_64 >> 32,
-		};
-		(*txq->elts)[++elts_head & elts_m] = buf;
-		if (--segs_n)
-			goto next_seg;
-next_pkt:
-		if (ds > MLX5_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		++elts_head;
-		++pkts;
-		++i;
-		j += sg;
-		/* Initialize known and common part of the WQE structure. */
-		if (tso) {
-			wqe->ctrl = (rte_v128u32_t){
-				rte_cpu_to_be_32((txq->wqe_ci << 8) |
-						 MLX5_OPCODE_TSO),
-				rte_cpu_to_be_32(txq->qp_num_8s | ds),
-				rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-				0,
-			};
-			wqe->eseg = (rte_v128u32_t){
-				swp_offsets,
-				cs_flags | (swp_types << 8) |
-				(rte_cpu_to_be_16(tso_segsz) << 16),
-				metadata,
-				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
-			};
-		} else {
-			wqe->ctrl = (rte_v128u32_t){
-				rte_cpu_to_be_32((txq->wqe_ci << 8) |
-						 MLX5_OPCODE_SEND),
-				rte_cpu_to_be_32(txq->qp_num_8s | ds),
-				rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-				0,
-			};
-			wqe->eseg = (rte_v128u32_t){
-				swp_offsets,
-				cs_flags | (swp_types << 8),
-				metadata,
-				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
-			};
-		}
-next_wqe:
-		txq->wqe_ci += (ds + 3) / 4;
-		/* Save the last successful WQE for completion request */
-		last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += total_length;
-#endif
-	} while (i < pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely((i + k) == 0))
-		return 0;
-	txq->elts_head += (i + j);
-	/* Check whether completion threshold has been reached. */
-	comp = txq->elts_comp + i + j + k;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		last_wqe->ctrl2 = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						   MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		last_wqe->ctrl3 = txq->elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
-	return i;
-}
-
-/**
- * Open a MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param length
- *   Packet length.
- */
-static inline void
-mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
-		(volatile struct mlx5_wqe_data_seg (*)[])
-		tx_mlx5_wqe(txq, idx + 1);
-
-	mpw->state = MLX5_MPW_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->len = length;
-	mpw->total_len = 0;
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
-	mpw->wqe->eseg.inline_hdr_sz = 0;
-	mpw->wqe->eseg.rsvd0 = 0;
-	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.flow_table_metadata = 0;
-	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
-					     (txq->wqe_ci << 8) |
-					     MLX5_OPCODE_TSO);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
-		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
-	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
-		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
-	mpw->data.dseg[2] = &(*dseg)[0];
-	mpw->data.dseg[3] = &(*dseg)[1];
-	mpw->data.dseg[4] = &(*dseg)[2];
-}
-
-/**
- * Close a MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- */
-static inline void
-mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	unsigned int num = mpw->pkts_n;
-
-	/*
-	 * Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	if (num < 3)
-		++txq->wqe_ci;
-	else
-		txq->wqe_ci += 2;
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-}
-
-/**
- * DPDK callback for TX with MPW support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch first packet cacheline. */
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uint32_t length;
-		unsigned int segs_n = buf->nb_segs;
-		uint32_t cs_flags;
-		rte_be32_t metadata;
-
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		/* Do not bother with large packets MPW cannot handle. */
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		max_elts -= segs_n;
-		--pkts_n;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		assert(length);
-		/* Start new session if packet differs. */
-		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
-		    ((mpw.len != length) ||
-		     (segs_n != 1) ||
-		     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-		     (mpw.wqe->eseg.cs_flags != cs_flags)))
-			mlx5_mpw_close(txq, &mpw);
-		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
-			/*
-			 * Multi-Packet WQE consumes at most two WQE.
-			 * mlx5_mpw_new() expects to be able to use such
-			 * resources.
-			 */
-			if (unlikely(max_wqe < 2))
-				break;
-			max_wqe -= 2;
-			mlx5_mpw_new(txq, &mpw, length);
-			mpw.wqe->eseg.cs_flags = cs_flags;
-			mpw.wqe->eseg.flow_table_metadata = metadata;
-		}
-		/* Multi-segment packets must be alone in their MPW. */
-		assert((segs_n == 1) || (mpw.pkts_n == 0));
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-		length = 0;
-#endif
-		do {
-			volatile struct mlx5_wqe_data_seg *dseg;
-			uintptr_t addr;
-
-			assert(buf);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			dseg = mpw.data.dseg[mpw.pkts_n];
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			*dseg = (struct mlx5_wqe_data_seg){
-				.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
-				.lkey = mlx5_tx_mb2mr(txq, buf),
-				.addr = rte_cpu_to_be_64(addr),
-			};
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-			length += DATA_LEN(buf);
-#endif
-			buf = buf->next;
-			++mpw.pkts_n;
-			++j;
-		} while (--segs_n);
-		assert(length == mpw.len);
-		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
-			mlx5_mpw_close(txq, &mpw);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	/* "j" includes both packets and segments. */
-	comp = txq->elts_comp + j;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	if (mpw.state == MLX5_MPW_STATE_OPENED)
-		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * Open a MPW inline session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param length
- *   Packet length.
- */
-static inline void
-mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
-		    uint32_t length)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-	struct mlx5_wqe_inl_small *inl;
-
-	mpw->state = MLX5_MPW_INL_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->len = length;
-	mpw->total_len = 0;
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
-					     (txq->wqe_ci << 8) |
-					     MLX5_OPCODE_TSO);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
-	mpw->wqe->eseg.inline_hdr_sz = 0;
-	mpw->wqe->eseg.cs_flags = 0;
-	mpw->wqe->eseg.rsvd0 = 0;
-	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.flow_table_metadata = 0;
-	inl = (struct mlx5_wqe_inl_small *)
-		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
-	mpw->data.raw = (uint8_t *)&inl->raw;
-}
-
-/**
- * Close a MPW inline session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- */
-static inline void
-mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	unsigned int size;
-	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
-		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
-
-	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
-	/*
-	 * Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
-					     MLX5_WQE_DS(size));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
-	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
-}
-
-/**
- * DPDK callback for TX with MPW inline support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
-			 uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-	/*
-	 * Compute the maximum number of WQE which can be consumed by inline
-	 * code.
-	 * - 2 DSEG for:
-	 *   - 1 control segment,
-	 *   - 1 Ethernet segment,
-	 * - N Dseg from the inline request.
-	 */
-	const unsigned int wqe_inl_n =
-		((2 * MLX5_WQE_DWORD_SIZE +
-		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
-		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch first packet cacheline. */
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uintptr_t addr;
-		uint32_t length;
-		unsigned int segs_n = buf->nb_segs;
-		uint8_t cs_flags;
-		rte_be32_t metadata;
-
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		/* Do not bother with large packets MPW cannot handle. */
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		max_elts -= segs_n;
-		--pkts_n;
-		/*
-		 * Compute max_wqe in case less WQE were consumed in previous
-		 * iteration.
-		 */
-		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		/* Start new session if packet differs. */
-		if (mpw.state == MLX5_MPW_STATE_OPENED) {
-			if ((mpw.len != length) ||
-			    (segs_n != 1) ||
-			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags))
-				mlx5_mpw_close(txq, &mpw);
-		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
-			if ((mpw.len != length) ||
-			    (segs_n != 1) ||
-			    (length > inline_room) ||
-			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
-				mlx5_mpw_inline_close(txq, &mpw);
-				inline_room =
-					txq->max_inline * RTE_CACHE_LINE_SIZE;
-			}
-		}
-		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
-			if ((segs_n != 1) ||
-			    (length > inline_room)) {
-				/*
-				 * Multi-Packet WQE consumes at most two WQE.
-				 * mlx5_mpw_new() expects to be able to use
-				 * such resources.
-				 */
-				if (unlikely(max_wqe < 2))
-					break;
-				max_wqe -= 2;
-				mlx5_mpw_new(txq, &mpw, length);
-				mpw.wqe->eseg.cs_flags = cs_flags;
-				mpw.wqe->eseg.flow_table_metadata = metadata;
-			} else {
-				if (unlikely(max_wqe < wqe_inl_n))
-					break;
-				max_wqe -= wqe_inl_n;
-				mlx5_mpw_inline_new(txq, &mpw, length);
-				mpw.wqe->eseg.cs_flags = cs_flags;
-				mpw.wqe->eseg.flow_table_metadata = metadata;
-			}
-		}
-		/* Multi-segment packets must be alone in their MPW. */
-		assert((segs_n == 1) || (mpw.pkts_n == 0));
-		if (mpw.state == MLX5_MPW_STATE_OPENED) {
-			assert(inline_room ==
-			       txq->max_inline * RTE_CACHE_LINE_SIZE);
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-			length = 0;
-#endif
-			do {
-				volatile struct mlx5_wqe_data_seg *dseg;
-
-				assert(buf);
-				(*txq->elts)[elts_head++ & elts_m] = buf;
-				dseg = mpw.data.dseg[mpw.pkts_n];
-				addr = rte_pktmbuf_mtod(buf, uintptr_t);
-				*dseg = (struct mlx5_wqe_data_seg){
-					.byte_count =
-					       rte_cpu_to_be_32(DATA_LEN(buf)),
-					.lkey = mlx5_tx_mb2mr(txq, buf),
-					.addr = rte_cpu_to_be_64(addr),
-				};
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-				length += DATA_LEN(buf);
-#endif
-				buf = buf->next;
-				++mpw.pkts_n;
-				++j;
-			} while (--segs_n);
-			assert(length == mpw.len);
-			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
-				mlx5_mpw_close(txq, &mpw);
-		} else {
-			unsigned int max;
-
-			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
-			assert(length <= inline_room);
-			assert(length == DATA_LEN(buf));
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			/* Maximum number of bytes before wrapping. */
-			max = ((((uintptr_t)(txq->wqes)) +
-				(1 << txq->wqe_n) *
-				MLX5_WQE_SIZE) -
-			       (uintptr_t)mpw.data.raw);
-			if (length > max) {
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)addr,
-					   max);
-				mpw.data.raw = (volatile void *)txq->wqes;
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)(addr + max),
-					   length - max);
-				mpw.data.raw += length - max;
-			} else {
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)addr,
-					   length);
-
-				if (length == max)
-					mpw.data.raw =
-						(volatile void *)txq->wqes;
-				else
-					mpw.data.raw += length;
-			}
-			++mpw.pkts_n;
-			mpw.total_len += length;
-			++j;
-			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
-				mlx5_mpw_inline_close(txq, &mpw);
-				inline_room =
-					txq->max_inline * RTE_CACHE_LINE_SIZE;
-			} else {
-				inline_room -= length;
-			}
-		}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	/* "j" includes both packets and segments. */
-	comp = txq->elts_comp + j;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
-		mlx5_mpw_inline_close(txq, &mpw);
-	else if (mpw.state == MLX5_MPW_STATE_OPENED)
-		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * Open an Enhanced MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param length
- *   Packet length.
- */
-static inline void
-mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-
-	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->total_len = sizeof(struct mlx5_wqe);
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->ctrl[0] =
-		rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
-				 (txq->wqe_ci << 8) |
-				 MLX5_OPCODE_ENHANCED_MPSW);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
-	if (unlikely(padding)) {
-		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
-
-		/* Pad the first 2 DWORDs with zero-length inline header. */
-		*(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
-		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
-			rte_cpu_to_be_32(MLX5_INLINE_SEG);
-		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
-		/* Start from the next WQEBB. */
-		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
-	} else {
-		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
-	}
-}
-
-/**
- * Close an Enhanced MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- *
- * @return
- *   Number of consumed WQEs.
- */
-static inline uint16_t
-mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	uint16_t ret;
-
-	/* Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
-					     MLX5_WQE_DS(mpw->total_len));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
-	txq->wqe_ci += ret;
-	return ret;
-}
-
-/**
- * TX with Enhanced MPW support.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	       uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
-	unsigned int mpw_room = 0;
-	unsigned int inl_pad = 0;
-	uint32_t inl_hdr;
-	uint64_t addr_64;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uintptr_t addr;
-		unsigned int do_inline = 0; /* Whether inline is possible. */
-		uint32_t length;
-		uint8_t cs_flags;
-		rte_be32_t metadata;
-
-		/* Multi-segmented packet is handled in slow-path outside. */
-		assert(NB_SEGS(buf) == 1);
-		/* Make sure there is enough room to store this packet. */
-		if (max_elts - j == 0)
-			break;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		/* Start new session if:
-		 * - multi-segment packet
-		 * - no space left even for a dseg
-		 * - next packet can be inlined with a new WQE
-		 * - cs_flag differs
-		 */
-		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
-			if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
-			     mpw_room) ||
-			    (length <= txq->inline_max_packet_sz &&
-			     inl_pad + sizeof(inl_hdr) + length >
-			     mpw_room) ||
-			     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags))
-				max_wqe -= mlx5_empw_close(txq, &mpw);
-		}
-		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
-			/* In Enhanced MPW, inline as much as the budget is
-			 * allowed. The remaining space is to be filled with
-			 * dsegs. If the title WQEBB isn't padded, it will have
-			 * 2 dsegs there.
-			 */
-			mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
-					   (max_inline ? max_inline :
-					    pkts_n * MLX5_WQE_DWORD_SIZE) +
-					   MLX5_WQE_SIZE);
-			if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
-				break;
-			/* Don't pad the title WQEBB to not waste WQ. */
-			mlx5_empw_new(txq, &mpw, 0);
-			mpw_room -= mpw.total_len;
-			inl_pad = 0;
-			do_inline = length <= txq->inline_max_packet_sz &&
-				    sizeof(inl_hdr) + length <= mpw_room &&
-				    !txq->mpw_hdr_dseg;
-			mpw.wqe->eseg.cs_flags = cs_flags;
-			mpw.wqe->eseg.flow_table_metadata = metadata;
-		} else {
-			/* Evaluate whether the next packet can be inlined.
-			 * Inlininig is possible when:
-			 * - length is less than configured value
-			 * - length fits for remaining space
-			 * - not required to fill the title WQEBB with dsegs
-			 */
-			do_inline =
-				length <= txq->inline_max_packet_sz &&
-				inl_pad + sizeof(inl_hdr) + length <=
-				 mpw_room &&
-				(!txq->mpw_hdr_dseg ||
-				 mpw.total_len >= MLX5_WQE_SIZE);
-		}
-		if (max_inline && do_inline) {
-			/* Inline packet into WQE. */
-			unsigned int max;
-
-			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
-			assert(length == DATA_LEN(buf));
-			inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			mpw.data.raw = (volatile void *)
-				((uintptr_t)mpw.data.raw + inl_pad);
-			max = tx_mlx5_wq_tailroom(txq,
-					(void *)(uintptr_t)mpw.data.raw);
-			/* Copy inline header. */
-			mpw.data.raw = (volatile void *)
-				mlx5_copy_to_wq(
-					  (void *)(uintptr_t)mpw.data.raw,
-					  &inl_hdr,
-					  sizeof(inl_hdr),
-					  (void *)(uintptr_t)txq->wqes,
-					  max);
-			max = tx_mlx5_wq_tailroom(txq,
-					(void *)(uintptr_t)mpw.data.raw);
-			/* Copy packet data. */
-			mpw.data.raw = (volatile void *)
-				mlx5_copy_to_wq(
-					  (void *)(uintptr_t)mpw.data.raw,
-					  (void *)addr,
-					  length,
-					  (void *)(uintptr_t)txq->wqes,
-					  max);
-			++mpw.pkts_n;
-			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
-			/* No need to get completion as the entire packet is
-			 * copied to WQ. Free the buf right away.
-			 */
-			rte_pktmbuf_free_seg(buf);
-			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
-			/* Add pad in the next packet if any. */
-			inl_pad = (((uintptr_t)mpw.data.raw +
-					(MLX5_WQE_DWORD_SIZE - 1)) &
-					~(MLX5_WQE_DWORD_SIZE - 1)) -
-				  (uintptr_t)mpw.data.raw;
-		} else {
-			/* No inline. Load a dseg of packet pointer. */
-			volatile rte_v128u32_t *dseg;
-
-			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
-			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
-			assert(length == DATA_LEN(buf));
-			if (!tx_mlx5_wq_tailroom(txq,
-					(void *)((uintptr_t)mpw.data.raw
-						+ inl_pad)))
-				dseg = (volatile void *)txq->wqes;
-			else
-				dseg = (volatile void *)
-					((uintptr_t)mpw.data.raw +
-					 inl_pad);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
-								    uintptr_t));
-			*dseg = (rte_v128u32_t) {
-				rte_cpu_to_be_32(length),
-				mlx5_tx_mb2mr(txq, buf),
-				addr_64,
-				addr_64 >> 32,
-			};
-			mpw.data.raw = (volatile void *)(dseg + 1);
-			mpw.total_len += (inl_pad + sizeof(*dseg));
-			++j;
-			++mpw.pkts_n;
-			mpw_room -= (inl_pad + sizeof(*dseg));
-			inl_pad = 0;
-		}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (i < pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
-			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
-			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-		txq->mpw_comp = txq->wqe_ci;
-	} else {
-		txq->elts_comp += j;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
-		mlx5_empw_close(txq, &mpw);
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * DPDK callback for TX with Enhanced MPW support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint16_t n;
-		uint16_t ret;
-
-		n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
-		if (n) {
-			ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
-			if (!ret)
-				break;
-			nb_tx += ret;
-		}
-		n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
-		if (n) {
-			ret = txq_burst_empw(txq, &pkts[nb_tx], n);
-			if (!ret)
-				break;
-			nb_tx += ret;
-		}
-	}
-	return nb_tx;
-}
-
-/**
  * Translate RX completion flags to packet type.
  *
  * @param[in] rxq
@@ -2867,22 +1492,6 @@
  */
 
 __rte_weak uint16_t
-mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused,
-		      struct rte_mbuf **pkts __rte_unused,
-		      uint16_t pkts_n __rte_unused)
-{
-	return 0;
-}
-
-__rte_weak uint16_t
-mlx5_tx_burst_vec(void *dpdk_txq __rte_unused,
-		  struct rte_mbuf **pkts __rte_unused,
-		  uint16_t pkts_n __rte_unused)
-{
-	return 0;
-}
-
-__rte_weak uint16_t
 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
 		  struct rte_mbuf **pkts __rte_unused,
 		  uint16_t pkts_n __rte_unused)
@@ -2891,25 +1500,50 @@
 }
 
 __rte_weak int
-mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
+mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
 {
 	return -ENOTSUP;
 }
 
 __rte_weak int
-mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
+mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
 {
 	return -ENOTSUP;
 }
 
-__rte_weak int
-mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
+/**
+ * DPDK callback to check the status of a tx descriptor.
+ *
+ * @param tx_queue
+ *   The tx queue.
+ * @param[in] offset
+ *   The index of the descriptor in the ring.
+ *
+ * @return
+ *   The status of the Tx descriptor (stub: always RTE_ETH_TX_DESC_FULL).
+ */
+int
+mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
 {
-	return -ENOTSUP;
+	(void)tx_queue;
+	(void)offset;
+	return RTE_ETH_TX_DESC_FULL;
 }
 
-__rte_weak int
-mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
+/**
+ * Configure the TX function to use.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   Pointer to selected Tx burst function (stub: always removed_tx_burst).
+ */
+eth_tx_burst_t
+mlx5_select_tx_function(struct rte_eth_dev *dev)
 {
-	return -ENOTSUP;
+	(void)dev;
+	return removed_tx_burst;
 }
+
+
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3d79c18..acde09d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -329,14 +329,6 @@ struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
 void mlx5_set_ptype_table(void);
 void mlx5_set_cksum_table(void);
 void mlx5_set_swp_types_table(void);
-uint16_t mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
-		       uint16_t pkts_n);
-uint16_t mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts,
-			   uint16_t pkts_n);
-uint16_t mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
-				  uint16_t pkts_n);
-uint16_t mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts,
-			    uint16_t pkts_n);
 __rte_noinline uint16_t mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
 					volatile struct mlx5_err_cqe *err_cqe);
 uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
@@ -360,14 +352,8 @@ int mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
 			const struct mlx5_mp_arg_queue_state_modify *sm);
 
 /* Vectorized version of mlx5_rxtx.c */
-int mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev);
-int mlx5_check_vec_tx_support(struct rte_eth_dev *dev);
 int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);
 int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);
-uint16_t mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-			       uint16_t pkts_n);
-uint16_t mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-			   uint16_t pkts_n);
 uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
 			   uint16_t pkts_n);
 
@@ -478,122 +464,6 @@ enum mlx5_cqe_status {
 }
 
 /**
- * Return the address of the WQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param  wqe_ci
- *   WQE consumer index.
- *
- * @return
- *   WQE address.
- */
-static inline uintptr_t *
-tx_mlx5_wqe(struct mlx5_txq_data *txq, uint16_t ci)
-{
-	ci &= ((1 << txq->wqe_n) - 1);
-	return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
-}
-
-/**
- * Handle the next CQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- *
- * @return
- *   The last Tx buffer element to free.
- */
-static __rte_always_inline uint16_t
-mlx5_tx_cqe_handle(struct mlx5_txq_data *txq)
-{
-	const unsigned int cqe_n = 1 << txq->cqe_n;
-	const unsigned int cqe_cnt = cqe_n - 1;
-	uint16_t last_elts;
-	union {
-		volatile struct mlx5_cqe *cqe;
-		volatile struct mlx5_err_cqe *err_cqe;
-	} u = {
-		.cqe =  &(*txq->cqes)[txq->cq_ci & cqe_cnt],
-	};
-	int ret = check_cqe(u.cqe, cqe_n, txq->cq_ci);
-
-	if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
-		if (unlikely(ret == MLX5_CQE_STATUS_ERR))
-			last_elts = mlx5_tx_error_cqe_handle(txq, u.err_cqe);
-		else
-			/* Do not release buffers. */
-			return txq->elts_tail;
-	} else {
-		uint16_t new_wqe_pi = rte_be_to_cpu_16(u.cqe->wqe_counter);
-		volatile struct mlx5_wqe_ctrl *ctrl =
-				(volatile struct mlx5_wqe_ctrl *)
-					tx_mlx5_wqe(txq, new_wqe_pi);
-
-		/* Release completion burst buffers. */
-		last_elts = ctrl->ctrl3;
-		txq->wqe_pi = new_wqe_pi;
-		txq->cq_ci++;
-	}
-	rte_compiler_barrier();
-	*txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
-	return last_elts;
-}
-
-/**
- * Manage TX completions.
- *
- * When sending a burst, mlx5_tx_burst() posts several WRs.
- *
- * @param txq
- *   Pointer to TX queue structure.
- */
-static __rte_always_inline void
-mlx5_tx_complete(struct mlx5_txq_data *txq)
-{
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	uint16_t elts_free = txq->elts_tail;
-	uint16_t elts_tail;
-	struct rte_mbuf *m, *free[elts_n];
-	struct rte_mempool *pool = NULL;
-	unsigned int blk_n = 0;
-
-	elts_tail = mlx5_tx_cqe_handle(txq);
-	assert((elts_tail & elts_m) < (1 << txq->wqe_n));
-	/* Free buffers. */
-	while (elts_free != elts_tail) {
-		m = rte_pktmbuf_prefree_seg((*txq->elts)[elts_free++ & elts_m]);
-		if (likely(m != NULL)) {
-			if (likely(m->pool == pool)) {
-				free[blk_n++] = m;
-			} else {
-				if (likely(pool != NULL))
-					rte_mempool_put_bulk(pool,
-							     (void *)free,
-							     blk_n);
-				free[0] = m;
-				pool = m->pool;
-				blk_n = 1;
-			}
-		}
-	}
-	if (blk_n)
-		rte_mempool_put_bulk(pool, (void *)free, blk_n);
-#ifndef NDEBUG
-	elts_free = txq->elts_tail;
-	/* Poisoning. */
-	while (elts_free != elts_tail) {
-		memset(&(*txq->elts)[elts_free & elts_m],
-		       0x66,
-		       sizeof((*txq->elts)[elts_free & elts_m]));
-		++elts_free;
-	}
-#endif
-	txq->elts_tail = elts_tail;
-}
-
-/**
  * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which the
  * cloned mbuf is allocated is returned instead.
  *
@@ -710,147 +580,4 @@ enum mlx5_cqe_status {
 	mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
 }
 
-/**
- * Convert mbuf to Verb SWP.
- *
- * @param txq_data
- *   Pointer to the Tx queue.
- * @param buf
- *   Pointer to the mbuf.
- * @param offsets
- *   Pointer to the SWP header offsets.
- * @param swp_types
- *   Pointer to the SWP header types.
- */
-static __rte_always_inline void
-txq_mbuf_to_swp(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
-		uint8_t *offsets, uint8_t *swp_types)
-{
-	const uint64_t vlan = buf->ol_flags & PKT_TX_VLAN_PKT;
-	const uint64_t tunnel = buf->ol_flags & PKT_TX_TUNNEL_MASK;
-	const uint64_t tso = buf->ol_flags & PKT_TX_TCP_SEG;
-	const uint64_t csum_flags = buf->ol_flags & PKT_TX_L4_MASK;
-	const uint64_t inner_ip =
-		buf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6);
-	const uint64_t ol_flags_mask = PKT_TX_L4_MASK | PKT_TX_IPV6 |
-				       PKT_TX_OUTER_IPV6;
-	uint16_t idx;
-	uint16_t off;
-
-	if (likely(!txq->swp_en || (tunnel != PKT_TX_TUNNEL_UDP &&
-				    tunnel != PKT_TX_TUNNEL_IP)))
-		return;
-	/*
-	 * The index should have:
-	 * bit[0:1] = PKT_TX_L4_MASK
-	 * bit[4] = PKT_TX_IPV6
-	 * bit[8] = PKT_TX_OUTER_IPV6
-	 * bit[9] = PKT_TX_OUTER_UDP
-	 */
-	idx = (buf->ol_flags & ol_flags_mask) >> 52;
-	if (tunnel == PKT_TX_TUNNEL_UDP)
-		idx |= 1 << 9;
-	*swp_types = mlx5_swp_types_table[idx];
-	/*
-	 * Set offsets for SW parser. Since ConnectX-5, SW parser just
-	 * complements HW parser. SW parser starts to engage only if HW parser
-	 * can't reach a header. For the older devices, HW parser will not kick
-	 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
-	 * should be set regardless of HW offload.
-	 */
-	off = buf->outer_l2_len + (vlan ? sizeof(struct rte_vlan_hdr) : 0);
-	offsets[1] = off >> 1; /* Outer L3 offset. */
-	off += buf->outer_l3_len;
-	if (tunnel == PKT_TX_TUNNEL_UDP)
-		offsets[0] = off >> 1; /* Outer L4 offset. */
-	if (inner_ip) {
-		off += buf->l2_len;
-		offsets[3] = off >> 1; /* Inner L3 offset. */
-		if (csum_flags == PKT_TX_TCP_CKSUM || tso ||
-		    csum_flags == PKT_TX_UDP_CKSUM) {
-			off += buf->l3_len;
-			offsets[2] = off >> 1; /* Inner L4 offset. */
-		}
-	}
-}
-
-/**
- * Convert the Checksum offloads to Verbs.
- *
- * @param buf
- *   Pointer to the mbuf.
- *
- * @return
- *   Converted checksum flags.
- */
-static __rte_always_inline uint8_t
-txq_ol_cksum_to_cs(struct rte_mbuf *buf)
-{
-	uint32_t idx;
-	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
-	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
-				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
-
-	/*
-	 * The index should have:
-	 * bit[0] = PKT_TX_TCP_SEG
-	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
-	 * bit[4] = PKT_TX_IP_CKSUM
-	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
-	 * bit[9] = tunnel
-	 */
-	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
-	return mlx5_cksum_table[idx];
-}
-
-/**
- * Count the number of contiguous single segment packets.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- *
- * @return
- *   Number of contiguous single segment packets.
- */
-static __rte_always_inline unsigned int
-txq_count_contig_single_seg(struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	unsigned int pos;
-
-	if (!pkts_n)
-		return 0;
-	/* Count the number of contiguous single segment packets. */
-	for (pos = 0; pos < pkts_n; ++pos)
-		if (NB_SEGS(pkts[pos]) > 1)
-			break;
-	return pos;
-}
-
-/**
- * Count the number of contiguous multi-segment packets.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- *
- * @return
- *   Number of contiguous multi-segment packets.
- */
-static __rte_always_inline unsigned int
-txq_count_contig_multi_seg(struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	unsigned int pos;
-
-	if (!pkts_n)
-		return 0;
-	/* Count the number of contiguous multi-segment packets. */
-	for (pos = 0; pos < pkts_n; ++pos)
-		if (NB_SEGS(pkts[pos]) == 1)
-			break;
-	return pos;
-}
-
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index 073044f..f6ec828 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -40,138 +40,6 @@
 #endif
 
 /**
- * Count the number of packets having same ol_flags and same metadata (if
- * PKT_TX_METADATA is set in ol_flags), and calculate cs_flags.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- * @param cs_flags
- *   Pointer of flags to be returned.
- * @param metadata
- *   Pointer of metadata to be returned.
- * @param txq_offloads
- *   Offloads enabled on Tx queue
- *
- * @return
- *   Number of packets having same ol_flags and metadata, if relevant.
- */
-static inline unsigned int
-txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags,
-		 rte_be32_t *metadata, const uint64_t txq_offloads)
-{
-	unsigned int pos;
-	const uint64_t cksum_ol_mask =
-		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
-		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
-		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
-	rte_be32_t p0_metadata, pn_metadata;
-
-	if (!pkts_n)
-		return 0;
-	p0_metadata = pkts[0]->ol_flags & PKT_TX_METADATA ?
-			pkts[0]->tx_metadata : 0;
-	/* Count the number of packets having same offload parameters. */
-	for (pos = 1; pos < pkts_n; ++pos) {
-		/* Check if packet has same checksum flags. */
-		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) &&
-		    ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & cksum_ol_mask))
-			break;
-		/* Check if packet has same metadata. */
-		if (txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) {
-			pn_metadata = pkts[pos]->ol_flags & PKT_TX_METADATA ?
-					pkts[pos]->tx_metadata : 0;
-			if (pn_metadata != p0_metadata)
-				break;
-		}
-	}
-	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
-	*metadata = p0_metadata;
-	return pos;
-}
-
-/**
- * DPDK callback for vectorized TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-		      uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint16_t n;
-		uint16_t ret;
-
-		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0, 0);
-		nb_tx += ret;
-		if (!ret)
-			break;
-	}
-	return nb_tx;
-}
-
-/**
- * DPDK callback for vectorized TX with multi-seg packets and offload.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint8_t cs_flags = 0;
-		uint16_t n;
-		uint16_t ret;
-		rte_be32_t metadata = 0;
-
-		/* Transmit multi-seg packets in the head of pkts list. */
-		if ((txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS) &&
-		    NB_SEGS(pkts[nb_tx]) > 1)
-			nb_tx += txq_scatter_v(txq,
-					       &pkts[nb_tx],
-					       pkts_n - nb_tx);
-		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-		if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
-			n = txq_count_contig_single_seg(&pkts[nb_tx], n);
-		if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP |
-				     DEV_TX_OFFLOAD_MATCH_METADATA))
-			n = txq_calc_offload(&pkts[nb_tx], n,
-					     &cs_flags, &metadata,
-					     txq->offloads);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags, metadata);
-		nb_tx += ret;
-		if (!ret)
-			break;
-	}
-	return nb_tx;
-}
-
-/**
  * Skip error packets.
  *
  * @param rxq
@@ -243,49 +111,6 @@
 }
 
 /**
- * Check Tx queue flags are set for raw vectorized Tx.
- *
- * @param dev
- *   Pointer to Ethernet device.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev)
-{
-	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
-
-	/* Doesn't support any offload. */
-	if (offloads)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
- * Check a device can support vectorized TX.
- *
- * @param dev
- *   Pointer to Ethernet device.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-mlx5_check_vec_tx_support(struct rte_eth_dev *dev)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
-
-	if (!priv->config.tx_vec_en ||
-	    priv->txqs_n > (unsigned int)priv->config.txqs_vec ||
-	    priv->config.mps != MLX5_MPW_ENHANCED ||
-	    offloads & ~MLX5_VEC_TX_OFFLOAD_CAP)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
  * Check a RX queue can support vectorized RX.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 1c7e3b4..9930286 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -27,295 +27,6 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 
 /**
- * Fill in buffer descriptors in a multi-packet send descriptor.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param dseg
- *   Pointer to buffer descriptor to be written.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param n
- *   Number of packets to be filled.
- */
-static inline void
-txq_wr_dseg_v(struct mlx5_txq_data *txq, uint8_t *dseg,
-	      struct rte_mbuf **pkts, unsigned int n)
-{
-	unsigned int pos;
-	uintptr_t addr;
-	const uint8x16_t dseg_shuf_m = {
-		 3,  2,  1,  0, /* length, bswap32 */
-		 4,  5,  6,  7, /* lkey */
-		15, 14, 13, 12, /* addr, bswap64 */
-		11, 10,  9,  8
-	};
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	uint32_t tx_byte = 0;
-#endif
-
-	for (pos = 0; pos < n; ++pos, dseg += MLX5_WQE_DWORD_SIZE) {
-		uint8x16_t desc;
-		struct rte_mbuf *pkt = pkts[pos];
-
-		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
-		desc = vreinterpretq_u8_u32((uint32x4_t) {
-				DATA_LEN(pkt),
-				mlx5_tx_mb2mr(txq, pkt),
-				addr,
-				addr >> 32 });
-		desc = vqtbl1q_u8(desc, dseg_shuf_m);
-		vst1q_u8(dseg, desc);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		tx_byte += DATA_LEN(pkt);
-#endif
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.obytes += tx_byte;
-#endif
-}
-
-/**
- * Send multi-segmented packets until it encounters a single segment packet in
- * the pkts list.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	      uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n;
-	volatile struct mlx5_wqe *wqe = NULL;
-	bool metadata_ol =
-		txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? true : false;
-
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	if (unlikely(!pkts_n))
-		return 0;
-	for (n = 0; n < pkts_n; ++n) {
-		struct rte_mbuf *buf = pkts[n];
-		unsigned int segs_n = buf->nb_segs;
-		unsigned int ds = nb_dword_in_hdr;
-		unsigned int len = PKT_LEN(buf);
-		uint16_t wqe_ci = txq->wqe_ci;
-		const uint8x16_t ctrl_shuf_m = {
-			3,  2,  1,  0, /* bswap32 */
-			7,  6,  5,  4, /* bswap32 */
-			11, 10,  9,  8, /* bswap32 */
-			12, 13, 14, 15
-		};
-		uint8_t cs_flags;
-		uint16_t max_elts;
-		uint16_t max_wqe;
-		uint8x16_t *t_wqe;
-		uint8_t *dseg;
-		uint8x16_t ctrl;
-		rte_be32_t metadata =
-			metadata_ol && (buf->ol_flags & PKT_TX_METADATA) ?
-			buf->tx_metadata : 0;
-
-		assert(segs_n);
-		max_elts = elts_n - (elts_head - txq->elts_tail);
-		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
-		/*
-		 * A MPW session consumes 2 WQEs at most to
-		 * include MLX5_MPW_DSEG_MAX pointers.
-		 */
-		if (segs_n == 1 ||
-		    max_elts < segs_n || max_wqe < 2)
-			break;
-		wqe = &((volatile struct mlx5_wqe64 *)
-			 txq->wqes)[wqe_ci & wq_mask].hdr;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Title WQEBB pointer. */
-		t_wqe = (uint8x16_t *)wqe;
-		dseg = (uint8_t *)(wqe + 1);
-		do {
-			if (!(ds++ % nb_dword_per_wqebb)) {
-				dseg = (uint8_t *)
-					&((volatile struct mlx5_wqe64 *)
-					   txq->wqes)[++wqe_ci & wq_mask];
-			}
-			txq_wr_dseg_v(txq, dseg, &buf, 1);
-			dseg += MLX5_WQE_DWORD_SIZE;
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			buf = buf->next;
-		} while (--segs_n);
-		++wqe_ci;
-		/* Fill CTRL in the header. */
-		ctrl = vreinterpretq_u8_u32((uint32x4_t) {
-				MLX5_OPC_MOD_MPW << 24 |
-				txq->wqe_ci << 8 | MLX5_OPCODE_TSO,
-				txq->qp_num_8s | ds, 4, 0});
-		ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
-		vst1q_u8((void *)t_wqe, ctrl);
-		/* Fill ESEG in the header. */
-		vst1q_u32((void *)(t_wqe + 1),
-			  ((uint32x4_t){ 0,
-					 rte_cpu_to_be_16(len) << 16 | cs_flags,
-					 metadata, 0 }));
-		txq->wqe_ci = wqe_ci;
-	}
-	if (!n)
-		return 0;
-	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
-	txq->elts_head = elts_head;
-	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		wqe->ctrl[3] = txq->elts_head;
-		txq->elts_comp = 0;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += n;
-#endif
-	mlx5_tx_dbrec(txq, wqe);
-	return n;
-}
-
-/**
- * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
- * it returns to make it processed by txq_scatter_v(). All the packets in
- * the pkts list should be single segment packets having same offload flags.
- * This must be checked by txq_count_contig_single_seg() and txq_calc_offload().
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
- * @param cs_flags
- *   Checksum offload flags to be written in the descriptor.
- * @param metadata
- *   Metadata value to be written in the descriptor.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint8_t cs_flags, rte_be32_t metadata)
-{
-	struct rte_mbuf **elts;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n = 0;
-	unsigned int pos;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	uint32_t comp_req;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	uint16_t wq_idx = txq->wqe_ci & wq_mask;
-	volatile struct mlx5_wqe64 *wq =
-		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
-	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
-	const uint8x16_t ctrl_shuf_m = {
-		 3,  2,  1,  0, /* bswap32 */
-		 7,  6,  5,  4, /* bswap32 */
-		11, 10,  9,  8, /* bswap32 */
-		12, 13, 14, 15
-	};
-	uint8x16_t *t_wqe;
-	uint8_t *dseg;
-	uint8x16_t ctrl;
-
-	/* Make sure all packets can fit into a single WQE. */
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
-	if (unlikely(!pkts_n))
-		return 0;
-	elts = &(*txq->elts)[elts_head & elts_m];
-	/* Loop for available tailroom first. */
-	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
-	for (pos = 0; pos < (n & -2); pos += 2)
-		vst1q_u64((void *)&elts[pos], vld1q_u64((void *)&pkts[pos]));
-	if (n & 1)
-		elts[pos] = pkts[pos];
-	/* Check if it crosses the end of the queue. */
-	if (unlikely(n < pkts_n)) {
-		elts = &(*txq->elts)[0];
-		for (pos = 0; pos < pkts_n - n; ++pos)
-			elts[pos] = pkts[n + pos];
-	}
-	txq->elts_head += pkts_n;
-	/* Save title WQEBB pointer. */
-	t_wqe = (uint8x16_t *)wqe;
-	dseg = (uint8_t *)(wqe + 1);
-	/* Calculate the number of entries to the end. */
-	n = RTE_MIN(
-		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
-		pkts_n);
-	/* Fill DSEGs. */
-	txq_wr_dseg_v(txq, dseg, pkts, n);
-	/* Check if it crosses the end of the queue. */
-	if (n < pkts_n) {
-		dseg = (uint8_t *)txq->wqes;
-		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
-	}
-	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
-		txq->elts_comp += pkts_n;
-		comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET;
-	} else {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request a completion. */
-		txq->elts_comp = 0;
-		comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET;
-	}
-	/* Fill CTRL in the header. */
-	ctrl = vreinterpretq_u8_u32((uint32x4_t) {
-			MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
-			txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW,
-			txq->qp_num_8s | (pkts_n + 2),
-			comp_req,
-			txq->elts_head });
-	ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
-	vst1q_u8((void *)t_wqe, ctrl);
-	/* Fill ESEG in the header. */
-	vst1q_u32((void *)(t_wqe + 1),
-		 ((uint32x4_t) { 0, cs_flags, metadata, 0 }));
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += pkts_n;
-#endif
-	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
-		       nb_dword_per_wqebb;
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
-	return pkts_n;
-}
-
-/**
  * Store free buffers to RX SW ring.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 503ca0f..7bd254f 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -29,290 +29,6 @@
 #endif
 
 /**
- * Fill in buffer descriptors in a multi-packet send descriptor.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param dseg
- *   Pointer to buffer descriptor to be written.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param n
- *   Number of packets to be filled.
- */
-static inline void
-txq_wr_dseg_v(struct mlx5_txq_data *txq, __m128i *dseg,
-	      struct rte_mbuf **pkts, unsigned int n)
-{
-	unsigned int pos;
-	uintptr_t addr;
-	const __m128i shuf_mask_dseg =
-		_mm_set_epi8(8,  9, 10, 11, /* addr, bswap64 */
-			    12, 13, 14, 15,
-			     7,  6,  5,  4, /* lkey */
-			     0,  1,  2,  3  /* length, bswap32 */);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	uint32_t tx_byte = 0;
-#endif
-
-	for (pos = 0; pos < n; ++pos, ++dseg) {
-		__m128i desc;
-		struct rte_mbuf *pkt = pkts[pos];
-
-		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
-		desc = _mm_set_epi32(addr >> 32,
-				     addr,
-				     mlx5_tx_mb2mr(txq, pkt),
-				     DATA_LEN(pkt));
-		desc = _mm_shuffle_epi8(desc, shuf_mask_dseg);
-		_mm_store_si128(dseg, desc);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		tx_byte += DATA_LEN(pkt);
-#endif
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.obytes += tx_byte;
-#endif
-}
-
-/**
- * Send multi-segmented packets until it encounters a single segment packet in
- * the pkts list.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	      uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n;
-	volatile struct mlx5_wqe *wqe = NULL;
-	bool metadata_ol =
-		txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? true : false;
-
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	if (unlikely(!pkts_n))
-		return 0;
-	for (n = 0; n < pkts_n; ++n) {
-		struct rte_mbuf *buf = pkts[n];
-		unsigned int segs_n = buf->nb_segs;
-		unsigned int ds = nb_dword_in_hdr;
-		unsigned int len = PKT_LEN(buf);
-		uint16_t wqe_ci = txq->wqe_ci;
-		const __m128i shuf_mask_ctrl =
-			_mm_set_epi8(15, 14, 13, 12,
-				      8,  9, 10, 11, /* bswap32 */
-				      4,  5,  6,  7, /* bswap32 */
-				      0,  1,  2,  3  /* bswap32 */);
-		uint8_t cs_flags;
-		uint16_t max_elts;
-		uint16_t max_wqe;
-		__m128i *t_wqe, *dseg;
-		__m128i ctrl;
-		rte_be32_t metadata =
-			metadata_ol && (buf->ol_flags & PKT_TX_METADATA) ?
-			buf->tx_metadata : 0;
-
-		assert(segs_n);
-		max_elts = elts_n - (elts_head - txq->elts_tail);
-		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
-		/*
-		 * A MPW session consumes 2 WQEs at most to
-		 * include MLX5_MPW_DSEG_MAX pointers.
-		 */
-		if (segs_n == 1 ||
-		    max_elts < segs_n || max_wqe < 2)
-			break;
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		wqe = &((volatile struct mlx5_wqe64 *)
-			 txq->wqes)[wqe_ci & wq_mask].hdr;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Title WQEBB pointer. */
-		t_wqe = (__m128i *)wqe;
-		dseg = (__m128i *)(wqe + 1);
-		do {
-			if (!(ds++ % nb_dword_per_wqebb)) {
-				dseg = (__m128i *)
-					&((volatile struct mlx5_wqe64 *)
-					   txq->wqes)[++wqe_ci & wq_mask];
-			}
-			txq_wr_dseg_v(txq, dseg++, &buf, 1);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			buf = buf->next;
-		} while (--segs_n);
-		++wqe_ci;
-		/* Fill CTRL in the header. */
-		ctrl = _mm_set_epi32(0, 4, txq->qp_num_8s | ds,
-				     MLX5_OPC_MOD_MPW << 24 |
-				     txq->wqe_ci << 8 | MLX5_OPCODE_TSO);
-		ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
-		_mm_store_si128(t_wqe, ctrl);
-		/* Fill ESEG in the header. */
-		_mm_store_si128(t_wqe + 1,
-				_mm_set_epi32(0, metadata,
-					      (rte_cpu_to_be_16(len) << 16) |
-					      cs_flags, 0));
-		txq->wqe_ci = wqe_ci;
-	}
-	if (!n)
-		return 0;
-	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
-	txq->elts_head = elts_head;
-	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		wqe->ctrl[3] = txq->elts_head;
-		txq->elts_comp = 0;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += n;
-#endif
-	mlx5_tx_dbrec(txq, wqe);
-	return n;
-}
-
-/**
- * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
- * it returns to make it processed by txq_scatter_v(). All the packets in
- * the pkts list should be single segment packets having same offload flags.
- * This must be checked by txq_count_contig_single_seg() and txq_calc_offload().
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
- * @param cs_flags
- *   Checksum offload flags to be written in the descriptor.
- * @param metadata
- *   Metadata value to be written in the descriptor.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint8_t cs_flags, rte_be32_t metadata)
-{
-	struct rte_mbuf **elts;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n = 0;
-	unsigned int pos;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	uint32_t comp_req;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	uint16_t wq_idx = txq->wqe_ci & wq_mask;
-	volatile struct mlx5_wqe64 *wq =
-		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
-	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
-	const __m128i shuf_mask_ctrl =
-		_mm_set_epi8(15, 14, 13, 12,
-			      8,  9, 10, 11, /* bswap32 */
-			      4,  5,  6,  7, /* bswap32 */
-			      0,  1,  2,  3  /* bswap32 */);
-	__m128i *t_wqe, *dseg;
-	__m128i ctrl;
-
-	/* Make sure all packets can fit into a single WQE. */
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
-	assert(pkts_n <= MLX5_DSEG_MAX - nb_dword_in_hdr);
-	if (unlikely(!pkts_n))
-		return 0;
-	elts = &(*txq->elts)[elts_head & elts_m];
-	/* Loop for available tailroom first. */
-	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
-	for (pos = 0; pos < (n & -2); pos += 2)
-		_mm_storeu_si128((__m128i *)&elts[pos],
-				 _mm_loadu_si128((__m128i *)&pkts[pos]));
-	if (n & 1)
-		elts[pos] = pkts[pos];
-	/* Check if it crosses the end of the queue. */
-	if (unlikely(n < pkts_n)) {
-		elts = &(*txq->elts)[0];
-		for (pos = 0; pos < pkts_n - n; ++pos)
-			elts[pos] = pkts[n + pos];
-	}
-	txq->elts_head += pkts_n;
-	/* Save title WQEBB pointer. */
-	t_wqe = (__m128i *)wqe;
-	dseg = (__m128i *)(wqe + 1);
-	/* Calculate the number of entries to the end. */
-	n = RTE_MIN(
-		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
-		pkts_n);
-	/* Fill DSEGs. */
-	txq_wr_dseg_v(txq, dseg, pkts, n);
-	/* Check if it crosses the end of the queue. */
-	if (n < pkts_n) {
-		dseg = (__m128i *)txq->wqes;
-		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
-	}
-	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
-		txq->elts_comp += pkts_n;
-		comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET;
-	} else {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request a completion. */
-		txq->elts_comp = 0;
-		comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET;
-	}
-	/* Fill CTRL in the header. */
-	ctrl = _mm_set_epi32(txq->elts_head, comp_req,
-			     txq->qp_num_8s | (pkts_n + 2),
-			     MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
-				txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW);
-	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
-	_mm_store_si128(t_wqe, ctrl);
-	/* Fill ESEG in the header. */
-	_mm_store_si128(t_wqe + 1, _mm_set_epi32(0, metadata, cs_flags, 0));
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += pkts_n;
-#endif
-	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
-		       nb_dword_per_wqebb;
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
-	return pkts_n;
-}
-
-/**
  * Store free buffers to RX SW ring.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 7741095..0d2dbfa 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -365,25 +365,6 @@
 }
 
 /**
- * Check if the burst function is using eMPW.
- *
- * @param tx_pkt_burst
- *   Tx burst function pointer.
- *
- * @return
- *   1 if the burst function is using eMPW, 0 otherwise.
- */
-static int
-is_empw_burst_func(eth_tx_burst_t tx_pkt_burst)
-{
-	if (tx_pkt_burst == mlx5_tx_burst_raw_vec ||
-	    tx_pkt_burst == mlx5_tx_burst_vec ||
-	    tx_pkt_burst == mlx5_tx_burst_empw)
-		return 1;
-	return 0;
-}
-
-/**
  * Create the Tx queue Verbs object.
  *
  * @param dev
@@ -414,7 +395,6 @@ struct mlx5_txq_ibv *
 	struct mlx5dv_cq cq_info;
 	struct mlx5dv_obj obj;
 	const int desc = 1 << txq_data->elts_n;
-	eth_tx_burst_t tx_pkt_burst = mlx5_select_tx_function(dev);
 	int ret = 0;
 
 	assert(txq_data);
@@ -432,8 +412,6 @@ struct mlx5_txq_ibv *
 		.comp_mask = 0,
 	};
 	cqe_n = desc / MLX5_TX_COMP_THRESH + 1;
-	if (is_empw_burst_func(tx_pkt_burst))
-		cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV;
 	tmpl.cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0);
 	if (tmpl.cq == NULL) {
 		DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure",
@@ -698,93 +676,7 @@ struct mlx5_txq_ibv *
 static void
 txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
 {
-	struct mlx5_priv *priv = txq_ctrl->priv;
-	struct mlx5_dev_config *config = &priv->config;
-	const unsigned int max_tso_inline =
-		((MLX5_MAX_TSO_HEADER + (RTE_CACHE_LINE_SIZE - 1)) /
-		 RTE_CACHE_LINE_SIZE);
-	unsigned int txq_inline;
-	unsigned int txqs_inline;
-	unsigned int inline_max_packet_sz;
-	eth_tx_burst_t tx_pkt_burst =
-		mlx5_select_tx_function(ETH_DEV(priv));
-	int is_empw_func = is_empw_burst_func(tx_pkt_burst);
-	int tso = !!(txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO |
-					       DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
-					       DEV_TX_OFFLOAD_GRE_TNL_TSO |
-					       DEV_TX_OFFLOAD_IP_TNL_TSO |
-					       DEV_TX_OFFLOAD_UDP_TNL_TSO));
-
-	txq_inline = (config->txq_inline == MLX5_ARG_UNSET) ?
-		0 : config->txq_inline;
-	txqs_inline = (config->txqs_inline == MLX5_ARG_UNSET) ?
-		0 : config->txqs_inline;
-	inline_max_packet_sz =
-		(config->inline_max_packet_sz == MLX5_ARG_UNSET) ?
-		0 : config->inline_max_packet_sz;
-	if (is_empw_func) {
-		if (config->txq_inline == MLX5_ARG_UNSET)
-			txq_inline = MLX5_WQE_SIZE_MAX - MLX5_WQE_SIZE;
-		if (config->txqs_inline == MLX5_ARG_UNSET)
-			txqs_inline = MLX5_EMPW_MIN_TXQS;
-		if (config->inline_max_packet_sz == MLX5_ARG_UNSET)
-			inline_max_packet_sz = MLX5_EMPW_MAX_INLINE_LEN;
-		txq_ctrl->txq.mpw_hdr_dseg = config->mpw_hdr_dseg;
-		txq_ctrl->txq.inline_max_packet_sz = inline_max_packet_sz;
-	}
-	if (txq_inline && priv->txqs_n >= txqs_inline) {
-		unsigned int ds_cnt;
-
-		txq_ctrl->txq.max_inline =
-			((txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
-			 RTE_CACHE_LINE_SIZE);
-		if (is_empw_func) {
-			/* To minimize the size of data set, avoid requesting
-			 * too large WQ.
-			 */
-			txq_ctrl->max_inline_data =
-				((RTE_MIN(txq_inline,
-					  inline_max_packet_sz) +
-				  (RTE_CACHE_LINE_SIZE - 1)) /
-				 RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
-		} else {
-			txq_ctrl->max_inline_data =
-				txq_ctrl->txq.max_inline * RTE_CACHE_LINE_SIZE;
-		}
-		/*
-		 * Check if the inline size is too large in a way which
-		 * can make the WQE DS to overflow.
-		 * Considering in calculation:
-		 *      WQE CTRL (1 DS)
-		 *      WQE ETH  (1 DS)
-		 *      Inline part (N DS)
-		 */
-		ds_cnt = 2 + (txq_ctrl->txq.max_inline / MLX5_WQE_DWORD_SIZE);
-		if (ds_cnt > MLX5_DSEG_MAX) {
-			unsigned int max_inline = (MLX5_DSEG_MAX - 2) *
-						  MLX5_WQE_DWORD_SIZE;
-
-			max_inline = max_inline - (max_inline %
-						   RTE_CACHE_LINE_SIZE);
-			DRV_LOG(WARNING,
-				"port %u txq inline is too large (%d) setting"
-				" it to the maximum possible: %d\n",
-				PORT_ID(priv), txq_inline, max_inline);
-			txq_ctrl->txq.max_inline = max_inline /
-						   RTE_CACHE_LINE_SIZE;
-		}
-	}
-	if (tso) {
-		txq_ctrl->max_tso_header = max_tso_inline * RTE_CACHE_LINE_SIZE;
-		txq_ctrl->txq.max_inline = RTE_MAX(txq_ctrl->txq.max_inline,
-						   max_tso_inline);
-		txq_ctrl->txq.tso_en = 1;
-	}
-	txq_ctrl->txq.tunnel_en = config->tunnel_en | config->swp;
-	txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO |
-				 DEV_TX_OFFLOAD_UDP_TNL_TSO |
-				 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) &
-				txq_ctrl->txq.offloads) && config->swp;
+	(void)txq_ctrl;
 }
 
 /**
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 2/8] net/mlx5: add Tx datapath related devargs
  2019-07-17  6:53     ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 1/8] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
@ 2019-07-17  6:53       ` Viacheslav Ovsiienko
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 3/8] net/mlx5: update Tx datapath definitions Viacheslav Ovsiienko
                         ` (6 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-17  6:53 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch introduces new mlx5 PMD devarg options:

- txq_inline_min - specifies minimal amount of data to be inlined into
  WQE during Tx operations. NICs may require this minimal data amount
  to operate correctly. The exact value may depend on NIC operation mode,
  requested offloads, etc.

- txq_inline_max - specifies the maximal packet length to be completely
  inlined into WQE Ethernet Segment for ordinary SEND method. If the packet
  is larger than the specified value, the packet data won't be copied by the
  driver at all; the data buffer is addressed with a pointer. If the packet
  length is less than or equal to it, all packet data will be copied into WQE.

- txq_inline_mpw - specifies the maximal packet length to be completely
  inlined into WQE for Enhanced MPW method.

Driver documentation is also updated.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 doc/guides/nics/mlx5.rst | 168 +++++++++++++++++++++++++----------------------
 drivers/net/mlx5/mlx5.c  |  21 ++++++
 drivers/net/mlx5/mlx5.h  |   4 ++
 3 files changed, 113 insertions(+), 80 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 16aa390..e3c2a11 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -348,99 +348,107 @@ Run-time configuration
   greater or equal to this value. The default value is 12, valid only if
   ``mprq_en`` is set.
 
-- ``txq_inline`` parameter [int]
-
-  Amount of data to be inlined during TX operations. Improves latency.
-  Can improve PPS performance when PCI back pressure is detected and may be
-  useful for scenarios involving heavy traffic on many queues.
-
-  Because additional software logic is necessary to handle this mode, this
-  option should be used with care, as it can lower performance when back
-  pressure is not expected.
+- ``txq_inline_min`` parameter [int]
+
+  Minimal amount of data to be inlined into WQE during Tx operations. NICs
+  may require this minimal data amount to operate correctly. The exact value
+  may depend on NIC operation mode, requested offloads, etc.
+
+  If ``txq_inline_min`` key is present the specified value (may be aligned
+  by the driver in order not to exceed the limits and provide better descriptor
+  space utilization) will be used by the driver and it is guaranteed the
+  requested data bytes are inlined into the WQE beside other inline settings.
+
+  If ``txq_inline_min`` key is not present, the value may be queried by the
+  driver from the NIC via DevX if this feature is available. If there is no DevX
+  enabled/supported the value 18 (supposing L2 header including VLAN) is set
+  for ConnectX-4, value 58 (supposing L2-L4 headers, required by configurations
+  over E-Switch) is set for ConnectX-4 Lx, and 0 is set by default for ConnectX-5
+  and newer NICs. If packet is shorter than the ``txq_inline_min`` value, the entire
+  packet is inlined.
+
+  Please note that this minimal data inlining disengages the eMPW feature
+  (Enhanced Multi-Packet Write), because eMPW does not support partial packet
+  inlining. This is not very critical, since minimal data inlining is mostly
+  required by ConnectX-4 and ConnectX-4 Lx, and these NICs do not support eMPW.
 
 - ``txqs_min_inline`` parameter [int]
 
-  Enable inline send only when the number of TX queues is greater or equal
+  Enable inline data send only when the number of TX queues is greater or equal
   to this value.
 
-  This option should be used in combination with ``txq_inline`` above.
-
-  On ConnectX-4, ConnectX-4 LX, ConnectX-5, ConnectX-6 and BlueField without
-  Enhanced MPW:
-
-        - Disabled by default.
-        - In case ``txq_inline`` is set recommendation is 4.
+  This option should be used in combination with ``txq_inline_max`` and
+  ``txq_inline_mpw`` below and does not affect ``txq_inline_min`` settings above.
 
-  On ConnectX-5, ConnectX-6 and BlueField with Enhanced MPW:
+  If this option is not specified the default value 8 is used.
 
-        - Set to 8 by default.
+  Data inlining consumes CPU cycles, so this option is intended to enable
+  inlining automatically when there are enough Tx queues, which implies there
+  are enough CPU cores, PCI bandwidth becomes more critical, and the CPU
+  is no longer expected to be the bottleneck.
 
-- ``txqs_max_vec`` parameter [int]
+  The copying data into WQE improves latency and can improve PPS performance
+  when PCI back pressure is detected and may be useful for scenarios involving
+  heavy traffic on many queues.
 
-  Enable vectorized Tx only when the number of TX queues is less than or
-  equal to this value. Effective only when ``tx_vec_en`` is enabled.
-
-  On ConnectX-5:
-
-        - Set to 8 by default on ARMv8.
-        - Set to 4 by default otherwise.
-
-  On BlueField
+  Because additional software logic is necessary to handle this mode, this
+  option should be used with care, as it may lower performance when back
+  pressure is not expected.
 
-        - Set to 16 by default.
+- ``txq_inline_max`` parameter [int]
+
+  Specifies the maximal packet length to be completely inlined into WQE
+  Ethernet Segment for ordinary SEND method. If packet is larger than specified
+  value, the packet data won't be copied by the driver at all, data buffer
+  is addressed with a pointer. If packet length is less or equal all packet
+  data will be copied into WQE. This may improve PCI bandwidth utilization for
+  short packets significantly but requires the extra CPU cycles.
+
+  The data inline feature is controlled by the number of Tx queues: if the number
+  of Tx queues is greater than or equal to the ``txqs_min_inline`` key parameter,
+  the inline feature is engaged; if there are not enough Tx queues (which means
+  not enough CPU cores and scarce CPU resources), the driver does not inline data.
+  Assigning ``txqs_min_inline`` with zero always enables the data inline.
+
+  The default ``txq_inline_max`` value is 290. The specified value may be adjusted
+  by the driver in order not to exceed the limit (930 bytes) and to provide better
+  WQE space filling without gaps, the adjustment is reflected in the debug log.
+
+- ``txq_inline_mpw`` parameter [int]
+
+  Specifies the maximal packet length to be completely inlined into WQE for
+  Enhanced MPW method. If packet is larger than the specified value, the packet data
+  won't be copied, and data buffer is addressed with pointer. If packet length
+  is less or equal, all packet data will be copied into WQE. This may improve PCI
+  bandwidth utilization for short packets significantly but requires the extra
+  CPU cycles.
+
+  The data inline feature is controlled by the number of Tx queues: if the number
+  of Tx queues is greater than or equal to the ``txqs_min_inline`` key parameter,
+  the inline feature is engaged; if there are not enough Tx queues (which means
+  not enough CPU cores and scarce CPU resources), the driver does not inline data.
+  Assigning ``txqs_min_inline`` with zero always enables the data inline.
+
+  The default ``txq_inline_mpw`` value is 188. The specified value may be adjusted
+  by the driver in order not to exceed the limit (930 bytes) and to provide better
+  WQE space filling without gaps, the adjustment is reflected in the debug log.
+  Because multiple packets may be included in the same WQE with the Enhanced
+  Multi-Packet Write method and the overall WQE size is limited, it is not
+  recommended to specify large values for ``txq_inline_mpw``.
 
 - ``txq_mpw_en`` parameter [int]
 
-  A nonzero value enables multi-packet send (MPS) for ConnectX-4 Lx and
-  enhanced multi-packet send (Enhanced MPS) for ConnectX-5, ConnectX-6 and BlueField.
-  MPS allows the TX burst function to pack up multiple packets in a
-  single descriptor session in order to save PCI bandwidth and improve
-  performance at the cost of a slightly higher CPU usage. When
-  ``txq_inline`` is set along with ``txq_mpw_en``, TX burst function tries
-  to copy entire packet data on to TX descriptor instead of including
-  pointer of packet only if there is enough room remained in the
-  descriptor. ``txq_inline`` sets per-descriptor space for either pointers
-  or inlined packets. In addition, Enhanced MPS supports hybrid mode -
-  mixing inlined packets and pointers in the same descriptor.
-
-  This option cannot be used with certain offloads such as ``DEV_TX_OFFLOAD_TCP_TSO,
-  DEV_TX_OFFLOAD_VXLAN_TNL_TSO, DEV_TX_OFFLOAD_GRE_TNL_TSO, DEV_TX_OFFLOAD_VLAN_INSERT``.
-  When those offloads are requested the MPS send function will not be used.
-
-  It is currently only supported on the ConnectX-4 Lx, ConnectX-5, ConnectX-6 and BlueField
-  families of adapters.
-  On ConnectX-4 Lx the MPW is considered un-secure hence disabled by default.
-  Users which enable the MPW should be aware that application which provides incorrect
-  mbuf descriptors in the Tx burst can lead to serious errors in the host including, on some cases,
-  NIC to get stuck.
-  On ConnectX-5, ConnectX-6 and BlueField the MPW is secure and enabled by default.
-
-- ``txq_mpw_hdr_dseg_en`` parameter [int]
-
-  A nonzero value enables including two pointers in the first block of TX
-  descriptor. This can be used to lessen CPU load for memory copy.
-
-  Effective only when Enhanced MPS is supported. Disabled by default.
-
-- ``txq_max_inline_len`` parameter [int]
-
-  Maximum size of packet to be inlined. This limits the size of packet to
-  be inlined. If the size of a packet is larger than configured value, the
-  packet isn't inlined even though there's enough space remained in the
-  descriptor. Instead, the packet is included with pointer.
-
-  Effective only when Enhanced MPS is supported. The default value is 256.
-
-- ``tx_vec_en`` parameter [int]
-
-  A nonzero value enables Tx vector on ConnectX-5, ConnectX-6 and BlueField NICs if the number of
-  global Tx queues on the port is less than ``txqs_max_vec``.
-
-  This option cannot be used with certain offloads such as ``DEV_TX_OFFLOAD_TCP_TSO,
-  DEV_TX_OFFLOAD_VXLAN_TNL_TSO, DEV_TX_OFFLOAD_GRE_TNL_TSO, DEV_TX_OFFLOAD_VLAN_INSERT``.
-  When those offloads are requested the MPS send function will not be used.
-
-  Enabled by default on ConnectX-5, ConnectX-6 and BlueField.
+  A nonzero value enables Enhanced Multi-Packet Write (eMPW) for ConnectX-5,
+  ConnectX-6 and BlueField. eMPW allows the TX burst function to pack up multiple
+  packets in a single descriptor session in order to save PCI bandwidth and improve
+  performance at the cost of a slightly higher CPU usage. When ``txq_inline_mpw``
+  is set along with ``txq_mpw_en``, TX burst function copies entire packet
+  data on to TX descriptor instead of including pointer of packet.
+
+  The Enhanced Multi-Packet Write feature is enabled by default if the NIC
+  supports it; it can be disabled by explicitly setting ``txq_mpw_en`` to 0.
+  Also, if minimal data inlining is requested by non-zero ``txq_inline_min``
+  option or reported by the NIC, the eMPW feature is disengaged.
 
 - ``rx_vec_en`` parameter [int]
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index b118ee5..f1e125e 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -69,6 +69,15 @@
 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
 
+/* Device parameter to limit packet size to inline with ordinary SEND. */
+#define MLX5_TXQ_INLINE_MAX "txq_inline_max"
+
+/* Device parameter to configure minimal data size to inline. */
+#define MLX5_TXQ_INLINE_MIN "txq_inline_min"
+
+/* Device parameter to limit packet size to inline with Enhanced MPW. */
+#define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"
+
 /*
  * Device parameter to configure the number of TX queues threshold for
  * enabling inline send.
@@ -978,6 +987,12 @@ struct mlx5_dev_spawn_data {
 		config->mprq.max_memcpy_len = tmp;
 	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
 		config->mprq.min_rxqs_num = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
+		config->txq_inline_max = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
+		config->txq_inline_min = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
+		config->txq_inline_mpw = tmp;
 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
 		config->txqs_inline = tmp;
 	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
@@ -1026,6 +1041,9 @@ struct mlx5_dev_spawn_data {
 		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
 		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
 		MLX5_RXQS_MIN_MPRQ,
+		MLX5_TXQ_INLINE_MIN,
+		MLX5_TXQ_INLINE_MAX,
+		MLX5_TXQ_INLINE_MPW,
 		MLX5_TXQS_MIN_INLINE,
 		MLX5_TXQ_MPW_EN,
 		MLX5_RX_VEC_EN,
@@ -1984,6 +2002,9 @@ struct mlx5_dev_spawn_data {
 		.hw_padding = 0,
 		.mps = MLX5_ARG_UNSET,
 		.rx_vec_en = 1,
+		.txq_inline_max = MLX5_ARG_UNSET,
+		.txq_inline_min = MLX5_ARG_UNSET,
+		.txq_inline_mpw = MLX5_ARG_UNSET,
 		.txqs_inline = MLX5_ARG_UNSET,
 		.vf_nl_en = 1,
 		.mr_ext_memseg_en = 1,
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 211558e..eef9a55 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -198,6 +198,7 @@ struct mlx5_dev_config {
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
+	unsigned int tx_inline:1; /* Engage TX data inlining. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
 	unsigned int mr_ext_memseg_en:1;
 	/* Whether memseg should be extended for MR creation. */
@@ -223,6 +224,9 @@ struct mlx5_dev_config {
 	unsigned int ind_table_max_size; /* Maximum indirection table size. */
 	unsigned int max_dump_files_num; /* Maximum dump files per queue. */
 	int txqs_inline; /* Queue number threshold for inlining. */
+	int txq_inline_min; /* Minimal amount of data bytes to inline. */
+	int txq_inline_max; /* Max packet size for inlining with SEND. */
+	int txq_inline_mpw; /* Max packet size for inlining with eMPW. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 };
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 3/8] net/mlx5: update Tx datapath definitions
  2019-07-17  6:53     ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 1/8] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 2/8] net/mlx5: add Tx datapath related devargs Viacheslav Ovsiienko
@ 2019-07-17  6:53       ` Viacheslav Ovsiienko
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 4/8] net/mlx5: add Tx datapath configuration and setup Viacheslav Ovsiienko
                         ` (5 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-17  6:53 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch updates Tx datapath definitions, mostly hardware related.
The Tx descriptor structures are redefined with required fields,
size definitions are renamed to reflect the meanings in more
appropriate way. This is a preparation step before introducing
the new Tx datapath implementation.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_defs.h |   2 +-
 drivers/net/mlx5/mlx5_prm.h  | 163 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 151 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 6861304..873a595 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -58,7 +58,7 @@
 #define MLX5_MAX_XSTATS 32
 
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
-#define MLX5_MAX_TSO_HEADER 192
+#define MLX5_MAX_TSO_HEADER (128u + 34u)
 
 /* Threshold of buffer replenishment for vectorized Rx. */
 #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index dfd9317..9a43dcc 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -39,14 +39,84 @@
 /* Invalidate a CQE. */
 #define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
 
-/* WQE DWORD size */
-#define MLX5_WQE_DWORD_SIZE 16
-
-/* WQE size */
-#define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
+/* WQE Segment sizes in bytes. */
+#define MLX5_WSEG_SIZE 16u
+#define MLX5_WQE_CSEG_SIZE sizeof(struct mlx5_wqe_cseg)
+#define MLX5_WQE_DSEG_SIZE sizeof(struct mlx5_wqe_dseg)
+#define MLX5_WQE_ESEG_SIZE sizeof(struct mlx5_wqe_eseg)
+
+/* WQE/WQEBB size in bytes. */
+#define MLX5_WQE_SIZE sizeof(struct mlx5_wqe)
+
+/*
+ * Max size of a WQE session.
+ * Absolute maximum size is 63 (MLX5_DSEG_MAX) segments,
+ * the WQE size field in Control Segment is 6 bits wide.
+ */
+#define MLX5_WQE_SIZE_MAX (60 * MLX5_WSEG_SIZE)
+
+/*
+ * Default minimum number of Tx queues for inlining packets.
+ * If there are fewer queues than specified, we assume there
+ * are not enough CPU resources (cycles) to perform inlining,
+ * the PCIe throughput is not expected to be the bottleneck,
+ * and inlining is disabled.
+ */
+#define MLX5_EMPW_MIN_TXQS 8u
+
+/*
+ * Default packet length threshold to be inlined with
+ * enhanced MPW. If packet length exceeds the threshold
+ * the data are not inlined. Should be aligned to the WQEBB
+ * boundary, taking into account the title Control and
+ * Ethernet segments.
+ */
+#define MLX5_EMPW_DEF_INLINE_LEN (3U * MLX5_WQE_SIZE + \
+				  MLX5_DSEG_MIN_INLINE_SIZE - \
+				  MLX5_WQE_DSEG_SIZE)
+/*
+ * Maximal inline data length sent with enhanced MPW.
+ * Is based on maximal WQE size.
+ */
+#define MLX5_EMPW_MAX_INLINE_LEN (MLX5_WQE_SIZE_MAX - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE + \
+				  MLX5_DSEG_MIN_INLINE_SIZE)
+/*
+ * Minimal amount of packets to be sent with EMPW.
+ * This limits the minimal required size of sent EMPW.
+ * If there are not enough resources to build the minimal
+ * EMPW, the sending loop exits.
+ */
+#define MLX5_EMPW_MIN_PACKETS (2 + 3 * 4)
+#define MLX5_EMPW_MAX_PACKETS ((MLX5_WQE_SIZE_MAX - \
+				MLX5_WQE_CSEG_SIZE - \
+				MLX5_WQE_ESEG_SIZE) / \
+				MLX5_WSEG_SIZE)
+/*
+ * Default packet length threshold to be inlined with
+ * ordinary SEND. Inlining saves the MR key search
+ * and extra PCIe data fetch transaction, but eats the
+ * CPU cycles.
+ */
+#define MLX5_SEND_DEF_INLINE_LEN (5U * MLX5_WQE_SIZE + \
+				  MLX5_ESEG_MIN_INLINE_SIZE - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE)
+/*
+ * Maximal inline data length sent with ordinary SEND.
+ * Is based on maximal WQE size.
+ */
+#define MLX5_SEND_MAX_INLINE_LEN (MLX5_WQE_SIZE_MAX - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE + \
+				  MLX5_ESEG_MIN_INLINE_SIZE)
 
-#define MLX5_OPC_MOD_ENHANCED_MPSW 0
-#define MLX5_OPCODE_ENHANCED_MPSW 0x29
+/* Missing in mlx5dv.h, so it is defined here. */
+#define MLX5_OPCODE_ENHANCED_MPSW 0x29u
 
 /* CQE value to inform that VLAN is stripped. */
 #define MLX5_CQE_VLAN_STRIPPED (1u << 0)
@@ -114,6 +184,12 @@
 /* Inner L3 type is IPV6. */
 #define MLX5_ETH_WQE_L3_INNER_IPV6 (1u << 0)
 
+/* VLAN insertion flag. */
+#define MLX5_ETH_WQE_VLAN_INSERT (1u << 31)
+
+/* Data inline segment flag. */
+#define MLX5_ETH_WQE_DATA_INLINE (1u << 31)
+
 /* Is flow mark valid. */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 #define MLX5_FLOW_MARK_IS_VALID(val) ((val) & 0xffffff00)
@@ -130,12 +206,21 @@
 /* Default mark value used when none is provided. */
 #define MLX5_FLOW_MARK_DEFAULT 0xffffff
 
-/* Maximum number of DS in WQE. */
+/* Maximum number of DS in WQE. Limited by 6-bit field. */
 #define MLX5_DSEG_MAX 63
 
 /* The completion mode offset in the WQE control segment line 2. */
 #define MLX5_COMP_MODE_OFFSET 2
 
+/* Amount of data bytes in minimal inline data segment. */
+#define MLX5_DSEG_MIN_INLINE_SIZE 12
+
+/* Amount of data bytes in minimal inline eth segment. */
+#define MLX5_ESEG_MIN_INLINE_SIZE 18
+
+/* Amount of data bytes after eth data segment. */
+#define MLX5_ESEG_EXTRA_DATA_SIZE 32
+
 /* Completion mode. */
 enum mlx5_completion_mode {
 	MLX5_COMP_ONLY_ERR = 0x0,
@@ -144,11 +229,6 @@ enum mlx5_completion_mode {
 	MLX5_COMP_CQE_AND_EQE = 0x3,
 };
 
-/* Small common part of the WQE. */
-struct mlx5_wqe {
-	uint32_t ctrl[4];
-};
-
 /* MPW mode. */
 enum mlx5_mpw_mode {
 	MLX5_MPW_DISABLED,
@@ -156,6 +236,63 @@ enum mlx5_mpw_mode {
 	MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2. */
 };
 
+/* WQE Control segment. */
+struct mlx5_wqe_cseg {
+	uint32_t opcode;
+	uint32_t sq_ds;
+	uint32_t flags;
+	uint32_t misc;
+} __rte_packed __rte_aligned(MLX5_WSEG_SIZE);
+
+/* Header of data segment. Minimal size Data Segment */
+struct mlx5_wqe_dseg {
+	uint32_t bcount;
+	union {
+		uint8_t inline_data[MLX5_DSEG_MIN_INLINE_SIZE];
+		struct {
+			uint32_t lkey;
+			uint64_t pbuf;
+		} __rte_packed;
+	};
+} __rte_packed;
+
+/* Subset of struct WQE Ethernet Segment. */
+struct mlx5_wqe_eseg {
+	union {
+		struct {
+			uint32_t swp_offs;
+			uint8_t	cs_flags;
+			uint8_t	swp_flags;
+			uint16_t mss;
+			uint32_t metadata;
+			uint16_t inline_hdr_sz;
+			union {
+				uint16_t inline_data;
+				uint16_t vlan_tag;
+			};
+		} __rte_packed;
+		struct {
+			uint32_t offsets;
+			uint32_t flags;
+			uint32_t flow_metadata;
+			uint32_t inline_hdr;
+		} __rte_packed;
+	};
+} __rte_packed;
+
+/* The title WQEBB, header of WQE. */
+struct mlx5_wqe {
+	union {
+		struct mlx5_wqe_cseg cseg;
+		uint32_t ctrl[4];
+	};
+	struct mlx5_wqe_eseg eseg;
+	union {
+		struct mlx5_wqe_dseg dseg[2];
+		uint8_t data[MLX5_ESEG_EXTRA_DATA_SIZE];
+	};
+} __rte_packed;
+
 /* WQE for Multi-Packet RQ. */
 struct mlx5_wqe_mprq {
 	struct mlx5_wqe_srq_next_seg next_seg;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 4/8] net/mlx5: add Tx datapath configuration and setup
  2019-07-17  6:53     ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                         ` (2 preceding siblings ...)
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 3/8] net/mlx5: update Tx datapath definitions Viacheslav Ovsiienko
@ 2019-07-17  6:53       ` Viacheslav Ovsiienko
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 5/8] net/mlx5: introduce Tx burst routine template Viacheslav Ovsiienko
                         ` (4 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-17  6:53 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch updates the Tx datapath control and configuration
structures and code for managing Tx datapath settings.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c |   4 +-
 drivers/net/mlx5/mlx5_rxtx.h |  55 +++++++++----
 drivers/net/mlx5/mlx5_txq.c  | 182 ++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 212 insertions(+), 29 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index f2d6918..13f9431 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -507,7 +507,7 @@
 			MKSTR(err_str, "Unexpected CQE error syndrome "
 			      "0x%02x CQN = %u SQN = %u wqe_counter = %u "
 			      "wq_ci = %u cq_ci = %u", err_cqe->syndrome,
-			      txq_ctrl->cqn, txq->qp_num_8s >> 8,
+			      txq->cqe_s, txq->qp_num_8s >> 8,
 			      rte_be_to_cpu_16(err_cqe->wqe_counter),
 			      txq->wqe_ci, txq->cq_ci);
 			MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u",
@@ -516,7 +516,7 @@
 			mlx5_dump_debug_information(name, NULL, err_str, 0);
 			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
 						    (const void *)((uintptr_t)
-						    &(*txq->cqes)[0]),
+						    txq->cqes),
 						    sizeof(*err_cqe) *
 						    (1 << txq->cqe_n));
 			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index acde09d..d8c6f35 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -188,37 +188,60 @@ struct mlx5_hrxq {
 	uint8_t rss_key[]; /* Hash key. */
 };
 
+/* TX queue send local data. */
+__extension__
+struct mlx5_txq_local {
+	struct mlx5_wqe *wqe_last; /* last sent WQE pointer. */
+	struct rte_mbuf *mbuf; /* first mbuf to process. */
+	uint16_t pkts_copy; /* packets copied to elts. */
+	uint16_t pkts_sent; /* packets sent. */
+	uint16_t elts_free; /* available elts remain. */
+	uint16_t wqe_free; /* available wqe remain. */
+	uint16_t mbuf_off; /* data offset in current mbuf. */
+	uint16_t mbuf_nseg; /* number of remaining mbuf. */
+};
+
 /* TX queue descriptor. */
 __extension__
 struct mlx5_txq_data {
 	uint16_t elts_head; /* Current counter in (*elts)[]. */
 	uint16_t elts_tail; /* Counter of first element awaiting completion. */
-	uint16_t elts_comp; /* Counter since last completion request. */
-	uint16_t mpw_comp; /* WQ index since last completion request. */
+	uint16_t elts_comp; /* elts index since last completion request. */
+	uint16_t elts_s; /* Number of mbuf elements. */
+	uint16_t elts_m; /* Mask for mbuf elements indices. */
+	/* Fields related to elts mbuf storage. */
+	uint16_t wqe_ci; /* Consumer index for work queue. */
+	uint16_t wqe_pi; /* Producer index for work queue. */
+	uint16_t wqe_s; /* Number of WQ elements. */
+	uint16_t wqe_m; /* Mask Number for WQ elements. */
+	uint16_t wqe_comp; /* WQE index since last completion request. */
+	uint16_t wqe_thres; /* WQE threshold to request completion in CQ. */
+	/* WQ related fields. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
 #ifndef NDEBUG
-	uint16_t cq_pi; /* Producer index for completion queue. */
+	uint16_t cq_pi; /* Counter of issued CQE "always" requests. */
 #endif
-	uint16_t wqe_ci; /* Consumer index for work queue. */
-	uint16_t wqe_pi; /* Producer index for work queue. */
-	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
+	uint16_t cqe_s; /* Number of CQ elements. */
+	uint16_t cqe_m; /* Mask for CQ indices. */
+	/* CQ related fields. */
+	uint16_t elts_n:4; /* elts[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
-	uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */
+	uint16_t wqe_n:4; /* Number of WQ elements (in log2). */
 	uint16_t tso_en:1; /* When set hardware TSO is enabled. */
 	uint16_t tunnel_en:1;
 	/* When set TX offload for tunneled packets are supported. */
 	uint16_t swp_en:1; /* Whether SW parser is enabled. */
-	uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
-	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
-	uint16_t inline_max_packet_sz; /* Max packet size for inlining. */
+	uint16_t inlen_send; /* Ordinary send data inline size. */
+	uint16_t inlen_empw; /* eMPW max packet size to inline. */
+	uint16_t inlen_mode; /* Minimal data length to inline. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	uint64_t offloads; /* Offloads for Tx Queue. */
 	struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
-	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
-	volatile void *wqes; /* Work queue (use volatile to write into). */
+	struct mlx5_wqe *wqes; /* Work queue. */
+	struct mlx5_wqe *wqes_end; /* Work queue array limit. */
+	volatile struct mlx5_cqe *cqes; /* Completion queue. */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
-	struct rte_mbuf *(*elts)[]; /* TX elements. */
 	uint16_t port_id; /* Port ID of device. */
 	uint16_t idx; /* Queue index. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
@@ -226,6 +249,8 @@ struct mlx5_txq_data {
 	rte_spinlock_t *uar_lock;
 	/* UAR access lock required for 32bit implementations */
 #endif
+	struct rte_mbuf *elts[0];
+	/* Storage for queued packets, must be the last field. */
 } __rte_cache_aligned;
 
 /* Verbs Rx queue elements. */
@@ -239,7 +264,6 @@ struct mlx5_txq_ibv {
 
 /* TX queue control descriptor. */
 struct mlx5_txq_ctrl {
-	struct mlx5_txq_data txq; /* Data path structure. */
 	LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */
 	rte_atomic32_t refcnt; /* Reference counter. */
 	unsigned int socket; /* CPU socket ID for allocations. */
@@ -249,8 +273,9 @@ struct mlx5_txq_ctrl {
 	struct mlx5_priv *priv; /* Back pointer to private data. */
 	off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
 	void *bf_reg; /* BlueFlame register from Verbs. */
-	uint32_t cqn; /* CQ number. */
 	uint16_t dump_file_n; /* Number of dump files. */
+	struct mlx5_txq_data txq; /* Data path structure. */
+	/* Must be the last field in the structure, contains elts[]. */
 };
 
 #define MLX5_TX_BFREG(txq) \
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 0d2dbfa..b12c070 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -47,7 +47,7 @@
 	unsigned int i;
 
 	for (i = 0; (i != elts_n); ++i)
-		(*txq_ctrl->txq.elts)[i] = NULL;
+		txq_ctrl->txq.elts[i] = NULL;
 	DRV_LOG(DEBUG, "port %u Tx queue %u allocated and configured %u WRs",
 		PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx, elts_n);
 	txq_ctrl->txq.elts_head = 0;
@@ -68,7 +68,7 @@
 	const uint16_t elts_m = elts_n - 1;
 	uint16_t elts_head = txq_ctrl->txq.elts_head;
 	uint16_t elts_tail = txq_ctrl->txq.elts_tail;
-	struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;
+	struct rte_mbuf *(*elts)[elts_n] = &txq_ctrl->txq.elts;
 
 	DRV_LOG(DEBUG, "port %u Tx queue %u freeing WRs",
 		PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx);
@@ -411,7 +411,8 @@ struct mlx5_txq_ibv *
 	attr.cq = (struct ibv_cq_init_attr_ex){
 		.comp_mask = 0,
 	};
-	cqe_n = desc / MLX5_TX_COMP_THRESH + 1;
+	cqe_n = desc / MLX5_TX_COMP_THRESH +
+		1 + MLX5_TX_COMP_THRESH_INLINE_DIV;
 	tmpl.cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0);
 	if (tmpl.cq == NULL) {
 		DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure",
@@ -449,7 +450,7 @@ struct mlx5_txq_ibv *
 		.pd = priv->sh->pd,
 		.comp_mask = IBV_QP_INIT_ATTR_PD,
 	};
-	if (txq_data->max_inline)
+	if (txq_data->inlen_send)
 		attr.init.cap.max_inline_data = txq_ctrl->max_inline_data;
 	if (txq_data->tso_en) {
 		attr.init.max_tso_header = txq_ctrl->max_tso_header;
@@ -523,25 +524,29 @@ struct mlx5_txq_ibv *
 		goto error;
 	}
 	txq_data->cqe_n = log2above(cq_info.cqe_cnt);
+	txq_data->cqe_s = 1 << txq_data->cqe_n;
+	txq_data->cqe_m = txq_data->cqe_s - 1;
 	txq_data->qp_num_8s = tmpl.qp->qp_num << 8;
 	txq_data->wqes = qp.sq.buf;
 	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
+	txq_data->wqe_s = 1 << txq_data->wqe_n;
+	txq_data->wqe_m = txq_data->wqe_s - 1;
+	txq_data->wqes_end = txq_data->wqes + txq_data->wqe_s;
 	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
 	txq_data->cq_db = cq_info.dbrec;
-	txq_data->cqes =
-		(volatile struct mlx5_cqe (*)[])
-		(uintptr_t)cq_info.buf;
+	txq_data->cqes = (volatile struct mlx5_cqe *)cq_info.buf;
 	txq_data->cq_ci = 0;
 #ifndef NDEBUG
 	txq_data->cq_pi = 0;
 #endif
 	txq_data->wqe_ci = 0;
 	txq_data->wqe_pi = 0;
+	txq_data->wqe_comp = 0;
+	txq_data->wqe_thres = txq_data->wqe_s / MLX5_TX_COMP_THRESH_INLINE_DIV;
 	txq_ibv->qp = tmpl.qp;
 	txq_ibv->cq = tmpl.cq;
 	rte_atomic32_inc(&txq_ibv->refcnt);
 	txq_ctrl->bf_reg = qp.bf.reg;
-	txq_ctrl->cqn = cq_info.cqn;
 	txq_uar_init(txq_ctrl);
 	if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
 		txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset;
@@ -663,7 +668,11 @@ struct mlx5_txq_ibv *
 	unsigned int wqe_size;
 	const unsigned int desc = 1 << txq_ctrl->txq.elts_n;
 
-	wqe_size = MLX5_WQE_SIZE + txq_ctrl->max_inline_data;
+	wqe_size = MLX5_WQE_CSEG_SIZE +
+		   MLX5_WQE_ESEG_SIZE +
+		   MLX5_WSEG_SIZE -
+		   MLX5_ESEG_MIN_INLINE_SIZE +
+		   txq_ctrl->max_inline_data;
 	return rte_align32pow2(wqe_size * desc) / MLX5_WQE_SIZE;
 }
 
@@ -676,7 +685,156 @@ struct mlx5_txq_ibv *
 static void
 txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
 {
-	(void)txq_ctrl;
+	struct mlx5_priv *priv = txq_ctrl->priv;
+	struct mlx5_dev_config *config = &priv->config;
+	unsigned int inlen_send; /* Inline data for ordinary SEND.*/
+	unsigned int inlen_empw; /* Inline data for enhanced MPW. */
+	unsigned int inlen_mode; /* Minimal required Inline data. */
+	unsigned int txqs_inline; /* Min Tx queues to enable inline. */
+	int tso = !!(txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO |
+					       DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+					       DEV_TX_OFFLOAD_GRE_TNL_TSO |
+					       DEV_TX_OFFLOAD_IP_TNL_TSO |
+					       DEV_TX_OFFLOAD_UDP_TNL_TSO));
+	unsigned int temp;
+
+	txqs_inline = (config->txqs_inline == MLX5_ARG_UNSET) ?
+		      MLX5_EMPW_MIN_TXQS :
+		      (unsigned int)config->txqs_inline;
+	inlen_send = (config->txq_inline_max == MLX5_ARG_UNSET) ?
+		     MLX5_SEND_DEF_INLINE_LEN :
+		     (unsigned int)config->txq_inline_max;
+	inlen_empw = (config->txq_inline_mpw == MLX5_ARG_UNSET) ?
+		     MLX5_EMPW_DEF_INLINE_LEN :
+		     (unsigned int)config->txq_inline_mpw;
+	inlen_mode = (config->txq_inline_min == MLX5_ARG_UNSET) ?
+		     0 : (unsigned int)config->txq_inline_min;
+
+	/*
+	 * If there is requested minimal amount of data to inline
+	 * we MUST enable inlining. This is a case for ConnectX-4
+	 * which usually requires L2 inlined for correct operating
+	 * and ConnectX-4LX which requires L2-L4 inlined to
+	 * support E-Switch Flows.
+	 */
+	if (inlen_mode) {
+		if (inlen_mode <= MLX5_ESEG_MIN_INLINE_SIZE) {
+			/*
+			 * Optimize minimal inlining for single
+			 * segment packets to fill one WQEBB
+			 * without gaps.
+			 */
+			temp = MLX5_ESEG_MIN_INLINE_SIZE;
+		} else {
+			temp = inlen_mode - MLX5_ESEG_MIN_INLINE_SIZE;
+			temp = RTE_ALIGN(temp, MLX5_WSEG_SIZE) +
+			       MLX5_ESEG_MIN_INLINE_SIZE;
+			temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
+		}
+		if (temp != inlen_mode) {
+			DRV_LOG(INFO,
+				"port %u minimal required inline setting"
+				" aligned from %u to %u",
+				PORT_ID(priv), inlen_mode, temp);
+			inlen_mode = temp;
+		}
+	}
+	/*
+	 * If there are few Tx queues it is prioritized
+	 * to save CPU cycles and disable data inlining at all.
+	 */
+	if (inlen_send && priv->txqs_n >= txqs_inline) {
+		/*
+		 * The data sent with ordinary MLX5_OPCODE_SEND
+		 * may be inlined in Ethernet Segment, align the
+		 * length accordingly to fit entire WQEBBs.
+		 */
+		temp = (inlen_send / MLX5_WQE_SIZE) * MLX5_WQE_SIZE +
+			MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE;
+		temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
+				     MLX5_ESEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE * 2);
+		temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
+		temp = RTE_MAX(temp, inlen_mode);
+		if (temp != inlen_send) {
+			DRV_LOG(INFO,
+				"port %u ordinary send inline setting"
+				" aligned from %u to %u",
+				PORT_ID(priv), inlen_send, temp);
+			inlen_send = temp;
+		}
+		/*
+		 * Not aligned to cache lines, but to WQEs.
+		 * First bytes of data (initial alignment)
+		 * is going to be copied explicitly at the
+		 * beginning of inlining buffer in Ethernet
+		 * Segment.
+		 */
+		assert(inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+		assert(inlen_send <= MLX5_WQE_SIZE_MAX +
+				     MLX5_ESEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE * 2);
+		txq_ctrl->txq.inlen_send = inlen_send;
+		txq_ctrl->txq.inlen_mode = inlen_mode;
+	} else {
+		/*
+		 * If minimal inlining is requested we must
+		 * enable inlining in general, despite the
+		 * number of configured queues.
+		 */
+		txq_ctrl->txq.inlen_send = inlen_mode;
+		txq_ctrl->txq.inlen_mode = inlen_mode;
+		txq_ctrl->txq.inlen_empw = 0;
+		inlen_send = 0;
+		inlen_empw = 0;
+	}
+	if (inlen_send && inlen_empw && priv->txqs_n >= txqs_inline) {
+		/*
+		 * The data sent with MLX5_OPCODE_ENHANCED_MPSW
+		 * may be inlined in Data Segment, align the
+		 * length accordingly to fit entire WQEBBs.
+		 */
+		temp = (inlen_empw + MLX5_WQE_SIZE - 1) / MLX5_WQE_SIZE;
+		temp = temp * MLX5_WQE_SIZE +
+		       MLX5_DSEG_MIN_INLINE_SIZE - MLX5_WQE_DSEG_SIZE;
+		temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
+				     MLX5_DSEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE);
+		temp = RTE_MIN(temp, MLX5_EMPW_MAX_INLINE_LEN);
+		if (temp != inlen_empw) {
+			DRV_LOG(INFO,
+				"port %u enhanced empw inline setting"
+				" aligned from %u to %u",
+				PORT_ID(priv), inlen_empw, temp);
+			inlen_empw = temp;
+		}
+		assert(inlen_empw >= MLX5_ESEG_MIN_INLINE_SIZE);
+		assert(inlen_empw <= MLX5_WQE_SIZE_MAX +
+				     MLX5_DSEG_MIN_INLINE_SIZE -
+				     MLX5_WQE_CSEG_SIZE -
+				     MLX5_WQE_ESEG_SIZE -
+				     MLX5_WQE_DSEG_SIZE);
+		txq_ctrl->txq.inlen_empw = inlen_empw;
+	}
+	txq_ctrl->max_inline_data = RTE_MAX(inlen_send, inlen_empw);
+	if (tso) {
+		txq_ctrl->max_tso_header = MLX5_MAX_TSO_HEADER;
+		txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->max_inline_data,
+						    MLX5_MAX_TSO_HEADER);
+		txq_ctrl->txq.tso_en = 1;
+	}
+	txq_ctrl->txq.tunnel_en = config->tunnel_en | config->swp;
+	txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO |
+				 DEV_TX_OFFLOAD_UDP_TNL_TSO |
+				 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) &
+				txq_ctrl->txq.offloads) && config->swp;
+	config->tx_inline = txq_ctrl->txq.inlen_send ? 1 : 0;
 }
 
 /**
@@ -724,6 +882,8 @@ struct mlx5_txq_ctrl *
 	tmpl->priv = priv;
 	tmpl->socket = socket;
 	tmpl->txq.elts_n = log2above(desc);
+	tmpl->txq.elts_s = desc;
+	tmpl->txq.elts_m = desc - 1;
 	tmpl->txq.port_id = dev->data->port_id;
 	tmpl->txq.idx = idx;
 	txq_set_params(tmpl);
@@ -737,8 +897,6 @@ struct mlx5_txq_ctrl *
 		rte_errno = ENOMEM;
 		goto error;
 	}
-	tmpl->txq.elts =
-		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])(tmpl + 1);
 	rte_atomic32_inc(&tmpl->refcnt);
 	LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
 	return tmpl;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 5/8] net/mlx5: introduce Tx burst routine template
  2019-07-17  6:53     ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                         ` (3 preceding siblings ...)
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 4/8] net/mlx5: add Tx datapath configuration and setup Viacheslav Ovsiienko
@ 2019-07-17  6:53       ` Viacheslav Ovsiienko
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 6/8] net/mlx5: implement Tx burst template Viacheslav Ovsiienko
                         ` (3 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-17  6:53 UTC (permalink / raw)
  To: dev; +Cc: yskoh

Mellanox NICs support the wide set of Tx offloads. The supported
offloads are reported by the mlx5 PMD in rte_eth_dev_info tx_offload_capa
field. An application may choose any combination of supported offloads
and configure the device appropriately. Some of the Tx offloads may
not be requested by the application, or even all of them may be omitted.
Most of the Tx offloads require some code branches in tx_burst routine
to support them. If a Tx offload is not requested the tx_burst routine
code may be significantly simplified and consume fewer CPU cycles.

For example, if the application does not engage the TSO offload this code
can be omitted, if multi-segment packets are not expected the tx_burst
may assume single mbuf packets only, etc.

Currently, the mlx5 PMD implements multiple tx_burst subroutines
for most common combinations of requested Tx offloads, each branch
has its own dedicated implementation. It is not very easy to update,
support and develop such kind of code - multiple branches impose
the multiple points to process. Also many of frequently requested
offload combinations are not supported yet. That leads to selecting of
not completely matching tx_burst routine and harms the performance.

This patch introduces the new approach for tx_burst code. It is proposed
to develop the unified template for tx_burst routine, which supports
all the Tx offloads and takes the compile time defined parameter
describing the supposed set of supported offloads. On the base
of this template, the compiler is able to generate multiple tx_burst
routines highly optimized for the statically specified set of Tx offloads.
Next, in runtime, at Tx queue configuration the best matching optimized
implementation of tx_burst is chosen.

This patch intentionally omits the template internal implementation,
but just introduces the template itself to outline the approach of
the multiple specially tuned tx_burst routines.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 511 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 507 insertions(+), 4 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 13f9431..af6f705 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2015 6WIND S.A.
- * Copyright 2015 Mellanox Technologies, Ltd
+ * Copyright 2015-2019 Mellanox Technologies, Ltd
  */
 
 #include <assert.h>
@@ -34,6 +34,57 @@
 #include "mlx5_defs.h"
 #include "mlx5_prm.h"
 
+/* TX burst subroutines return codes. */
+enum mlx5_txcmp_code {
+	MLX5_TXCMP_CODE_EXIT = 0,
+	MLX5_TXCMP_CODE_ERROR,
+	MLX5_TXCMP_CODE_SINGLE,
+	MLX5_TXCMP_CODE_MULTI,
+	MLX5_TXCMP_CODE_TSO,
+	MLX5_TXCMP_CODE_EMPW,
+};
+
+/*
+ * These defines are used to configure Tx burst routine option set
+ * supported at compile time. Options that are not specified are optimized
+ * out because the if conditions can be evaluated at compile time.
+ * The offloads with bigger runtime check (require more CPU cycles to
+ * skip) overhead should have the bigger index - this is needed to
+ * select the better matching routine function if no exact match and
+ * some offloads are not actually requested.
+ */
+#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/
+#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/
+#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/
+#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
+#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
+#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/
+#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
+#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
+
+/* The most common offloads groups. */
+#define MLX5_TXOFF_CONFIG_NONE 0
+#define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \
+				MLX5_TXOFF_CONFIG_TSO | \
+				MLX5_TXOFF_CONFIG_SWP | \
+				MLX5_TXOFF_CONFIG_CSUM | \
+				MLX5_TXOFF_CONFIG_INLINE | \
+				MLX5_TXOFF_CONFIG_VLAN | \
+				MLX5_TXOFF_CONFIG_METADATA)
+
+#define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask)
+
+#define MLX5_TXOFF_DECL(func, olx) \
+static uint16_t mlx5_tx_burst_##func(void *txq, \
+				     struct rte_mbuf **pkts, \
+				    uint16_t pkts_n) \
+{ \
+	return mlx5_tx_burst_tmpl((struct mlx5_txq_data *restrict)txq, \
+		    pkts, pkts_n, (olx)); \
+}
+
+#define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},
+
 static __rte_always_inline uint32_t
 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
 
@@ -1531,7 +1582,323 @@
 }
 
 /**
- * Configure the TX function to use.
+ * DPDK Tx callback template. This is configured template
+ * used to generate routines optimized for specified offload setup.
+ * One of this generated functions is chosen at SQ configuration
+ * time.
+ *
+ * @param txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param olx
+ *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
+ *   values. Should be static to take compile time static configuration
+ *   advantages.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+static __rte_always_inline uint16_t
+mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
+		   struct rte_mbuf **restrict pkts,
+		   uint16_t pkts_n,
+		   unsigned int olx)
+{
+	(void)txq;
+	(void)pkts;
+	(void)pkts_n;
+	(void)olx;
+	return 0;
+}
+
+/* Generate routines with Enhanced Multi-Packet Write support. */
+MLX5_TXOFF_DECL(full_empw,
+		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(none_empw,
+		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(md_empw,
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mt_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtsc_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mti_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtiv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sc_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sci_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(scv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sciv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(i_empw,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(v_empw,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(iv_empw,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+/* Generate routines without Enhanced Multi-Packet Write support. */
+MLX5_TXOFF_DECL(full,
+		MLX5_TXOFF_CONFIG_FULL)
+
+MLX5_TXOFF_DECL(none,
+		MLX5_TXOFF_CONFIG_NONE)
+
+MLX5_TXOFF_DECL(md,
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mt,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mtsc,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mti,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(mtv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(mtiv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(sc,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(sci,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(scv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_DECL(sciv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(i,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(v,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(iv,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+/*
+ * Array of declared and compiled Tx burst function and corresponding
+ * supported offloads set. The array is used to select the Tx burst
+ * function for specified offloads set at Tx queue configuration time.
+ */
+const struct {
+	eth_tx_burst_t func;
+	unsigned int olx;
+} txoff_func[] = {
+MLX5_TXOFF_INFO(full_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(none_empw,
+		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(md_empw,
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mt_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtsc_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mti_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtiv_empw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sc_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sci_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(scv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sciv_empw,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(i_empw,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(v_empw,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(iv_empw,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(full,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(none,
+		MLX5_TXOFF_CONFIG_NONE)
+
+MLX5_TXOFF_INFO(md,
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mt,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mtsc,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mti,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+
+MLX5_TXOFF_INFO(mtv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mtiv,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sc,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sci,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(scv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sciv,
+		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(i,
+		MLX5_TXOFF_CONFIG_INLINE |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(v,
+		MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(iv,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA)
+};
+
+/**
+ * Configure the Tx function to use. The routine checks configured
+ * Tx offloads for the device and selects appropriate Tx burst
+ * routine. There are multiple Tx burst routines compiled from
+ * the same template in the most optimal way for the dedicated
+ * Tx offloads set.
  *
  * @param dev
  *   Pointer to private data structure.
@@ -1542,8 +1909,144 @@
 eth_tx_burst_t
 mlx5_select_tx_function(struct rte_eth_dev *dev)
 {
-	(void)dev;
-	return removed_tx_burst;
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_config *config = &priv->config;
+	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
+	unsigned int diff = 0, olx = 0, i, m;
+
+	static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <=
+		      MLX5_DSEG_MAX, "invalid WQE max size");
+	static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE,
+		      "invalid WQE Control Segment size");
+	static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE,
+		      "invalid WQE Ethernet Segment size");
+	static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE,
+		      "invalid WQE Data Segment size");
+	static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE,
+		      "invalid WQE size");
+	assert(priv);
+	if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) {
+		/* We should support Multi-Segment Packets. */
+		olx |= MLX5_TXOFF_CONFIG_MULTI;
+	}
+	if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
+			   DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+			   DEV_TX_OFFLOAD_GRE_TNL_TSO |
+			   DEV_TX_OFFLOAD_IP_TNL_TSO |
+			   DEV_TX_OFFLOAD_UDP_TNL_TSO)) {
+		/* We should support TCP Segmentation Offload. */
+		olx |= MLX5_TXOFF_CONFIG_TSO;
+	}
+	if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
+			   DEV_TX_OFFLOAD_UDP_TNL_TSO |
+			   DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
+		/* We should support Software Parser for Tunnels. */
+		olx |= MLX5_TXOFF_CONFIG_SWP;
+	}
+	if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
+			   DEV_TX_OFFLOAD_UDP_CKSUM |
+			   DEV_TX_OFFLOAD_TCP_CKSUM |
+			   DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
+		/* We should support IP/TCP/UDP Checksums. */
+		olx |= MLX5_TXOFF_CONFIG_CSUM;
+	}
+	if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) {
+		/* We should support VLAN insertion. */
+		olx |= MLX5_TXOFF_CONFIG_VLAN;
+	}
+	if (config->tx_inline) {
+		/*
+		 * Data inlining is enabled by default.
+		 * Required inline data are disabled by default.
+		 */
+		olx |= MLX5_TXOFF_CONFIG_INLINE;
+	}
+	if (config->mps == MLX5_MPW_ENHANCED &&
+	    config->txq_inline_min <= 0) {
+		/*
+		 * The NIC supports Enhanced Multi-Packet Write.
+		 * We do not support legacy MPW due to its
+		 * hardware related problems, so we just ignore
+		 * legacy MLX5_MPW settings. There should be no
+		 * minimal required inline data.
+		 */
+		olx |= MLX5_TXOFF_CONFIG_EMPW;
+	}
+	if (tx_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) {
+		/* We should support Flow metadata. */
+		olx |= MLX5_TXOFF_CONFIG_METADATA;
+	}
+	/*
+	 * Scan the routines table to find the minimal
+	 * satisfying routine with requested offloads.
+	 */
+	m = RTE_DIM(txoff_func);
+	for (i = 0; i < RTE_DIM(txoff_func); i++) {
+		unsigned int tmp;
+
+		tmp = txoff_func[i].olx;
+		if (tmp == olx) {
+			/* Meets requested offloads exactly. */
+			m = i;
+			break;
+		}
+		if ((tmp & olx) != olx) {
+			/* Does not meet requested offloads at all. */
+			continue;
+		}
+		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
+			/* Do not enable eMPW if not configured. */
+			continue;
+		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
+			/* Do not enable inlining if not configured. */
+			continue;
+		/*
+		 * Some routine meets the requirements.
+		 * Prefer the minimal amount of not requested
+		 * offloads; ties go to the lowest differing bit.
+		 */
+		tmp = __builtin_popcountl(tmp & ~olx);
+		if (m >= RTE_DIM(txoff_func) || tmp < diff) {
+			/* First or better match, save and continue. */
+			m = i;
+			diff = tmp;
+			continue;
+		}
+		if (tmp == diff) {
+			tmp = txoff_func[i].olx ^ txoff_func[m].olx;
+			if (__builtin_ffsl(txoff_func[i].olx & tmp) <
+			    __builtin_ffsl(txoff_func[m].olx & tmp)) {
+				/* Lighter (lower bit) not requested offload. */
+				m = i;
+			}
+		}
+	}
+	if (m >= RTE_DIM(txoff_func)) {
+		DRV_LOG(DEBUG, "port %u has no selected Tx function"
+			       " for requested offloads %04X",
+				dev->data->port_id, olx);
+		return NULL;
+	}
+	DRV_LOG(DEBUG, "port %u has selected Tx function"
+		       " supporting offloads %04X/%04X",
+			dev->data->port_id, olx, txoff_func[m].olx);
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI)
+		DRV_LOG(DEBUG, "\tMULTI (multi segment)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO)
+		DRV_LOG(DEBUG, "\tTSO   (TCP send offload)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP)
+		DRV_LOG(DEBUG, "\tSWP   (software parser)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM)
+		DRV_LOG(DEBUG, "\tCSUM  (checksum offload)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE)
+		DRV_LOG(DEBUG, "\tINLIN (inline data)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN)
+		DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
+		DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW)
+		DRV_LOG(DEBUG, "\tEMPW  (Enhanced MPW)");
+	return txoff_func[m].func;
 }
 
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 6/8] net/mlx5: implement Tx burst template
  2019-07-17  6:53     ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                         ` (4 preceding siblings ...)
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 5/8] net/mlx5: introduce Tx burst routine template Viacheslav Ovsiienko
@ 2019-07-17  6:53       ` Viacheslav Ovsiienko
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 7/8] net/mlx5: add minimal required Tx data inline Viacheslav Ovsiienko
                         ` (2 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-17  6:53 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch adds the implementation of tx_burst routine template.
The template supports all Tx offloads and multiple optimized
tx_burst routines can be generated by compiler from this one.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 2897 +++++++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h |    5 +-
 2 files changed, 2873 insertions(+), 29 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index af6f705..2674ea8 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -339,6 +339,109 @@ enum mlx5_txcmp_code {
 }
 
 /**
+ * Set Software Parser flags and offsets in Ethernet Segment of WQE.
+ * Flags must be initialized to zero beforehand.
+ *
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param swp_flags
+ *   Pointer to store Software Parser flags
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Software Parser offsets packed in dword.
+ *   Software Parser flags are set by pointer.
+ */
+static __rte_always_inline uint32_t
+txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc,
+		uint8_t *swp_flags,
+		unsigned int olx)
+{
+	uint64_t ol, tunnel;
+	unsigned int idx, off;
+	uint32_t set;
+
+	if (!MLX5_TXOFF_CONFIG(SWP))
+		return 0;
+	ol = loc->mbuf->ol_flags;
+	tunnel = ol & PKT_TX_TUNNEL_MASK;
+	/*
+	 * Check whether Software Parser is required.
+	 * Only customized tunnels may ask for it.
+	 */
+	if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP))
+		return 0;
+	/*
+	 * The index should have:
+	 * bit[0:1] = PKT_TX_L4_MASK
+	 * bit[4] = PKT_TX_IPV6
+	 * bit[8] = PKT_TX_OUTER_IPV6
+	 * bit[9] = set when the tunnel type is PKT_TX_TUNNEL_UDP
+	 */
+	idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52;
+	idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0;
+	*swp_flags = mlx5_swp_types_table[idx];
+	/*
+	 * Set offsets for SW parser. Since ConnectX-5, SW parser just
+	 * complements HW parser. SW parser starts to engage only if HW parser
+	 * can't reach a header. For the older devices, HW parser will not kick
+	 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
+	 * should be set regardless of HW offload.
+	 */
+	off = loc->mbuf->outer_l2_len;
+	if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT)
+		off += sizeof(struct rte_vlan_hdr);
+	set = (off >> 1) << 8; /* Outer L3 offset, in 2-byte units. */
+	off += loc->mbuf->outer_l3_len;
+	if (tunnel == PKT_TX_TUNNEL_UDP)
+		set |= off >> 1; /* Outer L4 offset. */
+	if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */
+		const uint64_t csum = ol & PKT_TX_L4_MASK;
+			off += loc->mbuf->l2_len;
+		set |= (off >> 1) << 24; /* Inner L3 offset. */
+		if (csum == PKT_TX_TCP_CKSUM ||
+		    csum == PKT_TX_UDP_CKSUM ||
+		    (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) {
+			off += loc->mbuf->l3_len;
+			set |= (off >> 1) << 16; /* Inner L4 offset. */
+		}
+	}
+	set = rte_cpu_to_le_32(set);
+	return set;
+}
+
+/**
+ * Convert the Checksum offloads to Verbs.
+ *
+ * @param buf
+ *   Pointer to the mbuf.
+ *
+ * @return
+ *   Converted checksum flags.
+ */
+static __rte_always_inline uint8_t
+txq_ol_cksum_to_cs(struct rte_mbuf *buf)
+{
+	uint32_t idx;
+	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
+	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
+				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
+
+	/*
+	 * The lookup table index is composed as:
+	 * bit[0] = PKT_TX_TCP_SEG
+	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
+	 * bit[4] = PKT_TX_IP_CKSUM
+	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
+	 * bit[9] = tunnel (any bit of PKT_TX_TUNNEL_MASK is set)
+	 */
+	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
+	return mlx5_cksum_table[idx];
+}
+
+/**
  * Internal function to compute the number of used descriptors in an RX queue
  *
  * @param rxq
@@ -543,7 +646,7 @@ enum mlx5_txcmp_code {
  *   The last Tx buffer element to free.
  */
 uint16_t
-mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
+mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq,
 			 volatile struct mlx5_err_cqe *err_cqe)
 {
 	if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
@@ -1563,6 +1666,298 @@ enum mlx5_txcmp_code {
 }
 
 /**
+ * Free the mbufs from the linear array of pointers.
+ *
+ * @param pkts
+ *   Pointer to array of packets to be freed.
+ * @param pkts_n
+ *   Number of packets to be freed.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts,
+		  unsigned int pkts_n,
+		  unsigned int olx __rte_unused)
+{
+	struct rte_mempool *pool = NULL;
+	struct rte_mbuf **p_free = NULL;
+	struct rte_mbuf *mbuf;
+	unsigned int n_free = 0;
+
+	/*
+	 * The implemented algorithm eliminates
+	 * copying pointers to temporary array
+	 * for rte_mempool_put_bulk() calls.
+	 */
+	assert(pkts);
+	assert(pkts_n);
+	for (;;) {
+		for (;;) {
+			/*
+			 * Decrement mbuf reference counter, detach
+			 * indirect and external buffers if needed.
+			 */
+			mbuf = rte_pktmbuf_prefree_seg(*pkts);
+			if (likely(mbuf != NULL)) {
+				assert(mbuf == *pkts);
+				if (likely(n_free != 0)) {
+					if (unlikely(pool != mbuf->pool))
+						/* From different pool. */
+						break;
+				} else {
+					/* Start new scan array. */
+					pool = mbuf->pool;
+					p_free = pkts;
+				}
+				++n_free;
+				++pkts;
+				--pkts_n;
+				if (unlikely(pkts_n == 0)) {
+					mbuf = NULL;
+					break;
+				}
+			} else {
+				/*
+				 * This happens if mbuf is still referenced.
+				 * We can't put it back to the pool, skip.
+				 */
+				++pkts;
+				--pkts_n;
+				if (unlikely(n_free != 0))
+					/* There is some array to free. */
+					break;
+				if (unlikely(pkts_n == 0))
+					/* Last mbuf, nothing to free. */
+					return;
+			}
+		}
+		for (;;) {
+			/*
+			 * This loop is implemented to avoid multiple
+			 * inlining of rte_mempool_put_bulk().
+			 */
+			assert(pool);
+			assert(p_free);
+			assert(n_free);
+			/*
+			 * Free the array of pre-freed mbufs
+			 * belonging to the same memory pool.
+			 */
+			rte_mempool_put_bulk(pool, (void *)p_free, n_free);
+			if (unlikely(mbuf != NULL)) {
+				/* There is the request to start new scan. */
+				pool = mbuf->pool;
+				p_free = pkts++;
+				n_free = 1;
+				--pkts_n;
+				if (likely(pkts_n != 0))
+					break;
+				/*
+				 * This is the last mbuf to be freed.
+				 * Do one more loop iteration to complete.
+				 * This is the rare case of the last unique mbuf.
+				 */
+				mbuf = NULL;
+				continue;
+			}
+			if (likely(pkts_n == 0))
+				return;
+			n_free = 0;
+			break;
+		}
+	}
+}
+
+/**
+ * Free the mbuf from the elts ring buffer till new tail.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tail
+ *   Index in elts to free up to, becomes new elts tail.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq,
+		  uint16_t tail,
+		  unsigned int olx __rte_unused)
+{
+	uint16_t n_elts = tail - txq->elts_tail;
+
+	assert(n_elts);
+	assert(n_elts <= txq->elts_s);
+	/*
+	 * Free in chunks that do not cross the end of the elts ring;
+	 * the loop keeps a single inlined mlx5_tx_free_mbuf() call.
+	 */
+	do {
+		unsigned int part;
+
+		part = txq->elts_s - (txq->elts_tail & txq->elts_m);
+		part = RTE_MIN(part, n_elts);
+		assert(part);
+		assert(part <= txq->elts_s);
+		mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m],
+				  part, olx);
+		txq->elts_tail += part;
+		n_elts -= part;
+	} while (n_elts);
+}
+
+/**
+ * Store the mbuf being sent into elts ring buffer.
+ * On Tx completion these mbufs will be freed.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param pkts_n
+ *   Number of packets to be stored.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq,
+		  struct rte_mbuf **restrict pkts,
+		  unsigned int pkts_n,
+		  unsigned int olx __rte_unused)
+{
+	unsigned int part;
+	struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;
+
+	assert(pkts);
+	assert(pkts_n);
+	part = txq->elts_s - (txq->elts_head & txq->elts_m);
+	assert(part);
+	assert(part <= txq->elts_s);
+	/* This code is a good candidate for vectorizing with SIMD. */
+	rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
+		   (void *)pkts,
+		   RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *));
+	txq->elts_head += pkts_n;
+	if (unlikely(part < pkts_n))
+		/* Wrap around: copy the rest to the elts array start. */
+		rte_memcpy((void *)elts, (void *)(pkts + part),
+			   (pkts_n - part) * sizeof(struct rte_mbuf *));
+}
+
+/**
+ * Manage TX completions. This routine checks the CQ for
+ * arrived CQEs, deduces the last accomplished WQE in SQ,
+ * updates SQ producing index and frees all completed mbufs.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * NOTE: intentionally not inlined, experiments show it makes the
+ * tx_burst routine smaller, simpler and faster.
+ */
+static void
+mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
+			  unsigned int olx __rte_unused)
+{
+	bool update = false;
+	int ret;
+
+	do {
+		volatile struct mlx5_wqe_cseg *cseg;
+		volatile struct mlx5_cqe *cqe;
+		uint16_t tail;
+
+		cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
+		ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
+		if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+			if (likely(ret != MLX5_CQE_STATUS_ERR)) {
+				/* No new CQEs in completion queue. */
+				assert(ret == MLX5_CQE_STATUS_HW_OWN);
+				if (likely(update)) {
+					/* Update the consumer index. */
+					rte_compiler_barrier();
+					*txq->cq_db =
+						rte_cpu_to_be_32(txq->cq_ci);
+				}
+				return;
+			}
+			/* Some error occurred, try to restart. */
+			rte_wmb();
+			tail = mlx5_tx_error_cqe_handle
+				(txq, (volatile struct mlx5_err_cqe *)cqe);
+		} else {
+			/* Normal transmit completion. */
+			++txq->cq_ci;
+			rte_cio_rmb();
+			txq->wqe_pi = rte_be_to_cpu_16(cqe->wqe_counter);
+			cseg = (volatile struct mlx5_wqe_cseg *)
+				(txq->wqes + (txq->wqe_pi & txq->wqe_m));
+			tail = cseg->misc; /* New elts tail kept in WQE. */
+		}
+#ifndef NDEBUG
+		if (txq->cq_pi)
+			--txq->cq_pi;
+#endif
+		if (likely(tail != txq->elts_tail)) {
+			/* Free data buffers from elts. */
+			mlx5_tx_free_elts(txq, tail, olx);
+			assert(tail == txq->elts_tail);
+		}
+		update = true;
+	} while (true);
+}
+
+/**
+ * Check if the completion request flag should be set in the last WQE.
+ * Both pushed mbufs and WQEs are monitored and the completion request
+ * flag is set if any of thresholds is reached.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param n_mbuf
+ *   Number of mbuf not stored yet in elts array.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
+			   unsigned int n_mbuf,
+			   struct mlx5_txq_local *restrict loc,
+			   unsigned int olx __rte_unused)
+{
+	uint16_t head = txq->elts_head + n_mbuf;
+
+	if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH ||
+	    (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres) {
+		volatile struct mlx5_wqe *last = loc->wqe_last;
+
+		txq->elts_comp = head;
+		txq->wqe_comp = txq->wqe_ci;
+		/* Request unconditional completion on last WQE. */
+		last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+					    MLX5_COMP_MODE_OFFSET);
+		/* Save elts_head in unused "immediate" field of WQE. */
+		last->cseg.misc = head;
+		/*
+		 * A CQE slot must always be available. Count the issued
+		 * CQE "always" requests instead of using the production
+		 * index, since CQEs with errors may make the difference
+		 * with ci inconsistent. cq_pi is counted in assert only.
+		 */
+		assert(txq->cqe_s > ++txq->cq_pi);
+	}
+}
+
+/**
  * DPDK callback to check the status of a tx descriptor.
  *
  * @param tx_queue
@@ -1576,42 +1971,2490 @@ enum mlx5_txcmp_code {
 int
 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
 {
-	(void)tx_queue;
-	(void)offset;
-	return RTE_ETH_TX_DESC_FULL;
+	struct mlx5_txq_data *restrict txq = tx_queue;
+	uint16_t used;
+
+	mlx5_tx_handle_completion(txq, 0); /* May advance elts_tail. */
+	used = txq->elts_head - txq->elts_tail; /* 16-bit wrapping diff. */
+	if (offset < used)
+		return RTE_ETH_TX_DESC_FULL;
+	return RTE_ETH_TX_DESC_DONE;
 }
 
 /**
- * DPDK Tx callback template. This is configured template
- * used to generate routines optimized for specified offload setup.
- * One of this generated functions is chosen at SQ configuration
- * time.
+ * Build the Control Segment with specified opcode:
+ * - MLX5_OPCODE_SEND
+ * - MLX5_OPCODE_ENHANCED_MPSW
+ * - MLX5_OPCODE_TSO
  *
  * @param txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Control Segment.
+ * @param ds
+ *   Supposed length of WQE in segments.
+ * @param opcode
+ *   SQ WQE opcode to put into Control Segment.
  * @param olx
- *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
- *   values. Should be static to take compile time static configuration
- *   advantages.
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc __rte_unused,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int ds,
+		  unsigned int opcode,
+		  unsigned int olx __rte_unused)
+{
+	struct mlx5_wqe_cseg *restrict cs = &wqe->cseg;
+
+	cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode);
+	cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+	cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
+			     MLX5_COMP_MODE_OFFSET);
+	cs->misc = RTE_BE32(0); /* Cleared here for every built WQE. */
+}
+
+/**
+ * Build the Ethernet Segment without inlined data.
+ * Supports Software Parser, Checksums and VLAN
+ * insertion Tx offload features.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int olx)
+{
+	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+	uint32_t csum;
+
+	/*
+	 * Calculate and set checksum flags first, dword field
+	 * in segment may be shared with Software Parser flags.
+	 */
+	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+	es->flags = rte_cpu_to_le_32(csum);
+	/*
+	 * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+	 */
+	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+	/* Fill metadata field if needed. */
+	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		       loc->mbuf->tx_metadata : 0 : 0;
+	/* Engage VLAN tag insertion feature if requested. */
+	if (MLX5_TXOFF_CONFIG(VLAN) &&
+	    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+		es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
+						  loc->mbuf->vlan_tci);
+	else
+		es->inline_hdr = RTE_BE32(0);
+}
+
+/**
+ * Build the Ethernet Segment with minimal inlined data
+ * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is
+ * used to fill the gap in single WQEBB WQEs.
+ * Supports Software Parser, Checksums and VLAN
+ * insertion Tx offload features.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ *   Length of VLAN tag insertion if any.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int vlan,
+		  unsigned int olx)
+{
+	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+	uint32_t csum;
+	uint8_t *psrc, *pdst;
+
+	/*
+	 * Calculate and set checksum flags first, dword field
+	 * in segment may be shared with Software Parser flags.
+	 */
+	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+	es->flags = rte_cpu_to_le_32(csum);
+	/*
+	 * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+	 */
+	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+	/* Fill metadata field if needed. */
+	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		       loc->mbuf->tx_metadata : 0 : 0;
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(rte_v128u32_t)),
+		      "invalid Ethernet Segment data size");
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(struct rte_vlan_hdr) +
+				 2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Ethernet Segment data size");
+	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+	es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
+	es->inline_data = *(uint16_t *)psrc;
+	psrc +=	sizeof(uint16_t);
+	pdst = (uint8_t *)(es + 1);
+	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+		/* Implement VLAN tag insertion as part of inline data. */
+		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
+		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+		/* Insert VLAN ethertype + VLAN tag. */
+		*(uint32_t *)pdst = rte_cpu_to_be_32
+					((RTE_ETHER_TYPE_VLAN << 16) |
+					 loc->mbuf->vlan_tci);
+		pdst += sizeof(struct rte_vlan_hdr);
+		/* Copy the remaining two bytes from packet data. */
+		*(uint16_t *)pdst = *(uint16_t *)psrc;
+	} else {
+		/* Fill the gap in the title WQEBB with inline data. */
+		rte_mov16(pdst, psrc);
+	}
+}
+
+/**
+ * Build the Ethernet Segment with entire packet
+ * data inlining. Checks the boundary of WQEBB and
+ * ring buffer wrapping, supports Software Parser,
+ * Checksums and VLAN insertion Tx offload features.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ *   Length of VLAN tag insertion if any.
+ * @param inlen
+ *   Length of data to inline (VLAN included, if any).
+ * @param tso
+ *   TSO flag, set mss field from the packet.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
  *
  * @return
- *   Number of packets successfully transmitted (<= pkts_n).
+ *   Pointer to the next Data Segment (aligned and wrapped around).
  */
-static __rte_always_inline uint16_t
-mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
-		   struct rte_mbuf **restrict pkts,
-		   uint16_t pkts_n,
-		   unsigned int olx)
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int vlan,
+		  unsigned int inlen,
+		  unsigned int tso,
+		  unsigned int olx)
 {
-	(void)txq;
-	(void)pkts;
-	(void)pkts_n;
-	(void)olx;
-	return 0;
+	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+	uint32_t csum;
+	uint8_t *psrc, *pdst;
+	unsigned int part;
+
+	/*
+	 * Calculate and set checksum flags first, dword field
+	 * in segment may be shared with Software Parser flags.
+	 */
+	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+	if (tso) {
+		csum <<= 24;
+		csum |= loc->mbuf->tso_segsz;
+		es->flags = rte_cpu_to_be_32(csum);
+	} else {
+		es->flags = rte_cpu_to_le_32(csum);
+	}
+	/*
+	 * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+	 */
+	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+	/* Fill metadata field if needed. */
+	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		       loc->mbuf->tx_metadata : 0 : 0;
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(rte_v128u32_t)),
+		      "invalid Ethernet Segment data size");
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(struct rte_vlan_hdr) +
+				 2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Ethernet Segment data size");
+	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+	es->inline_hdr_sz = rte_cpu_to_be_16(inlen); /* Runtime value. */
+	es->inline_data = *(uint16_t *)psrc;
+	psrc +=	sizeof(uint16_t);
+	pdst = (uint8_t *)(es + 1);
+	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+		/* Implement VLAN tag insertion as part of inline data. */
+		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
+		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+		/* Insert VLAN ethertype + VLAN tag. */
+		*(uint32_t *)pdst = rte_cpu_to_be_32
+					((RTE_ETHER_TYPE_VLAN << 16) |
+					 loc->mbuf->vlan_tci);
+		pdst += sizeof(struct rte_vlan_hdr);
+		/* Copy the remaining two bytes from packet data. */
+		*(uint16_t *)pdst = *(uint16_t *)psrc;
+		psrc += sizeof(uint16_t);
+	} else {
+		/* Fill the gap in the title WQEBB with inline data. */
+		rte_mov16(pdst, psrc);
+		psrc += sizeof(rte_v128u32_t);
+	}
+	pdst = (uint8_t *)(es + 2);
+	assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
+	assert(pdst < (uint8_t *)txq->wqes_end);
+	inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
+	if (!inlen) {
+		assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
+		return (struct mlx5_wqe_dseg *)pdst;
+	}
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Here we should be aware of WQE ring buffer wraparound only.
+	 */
+	part = (uint8_t *)txq->wqes_end - pdst;
+	part = RTE_MIN(part, inlen);
+	do {
+		rte_memcpy(pdst, psrc, part);
+		inlen -= part;
+		if (likely(!inlen)) {
+			/*
+			 * If return value is not used by the caller
+			 * the code below will be optimized out.
+			 */
+			pdst += part;
+			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
+				pdst = (uint8_t *)txq->wqes;
+			return (struct mlx5_wqe_dseg *)pdst;
+		}
+		pdst = (uint8_t *)txq->wqes;
+		psrc += part;
+		part = inlen;
+	} while (true);
+}
+
+/**
+ * Copy data from chain of mbuf to the specified linear buffer.
+ * Does not deal with Checksums or VLAN insertion Tx offloads. If data
+ * from some mbuf is copied completely this mbuf is freed. Local
+ * structure is used to keep the byte stream state.
+ *
+ * @param pdst
+ *   Pointer to the destination linear buffer.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param len
+ *   Length of data to be copied.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_mseg_memcpy(uint8_t *pdst,
+		    struct mlx5_txq_local *restrict loc,
+		    unsigned int len,
+		    unsigned int olx __rte_unused)
+{
+	struct rte_mbuf *mbuf;
+	unsigned int part, dlen;
+	uint8_t *psrc;
+
+	assert(len);
+	do {
+		/* Allow zero length packets, must check first. */
+		dlen = rte_pktmbuf_data_len(loc->mbuf);
+		if (dlen <= loc->mbuf_off) {
+			/* Exhausted segment, free it and step to the next. */
+			mbuf = loc->mbuf;
+			loc->mbuf = mbuf->next;
+			rte_pktmbuf_free_seg(mbuf);
+			loc->mbuf_off = 0;
+			assert(loc->mbuf_nseg > 1);
+			assert(loc->mbuf);
+			--loc->mbuf_nseg;
+			continue;
+		}
+		dlen -= loc->mbuf_off;
+		psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+		psrc += loc->mbuf_off;
+		part = RTE_MIN(len, dlen);
+		rte_memcpy(pdst, psrc, part);
+		loc->mbuf_off += part;
+		len -= part;
+		if (!len) {
+			if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
+				loc->mbuf_off = 0;
+				/* Exhausted segment, free it before return. */
+				mbuf = loc->mbuf;
+				loc->mbuf = mbuf->next;
+				rte_pktmbuf_free_seg(mbuf);
+				loc->mbuf_off = 0;
+				assert(loc->mbuf_nseg >= 1);
+				--loc->mbuf_nseg;
+			}
+			return;
+		}
+		pdst += part;
+	} while (true);
+}
+
+/**
+ * Build the Ethernet Segment with inlined data from
+ * multi-segment packet. Checks the boundary of WQEBB
+ * and ring buffer wrapping, supports Software Parser,
+ * Checksums and VLAN insertion Tx offload features.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ *   Length of VLAN tag insertion if any.
+ * @param inlen
+ *   Length of data to inline (VLAN included, if any).
+ * @param tso
+ *   TSO flag, set mss field from the packet.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Pointer to the next Data Segment (aligned and
+ *   possibly NOT wrapped around - caller should do
+ *   wrapping check on its own).
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int vlan,
+		  unsigned int inlen,
+		  unsigned int tso,
+		  unsigned int olx)
+{
+	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+	uint32_t csum;
+	uint8_t *pdst;
+	unsigned int part;
+
+	/*
+	 * Calculate and set check sum flags first, uint32_t field
+	 * in segment may be shared with Software Parser flags.
+	 */
+	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+	if (tso) {
+		csum <<= 24;
+		csum |= loc->mbuf->tso_segsz;
+		es->flags = rte_cpu_to_be_32(csum);
+	} else {
+		es->flags = rte_cpu_to_le_32(csum);
+	}
+	/*
+	 * Calculate and set Software Parser offsets and flags.
+	 * These flags a set for custom UDP and IP tunnel packets.
+	 */
+	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+	/* Fill metadata field if needed. */
+	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		       loc->mbuf->tx_metadata : 0 : 0;
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(rte_v128u32_t)),
+		      "invalid Ethernet Segment data size");
+	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+				(sizeof(uint16_t) +
+				 sizeof(struct rte_vlan_hdr) +
+				 2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Ethernet Segment data size");
+	assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
+	es->inline_hdr_sz = RTE_BE16(inlen);
+	pdst = (uint8_t *)&es->inline_data;
+	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+		/* Implement VLAN tag insertion as part inline data. */
+		mlx5_tx_mseg_memcpy(pdst, loc, 2 * RTE_ETHER_ADDR_LEN, olx);
+		*(uint32_t *)pdst = rte_cpu_to_be_32
+					((RTE_ETHER_TYPE_VLAN << 16) |
+					 loc->mbuf->vlan_tci);
+		pdst += sizeof(struct rte_vlan_hdr);
+		inlen -= 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
+	}
+	assert(pdst < (uint8_t *)txq->wqes_end);
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Here we should be aware of WQE ring buffer wraparound only.
+	 */
+	part = (uint8_t *)txq->wqes_end - pdst;
+	part = RTE_MIN(part, inlen);
+	assert(part);
+	do {
+		mlx5_tx_mseg_memcpy(pdst, loc, part, olx);
+		inlen -= part;
+		if (likely(!inlen)) {
+			pdst += part;
+			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			return (struct mlx5_wqe_dseg *)pdst;
+		}
+		pdst = (uint8_t *)txq->wqes;
+		part = inlen;
+	} while (true);
+}
+
+/**
+ * Fill a Data Segment that refers to packet data by address
+ * (no data is copied into the WQE).
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context, loc->mbuf is
+ *   used to look up the memory key.
+ * @param dseg
+ *   Pointer to the Data Segment to fill.
+ * @param buf
+ *   Data buffer to point.
+ * @param len
+ *   Data buffer length, must not be zero.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq,
+		 struct mlx5_txq_local *restrict loc,
+		 struct mlx5_wqe_dseg *restrict dseg,
+		 uint8_t *buf,
+		 unsigned int len,
+		 unsigned int olx __rte_unused)
+{
+	const uintptr_t addr = (uintptr_t)buf;
+
+	assert(len);
+	/* Byte count, memory key and address are stored big-endian. */
+	dseg->bcount = rte_cpu_to_be_32(len);
+	dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
+	dseg->pbuf = rte_cpu_to_be_64(addr);
+}
+
+/**
+ * Fill a Data Segment either as a pointer to the data or,
+ * when the data fit into the minimal Data Segment inline
+ * buffer (MLX5_DSEG_MIN_INLINE_SIZE bytes), as inline data.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context, loc->mbuf is
+ *   used to look up the memory key for the pointer form.
+ * @param dseg
+ *   Pointer to the Data Segment to fill.
+ * @param buf
+ *   Data buffer to point or to copy from.
+ * @param len
+ *   Data buffer length, must not be zero.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_wqe_dseg *restrict dseg,
+		  uint8_t *buf,
+		  unsigned int len,
+		  unsigned int olx __rte_unused)
+{
+	uintptr_t to, from;
+
+	assert(len);
+	if (len <= MLX5_DSEG_MIN_INLINE_SIZE) {
+		/* Short data: mark the segment as inline and copy. */
+		dseg->bcount = rte_cpu_to_be_32(len |
+						MLX5_ETH_WQE_DATA_INLINE);
+		to = (uintptr_t)&dseg->inline_data[0];
+		from = (uintptr_t)buf;
+		/*
+		 * Copy by the binary decomposition of the length:
+		 * one optional 8-, 4-, 2- and 1-byte move each.
+		 */
+		if (len & 0x08) {
+			*(uint64_t *)to = *(uint64_t *)from;
+			to += sizeof(uint64_t);
+			from += sizeof(uint64_t);
+		}
+		if (len & 0x04) {
+			*(uint32_t *)to = *(uint32_t *)from;
+			to += sizeof(uint32_t);
+			from += sizeof(uint32_t);
+		}
+		if (len & 0x02) {
+			*(uint16_t *)to = *(uint16_t *)from;
+			to += sizeof(uint16_t);
+			from += sizeof(uint16_t);
+		}
+		if (len & 0x01)
+			*(uint8_t *)to = *(uint8_t *)from;
+		return;
+	}
+	/* Long data: build the ordinary pointer Data Segment. */
+	dseg->bcount = rte_cpu_to_be_32(len);
+	dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
+	dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
+}
+
+/**
+ * Build the inline Data Segment for a single-segment packet
+ * without VLAN insertion: the whole buffer is copied into
+ * the WQE ring right after the byte count field.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context (not used).
+ * @param dseg
+ *   Pointer to the Data Segment to fill.
+ * @param buf
+ *   Data buffer to copy from.
+ * @param len
+ *   Data buffer length.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Pointer to the next Data Segment after inlined data,
+ *   aligned up to MLX5_WSEG_SIZE. The ring buffer wraparound
+ *   check is left to the caller because it may not be needed
+ *   for the last packet in the eMPW session.
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc __rte_unused,
+		  struct mlx5_wqe_dseg *restrict dseg,
+		  uint8_t *buf,
+		  unsigned int len,
+		  unsigned int olx __rte_unused)
+{
+	uint8_t *out = &dseg->inline_data[0];
+	unsigned int chunk;
+
+	dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
+	/*
+	 * The caller guarantees there is enough room in the ring,
+	 * only the wraparound at the ring end is handled here:
+	 * the first chunk is limited by the distance to the end.
+	 */
+	chunk = (uint8_t *)txq->wqes_end - out;
+	chunk = RTE_MIN(chunk, len);
+	for (;;) {
+		rte_memcpy(out, buf, chunk);
+		len -= chunk;
+		if (likely(!len))
+			break;
+		/* Continue with the remainder from the ring start. */
+		out = (uint8_t *)txq->wqes;
+		buf += chunk;
+		chunk = len;
+	}
+	out += chunk;
+	out = RTE_PTR_ALIGN(out, MLX5_WSEG_SIZE);
+	/* Note: no final wraparound check here. */
+	return (struct mlx5_wqe_dseg *)out;
+}
+
+/**
+ * Build the Data Segment of inlined data from single
+ * segment packet with VLAN insertion.
+ *
+ * The inlined layout is: destination and source MAC addresses
+ * taken from the buffer, the inserted VLAN header built from
+ * loc->mbuf->vlan_tci, then the rest of the buffer.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context, loc->mbuf
+ *   provides the VLAN TCI to insert.
+ * @param dseg
+ *   Pointer to the dseg fill with built Data Segment.
+ * @param buf
+ *   Data buffer to copy from.
+ * @param len
+ *   Data buffer length (without the VLAN header being
+ *   inserted), must exceed MLX5_ESEG_MIN_INLINE_SIZE.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Pointer to the next Data Segment after inlined data.
+ *   Ring buffer wraparound check is needed.
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc __rte_unused,
+		  struct mlx5_wqe_dseg *restrict dseg,
+		  uint8_t *buf,
+		  unsigned int len,
+		  unsigned int olx __rte_unused)
+
+{
+	unsigned int part;
+	uint8_t *pdst;
+	uint32_t vhdr;
+
+	assert(len > MLX5_ESEG_MIN_INLINE_SIZE);
+	static_assert(MLX5_DSEG_MIN_INLINE_SIZE ==
+				 (2 * RTE_ETHER_ADDR_LEN),
+		      "invalid Data Segment data size");
+	/* The reported byte count includes the inserted VLAN header. */
+	dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) |
+					MLX5_ETH_WQE_DATA_INLINE);
+	pdst = &dseg->inline_data[0];
+	/* Copy both MAC addresses and account them on both sides. */
+	memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
+	buf += MLX5_DSEG_MIN_INLINE_SIZE;
+	pdst += MLX5_DSEG_MIN_INLINE_SIZE;
+	len -= MLX5_DSEG_MIN_INLINE_SIZE;
+	/*
+	 * The MAC addresses may have filled the ring up to its
+	 * very end, wrap before writing the VLAN header.
+	 */
+	if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
+		pdst = (uint8_t *)txq->wqes;
+	/* Insert VLAN ethertype + VLAN tag. */
+	vhdr = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
+				loc->mbuf->vlan_tci);
+	memcpy(pdst, &vhdr, sizeof(vhdr));
+	pdst += sizeof(struct rte_vlan_hdr);
+	if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
+		pdst = (uint8_t *)txq->wqes;
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Here we should be aware of WQE ring buffer wraparound only.
+	 */
+	part = (uint8_t *)txq->wqes_end - pdst;
+	part = RTE_MIN(part, len);
+	do {
+		rte_memcpy(pdst, buf, part);
+		len -= part;
+		if (likely(!len)) {
+			pdst += part;
+			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			/* Note: no final wraparound check here. */
+			return (struct mlx5_wqe_dseg *)pdst;
+		}
+		pdst = (uint8_t *)txq->wqes;
+		buf += part;
+		part = len;
+	} while (true);
+}
+
+/**
+ * Build the Ethernet Segment with optionally inlined data with
+ * VLAN insertion and following Data Segments (if any) from
+ * multi-segment packet. Used by ordinary send and TSO.
+ *
+ * The Ethernet Segment and the inlined part are built by
+ * mlx5_tx_eseg_mdat(). Every mbuf segment that still carries
+ * data afterwards gets its own Data Segment built by
+ * mlx5_tx_dseg_iptr() and is stored into the elts ring.
+ * Zero-length segments met in the remaining chain are freed
+ * at once and take neither a Data Segment nor an elts entry.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context. The mbuf, mbuf_off,
+ *   mbuf_nseg and elts_free fields are updated.
+ * @param wqe
+ *   Pointer to WQE to fill with built Ethernet/Data Segments.
+ * @param vlan
+ *   Length of VLAN header to insert, 0 means no VLAN insertion.
+ * @param inlen
+ *   Data length to inline. This is minimal amount of data bytes
+ *   to be inlined. For TSO this parameter specifies exact value,
+ *   for ordinary send routine can extend beyond specified value
+ *   to provide better WQE space saving. This length includes
+ *   VLAN header being inserted.
+ * @param tso
+ *   Zero means ordinary send, inlined data can be extended,
+ *   otherwise this is TSO, inlined data length is fixed.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Actual size of built WQE in segments.
+ */
+static __rte_always_inline unsigned int
+mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   struct mlx5_wqe *restrict wqe,
+		   unsigned int vlan,
+		   unsigned int inlen,
+		   unsigned int tso,
+		   unsigned int olx __rte_unused)
+{
+	struct mlx5_wqe_dseg *restrict dseg;
+	unsigned int ds;
+
+	assert((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
+	/* Start walking the chain from the very first byte. */
+	loc->mbuf_nseg = NB_SEGS(loc->mbuf);
+	loc->mbuf_off = 0;
+
+	dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx);
+	/* The whole packet has been inlined, no Data Segments needed. */
+	if (!loc->mbuf_nseg)
+		goto dseg_done;
+	/*
+	 * There are still some mbuf remaining, not inlined.
+	 * The first mbuf may be partially inlined and we
+	 * must process the possible non-zero data offset.
+	 */
+	if (loc->mbuf_off) {
+		unsigned int dlen;
+		uint8_t *dptr;
+
+		/*
+		 * Exhausted packets must be dropped before.
+		 * Non-zero offset means there are some data
+		 * remained in the packet.
+		 */
+		assert(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
+		assert(rte_pktmbuf_data_len(loc->mbuf));
+		dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + loc->mbuf_off;
+		dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off;
+		/*
+		 * Build the pointer/minimal data Data Segment.
+		 * Do ring buffer wrapping check in advance.
+		 */
+		if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+			dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+		mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx);
+		/* Store the mbuf to be freed on completion. */
+		assert(loc->elts_free);
+		txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+		--loc->elts_free;
+		++dseg;
+		if (--loc->mbuf_nseg == 0)
+			goto dseg_done;
+		loc->mbuf = loc->mbuf->next;
+		loc->mbuf_off = 0;
+	}
+	/* Remaining segments are untouched, one Data Segment per mbuf. */
+	do {
+		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
+			struct rte_mbuf *mbuf;
+
+			/* Zero length segment found, just skip. */
+			mbuf = loc->mbuf;
+			loc->mbuf = loc->mbuf->next;
+			rte_pktmbuf_free_seg(mbuf);
+			if (--loc->mbuf_nseg == 0)
+				break;
+		} else {
+			/* Wrap around the ring end before writing. */
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+			mlx5_tx_dseg_iptr
+				(txq, loc, dseg,
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 rte_pktmbuf_data_len(loc->mbuf), olx);
+			/* Store the mbuf into the elts ring. */
+			assert(loc->elts_free);
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+			--loc->elts_free;
+			++dseg;
+			if (--loc->mbuf_nseg == 0)
+				break;
+			loc->mbuf = loc->mbuf->next;
+		}
+	} while (true);
+
+dseg_done:
+	/*
+	 * Calculate actual segments used from the dseg pointer.
+	 * If dseg is not above the WQE start the WQE has wrapped
+	 * around the ring end, so the ring size is added.
+	 */
+	if ((uintptr_t)wqe < (uintptr_t)dseg)
+		ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE;
+	else
+		ds = (((uintptr_t)dseg - (uintptr_t)wqe) +
+		      txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE;
+	return ds;
+}
+
+/**
+ * Send one multi-segment packet with TSO. All Tx offloads
+ * are supported, the WQE is built with MLX5_OPCODE_TSO,
+ * one packet per WQE. The TSO headers (L2/L3/L4, outer
+ * headers for tunnels and the inserted VLAN header if any)
+ * are inlined into the Ethernet Segment.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - not enough free WQEBBs, sending stopped.
+ *   MLX5_TXCMP_CODE_ERROR - wrong header lengths, zero MSS or
+ *   the WQE would be too large.
+ *   MLX5_TXCMP_CODE_MULTI - the WQE has been built.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq,
+			struct mlx5_txq_local *restrict loc,
+			unsigned int olx)
+{
+	struct rte_mbuf *pkt = loc->mbuf;
+	struct mlx5_wqe *restrict wqe;
+	unsigned int vlan = 0;
+	unsigned int ds, dlen, inlen, ntcp, nbb;
+
+	/*
+	 * The headers length defines the amount of data to inline
+	 * and therefore the space needed in the WQE ring buffer.
+	 */
+	dlen = rte_pktmbuf_pkt_len(pkt);
+	if (MLX5_TXOFF_CONFIG(VLAN) && pkt->ol_flags & PKT_TX_VLAN_PKT)
+		vlan = sizeof(struct rte_vlan_hdr);
+	inlen = pkt->l2_len + vlan + pkt->l3_len + pkt->l4_len;
+	if (unlikely(!inlen))
+		return MLX5_TXCMP_CODE_ERROR;
+	if (unlikely(!pkt->tso_segsz))
+		return MLX5_TXCMP_CODE_ERROR;
+	if (pkt->ol_flags & PKT_TX_TUNNEL_MASK)
+		inlen += pkt->outer_l2_len + pkt->outer_l3_len;
+	/* All the TSO headers must be present in the packet. */
+	if (unlikely(inlen > MLX5_MAX_TSO_HEADER))
+		return MLX5_TXCMP_CODE_ERROR;
+	if (unlikely(inlen <= MLX5_ESEG_MIN_INLINE_SIZE))
+		return MLX5_TXCMP_CODE_ERROR;
+	if (unlikely(inlen > (dlen + vlan)))
+		return MLX5_TXCMP_CODE_ERROR;
+	/*
+	 * Estimate the WQE size in segments:
+	 * - Control Segment
+	 * - Ethernet Segment
+	 * - First Segment of inlined Ethernet data
+	 * - ... data continued ...
+	 * - Data Segments of pointer/min inline type
+	 */
+	ds = NB_SEGS(pkt) + 2 + (inlen -
+				 MLX5_ESEG_MIN_INLINE_SIZE +
+				 MLX5_WSEG_SIZE +
+				 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+	/* Four segments make one WQEBB. */
+	nbb = (ds + 3) / 4;
+	if (unlikely(loc->wqe_free < nbb))
+		return MLX5_TXCMP_CODE_EXIT;
+	/*
+	 * Check for maximal WQE size.
+	 * NOTE(review): the limit is expressed in segments while
+	 * the compared value is in WQEBBs - confirm the intent.
+	 */
+	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < nbb))
+		return MLX5_TXCMP_CODE_ERROR;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes/packets counters. */
+	ntcp = (dlen - (inlen - vlan) + pkt->tso_segsz - 1) /
+		pkt->tso_segsz;
+	/*
+	 * The mbuf itself is counted at the end of
+	 * the mlx5_tx_burst from loc->pkts_sent field,
+	 * account only the extra TCP segments here.
+	 */
+	--ntcp;
+	txq->stats.opackets += ntcp;
+	txq->stats.obytes += dlen + vlan + ntcp * inlen;
+#endif
+	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+	loc->wqe_last = wqe;
+	/* The segment count is not known yet, set it after building. */
+	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx);
+	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx);
+	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+	/* Consume the WQEBBs really used by the built WQE. */
+	nbb = (ds + 3) / 4;
+	txq->wqe_ci += nbb;
+	loc->wqe_free -= nbb;
+	return MLX5_TXCMP_CODE_MULTI;
+}
+
+/**
+ * Tx one packet function for multi-segment SEND. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
+ * sends one packet per WQE, without any data inlining in
+ * Ethernet Segment.
+ *
+ * Every non-empty mbuf segment gets a pointer Data Segment and
+ * is stored into the elts ring; zero-length segments are freed
+ * immediately and the WQE size is corrected accordingly.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - not enough free WQEBBs, sending stopped.
+ *   MLX5_TXCMP_CODE_ERROR - packet is too short or the WQE
+ *   would be too large.
+ *   MLX5_TXCMP_CODE_MULTI - the WQE has been built.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	struct mlx5_wqe_dseg *restrict dseg;
+	struct mlx5_wqe *restrict wqe;
+	unsigned int ds, nseg;
+
+	assert(NB_SEGS(loc->mbuf) > 1);
+	/*
+	 * No inline at all, it means the CPU cycles saving
+	 * is prioritized at configuration, we should not
+	 * copy any packet data to WQE.
+	 */
+	nseg = NB_SEGS(loc->mbuf);
+	/* Control Segment + Ethernet Segment + one per mbuf segment. */
+	ds = 2 + nseg;
+	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
+		return MLX5_TXCMP_CODE_EXIT;
+	/*
+	 * Check for maximal WQE size.
+	 * NOTE(review): the limit is expressed in segments while
+	 * the compared value is in WQEBBs - confirm the intent.
+	 */
+	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
+		return MLX5_TXCMP_CODE_ERROR;
+	/*
+	 * Some Tx offloads may cause an error if
+	 * packet is not long enough, check against
+	 * assumed minimal length.
+	 */
+	if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE)
+		return MLX5_TXCMP_CODE_ERROR;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes counter. */
+	txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf);
+	if (MLX5_TXOFF_CONFIG(VLAN) &&
+	    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+		txq->stats.obytes += sizeof(struct rte_vlan_hdr);
+#endif
+	/*
+	 * SEND WQE, one WQEBB:
+	 * - Control Segment, SEND opcode
+	 * - Ethernet Segment, optional VLAN, no inline
+	 * - Data Segments, pointer only type
+	 */
+	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+	loc->wqe_last = wqe;
+	mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx);
+	mlx5_tx_eseg_none(txq, loc, wqe, olx);
+	dseg = &wqe->dseg[0];
+	do {
+		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
+			struct rte_mbuf *mbuf;
+
+			/*
+			 * Zero length segment found, have to
+			 * correct total size of WQE in segments.
+			 * It is supposed to be rare occasion, so
+			 * in normal case (no zero length segments)
+			 * we avoid extra writing to the Control
+			 * Segment.
+			 */
+			--ds;
+			/*
+			 * Decrement the already written big-endian
+			 * field in place by a big-endian one.
+			 * NOTE(review): presumably relies on the
+			 * segment count living in the low byte of
+			 * sq_ds so that no borrow occurs - confirm.
+			 */
+			wqe->cseg.sq_ds -= RTE_BE32(1);
+			mbuf = loc->mbuf;
+			loc->mbuf = mbuf->next;
+			rte_pktmbuf_free_seg(mbuf);
+			if (--nseg == 0)
+				break;
+		} else {
+			mlx5_tx_dseg_ptr
+				(txq, loc, dseg,
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 rte_pktmbuf_data_len(loc->mbuf), olx);
+			/* Store the mbuf into the elts ring. */
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+			--loc->elts_free;
+			if (--nseg == 0)
+				break;
+			/* Next Data Segment, mind the ring wraparound. */
+			++dseg;
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+			loc->mbuf = loc->mbuf->next;
+		}
+	} while (true);
+	/* Consume the WQEBBs occupied by the corrected WQE size. */
+	txq->wqe_ci += (ds + 3) / 4;
+	loc->wqe_free -= (ds + 3) / 4;
+	return MLX5_TXCMP_CODE_MULTI;
+}
+
+/**
+ * Tx one packet function for multi-segment SEND. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
+ * sends one packet per WQE, with data inlining in
+ * Ethernet Segment and minimal Data Segments.
+ *
+ * If the packet is longer than txq->inlen_send and no minimal
+ * inlining is configured (txq->inlen_mode is zero) the packet
+ * is passed to mlx5_tx_packet_multi_send() and sent without
+ * inlining at all.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - not enough free WQEBBs, sending stopped.
+ *   MLX5_TXCMP_CODE_ERROR - packet is too short or the WQE
+ *   would be too large.
+ *   MLX5_TXCMP_CODE_MULTI - the WQE has been built.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
+			    struct mlx5_txq_local *restrict loc,
+			    unsigned int olx)
+{
+	struct mlx5_wqe *restrict wqe;
+	unsigned int ds, inlen, dlen, vlan = 0;
+
+	assert(MLX5_TXOFF_CONFIG(INLINE));
+	assert(NB_SEGS(loc->mbuf) > 1);
+	/*
+	 * First calculate data length to be inlined
+	 * to estimate the required space for WQE.
+	 */
+	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
+	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+		vlan = sizeof(struct rte_vlan_hdr);
+	inlen = dlen + vlan;
+	/* Check against minimal length. */
+	if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
+		return MLX5_TXCMP_CODE_ERROR;
+	assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+	if (inlen > txq->inlen_send) {
+		/*
+		 * Packet length exceeds the allowed inline
+		 * data length, check whether the minimal
+		 * inlining is required. eMPW check is here
+		 * to statically exclude check - no minimal
+		 * inlining required if eMPW is enabled.
+		 */
+		if (MLX5_TXOFF_CONFIG(EMPW) || !txq->inlen_mode) {
+			/*
+			 * VLAN insertion will be done inside by HW.
+			 * It is not utmost effective - VLAN flag is
+			 * checked twice, but we should proceed the
+			 * inlining length correctly and take into
+			 * account the VLAN header being inserted.
+			 */
+			assert(!txq->inlen_mode);
+			return mlx5_tx_packet_multi_send(txq, loc, olx);
+		}
+		assert(txq->inlen_mode >= MLX5_ESEG_MIN_INLINE_SIZE);
+		assert(txq->inlen_mode <= txq->inlen_send);
+		inlen = txq->inlen_mode;
+	}
+	/*
+	 * Check whether there are enough free WQEBBs:
+	 * - Control Segment
+	 * - Ethernet Segment
+	 * - First Segment of inlined Ethernet data
+	 * - ... data continued ...
+	 * - Data Segments of pointer/min inline type
+	 *
+	 * Estimate the number of Data Segments conservatively,
+	 * supposing no any mbufs is being freed during inlining.
+	 */
+	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
+				       MLX5_ESEG_MIN_INLINE_SIZE +
+				       MLX5_WSEG_SIZE +
+				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+	/*
+	 * We may have extra space in WQE to put inline
+	 * data to fill the WQEBBs completely. If the estimated
+	 * size is already a whole number of WQEBBs there is no
+	 * spare room: extending the inline data then would need
+	 * one more WQEBB than is checked against wqe_free below.
+	 */
+	inlen += ((4 - ds % 4) % 4) * MLX5_WSEG_SIZE;
+	inlen = RTE_MIN(inlen, dlen + vlan);
+	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
+		return MLX5_TXCMP_CODE_EXIT;
+	/*
+	 * Check for maximal WQE size.
+	 * NOTE(review): the limit is expressed in segments while
+	 * the compared value is in WQEBBs - confirm the intent.
+	 */
+	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
+		return MLX5_TXCMP_CODE_ERROR;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes/packets counters. */
+	txq->stats.obytes += dlen + vlan;
+#endif
+	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+	loc->wqe_last = wqe;
+	/* The segment count is not known yet, set it after building. */
+	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
+	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
+	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+	txq->wqe_ci += (ds + 3) / 4;
+	loc->wqe_free -= (ds + 3) / 4;
+	return MLX5_TXCMP_CODE_MULTI;
+}
+
+/**
+ * Tx burst function for multi-segment packets. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
+ * sends one packet per WQE. Function stops sending if it
+ * encounters the single-segment packet.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context. On entry loc->mbuf
+ *   must be a multi-segment packet, loc->pkts_sent is the number
+ *   of packets of the array processed so far.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
+ *   MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
+		   struct rte_mbuf **restrict pkts,
+		   unsigned int pkts_n,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int olx)
+{
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	/* Point to the packet following the current loc->mbuf. */
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		enum mlx5_txcmp_code ret;
+
+		assert(NB_SEGS(loc->mbuf) > 1);
+		/*
+		 * Estimate the number of free elts quickly but
+		 * conservatively. Some segment may be fully inlined
+		 * and freed, ignore this here - precise estimation
+		 * is costly.
+		 */
+		if (loc->elts_free < NB_SEGS(loc->mbuf))
+			return MLX5_TXCMP_CODE_EXIT;
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) {
+			/* Proceed with multi-segment TSO. */
+			ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
+		} else if (MLX5_TXOFF_CONFIG(INLINE)) {
+			/* Proceed with multi-segment SEND with inlining. */
+			ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
+		} else {
+			/* Proceed with multi-segment SEND w/o inlining. */
+			ret = mlx5_tx_packet_multi_send(txq, loc, olx);
+		}
+		if (ret == MLX5_TXCMP_CODE_EXIT)
+			return MLX5_TXCMP_CODE_EXIT;
+		if (ret == MLX5_TXCMP_CODE_ERROR)
+			return MLX5_TXCMP_CODE_ERROR;
+		/* WQE is built, go to the next packet. */
+		++loc->pkts_sent;
+		--pkts_n;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		loc->mbuf = *pkts++;
+		if (pkts_n > 1)
+			rte_prefetch0(*pkts);
+		if (likely(NB_SEGS(loc->mbuf) > 1))
+			continue;
+		/*
+		 * Here ends the series of multi-segment packets.
+		 * Report TSO only if the single-segment packet
+		 * really requests the TCP segmentation offload.
+		 */
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
+			return MLX5_TXCMP_CODE_TSO;
+		return MLX5_TXCMP_CODE_SINGLE;
+	}
+	assert(false);
+}
+
+/**
+ * Tx burst function for single-segment packets with TSO.
+ * Supports all types of Tx offloads, except multi-packets.
+ * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE.
+ * Function stops sending if it encounters the multi-segment
+ * packet or packet without TSO requested.
+ *
+ * The routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head if inline
+ * offloads is requested due to possible early freeing
+ * of the inlined mbufs (can not store pkts array in elts
+ * as a batch).
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context. On entry loc->mbuf
+ *   must be a single-segment packet.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
+ *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
+		  struct rte_mbuf **restrict pkts,
+		  unsigned int pkts_n,
+		  struct mlx5_txq_local *restrict loc,
+		  unsigned int olx)
+{
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	/* Point to the packet following the current loc->mbuf. */
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		struct mlx5_wqe_dseg *restrict dseg;
+		struct mlx5_wqe *restrict wqe;
+		unsigned int ds, dlen, hlen, ntcp, vlan = 0;
+		uint8_t *dptr;
+
+		assert(NB_SEGS(loc->mbuf) == 1);
+		dlen = rte_pktmbuf_data_len(loc->mbuf);
+		if (MLX5_TXOFF_CONFIG(VLAN) &&
+		    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+			vlan = sizeof(struct rte_vlan_hdr);
+		}
+		/*
+		 * First calculate the WQE size to check
+		 * whether we have enough space in ring buffer.
+		 */
+		hlen = loc->mbuf->l2_len + vlan +
+		       loc->mbuf->l3_len + loc->mbuf->l4_len;
+		if (unlikely((!hlen || !loc->mbuf->tso_segsz)))
+			return MLX5_TXCMP_CODE_ERROR;
+		if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK)
+			hlen += loc->mbuf->outer_l2_len +
+				loc->mbuf->outer_l3_len;
+		/* Segment must contain all TSO headers. */
+		if (unlikely(hlen > MLX5_MAX_TSO_HEADER ||
+			     hlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
+			     hlen > (dlen + vlan)))
+			return MLX5_TXCMP_CODE_ERROR;
+		/*
+		 * Check whether there are enough free WQEBBs:
+		 * - Control Segment
+		 * - Ethernet Segment
+		 * - First Segment of inlined Ethernet data
+		 * - ... data continued ...
+		 * - Finishing Data Segment of pointer type
+		 */
+		ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE +
+			  MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+		if (loc->wqe_free < ((ds + 3) / 4))
+			return MLX5_TXCMP_CODE_EXIT;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Update sent data bytes/packets counters. */
+		ntcp = (dlen + vlan - hlen +
+			loc->mbuf->tso_segsz - 1) /
+			loc->mbuf->tso_segsz;
+		/*
+		 * One will be added for mbuf itself at the end
+		 * of the mlx5_tx_burst from loc->pkts_sent field.
+		 */
+		--ntcp;
+		txq->stats.opackets += ntcp;
+		txq->stats.obytes += dlen + vlan + ntcp * hlen;
+#endif
+		/*
+		 * Build the TSO WQE:
+		 * - Control Segment
+		 * - Ethernet Segment with hlen bytes inlined
+		 * - Data Segment of pointer type
+		 */
+		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		loc->wqe_last = wqe;
+		mlx5_tx_cseg_init(txq, loc, wqe, ds,
+				  MLX5_OPCODE_TSO, olx);
+		dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx);
+		/*
+		 * The inserted VLAN header is not present in the mbuf,
+		 * so only (hlen - vlan) bytes of it were inlined.
+		 */
+		dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan;
+		dlen -= hlen - vlan;
+		mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
+		/*
+		 * WQE is built, update the loop parameters
+		 * and go to the next packet.
+		 */
+		txq->wqe_ci += (ds + 3) / 4;
+		loc->wqe_free -= (ds + 3) / 4;
+		/* With inlining enabled mbufs are stored one by one. */
+		if (MLX5_TXOFF_CONFIG(INLINE))
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+		--loc->elts_free;
+		++loc->pkts_sent;
+		--pkts_n;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		loc->mbuf = *pkts++;
+		if (pkts_n > 1)
+			rte_prefetch0(*pkts);
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    unlikely(NB_SEGS(loc->mbuf) > 1))
+			return MLX5_TXCMP_CODE_MULTI;
+		if (unlikely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)))
+			return MLX5_TXCMP_CODE_SINGLE;
+		/* Continue with the next TSO packet. */
+	}
+	assert(false);
+}
+
+/**
+ * Classify the current packet (loc->mbuf) and tell which
+ * sending method should be used for it.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ * @param newp
+ *   The predefined flag whether do complete check for
+ *   multi-segment packets and TSO. If false only the
+ *   eMPW related checks are done.
+ *
+ * @return
+ *  MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *  MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
+ *  MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
+ *  MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
+		     struct mlx5_txq_local *restrict loc,
+		     unsigned int olx,
+		     bool newp)
+{
+	if (newp) {
+		/* Multi-segment packets have their own routine. */
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    unlikely(NB_SEGS(loc->mbuf) > 1))
+			return MLX5_TXCMP_CODE_MULTI;
+		/* So do the packets requesting TSO. */
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
+			return MLX5_TXCMP_CODE_TSO;
+	}
+	/* Without eMPW configured only the ordinary SEND remains. */
+	if (!MLX5_TXOFF_CONFIG(EMPW))
+		return MLX5_TXCMP_CODE_SINGLE;
+	/* No VLAN insertion requested - eMPW can be engaged. */
+	if (!MLX5_TXOFF_CONFIG(VLAN) ||
+	    likely(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)))
+		return MLX5_TXCMP_CODE_EMPW;
+	/*
+	 * eMPW does not support VLAN insertion offload, the
+	 * entire packet with the VLAN header has to be inlined.
+	 * Fall back to SEND if inlining is not configured or
+	 * the packet is too long for inlining.
+	 */
+	if (!MLX5_TXOFF_CONFIG(INLINE))
+		return MLX5_TXCMP_CODE_SINGLE;
+	if (unlikely((rte_pktmbuf_data_len(loc->mbuf) +
+		      sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))
+		return MLX5_TXCMP_CODE_SINGLE;
+	return MLX5_TXCMP_CODE_EMPW;
+}
+
+/**
+ * Check whether the current packet (loc->mbuf) can join the
+ * eMPW batch: its checksum flags, Software Parser settings and
+ * metadata must be equal to the ones in the batch Ethernet Segment.
+ *
+ * @param txq
+ *   Pointer to TX queue structure (not used).
+ * @param es
+ *   Pointer to Ethernet Segment of eMPW batch.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *  true - packet match with eMPW batch attributes.
+ *  false - no match, eMPW should be restarted.
+ */
+static __rte_always_inline bool
+mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
+		   struct mlx5_wqe_eseg *restrict es,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int olx)
+{
+	/* Checksum offload flags must be the same. */
+	if (MLX5_TXOFF_CONFIG(CSUM)) {
+		if (txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
+			return false;
+	}
+	/* Software Parser offsets and flags must be the same. */
+	if (MLX5_TXOFF_CONFIG(SWP)) {
+		uint8_t pkt_swp_flags = 0;
+
+		if (es->swp_offs != txq_mbuf_to_swp(loc, &pkt_swp_flags, olx))
+			return false;
+		if (es->swp_flags != pkt_swp_flags)
+			return false;
+	}
+	/* Metadata (zero if not requested) must be the same. */
+	if (MLX5_TXOFF_CONFIG(METADATA)) {
+		if (es->metadata !=
+		    (loc->mbuf->ol_flags & PKT_TX_METADATA ?
+		     loc->mbuf->tx_metadata : 0))
+			return false;
+	}
+	/* There must be no VLAN packets in eMPW loop. */
+	if (MLX5_TXOFF_CONFIG(VLAN))
+		assert(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
+	return true;
+}
+
+/**
+ * Complete the eMPW session built without data inlining:
+ * account the sent packets and bytes, store the final number
+ * of segments into the Control Segment and advance the WQE
+ * indices. The number of Data Segments is equal to the number
+ * of sent packets.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param ds
+ *   Number of sent packets, one Data Segment per packet.
+ * @param slen
+ *   Accumulated statistics, bytes sent.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int ds,
+		   unsigned int slen,
+		   unsigned int olx __rte_unused)
+{
+	/* Two extra segments are added to the Data Segments. */
+	unsigned int seg_n = ds + 2;
+	/* The WQE index advances by segments / 4, rounded up. */
+	unsigned int wqe_n = (seg_n + 3) / 4;
+
+	assert(!MLX5_TXOFF_CONFIG(INLINE));
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes counter. */
+	txq->stats.obytes += slen;
+#else
+	(void)slen;
+#endif
+	loc->pkts_sent += ds;
+	loc->elts_free -= ds;
+	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | seg_n);
+	txq->wqe_ci += wqe_n;
+	loc->wqe_free -= wqe_n;
+}
+
+/**
+ * Complete the eMPW session built with data inlining: account
+ * the sent bytes, store the final number of segments into the
+ * Control Segment and advance the WQE indices. Gets the size
+ * of descriptors and data pushed to the WQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param len
+ *   Total size of descriptor/data in bytes, must be
+ *   the multiple of MLX5_WSEG_SIZE.
+ * @param slen
+ *   Accumulated statistics, data bytes sent.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int len,
+		   unsigned int slen,
+		   unsigned int olx __rte_unused)
+{
+	/* Convert bytes to segments and add two extra segments. */
+	unsigned int seg_n = len / MLX5_WSEG_SIZE + 2;
+	/* The WQE index advances by segments / 4, rounded up. */
+	unsigned int wqe_n = (seg_n + 3) / 4;
+
+	assert(MLX5_TXOFF_CONFIG(INLINE));
+	assert((len % MLX5_WSEG_SIZE) == 0);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes counter. */
+	txq->stats.obytes += slen;
+#else
+	(void)slen;
+#endif
+	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | seg_n);
+	txq->wqe_ci += wqe_n;
+	loc->wqe_free -= wqe_n;
+}
+
+/**
+ * Tx burst functions for single-segment packets without TSO
+ * and with Multi-Packet Writing feature support. Supports
+ * all types of Tx offloads, except multi-segment packets and TSO.
+ * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
+ * as many packet per WQE as it can. If eMPW is not configured
+ * or packet can not be sent with eMPW (VLAN insertion) the
+ * ordinary SEND opcode is used and only one packet placed
+ * in WQE.
+ *
+ * Function stops sending if it encounters the multi-segment
+ * packet or packet with TSO requested.
+ *
+ * The routines are responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head if inlining
+ * offload is requested. Otherwise the copying mbufs to elts
+ * can be postponed and completed at the end of burst routine.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
+ * Local context variables updated.
+ */
+
+/**
+ * The routine sends packets with MLX5_OPCODE_EMPW
+ * without inlining, this is dedicated optimized branch.
+ * No VLAN insertion is supported.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context, loc->mbuf must
+ *   be the packet at index loc->pkts_sent of the array.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - too short packet encountered.
+ *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
+ *   MLX5_TXCMP_CODE_SINGLE - packet to send with SEND encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq,
+			  struct rte_mbuf **restrict pkts,
+			  unsigned int pkts_n,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	/*
+	 * Subroutine is the part of mlx5_tx_burst_single()
+	 * and sends single-segment packet with eMPW opcode
+	 * without data inlining.
+	 */
+	assert(!MLX5_TXOFF_CONFIG(INLINE));
+	assert(MLX5_TXOFF_CONFIG(EMPW));
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
+	/*
+	 * The current packet is in loc->mbuf already, pkts points
+	 * to the next one, pkts_n counts the packets not sent yet
+	 * (including the current one).
+	 */
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		struct mlx5_wqe_dseg *restrict dseg;
+		struct mlx5_wqe_eseg *restrict eseg;
+		enum mlx5_txcmp_code ret;
+		unsigned int part, loop;
+		unsigned int slen = 0;
+
+next_empw:
+		part = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS);
+		if (unlikely(loc->elts_free < part)) {
+			/* We do not have enough elts to save all mbufs. */
+			if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
+				return MLX5_TXCMP_CODE_EXIT;
+			/* But we are still able to send a minimal eMPW. */
+			part = loc->elts_free;
+		}
+		/* Check whether we have enough WQEs */
+		if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) {
+			if (unlikely(loc->wqe_free <
+				((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
+				return MLX5_TXCMP_CODE_EXIT;
+			part = (loc->wqe_free * 4) - 2;
+		}
+		if (likely(part > 1))
+			rte_prefetch0(*pkts);
+		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		/*
+		 * Build eMPW title WQEBB:
+		 * - Control Segment, eMPW opcode
+		 * - Ethernet Segment, no inline
+		 */
+		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2,
+				  MLX5_OPCODE_ENHANCED_MPSW, olx);
+		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
+				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
+		eseg = &loc->wqe_last->eseg;
+		dseg = &loc->wqe_last->dseg[0];
+		/* loop counts the packets still to be put into the batch. */
+		loop = part;
+		for (;;) {
+			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
+			/*
+			 * Some Tx offloads may cause an error if
+			 * packet is not long enough, check against
+			 * assumed minimal length.
+			 */
+			if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
+				part -= loop;
+				if (unlikely(!part))
+					return MLX5_TXCMP_CODE_ERROR;
+				/*
+				 * We have some successfully built
+				 * packet Data Segments to send.
+				 */
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			/* Update sent data bytes counter. */
+			slen += dlen;
+#endif
+			mlx5_tx_dseg_ptr
+				(txq, loc, dseg,
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 dlen, olx);
+			if (unlikely(--loop == 0))
+				break;
+			loc->mbuf = *pkts++;
+			if (likely(loop > 1))
+				rte_prefetch0(*pkts);
+			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+			/*
+			 * Unroll the completion code to avoid
+			 * returning variable value - it results in
+			 * unoptimized sequent checking in caller.
+			 */
+			if (ret == MLX5_TXCMP_CODE_MULTI) {
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_MULTI;
+			}
+			if (ret == MLX5_TXCMP_CODE_TSO) {
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_TSO;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE) {
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_SINGLE;
+			}
+			if (ret != MLX5_TXCMP_CODE_EMPW) {
+				assert(false);
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+			/*
+			 * Check whether packet parameters coincide
+			 * within assumed eMPW batch:
+			 * - check sum settings
+			 * - metadata value
+			 * - software parser settings
+			 */
+			if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) {
+				assert(loop);
+				part -= loop;
+				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				/*
+				 * The packets of the closed batch are
+				 * sent and their bytes are accounted.
+				 * The jump does not pass the loop head,
+				 * so the remaining packets counter and
+				 * the bytes accumulator must be updated
+				 * here, otherwise the next batch would
+				 * be sized by stale pkts_n and the bytes
+				 * would be counted twice.
+				 */
+				pkts_n -= part;
+				slen = 0;
+				goto next_empw;
+			}
+			/* Packet attributes match, continue the same eMPW. */
+			++dseg;
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+		}
+		/* eMPW is built successfully, update loop parameters. */
+		assert(!loop);
+		assert(pkts_n >= part);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Update sent data bytes counter. */
+		txq->stats.obytes += slen;
+#endif
+		loc->elts_free -= part;
+		loc->pkts_sent += part;
+		txq->wqe_ci += (2 + part + 3) / 4;
+		loc->wqe_free -= (2 + part + 3) / 4;
+		pkts_n -= part;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		loc->mbuf = *pkts++;
+		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+		if (unlikely(ret != MLX5_TXCMP_CODE_EMPW))
+			return ret;
+		/* Continue sending eMPW batches. */
+	}
+	assert(false);
+}
+
+/**
+ * The routine sends packets with MLX5_OPCODE_EMPW
+ * with inlining, optionally supports VLAN insertion.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context, loc->mbuf must
+ *   be the packet at index loc->pkts_sent of the array.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - too short packet encountered.
+ *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
+ *   MLX5_TXCMP_CODE_SINGLE - packet to send with SEND encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
+			  struct rte_mbuf **restrict pkts,
+			  unsigned int pkts_n,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	/*
+	 * Subroutine is the part of mlx5_tx_burst_single()
+	 * and sends single-segment packet with eMPW opcode
+	 * with data inlining.
+	 */
+	assert(MLX5_TXOFF_CONFIG(INLINE));
+	assert(MLX5_TXOFF_CONFIG(EMPW));
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	/*
+	 * Every iteration of the outer loop builds one eMPW session.
+	 * The session variables (including the bytes accumulator)
+	 * are initialized anew on each iteration, loc->mbuf is always
+	 * the first not yet sent packet at the loop head.
+	 */
+	for (;;) {
+		struct mlx5_wqe_dseg *restrict dseg;
+		struct mlx5_wqe_eseg *restrict eseg;
+		enum mlx5_txcmp_code ret;
+		unsigned int room, part;
+		unsigned int slen = 0;
+
+		/* Check whether we have minimal amount WQEs */
+		if (unlikely(loc->wqe_free <
+			    ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
+			return MLX5_TXCMP_CODE_EXIT;
+		if (likely(pkts_n > 1))
+			rte_prefetch0(*pkts);
+		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		/*
+		 * Build eMPW title WQEBB:
+		 * - Control Segment, eMPW opcode, zero DS
+		 * - Ethernet Segment, no inline
+		 */
+		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0,
+				  MLX5_OPCODE_ENHANCED_MPSW, olx);
+		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
+				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
+		eseg = &loc->wqe_last->eseg;
+		dseg = &loc->wqe_last->dseg[0];
+		room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
+			       loc->wqe_free) * MLX5_WQE_SIZE -
+					MLX5_WQE_CSEG_SIZE -
+					MLX5_WQE_ESEG_SIZE;
+		/*
+		 * Build WQE till we have space, packets and resources.
+		 * part keeps the initial room, so (part - room) is the
+		 * number of bytes pushed to the WQE so far.
+		 */
+		part = room;
+		for (;;) {
+			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
+			uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+			unsigned int tlen;
+
+			assert(room >= MLX5_WQE_DSEG_SIZE);
+			assert((room % MLX5_WQE_DSEG_SIZE) == 0);
+			assert((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
+			/*
+			 * Some Tx offloads may cause an error if
+			 * packet is not long enough, check against
+			 * assumed minimal length.
+			 */
+			if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
+				part -= room;
+				if (unlikely(!part))
+					return MLX5_TXCMP_CODE_ERROR;
+				/*
+				 * We have some successfully built
+				 * packet Data Segments to send.
+				 */
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+			/* Inline or not inline - that's the Question. */
+			if (dlen > txq->inlen_empw)
+				goto pointer_empw;
+			/* Inline entire packet, optional VLAN insertion. */
+			tlen = sizeof(dseg->bcount) + dlen;
+			if (MLX5_TXOFF_CONFIG(VLAN) &&
+			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+				/*
+				 * The packet length must be checked in
+				 * mlx5_tx_able_to_empw() and packet
+				 * fits into inline length guaranteed.
+				 */
+				assert((dlen + sizeof(struct rte_vlan_hdr)) <=
+					txq->inlen_empw);
+				tlen += sizeof(struct rte_vlan_hdr);
+				if (room < tlen)
+					break;
+				dseg = mlx5_tx_dseg_vlan(txq, loc, dseg,
+							 dptr, dlen, olx);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+				/* Update sent data bytes counter. */
+				slen +=	sizeof(struct rte_vlan_hdr);
+#endif
+			} else {
+				if (room < tlen)
+					break;
+				dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
+							 dptr, dlen, olx);
+			}
+			tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
+			assert(room >= tlen);
+			room -= tlen;
+			/*
+			 * Packet data are completely inlined,
+			 * free the packet immediately.
+			 */
+			rte_pktmbuf_free_seg(loc->mbuf);
+			goto next_mbuf;
+pointer_empw:
+			/*
+			 * Not inlinable VLAN packets are
+			 * proceeded outside of this routine.
+			 */
+			assert(room >= MLX5_WQE_DSEG_SIZE);
+			if (MLX5_TXOFF_CONFIG(VLAN))
+				assert(!(loc->mbuf->ol_flags &
+					 PKT_TX_VLAN_PKT));
+			mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
+			/* We have to store mbuf in elts.*/
+			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+			room -= MLX5_WQE_DSEG_SIZE;
+			/* Ring buffer wraparound is checked at the loop end.*/
+			++dseg;
+next_mbuf:
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			/* Update sent data bytes counter. */
+			slen += dlen;
+#endif
+			loc->pkts_sent++;
+			loc->elts_free--;
+			pkts_n--;
+			if (unlikely(!pkts_n || !loc->elts_free)) {
+				/*
+				 * We have no resources/packets to
+				 * continue build descriptors.
+				 */
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				return MLX5_TXCMP_CODE_EXIT;
+			}
+			/*
+			 * Fetch and classify the next packet before
+			 * any decision about the session is taken.
+			 * loc->mbuf must never refer to the packet
+			 * which is sent (and possibly freed) already
+			 * when a new session is started.
+			 */
+			loc->mbuf = *pkts++;
+			if (likely(pkts_n > 1))
+				rte_prefetch0(*pkts);
+			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+			/*
+			 * Unroll the completion code to avoid
+			 * returning variable value - it results in
+			 * unoptimized sequent checking in caller.
+			 */
+			if (ret == MLX5_TXCMP_CODE_MULTI) {
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_MULTI;
+			}
+			if (ret == MLX5_TXCMP_CODE_TSO) {
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_TSO;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE) {
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				if (unlikely(!loc->elts_free ||
+					     !loc->wqe_free))
+					return MLX5_TXCMP_CODE_EXIT;
+				return MLX5_TXCMP_CODE_SINGLE;
+			}
+			if (ret != MLX5_TXCMP_CODE_EMPW) {
+				assert(false);
+				part -= room;
+				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				return MLX5_TXCMP_CODE_ERROR;
+			}
+			/*
+			 * Close the session if there is no minimal
+			 * room left in the WQE or if the packet
+			 * parameters do not coincide within assumed
+			 * eMPW batch:
+			 * - check sum settings
+			 * - metadata value
+			 * - software parser settings
+			 */
+			if (room < MLX5_WQE_DSEG_SIZE ||
+			    !mlx5_tx_match_empw(txq, eseg, loc, olx))
+				break;
+			/* Packet attributes match, continue the same eMPW. */
+			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+		}
+		/*
+		 * We get here to close an existing eMPW
+		 * session and start the new one.
+		 */
+		assert(pkts_n);
+		part -= room;
+		if (unlikely(!part))
+			return MLX5_TXCMP_CODE_EXIT;
+		mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+		if (unlikely(!loc->elts_free ||
+			     !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		/* Continue the loop with new eMPW session. */
+	}
+	assert(false);
+}
+
+/**
+ * The routine sends packets with ordinary MLX5_OPCODE_SEND.
+ * Data inlining and VLAN insertion are supported.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context, loc->mbuf must
+ *   be the packet at index loc->pkts_sent of the array.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_ERROR - too short packet encountered.
+ *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
+ *   MLX5_TXCMP_CODE_EMPW - packet to send with eMPW encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
+			  struct rte_mbuf **restrict pkts,
+			  unsigned int pkts_n,
+			  struct mlx5_txq_local *restrict loc,
+			  unsigned int olx)
+{
+	/*
+	 * Subroutine is the part of mlx5_tx_burst_single()
+	 * and sends single-segment packet with SEND opcode.
+	 */
+	assert(loc->elts_free && loc->wqe_free);
+	assert(pkts_n > loc->pkts_sent);
+	pkts += loc->pkts_sent + 1;
+	pkts_n -= loc->pkts_sent;
+	for (;;) {
+		struct mlx5_wqe *restrict wqe;
+		enum mlx5_txcmp_code ret;
+
+		assert(NB_SEGS(loc->mbuf) == 1);
+		if (MLX5_TXOFF_CONFIG(INLINE)) {
+			unsigned int inlen, vlan = 0;
+
+			inlen = rte_pktmbuf_data_len(loc->mbuf);
+			if (MLX5_TXOFF_CONFIG(VLAN) &&
+			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+				vlan = sizeof(struct rte_vlan_hdr);
+				inlen += vlan;
+				static_assert((sizeof(struct rte_vlan_hdr) +
+					       sizeof(struct rte_ether_hdr)) ==
+					       MLX5_ESEG_MIN_INLINE_SIZE,
+					       "invalid min inline data size");
+			}
+			/*
+			 * If inlining is enabled at configuration time
+			 * the limit must be not less than minimal size.
+			 * Otherwise we would do extra check for data
+			 * size to avoid crashes due to length overflow.
+			 */
+			assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+			if (inlen <= txq->inlen_send) {
+				unsigned int seg_n, wqe_n;
+
+				rte_prefetch0(rte_pktmbuf_mtod
+						(loc->mbuf, uint8_t *));
+				/* Check against minimal length. */
+				if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
+					return MLX5_TXCMP_CODE_ERROR;
+				/*
+				 * Completely inlined packet data WQE:
+				 * - Control Segment, SEND opcode
+				 * - Ethernet Segment, no VLAN insertion
+				 * - Data inlined, VLAN optionally inserted
+				 * - Alignment to MLX5_WSEG_SIZE
+				 * Have to estimate amount of WQEBBs
+				 */
+				seg_n = (inlen + 3 * MLX5_WSEG_SIZE -
+					 MLX5_ESEG_MIN_INLINE_SIZE +
+					 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+				/* Check if there are enough WQEBBs. */
+				wqe_n = (seg_n + 3) / 4;
+				if (wqe_n > loc->wqe_free)
+					return MLX5_TXCMP_CODE_EXIT;
+				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+				loc->wqe_last = wqe;
+				mlx5_tx_cseg_init(txq, loc, wqe, seg_n,
+						  MLX5_OPCODE_SEND, olx);
+				mlx5_tx_eseg_data(txq, loc, wqe,
+						  vlan, inlen, 0, olx);
+				txq->wqe_ci += wqe_n;
+				loc->wqe_free -= wqe_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+				/*
+				 * Update sent data bytes counter. It must
+				 * be done before the mbuf is freed, inlen
+				 * is the data length plus optional VLAN.
+				 */
+				txq->stats.obytes += inlen;
+#endif
+				/*
+				 * Packet data are completely inlined,
+				 * free the packet immediately. The mbuf
+				 * must not be touched after this point.
+				 */
+				rte_pktmbuf_free_seg(loc->mbuf);
+			} else if (!MLX5_TXOFF_CONFIG(EMPW) &&
+				   txq->inlen_mode) {
+				/*
+				 * If minimal inlining is requested the eMPW
+				 * feature should be disabled due to data is
+				 * inlined into Ethernet Segment, which can
+				 * not contain inlined data for eMPW due to
+				 * segment shared for all packets.
+				 */
+				struct mlx5_wqe_dseg *restrict dseg;
+				unsigned int ds;
+				uint8_t *dptr;
+
+				/*
+				 * The inline-mode settings require
+				 * to inline the specified amount of
+				 * data bytes to the Ethernet Segment.
+				 * We should check the free space in
+				 * WQE ring buffer to inline partially.
+				 */
+				assert(txq->inlen_send >= txq->inlen_mode);
+				assert(inlen > txq->inlen_mode);
+				assert(txq->inlen_mode >=
+						MLX5_ESEG_MIN_INLINE_SIZE);
+				/*
+				 * Check whether there are enough free WQEBBs:
+				 * - Control Segment
+				 * - Ethernet Segment
+				 * - First Segment of inlined Ethernet data
+				 * - ... data continued ...
+				 * - Finishing Data Segment of pointer type
+				 */
+				ds = (MLX5_WQE_CSEG_SIZE +
+				      MLX5_WQE_ESEG_SIZE +
+				      MLX5_WQE_DSEG_SIZE +
+				      txq->inlen_mode -
+				      MLX5_ESEG_MIN_INLINE_SIZE +
+				      MLX5_WQE_DSEG_SIZE +
+				      MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+				if (loc->wqe_free < ((ds + 3) / 4))
+					return MLX5_TXCMP_CODE_EXIT;
+				/*
+				 * Build the ordinary SEND WQE:
+				 * - Control Segment
+				 * - Ethernet Segment, inline inlen_mode bytes
+				 * - Data Segment of pointer type
+				 */
+				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+				loc->wqe_last = wqe;
+				mlx5_tx_cseg_init(txq, loc, wqe, ds,
+						  MLX5_OPCODE_SEND, olx);
+				dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan,
+							 txq->inlen_mode,
+							 0, olx);
+				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
+				       txq->inlen_mode - vlan;
+				inlen -= txq->inlen_mode;
+				mlx5_tx_dseg_ptr(txq, loc, dseg,
+						 dptr, inlen, olx);
+				/*
+				 * WQE is built, update the loop parameters
+				 * and go to the next packet.
+				 */
+				txq->wqe_ci += (ds + 3) / 4;
+				loc->wqe_free -= (ds + 3) / 4;
+				/* We have to store mbuf in elts.*/
+				assert(MLX5_TXOFF_CONFIG(INLINE));
+				txq->elts[txq->elts_head++ & txq->elts_m] =
+						loc->mbuf;
+				--loc->elts_free;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+				/* Update sent data bytes counter. */
+				txq->stats.obytes += vlan +
+					rte_pktmbuf_data_len(loc->mbuf);
+#endif
+			} else {
+				uint8_t *dptr;
+				unsigned int dlen;
+
+				/*
+				 * Partially inlined packet data WQE, we have
+				 * some space in title WQEBB, we can fill it
+				 * with some packet data. It takes one WQEBB,
+				 * it is available, no extra space check:
+				 * - Control Segment, SEND opcode
+				 * - Ethernet Segment, no VLAN insertion
+				 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data
+				 * - Data Segment, pointer type
+				 */
+				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+				loc->wqe_last = wqe;
+				mlx5_tx_cseg_init(txq, loc, wqe, 4,
+						  MLX5_OPCODE_SEND, olx);
+				mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx);
+				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
+				       MLX5_ESEG_MIN_INLINE_SIZE - vlan;
+				/*
+				 * The length check is performed above, by
+				 * comparing with txq->inlen_send. We should
+				 * not get overflow here.
+				 */
+				assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
+				dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
+				mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
+						 dptr, dlen, olx);
+				++txq->wqe_ci;
+				--loc->wqe_free;
+				/* We have to store mbuf in elts.*/
+				assert(MLX5_TXOFF_CONFIG(INLINE));
+				txq->elts[txq->elts_head++ & txq->elts_m] =
+						loc->mbuf;
+				--loc->elts_free;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+				/* Update sent data bytes counter. */
+				txq->stats.obytes += vlan +
+					rte_pktmbuf_data_len(loc->mbuf);
+#endif
+			}
+		} else {
+			/*
+			 * No inline at all, it means the CPU cycles saving
+			 * is prioritized at configuration, we should not
+			 * copy any packet data to WQE.
+			 *
+			 * SEND WQE, one WQEBB:
+			 * - Control Segment, SEND opcode
+			 * - Ethernet Segment, optional VLAN, no inline
+			 * - Data Segment, pointer type
+			 */
+			wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+			loc->wqe_last = wqe;
+			mlx5_tx_cseg_init(txq, loc, wqe, 3,
+					  MLX5_OPCODE_SEND, olx);
+			mlx5_tx_eseg_none(txq, loc, wqe, olx);
+			/*
+			 * Some Tx offloads may cause an error if
+			 * packet is not long enough, check against
+			 * assumed minimal length.
+			 */
+			if (rte_pktmbuf_data_len(loc->mbuf) <=
+						 MLX5_ESEG_MIN_INLINE_SIZE)
+				return MLX5_TXCMP_CODE_ERROR;
+			mlx5_tx_dseg_ptr
+				(txq, loc, &wqe->dseg[0],
+				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+				 rte_pktmbuf_data_len(loc->mbuf), olx);
+			++txq->wqe_ci;
+			--loc->wqe_free;
+			/*
+			 * We should not store mbuf pointer in elts
+			 * if no inlining is configured, this is done
+			 * by calling routine in a batch copy.
+			 */
+			assert(!MLX5_TXOFF_CONFIG(INLINE));
+			--loc->elts_free;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			/* Update sent data bytes counter. */
+			txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf);
+			if (MLX5_TXOFF_CONFIG(VLAN) &&
+			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+				txq->stats.obytes +=
+					sizeof(struct rte_vlan_hdr);
+#endif
+		}
+		++loc->pkts_sent;
+		--pkts_n;
+		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+			return MLX5_TXCMP_CODE_EXIT;
+		loc->mbuf = *pkts++;
+		if (pkts_n > 1)
+			rte_prefetch0(*pkts);
+		/* Leave the routine if the next packet needs another method. */
+		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+		if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
+			return ret;
+	}
+	assert(false);
+}
+
+/**
+ * Send the series of single-segment packets without TSO,
+ * alternating between the eMPW and the ordinary SEND routines
+ * according to the code returned by the previous one.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   The first code returned by a subroutine which can not
+ *   be handled by switching between eMPW and SEND.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
+		     struct rte_mbuf **restrict pkts,
+		     unsigned int pkts_n,
+		     struct mlx5_txq_local *restrict loc,
+		     unsigned int olx)
+{
+	enum mlx5_txcmp_code ret;
+
+	/* Multi-segment and TSO checks are skipped for the first packet. */
+	ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
+	if (ret == MLX5_TXCMP_CODE_SINGLE)
+		goto ordinary_send;
+	assert(ret == MLX5_TXCMP_CODE_EMPW);
+	for (;;) {
+		/* Dedicated eMPW routines for inline/no inline cases. */
+		if (MLX5_TXOFF_CONFIG(INLINE))
+			ret = mlx5_tx_burst_empw_inline
+				(txq, pkts, pkts_n, loc, olx);
+		else
+			ret = mlx5_tx_burst_empw_simple
+				(txq, pkts, pkts_n, loc, olx);
+		/* Only the SEND request is handled in this loop. */
+		if (ret != MLX5_TXCMP_CODE_SINGLE)
+			return ret;
+		/* The resources to send one packet should remain. */
+		assert(loc->elts_free && loc->wqe_free);
+ordinary_send:
+		ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
+		/* Only the eMPW request is handled in this loop. */
+		if (ret != MLX5_TXCMP_CODE_EMPW)
+			return ret;
+		/* The resources to send one packet should remain. */
+		assert(loc->elts_free && loc->wqe_free);
+	}
+}
+
+/**
+ * DPDK Tx callback template. This is configured template
+ * used to generate routines optimized for specified offload setup.
+ * One of this generated functions is chosen at SQ configuration
+ * time.
+ *
+ * The routine handles the completions, then dispatches the packets
+ * between the multi-segment, TSO and single-segment subroutines
+ * according to the codes they return, and finally requests the
+ * completion, rings the doorbell and stores the remaining mbufs.
+ *
+ * @param txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ * @param olx
+ *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
+ *   values. Should be static to take compile time static configuration
+ *   advantages.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+static __rte_always_inline uint16_t
+mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
+		   struct rte_mbuf **restrict pkts,
+		   uint16_t pkts_n,
+		   unsigned int olx)
+{
+	struct mlx5_txq_local loc;
+	enum mlx5_txcmp_code ret;
+	unsigned int part;
+
+	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	/* Nothing to do for the empty burst. */
+	if (unlikely(!pkts_n))
+		return 0;
+	rte_prefetch0(*pkts);
+	/*
+	 * Check if there are some CQEs, if any:
+	 * - process an encountered errors
+	 * - process the completed WQEs
+	 * - free related mbufs
+	 * - doorbell the NIC about processed CQEs
+	 */
+	mlx5_tx_handle_completion(txq, olx);
+	/*
+	 * Calculate the number of available resources - elts and WQEs.
+	 * There are two possible different scenarios:
+	 * - no data inlining into WQEs, one WQEBB may contains upto
+	 *   four packets, in this case elts become scarce resource
+	 * - data inlining into WQEs, one packet may require multiple
+	 *   WQEBBs, the WQEs become the limiting factor.
+	 */
+	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	loc.elts_free = txq->elts_s -
+				(uint16_t)(txq->elts_head - txq->elts_tail);
+	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	loc.wqe_free = txq->wqe_s -
+				(uint16_t)(txq->wqe_ci - txq->wqe_pi);
+	if (unlikely(!loc.elts_free || !loc.wqe_free))
+		return 0;
+	loc.pkts_sent = 0;
+	loc.pkts_copy = 0;
+	loc.wqe_last = NULL;
+	for (;;) {
+		/*
+		 * Fetch the packet from array. Usually this is
+		 * the first packet in series of multi/single
+		 * segment packets.
+		 */
+		loc.mbuf = *(pkts + loc.pkts_sent);
+		/* Dedicated branch for multi-segment packets. */
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    unlikely(NB_SEGS(loc.mbuf) > 1)) {
+			/*
+			 * Multi-segment packet encountered.
+			 * Hardware is able to process it only
+			 * with SEND/TSO opcodes, one packet
+			 * per WQE, do it in dedicated routine.
+			 */
+enter_send_multi:
+			assert(loc.pkts_sent >= loc.pkts_copy);
+			part = loc.pkts_sent - loc.pkts_copy;
+			if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
+				/*
+				 * There are some single-segment mbufs not
+				 * stored in elts. The mbufs must be in the
+				 * same order as WQEs, so we must copy the
+				 * mbufs to elts here, before the coming
+				 * multi-segment packet mbufs is appended.
+				 */
+				mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy,
+						  part, olx);
+				loc.pkts_copy = loc.pkts_sent;
+			}
+			assert(pkts_n > loc.pkts_sent);
+			ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
+			if (!MLX5_TXOFF_CONFIG(INLINE))
+				loc.pkts_copy = loc.pkts_sent;
+			/*
+			 * These returned code checks are supposed
+			 * to be optimized out due to routine inlining.
+			 */
+			if (ret == MLX5_TXCMP_CODE_EXIT) {
+				/*
+				 * The routine returns this code when
+				 * all packets are sent or there is no
+				 * enough resources to complete request.
+				 */
+				break;
+			}
+			if (ret == MLX5_TXCMP_CODE_ERROR) {
+				/*
+				 * The routine returns this code when
+				 * some error in the incoming packets
+				 * format occurred.
+				 */
+				txq->stats.oerrors++;
+				break;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE) {
+				/*
+				 * The single-segment packet was encountered
+				 * in the array, try to send it with the
+				 * best optimized way, possible engaging eMPW.
+				 */
+				goto enter_send_single;
+			}
+			if (MLX5_TXOFF_CONFIG(TSO) &&
+			    ret == MLX5_TXCMP_CODE_TSO) {
+				/*
+				 * The single-segment TSO packet was
+				 * encountered in the array.
+				 */
+				goto enter_send_tso;
+			}
+			/* We must not get here. Something is going wrong. */
+			assert(false);
+			txq->stats.oerrors++;
+			break;
+		}
+		/* Dedicated branch for single-segment TSO packets. */
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) {
+			/*
+			 * TSO might require special way for inlining
+			 * (dedicated parameters) and is sent with
+			 * MLX5_OPCODE_TSO opcode only, provide this
+			 * in dedicated branch.
+			 */
+enter_send_tso:
+			assert(NB_SEGS(loc.mbuf) == 1);
+			assert(pkts_n > loc.pkts_sent);
+			ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
+			/*
+			 * These returned code checks are supposed
+			 * to be optimized out due to routine inlining.
+			 */
+			if (ret == MLX5_TXCMP_CODE_EXIT)
+				break;
+			if (ret == MLX5_TXCMP_CODE_ERROR) {
+				txq->stats.oerrors++;
+				break;
+			}
+			if (ret == MLX5_TXCMP_CODE_SINGLE)
+				goto enter_send_single;
+			if (MLX5_TXOFF_CONFIG(MULTI) &&
+			    ret == MLX5_TXCMP_CODE_MULTI) {
+				/*
+				 * The multi-segment packet was
+				 * encountered in the array.
+				 */
+				goto enter_send_multi;
+			}
+			/* We must not get here. Something is going wrong. */
+			assert(false);
+			txq->stats.oerrors++;
+			break;
+		}
+		/*
+		 * The dedicated branch for the single-segment packets
+		 * without TSO. Often these ones can be sent using
+		 * MLX5_OPCODE_EMPW with multiple packets in one WQE.
+		 * The routine builds the WQEs till it encounters
+		 * the TSO or multi-segment packet (in case if these
+		 * offloads are requested at SQ configuration time).
+		 */
+enter_send_single:
+		assert(pkts_n > loc.pkts_sent);
+		ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
+		/*
+		 * These returned code checks are supposed
+		 * to be optimized out due to routine inlining.
+		 */
+		if (ret == MLX5_TXCMP_CODE_EXIT)
+			break;
+		if (ret == MLX5_TXCMP_CODE_ERROR) {
+			txq->stats.oerrors++;
+			break;
+		}
+		if (ret == MLX5_TXCMP_CODE_SINGLE)
+			goto enter_send_single;
+		if (MLX5_TXOFF_CONFIG(MULTI) &&
+		    ret == MLX5_TXCMP_CODE_MULTI) {
+			/*
+			 * The multi-segment packet was
+			 * encountered in the array.
+			 */
+			goto enter_send_multi;
+		}
+		if (MLX5_TXOFF_CONFIG(TSO) &&
+		    ret == MLX5_TXCMP_CODE_TSO) {
+			/*
+			 * The single-segment TSO packet was
+			 * encountered in the array. The single
+			 * packet routines stop on such a packet
+			 * and report it, it is valid and must
+			 * be sent by the TSO branch.
+			 */
+			goto enter_send_tso;
+		}
+		/* We must not get here. Something is going wrong. */
+		assert(false);
+		txq->stats.oerrors++;
+		break;
+	}
+	/*
+	 * Main Tx loop is completed, do the rest:
+	 * - set completion request if thresholds are reached
+	 * - doorbell the hardware
+	 * - copy the rest of mbufs to elts (if any)
+	 */
+	assert(MLX5_TXOFF_CONFIG(INLINE) || loc.pkts_sent >= loc.pkts_copy);
+	/* Take a shortcut if nothing is sent. */
+	if (unlikely(loc.pkts_sent == 0))
+		return 0;
+	/* Not all of the mbufs may be stored into elts yet. */
+	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
+	mlx5_tx_request_completion(txq, part, &loc, olx);
+	/*
+	 * Ring QP doorbell immediately after WQE building completion
+	 * to improve latencies. The pure software related data treatment
+	 * can be completed after doorbell. Tx CQEs for this SQ are
+	 * processed in this thread only by the polling.
+	 */
+	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0);
+	if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
+		/*
+		 * There are some single-segment mbufs not stored in elts.
+		 * It can be only if last packet was single-segment.
+		 * The copying is gathered into one place due to it is
+		 * a good opportunity to optimize that with SIMD.
+		 * Unfortunately if inlining is enabled the gaps in
+		 * pointer array may happen due to early freeing of the
+		 * inlined mbufs.
+		 */
+		mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
+	}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Increment sent packets counter. */
+	txq->stats.opackets += loc.pkts_sent;
+#endif
+	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	return loc.pkts_sent;
 }
 
 /* Generate routines with Enhanced Multi-Packet Write support. */
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index d8c6f35..4439e88 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -354,8 +354,9 @@ struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
 void mlx5_set_ptype_table(void);
 void mlx5_set_cksum_table(void);
 void mlx5_set_swp_types_table(void);
-__rte_noinline uint16_t mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
-					volatile struct mlx5_err_cqe *err_cqe);
+__rte_noinline uint16_t mlx5_tx_error_cqe_handle
+				(struct mlx5_txq_data *restrict txq,
+				 volatile struct mlx5_err_cqe *err_cqe);
 uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
 void mlx5_rxq_initialize(struct mlx5_rxq_data *rxq);
 __rte_noinline int mlx5_rx_err_handle(struct mlx5_rxq_data *rxq,
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 7/8] net/mlx5: add minimal required Tx data inline
  2019-07-17  6:53     ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                         ` (5 preceding siblings ...)
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 6/8] net/mlx5: implement Tx burst template Viacheslav Ovsiienko
@ 2019-07-17  6:53       ` Viacheslav Ovsiienko
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 8/8] net/mlx5: report supported max number of mbuf segments Viacheslav Ovsiienko
  2019-07-21 14:24       ` [dpdk-dev] [PATCH v4 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  8 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-17  6:53 UTC (permalink / raw)
  To: dev; +Cc: yskoh

Tx packet data may be inlined into the transmit descriptor.
Under some circumstances ConnectX NICs may require data to be
inlined for correct operation. The exact amount of data may
depend on the NIC operation mode, requested Tx offloads,
E-Switch configuration, etc.

The number of data bytes to inline may be specified with
the devargs key "txq_inline_min". If this key is present, the
specified value (which may be aligned by the driver in order
not to exceed the limits and to provide better descriptor
space utilization) will be used by the driver, and it
is guaranteed that the requested data bytes are inlined into
the descriptor alongside the other inline settings.

If the "txq_inline_min" key is not present, the value may
be queried by the driver from the NIC via DevX, if this
feature is available. This patch provides the implementation
of this query.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c           |  93 +++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5.h           |   4 ++
 drivers/net/mlx5/mlx5_defs.h      |  18 ++++++
 drivers/net/mlx5/mlx5_devx_cmds.c | 100 ++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_prm.h       | 122 +++++++++++++++++++++++++++++++++++++-
 5 files changed, 335 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index f1e125e..608daed 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1139,6 +1139,97 @@ struct mlx5_dev_spawn_data {
 }
 
 /**
+ * Configures the minimal amount of data to inline into WQE
+ * while sending packets.
+ *
+ * - the txq_inline_min has the maximal priority, if this
+ *   key is specified in devargs
+ * - if DevX is enabled the inline mode is queried from the
+ *   device (HCA attributes and NIC vport context if needed).
+ * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4LX
+ *   and none (0 bytes) for other NICs
+ *
+ * @param spawn
+ *   Verbs device parameters (name, port, switch_info) to spawn.
+ * @param config
+ *   Device configuration parameters.
+ */
+static void
+mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
+		    struct mlx5_dev_config *config)
+{
+	if (config->txq_inline_min != MLX5_ARG_UNSET) {
+		/* Application defines size of inlined data explicitly. */
+		goto exit;
+	}
+	if (config->hca_attr.eth_net_offloads) {
+		/* We have DevX enabled, inline mode queried successfully. */
+		switch (config->hca_attr.wqe_inline_mode) {
+		case MLX5_CAP_INLINE_MODE_L2:
+			/* outer L2 header must be inlined. */
+			config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
+			goto exit;
+		case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+			/* No inline data are required by NIC. */
+			config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
+			goto exit;
+		case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+			/* inline mode is defined by NIC vport context. */
+			if (!config->hca_attr.eth_virt)
+				break;
+			switch (config->hca_attr.vport_inline_mode) {
+			case MLX5_INLINE_MODE_NONE:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_NONE;
+				goto exit;
+			case MLX5_INLINE_MODE_L2:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_L2;
+				goto exit;
+			case MLX5_INLINE_MODE_IP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_L3;
+				goto exit;
+			case MLX5_INLINE_MODE_TCP_UDP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_L4;
+				goto exit;
+			case MLX5_INLINE_MODE_INNER_L2:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_INNER_L2;
+				goto exit;
+			case MLX5_INLINE_MODE_INNER_IP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_INNER_L3;
+				goto exit;
+			case MLX5_INLINE_MODE_INNER_TCP_UDP:
+				config->txq_inline_min =
+					MLX5_INLINE_HSIZE_INNER_L4;
+				goto exit;
+			}
+		}
+	}
+	/*
+	 * We get here if we are unable to deduce
+	 * inline data size with DevX. Try PCI ID
+	 * to determine old NICs.
+	 */
+	switch (spawn->pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+		config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
+		break;
+	default:
+		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
+		break;
+	}
+exit:
+	DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
+}
+
+/**
  * Spawn an Ethernet device from Verbs information.
  *
  * @param dpdk_dev
@@ -1631,6 +1722,8 @@ struct mlx5_dev_spawn_data {
 #else
 	config.dv_esw_en = 0;
 #endif
+	/* Detect minimal data bytes to inline. */
+	mlx5_set_min_inline(spawn, &config);
 	/* Store device configuration on private structure. */
 	priv->config = config;
 	if (config.dv_flow_en) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index eef9a55..f254c8d 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -170,6 +170,10 @@ struct mlx5_hca_attr {
 	uint32_t eswitch_manager:1;
 	uint32_t flow_counters_dump:1;
 	uint8_t flow_counter_bulk_alloc_bitmap;
+	uint32_t eth_net_offloads:1;
+	uint32_t eth_virt:1;
+	uint32_t wqe_inline_mode:2;
+	uint32_t vport_inline_mode:3;
 };
 
 /* Flow list . */
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 873a595..8c118d5 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -60,6 +60,24 @@
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
 #define MLX5_MAX_TSO_HEADER (128u + 34u)
 
+/* Inline data size required by NICs. */
+#define MLX5_INLINE_HSIZE_NONE 0
+#define MLX5_INLINE_HSIZE_L2 (sizeof(struct rte_ether_hdr) + \
+			      sizeof(struct rte_vlan_hdr))
+#define MLX5_INLINE_HSIZE_L3 (MLX5_INLINE_HSIZE_L2 + \
+			      sizeof(struct rte_ipv6_hdr))
+#define MLX5_INLINE_HSIZE_L4 (MLX5_INLINE_HSIZE_L3 + \
+			      sizeof(struct rte_tcp_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L2 (MLX5_INLINE_HSIZE_L3 + \
+				    sizeof(struct rte_udp_hdr) + \
+				    sizeof(struct rte_vxlan_hdr) + \
+				    sizeof(struct rte_ether_hdr) + \
+				    sizeof(struct rte_vlan_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L3 (MLX5_INLINE_HSIZE_INNER_L2 + \
+				    sizeof(struct rte_ipv6_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L4 (MLX5_INLINE_HSIZE_INNER_L3 + \
+				    sizeof(struct rte_tcp_hdr))
+
 /* Threshold of buffer replenishment for vectorized Rx. */
 #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
 	(RTE_MIN(MLX5_VPMD_RX_MAX_BURST, (unsigned int)(n) >> 2))
diff --git a/drivers/net/mlx5/mlx5_devx_cmds.c b/drivers/net/mlx5/mlx5_devx_cmds.c
index d26d5bc..4ee06db 100644
--- a/drivers/net/mlx5/mlx5_devx_cmds.c
+++ b/drivers/net/mlx5/mlx5_devx_cmds.c
@@ -230,6 +230,59 @@ struct mlx5_devx_obj *
 }
 
 /**
+ * Query NIC vport context.
+ * Fills minimal inline attribute.
+ *
+ * @param[in] ctx
+ *   ibv contexts returned from mlx5dv_open_device.
+ * @param[in] vport
+ *   vport index
+ * @param[out] attr
+ *   Attributes device values.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+static int
+mlx5_devx_cmd_query_nic_vport_context(struct ibv_context *ctx,
+				      unsigned int vport,
+				      struct mlx5_hca_attr *attr)
+{
+	uint32_t in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0};
+	uint32_t out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {0};
+	void *vctx;
+	int status, syndrome, rc;
+
+	/* Query NIC vport context to determine inline mode. */
+	MLX5_SET(query_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
+	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
+	if (vport)
+		MLX5_SET(query_nic_vport_context_in, in, other_vport, 1);
+	rc = mlx5_glue->devx_general_cmd(ctx,
+					 in, sizeof(in),
+					 out, sizeof(out));
+	if (rc)
+		goto error;
+	status = MLX5_GET(query_nic_vport_context_out, out, status);
+	syndrome = MLX5_GET(query_nic_vport_context_out, out, syndrome);
+	if (status) {
+		DRV_LOG(DEBUG, "Failed to query NIC vport context, "
+			"status %x, syndrome = %x",
+			status, syndrome);
+		return -1;
+	}
+	vctx = MLX5_ADDR_OF(query_nic_vport_context_out, out,
+			    nic_vport_context);
+	attr->vport_inline_mode = MLX5_GET(nic_vport_context, vctx,
+					   min_wqe_inline_mode);
+	return 0;
+error:
+	rc = (rc > 0) ? -rc : rc;
+	return rc;
+}
+
+/**
  * Query HCA attributes.
  * Using those attributes we can check on run time if the device
  * is having the required capabilities.
@@ -259,7 +312,7 @@ struct mlx5_devx_obj *
 	rc = mlx5_glue->devx_general_cmd(ctx,
 					 in, sizeof(in), out, sizeof(out));
 	if (rc)
-		return rc;
+		goto error;
 	status = MLX5_GET(query_hca_cap_out, out, status);
 	syndrome = MLX5_GET(query_hca_cap_out, out, syndrome);
 	if (status) {
@@ -274,5 +327,50 @@ struct mlx5_devx_obj *
 	attr->flow_counters_dump = MLX5_GET(cmd_hca_cap, hcattr,
 					    flow_counters_dump);
 	attr->eswitch_manager = MLX5_GET(cmd_hca_cap, hcattr, eswitch_manager);
+	attr->eth_net_offloads = MLX5_GET(cmd_hca_cap, hcattr,
+					  eth_net_offloads);
+	attr->eth_virt = MLX5_GET(cmd_hca_cap, hcattr, eth_virt);
+	if (!attr->eth_net_offloads)
+		return 0;
+
+	/* Query HCA offloads for Ethernet protocol. */
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+	MLX5_SET(query_hca_cap_in, in, op_mod,
+		 MLX5_GET_HCA_CAP_OP_MOD_ETHERNET_OFFLOAD_CAPS |
+		 MLX5_HCA_CAP_OPMOD_GET_CUR);
+
+	rc = mlx5_glue->devx_general_cmd(ctx,
+					 in, sizeof(in),
+					 out, sizeof(out));
+	if (rc) {
+		attr->eth_net_offloads = 0;
+		goto error;
+	}
+	status = MLX5_GET(query_hca_cap_out, out, status);
+	syndrome = MLX5_GET(query_hca_cap_out, out, syndrome);
+	if (status) {
+		DRV_LOG(DEBUG, "Failed to query devx HCA capabilities, "
+			"status %x, syndrome = %x",
+			status, syndrome);
+		attr->eth_net_offloads = 0;
+		return -1;
+	}
+	hcattr = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
+	attr->wqe_inline_mode = MLX5_GET(per_protocol_networking_offload_caps,
+					 hcattr, wqe_inline_mode);
+	if (attr->wqe_inline_mode != MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
+		return 0;
+	if (attr->eth_virt) {
+		rc = mlx5_devx_cmd_query_nic_vport_context(ctx, 0, attr);
+		if (rc) {
+			attr->eth_virt = 0;
+			goto error;
+		}
+	}
 	return 0;
+error:
+	rc = (rc > 0) ? -rc : rc;
+	return rc;
 }
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 9a43dcc..dcb99cb 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -625,6 +625,7 @@ enum {
 enum {
 	MLX5_CMD_OP_QUERY_HCA_CAP = 0x100,
 	MLX5_CMD_OP_CREATE_MKEY = 0x200,
+	MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754,
 	MLX5_CMD_OP_ALLOC_FLOW_COUNTER = 0x939,
 	MLX5_CMD_OP_QUERY_FLOW_COUNTER = 0x93b,
 };
@@ -784,7 +785,8 @@ struct mlx5_ifc_create_mkey_in_bits {
 
 enum {
 	MLX5_GET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0 << 1,
-	MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP        = 0xc << 1,
+	MLX5_GET_HCA_CAP_OP_MOD_ETHERNET_OFFLOAD_CAPS = 0x1 << 1,
+	MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP = 0xc << 1,
 };
 
 enum {
@@ -792,6 +794,23 @@ enum {
 	MLX5_HCA_CAP_OPMOD_GET_CUR   = 1,
 };
 
+enum {
+	MLX5_CAP_INLINE_MODE_L2,
+	MLX5_CAP_INLINE_MODE_VPORT_CONTEXT,
+	MLX5_CAP_INLINE_MODE_NOT_REQUIRED,
+};
+
+enum {
+	MLX5_INLINE_MODE_NONE,
+	MLX5_INLINE_MODE_L2,
+	MLX5_INLINE_MODE_IP,
+	MLX5_INLINE_MODE_TCP_UDP,
+	MLX5_INLINE_MODE_RESERVED4,
+	MLX5_INLINE_MODE_INNER_L2,
+	MLX5_INLINE_MODE_INNER_IP,
+	MLX5_INLINE_MODE_INNER_TCP_UDP,
+};
+
 struct mlx5_ifc_cmd_hca_cap_bits {
 	u8 reserved_at_0[0x30];
 	u8 vhca_id[0x10];
@@ -1064,6 +1083,42 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8 reserved_at_61f[0x1e1];
 };
 
+struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
+	u8 csum_cap[0x1];
+	u8 vlan_cap[0x1];
+	u8 lro_cap[0x1];
+	u8 lro_psh_flag[0x1];
+	u8 lro_time_stamp[0x1];
+	u8 reserved_at_5[0x2];
+	u8 wqe_vlan_insert[0x1];
+	u8 self_lb_en_modifiable[0x1];
+	u8 reserved_at_9[0x2];
+	u8 max_lso_cap[0x5];
+	u8 multi_pkt_send_wqe[0x2];
+	u8 wqe_inline_mode[0x2];
+	u8 rss_ind_tbl_cap[0x4];
+	u8 reg_umr_sq[0x1];
+	u8 scatter_fcs[0x1];
+	u8 enhanced_multi_pkt_send_wqe[0x1];
+	u8 tunnel_lso_const_out_ip_id[0x1];
+	u8 reserved_at_1c[0x2];
+	u8 tunnel_stateless_gre[0x1];
+	u8 tunnel_stateless_vxlan[0x1];
+	u8 swp[0x1];
+	u8 swp_csum[0x1];
+	u8 swp_lso[0x1];
+	u8 reserved_at_23[0xd];
+	u8 max_vxlan_udp_ports[0x8];
+	u8 reserved_at_38[0x6];
+	u8 max_geneve_opt_len[0x1];
+	u8 tunnel_stateless_geneve_rx[0x1];
+	u8 reserved_at_40[0x10];
+	u8 lro_min_mss_size[0x10];
+	u8 reserved_at_60[0x120];
+	u8 lro_timer_supported_periods[4][0x20];
+	u8 reserved_at_200[0x600];
+};
+
 struct mlx5_ifc_qos_cap_bits {
 	u8 packet_pacing[0x1];
 	u8 esw_scheduling[0x1];
@@ -1091,6 +1146,8 @@ struct mlx5_ifc_qos_cap_bits {
 
 union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap;
+	struct mlx5_ifc_per_protocol_networking_offload_caps_bits
+	       per_protocol_networking_offload_caps;
 	struct mlx5_ifc_qos_cap_bits qos_cap;
 	u8 reserved_at_0[0x8000];
 };
@@ -1111,6 +1168,69 @@ struct mlx5_ifc_query_hca_cap_in_bits {
 	u8 reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_mac_address_layout_bits {
+	u8 reserved_at_0[0x10];
+	u8 mac_addr_47_32[0x10];
+	u8 mac_addr_31_0[0x20];
+};
+
+struct mlx5_ifc_nic_vport_context_bits {
+	u8 reserved_at_0[0x5];
+	u8 min_wqe_inline_mode[0x3];
+	u8 reserved_at_8[0x15];
+	u8 disable_mc_local_lb[0x1];
+	u8 disable_uc_local_lb[0x1];
+	u8 roce_en[0x1];
+	u8 arm_change_event[0x1];
+	u8 reserved_at_21[0x1a];
+	u8 event_on_mtu[0x1];
+	u8 event_on_promisc_change[0x1];
+	u8 event_on_vlan_change[0x1];
+	u8 event_on_mc_address_change[0x1];
+	u8 event_on_uc_address_change[0x1];
+	u8 reserved_at_40[0xc];
+	u8 affiliation_criteria[0x4];
+	u8 affiliated_vhca_id[0x10];
+	u8 reserved_at_60[0xd0];
+	u8 mtu[0x10];
+	u8 system_image_guid[0x40];
+	u8 port_guid[0x40];
+	u8 node_guid[0x40];
+	u8 reserved_at_200[0x140];
+	u8 qkey_violation_counter[0x10];
+	u8 reserved_at_350[0x430];
+	u8 promisc_uc[0x1];
+	u8 promisc_mc[0x1];
+	u8 promisc_all[0x1];
+	u8 reserved_at_783[0x2];
+	u8 allowed_list_type[0x3];
+	u8 reserved_at_788[0xc];
+	u8 allowed_list_size[0xc];
+	struct mlx5_ifc_mac_address_layout_bits permanent_address;
+	u8 reserved_at_7e0[0x20];
+};
+
+struct mlx5_ifc_query_nic_vport_context_out_bits {
+	u8 status[0x8];
+	u8 reserved_at_8[0x18];
+	u8 syndrome[0x20];
+	u8 reserved_at_40[0x40];
+	struct mlx5_ifc_nic_vport_context_bits nic_vport_context;
+};
+
+struct mlx5_ifc_query_nic_vport_context_in_bits {
+	u8 opcode[0x10];
+	u8 reserved_at_10[0x10];
+	u8 reserved_at_20[0x10];
+	u8 op_mod[0x10];
+	u8 other_vport[0x1];
+	u8 reserved_at_41[0xf];
+	u8 vport_number[0x10];
+	u8 reserved_at_60[0x5];
+	u8 allowed_list_type[0x3];
+	u8 reserved_at_68[0x18];
+};
+
 /* CQE format mask. */
 #define MLX5E_CQE_FORMAT_MASK 0xc
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v3 8/8] net/mlx5: report supported max number of mbuf segments
  2019-07-17  6:53     ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                         ` (6 preceding siblings ...)
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 7/8] net/mlx5: add minimal required Tx data inline Viacheslav Ovsiienko
@ 2019-07-17  6:53       ` Viacheslav Ovsiienko
  2019-07-21 14:24       ` [dpdk-dev] [PATCH v4 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  8 siblings, 0 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-17  6:53 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch fills the tx_desc_lim.nb_seg_max and
tx_desc_lim.nb_mtu_seg_max fields of the rte_eth_dev_info
structure to report the maximal number of packet
segments. The requested inline data configuration is
taken into account in a conservative way.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_ethdev.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 2902138..fdd6e03 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -592,6 +592,42 @@ struct ethtool_link_settings {
 }
 
 /**
+ * Sets tx mbuf limiting parameters.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[out] info
+ *   Info structure output buffer.
+ */
+static void
+mlx5_set_txlimit_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_config *config = &priv->config;
+	unsigned int inlen;
+	uint16_t nb_max;
+
+	inlen = (config->txq_inline_max == MLX5_ARG_UNSET) ?
+		MLX5_SEND_DEF_INLINE_LEN :
+		(unsigned int)config->txq_inline_max;
+	assert(config->txq_inline_min >= 0);
+	inlen = RTE_MAX(inlen, (unsigned int)config->txq_inline_min);
+	inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX +
+			       MLX5_ESEG_MIN_INLINE_SIZE -
+			       MLX5_WQE_CSEG_SIZE -
+			       MLX5_WQE_ESEG_SIZE -
+			       MLX5_WQE_DSEG_SIZE * 2);
+	nb_max = (MLX5_WQE_SIZE_MAX +
+		  MLX5_ESEG_MIN_INLINE_SIZE -
+		  MLX5_WQE_CSEG_SIZE -
+		  MLX5_WQE_ESEG_SIZE -
+		  MLX5_WQE_DSEG_SIZE -
+		  inlen) / MLX5_WSEG_SIZE;
+	info->tx_desc_lim.nb_seg_max = nb_max;
+	info->tx_desc_lim.nb_mtu_seg_max = nb_max;
+}
+
+/**
  * DPDK callback to get information about the device.
  *
  * @param dev
@@ -634,6 +670,7 @@ struct ethtool_link_settings {
 	info->speed_capa = priv->link_speed_capa;
 	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
 	mlx5_set_default_params(dev, info);
+	mlx5_set_txlimit_params(dev, info);
 	info->switch_info.name = dev->data->name;
 	info->switch_info.domain_id = priv->domain_id;
 	info->switch_info.port_id = priv->representor_id;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v4 0/8] net/mlx5: consolidate Tx datapath
  2019-07-17  6:53     ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                         ` (7 preceding siblings ...)
  2019-07-17  6:53       ` [dpdk-dev] [PATCH v3 8/8] net/mlx5: report supported max number of mbuf segments Viacheslav Ovsiienko
@ 2019-07-21 14:24       ` Viacheslav Ovsiienko
  2019-07-21 14:24         ` [dpdk-dev] [PATCH v4 1/8] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
                           ` (8 more replies)
  8 siblings, 9 replies; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-21 14:24 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patchset introduces a new implementation of the tx_burst
routine of the mlx5 PMD. The existing implementation is based on
several branches of tx_burst routines optimized for the most
common sets of Tx hardware offloads supported by Mellanox NICs.

It was not easy to update, support and develop this kind
of code - multiple branches impose multiple points to maintain.
Also, many frequently requested offload combinations are not
yet supported in the most efficient way. That leads to selecting
a tx_burst routine that does not match completely and harms performance.

The new Tx datapath:
  - introduces a unified template for the tx_burst routine to generate
    at compile time the most efficient instances, tuned for the
    specified sets of hardware offloads
  - chooses the best matching instance at run time, during send queue
    configuration
  - supports almost all possible sets of Tx offloads
  - chooses the best possible method to send each packet at run time
  - introduces new devargs for more accurate tuning of data inlining

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

---
v4:
  - inline first mbufs for multi-segment packets
  - revert deprecated devargs support to not break device probing
  - HW VLAN insertion from WQE bug
  - rebase

v3:
  - http://patches.dpdk.org/cover/56568/
  - add reporting max number of segments in packet
  - minor fixes with inline data length setup
  - rebase on the current repo top

v2: 
  - http://patches.dpdk.org/cover/56451/
  - minor performance issues resolved
  - typos in comments and docs

v1: 
  - http://patches.dpdk.org/patch/56091/

Viacheslav Ovsiienko (8):
  net/mlx5: remove Tx datapath implementation
  net/mlx5: add Tx datapath related devargs
  net/mlx5: update Tx datapath definitions
  net/mlx5: extend NIC attributes query via DevX
  net/mlx5: add Tx datapath configuration and setup
  net/mlx5: introduce Tx burst routine template
  net/mlx5: implement Tx burst template
  net/mlx5: report supported max number of mbuf segments

 doc/guides/nics/mlx5.rst               |  173 +-
 doc/guides/rel_notes/release_19_08.rst |    2 +
 drivers/net/mlx5/mlx5.c                |  187 +-
 drivers/net/mlx5/mlx5.h                |   14 +-
 drivers/net/mlx5/mlx5_defs.h           |   36 +-
 drivers/net/mlx5/mlx5_devx_cmds.c      |  102 +-
 drivers/net/mlx5/mlx5_ethdev.c         |   95 +-
 drivers/net/mlx5/mlx5_prm.h            |  357 ++-
 drivers/net/mlx5/mlx5_rxtx.c           | 5338 ++++++++++++++++++++++----------
 drivers/net/mlx5/mlx5_rxtx.h           |  334 +-
 drivers/net/mlx5/mlx5_rxtx_vec.c       |  175 --
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h  |  289 --
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h   |  284 --
 drivers/net/mlx5/mlx5_txq.c            |  289 +-
 14 files changed, 4638 insertions(+), 3037 deletions(-)

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v4 1/8] net/mlx5: remove Tx datapath implementation
  2019-07-21 14:24       ` [dpdk-dev] [PATCH v4 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
@ 2019-07-21 14:24         ` Viacheslav Ovsiienko
  2019-07-22  5:32           ` Yongseok Koh
  2019-07-21 14:24         ` [dpdk-dev] [PATCH v4 2/8] net/mlx5: add Tx datapath related devargs Viacheslav Ovsiienko
                           ` (7 subsequent siblings)
  8 siblings, 1 reply; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-21 14:24 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch removes the existing Tx datapath code
as a preparation step before introducing the new
implementation. The following entities are being
removed:

- deprecated devargs support
- tx_burst() routines
- related PRM definitions
- SQ configuration code
- Tx routine selection code
- incompatible Tx completion code

The following devargs are deprecated and ignored:
- "txq_inline" is going to be converted to "txq_inline_max"
  for compatibility reasons
- "tx_vec_en"
- "txqs_max_vec"
- "txq_mpw_hdr_dseg_en"
- "txq_max_inline_len" is going to be converted
  to "txq_inline_mpw" for compatibility reasons

The deprecated devarg keys are recognized by the PMD
and ignored or converted to the new ones in order not
to block device probing.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 doc/guides/nics/mlx5.rst              |   34 +-
 drivers/net/mlx5/mlx5.c               |   39 +-
 drivers/net/mlx5/mlx5.h               |    5 -
 drivers/net/mlx5/mlx5_defs.h          |   16 -
 drivers/net/mlx5/mlx5_ethdev.c        |   58 --
 drivers/net/mlx5/mlx5_prm.h           |   77 --
 drivers/net/mlx5/mlx5_rxtx.c          | 1434 +--------------------------------
 drivers/net/mlx5/mlx5_rxtx.h          |  273 -------
 drivers/net/mlx5/mlx5_rxtx_vec.c      |  175 ----
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |  289 -------
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  284 -------
 drivers/net/mlx5/mlx5_txq.c           |  110 +--
 12 files changed, 65 insertions(+), 2729 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 16aa390..5cf1e76 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -350,13 +350,8 @@ Run-time configuration
 
 - ``txq_inline`` parameter [int]
 
-  Amount of data to be inlined during TX operations. Improves latency.
-  Can improve PPS performance when PCI back pressure is detected and may be
-  useful for scenarios involving heavy traffic on many queues.
-
-  Because additional software logic is necessary to handle this mode, this
-  option should be used with care, as it can lower performance when back
-  pressure is not expected.
+  Amount of data to be inlined during TX operations. This parameter is
+  deprecated and ignored, kept for compatibility issue.
 
 - ``txqs_min_inline`` parameter [int]
 
@@ -378,16 +373,8 @@ Run-time configuration
 - ``txqs_max_vec`` parameter [int]
 
   Enable vectorized Tx only when the number of TX queues is less than or
-  equal to this value. Effective only when ``tx_vec_en`` is enabled.
-
-  On ConnectX-5:
-
-        - Set to 8 by default on ARMv8.
-        - Set to 4 by default otherwise.
-
-  On BlueField
-
-        - Set to 16 by default.
+  equal to this value. This parameter is deprecated and ignored, kept
+  for compatibility issue to not prevent driver from probing.
 
 - ``txq_mpw_en`` parameter [int]
 
@@ -418,7 +405,8 @@ Run-time configuration
 - ``txq_mpw_hdr_dseg_en`` parameter [int]
 
   A nonzero value enables including two pointers in the first block of TX
-  descriptor. This can be used to lessen CPU load for memory copy.
+  descriptor. The parameter is deprecated and ignored, kept for compatibility
+  issue.
 
   Effective only when Enhanced MPS is supported. Disabled by default.
 
@@ -427,14 +415,14 @@ Run-time configuration
   Maximum size of packet to be inlined. This limits the size of packet to
   be inlined. If the size of a packet is larger than configured value, the
   packet isn't inlined even though there's enough space remained in the
-  descriptor. Instead, the packet is included with pointer.
-
-  Effective only when Enhanced MPS is supported. The default value is 256.
+  descriptor. Instead, the packet is included with pointer. This parameter
+  is deprecated.
 
 - ``tx_vec_en`` parameter [int]
 
-  A nonzero value enables Tx vector on ConnectX-5, ConnectX-6 and BlueField NICs if the number of
-  global Tx queues on the port is less than ``txqs_max_vec``.
+  A nonzero value enables Tx vector on ConnectX-5, ConnectX-6 and BlueField
+  NICs if the number of global Tx queues on the port is less than
+  ``txqs_max_vec``. The parameter is deprecated and ignored.
 
   This option cannot be used with certain offloads such as ``DEV_TX_OFFLOAD_TCP_TSO,
   DEV_TX_OFFLOAD_VXLAN_TNL_TSO, DEV_TX_OFFLOAD_GRE_TNL_TSO, DEV_TX_OFFLOAD_VLAN_INSERT``.
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index f4ad5d2..d4f0eb2 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -69,7 +69,7 @@
 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
 
-/* Device parameter to configure inline send. */
+/* Device parameter to configure inline send. Deprecated, ignored.*/
 #define MLX5_TXQ_INLINE "txq_inline"
 
 /*
@@ -80,20 +80,29 @@
 
 /*
  * Device parameter to configure the number of TX queues threshold for
- * enabling vectorized Tx.
+ * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
  */
 #define MLX5_TXQS_MAX_VEC "txqs_max_vec"
 
 /* Device parameter to enable multi-packet send WQEs. */
 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
 
-/* Device parameter to include 2 dsegs in the title WQEBB. */
+/*
+ * Device parameter to include 2 dsegs in the title WQEBB.
+ * Deprecated, ignored.
+ */
 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
 
-/* Device parameter to limit the size of inlining packet. */
+/*
+ * Device parameter to limit the size of inlining packet.
+ * Deprecated, ignored.
+ */
 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
 
-/* Device parameter to enable hardware Tx vector. */
+/*
+ * Device parameter to enable hardware Tx vector.
+ * Deprecated, ignored (no vectorized Tx routines anymore).
+ */
 #define MLX5_TX_VEC_EN "tx_vec_en"
 
 /* Device parameter to enable hardware Rx vector. */
@@ -997,19 +1006,19 @@ struct mlx5_dev_spawn_data {
 	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
 		config->mprq.min_rxqs_num = tmp;
 	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
-		config->txq_inline = tmp;
+		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
 		config->txqs_inline = tmp;
 	} else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
-		config->txqs_vec = tmp;
+		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
 	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
 		config->mps = !!tmp;
 	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
-		config->mpw_hdr_dseg = !!tmp;
+		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
 	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
-		config->inline_max_packet_sz = tmp;
+		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
 	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
-		config->tx_vec_en = !!tmp;
+		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
 	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
 		config->rx_vec_en = !!tmp;
 	} else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
@@ -2016,12 +2025,8 @@ struct mlx5_dev_spawn_data {
 	dev_config = (struct mlx5_dev_config){
 		.hw_padding = 0,
 		.mps = MLX5_ARG_UNSET,
-		.tx_vec_en = 1,
 		.rx_vec_en = 1,
-		.txq_inline = MLX5_ARG_UNSET,
 		.txqs_inline = MLX5_ARG_UNSET,
-		.txqs_vec = MLX5_ARG_UNSET,
-		.inline_max_packet_sz = MLX5_ARG_UNSET,
 		.vf_nl_en = 1,
 		.mr_ext_memseg_en = 1,
 		.mprq = {
@@ -2034,9 +2039,6 @@ struct mlx5_dev_spawn_data {
 	};
 	/* Device specific configuration. */
 	switch (pci_dev->id.device_id) {
-	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF:
-		dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS_BLUEFIELD;
-		break;
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
@@ -2046,9 +2048,6 @@ struct mlx5_dev_spawn_data {
 	default:
 		break;
 	}
-	/* Set architecture-dependent default value if unset. */
-	if (dev_config.txqs_vec == MLX5_ARG_UNSET)
-		dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS;
 	for (i = 0; i != ns; ++i) {
 		uint32_t restore;
 
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 6230371..354f6bc 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -198,9 +198,7 @@ struct mlx5_dev_config {
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
-	unsigned int tx_vec_en:1; /* Tx vector is enabled. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
-	unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
 	unsigned int mr_ext_memseg_en:1;
 	/* Whether memseg should be extended for MR creation. */
 	unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */
@@ -224,10 +222,7 @@ struct mlx5_dev_config {
 	unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. */
 	unsigned int ind_table_max_size; /* Maximum indirection table size. */
 	unsigned int max_dump_files_num; /* Maximum dump files per queue. */
-	int txq_inline; /* Maximum packet size for inlining. */
 	int txqs_inline; /* Queue number threshold for inlining. */
-	int txqs_vec; /* Queue number threshold for vectorized Tx. */
-	int inline_max_packet_sz; /* Max packet size for inlining. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 };
 
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 13801a5..6861304 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -60,15 +60,6 @@
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
 #define MLX5_MAX_TSO_HEADER 192
 
-/* Default maximum number of Tx queues for vectorized Tx. */
-#if defined(RTE_ARCH_ARM64)
-#define MLX5_VPMD_MAX_TXQS 8
-#define MLX5_VPMD_MAX_TXQS_BLUEFIELD 16
-#else
-#define MLX5_VPMD_MAX_TXQS 4
-#define MLX5_VPMD_MAX_TXQS_BLUEFIELD MLX5_VPMD_MAX_TXQS
-#endif
-
 /* Threshold of buffer replenishment for vectorized Rx. */
 #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
 	(RTE_MIN(MLX5_VPMD_RX_MAX_BURST, (unsigned int)(n) >> 2))
@@ -76,13 +67,6 @@
 /* Maximum size of burst for vectorized Rx. */
 #define MLX5_VPMD_RX_MAX_BURST 64U
 
-/*
- * Maximum size of burst for vectorized Tx. This is related to the maximum size
- * of Enhanced MPW (eMPW) WQE as vectorized Tx is supported with eMPW.
- * Careful when changing, large value can cause WQE DS to overlap.
- */
-#define MLX5_VPMD_TX_MAX_BURST        32U
-
 /* Number of packets vectorized Rx can simultaneously process in a loop. */
 #define MLX5_VPMD_DESCS_PER_LOOP      4
 
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index f9826c9..738d540 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1653,64 +1653,6 @@ int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
 }
 
 /**
- * Configure the TX function to use.
- *
- * @param dev
- *   Pointer to private data structure.
- *
- * @return
- *   Pointer to selected Tx burst function.
- */
-eth_tx_burst_t
-mlx5_select_tx_function(struct rte_eth_dev *dev)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
-	struct mlx5_dev_config *config = &priv->config;
-	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
-	int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
-				    DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
-				    DEV_TX_OFFLOAD_GRE_TNL_TSO |
-				    DEV_TX_OFFLOAD_IP_TNL_TSO |
-				    DEV_TX_OFFLOAD_UDP_TNL_TSO));
-	int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
-				    DEV_TX_OFFLOAD_UDP_TNL_TSO |
-				    DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM));
-	int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);
-
-	assert(priv != NULL);
-	/* Select appropriate TX function. */
-	if (vlan_insert || tso || swp)
-		return tx_pkt_burst;
-	if (config->mps == MLX5_MPW_ENHANCED) {
-		if (mlx5_check_vec_tx_support(dev) > 0) {
-			if (mlx5_check_raw_vec_tx_support(dev) > 0)
-				tx_pkt_burst = mlx5_tx_burst_raw_vec;
-			else
-				tx_pkt_burst = mlx5_tx_burst_vec;
-			DRV_LOG(DEBUG,
-				"port %u selected enhanced MPW Tx vectorized"
-				" function",
-				dev->data->port_id);
-		} else {
-			tx_pkt_burst = mlx5_tx_burst_empw;
-			DRV_LOG(DEBUG,
-				"port %u selected enhanced MPW Tx function",
-				dev->data->port_id);
-		}
-	} else if (config->mps && (config->txq_inline > 0)) {
-		tx_pkt_burst = mlx5_tx_burst_mpw_inline;
-		DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
-			dev->data->port_id);
-	} else if (config->mps) {
-		tx_pkt_burst = mlx5_tx_burst_mpw;
-		DRV_LOG(DEBUG, "port %u selected MPW Tx function",
-			dev->data->port_id);
-	}
-	return tx_pkt_burst;
-}
-
-/**
  * Configure the RX function to use.
  *
  * @param dev
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 95ff29a..dfd9317 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -39,32 +39,12 @@
 /* Invalidate a CQE. */
 #define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
 
-/* Maximum number of packets a multi-packet WQE can handle. */
-#define MLX5_MPW_DSEG_MAX 5
-
 /* WQE DWORD size */
 #define MLX5_WQE_DWORD_SIZE 16
 
 /* WQE size */
 #define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
 
-/* Max size of a WQE session. */
-#define MLX5_WQE_SIZE_MAX 960U
-
-/* Compute the number of DS. */
-#define MLX5_WQE_DS(n) \
-	(((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)
-
-/* Room for inline data in multi-packet WQE. */
-#define MLX5_MWQE64_INL_DATA 28
-
-/* Default minimum number of Tx queues for inlining packets. */
-#define MLX5_EMPW_MIN_TXQS 8
-
-/* Default max packet length to be inlined. */
-#define MLX5_EMPW_MAX_INLINE_LEN (4U * MLX5_WQE_SIZE)
-
-
 #define MLX5_OPC_MOD_ENHANCED_MPSW 0
 #define MLX5_OPCODE_ENHANCED_MPSW 0x29
 
@@ -164,47 +144,11 @@ enum mlx5_completion_mode {
 	MLX5_COMP_CQE_AND_EQE = 0x3,
 };
 
-/* Subset of struct mlx5_wqe_eth_seg. */
-struct mlx5_wqe_eth_seg_small {
-	uint32_t rsvd0;
-	uint8_t	cs_flags;
-	uint8_t	rsvd1;
-	uint16_t mss;
-	uint32_t flow_table_metadata;
-	uint16_t inline_hdr_sz;
-	uint8_t inline_hdr[2];
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
-struct mlx5_wqe_inl_small {
-	uint32_t byte_cnt;
-	uint8_t raw;
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
-struct mlx5_wqe_ctrl {
-	uint32_t ctrl0;
-	uint32_t ctrl1;
-	uint32_t ctrl2;
-	uint32_t ctrl3;
-} __rte_aligned(MLX5_WQE_DWORD_SIZE);
-
 /* Small common part of the WQE. */
 struct mlx5_wqe {
 	uint32_t ctrl[4];
-	struct mlx5_wqe_eth_seg_small eseg;
-};
-
-/* Vectorize WQE header. */
-struct mlx5_wqe_v {
-	rte_v128u32_t ctrl;
-	rte_v128u32_t eseg;
 };
 
-/* WQE. */
-struct mlx5_wqe64 {
-	struct mlx5_wqe hdr;
-	uint8_t raw[32];
-} __rte_aligned(MLX5_WQE_SIZE);
-
 /* MPW mode. */
 enum mlx5_mpw_mode {
 	MLX5_MPW_DISABLED,
@@ -212,27 +156,6 @@ enum mlx5_mpw_mode {
 	MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2. */
 };
 
-/* MPW session status. */
-enum mlx5_mpw_state {
-	MLX5_MPW_STATE_OPENED,
-	MLX5_MPW_INL_STATE_OPENED,
-	MLX5_MPW_ENHANCED_STATE_OPENED,
-	MLX5_MPW_STATE_CLOSED,
-};
-
-/* MPW session descriptor. */
-struct mlx5_mpw {
-	enum mlx5_mpw_state state;
-	unsigned int pkts_n;
-	unsigned int len;
-	unsigned int total_len;
-	volatile struct mlx5_wqe *wqe;
-	union {
-		volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX];
-		volatile uint8_t *raw;
-	} data;
-};
-
 /* WQE for Multi-Packet RQ. */
 struct mlx5_wqe_mprq {
 	struct mlx5_wqe_srq_next_seg next_seg;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index c1dc8c4..f2d6918 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -288,140 +288,6 @@
 }
 
 /**
- * Return the size of tailroom of WQ.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param addr
- *   Pointer to tail of WQ.
- *
- * @return
- *   Size of tailroom.
- */
-static inline size_t
-tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
-{
-	size_t tailroom;
-	tailroom = (uintptr_t)(txq->wqes) +
-		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
-		   (uintptr_t)addr;
-	return tailroom;
-}
-
-/**
- * Copy data to tailroom of circular queue.
- *
- * @param dst
- *   Pointer to destination.
- * @param src
- *   Pointer to source.
- * @param n
- *   Number of bytes to copy.
- * @param base
- *   Pointer to head of queue.
- * @param tailroom
- *   Size of tailroom from dst.
- *
- * @return
- *   Pointer after copied data.
- */
-static inline void *
-mlx5_copy_to_wq(void *dst, const void *src, size_t n,
-		void *base, size_t tailroom)
-{
-	void *ret;
-
-	if (n > tailroom) {
-		rte_memcpy(dst, src, tailroom);
-		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
-			   n - tailroom);
-		ret = (uint8_t *)base + n - tailroom;
-	} else {
-		rte_memcpy(dst, src, n);
-		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
-	}
-	return ret;
-}
-
-/**
- * Inline TSO headers into WQE.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-inline_tso(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
-	   uint32_t *length,
-	   uintptr_t *addr,
-	   uint16_t *pkt_inline_sz,
-	   uint8_t **raw,
-	   uint16_t *max_wqe,
-	   uint16_t *tso_segsz,
-	   uint16_t *tso_header_sz)
-{
-	uintptr_t end = (uintptr_t)(((uintptr_t)txq->wqes) +
-				    (1 << txq->wqe_n) * MLX5_WQE_SIZE);
-	unsigned int copy_b;
-	uint8_t vlan_sz = (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
-	const uint8_t tunneled = txq->tunnel_en && (buf->ol_flags &
-				 PKT_TX_TUNNEL_MASK);
-	uint16_t n_wqe;
-
-	*tso_segsz = buf->tso_segsz;
-	*tso_header_sz = buf->l2_len + vlan_sz + buf->l3_len + buf->l4_len;
-	if (unlikely(*tso_segsz == 0 || *tso_header_sz == 0)) {
-		txq->stats.oerrors++;
-		return -EINVAL;
-	}
-	if (tunneled)
-		*tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
-	/* First seg must contain all TSO headers. */
-	if (unlikely(*tso_header_sz > MLX5_MAX_TSO_HEADER) ||
-		     *tso_header_sz > DATA_LEN(buf)) {
-		txq->stats.oerrors++;
-		return -EINVAL;
-	}
-	copy_b = *tso_header_sz - *pkt_inline_sz;
-	if (!copy_b || ((end - (uintptr_t)*raw) < copy_b))
-		return -EAGAIN;
-	n_wqe = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
-	if (unlikely(*max_wqe < n_wqe))
-		return -EINVAL;
-	*max_wqe -= n_wqe;
-	rte_memcpy((void *)*raw, (void *)*addr, copy_b);
-	*length -= copy_b;
-	*addr += copy_b;
-	copy_b = MLX5_WQE_DS(copy_b) * MLX5_WQE_DWORD_SIZE;
-	*pkt_inline_sz += copy_b;
-	*raw += copy_b;
-	return 0;
-}
-
-/**
- * DPDK callback to check the status of a tx descriptor.
- *
- * @param tx_queue
- *   The tx queue.
- * @param[in] offset
- *   The index of the descriptor in the ring.
- *
- * @return
- *   The status of the tx descriptor.
- */
-int
-mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
-{
-	struct mlx5_txq_data *txq = tx_queue;
-	uint16_t used;
-
-	mlx5_tx_complete(txq);
-	used = txq->elts_head - txq->elts_tail;
-	if (offset < used)
-		return RTE_ETH_TX_DESC_FULL;
-	return RTE_ETH_TX_DESC_DONE;
-}
-
-/**
  * Internal function to compute the number of used descriptors in an RX queue
  *
  * @param rxq
@@ -655,7 +521,7 @@
 						    (1 << txq->cqe_n));
 			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
 						    (const void *)((uintptr_t)
-						    tx_mlx5_wqe(txq, 0)),
+						    txq->wqes),
 						    MLX5_WQE_SIZE *
 						    (1 << txq->wqe_n));
 			txq_ctrl->dump_file_n++;
@@ -683,1247 +549,6 @@
 }
 
 /**
- * DPDK callback for TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	unsigned int k = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
-	unsigned int segs_n = 0;
-	const unsigned int max_inline = txq->max_inline;
-	uint64_t addr_64;
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch first packet cacheline. */
-	rte_prefetch0(*pkts);
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *pkts; /* First_seg. */
-		uint8_t *raw;
-		volatile struct mlx5_wqe_v *wqe = NULL;
-		volatile rte_v128u32_t *dseg = NULL;
-		uint32_t length;
-		unsigned int ds = 0;
-		unsigned int sg = 0; /* counter of additional segs attached. */
-		uintptr_t addr;
-		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
-		uint16_t tso_header_sz = 0;
-		uint16_t ehdr;
-		uint8_t cs_flags;
-		uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
-		uint32_t swp_offsets = 0;
-		uint8_t swp_types = 0;
-		rte_be32_t metadata;
-		uint16_t tso_segsz = 0;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		uint32_t total_length = 0;
-#endif
-		int ret;
-
-		segs_n = buf->nb_segs;
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		max_elts -= segs_n;
-		sg = --segs_n;
-		if (unlikely(--max_wqe == 0))
-			break;
-		wqe = (volatile struct mlx5_wqe_v *)
-			tx_mlx5_wqe(txq, txq->wqe_ci);
-		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-		if (pkts_n - i > 1)
-			rte_prefetch0(*(pkts + 1));
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		length = DATA_LEN(buf);
-		ehdr = (((uint8_t *)addr)[1] << 8) |
-		       ((uint8_t *)addr)[0];
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		total_length = length;
-#endif
-		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
-			txq->stats.oerrors++;
-			break;
-		}
-		/* Update element. */
-		(*txq->elts)[elts_head & elts_m] = buf;
-		/* Prefetch next buffer data. */
-		if (pkts_n - i > 1)
-			rte_prefetch0(
-			    rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types);
-		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Replace the Ethernet type by the VLAN if necessary. */
-		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
-							 buf->vlan_tci);
-			unsigned int len = 2 * RTE_ETHER_ADDR_LEN - 2;
-
-			addr += 2;
-			length -= 2;
-			/* Copy Destination and source mac address. */
-			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
-			/* Copy VLAN. */
-			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
-			/* Copy missing two bytes to end the DSeg. */
-			memcpy((uint8_t *)raw + len + sizeof(vlan),
-			       ((uint8_t *)addr) + len, 2);
-			addr += len + 2;
-			length -= (len + 2);
-		} else {
-			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
-			       MLX5_WQE_DWORD_SIZE);
-			length -= pkt_inline_sz;
-			addr += pkt_inline_sz;
-		}
-		raw += MLX5_WQE_DWORD_SIZE;
-		if (tso) {
-			ret = inline_tso(txq, buf, &length,
-					 &addr, &pkt_inline_sz,
-					 &raw, &max_wqe,
-					 &tso_segsz, &tso_header_sz);
-			if (ret == -EINVAL) {
-				break;
-			} else if (ret == -EAGAIN) {
-				/* NOP WQE. */
-				wqe->ctrl = (rte_v128u32_t){
-					rte_cpu_to_be_32(txq->wqe_ci << 8),
-					rte_cpu_to_be_32(txq->qp_num_8s | 1),
-					rte_cpu_to_be_32
-						(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-					0,
-				};
-				ds = 1;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-				total_length = 0;
-#endif
-				k++;
-				goto next_wqe;
-			}
-		}
-		/* Inline if enough room. */
-		if (max_inline || tso) {
-			uint32_t inl = 0;
-			uintptr_t end = (uintptr_t)
-				(((uintptr_t)txq->wqes) +
-				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
-			unsigned int inline_room = max_inline *
-						   RTE_CACHE_LINE_SIZE -
-						   (pkt_inline_sz - 2) -
-						   !!tso * sizeof(inl);
-			uintptr_t addr_end;
-			unsigned int copy_b;
-
-pkt_inline:
-			addr_end = RTE_ALIGN_FLOOR(addr + inline_room,
-						   RTE_CACHE_LINE_SIZE);
-			copy_b = (addr_end > addr) ?
-				 RTE_MIN((addr_end - addr), length) : 0;
-			if (copy_b && ((end - (uintptr_t)raw) >
-				       (copy_b + sizeof(inl)))) {
-				/*
-				 * One Dseg remains in the current WQE.  To
-				 * keep the computation positive, it is
-				 * removed after the bytes to Dseg conversion.
-				 */
-				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
-
-				if (unlikely(max_wqe < n))
-					break;
-				max_wqe -= n;
-				if (tso) {
-					assert(inl == 0);
-					inl = rte_cpu_to_be_32(copy_b |
-							       MLX5_INLINE_SEG);
-					rte_memcpy((void *)raw,
-						   (void *)&inl, sizeof(inl));
-					raw += sizeof(inl);
-					pkt_inline_sz += sizeof(inl);
-				}
-				rte_memcpy((void *)raw, (void *)addr, copy_b);
-				addr += copy_b;
-				length -= copy_b;
-				pkt_inline_sz += copy_b;
-			}
-			/*
-			 * 2 DWORDs consumed by the WQE header + ETH segment +
-			 * the size of the inline part of the packet.
-			 */
-			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
-			if (length > 0) {
-				if (ds % (MLX5_WQE_SIZE /
-					  MLX5_WQE_DWORD_SIZE) == 0) {
-					if (unlikely(--max_wqe == 0))
-						break;
-					dseg = (volatile rte_v128u32_t *)
-					       tx_mlx5_wqe(txq, txq->wqe_ci +
-							   ds / 4);
-				} else {
-					dseg = (volatile rte_v128u32_t *)
-						((uintptr_t)wqe +
-						 (ds * MLX5_WQE_DWORD_SIZE));
-				}
-				goto use_dseg;
-			} else if (!segs_n) {
-				goto next_pkt;
-			} else {
-				/*
-				 * Further inline the next segment only for
-				 * non-TSO packets.
-				 */
-				if (!tso) {
-					raw += copy_b;
-					inline_room -= copy_b;
-				} else {
-					inline_room = 0;
-				}
-				/* Move to the next segment. */
-				--segs_n;
-				buf = buf->next;
-				assert(buf);
-				addr = rte_pktmbuf_mtod(buf, uintptr_t);
-				length = DATA_LEN(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-				total_length += length;
-#endif
-				(*txq->elts)[++elts_head & elts_m] = buf;
-				goto pkt_inline;
-			}
-		} else {
-			/*
-			 * No inline has been done in the packet, only the
-			 * Ethernet Header as been stored.
-			 */
-			dseg = (volatile rte_v128u32_t *)
-				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
-			ds = 3;
-use_dseg:
-			/* Add the remaining packet as a simple ds. */
-			addr_64 = rte_cpu_to_be_64(addr);
-			*dseg = (rte_v128u32_t){
-				rte_cpu_to_be_32(length),
-				mlx5_tx_mb2mr(txq, buf),
-				addr_64,
-				addr_64 >> 32,
-			};
-			++ds;
-			if (!segs_n)
-				goto next_pkt;
-		}
-next_seg:
-		assert(buf);
-		assert(ds);
-		assert(wqe);
-		/*
-		 * Spill on next WQE when the current one does not have
-		 * enough room left. Size of WQE must a be a multiple
-		 * of data segment size.
-		 */
-		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
-		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
-			if (unlikely(--max_wqe == 0))
-				break;
-			dseg = (volatile rte_v128u32_t *)
-			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
-			rte_prefetch0(tx_mlx5_wqe(txq,
-						  txq->wqe_ci + ds / 4 + 1));
-		} else {
-			++dseg;
-		}
-		++ds;
-		buf = buf->next;
-		assert(buf);
-		length = DATA_LEN(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		total_length += length;
-#endif
-		/* Store segment information. */
-		addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
-		*dseg = (rte_v128u32_t){
-			rte_cpu_to_be_32(length),
-			mlx5_tx_mb2mr(txq, buf),
-			addr_64,
-			addr_64 >> 32,
-		};
-		(*txq->elts)[++elts_head & elts_m] = buf;
-		if (--segs_n)
-			goto next_seg;
-next_pkt:
-		if (ds > MLX5_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		++elts_head;
-		++pkts;
-		++i;
-		j += sg;
-		/* Initialize known and common part of the WQE structure. */
-		if (tso) {
-			wqe->ctrl = (rte_v128u32_t){
-				rte_cpu_to_be_32((txq->wqe_ci << 8) |
-						 MLX5_OPCODE_TSO),
-				rte_cpu_to_be_32(txq->qp_num_8s | ds),
-				rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-				0,
-			};
-			wqe->eseg = (rte_v128u32_t){
-				swp_offsets,
-				cs_flags | (swp_types << 8) |
-				(rte_cpu_to_be_16(tso_segsz) << 16),
-				metadata,
-				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
-			};
-		} else {
-			wqe->ctrl = (rte_v128u32_t){
-				rte_cpu_to_be_32((txq->wqe_ci << 8) |
-						 MLX5_OPCODE_SEND),
-				rte_cpu_to_be_32(txq->qp_num_8s | ds),
-				rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-						 MLX5_COMP_MODE_OFFSET),
-				0,
-			};
-			wqe->eseg = (rte_v128u32_t){
-				swp_offsets,
-				cs_flags | (swp_types << 8),
-				metadata,
-				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
-			};
-		}
-next_wqe:
-		txq->wqe_ci += (ds + 3) / 4;
-		/* Save the last successful WQE for completion request */
-		last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += total_length;
-#endif
-	} while (i < pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely((i + k) == 0))
-		return 0;
-	txq->elts_head += (i + j);
-	/* Check whether completion threshold has been reached. */
-	comp = txq->elts_comp + i + j + k;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		last_wqe->ctrl2 = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						   MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		last_wqe->ctrl3 = txq->elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
-	return i;
-}
-
-/**
- * Open a MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param length
- *   Packet length.
- */
-static inline void
-mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
-		(volatile struct mlx5_wqe_data_seg (*)[])
-		tx_mlx5_wqe(txq, idx + 1);
-
-	mpw->state = MLX5_MPW_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->len = length;
-	mpw->total_len = 0;
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
-	mpw->wqe->eseg.inline_hdr_sz = 0;
-	mpw->wqe->eseg.rsvd0 = 0;
-	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.flow_table_metadata = 0;
-	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
-					     (txq->wqe_ci << 8) |
-					     MLX5_OPCODE_TSO);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
-		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
-	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
-		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
-	mpw->data.dseg[2] = &(*dseg)[0];
-	mpw->data.dseg[3] = &(*dseg)[1];
-	mpw->data.dseg[4] = &(*dseg)[2];
-}
-
-/**
- * Close a MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- */
-static inline void
-mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	unsigned int num = mpw->pkts_n;
-
-	/*
-	 * Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	if (num < 3)
-		++txq->wqe_ci;
-	else
-		txq->wqe_ci += 2;
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-}
-
-/**
- * DPDK callback for TX with MPW support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch first packet cacheline. */
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uint32_t length;
-		unsigned int segs_n = buf->nb_segs;
-		uint32_t cs_flags;
-		rte_be32_t metadata;
-
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		/* Do not bother with large packets MPW cannot handle. */
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		max_elts -= segs_n;
-		--pkts_n;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		assert(length);
-		/* Start new session if packet differs. */
-		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
-		    ((mpw.len != length) ||
-		     (segs_n != 1) ||
-		     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-		     (mpw.wqe->eseg.cs_flags != cs_flags)))
-			mlx5_mpw_close(txq, &mpw);
-		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
-			/*
-			 * Multi-Packet WQE consumes at most two WQE.
-			 * mlx5_mpw_new() expects to be able to use such
-			 * resources.
-			 */
-			if (unlikely(max_wqe < 2))
-				break;
-			max_wqe -= 2;
-			mlx5_mpw_new(txq, &mpw, length);
-			mpw.wqe->eseg.cs_flags = cs_flags;
-			mpw.wqe->eseg.flow_table_metadata = metadata;
-		}
-		/* Multi-segment packets must be alone in their MPW. */
-		assert((segs_n == 1) || (mpw.pkts_n == 0));
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-		length = 0;
-#endif
-		do {
-			volatile struct mlx5_wqe_data_seg *dseg;
-			uintptr_t addr;
-
-			assert(buf);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			dseg = mpw.data.dseg[mpw.pkts_n];
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			*dseg = (struct mlx5_wqe_data_seg){
-				.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
-				.lkey = mlx5_tx_mb2mr(txq, buf),
-				.addr = rte_cpu_to_be_64(addr),
-			};
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-			length += DATA_LEN(buf);
-#endif
-			buf = buf->next;
-			++mpw.pkts_n;
-			++j;
-		} while (--segs_n);
-		assert(length == mpw.len);
-		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
-			mlx5_mpw_close(txq, &mpw);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	/* "j" includes both packets and segments. */
-	comp = txq->elts_comp + j;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	if (mpw.state == MLX5_MPW_STATE_OPENED)
-		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * Open a MPW inline session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param length
- *   Packet length.
- */
-static inline void
-mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
-		    uint32_t length)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-	struct mlx5_wqe_inl_small *inl;
-
-	mpw->state = MLX5_MPW_INL_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->len = length;
-	mpw->total_len = 0;
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
-					     (txq->wqe_ci << 8) |
-					     MLX5_OPCODE_TSO);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
-	mpw->wqe->eseg.inline_hdr_sz = 0;
-	mpw->wqe->eseg.cs_flags = 0;
-	mpw->wqe->eseg.rsvd0 = 0;
-	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.flow_table_metadata = 0;
-	inl = (struct mlx5_wqe_inl_small *)
-		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
-	mpw->data.raw = (uint8_t *)&inl->raw;
-}
-
-/**
- * Close a MPW inline session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- */
-static inline void
-mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	unsigned int size;
-	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
-		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
-
-	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
-	/*
-	 * Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
-					     MLX5_WQE_DS(size));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
-	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
-}
-
-/**
- * DPDK callback for TX with MPW inline support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
-			 uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int comp;
-	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-	/*
-	 * Compute the maximum number of WQE which can be consumed by inline
-	 * code.
-	 * - 2 DSEG for:
-	 *   - 1 control segment,
-	 *   - 1 Ethernet segment,
-	 * - N Dseg from the inline request.
-	 */
-	const unsigned int wqe_inl_n =
-		((2 * MLX5_WQE_DWORD_SIZE +
-		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
-		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Prefetch first packet cacheline. */
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
-	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uintptr_t addr;
-		uint32_t length;
-		unsigned int segs_n = buf->nb_segs;
-		uint8_t cs_flags;
-		rte_be32_t metadata;
-
-		/*
-		 * Make sure there is enough room to store this packet and
-		 * that one ring entry remains unused.
-		 */
-		assert(segs_n);
-		if (max_elts < segs_n)
-			break;
-		/* Do not bother with large packets MPW cannot handle. */
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		max_elts -= segs_n;
-		--pkts_n;
-		/*
-		 * Compute max_wqe in case less WQE were consumed in previous
-		 * iteration.
-		 */
-		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		/* Start new session if packet differs. */
-		if (mpw.state == MLX5_MPW_STATE_OPENED) {
-			if ((mpw.len != length) ||
-			    (segs_n != 1) ||
-			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags))
-				mlx5_mpw_close(txq, &mpw);
-		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
-			if ((mpw.len != length) ||
-			    (segs_n != 1) ||
-			    (length > inline_room) ||
-			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
-				mlx5_mpw_inline_close(txq, &mpw);
-				inline_room =
-					txq->max_inline * RTE_CACHE_LINE_SIZE;
-			}
-		}
-		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
-			if ((segs_n != 1) ||
-			    (length > inline_room)) {
-				/*
-				 * Multi-Packet WQE consumes at most two WQE.
-				 * mlx5_mpw_new() expects to be able to use
-				 * such resources.
-				 */
-				if (unlikely(max_wqe < 2))
-					break;
-				max_wqe -= 2;
-				mlx5_mpw_new(txq, &mpw, length);
-				mpw.wqe->eseg.cs_flags = cs_flags;
-				mpw.wqe->eseg.flow_table_metadata = metadata;
-			} else {
-				if (unlikely(max_wqe < wqe_inl_n))
-					break;
-				max_wqe -= wqe_inl_n;
-				mlx5_mpw_inline_new(txq, &mpw, length);
-				mpw.wqe->eseg.cs_flags = cs_flags;
-				mpw.wqe->eseg.flow_table_metadata = metadata;
-			}
-		}
-		/* Multi-segment packets must be alone in their MPW. */
-		assert((segs_n == 1) || (mpw.pkts_n == 0));
-		if (mpw.state == MLX5_MPW_STATE_OPENED) {
-			assert(inline_room ==
-			       txq->max_inline * RTE_CACHE_LINE_SIZE);
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-			length = 0;
-#endif
-			do {
-				volatile struct mlx5_wqe_data_seg *dseg;
-
-				assert(buf);
-				(*txq->elts)[elts_head++ & elts_m] = buf;
-				dseg = mpw.data.dseg[mpw.pkts_n];
-				addr = rte_pktmbuf_mtod(buf, uintptr_t);
-				*dseg = (struct mlx5_wqe_data_seg){
-					.byte_count =
-					       rte_cpu_to_be_32(DATA_LEN(buf)),
-					.lkey = mlx5_tx_mb2mr(txq, buf),
-					.addr = rte_cpu_to_be_64(addr),
-				};
-#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
-				length += DATA_LEN(buf);
-#endif
-				buf = buf->next;
-				++mpw.pkts_n;
-				++j;
-			} while (--segs_n);
-			assert(length == mpw.len);
-			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
-				mlx5_mpw_close(txq, &mpw);
-		} else {
-			unsigned int max;
-
-			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
-			assert(length <= inline_room);
-			assert(length == DATA_LEN(buf));
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			/* Maximum number of bytes before wrapping. */
-			max = ((((uintptr_t)(txq->wqes)) +
-				(1 << txq->wqe_n) *
-				MLX5_WQE_SIZE) -
-			       (uintptr_t)mpw.data.raw);
-			if (length > max) {
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)addr,
-					   max);
-				mpw.data.raw = (volatile void *)txq->wqes;
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)(addr + max),
-					   length - max);
-				mpw.data.raw += length - max;
-			} else {
-				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
-					   (void *)addr,
-					   length);
-
-				if (length == max)
-					mpw.data.raw =
-						(volatile void *)txq->wqes;
-				else
-					mpw.data.raw += length;
-			}
-			++mpw.pkts_n;
-			mpw.total_len += length;
-			++j;
-			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
-				mlx5_mpw_inline_close(txq, &mpw);
-				inline_room =
-					txq->max_inline * RTE_CACHE_LINE_SIZE;
-			} else {
-				inline_room -= length;
-			}
-		}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	/* "j" includes both packets and segments. */
-	comp = txq->elts_comp + j;
-	if (comp >= MLX5_TX_COMP_THRESH) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-	} else {
-		txq->elts_comp = comp;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
-		mlx5_mpw_inline_close(txq, &mpw);
-	else if (mpw.state == MLX5_MPW_STATE_OPENED)
-		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * Open an Enhanced MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- * @param length
- *   Packet length.
- */
-static inline void
-mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
-{
-	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
-
-	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
-	mpw->pkts_n = 0;
-	mpw->total_len = sizeof(struct mlx5_wqe);
-	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->ctrl[0] =
-		rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
-				 (txq->wqe_ci << 8) |
-				 MLX5_OPCODE_ENHANCED_MPSW);
-	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
-					     MLX5_COMP_MODE_OFFSET);
-	mpw->wqe->ctrl[3] = 0;
-	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
-	if (unlikely(padding)) {
-		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
-
-		/* Pad the first 2 DWORDs with zero-length inline header. */
-		*(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
-		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
-			rte_cpu_to_be_32(MLX5_INLINE_SEG);
-		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
-		/* Start from the next WQEBB. */
-		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
-	} else {
-		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
-	}
-}
-
-/**
- * Close an Enhanced MPW session.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param mpw
- *   Pointer to MPW session structure.
- *
- * @return
- *   Number of consumed WQEs.
- */
-static inline uint16_t
-mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
-{
-	uint16_t ret;
-
-	/* Store size in multiple of 16 bytes. Control and Ethernet segments
-	 * count as 2.
-	 */
-	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
-					     MLX5_WQE_DS(mpw->total_len));
-	mpw->state = MLX5_MPW_STATE_CLOSED;
-	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
-	txq->wqe_ci += ret;
-	return ret;
-}
-
-/**
- * TX with Enhanced MPW support.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	       uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	unsigned int i = 0;
-	unsigned int j = 0;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
-	unsigned int mpw_room = 0;
-	unsigned int inl_pad = 0;
-	uint32_t inl_hdr;
-	uint64_t addr_64;
-	struct mlx5_mpw mpw = {
-		.state = MLX5_MPW_STATE_CLOSED,
-	};
-
-	if (unlikely(!pkts_n))
-		return 0;
-	/* Start processing. */
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	if (unlikely(!max_wqe))
-		return 0;
-	do {
-		struct rte_mbuf *buf = *(pkts++);
-		uintptr_t addr;
-		unsigned int do_inline = 0; /* Whether inline is possible. */
-		uint32_t length;
-		uint8_t cs_flags;
-		rte_be32_t metadata;
-
-		/* Multi-segmented packet is handled in slow-path outside. */
-		assert(NB_SEGS(buf) == 1);
-		/* Make sure there is enough room to store this packet. */
-		if (max_elts - j == 0)
-			break;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Copy metadata from mbuf if valid */
-		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
-							     0;
-		/* Retrieve packet information. */
-		length = PKT_LEN(buf);
-		/* Start new session if:
-		 * - multi-segment packet
-		 * - no space left even for a dseg
-		 * - next packet can be inlined with a new WQE
-		 * - cs_flag differs
-		 */
-		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
-			if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
-			     mpw_room) ||
-			    (length <= txq->inline_max_packet_sz &&
-			     inl_pad + sizeof(inl_hdr) + length >
-			     mpw_room) ||
-			     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
-			    (mpw.wqe->eseg.cs_flags != cs_flags))
-				max_wqe -= mlx5_empw_close(txq, &mpw);
-		}
-		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
-			/* In Enhanced MPW, inline as much as the budget is
-			 * allowed. The remaining space is to be filled with
-			 * dsegs. If the title WQEBB isn't padded, it will have
-			 * 2 dsegs there.
-			 */
-			mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
-					   (max_inline ? max_inline :
-					    pkts_n * MLX5_WQE_DWORD_SIZE) +
-					   MLX5_WQE_SIZE);
-			if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
-				break;
-			/* Don't pad the title WQEBB to not waste WQ. */
-			mlx5_empw_new(txq, &mpw, 0);
-			mpw_room -= mpw.total_len;
-			inl_pad = 0;
-			do_inline = length <= txq->inline_max_packet_sz &&
-				    sizeof(inl_hdr) + length <= mpw_room &&
-				    !txq->mpw_hdr_dseg;
-			mpw.wqe->eseg.cs_flags = cs_flags;
-			mpw.wqe->eseg.flow_table_metadata = metadata;
-		} else {
-			/* Evaluate whether the next packet can be inlined.
-			 * Inlininig is possible when:
-			 * - length is less than configured value
-			 * - length fits for remaining space
-			 * - not required to fill the title WQEBB with dsegs
-			 */
-			do_inline =
-				length <= txq->inline_max_packet_sz &&
-				inl_pad + sizeof(inl_hdr) + length <=
-				 mpw_room &&
-				(!txq->mpw_hdr_dseg ||
-				 mpw.total_len >= MLX5_WQE_SIZE);
-		}
-		if (max_inline && do_inline) {
-			/* Inline packet into WQE. */
-			unsigned int max;
-
-			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
-			assert(length == DATA_LEN(buf));
-			inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			mpw.data.raw = (volatile void *)
-				((uintptr_t)mpw.data.raw + inl_pad);
-			max = tx_mlx5_wq_tailroom(txq,
-					(void *)(uintptr_t)mpw.data.raw);
-			/* Copy inline header. */
-			mpw.data.raw = (volatile void *)
-				mlx5_copy_to_wq(
-					  (void *)(uintptr_t)mpw.data.raw,
-					  &inl_hdr,
-					  sizeof(inl_hdr),
-					  (void *)(uintptr_t)txq->wqes,
-					  max);
-			max = tx_mlx5_wq_tailroom(txq,
-					(void *)(uintptr_t)mpw.data.raw);
-			/* Copy packet data. */
-			mpw.data.raw = (volatile void *)
-				mlx5_copy_to_wq(
-					  (void *)(uintptr_t)mpw.data.raw,
-					  (void *)addr,
-					  length,
-					  (void *)(uintptr_t)txq->wqes,
-					  max);
-			++mpw.pkts_n;
-			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
-			/* No need to get completion as the entire packet is
-			 * copied to WQ. Free the buf right away.
-			 */
-			rte_pktmbuf_free_seg(buf);
-			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
-			/* Add pad in the next packet if any. */
-			inl_pad = (((uintptr_t)mpw.data.raw +
-					(MLX5_WQE_DWORD_SIZE - 1)) &
-					~(MLX5_WQE_DWORD_SIZE - 1)) -
-				  (uintptr_t)mpw.data.raw;
-		} else {
-			/* No inline. Load a dseg of packet pointer. */
-			volatile rte_v128u32_t *dseg;
-
-			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
-			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
-			assert(length == DATA_LEN(buf));
-			if (!tx_mlx5_wq_tailroom(txq,
-					(void *)((uintptr_t)mpw.data.raw
-						+ inl_pad)))
-				dseg = (volatile void *)txq->wqes;
-			else
-				dseg = (volatile void *)
-					((uintptr_t)mpw.data.raw +
-					 inl_pad);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
-								    uintptr_t));
-			*dseg = (rte_v128u32_t) {
-				rte_cpu_to_be_32(length),
-				mlx5_tx_mb2mr(txq, buf),
-				addr_64,
-				addr_64 >> 32,
-			};
-			mpw.data.raw = (volatile void *)(dseg + 1);
-			mpw.total_len += (inl_pad + sizeof(*dseg));
-			++j;
-			++mpw.pkts_n;
-			mpw_room -= (inl_pad + sizeof(*dseg));
-			inl_pad = 0;
-		}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += length;
-#endif
-		++i;
-	} while (i < pkts_n);
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Check whether completion threshold has been reached. */
-	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
-			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
-			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
-		volatile struct mlx5_wqe *wqe = mpw.wqe;
-
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request completion on last WQE. */
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		/* Save elts_head in unused "immediate" field of WQE. */
-		wqe->ctrl[3] = elts_head;
-		txq->elts_comp = 0;
-		txq->mpw_comp = txq->wqe_ci;
-	} else {
-		txq->elts_comp += j;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
-		mlx5_empw_close(txq, &mpw);
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, mpw.wqe);
-	txq->elts_head = elts_head;
-	return i;
-}
-
-/**
- * DPDK callback for TX with Enhanced MPW support.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint16_t n;
-		uint16_t ret;
-
-		n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
-		if (n) {
-			ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
-			if (!ret)
-				break;
-			nb_tx += ret;
-		}
-		n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
-		if (n) {
-			ret = txq_burst_empw(txq, &pkts[nb_tx], n);
-			if (!ret)
-				break;
-			nb_tx += ret;
-		}
-	}
-	return nb_tx;
-}
-
-/**
  * Translate RX completion flags to packet type.
  *
  * @param[in] rxq
@@ -2867,22 +1492,6 @@
  */
 
 __rte_weak uint16_t
-mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused,
-		      struct rte_mbuf **pkts __rte_unused,
-		      uint16_t pkts_n __rte_unused)
-{
-	return 0;
-}
-
-__rte_weak uint16_t
-mlx5_tx_burst_vec(void *dpdk_txq __rte_unused,
-		  struct rte_mbuf **pkts __rte_unused,
-		  uint16_t pkts_n __rte_unused)
-{
-	return 0;
-}
-
-__rte_weak uint16_t
 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
 		  struct rte_mbuf **pkts __rte_unused,
 		  uint16_t pkts_n __rte_unused)
@@ -2891,25 +1500,50 @@
 }
 
 __rte_weak int
-mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
+mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
 {
 	return -ENOTSUP;
 }
 
 __rte_weak int
-mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
+mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
 {
 	return -ENOTSUP;
 }
 
-__rte_weak int
-mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
+/**
+ * DPDK callback to check the status of a Tx descriptor.
+ *
+ * @param tx_queue
+ *   The Tx queue.
+ * @param[in] offset
+ *   The index of the descriptor in the ring.
+ *
+ * @return
+ *   The status of the Tx descriptor.
+ */
+int
+mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
 {
-	return -ENOTSUP;
+	(void)tx_queue;
+	(void)offset;
+	return RTE_ETH_TX_DESC_FULL;
 }
 
-__rte_weak int
-mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
+/**
+ * Configure the TX function to use.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ *
+ * @return
+ *   Pointer to selected Tx burst function.
+ */
+eth_tx_burst_t
+mlx5_select_tx_function(struct rte_eth_dev *dev)
 {
-	return -ENOTSUP;
+	(void)dev;
+	return removed_tx_burst;
 }
+
+
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3d79c18..acde09d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -329,14 +329,6 @@ struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
 void mlx5_set_ptype_table(void);
 void mlx5_set_cksum_table(void);
 void mlx5_set_swp_types_table(void);
-uint16_t mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
-		       uint16_t pkts_n);
-uint16_t mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts,
-			   uint16_t pkts_n);
-uint16_t mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
-				  uint16_t pkts_n);
-uint16_t mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts,
-			    uint16_t pkts_n);
 __rte_noinline uint16_t mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
 					volatile struct mlx5_err_cqe *err_cqe);
 uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
@@ -360,14 +352,8 @@ int mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
 			const struct mlx5_mp_arg_queue_state_modify *sm);
 
 /* Vectorized version of mlx5_rxtx.c */
-int mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev);
-int mlx5_check_vec_tx_support(struct rte_eth_dev *dev);
 int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);
 int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);
-uint16_t mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-			       uint16_t pkts_n);
-uint16_t mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-			   uint16_t pkts_n);
 uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
 			   uint16_t pkts_n);
 
@@ -478,122 +464,6 @@ enum mlx5_cqe_status {
 }
 
 /**
- * Return the address of the WQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param  wqe_ci
- *   WQE consumer index.
- *
- * @return
- *   WQE address.
- */
-static inline uintptr_t *
-tx_mlx5_wqe(struct mlx5_txq_data *txq, uint16_t ci)
-{
-	ci &= ((1 << txq->wqe_n) - 1);
-	return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
-}
-
-/**
- * Handle the next CQE.
- *
- * @param txq
- *   Pointer to TX queue structure.
- *
- * @return
- *   The last Tx buffer element to free.
- */
-static __rte_always_inline uint16_t
-mlx5_tx_cqe_handle(struct mlx5_txq_data *txq)
-{
-	const unsigned int cqe_n = 1 << txq->cqe_n;
-	const unsigned int cqe_cnt = cqe_n - 1;
-	uint16_t last_elts;
-	union {
-		volatile struct mlx5_cqe *cqe;
-		volatile struct mlx5_err_cqe *err_cqe;
-	} u = {
-		.cqe =  &(*txq->cqes)[txq->cq_ci & cqe_cnt],
-	};
-	int ret = check_cqe(u.cqe, cqe_n, txq->cq_ci);
-
-	if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
-		if (unlikely(ret == MLX5_CQE_STATUS_ERR))
-			last_elts = mlx5_tx_error_cqe_handle(txq, u.err_cqe);
-		else
-			/* Do not release buffers. */
-			return txq->elts_tail;
-	} else {
-		uint16_t new_wqe_pi = rte_be_to_cpu_16(u.cqe->wqe_counter);
-		volatile struct mlx5_wqe_ctrl *ctrl =
-				(volatile struct mlx5_wqe_ctrl *)
-					tx_mlx5_wqe(txq, new_wqe_pi);
-
-		/* Release completion burst buffers. */
-		last_elts = ctrl->ctrl3;
-		txq->wqe_pi = new_wqe_pi;
-		txq->cq_ci++;
-	}
-	rte_compiler_barrier();
-	*txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
-	return last_elts;
-}
-
-/**
- * Manage TX completions.
- *
- * When sending a burst, mlx5_tx_burst() posts several WRs.
- *
- * @param txq
- *   Pointer to TX queue structure.
- */
-static __rte_always_inline void
-mlx5_tx_complete(struct mlx5_txq_data *txq)
-{
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	uint16_t elts_free = txq->elts_tail;
-	uint16_t elts_tail;
-	struct rte_mbuf *m, *free[elts_n];
-	struct rte_mempool *pool = NULL;
-	unsigned int blk_n = 0;
-
-	elts_tail = mlx5_tx_cqe_handle(txq);
-	assert((elts_tail & elts_m) < (1 << txq->wqe_n));
-	/* Free buffers. */
-	while (elts_free != elts_tail) {
-		m = rte_pktmbuf_prefree_seg((*txq->elts)[elts_free++ & elts_m]);
-		if (likely(m != NULL)) {
-			if (likely(m->pool == pool)) {
-				free[blk_n++] = m;
-			} else {
-				if (likely(pool != NULL))
-					rte_mempool_put_bulk(pool,
-							     (void *)free,
-							     blk_n);
-				free[0] = m;
-				pool = m->pool;
-				blk_n = 1;
-			}
-		}
-	}
-	if (blk_n)
-		rte_mempool_put_bulk(pool, (void *)free, blk_n);
-#ifndef NDEBUG
-	elts_free = txq->elts_tail;
-	/* Poisoning. */
-	while (elts_free != elts_tail) {
-		memset(&(*txq->elts)[elts_free & elts_m],
-		       0x66,
-		       sizeof((*txq->elts)[elts_free & elts_m]));
-		++elts_free;
-	}
-#endif
-	txq->elts_tail = elts_tail;
-}
-
-/**
  * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which the
  * cloned mbuf is allocated is returned instead.
  *
@@ -710,147 +580,4 @@ enum mlx5_cqe_status {
 	mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
 }
 
-/**
- * Convert mbuf to Verb SWP.
- *
- * @param txq_data
- *   Pointer to the Tx queue.
- * @param buf
- *   Pointer to the mbuf.
- * @param offsets
- *   Pointer to the SWP header offsets.
- * @param swp_types
- *   Pointer to the SWP header types.
- */
-static __rte_always_inline void
-txq_mbuf_to_swp(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
-		uint8_t *offsets, uint8_t *swp_types)
-{
-	const uint64_t vlan = buf->ol_flags & PKT_TX_VLAN_PKT;
-	const uint64_t tunnel = buf->ol_flags & PKT_TX_TUNNEL_MASK;
-	const uint64_t tso = buf->ol_flags & PKT_TX_TCP_SEG;
-	const uint64_t csum_flags = buf->ol_flags & PKT_TX_L4_MASK;
-	const uint64_t inner_ip =
-		buf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6);
-	const uint64_t ol_flags_mask = PKT_TX_L4_MASK | PKT_TX_IPV6 |
-				       PKT_TX_OUTER_IPV6;
-	uint16_t idx;
-	uint16_t off;
-
-	if (likely(!txq->swp_en || (tunnel != PKT_TX_TUNNEL_UDP &&
-				    tunnel != PKT_TX_TUNNEL_IP)))
-		return;
-	/*
-	 * The index should have:
-	 * bit[0:1] = PKT_TX_L4_MASK
-	 * bit[4] = PKT_TX_IPV6
-	 * bit[8] = PKT_TX_OUTER_IPV6
-	 * bit[9] = PKT_TX_OUTER_UDP
-	 */
-	idx = (buf->ol_flags & ol_flags_mask) >> 52;
-	if (tunnel == PKT_TX_TUNNEL_UDP)
-		idx |= 1 << 9;
-	*swp_types = mlx5_swp_types_table[idx];
-	/*
-	 * Set offsets for SW parser. Since ConnectX-5, SW parser just
-	 * complements HW parser. SW parser starts to engage only if HW parser
-	 * can't reach a header. For the older devices, HW parser will not kick
-	 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
-	 * should be set regardless of HW offload.
-	 */
-	off = buf->outer_l2_len + (vlan ? sizeof(struct rte_vlan_hdr) : 0);
-	offsets[1] = off >> 1; /* Outer L3 offset. */
-	off += buf->outer_l3_len;
-	if (tunnel == PKT_TX_TUNNEL_UDP)
-		offsets[0] = off >> 1; /* Outer L4 offset. */
-	if (inner_ip) {
-		off += buf->l2_len;
-		offsets[3] = off >> 1; /* Inner L3 offset. */
-		if (csum_flags == PKT_TX_TCP_CKSUM || tso ||
-		    csum_flags == PKT_TX_UDP_CKSUM) {
-			off += buf->l3_len;
-			offsets[2] = off >> 1; /* Inner L4 offset. */
-		}
-	}
-}
-
-/**
- * Convert the Checksum offloads to Verbs.
- *
- * @param buf
- *   Pointer to the mbuf.
- *
- * @return
- *   Converted checksum flags.
- */
-static __rte_always_inline uint8_t
-txq_ol_cksum_to_cs(struct rte_mbuf *buf)
-{
-	uint32_t idx;
-	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
-	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
-				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
-
-	/*
-	 * The index should have:
-	 * bit[0] = PKT_TX_TCP_SEG
-	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
-	 * bit[4] = PKT_TX_IP_CKSUM
-	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
-	 * bit[9] = tunnel
-	 */
-	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
-	return mlx5_cksum_table[idx];
-}
-
-/**
- * Count the number of contiguous single segment packets.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- *
- * @return
- *   Number of contiguous single segment packets.
- */
-static __rte_always_inline unsigned int
-txq_count_contig_single_seg(struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	unsigned int pos;
-
-	if (!pkts_n)
-		return 0;
-	/* Count the number of contiguous single segment packets. */
-	for (pos = 0; pos < pkts_n; ++pos)
-		if (NB_SEGS(pkts[pos]) > 1)
-			break;
-	return pos;
-}
-
-/**
- * Count the number of contiguous multi-segment packets.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- *
- * @return
- *   Number of contiguous multi-segment packets.
- */
-static __rte_always_inline unsigned int
-txq_count_contig_multi_seg(struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	unsigned int pos;
-
-	if (!pkts_n)
-		return 0;
-	/* Count the number of contiguous multi-segment packets. */
-	for (pos = 0; pos < pkts_n; ++pos)
-		if (NB_SEGS(pkts[pos]) == 1)
-			break;
-	return pos;
-}
-
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index 073044f..f6ec828 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -40,138 +40,6 @@
 #endif
 
 /**
- * Count the number of packets having same ol_flags and same metadata (if
- * PKT_TX_METADATA is set in ol_flags), and calculate cs_flags.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- * @param cs_flags
- *   Pointer of flags to be returned.
- * @param metadata
- *   Pointer of metadata to be returned.
- * @param txq_offloads
- *   Offloads enabled on Tx queue
- *
- * @return
- *   Number of packets having same ol_flags and metadata, if relevant.
- */
-static inline unsigned int
-txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags,
-		 rte_be32_t *metadata, const uint64_t txq_offloads)
-{
-	unsigned int pos;
-	const uint64_t cksum_ol_mask =
-		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
-		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
-		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
-	rte_be32_t p0_metadata, pn_metadata;
-
-	if (!pkts_n)
-		return 0;
-	p0_metadata = pkts[0]->ol_flags & PKT_TX_METADATA ?
-			pkts[0]->tx_metadata : 0;
-	/* Count the number of packets having same offload parameters. */
-	for (pos = 1; pos < pkts_n; ++pos) {
-		/* Check if packet has same checksum flags. */
-		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) &&
-		    ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & cksum_ol_mask))
-			break;
-		/* Check if packet has same metadata. */
-		if (txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) {
-			pn_metadata = pkts[pos]->ol_flags & PKT_TX_METADATA ?
-					pkts[pos]->tx_metadata : 0;
-			if (pn_metadata != p0_metadata)
-				break;
-		}
-	}
-	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
-	*metadata = p0_metadata;
-	return pos;
-}
-
-/**
- * DPDK callback for vectorized TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-		      uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint16_t n;
-		uint16_t ret;
-
-		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0, 0);
-		nb_tx += ret;
-		if (!ret)
-			break;
-	}
-	return nb_tx;
-}
-
-/**
- * DPDK callback for vectorized TX with multi-seg packets and offload.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint8_t cs_flags = 0;
-		uint16_t n;
-		uint16_t ret;
-		rte_be32_t metadata = 0;
-
-		/* Transmit multi-seg packets in the head of pkts list. */
-		if ((txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS) &&
-		    NB_SEGS(pkts[nb_tx]) > 1)
-			nb_tx += txq_scatter_v(txq,
-					       &pkts[nb_tx],
-					       pkts_n - nb_tx);
-		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-		if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
-			n = txq_count_contig_single_seg(&pkts[nb_tx], n);
-		if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP |
-				     DEV_TX_OFFLOAD_MATCH_METADATA))
-			n = txq_calc_offload(&pkts[nb_tx], n,
-					     &cs_flags, &metadata,
-					     txq->offloads);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags, metadata);
-		nb_tx += ret;
-		if (!ret)
-			break;
-	}
-	return nb_tx;
-}
-
-/**
  * Skip error packets.
  *
  * @param rxq
@@ -243,49 +111,6 @@
 }
 
 /**
- * Check Tx queue flags are set for raw vectorized Tx.
- *
- * @param dev
- *   Pointer to Ethernet device.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev)
-{
-	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
-
-	/* Doesn't support any offload. */
-	if (offloads)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
- * Check a device can support vectorized TX.
- *
- * @param dev
- *   Pointer to Ethernet device.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-mlx5_check_vec_tx_support(struct rte_eth_dev *dev)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
-
-	if (!priv->config.tx_vec_en ||
-	    priv->txqs_n > (unsigned int)priv->config.txqs_vec ||
-	    priv->config.mps != MLX5_MPW_ENHANCED ||
-	    offloads & ~MLX5_VEC_TX_OFFLOAD_CAP)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
  * Check a RX queue can support vectorized RX.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 1c7e3b4..9930286 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -27,295 +27,6 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 
 /**
- * Fill in buffer descriptors in a multi-packet send descriptor.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param dseg
- *   Pointer to buffer descriptor to be written.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param n
- *   Number of packets to be filled.
- */
-static inline void
-txq_wr_dseg_v(struct mlx5_txq_data *txq, uint8_t *dseg,
-	      struct rte_mbuf **pkts, unsigned int n)
-{
-	unsigned int pos;
-	uintptr_t addr;
-	const uint8x16_t dseg_shuf_m = {
-		 3,  2,  1,  0, /* length, bswap32 */
-		 4,  5,  6,  7, /* lkey */
-		15, 14, 13, 12, /* addr, bswap64 */
-		11, 10,  9,  8
-	};
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	uint32_t tx_byte = 0;
-#endif
-
-	for (pos = 0; pos < n; ++pos, dseg += MLX5_WQE_DWORD_SIZE) {
-		uint8x16_t desc;
-		struct rte_mbuf *pkt = pkts[pos];
-
-		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
-		desc = vreinterpretq_u8_u32((uint32x4_t) {
-				DATA_LEN(pkt),
-				mlx5_tx_mb2mr(txq, pkt),
-				addr,
-				addr >> 32 });
-		desc = vqtbl1q_u8(desc, dseg_shuf_m);
-		vst1q_u8(dseg, desc);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		tx_byte += DATA_LEN(pkt);
-#endif
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.obytes += tx_byte;
-#endif
-}
-
-/**
- * Send multi-segmented packets until it encounters a single segment packet in
- * the pkts list.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	      uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n;
-	volatile struct mlx5_wqe *wqe = NULL;
-	bool metadata_ol =
-		txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? true : false;
-
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	if (unlikely(!pkts_n))
-		return 0;
-	for (n = 0; n < pkts_n; ++n) {
-		struct rte_mbuf *buf = pkts[n];
-		unsigned int segs_n = buf->nb_segs;
-		unsigned int ds = nb_dword_in_hdr;
-		unsigned int len = PKT_LEN(buf);
-		uint16_t wqe_ci = txq->wqe_ci;
-		const uint8x16_t ctrl_shuf_m = {
-			3,  2,  1,  0, /* bswap32 */
-			7,  6,  5,  4, /* bswap32 */
-			11, 10,  9,  8, /* bswap32 */
-			12, 13, 14, 15
-		};
-		uint8_t cs_flags;
-		uint16_t max_elts;
-		uint16_t max_wqe;
-		uint8x16_t *t_wqe;
-		uint8_t *dseg;
-		uint8x16_t ctrl;
-		rte_be32_t metadata =
-			metadata_ol && (buf->ol_flags & PKT_TX_METADATA) ?
-			buf->tx_metadata : 0;
-
-		assert(segs_n);
-		max_elts = elts_n - (elts_head - txq->elts_tail);
-		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
-		/*
-		 * A MPW session consumes 2 WQEs at most to
-		 * include MLX5_MPW_DSEG_MAX pointers.
-		 */
-		if (segs_n == 1 ||
-		    max_elts < segs_n || max_wqe < 2)
-			break;
-		wqe = &((volatile struct mlx5_wqe64 *)
-			 txq->wqes)[wqe_ci & wq_mask].hdr;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Title WQEBB pointer. */
-		t_wqe = (uint8x16_t *)wqe;
-		dseg = (uint8_t *)(wqe + 1);
-		do {
-			if (!(ds++ % nb_dword_per_wqebb)) {
-				dseg = (uint8_t *)
-					&((volatile struct mlx5_wqe64 *)
-					   txq->wqes)[++wqe_ci & wq_mask];
-			}
-			txq_wr_dseg_v(txq, dseg, &buf, 1);
-			dseg += MLX5_WQE_DWORD_SIZE;
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			buf = buf->next;
-		} while (--segs_n);
-		++wqe_ci;
-		/* Fill CTRL in the header. */
-		ctrl = vreinterpretq_u8_u32((uint32x4_t) {
-				MLX5_OPC_MOD_MPW << 24 |
-				txq->wqe_ci << 8 | MLX5_OPCODE_TSO,
-				txq->qp_num_8s | ds, 4, 0});
-		ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
-		vst1q_u8((void *)t_wqe, ctrl);
-		/* Fill ESEG in the header. */
-		vst1q_u32((void *)(t_wqe + 1),
-			  ((uint32x4_t){ 0,
-					 rte_cpu_to_be_16(len) << 16 | cs_flags,
-					 metadata, 0 }));
-		txq->wqe_ci = wqe_ci;
-	}
-	if (!n)
-		return 0;
-	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
-	txq->elts_head = elts_head;
-	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		wqe->ctrl[3] = txq->elts_head;
-		txq->elts_comp = 0;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += n;
-#endif
-	mlx5_tx_dbrec(txq, wqe);
-	return n;
-}
-
-/**
- * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
- * it returns to make it processed by txq_scatter_v(). All the packets in
- * the pkts list should be single segment packets having same offload flags.
- * This must be checked by txq_count_contig_single_seg() and txq_calc_offload().
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
- * @param cs_flags
- *   Checksum offload flags to be written in the descriptor.
- * @param metadata
- *   Metadata value to be written in the descriptor.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint8_t cs_flags, rte_be32_t metadata)
-{
-	struct rte_mbuf **elts;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n = 0;
-	unsigned int pos;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	uint32_t comp_req;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	uint16_t wq_idx = txq->wqe_ci & wq_mask;
-	volatile struct mlx5_wqe64 *wq =
-		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
-	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
-	const uint8x16_t ctrl_shuf_m = {
-		 3,  2,  1,  0, /* bswap32 */
-		 7,  6,  5,  4, /* bswap32 */
-		11, 10,  9,  8, /* bswap32 */
-		12, 13, 14, 15
-	};
-	uint8x16_t *t_wqe;
-	uint8_t *dseg;
-	uint8x16_t ctrl;
-
-	/* Make sure all packets can fit into a single WQE. */
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
-	if (unlikely(!pkts_n))
-		return 0;
-	elts = &(*txq->elts)[elts_head & elts_m];
-	/* Loop for available tailroom first. */
-	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
-	for (pos = 0; pos < (n & -2); pos += 2)
-		vst1q_u64((void *)&elts[pos], vld1q_u64((void *)&pkts[pos]));
-	if (n & 1)
-		elts[pos] = pkts[pos];
-	/* Check if it crosses the end of the queue. */
-	if (unlikely(n < pkts_n)) {
-		elts = &(*txq->elts)[0];
-		for (pos = 0; pos < pkts_n - n; ++pos)
-			elts[pos] = pkts[n + pos];
-	}
-	txq->elts_head += pkts_n;
-	/* Save title WQEBB pointer. */
-	t_wqe = (uint8x16_t *)wqe;
-	dseg = (uint8_t *)(wqe + 1);
-	/* Calculate the number of entries to the end. */
-	n = RTE_MIN(
-		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
-		pkts_n);
-	/* Fill DSEGs. */
-	txq_wr_dseg_v(txq, dseg, pkts, n);
-	/* Check if it crosses the end of the queue. */
-	if (n < pkts_n) {
-		dseg = (uint8_t *)txq->wqes;
-		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
-	}
-	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
-		txq->elts_comp += pkts_n;
-		comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET;
-	} else {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request a completion. */
-		txq->elts_comp = 0;
-		comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET;
-	}
-	/* Fill CTRL in the header. */
-	ctrl = vreinterpretq_u8_u32((uint32x4_t) {
-			MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
-			txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW,
-			txq->qp_num_8s | (pkts_n + 2),
-			comp_req,
-			txq->elts_head });
-	ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
-	vst1q_u8((void *)t_wqe, ctrl);
-	/* Fill ESEG in the header. */
-	vst1q_u32((void *)(t_wqe + 1),
-		 ((uint32x4_t) { 0, cs_flags, metadata, 0 }));
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += pkts_n;
-#endif
-	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
-		       nb_dword_per_wqebb;
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
-	return pkts_n;
-}
-
-/**
  * Store free buffers to RX SW ring.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 503ca0f..7bd254f 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -29,290 +29,6 @@
 #endif
 
 /**
- * Fill in buffer descriptors in a multi-packet send descriptor.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param dseg
- *   Pointer to buffer descriptor to be written.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param n
- *   Number of packets to be filled.
- */
-static inline void
-txq_wr_dseg_v(struct mlx5_txq_data *txq, __m128i *dseg,
-	      struct rte_mbuf **pkts, unsigned int n)
-{
-	unsigned int pos;
-	uintptr_t addr;
-	const __m128i shuf_mask_dseg =
-		_mm_set_epi8(8,  9, 10, 11, /* addr, bswap64 */
-			    12, 13, 14, 15,
-			     7,  6,  5,  4, /* lkey */
-			     0,  1,  2,  3  /* length, bswap32 */);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	uint32_t tx_byte = 0;
-#endif
-
-	for (pos = 0; pos < n; ++pos, ++dseg) {
-		__m128i desc;
-		struct rte_mbuf *pkt = pkts[pos];
-
-		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
-		desc = _mm_set_epi32(addr >> 32,
-				     addr,
-				     mlx5_tx_mb2mr(txq, pkt),
-				     DATA_LEN(pkt));
-		desc = _mm_shuffle_epi8(desc, shuf_mask_dseg);
-		_mm_store_si128(dseg, desc);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-		tx_byte += DATA_LEN(pkt);
-#endif
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.obytes += tx_byte;
-#endif
-}
-
-/**
- * Send multi-segmented packets until it encounters a single segment packet in
- * the pkts list.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
-	      uint16_t pkts_n)
-{
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n;
-	volatile struct mlx5_wqe *wqe = NULL;
-	bool metadata_ol =
-		txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? true : false;
-
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	if (unlikely(!pkts_n))
-		return 0;
-	for (n = 0; n < pkts_n; ++n) {
-		struct rte_mbuf *buf = pkts[n];
-		unsigned int segs_n = buf->nb_segs;
-		unsigned int ds = nb_dword_in_hdr;
-		unsigned int len = PKT_LEN(buf);
-		uint16_t wqe_ci = txq->wqe_ci;
-		const __m128i shuf_mask_ctrl =
-			_mm_set_epi8(15, 14, 13, 12,
-				      8,  9, 10, 11, /* bswap32 */
-				      4,  5,  6,  7, /* bswap32 */
-				      0,  1,  2,  3  /* bswap32 */);
-		uint8_t cs_flags;
-		uint16_t max_elts;
-		uint16_t max_wqe;
-		__m128i *t_wqe, *dseg;
-		__m128i ctrl;
-		rte_be32_t metadata =
-			metadata_ol && (buf->ol_flags & PKT_TX_METADATA) ?
-			buf->tx_metadata : 0;
-
-		assert(segs_n);
-		max_elts = elts_n - (elts_head - txq->elts_tail);
-		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
-		/*
-		 * A MPW session consumes 2 WQEs at most to
-		 * include MLX5_MPW_DSEG_MAX pointers.
-		 */
-		if (segs_n == 1 ||
-		    max_elts < segs_n || max_wqe < 2)
-			break;
-		if (segs_n > MLX5_MPW_DSEG_MAX) {
-			txq->stats.oerrors++;
-			break;
-		}
-		wqe = &((volatile struct mlx5_wqe64 *)
-			 txq->wqes)[wqe_ci & wq_mask].hdr;
-		cs_flags = txq_ol_cksum_to_cs(buf);
-		/* Title WQEBB pointer. */
-		t_wqe = (__m128i *)wqe;
-		dseg = (__m128i *)(wqe + 1);
-		do {
-			if (!(ds++ % nb_dword_per_wqebb)) {
-				dseg = (__m128i *)
-					&((volatile struct mlx5_wqe64 *)
-					   txq->wqes)[++wqe_ci & wq_mask];
-			}
-			txq_wr_dseg_v(txq, dseg++, &buf, 1);
-			(*txq->elts)[elts_head++ & elts_m] = buf;
-			buf = buf->next;
-		} while (--segs_n);
-		++wqe_ci;
-		/* Fill CTRL in the header. */
-		ctrl = _mm_set_epi32(0, 4, txq->qp_num_8s | ds,
-				     MLX5_OPC_MOD_MPW << 24 |
-				     txq->wqe_ci << 8 | MLX5_OPCODE_TSO);
-		ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
-		_mm_store_si128(t_wqe, ctrl);
-		/* Fill ESEG in the header. */
-		_mm_store_si128(t_wqe + 1,
-				_mm_set_epi32(0, metadata,
-					      (rte_cpu_to_be_16(len) << 16) |
-					      cs_flags, 0));
-		txq->wqe_ci = wqe_ci;
-	}
-	if (!n)
-		return 0;
-	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
-	txq->elts_head = elts_head;
-	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
-						MLX5_COMP_MODE_OFFSET);
-		wqe->ctrl[3] = txq->elts_head;
-		txq->elts_comp = 0;
-	}
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += n;
-#endif
-	mlx5_tx_dbrec(txq, wqe);
-	return n;
-}
-
-/**
- * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
- * it returns to make it processed by txq_scatter_v(). All the packets in
- * the pkts list should be single segment packets having same offload flags.
- * This must be checked by txq_count_contig_single_seg() and txq_calc_offload().
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets to be sent.
- * @param pkts_n
- *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
- * @param cs_flags
- *   Checksum offload flags to be written in the descriptor.
- * @param metadata
- *   Metadata value to be written in the descriptor.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static inline uint16_t
-txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint8_t cs_flags, rte_be32_t metadata)
-{
-	struct rte_mbuf **elts;
-	uint16_t elts_head = txq->elts_head;
-	const uint16_t elts_n = 1 << txq->elts_n;
-	const uint16_t elts_m = elts_n - 1;
-	const unsigned int nb_dword_per_wqebb =
-		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
-	const unsigned int nb_dword_in_hdr =
-		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
-	unsigned int n = 0;
-	unsigned int pos;
-	uint16_t max_elts;
-	uint16_t max_wqe;
-	uint32_t comp_req;
-	const uint16_t wq_n = 1 << txq->wqe_n;
-	const uint16_t wq_mask = wq_n - 1;
-	uint16_t wq_idx = txq->wqe_ci & wq_mask;
-	volatile struct mlx5_wqe64 *wq =
-		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
-	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
-	const __m128i shuf_mask_ctrl =
-		_mm_set_epi8(15, 14, 13, 12,
-			      8,  9, 10, 11, /* bswap32 */
-			      4,  5,  6,  7, /* bswap32 */
-			      0,  1,  2,  3  /* bswap32 */);
-	__m128i *t_wqe, *dseg;
-	__m128i ctrl;
-
-	/* Make sure all packets can fit into a single WQE. */
-	assert(elts_n > pkts_n);
-	mlx5_tx_complete(txq);
-	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
-	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
-	assert(pkts_n <= MLX5_DSEG_MAX - nb_dword_in_hdr);
-	if (unlikely(!pkts_n))
-		return 0;
-	elts = &(*txq->elts)[elts_head & elts_m];
-	/* Loop for available tailroom first. */
-	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
-	for (pos = 0; pos < (n & -2); pos += 2)
-		_mm_storeu_si128((__m128i *)&elts[pos],
-				 _mm_loadu_si128((__m128i *)&pkts[pos]));
-	if (n & 1)
-		elts[pos] = pkts[pos];
-	/* Check if it crosses the end of the queue. */
-	if (unlikely(n < pkts_n)) {
-		elts = &(*txq->elts)[0];
-		for (pos = 0; pos < pkts_n - n; ++pos)
-			elts[pos] = pkts[n + pos];
-	}
-	txq->elts_head += pkts_n;
-	/* Save title WQEBB pointer. */
-	t_wqe = (__m128i *)wqe;
-	dseg = (__m128i *)(wqe + 1);
-	/* Calculate the number of entries to the end. */
-	n = RTE_MIN(
-		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
-		pkts_n);
-	/* Fill DSEGs. */
-	txq_wr_dseg_v(txq, dseg, pkts, n);
-	/* Check if it crosses the end of the queue. */
-	if (n < pkts_n) {
-		dseg = (__m128i *)txq->wqes;
-		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
-	}
-	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
-		txq->elts_comp += pkts_n;
-		comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET;
-	} else {
-		/* A CQE slot must always be available. */
-		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-		/* Request a completion. */
-		txq->elts_comp = 0;
-		comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET;
-	}
-	/* Fill CTRL in the header. */
-	ctrl = _mm_set_epi32(txq->elts_head, comp_req,
-			     txq->qp_num_8s | (pkts_n + 2),
-			     MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
-				txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW);
-	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
-	_mm_store_si128(t_wqe, ctrl);
-	/* Fill ESEG in the header. */
-	_mm_store_si128(t_wqe + 1, _mm_set_epi32(0, metadata, cs_flags, 0));
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	txq->stats.opackets += pkts_n;
-#endif
-	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
-		       nb_dword_per_wqebb;
-	/* Ring QP doorbell. */
-	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
-	return pkts_n;
-}
-
-/**
  * Store free buffers to RX SW ring.
  *
  * @param rxq
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 82493d7..55892e2 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -365,25 +365,6 @@
 }
 
 /**
- * Check if the burst function is using eMPW.
- *
- * @param tx_pkt_burst
- *   Tx burst function pointer.
- *
- * @return
- *   1 if the burst function is using eMPW, 0 otherwise.
- */
-static int
-is_empw_burst_func(eth_tx_burst_t tx_pkt_burst)
-{
-	if (tx_pkt_burst == mlx5_tx_burst_raw_vec ||
-	    tx_pkt_burst == mlx5_tx_burst_vec ||
-	    tx_pkt_burst == mlx5_tx_burst_empw)
-		return 1;
-	return 0;
-}
-
-/**
  * Create the Tx queue Verbs object.
  *
  * @param dev
@@ -414,7 +395,6 @@ struct mlx5_txq_ibv *
 	struct mlx5dv_cq cq_info;
 	struct mlx5dv_obj obj;
 	const int desc = 1 << txq_data->elts_n;
-	eth_tx_burst_t tx_pkt_burst = mlx5_select_tx_function(dev);
 	int ret = 0;
 
 	assert(txq_data);
@@ -432,8 +412,6 @@ struct mlx5_txq_ibv *
 		.comp_mask = 0,
 	};
 	cqe_n = desc / MLX5_TX_COMP_THRESH + 1;
-	if (is_empw_burst_func(tx_pkt_burst))
-		cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV;
 	tmpl.cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0);
 	if (tmpl.cq == NULL) {
 		DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure",
@@ -698,93 +676,7 @@ struct mlx5_txq_ibv *
 static void
 txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
 {
-	struct mlx5_priv *priv = txq_ctrl->priv;
-	struct mlx5_dev_config *config = &priv->config;
-	const unsigned int max_tso_inline =
-		((MLX5_MAX_TSO_HEADER + (RTE_CACHE_LINE_SIZE - 1)) /
-		 RTE_CACHE_LINE_SIZE);
-	unsigned int txq_inline;
-	unsigned int txqs_inline;
-	unsigned int inline_max_packet_sz;
-	eth_tx_burst_t tx_pkt_burst =
-		mlx5_select_tx_function(ETH_DEV(priv));
-	int is_empw_func = is_empw_burst_func(tx_pkt_burst);
-	int tso = !!(txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO |
-					       DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
-					       DEV_TX_OFFLOAD_GRE_TNL_TSO |
-					       DEV_TX_OFFLOAD_IP_TNL_TSO |
-					       DEV_TX_OFFLOAD_UDP_TNL_TSO));
-
-	txq_inline = (config->txq_inline == MLX5_ARG_UNSET) ?
-		0 : config->txq_inline;
-	txqs_inline = (config->txqs_inline == MLX5_ARG_UNSET) ?
-		0 : config->txqs_inline;
-	inline_max_packet_sz =
-		(config->inline_max_packet_sz == MLX5_ARG_UNSET) ?
-		0 : config->inline_max_packet_sz;
-	if (is_empw_func) {
-		if (config->txq_inline == MLX5_ARG_UNSET)
-			txq_inline = MLX5_WQE_SIZE_MAX - MLX5_WQE_SIZE;
-		if (config->txqs_inline == MLX5_ARG_UNSET)
-			txqs_inline = MLX5_EMPW_MIN_TXQS;
-		if (config->inline_max_packet_sz == MLX5_ARG_UNSET)
-			inline_max_packet_sz = MLX5_EMPW_MAX_INLINE_LEN;
-		txq_ctrl->txq.mpw_hdr_dseg = config->mpw_hdr_dseg;
-		txq_ctrl->txq.inline_max_packet_sz = inline_max_packet_sz;
-	}
-	if (txq_inline && priv->txqs_n >= txqs_inline) {
-		unsigned int ds_cnt;
-
-		txq_ctrl->txq.max_inline =
-			((txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
-			 RTE_CACHE_LINE_SIZE);
-		if (is_empw_func) {
-			/* To minimize the size of data set, avoid requesting
-			 * too large WQ.
-			 */
-			txq_ctrl->max_inline_data =
-				((RTE_MIN(txq_inline,
-					  inline_max_packet_sz) +
-				  (RTE_CACHE_LINE_SIZE - 1)) /
-				 RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
-		} else {
-			txq_ctrl->max_inline_data =
-				txq_ctrl->txq.max_inline * RTE_CACHE_LINE_SIZE;
-		}
-		/*
-		 * Check if the inline size is too large in a way which
-		 * can make the WQE DS to overflow.
-		 * Considering in calculation:
-		 *      WQE CTRL (1 DS)
-		 *      WQE ETH  (1 DS)
-		 *      Inline part (N DS)
-		 */
-		ds_cnt = 2 + (txq_ctrl->txq.max_inline / MLX5_WQE_DWORD_SIZE);
-		if (ds_cnt > MLX5_DSEG_MAX) {
-			unsigned int max_inline = (MLX5_DSEG_MAX - 2) *
-						  MLX5_WQE_DWORD_SIZE;
-
-			max_inline = max_inline - (max_inline %
-						   RTE_CACHE_LINE_SIZE);
-			DRV_LOG(WARNING,
-				"port %u txq inline is too large (%d) setting"
-				" it to the maximum possible: %d\n",
-				PORT_ID(priv), txq_inline, max_inline);
-			txq_ctrl->txq.max_inline = max_inline /
-						   RTE_CACHE_LINE_SIZE;
-		}
-	}
-	if (tso) {
-		txq_ctrl->max_tso_header = max_tso_inline * RTE_CACHE_LINE_SIZE;
-		txq_ctrl->txq.max_inline = RTE_MAX(txq_ctrl->txq.max_inline,
-						   max_tso_inline);
-		txq_ctrl->txq.tso_en = 1;
-	}
-	txq_ctrl->txq.tunnel_en = config->tunnel_en | config->swp;
-	txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO |
-				 DEV_TX_OFFLOAD_UDP_TNL_TSO |
-				 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) &
-				txq_ctrl->txq.offloads) && config->swp;
+	(void)txq_ctrl;
 }
 
 /**
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v4 2/8] net/mlx5: add Tx datapath related devargs
  2019-07-21 14:24       ` [dpdk-dev] [PATCH v4 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  2019-07-21 14:24         ` [dpdk-dev] [PATCH v4 1/8] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
@ 2019-07-21 14:24         ` Viacheslav Ovsiienko
  2019-07-22  5:32           ` Yongseok Koh
  2019-07-21 14:24         ` [dpdk-dev] [PATCH v4 3/8] net/mlx5: update Tx datapath definitions Viacheslav Ovsiienko
                           ` (6 subsequent siblings)
  8 siblings, 1 reply; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-21 14:24 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch introduces new mlx5 PMD devarg options:

- txq_inline_min - specifies minimal amount of data to be inlined into
  WQE during Tx operations. NICs may require this minimal data amount
  to operate correctly. The exact value may depend on NIC operation mode,
  requested offloads, etc.

- txq_inline_max - specifies the maximal packet length to be completely
  inlined into WQE Ethernet Segment for ordinary SEND method. If packet
  is larger than the specified value, the packet data won't be copied by
  the driver at all, data buffer is addressed with a pointer. If packet
  length is less or equal, all packet data will be copied into WQE.

- txq_inline_mpw - specifies the maximal packet length to be completely
  inlined into WQE for Enhanced MPW method.

Driver documentation is also updated.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 doc/guides/nics/mlx5.rst               | 155 +++++++++++++++++++++++----------
 doc/guides/rel_notes/release_19_08.rst |   2 +
 drivers/net/mlx5/mlx5.c                |  29 +++++-
 drivers/net/mlx5/mlx5.h                |   4 +
 4 files changed, 140 insertions(+), 50 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 5cf1e76..7e87344 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -351,24 +351,102 @@ Run-time configuration
 - ``txq_inline`` parameter [int]
 
   Amount of data to be inlined during TX operations. This parameter is
-  deprecated and ignored, kept for compatibility issue.
+  deprecated and converted to the new parameter ``txq_inline_max`` providing
+  partial compatibility.
 
 - ``txqs_min_inline`` parameter [int]
 
-  Enable inline send only when the number of TX queues is greater or equal
+  Enable inline data send only when the number of TX queues is greater or equal
   to this value.
 
-  This option should be used in combination with ``txq_inline`` above.
-
-  On ConnectX-4, ConnectX-4 LX, ConnectX-5, ConnectX-6 and BlueField without
-  Enhanced MPW:
-
-        - Disabled by default.
-        - In case ``txq_inline`` is set recommendation is 4.
-
-  On ConnectX-5, ConnectX-6 and BlueField with Enhanced MPW:
-
-        - Set to 8 by default.
+  This option should be used in combination with ``txq_inline_max`` and
+  ``txq_inline_mpw`` and does not affect ``txq_inline_min`` (all described below).
+
+  If this option is not specified, the default value 16 is used for BlueField
+  and 8 for other platforms.
+
+  Data inlining consumes CPU cycles, so this option is intended to enable
+  inline data automatically when there are enough Tx queues, which means there
+  are enough CPU cores, PCI bandwidth is becoming more critical, and the CPU
+  is no longer expected to be the bottleneck.
+
+  The copying data into WQE improves latency and can improve PPS performance
+  when PCI back pressure is detected and may be useful for scenarios involving
+  heavy traffic on many queues.
+
+  Because additional software logic is necessary to handle this mode, this
+  option should be used with care, as it may lower performance when back
+  pressure is not expected.
+
+- ``txq_inline_min`` parameter [int]
+
+  Minimal amount of data to be inlined into WQE during Tx operations. NICs
+  may require this minimal data amount to operate correctly. The exact value
+  may depend on NIC operation mode, requested offloads, etc.
+
+  If ``txq_inline_min`` key is present the specified value (may be aligned
+  by the driver in order not to exceed the limits and provide better descriptor
+  space utilization) will be used by the driver and it is guaranteed the
+  requested data bytes are inlined into the WQE beside other inline settings.
+  This key also may update ``txq_inline_max`` value (default or specified
+  explicitly in devargs) to reserve the space for inline data.
+
+  If ``txq_inline_min`` key is not present, the value may be queried by the
+  driver from the NIC via DevX if this feature is available. If there is no DevX
+  enabled/supported the value 18 (supposing L2 header including VLAN) is set
+  for ConnectX-4, value 58 (supposing L2-L4 headers, required by configurations
+  over E-Switch) is set for ConnectX-4 Lx, and 0 is set by default for ConnectX-5
+  and newer NICs. If packet is shorter than the ``txq_inline_min`` value, the
+  entire packet is inlined.
+
+  For ConnectX-4 and ConnectX-4 Lx NICs the driver does not allow setting
+  this value below 18 (minimal L2 header, including VLAN).
+
+  Please note that this minimal data inlining disengages the eMPW feature
+  (Enhanced Multi-Packet Write), because the latter does not support partial
+  packet inlining. This is not very critical because minimal data inlining is
+  mostly required by ConnectX-4 and ConnectX-4 Lx, which do not support eMPW.
+
+- ``txq_inline_max`` parameter [int]
+
+  Specifies the maximal packet length to be completely inlined into WQE
+  Ethernet Segment for ordinary SEND method. If packet is larger than specified
+  value, the packet data won't be copied by the driver at all, data buffer
+  is addressed with a pointer. If packet length is less or equal all packet
+  data will be copied into WQE. This may improve PCI bandwidth utilization for
+  short packets significantly but requires the extra CPU cycles.
+
+  The data inline feature is controlled by number of Tx queues, if number of Tx
+  queues is larger than ``txqs_min_inline`` key parameter, the inline feature
+  is engaged, if there are not enough Tx queues (which means not enough CPU cores
+  and CPU resources are scarce), data inline is not performed by the driver.
+  Assigning ``txqs_min_inline`` with zero always enables the data inline.
+
+  The default ``txq_inline_max`` value is 290. The specified value may be adjusted
+  by the driver in order not to exceed the limit (930 bytes) and to provide better
+  WQE space filling without gaps, the adjustment is reflected in the debug log.
+
+- ``txq_inline_mpw`` parameter [int]
+
+  Specifies the maximal packet length to be completely inlined into WQE for
+  Enhanced MPW method. If packet is larger than the specified value, the packet
+  data won't be copied, and data buffer is addressed with a pointer. If packet
+  length is less or equal, all packet data will be copied into WQE. This may
+  improve PCI bandwidth utilization for short packets significantly but requires
+  extra CPU cycles.
+
+  The data inline feature is controlled by number of TX queues, if number of Tx
+  queues is larger than ``txqs_min_inline`` key parameter, the inline feature
+  is engaged, if there are not enough Tx queues (which means not enough CPU cores
+  and CPU resources are scarce), data inline is not performed by the driver.
+  Assigning ``txqs_min_inline`` with zero always enables the data inline.
+
+  The default ``txq_inline_mpw`` value is 188. The specified value may be adjusted
+  by the driver in order not to exceed the limit (930 bytes) and to provide better
+  WQE space filling without gaps, the adjustment is reflected in the debug log.
+  Because multiple packets may be included in the same WQE with the Enhanced
+  Multi-Packet Write method and the overall WQE size is limited, it is not
+  recommended to specify large values for ``txq_inline_mpw``.
 
 - ``txqs_max_vec`` parameter [int]
 
@@ -376,47 +454,34 @@ Run-time configuration
   equal to this value. This parameter is deprecated and ignored, kept
   for compatibility issue to not prevent driver from probing.
 
-- ``txq_mpw_en`` parameter [int]
-
-  A nonzero value enables multi-packet send (MPS) for ConnectX-4 Lx and
-  enhanced multi-packet send (Enhanced MPS) for ConnectX-5, ConnectX-6 and BlueField.
-  MPS allows the TX burst function to pack up multiple packets in a
-  single descriptor session in order to save PCI bandwidth and improve
-  performance at the cost of a slightly higher CPU usage. When
-  ``txq_inline`` is set along with ``txq_mpw_en``, TX burst function tries
-  to copy entire packet data on to TX descriptor instead of including
-  pointer of packet only if there is enough room remained in the
-  descriptor. ``txq_inline`` sets per-descriptor space for either pointers
-  or inlined packets. In addition, Enhanced MPS supports hybrid mode -
-  mixing inlined packets and pointers in the same descriptor.
-
-  This option cannot be used with certain offloads such as ``DEV_TX_OFFLOAD_TCP_TSO,
-  DEV_TX_OFFLOAD_VXLAN_TNL_TSO, DEV_TX_OFFLOAD_GRE_TNL_TSO, DEV_TX_OFFLOAD_VLAN_INSERT``.
-  When those offloads are requested the MPS send function will not be used.
-
-  It is currently only supported on the ConnectX-4 Lx, ConnectX-5, ConnectX-6 and BlueField
-  families of adapters.
-  On ConnectX-4 Lx the MPW is considered un-secure hence disabled by default.
-  Users which enable the MPW should be aware that application which provides incorrect
-  mbuf descriptors in the Tx burst can lead to serious errors in the host including, on some cases,
-  NIC to get stuck.
-  On ConnectX-5, ConnectX-6 and BlueField the MPW is secure and enabled by default.
-
 - ``txq_mpw_hdr_dseg_en`` parameter [int]
 
   A nonzero value enables including two pointers in the first block of TX
   descriptor. The parameter is deprecated and ignored, kept for compatibility
   issue.
 
-  Effective only when Enhanced MPS is supported. Disabled by default.
-
 - ``txq_max_inline_len`` parameter [int]
 
   Maximum size of packet to be inlined. This limits the size of packet to
   be inlined. If the size of a packet is larger than configured value, the
   packet isn't inlined even though there's enough space remained in the
   descriptor. Instead, the packet is included with pointer. This parameter
-  is deprecated.
+  is deprecated and converted directly to ``txq_inline_mpw`` providing full
+  compatibility. Valid only if eMPW feature is engaged.
+
+- ``txq_mpw_en`` parameter [int]
+
+  A nonzero value enables Enhanced Multi-Packet Write (eMPW) for ConnectX-5,
+  ConnectX-6 and BlueField. eMPW allows the TX burst function to pack up multiple
+  packets in a single descriptor session in order to save PCI bandwidth and improve
+  performance at the cost of a slightly higher CPU usage. When ``txq_inline_mpw``
+  is set along with ``txq_mpw_en``, TX burst function copies entire packet
+  data on to TX descriptor instead of including pointer of packet.
+
+  The Enhanced Multi-Packet Write feature is enabled by default if NIC supports
+  it, and can be disabled by explicitly specifying 0 value for ``txq_mpw_en`` option.
+  Also, if minimal data inlining is requested by non-zero ``txq_inline_min``
+  option or reported by the NIC, the eMPW feature is disengaged.
 
 - ``tx_vec_en`` parameter [int]
 
@@ -424,12 +489,6 @@ Run-time configuration
   NICs if the number of global Tx queues on the port is less than
   ``txqs_max_vec``. The parameter is deprecated and ignored.
 
-  This option cannot be used with certain offloads such as ``DEV_TX_OFFLOAD_TCP_TSO,
-  DEV_TX_OFFLOAD_VXLAN_TNL_TSO, DEV_TX_OFFLOAD_GRE_TNL_TSO, DEV_TX_OFFLOAD_VLAN_INSERT``.
-  When those offloads are requested the MPS send function will not be used.
-
-  Enabled by default on ConnectX-5, ConnectX-6 and BlueField.
-
 - ``rx_vec_en`` parameter [int]
 
   A nonzero value enables Rx vector if the port is not configured in
diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index 1bf9eb8..6c382cb 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -116,6 +116,8 @@ New Features
   * Added support for IP-in-IP tunnel.
   * Accelerate flows with count action creation and destroy.
   * Accelerate flows counter query.
+  * Improved Tx datapath performance with enabled HW offloads.
+
 
 * **Updated Solarflare network PMD.**
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d4f0eb2..bbf2583 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -72,6 +72,15 @@
 /* Device parameter to configure inline send. Deprecated, ignored.*/
 #define MLX5_TXQ_INLINE "txq_inline"
 
+/* Device parameter to limit packet size to inline with ordinary SEND. */
+#define MLX5_TXQ_INLINE_MAX "txq_inline_max"
+
+/* Device parameter to configure minimal data size to inline. */
+#define MLX5_TXQ_INLINE_MIN "txq_inline_min"
+
+/* Device parameter to limit packet size to inline with Enhanced MPW. */
+#define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"
+
 /*
  * Device parameter to configure the number of TX queues threshold for
  * enabling inline send.
@@ -1006,7 +1015,15 @@ struct mlx5_dev_spawn_data {
 	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
 		config->mprq.min_rxqs_num = tmp;
 	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
-		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
+		DRV_LOG(WARNING, "%s: deprecated parameter,"
+				 " converted to txq_inline_max", key);
+		config->txq_inline_max = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
+		config->txq_inline_max = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
+		config->txq_inline_min = tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
+		config->txq_inline_mpw = tmp;
 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
 		config->txqs_inline = tmp;
 	} else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
@@ -1016,7 +1033,9 @@ struct mlx5_dev_spawn_data {
 	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
 		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
 	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
-		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
+		DRV_LOG(WARNING, "%s: deprecated parameter,"
+				 " converted to txq_inline_mpw", key);
+		config->txq_inline_mpw = tmp;
 	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
 		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
 	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
@@ -1064,6 +1083,9 @@ struct mlx5_dev_spawn_data {
 		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
 		MLX5_RXQS_MIN_MPRQ,
 		MLX5_TXQ_INLINE,
+		MLX5_TXQ_INLINE_MIN,
+		MLX5_TXQ_INLINE_MAX,
+		MLX5_TXQ_INLINE_MPW,
 		MLX5_TXQS_MIN_INLINE,
 		MLX5_TXQS_MAX_VEC,
 		MLX5_TXQ_MPW_EN,
@@ -2026,6 +2048,9 @@ struct mlx5_dev_spawn_data {
 		.hw_padding = 0,
 		.mps = MLX5_ARG_UNSET,
 		.rx_vec_en = 1,
+		.txq_inline_max = MLX5_ARG_UNSET,
+		.txq_inline_min = MLX5_ARG_UNSET,
+		.txq_inline_mpw = MLX5_ARG_UNSET,
 		.txqs_inline = MLX5_ARG_UNSET,
 		.vf_nl_en = 1,
 		.mr_ext_memseg_en = 1,
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 354f6bc..86f005d 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -198,6 +198,7 @@ struct mlx5_dev_config {
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
+	unsigned int tx_inline:1; /* Engage TX data inlining. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
 	unsigned int mr_ext_memseg_en:1;
 	/* Whether memseg should be extended for MR creation. */
@@ -223,6 +224,9 @@ struct mlx5_dev_config {
 	unsigned int ind_table_max_size; /* Maximum indirection table size. */
 	unsigned int max_dump_files_num; /* Maximum dump files per queue. */
 	int txqs_inline; /* Queue number threshold for inlining. */
+	int txq_inline_min; /* Minimal amount of data bytes to inline. */
+	int txq_inline_max; /* Max packet size for inlining with SEND. */
+	int txq_inline_mpw; /* Max packet size for inlining with eMPW. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 };
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v4 3/8] net/mlx5: update Tx datapath definitions
  2019-07-21 14:24       ` [dpdk-dev] [PATCH v4 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
  2019-07-21 14:24         ` [dpdk-dev] [PATCH v4 1/8] net/mlx5: remove Tx datapath implementation Viacheslav Ovsiienko
  2019-07-21 14:24         ` [dpdk-dev] [PATCH v4 2/8] net/mlx5: add Tx datapath related devargs Viacheslav Ovsiienko
@ 2019-07-21 14:24         ` Viacheslav Ovsiienko
  2019-07-22  5:33           ` Yongseok Koh
  2019-07-21 14:24         ` [dpdk-dev] [PATCH v4 4/8] net/mlx5: extend NIC attributes query via DevX Viacheslav Ovsiienko
                           ` (5 subsequent siblings)
  8 siblings, 1 reply; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-21 14:24 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch updates Tx datapath definitions, mostly hardware related.
The Tx descriptor structures are redefined with required fields,
size definitions are renamed to reflect the meanings in more
appropriate way. This is a preparation step before introducing
the new Tx datapath implementation.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_defs.h |   2 +-
 drivers/net/mlx5/mlx5_prm.h  | 164 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 152 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 6861304..873a595 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -58,7 +58,7 @@
 #define MLX5_MAX_XSTATS 32
 
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
-#define MLX5_MAX_TSO_HEADER 192
+#define MLX5_MAX_TSO_HEADER (128u + 34u)
 
 /* Threshold of buffer replenishment for vectorized Rx. */
 #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index dfd9317..97abdb2 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -39,14 +39,85 @@
 /* Invalidate a CQE. */
 #define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
 
-/* WQE DWORD size */
-#define MLX5_WQE_DWORD_SIZE 16
-
-/* WQE size */
-#define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
+/* WQE Segment sizes in bytes. */
+#define MLX5_WSEG_SIZE 16u
+#define MLX5_WQE_CSEG_SIZE sizeof(struct mlx5_wqe_cseg)
+#define MLX5_WQE_DSEG_SIZE sizeof(struct mlx5_wqe_dseg)
+#define MLX5_WQE_ESEG_SIZE sizeof(struct mlx5_wqe_eseg)
+
+/* WQE/WQEBB size in bytes. */
+#define MLX5_WQE_SIZE sizeof(struct mlx5_wqe)
+
+/*
+ * Max size of a WQE session.
+ * Absolute maximum size is 63 (MLX5_DSEG_MAX) segments,
+ * the WQE size field in Control Segment is 6 bits wide.
+ */
+#define MLX5_WQE_SIZE_MAX (60 * MLX5_WSEG_SIZE)
+
+/*
+ * Default minimum number of Tx queues for inlining packets.
+ * If there are fewer queues than specified we assume we do
+ * not have enough CPU resources (cycles) to perform inlining,
+ * the PCIe throughput is not considered a bottleneck and
+ * inlining is disabled.
+ */
+#define MLX5_INLINE_MAX_TXQS 8u
+#define MLX5_INLINE_MAX_TXQS_BLUEFIELD 16u
+
+/*
+ * Default packet length threshold to be inlined with
+ * enhanced MPW. If packet length exceeds the threshold
+ * the data are not inlined. Should be aligned to the WQEBB
+ * boundary, accounting for the title Control and Ethernet
+ * segments.
+ */
+#define MLX5_EMPW_DEF_INLINE_LEN (3U * MLX5_WQE_SIZE + \
+				  MLX5_DSEG_MIN_INLINE_SIZE - \
+				  MLX5_WQE_DSEG_SIZE)
+/*
+ * Maximal inline data length sent with enhanced MPW.
+ * Is based on maximal WQE size.
+ */
+#define MLX5_EMPW_MAX_INLINE_LEN (MLX5_WQE_SIZE_MAX - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE + \
+				  MLX5_DSEG_MIN_INLINE_SIZE)
+/*
+ * Minimal amount of packets to be sent with EMPW.
+ * This limits the minimal required size of sent EMPW.
+ * If there are not enough resources to build minimal
+ * EMPW the sending loop exits.
+ */
+#define MLX5_EMPW_MIN_PACKETS (2 + 3 * 4)
+#define MLX5_EMPW_MAX_PACKETS ((MLX5_WQE_SIZE_MAX - \
+				MLX5_WQE_CSEG_SIZE - \
+				MLX5_WQE_ESEG_SIZE) / \
+				MLX5_WSEG_SIZE)
+/*
+ * Default packet length threshold to be inlined with
+ * ordinary SEND. Inlining saves the MR key search
+ * and extra PCIe data fetch transaction, but eats the
+ * CPU cycles.
+ */
+#define MLX5_SEND_DEF_INLINE_LEN (5U * MLX5_WQE_SIZE + \
+				  MLX5_ESEG_MIN_INLINE_SIZE - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE)
+/*
+ * Maximal inline data length sent with ordinary SEND.
+ * Is based on maximal WQE size.
+ */
+#define MLX5_SEND_MAX_INLINE_LEN (MLX5_WQE_SIZE_MAX - \
+				  MLX5_WQE_CSEG_SIZE - \
+				  MLX5_WQE_ESEG_SIZE - \
+				  MLX5_WQE_DSEG_SIZE + \
+				  MLX5_ESEG_MIN_INLINE_SIZE)
 
-#define MLX5_OPC_MOD_ENHANCED_MPSW 0
-#define MLX5_OPCODE_ENHANCED_MPSW 0x29
+/* Missed in mlx5dv.h, should define here. */
+#define MLX5_OPCODE_ENHANCED_MPSW 0x29u
 
 /* CQE value to inform that VLAN is stripped. */
 #define MLX5_CQE_VLAN_STRIPPED (1u << 0)
@@ -114,6 +185,12 @@
 /* Inner L3 type is IPV6. */
 #define MLX5_ETH_WQE_L3_INNER_IPV6 (1u << 0)
 
+/* VLAN insertion flag. */
+#define MLX5_ETH_WQE_VLAN_INSERT (1u << 31)
+
+/* Data inline segment flag. */
+#define MLX5_ETH_WQE_DATA_INLINE (1u << 31)
+
 /* Is flow mark valid. */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 #define MLX5_FLOW_MARK_IS_VALID(val) ((val) & 0xffffff00)
@@ -130,12 +207,21 @@
 /* Default mark value used when none is provided. */
 #define MLX5_FLOW_MARK_DEFAULT 0xffffff
 
-/* Maximum number of DS in WQE. */
+/* Maximum number of DS in WQE. Limited by 6-bit field. */
 #define MLX5_DSEG_MAX 63
 
 /* The completion mode offset in the WQE control segment line 2. */
 #define MLX5_COMP_MODE_OFFSET 2
 
+/* Amount of data bytes in minimal inline data segment. */
+#define MLX5_DSEG_MIN_INLINE_SIZE 12u
+
+/* Amount of data bytes in minimal inline eth segment. */
+#define MLX5_ESEG_MIN_INLINE_SIZE 18u
+
+/* Amount of data bytes after eth data segment. */
+#define MLX5_ESEG_EXTRA_DATA_SIZE 32u
+
 /* Completion mode. */
 enum mlx5_completion_mode {
 	MLX5_COMP_ONLY_ERR = 0x0,
@@ -144,11 +230,6 @@ enum mlx5_completion_mode {
 	MLX5_COMP_CQE_AND_EQE = 0x3,
 };
 
-/* Small common part of the WQE. */
-struct mlx5_wqe {
-	uint32_t ctrl[4];
-};
-
 /* MPW mode. */
 enum mlx5_mpw_mode {
 	MLX5_MPW_DISABLED,
@@ -156,6 +237,63 @@ enum mlx5_mpw_mode {
 	MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2. */
 };
 
+/* WQE Control segment. */
+struct mlx5_wqe_cseg {
+	uint32_t opcode;
+	uint32_t sq_ds;
+	uint32_t flags;
+	uint32_t misc;
+} __rte_packed __rte_aligned(MLX5_WSEG_SIZE);
+
+/* Header of data segment. Minimal size Data Segment */
+struct mlx5_wqe_dseg {
+	uint32_t bcount;
+	union {
+		uint8_t inline_data[MLX5_DSEG_MIN_INLINE_SIZE];
+		struct {
+			uint32_t lkey;
+			uint64_t pbuf;
+		} __rte_packed;
+	};
+} __rte_packed;
+
+/* Subset of struct WQE Ethernet Segment. */
+struct mlx5_wqe_eseg {
+	union {
+		struct {
+			uint32_t swp_offs;
+			uint8_t	cs_flags;
+			uint8_t	swp_flags;
+			uint16_t mss;
+			uint32_t metadata;
+			uint16_t inline_hdr_sz;
+			union {
+				uint16_t inline_data;
+				uint16_t vlan_tag;
+			};
+		} __rte_packed;
+		struct {
+			uint32_t offsets;
+			uint32_t flags;
+			uint32_t flow_metadata;
+			uint32_t inline_hdr;
+		} __rte_packed;
+	};
+} __rte_packed;
+
+/* The title WQEBB, header of WQE. */
+struct mlx5_wqe {
+	union {
+		struct mlx5_wqe_cseg cseg;
+		uint32_t ctrl[4];
+	};
+	struct mlx5_wqe_eseg eseg;
+	union {
+		struct mlx5_wqe_dseg dseg[2];
+		uint8_t data[MLX5_ESEG_EXTRA_DATA_SIZE];
+	};
+} __rte_packed;
+
 /* WQE for Multi-Packet RQ. */
 struct mlx5_wqe_mprq {
 	struct mlx5_wqe_srq_next_seg next_seg;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [dpdk-dev] [PATCH v4 4/8] net/mlx5: extend NIC attributes query via DevX
  2019-07-21 14:24       ` [dpdk-dev] [PATCH v4 0/8] net/mlx5: consolidate Tx datapath Viacheslav Ovsiienko
                           ` (2 preceding siblings ...)
  2019-07-21 14:24         ` [dpdk-dev] [PATCH v4 3/8] net/mlx5: update Tx datapath definitions Viacheslav Ovsiienko
@ 2019-07-21 14:24         ` Viacheslav Ovsiienko
  2019-07-22  5:33           ` Yongseok Koh
  2019-07-21 14:24         ` [dpdk-dev] [PATCH v4 5/8] net/mlx5: add Tx datapath configuration and setup Viacheslav Ovsiienko
                           ` (4 subsequent siblings)
  8 siblings, 1 reply; 50+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-21 14:24 UTC (permalink / raw)
  To: dev; +Cc: yskoh

This patch extends the NIC attributes query via DevX.
The appropriate interface structures are borrowed from
kernel driver headers and DevX calls are added to
mlx5_devx_cmd_query_hca_attr() routine.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.h           |   5 ++