All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
@ 2019-01-18 10:29 Shahed Shaikh
  2019-01-18 10:29 ` [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path Shahed Shaikh
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Shahed Shaikh @ 2019-01-18 10:29 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, stable

Allocating replacement buffer per received packet is expensive.
Instead, process received packets first and allocate
replacement buffers in bulk later.

This improves performance by ~25% in terms of PPS on AMD
platforms.

Fixes: 2ea6f76aff40 ("qede: add core driver")
Cc: stable@dpdk.org

Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
---
 drivers/net/qede/qede_rxtx.c | 97 +++++++++++++++++++++++++++++++++-----------
 drivers/net/qede/qede_rxtx.h |  2 +
 2 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index 0e33be1..684c4ae 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -35,6 +35,52 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
 	return 0;
 }
 
+#define QEDE_MAX_BULK_ALLOC_COUNT 512
+
+static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count)
+{
+	void *obj_p[QEDE_MAX_BULK_ALLOC_COUNT] __rte_cache_aligned;
+	struct rte_mbuf *mbuf = NULL;
+	struct eth_rx_bd *rx_bd;
+	dma_addr_t mapping;
+	int i, ret = 0;
+	uint16_t idx;
+
+	idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+
+	if (count > QEDE_MAX_BULK_ALLOC_COUNT)
+		count = QEDE_MAX_BULK_ALLOC_COUNT;
+
+	ret = rte_mempool_get_bulk(rxq->mb_pool, obj_p, count);
+	if (unlikely(ret)) {
+		PMD_RX_LOG(ERR, rxq,
+			   "Failed to allocate %d rx buffers "
+			    "sw_rx_prod %u sw_rx_cons %u mp entries %u free %u",
+			    count, idx, rxq->sw_rx_cons & NUM_RX_BDS(rxq),
+			    rte_mempool_avail_count(rxq->mb_pool),
+			    rte_mempool_in_use_count(rxq->mb_pool));
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < count; i++) {
+		mbuf = obj_p[i];
+		if (likely(i < count - 1))
+			rte_prefetch0(obj_p[i + 1]);
+
+		idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+		rxq->sw_rx_ring[idx].mbuf = mbuf;
+		rxq->sw_rx_ring[idx].page_offset = 0;
+		mapping = rte_mbuf_data_iova_default(mbuf);
+		rx_bd = (struct eth_rx_bd *)
+			ecore_chain_produce(&rxq->rx_bd_ring);
+		rx_bd->addr.hi = rte_cpu_to_le_32(U64_HI(mapping));
+		rx_bd->addr.lo = rte_cpu_to_le_32(U64_LO(mapping));
+		rxq->sw_rx_prod++;
+	}
+
+	return 0;
+}
+
 /* Criterias for calculating Rx buffer size -
  * 1) rx_buf_size should not exceed the size of mbuf
  * 2) In scattered_rx mode - minimum rx_buf_size should be
@@ -1131,7 +1177,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
 		struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)
 {
 	struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring);
-	uint16_t idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
+	uint16_t idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
 	struct qede_rx_entry *curr_prod;
 	dma_addr_t new_mapping;
 
@@ -1364,7 +1410,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint8_t bitfield_val;
 #endif
 	uint8_t tunn_parse_flag;
-	uint8_t j;
 	struct eth_fast_path_rx_tpa_start_cqe *cqe_start_tpa;
 	uint64_t ol_flags;
 	uint32_t packet_type;
@@ -1373,6 +1418,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint8_t offset, tpa_agg_idx, flags;
 	struct qede_agg_info *tpa_info = NULL;
 	uint32_t rss_hash;
+	int rx_alloc_count = 0;
 
 	hw_comp_cons = rte_le_to_cpu_16(*rxq->hw_cons_ptr);
 	sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring);
@@ -1382,6 +1428,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (hw_comp_cons == sw_comp_cons)
 		return 0;
 
+	/* Allocate buffers that we used in previous loop */
+	if (rxq->rx_alloc_count) {
+		if (unlikely(qede_alloc_rx_bulk_mbufs(rxq,
+			     rxq->rx_alloc_count))) {
+			struct rte_eth_dev *dev;
+
+			PMD_RX_LOG(ERR, rxq,
+				   "New buffer allocation failed,"
+				   "dropping incoming packet\n");
+			dev = &rte_eth_devices[rxq->port_id];
+			dev->data->rx_mbuf_alloc_failed +=
+							rxq->rx_alloc_count;
+			rxq->rx_alloc_errors += rxq->rx_alloc_count;
+			return 0;
+		}
+		qede_update_rx_prod(qdev, rxq);
+		rxq->rx_alloc_count = 0;
+	}
+
 	while (sw_comp_cons != hw_comp_cons) {
 		ol_flags = 0;
 		packet_type = RTE_PTYPE_UNKNOWN;
@@ -1553,16 +1618,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			rx_mb->hash.rss = rss_hash;
 		}
 
-		if (unlikely(qede_alloc_rx_buffer(rxq) != 0)) {
-			PMD_RX_LOG(ERR, rxq,
-				   "New buffer allocation failed,"
-				   "dropping incoming packet\n");
-			qede_recycle_rx_bd_ring(rxq, qdev, fp_cqe->bd_num);
-			rte_eth_devices[rxq->port_id].
-			    data->rx_mbuf_alloc_failed++;
-			rxq->rx_alloc_errors++;
-			break;
-		}
+		rx_alloc_count++;
 		qede_rx_bd_ring_consume(rxq);
 
 		if (!tpa_start_flg && fp_cqe->bd_num > 1) {
@@ -1574,17 +1630,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			if (qede_process_sg_pkts(p_rxq, seg1, num_segs,
 						 pkt_len - len))
 				goto next_cqe;
-			for (j = 0; j < num_segs; j++) {
-				if (qede_alloc_rx_buffer(rxq)) {
-					PMD_RX_LOG(ERR, rxq,
-						"Buffer allocation failed");
-					rte_eth_devices[rxq->port_id].
-						data->rx_mbuf_alloc_failed++;
-					rxq->rx_alloc_errors++;
-					break;
-				}
-				rxq->rx_segs++;
-			}
+
+			rx_alloc_count += num_segs;
+			rxq->rx_segs += num_segs;
 		}
 		rxq->rx_segs++; /* for the first segment */
 
@@ -1626,7 +1674,8 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		}
 	}
 
-	qede_update_rx_prod(qdev, rxq);
+	/* Request number of buffers to be allocated in next loop */
+	rxq->rx_alloc_count = rx_alloc_count;
 
 	rxq->rcv_pkts += rx_pkt;
 
diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h
index 454daa0..5b249cb 100644
--- a/drivers/net/qede/qede_rxtx.h
+++ b/drivers/net/qede/qede_rxtx.h
@@ -192,6 +192,8 @@ struct qede_rx_queue {
 	uint16_t queue_id;
 	uint16_t port_id;
 	uint16_t rx_buf_size;
+	uint16_t rx_alloc_count;
+	uint16_t unused;
 	uint64_t rcv_pkts;
 	uint64_t rx_segs;
 	uint64_t rx_hw_errors;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path
  2019-01-18 10:29 [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Shahed Shaikh
@ 2019-01-18 10:29 ` Shahed Shaikh
  2019-01-18 16:58   ` Rasesh Mody
  2019-01-18 14:41 ` [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Ferruh Yigit
  2019-01-18 16:57 ` Rasesh Mody
  2 siblings, 1 reply; 9+ messages in thread
From: Shahed Shaikh @ 2019-01-18 10:29 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, stable

Incorrect placement of prefetch in Tx path is causing
a performance drop of around ~2% on AMD platform.

Fixes: 2ea6f76aff40 ("qede: add core driver")
Cc: stable@dpdk.org

Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
---
 drivers/net/qede/qede_rxtx.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index 684c4ae..eda19b2 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -2181,7 +2181,6 @@ qede_xmit_pkts(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
 		txq->nb_tx_avail -= bd1->data.nbds;
 		txq->sw_tx_prod++;
-		rte_prefetch0(txq->sw_tx_ring[TX_PROD(txq)].mbuf);
 		bd_prod =
 		    rte_cpu_to_le_16(ecore_chain_get_prod_idx(&txq->tx_pbl));
 #ifdef RTE_LIBRTE_QEDE_DEBUG_TX
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
  2019-01-18 10:29 [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Shahed Shaikh
  2019-01-18 10:29 ` [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path Shahed Shaikh
@ 2019-01-18 14:41 ` Ferruh Yigit
  2019-01-18 14:41   ` Ferruh Yigit
  2019-01-18 16:57 ` Rasesh Mody
  2 siblings, 1 reply; 9+ messages in thread
From: Ferruh Yigit @ 2019-01-18 14:41 UTC (permalink / raw)
  To: Shahed Shaikh, dev; +Cc: stable, Rasesh Mody

On 1/18/2019 10:29 AM, Shahed Shaikh wrote:
> Allocating replacement buffer per received packet is expensive.
> Instead, process received packets first and allocate
> replacement buffers in bulk later.
> 
> This improves performance by ~25% in terms of PPS on AMD
> platforms.
> 
> Fixes: 2ea6f76aff40 ("qede: add core driver")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>

Hi Shahed,

This patch was sent on the same day as RC3, very close to the actual release,
and it updates the data path of the driver, so it is not a trivial patch.

Although I tend to accept driver patches late in the process, this one was too
late: you won't have proper time to fix any errors that are detected. On the
other hand, I can see it has a significant performance effect.

I would like to explicitly ask if you are willing to take the risk. If the
answer is yes, please also be sure to get Rasesh's ack.

Thanks,
ferruh

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
  2019-01-18 14:41 ` [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Ferruh Yigit
@ 2019-01-18 14:41   ` Ferruh Yigit
  0 siblings, 0 replies; 9+ messages in thread
From: Ferruh Yigit @ 2019-01-18 14:41 UTC (permalink / raw)
  To: Shahed Shaikh, dev; +Cc: stable, Rasesh Mody, Thomas Monjalon

On 1/18/2019 2:41 PM, Ferruh Yigit wrote:
> On 1/18/2019 10:29 AM, Shahed Shaikh wrote:
>> Allocating replacement buffer per received packet is expensive.
>> Instead, process received packets first and allocate
>> replacement buffers in bulk later.
>>
>> This improves performance by ~25% in terms of PPS on AMD
>> platforms.
>>
>> Fixes: 2ea6f76aff40 ("qede: add core driver")
>> Cc: stable@dpdk.org
>>
>> Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
> 
> Hi Shahed,
> 
> This patch has been sent same day of the RC3, very close to the actual release,
> and updating data path of the driver, so not a trivial patch.
> 
> Although I tend to accept driver patches late in process this one was too late,
> you won't have proper time to fix if any errors detected, but also I can see it
> has a good amount performance effect.
> 
> I would like to explicitly ask if you are willing to take the risk. If answer is
> yes, please also be sure to get Rasesh's ack.

cc'ed Thomas.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
  2019-01-18 10:29 [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Shahed Shaikh
  2019-01-18 10:29 ` [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path Shahed Shaikh
  2019-01-18 14:41 ` [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Ferruh Yigit
@ 2019-01-18 16:57 ` Rasesh Mody
  2019-01-18 23:39   ` Thomas Monjalon
  2 siblings, 1 reply; 9+ messages in thread
From: Rasesh Mody @ 2019-01-18 16:57 UTC (permalink / raw)
  To: Shahed Shaikh, dev; +Cc: ferruh.yigit, stable, thomas

>From: dev <dev-bounces@dpdk.org> On Behalf Of Shahed Shaikh
>Sent: Friday, January 18, 2019 2:29 AM
>
>Allocating replacement buffer per received packet is expensive.
>Instead, process received packets first and allocate replacement buffers in
>bulk later.
>
>This improves performance by ~25% in terms of PPS on AMD platforms.
>
>Fixes: 2ea6f76aff40 ("qede: add core driver")
>Cc: stable@dpdk.org
>
>Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
>---

Acked-by: Rasesh Mody <rmody@marvell.com> 

> drivers/net/qede/qede_rxtx.c | 97
>+++++++++++++++++++++++++++++++++-----------
> drivers/net/qede/qede_rxtx.h |  2 +
> 2 files changed, 75 insertions(+), 24 deletions(-)
>
>diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
>index 0e33be1..684c4ae 100644
>--- a/drivers/net/qede/qede_rxtx.c
>+++ b/drivers/net/qede/qede_rxtx.c
>@@ -35,6 +35,52 @@ static inline int qede_alloc_rx_buffer(struct
>qede_rx_queue *rxq)
>        return 0;
> }
>
>+#define QEDE_MAX_BULK_ALLOC_COUNT 512
>+
>+static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq,
>+int count) {
>+       void *obj_p[QEDE_MAX_BULK_ALLOC_COUNT] __rte_cache_aligned;
>+       struct rte_mbuf *mbuf = NULL;
>+       struct eth_rx_bd *rx_bd;
>+       dma_addr_t mapping;
>+       int i, ret = 0;
>+       uint16_t idx;
>+
>+       idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
>+
>+       if (count > QEDE_MAX_BULK_ALLOC_COUNT)
>+               count = QEDE_MAX_BULK_ALLOC_COUNT;
>+
>+       ret = rte_mempool_get_bulk(rxq->mb_pool, obj_p, count);
>+       if (unlikely(ret)) {
>+               PMD_RX_LOG(ERR, rxq,
>+                          "Failed to allocate %d rx buffers "
>+                           "sw_rx_prod %u sw_rx_cons %u mp entries %u free %u",
>+                           count, idx, rxq->sw_rx_cons & NUM_RX_BDS(rxq),
>+                           rte_mempool_avail_count(rxq->mb_pool),
>+                           rte_mempool_in_use_count(rxq->mb_pool));
>+               return -ENOMEM;
>+       }
>+
>+       for (i = 0; i < count; i++) {
>+               mbuf = obj_p[i];
>+               if (likely(i < count - 1))
>+                       rte_prefetch0(obj_p[i + 1]);
>+
>+               idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
>+               rxq->sw_rx_ring[idx].mbuf = mbuf;
>+               rxq->sw_rx_ring[idx].page_offset = 0;
>+               mapping = rte_mbuf_data_iova_default(mbuf);
>+               rx_bd = (struct eth_rx_bd *)
>+                       ecore_chain_produce(&rxq->rx_bd_ring);
>+               rx_bd->addr.hi = rte_cpu_to_le_32(U64_HI(mapping));
>+               rx_bd->addr.lo = rte_cpu_to_le_32(U64_LO(mapping));
>+               rxq->sw_rx_prod++;
>+       }
>+
>+       return 0;
>+}
>+
> /* Criterias for calculating Rx buffer size -
>  * 1) rx_buf_size should not exceed the size of mbuf
>  * 2) In scattered_rx mode - minimum rx_buf_size should be @@ -1131,7
>+1177,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
>                struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)  {
>        struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq-
>>rx_bd_ring);
>-       uint16_t idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
>+       uint16_t idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
>        struct qede_rx_entry *curr_prod;
>        dma_addr_t new_mapping;
>
>@@ -1364,7 +1410,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
>**rx_pkts, uint16_t nb_pkts)
>        uint8_t bitfield_val;
> #endif
>        uint8_t tunn_parse_flag;
>-       uint8_t j;
>        struct eth_fast_path_rx_tpa_start_cqe *cqe_start_tpa;
>        uint64_t ol_flags;
>        uint32_t packet_type;
>@@ -1373,6 +1418,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
>**rx_pkts, uint16_t nb_pkts)
>        uint8_t offset, tpa_agg_idx, flags;
>        struct qede_agg_info *tpa_info = NULL;
>        uint32_t rss_hash;
>+       int rx_alloc_count = 0;
>
>        hw_comp_cons = rte_le_to_cpu_16(*rxq->hw_cons_ptr);
>        sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring);
>@@ -1382,6 +1428,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
>**rx_pkts, uint16_t nb_pkts)
>        if (hw_comp_cons == sw_comp_cons)
>                return 0;
>
>+       /* Allocate buffers that we used in previous loop */
>+       if (rxq->rx_alloc_count) {
>+               if (unlikely(qede_alloc_rx_bulk_mbufs(rxq,
>+                            rxq->rx_alloc_count))) {
>+                       struct rte_eth_dev *dev;
>+
>+                       PMD_RX_LOG(ERR, rxq,
>+                                  "New buffer allocation failed,"
>+                                  "dropping incoming packetn");
>+                       dev = &rte_eth_devices[rxq->port_id];
>+                       dev->data->rx_mbuf_alloc_failed +=
>+                                                       rxq->rx_alloc_count;
>+                       rxq->rx_alloc_errors += rxq->rx_alloc_count;
>+                       return 0;
>+               }
>+               qede_update_rx_prod(qdev, rxq);
>+               rxq->rx_alloc_count = 0;
>+       }
>+
>        while (sw_comp_cons != hw_comp_cons) {
>                ol_flags = 0;
>                packet_type = RTE_PTYPE_UNKNOWN; @@ -1553,16 +1618,7 @@
>qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
>                        rx_mb->hash.rss = rss_hash;
>                }
>
>-               if (unlikely(qede_alloc_rx_buffer(rxq) != 0)) {
>-                       PMD_RX_LOG(ERR, rxq,
>-                                  "New buffer allocation failed,"
>-                                  "dropping incoming packet\n");
>-                       qede_recycle_rx_bd_ring(rxq, qdev, fp_cqe->bd_num);
>-                       rte_eth_devices[rxq->port_id].
>-                           data->rx_mbuf_alloc_failed++;
>-                       rxq->rx_alloc_errors++;
>-                       break;
>-               }
>+               rx_alloc_count++;
>                qede_rx_bd_ring_consume(rxq);
>
>                if (!tpa_start_flg && fp_cqe->bd_num > 1) { @@ -1574,17 +1630,9
>@@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t
>nb_pkts)
>                        if (qede_process_sg_pkts(p_rxq, seg1, num_segs,
>                                                 pkt_len - len))
>                                goto next_cqe;
>-                       for (j = 0; j < num_segs; j++) {
>-                               if (qede_alloc_rx_buffer(rxq)) {
>-                                       PMD_RX_LOG(ERR, rxq,
>-                                               "Buffer allocation failed");
>-                                       rte_eth_devices[rxq->port_id].
>-                                               data->rx_mbuf_alloc_failed++;
>-                                       rxq->rx_alloc_errors++;
>-                                       break;
>-                               }
>-                               rxq->rx_segs++;
>-                       }
>+
>+                       rx_alloc_count += num_segs;
>+                       rxq->rx_segs += num_segs;
>                }
>                rxq->rx_segs++; /* for the first segment */
>
>@@ -1626,7 +1674,8 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
>**rx_pkts, uint16_t nb_pkts)
>                }
>        }
>
>-       qede_update_rx_prod(qdev, rxq);
>+       /* Request number of bufferes to be allocated in next loop */
>+       rxq->rx_alloc_count = rx_alloc_count;
>
>        rxq->rcv_pkts += rx_pkt;
>
>diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h
>index 454daa0..5b249cb 100644
>--- a/drivers/net/qede/qede_rxtx.h
>+++ b/drivers/net/qede/qede_rxtx.h
>@@ -192,6 +192,8 @@ struct qede_rx_queue {
>        uint16_t queue_id;
>        uint16_t port_id;
>        uint16_t rx_buf_size;
>+       uint16_t rx_alloc_count;
>+       uint16_t unused;
>        uint64_t rcv_pkts;
>        uint64_t rx_segs;
>        uint64_t rx_hw_errors;
>--
>2.7.4

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path
  2019-01-18 10:29 ` [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path Shahed Shaikh
@ 2019-01-18 16:58   ` Rasesh Mody
  0 siblings, 0 replies; 9+ messages in thread
From: Rasesh Mody @ 2019-01-18 16:58 UTC (permalink / raw)
  To: Shahed Shaikh, dev; +Cc: ferruh.yigit, stable, thomas

>From: dev <dev-bounces@dpdk.org> On Behalf Of Shahed Shaikh
>Sent: Friday, January 18, 2019 2:30 AM
>
>Incorrect placement of prefetch in Tx path is causing a performance drop of
>around ~2% on AMD platform.
>
>Fixes: 2ea6f76aff40 ("qede: add core driver")
>Cc: stable@dpdk.org
>
>Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
>---

Acked-by: Rasesh Mody <rmody@marvell.com>

> drivers/net/qede/qede_rxtx.c | 1 -
> 1 file changed, 1 deletion(-)
>
>diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
>index 684c4ae..eda19b2 100644
>--- a/drivers/net/qede/qede_rxtx.c
>+++ b/drivers/net/qede/qede_rxtx.c
>@@ -2181,7 +2181,6 @@ qede_xmit_pkts(void *p_txq, struct rte_mbuf
>**tx_pkts, uint16_t nb_pkts)
>
>                txq->nb_tx_avail -= bd1->data.nbds;
>                txq->sw_tx_prod++;
>-               rte_prefetch0(txq->sw_tx_ring[TX_PROD(txq)].mbuf);
>                bd_prod =
>                    rte_cpu_to_le_16(ecore_chain_get_prod_idx(&txq->tx_pbl));
> #ifdef RTE_LIBRTE_QEDE_DEBUG_TX
>--
>2.7.4

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
  2019-01-18 16:57 ` Rasesh Mody
@ 2019-01-18 23:39   ` Thomas Monjalon
  0 siblings, 0 replies; 9+ messages in thread
From: Thomas Monjalon @ 2019-01-18 23:39 UTC (permalink / raw)
  To: Rasesh Mody, Shahed Shaikh; +Cc: dev, ferruh.yigit, stable

18/01/2019 17:57, Rasesh Mody:
> >From: dev <dev-bounces@dpdk.org> On Behalf Of Shahed Shaikh
> >Sent: Friday, January 18, 2019 2:29 AM
> >
> >Allocating replacement buffer per received packet is expensive.
> >Instead, process received packets first and allocate replacement buffers in
> >bulk later.
> >
> >This improves performance by ~25% in terms of PPS on AMD platforms.
> >
> >Fixes: 2ea6f76aff40 ("qede: add core driver")
> >Cc: stable@dpdk.org
> >
> >Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>

Applied, thanks

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
  2019-01-18 16:13 Shahed Shaikh
@ 2019-01-18 17:15 ` Rasesh Mody
  0 siblings, 0 replies; 9+ messages in thread
From: Rasesh Mody @ 2019-01-18 17:15 UTC (permalink / raw)
  To: Shahed Shaikh, Ferruh Yigit, dev; +Cc: stable, Thomas Monjalon

>From: Shahed Shaikh
>Sent: Friday, January 18, 2019 8:13 AM
>
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@intel.com>
>> Sent: Friday, January 18, 2019 8:11 PM
>> To: Shahed Shaikh <shshaikh@marvell.com>; dev@dpdk.org
>> Cc: stable@dpdk.org; Rasesh Mody <rmody@marvell.com>; Thomas
>Monjalon
>> <thomas@monjalon.net>
>> Subject: [EXT] Re: [dpdk-dev] [PATCH 1/2] net/qede: fix performance
>> bottleneck in Rx path
>>
>>
>> ----------------------------------------------------------------------
>> On 1/18/2019 2:41 PM, Ferruh Yigit wrote:
>> > On 1/18/2019 10:29 AM, Shahed Shaikh wrote:
>> >> Allocating replacement buffer per received packet is expensive.
>> >> Instead, process received packets first and allocate replacement
>> >> buffers in bulk later.
>> >>
>> >> This improves performance by ~25% in terms of PPS on AMD platforms.
>> >>
>> >> Fixes: 2ea6f76aff40 ("qede: add core driver")
>> >> Cc: stable@dpdk.org
>> >>
>> >> Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
>> >
>> > Hi Shahed,
>> >
>> > This patch has been sent same day of the RC3, very close to the
>> > actual release, and updating data path of the driver, so not a trivial patch.
>> >
>> > Although I tend to accept driver patches late in process this one
>> > was too late, you won't have proper time to fix if any errors
>> > detected, but also I can see it has a good amount performance effect.
>> >
>> > I would like to explicitly ask if you are willing to take the risk.
>> > If answer is yes, please also be sure to get Rasesh's ack.
>
>Hi Ferruh,
>
>We have done thorough regression testing with this series and would like to
>get this included in 19.02.
>Please consider this series.
>
>Rasesh,
>Please ack this series.

Hi Ferruh, Thomas,

Both the patches in the series are acked and well tested against 19.02 tree.

Thanks!
-Rasesh
>
>Thanks,
>Shahed
>>
>> cc'ed Thomas.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
@ 2019-01-18 16:13 Shahed Shaikh
  2019-01-18 17:15 ` Rasesh Mody
  0 siblings, 1 reply; 9+ messages in thread
From: Shahed Shaikh @ 2019-01-18 16:13 UTC (permalink / raw)
  To: Ferruh Yigit, dev; +Cc: stable, Rasesh Mody, Thomas Monjalon



> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Friday, January 18, 2019 8:11 PM
> To: Shahed Shaikh <shshaikh@marvell.com>; dev@dpdk.org
> Cc: stable@dpdk.org; Rasesh Mody <rmody@marvell.com>; Thomas Monjalon
> <thomas@monjalon.net>
> Subject: [EXT] Re: [dpdk-dev] [PATCH 1/2] net/qede: fix performance bottleneck
> in Rx path
> 
> 
> ----------------------------------------------------------------------
> On 1/18/2019 2:41 PM, Ferruh Yigit wrote:
> > On 1/18/2019 10:29 AM, Shahed Shaikh wrote:
> >> Allocating replacement buffer per received packet is expensive.
> >> Instead, process received packets first and allocate
> >> replacement buffers in bulk later.
> >>
> >> This improves performance by ~25% in terms of PPS on AMD
> >> platforms.
> >>
> >> Fixes: 2ea6f76aff40 ("qede: add core driver")
> >> Cc: stable@dpdk.org
> >>
> >> Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
> >
> > Hi Shahed,
> >
> > This patch has been sent same day of the RC3, very close to the actual release,
> > and updating data path of the driver, so not a trivial patch.
> >
> > Although I tend to accept driver patches late in process this one was too late,
> > you won't have proper time to fix if any errors detected, but also I can see it
> > has a good amount performance effect.
> >
> > I would like to explicitly ask if you are willing to take the risk. If answer is
> > yes, please also be sure to get Rasesh's ack.

Hi Ferruh,

We have done thorough regression testing with this series and would like to get this included in 19.02.
Please consider this series.

Rasesh,
Please ack this series.

Thanks,
Shahed
> 
> cc'ed Thomas.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-01-18 23:39 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-18 10:29 [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Shahed Shaikh
2019-01-18 10:29 ` [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path Shahed Shaikh
2019-01-18 16:58   ` Rasesh Mody
2019-01-18 14:41 ` [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Ferruh Yigit
2019-01-18 14:41   ` Ferruh Yigit
2019-01-18 16:57 ` Rasesh Mody
2019-01-18 23:39   ` Thomas Monjalon
2019-01-18 16:13 Shahed Shaikh
2019-01-18 17:15 ` Rasesh Mody

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.