* [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
@ 2019-01-18 10:29 Shahed Shaikh
From: Shahed Shaikh @ 2019-01-18 10:29 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, stable

Allocating a replacement buffer per received packet is expensive.
Instead, process received packets first and allocate
replacement buffers in bulk later.

This improves performance by ~25% in terms of PPS on AMD
platforms.
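
For readers outside the qede code base, the change boils down to replacing
one mempool allocation per received packet with a single
rte_mempool_get_bulk() call per burst. A minimal sketch of that pattern,
assuming a generic power-of-two software ring (refill_ring_bulk, ring, prod
and mask are illustrative names, not qede symbols):

    #include <errno.h>
    #include <stdint.h>
    #include <rte_mbuf.h>
    #include <rte_mempool.h>

    #define REFILL_MAX 512

    /* Hypothetical refill helper: one rte_mempool_get_bulk() call replaces
     * 'count' individual allocations. Returns 0 on success, or -ENOMEM if
     * the pool cannot supply the whole burst, in which case nothing is
     * consumed and the caller can simply retry on the next poll.
     */
    static int
    refill_ring_bulk(struct rte_mempool *mp, struct rte_mbuf **ring,
                     uint16_t *prod, uint16_t mask, unsigned int count)
    {
            void *objs[REFILL_MAX];
            unsigned int i;

            if (count > REFILL_MAX)
                    count = REFILL_MAX;

            /* One burst-sized transaction against the mempool cache. */
            if (rte_mempool_get_bulk(mp, objs, count) != 0)
                    return -ENOMEM;

            for (i = 0; i < count; i++)
                    ring[(*prod)++ & mask] = (struct rte_mbuf *)objs[i];

            return 0;
    }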

Fixes: 2ea6f76aff40 ("qede: add core driver")
Cc: stable@dpdk.org

Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
---
 drivers/net/qede/qede_rxtx.c | 97 +++++++++++++++++++++++++++++++++-----------
 drivers/net/qede/qede_rxtx.h |  2 +
 2 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index 0e33be1..684c4ae 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -35,6 +35,52 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
 	return 0;
 }
 
+#define QEDE_MAX_BULK_ALLOC_COUNT 512
+
+static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count)
+{
+	void *obj_p[QEDE_MAX_BULK_ALLOC_COUNT] __rte_cache_aligned;
+	struct rte_mbuf *mbuf = NULL;
+	struct eth_rx_bd *rx_bd;
+	dma_addr_t mapping;
+	int i, ret = 0;
+	uint16_t idx;
+
+	idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+
+	if (count > QEDE_MAX_BULK_ALLOC_COUNT)
+		count = QEDE_MAX_BULK_ALLOC_COUNT;
+
+	ret = rte_mempool_get_bulk(rxq->mb_pool, obj_p, count);
+	if (unlikely(ret)) {
+		PMD_RX_LOG(ERR, rxq,
+			   "Failed to allocate %d rx buffers "
+			    "sw_rx_prod %u sw_rx_cons %u mp entries %u free %u",
+			    count, idx, rxq->sw_rx_cons & NUM_RX_BDS(rxq),
+			    rte_mempool_avail_count(rxq->mb_pool),
+			    rte_mempool_in_use_count(rxq->mb_pool));
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < count; i++) {
+		mbuf = obj_p[i];
+		if (likely(i < count - 1))
+			rte_prefetch0(obj_p[i + 1]);
+
+		idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+		rxq->sw_rx_ring[idx].mbuf = mbuf;
+		rxq->sw_rx_ring[idx].page_offset = 0;
+		mapping = rte_mbuf_data_iova_default(mbuf);
+		rx_bd = (struct eth_rx_bd *)
+			ecore_chain_produce(&rxq->rx_bd_ring);
+		rx_bd->addr.hi = rte_cpu_to_le_32(U64_HI(mapping));
+		rx_bd->addr.lo = rte_cpu_to_le_32(U64_LO(mapping));
+		rxq->sw_rx_prod++;
+	}
+
+	return 0;
+}
+
 /* Criteria for calculating Rx buffer size -
  * 1) rx_buf_size should not exceed the size of mbuf
  * 2) In scattered_rx mode - minimum rx_buf_size should be
@@ -1131,7 +1177,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
 		struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)
 {
 	struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring);
-	uint16_t idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
+	uint16_t idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
 	struct qede_rx_entry *curr_prod;
 	dma_addr_t new_mapping;
 
@@ -1364,7 +1410,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint8_t bitfield_val;
 #endif
 	uint8_t tunn_parse_flag;
-	uint8_t j;
 	struct eth_fast_path_rx_tpa_start_cqe *cqe_start_tpa;
 	uint64_t ol_flags;
 	uint32_t packet_type;
@@ -1373,6 +1418,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint8_t offset, tpa_agg_idx, flags;
 	struct qede_agg_info *tpa_info = NULL;
 	uint32_t rss_hash;
+	int rx_alloc_count = 0;
 
 	hw_comp_cons = rte_le_to_cpu_16(*rxq->hw_cons_ptr);
 	sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring);
@@ -1382,6 +1428,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (hw_comp_cons == sw_comp_cons)
 		return 0;
 
+	/* Allocate replacement buffers for those used in the previous loop */
+	if (rxq->rx_alloc_count) {
+		if (unlikely(qede_alloc_rx_bulk_mbufs(rxq,
+			     rxq->rx_alloc_count))) {
+			struct rte_eth_dev *dev;
+
+			PMD_RX_LOG(ERR, rxq,
+				   "New buffer allocation failed,"
+				   "dropping incoming packet\n");
+			dev = &rte_eth_devices[rxq->port_id];
+			dev->data->rx_mbuf_alloc_failed +=
+							rxq->rx_alloc_count;
+			rxq->rx_alloc_errors += rxq->rx_alloc_count;
+			return 0;
+		}
+		qede_update_rx_prod(qdev, rxq);
+		rxq->rx_alloc_count = 0;
+	}
+
 	while (sw_comp_cons != hw_comp_cons) {
 		ol_flags = 0;
 		packet_type = RTE_PTYPE_UNKNOWN;
@@ -1553,16 +1618,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			rx_mb->hash.rss = rss_hash;
 		}
 
-		if (unlikely(qede_alloc_rx_buffer(rxq) != 0)) {
-			PMD_RX_LOG(ERR, rxq,
-				   "New buffer allocation failed,"
-				   "dropping incoming packet\n");
-			qede_recycle_rx_bd_ring(rxq, qdev, fp_cqe->bd_num);
-			rte_eth_devices[rxq->port_id].
-			    data->rx_mbuf_alloc_failed++;
-			rxq->rx_alloc_errors++;
-			break;
-		}
+		rx_alloc_count++;
 		qede_rx_bd_ring_consume(rxq);
 
 		if (!tpa_start_flg && fp_cqe->bd_num > 1) {
@@ -1574,17 +1630,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			if (qede_process_sg_pkts(p_rxq, seg1, num_segs,
 						 pkt_len - len))
 				goto next_cqe;
-			for (j = 0; j < num_segs; j++) {
-				if (qede_alloc_rx_buffer(rxq)) {
-					PMD_RX_LOG(ERR, rxq,
-						"Buffer allocation failed");
-					rte_eth_devices[rxq->port_id].
-						data->rx_mbuf_alloc_failed++;
-					rxq->rx_alloc_errors++;
-					break;
-				}
-				rxq->rx_segs++;
-			}
+
+			rx_alloc_count += num_segs;
+			rxq->rx_segs += num_segs;
 		}
 		rxq->rx_segs++; /* for the first segment */
 
@@ -1626,7 +1674,8 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		}
 	}
 
-	qede_update_rx_prod(qdev, rxq);
+	/* Request the number of buffers to be allocated in the next loop */
+	rxq->rx_alloc_count = rx_alloc_count;
 
 	rxq->rcv_pkts += rx_pkt;
 
diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h
index 454daa0..5b249cb 100644
--- a/drivers/net/qede/qede_rxtx.h
+++ b/drivers/net/qede/qede_rxtx.h
@@ -192,6 +192,8 @@ struct qede_rx_queue {
 	uint16_t queue_id;
 	uint16_t port_id;
 	uint16_t rx_buf_size;
+	uint16_t rx_alloc_count;
+	uint16_t unused;
 	uint64_t rcv_pkts;
 	uint64_t rx_segs;
 	uint64_t rx_hw_errors;
-- 
2.7.4

* [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path
From: Shahed Shaikh @ 2019-01-18 10:29 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, stable

Incorrect placement of a prefetch in the Tx path is causing
a performance drop of ~2% on AMD platforms.
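
The rule of thumb behind the fix: only prefetch an address that the code
will dereference shortly afterwards. The removed line prefetched the
sw_tx_ring slot at the new producer index, whose mbuf pointer is overwritten
at the start of the next iteration anyway, so the fetched cache line was
never useful. A hedged sketch of a prefetch that does pay off, in a generic
PMD-style burst loop (process_tx_burst is an illustrative name, not the
qede Tx function):

    #include <stdint.h>
    #include <rte_mbuf.h>
    #include <rte_prefetch.h>

    /* Illustrative burst loop: prefetch the *next* packet's header while
     * the current one is being processed -- that cache line is read on the
     * following iteration, which gives the prefetch time to complete.
     */
    static inline void
    process_tx_burst(struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
    {
            uint16_t i;

            for (i = 0; i < nb_pkts; i++) {
                    if (i + 1 < nb_pkts)
                            rte_prefetch0(rte_pktmbuf_mtod(tx_pkts[i + 1],
                                                           void *));
                    /* ... build Tx descriptors from tx_pkts[i] ... */
            }
    }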

Fixes: 2ea6f76aff40 ("qede: add core driver")
Cc: stable@dpdk.org

Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
---
 drivers/net/qede/qede_rxtx.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index 684c4ae..eda19b2 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -2181,7 +2181,6 @@ qede_xmit_pkts(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
 		txq->nb_tx_avail -= bd1->data.nbds;
 		txq->sw_tx_prod++;
-		rte_prefetch0(txq->sw_tx_ring[TX_PROD(txq)].mbuf);
 		bd_prod =
 		    rte_cpu_to_le_16(ecore_chain_get_prod_idx(&txq->tx_pbl));
 #ifdef RTE_LIBRTE_QEDE_DEBUG_TX
-- 
2.7.4

* Re: [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
From: Ferruh Yigit @ 2019-01-18 14:41 UTC (permalink / raw)
  To: Shahed Shaikh, dev; +Cc: stable, Rasesh Mody

On 1/18/2019 10:29 AM, Shahed Shaikh wrote:
> Allocating a replacement buffer per received packet is expensive.
> Instead, process received packets first and allocate
> replacement buffers in bulk later.
> 
> This improves performance by ~25% in terms of PPS on AMD
> platforms.
> 
> Fixes: 2ea6f76aff40 ("qede: add core driver")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>

Hi Shahed,

This patch was sent on the same day as RC3, very close to the actual release,
and it updates the data path of the driver, so it is not a trivial patch.

Although I tend to accept driver patches late in the process, this one is too
late: there won't be proper time to fix any errors that are detected. That
said, I can see it has a substantial performance effect.

I would like to explicitly ask whether you are willing to take the risk. If
the answer is yes, please also be sure to get Rasesh's ack.

Thanks,
ferruh

* Re: [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
From: Ferruh Yigit @ 2019-01-18 14:41 UTC (permalink / raw)
  To: Shahed Shaikh, dev; +Cc: stable, Rasesh Mody, Thomas Monjalon

On 1/18/2019 2:41 PM, Ferruh Yigit wrote:
> On 1/18/2019 10:29 AM, Shahed Shaikh wrote:
>> Allocating a replacement buffer per received packet is expensive.
>> Instead, process received packets first and allocate
>> replacement buffers in bulk later.
>>
>> This improves performance by ~25% in terms of PPS on AMD
>> platforms.
>>
>> Fixes: 2ea6f76aff40 ("qede: add core driver")
>> Cc: stable@dpdk.org
>>
>> Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
> 
> Hi Shahed,
> 
> This patch was sent on the same day as RC3, very close to the actual release,
> and it updates the data path of the driver, so it is not a trivial patch.
> 
> Although I tend to accept driver patches late in the process, this one is too
> late: there won't be proper time to fix any errors that are detected. That
> said, I can see it has a substantial performance effect.
> 
> I would like to explicitly ask whether you are willing to take the risk. If
> the answer is yes, please also be sure to get Rasesh's ack.

cc'ed Thomas.

* Re: [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
From: Rasesh Mody @ 2019-01-18 16:57 UTC (permalink / raw)
  To: Shahed Shaikh, dev; +Cc: ferruh.yigit, stable, thomas

>From: dev <dev-bounces@dpdk.org> On Behalf Of Shahed Shaikh
>Sent: Friday, January 18, 2019 2:29 AM
>
>Allocating a replacement buffer per received packet is expensive.
>Instead, process received packets first and allocate replacement buffers in
>bulk later.
>
>This improves performance by ~25% in terms of PPS on AMD platforms.
>
>Fixes: 2ea6f76aff40 ("qede: add core driver")
>Cc: stable@dpdk.org
>
>Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
>---

Acked-by: Rasesh Mody <rmody@marvell.com> 

* Re: [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path
From: Rasesh Mody @ 2019-01-18 16:58 UTC (permalink / raw)
  To: Shahed Shaikh, dev; +Cc: ferruh.yigit, stable, thomas

>From: dev <dev-bounces@dpdk.org> On Behalf Of Shahed Shaikh
>Sent: Friday, January 18, 2019 2:30 AM
>
>Incorrect placement of a prefetch in the Tx path is causing a performance
>drop of ~2% on AMD platforms.
>
>Fixes: 2ea6f76aff40 ("qede: add core driver")
>Cc: stable@dpdk.org
>
>Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
>---

Acked-by: Rasesh Mody <rmody@marvell.com>

* Re: [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
From: Thomas Monjalon @ 2019-01-18 23:39 UTC (permalink / raw)
  To: Rasesh Mody, Shahed Shaikh; +Cc: dev, ferruh.yigit, stable

18/01/2019 17:57, Rasesh Mody:
> >From: dev <dev-bounces@dpdk.org> On Behalf Of Shahed Shaikh
> >Sent: Friday, January 18, 2019 2:29 AM
> >
> >Allocating a replacement buffer per received packet is expensive.
> >Instead, process received packets first and allocate replacement buffers in
> >bulk later.
> >
> >This improves performance by ~25% in terms of PPS on AMD platforms.
> >
> >Fixes: 2ea6f76aff40 ("qede: add core driver")
> >Cc: stable@dpdk.org
> >
> >Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>

Applied, thanks
