* [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
@ 2019-01-18 10:29 Shahed Shaikh
  2019-01-18 10:29 ` [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path Shahed Shaikh
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Shahed Shaikh @ 2019-01-18 10:29 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, stable

Allocating a replacement buffer for each received packet is expensive.
Instead, process the received packets first and allocate
replacement buffers in bulk afterwards.

This improves performance by ~25% in terms of PPS on AMD
platforms.

Fixes: 2ea6f76aff40 ("qede: add core driver")
Cc: stable@dpdk.org

Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
---
 drivers/net/qede/qede_rxtx.c | 97 +++++++++++++++++++++++++++++++++-----------
 drivers/net/qede/qede_rxtx.h |  2 +
 2 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index 0e33be1..684c4ae 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -35,6 +35,52 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
 	return 0;
 }
 
+#define QEDE_MAX_BULK_ALLOC_COUNT 512
+
+static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count)
+{
+	void *obj_p[QEDE_MAX_BULK_ALLOC_COUNT] __rte_cache_aligned;
+	struct rte_mbuf *mbuf = NULL;
+	struct eth_rx_bd *rx_bd;
+	dma_addr_t mapping;
+	int i, ret = 0;
+	uint16_t idx;
+
+	idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+
+	if (count > QEDE_MAX_BULK_ALLOC_COUNT)
+		count = QEDE_MAX_BULK_ALLOC_COUNT;
+
+	ret = rte_mempool_get_bulk(rxq->mb_pool, obj_p, count);
+	if (unlikely(ret)) {
+		PMD_RX_LOG(ERR, rxq,
+			   "Failed to allocate %d rx buffers "
+			    "sw_rx_prod %u sw_rx_cons %u mp entries %u free %u",
+			    count, idx, rxq->sw_rx_cons & NUM_RX_BDS(rxq),
+			    rte_mempool_avail_count(rxq->mb_pool),
+			    rte_mempool_in_use_count(rxq->mb_pool));
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < count; i++) {
+		mbuf = obj_p[i];
+		if (likely(i < count - 1))
+			rte_prefetch0(obj_p[i + 1]);
+
+		idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+		rxq->sw_rx_ring[idx].mbuf = mbuf;
+		rxq->sw_rx_ring[idx].page_offset = 0;
+		mapping = rte_mbuf_data_iova_default(mbuf);
+		rx_bd = (struct eth_rx_bd *)
+			ecore_chain_produce(&rxq->rx_bd_ring);
+		rx_bd->addr.hi = rte_cpu_to_le_32(U64_HI(mapping));
+		rx_bd->addr.lo = rte_cpu_to_le_32(U64_LO(mapping));
+		rxq->sw_rx_prod++;
+	}
+
+	return 0;
+}
+
 /* Criterias for calculating Rx buffer size -
  * 1) rx_buf_size should not exceed the size of mbuf
  * 2) In scattered_rx mode - minimum rx_buf_size should be
@@ -1131,7 +1177,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
 		struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)
 {
 	struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring);
-	uint16_t idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
+	uint16_t idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
 	struct qede_rx_entry *curr_prod;
 	dma_addr_t new_mapping;
 
@@ -1364,7 +1410,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint8_t bitfield_val;
 #endif
 	uint8_t tunn_parse_flag;
-	uint8_t j;
 	struct eth_fast_path_rx_tpa_start_cqe *cqe_start_tpa;
 	uint64_t ol_flags;
 	uint32_t packet_type;
@@ -1373,6 +1418,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint8_t offset, tpa_agg_idx, flags;
 	struct qede_agg_info *tpa_info = NULL;
 	uint32_t rss_hash;
+	int rx_alloc_count = 0;
 
 	hw_comp_cons = rte_le_to_cpu_16(*rxq->hw_cons_ptr);
 	sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring);
@@ -1382,6 +1428,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (hw_comp_cons == sw_comp_cons)
 		return 0;
 
+	/* Allocate replacements for the buffers used in the previous loop */
+	if (rxq->rx_alloc_count) {
+		if (unlikely(qede_alloc_rx_bulk_mbufs(rxq,
+			     rxq->rx_alloc_count))) {
+			struct rte_eth_dev *dev;
+
+			PMD_RX_LOG(ERR, rxq,
+				   "New buffer allocation failed,"
+				   "dropping incoming packet\n");
+			dev = &rte_eth_devices[rxq->port_id];
+			dev->data->rx_mbuf_alloc_failed +=
+							rxq->rx_alloc_count;
+			rxq->rx_alloc_errors += rxq->rx_alloc_count;
+			return 0;
+		}
+		qede_update_rx_prod(qdev, rxq);
+		rxq->rx_alloc_count = 0;
+	}
+
 	while (sw_comp_cons != hw_comp_cons) {
 		ol_flags = 0;
 		packet_type = RTE_PTYPE_UNKNOWN;
@@ -1553,16 +1618,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			rx_mb->hash.rss = rss_hash;
 		}
 
-		if (unlikely(qede_alloc_rx_buffer(rxq) != 0)) {
-			PMD_RX_LOG(ERR, rxq,
-				   "New buffer allocation failed,"
-				   "dropping incoming packet\n");
-			qede_recycle_rx_bd_ring(rxq, qdev, fp_cqe->bd_num);
-			rte_eth_devices[rxq->port_id].
-			    data->rx_mbuf_alloc_failed++;
-			rxq->rx_alloc_errors++;
-			break;
-		}
+		rx_alloc_count++;
 		qede_rx_bd_ring_consume(rxq);
 
 		if (!tpa_start_flg && fp_cqe->bd_num > 1) {
@@ -1574,17 +1630,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			if (qede_process_sg_pkts(p_rxq, seg1, num_segs,
 						 pkt_len - len))
 				goto next_cqe;
-			for (j = 0; j < num_segs; j++) {
-				if (qede_alloc_rx_buffer(rxq)) {
-					PMD_RX_LOG(ERR, rxq,
-						"Buffer allocation failed");
-					rte_eth_devices[rxq->port_id].
-						data->rx_mbuf_alloc_failed++;
-					rxq->rx_alloc_errors++;
-					break;
-				}
-				rxq->rx_segs++;
-			}
+
+			rx_alloc_count += num_segs;
+			rxq->rx_segs += num_segs;
 		}
 		rxq->rx_segs++; /* for the first segment */
 
@@ -1626,7 +1674,8 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		}
 	}
 
-	qede_update_rx_prod(qdev, rxq);
+	/* Request the number of buffers to be allocated in the next loop */
+	rxq->rx_alloc_count = rx_alloc_count;
 
 	rxq->rcv_pkts += rx_pkt;
 
diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h
index 454daa0..5b249cb 100644
--- a/drivers/net/qede/qede_rxtx.h
+++ b/drivers/net/qede/qede_rxtx.h
@@ -192,6 +192,8 @@ struct qede_rx_queue {
 	uint16_t queue_id;
 	uint16_t port_id;
 	uint16_t rx_buf_size;
+	uint16_t rx_alloc_count;
+	uint16_t unused;
 	uint64_t rcv_pkts;
 	uint64_t rx_segs;
 	uint64_t rx_hw_errors;
-- 
2.7.4
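
The refill scheme introduced by the patch above can be illustrated outside the
driver. The following is a minimal, self-contained C sketch, not driver code:
toy_pool_get_bulk() plays the role of rte_mempool_get_bulk() (all-or-nothing),
toy_rxq stands in for the qede Rx queue, and the buffer "ring" is a plain
fixed-size array. None of the toy_* names exist in DPDK or the qede PMD; the
sketch only shows how per-packet allocation is replaced by counting consumed
buffers and refilling them in one bulk call at the start of the next poll.

/* toy_rx_bulk.c: stand-alone illustration of deferred bulk allocation.
 * Compile with: cc -std=c99 -o toy_rx_bulk toy_rx_bulk.c
 */
#include <stdio.h>
#include <stdlib.h>

#define RING_SIZE 16u
#define MAX_BULK   8u

struct toy_rxq {
	void *sw_ring[RING_SIZE];     /* buffers currently posted to "hardware" */
	unsigned int prod;            /* producer index: buffers handed to HW   */
	unsigned int cons;            /* consumer index: buffers taken back     */
	unsigned int rx_alloc_count;  /* buffers consumed by the previous poll  */
};

/* Stand-in for rte_mempool_get_bulk(): succeeds or fails as a whole. */
static int toy_pool_get_bulk(void **objs, unsigned int n)
{
	for (unsigned int i = 0; i < n; i++) {
		objs[i] = malloc(64);
		if (objs[i] == NULL) {
			while (i--)
				free(objs[i]);
			return -1;
		}
	}
	return 0;
}

/* Refill the ring with 'count' buffers obtained in a single bulk call. */
static int toy_refill_bulk(struct toy_rxq *rxq, unsigned int count)
{
	void *objs[MAX_BULK];

	if (count > MAX_BULK)
		count = MAX_BULK;
	if (toy_pool_get_bulk(objs, count) != 0)
		return -1;
	for (unsigned int i = 0; i < count; i++)
		rxq->sw_ring[rxq->prod++ % RING_SIZE] = objs[i];
	return 0;
}

/* One receive poll: first repay the buffers consumed by the previous poll
 * with one bulk allocation, then consume packets without allocating at all.
 */
static unsigned int toy_poll(struct toy_rxq *rxq, unsigned int nb_pkts)
{
	unsigned int consumed = 0;

	if (rxq->rx_alloc_count != 0) {
		if (toy_refill_bulk(rxq, rxq->rx_alloc_count) != 0)
			return 0;        /* allocation failed: drop this poll */
		rxq->rx_alloc_count = 0;
	}

	while (consumed < nb_pkts && rxq->cons != rxq->prod) {
		void *buf = rxq->sw_ring[rxq->cons++ % RING_SIZE];
		free(buf);               /* "deliver" the packet upstream      */
		consumed++;
	}

	rxq->rx_alloc_count = consumed;  /* refilled in bulk on the next poll  */
	return consumed;
}

int main(void)
{
	struct toy_rxq rxq = { .prod = 0, .cons = 0, .rx_alloc_count = 0 };

	toy_refill_bulk(&rxq, MAX_BULK);  /* initial fill, as queue setup does */
	for (int i = 0; i < 4; i++)
		printf("poll %d: received %u packets\n", i, toy_poll(&rxq, 5));
	return 0;
}

The actual patch additionally caps the bulk size at QEDE_MAX_BULK_ALLOC_COUNT
and prefetches the next mbuf while writing the BD entries; the sketch omits
those details.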

* Re: [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
@ 2019-01-18 16:13 Shahed Shaikh
  2019-01-18 17:15 ` Rasesh Mody
  0 siblings, 1 reply; 9+ messages in thread
From: Shahed Shaikh @ 2019-01-18 16:13 UTC (permalink / raw)
  To: Ferruh Yigit, dev; +Cc: stable, Rasesh Mody, Thomas Monjalon



> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Friday, January 18, 2019 8:11 PM
> To: Shahed Shaikh <shshaikh@marvell.com>; dev@dpdk.org
> Cc: stable@dpdk.org; Rasesh Mody <rmody@marvell.com>; Thomas Monjalon
> <thomas@monjalon.net>
> Subject: [EXT] Re: [dpdk-dev] [PATCH 1/2] net/qede: fix performance bottleneck
> in Rx path
> 
> 
> ----------------------------------------------------------------------
> On 1/18/2019 2:41 PM, Ferruh Yigit wrote:
> > On 1/18/2019 10:29 AM, Shahed Shaikh wrote:
> >> Allocating a replacement buffer for each received packet is expensive.
> >> Instead, process the received packets first and allocate
> >> replacement buffers in bulk afterwards.
> >>
> >> This improves performance by ~25% in terms of PPS on AMD
> >> platforms.
> >>
> >> Fixes: 2ea6f76aff40 ("qede: add core driver")
> >> Cc: stable@dpdk.org
> >>
> >> Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
> >
> > Hi Shahed,
> >
> > This patch was sent on the same day as RC3, very close to the actual release,
> > and it updates the driver's data path, so it is not a trivial patch.
> >
> > Although I tend to accept driver patches late in the process, this one was too
> > late; you won't have proper time to fix any errors that are detected, but I can
> > also see that it has a considerable performance benefit.
> >
> > I would like to explicitly ask whether you are willing to take the risk. If the
> > answer is yes, please also be sure to get Rasesh's ack.

Hi Ferruh,

We have done thorough regression testing with this series and would like to get this included in 19.02.
Please consider this series.

Rasesh,
Please ack this series.

Thanks,
Shahed
> 
> cc'ed Thomas.


Thread overview: 9+ messages
2019-01-18 10:29 [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Shahed Shaikh
2019-01-18 10:29 ` [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path Shahed Shaikh
2019-01-18 16:58   ` Rasesh Mody
2019-01-18 14:41 ` [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Ferruh Yigit
2019-01-18 14:41   ` Ferruh Yigit
2019-01-18 16:57 ` Rasesh Mody
2019-01-18 23:39   ` Thomas Monjalon
2019-01-18 16:13 Shahed Shaikh
2019-01-18 17:15 ` Rasesh Mody
