* [PATCH net-next] net: axienet: Use NAPI for TX completion path
@ 2022-04-29 22:28 ` Robert Hancock
  0 siblings, 0 replies; 14+ messages in thread
From: Robert Hancock @ 2022-04-29 22:28 UTC (permalink / raw)
  To: netdev
  Cc: radhey.shyam.pandey, davem, edumazet, kuba, pabeni, michal.simek,
	linux-arm-kernel, Robert Hancock

This driver was using the TX IRQ handler to perform all TX completion
tasks. Under heavy TX network load, this can cause significant irqs-off
latencies (found to be in the hundreds of microseconds using ftrace).
This can cause other issues, such as overrunning serial UART FIFOs when
using high baud rates with limited UART FIFO sizes.

Switch to using the NAPI poll handler to perform the TX completion work
to get this out of hard IRQ context and avoid the IRQ latency impact.

Signed-off-by: Robert Hancock <robert.hancock@calian.com>
---
 drivers/net/ethernet/xilinx/xilinx_axienet.h  |  2 +
 .../net/ethernet/xilinx/xilinx_axienet_main.c | 56 ++++++++++++-------
 2 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h
index d5c1e5c4a508..6e58d034fe90 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet.h
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h
@@ -397,6 +397,7 @@ struct axidma_bd {
  * @regs:	Base address for the axienet_local device address space
  * @dma_regs:	Base address for the axidma device address space
  * @rx_dma_cr:  Nominal content of RX DMA control register
+ * @tx_dma_cr:  Nominal content of TX DMA control register
  * @dma_err_task: Work structure to process Axi DMA errors
  * @tx_irq:	Axidma TX IRQ number
  * @rx_irq:	Axidma RX IRQ number
@@ -454,6 +455,7 @@ struct axienet_local {
 	void __iomem *dma_regs;
 
 	u32 rx_dma_cr;
+	u32 tx_dma_cr;
 
 	struct work_struct dma_err_task;
 
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index d6fc3f7acdf0..a52e616275e4 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -254,8 +254,6 @@ static u32 axienet_usec_to_timer(struct axienet_local *lp, u32 coalesce_usec)
  */
 static void axienet_dma_start(struct axienet_local *lp)
 {
-	u32 tx_cr;
-
 	/* Start updating the Rx channel control register */
 	lp->rx_dma_cr = (lp->coalesce_count_rx << XAXIDMA_COALESCE_SHIFT) |
 			XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_ERROR_MASK;
@@ -269,16 +267,16 @@ static void axienet_dma_start(struct axienet_local *lp)
 	axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr);
 
 	/* Start updating the Tx channel control register */
-	tx_cr = (lp->coalesce_count_tx << XAXIDMA_COALESCE_SHIFT) |
-		XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_ERROR_MASK;
+	lp->tx_dma_cr = (lp->coalesce_count_tx << XAXIDMA_COALESCE_SHIFT) |
+			XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_ERROR_MASK;
 	/* Only set interrupt delay timer if not generating an interrupt on
 	 * the first TX packet. Otherwise leave at 0 to disable delay interrupt.
 	 */
 	if (lp->coalesce_count_tx > 1)
-		tx_cr |= (axienet_usec_to_timer(lp, lp->coalesce_usec_tx)
-				<< XAXIDMA_DELAY_SHIFT) |
-			 XAXIDMA_IRQ_DELAY_MASK;
-	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, tx_cr);
+		lp->tx_dma_cr |= (axienet_usec_to_timer(lp, lp->coalesce_usec_tx)
+					<< XAXIDMA_DELAY_SHIFT) |
+				 XAXIDMA_IRQ_DELAY_MASK;
+	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr);
 
 	/* Populate the tail pointer and bring the Rx Axi DMA engine out of
 	 * halted state. This will make the Rx side ready for reception.
@@ -294,8 +292,8 @@ static void axienet_dma_start(struct axienet_local *lp)
 	 * tail pointer register that the Tx channel will start transmitting.
 	 */
 	axienet_dma_out_addr(lp, XAXIDMA_TX_CDESC_OFFSET, lp->tx_bd_p);
-	tx_cr |= XAXIDMA_CR_RUNSTOP_MASK;
-	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, tx_cr);
+	lp->tx_dma_cr |= XAXIDMA_CR_RUNSTOP_MASK;
+	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr);
 }
 
 /**
@@ -671,13 +669,14 @@ static int axienet_device_reset(struct net_device *ndev)
  * @nr_bds:	Number of descriptors to clean up, can be -1 if unknown.
  * @sizep:	Pointer to a u32 filled with the total sum of all bytes
  * 		in all cleaned-up descriptors. Ignored if NULL.
+ * @budget:	NAPI budget (use 0 when not called from NAPI poll)
  *
  * Would either be called after a successful transmit operation, or after
  * there was an error when setting up the chain.
  * Returns the number of descriptors handled.
  */
 static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd,
-				 int nr_bds, u32 *sizep)
+				 int nr_bds, u32 *sizep, int budget)
 {
 	struct axienet_local *lp = netdev_priv(ndev);
 	struct axidma_bd *cur_p;
@@ -707,7 +706,7 @@ static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd,
 				 DMA_TO_DEVICE);
 
 		if (cur_p->skb && (status & XAXIDMA_BD_STS_COMPLETE_MASK))
-			dev_consume_skb_irq(cur_p->skb);
+			napi_consume_skb(cur_p->skb, budget);
 
 		cur_p->app0 = 0;
 		cur_p->app1 = 0;
@@ -756,20 +755,24 @@ static inline int axienet_check_tx_bd_space(struct axienet_local *lp,
  * axienet_start_xmit_done - Invoked once a transmit is completed by the
  * Axi DMA Tx channel.
  * @ndev:	Pointer to the net_device structure
+ * @budget:	NAPI budget
  *
- * This function is invoked from the Axi DMA Tx isr to notify the completion
+ * This function is invoked from the NAPI processing to notify the completion
  * of transmit operation. It clears fields in the corresponding Tx BDs and
  * unmaps the corresponding buffer so that CPU can regain ownership of the
  * buffer. It finally invokes "netif_wake_queue" to restart transmission if
  * required.
  */
-static void axienet_start_xmit_done(struct net_device *ndev)
+static void axienet_start_xmit_done(struct net_device *ndev, int budget)
 {
 	struct axienet_local *lp = netdev_priv(ndev);
 	u32 packets = 0;
 	u32 size = 0;
 
-	packets = axienet_free_tx_chain(ndev, lp->tx_bd_ci, -1, &size);
+	packets = axienet_free_tx_chain(ndev, lp->tx_bd_ci, -1, &size, budget);
+
+	if (!packets)
+		return;
 
 	lp->tx_bd_ci += packets;
 	if (lp->tx_bd_ci >= lp->tx_bd_num)
@@ -865,7 +868,7 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 				netdev_err(ndev, "TX DMA mapping error\n");
 			ndev->stats.tx_dropped++;
 			axienet_free_tx_chain(ndev, orig_tail_ptr, ii + 1,
-					      NULL);
+					      NULL, 0);
 			lp->tx_bd_tail = orig_tail_ptr;
 
 			return NETDEV_TX_OK;
@@ -899,9 +902,9 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 }
 
 /**
- * axienet_poll - Triggered by RX ISR to complete the received BD processing.
+ * axienet_poll - Triggered by RX/TX ISR to complete the BD processing.
  * @napi:	Pointer to NAPI structure.
- * @budget:	Max number of packets to process.
+ * @budget:	Max number of RX packets to process.
  *
  * Return: Number of RX packets processed.
  */
@@ -916,6 +919,8 @@ static int axienet_poll(struct napi_struct *napi, int budget)
 	struct sk_buff *skb, *new_skb;
 	struct axienet_local *lp = container_of(napi, struct axienet_local, napi);
 
+	axienet_start_xmit_done(lp->ndev, budget);
+
 	cur_p = &lp->rx_bd_v[lp->rx_bd_ci];
 
 	while (packets < budget && (cur_p->status & XAXIDMA_BD_STS_COMPLETE_MASK)) {
@@ -1001,11 +1006,12 @@ static int axienet_poll(struct napi_struct *napi, int budget)
 		axienet_dma_out_addr(lp, XAXIDMA_RX_TDESC_OFFSET, tail_p);
 
 	if (packets < budget && napi_complete_done(napi, packets)) {
-		/* Re-enable RX completion interrupts. This should
-		 * cause an immediate interrupt if any RX packets are
+		/* Re-enable RX/TX completion interrupts. This should
+		 * cause an immediate interrupt if any RX/TX packets are
 		 * already pending.
 		 */
 		axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr);
+		axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr);
 	}
 	return packets;
 }
@@ -1040,7 +1046,15 @@ static irqreturn_t axienet_tx_irq(int irq, void *_ndev)
 			   (lp->tx_bd_v[lp->tx_bd_ci]).phys);
 		schedule_work(&lp->dma_err_task);
 	} else {
-		axienet_start_xmit_done(lp->ndev);
+		/* Disable further TX completion interrupts and schedule
+		 * NAPI to handle the completions.
+		 */
+		u32 cr = lp->tx_dma_cr;
+
+		cr &= ~(XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK);
+		axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, cr);
+
+		napi_schedule(&lp->napi);
 	}
 
 	return IRQ_HANDLED;
-- 
2.31.1
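
A condensed sketch of the flow the patch implements, reusing the driver's own
names from the diff above but omitting the DMA error branch, the RX receive
loop and the descriptor bookkeeping — illustrative only, not a drop-in
replacement for the patch:

static irqreturn_t axienet_tx_irq(int irq, void *_ndev)
{
        struct net_device *ndev = _ndev;
        struct axienet_local *lp = netdev_priv(ndev);
        u32 cr = lp->tx_dma_cr;

        /* Mask further TX completion/delay interrupts ... */
        cr &= ~(XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK);
        axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, cr);

        /* ... and defer the completion work to NAPI (softirq) context. */
        napi_schedule(&lp->napi);

        return IRQ_HANDLED;
}

static int axienet_poll(struct napi_struct *napi, int budget)
{
        struct axienet_local *lp = container_of(napi, struct axienet_local, napi);
        int packets = 0;

        /* Reap completed TX descriptors first; skbs are now freed with
         * napi_consume_skb(skb, budget), which can take the bulk-free path.
         */
        axienet_start_xmit_done(lp->ndev, budget);

        /* ... RX processing, bounded by budget, updates 'packets' ... */

        if (packets < budget && napi_complete_done(napi, packets)) {
                /* Re-arm RX and TX interrupts from the cached register values. */
                axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr);
                axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr);
        }

        return packets;
}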


* RE: [PATCH net-next] net: axienet: Use NAPI for TX completion path
  2022-04-29 22:28 ` Robert Hancock
@ 2022-05-02 19:30   ` Radhey Shyam Pandey
  -1 siblings, 0 replies; 14+ messages in thread
From: Radhey Shyam Pandey @ 2022-05-02 19:30 UTC (permalink / raw)
  To: Robert Hancock, netdev
  Cc: davem, edumazet, kuba, pabeni, Michal Simek, linux-arm-kernel,
	Harini Katakam

> -----Original Message-----
> From: Robert Hancock <robert.hancock@calian.com>
> Sent: Saturday, April 30, 2022 3:59 AM
> To: netdev@vger.kernel.org
> Cc: Radhey Shyam Pandey <radheys@xilinx.com>; davem@davemloft.net;
> edumazet@google.com; kuba@kernel.org; pabeni@redhat.com; Michal Simek
> <michals@xilinx.com>; linux-arm-kernel@lists.infradead.org; Robert Hancock
> <robert.hancock@calian.com>
> Subject: [PATCH net-next] net: axienet: Use NAPI for TX completion path
> 
> This driver was using the TX IRQ handler to perform all TX completion
> tasks. Under heavy TX network load, this can cause significant irqs-off
> latencies (found to be in the hundreds of microseconds using ftrace).
> This can cause other issues, such as overrunning serial UART FIFOs when
> using high baud rates with limited UART FIFO sizes.
> 
> Switch to using the NAPI poll handler to perform the TX completion work
> to get this out of hard IRQ context and avoid the IRQ latency impact.

Thanks for the patch. I assume for simulating heavy network load we
are using netperf/iperf. Do we have some details on the benchmark
before and after adding TX NAPI? I want to see the impact on
throughput.

> [quoted patch snipped]


* Re: [PATCH net-next] net: axienet: Use NAPI for TX completion path
  2022-05-02 19:30   ` Radhey Shyam Pandey
@ 2022-05-05  2:20     ` Jakub Kicinski
  -1 siblings, 0 replies; 14+ messages in thread
From: Jakub Kicinski @ 2022-05-05  2:20 UTC (permalink / raw)
  To: Radhey Shyam Pandey, Robert Hancock
  Cc: netdev, davem, edumazet, pabeni, Michal Simek, linux-arm-kernel,
	Harini Katakam

On Mon, 2 May 2022 19:30:51 +0000 Radhey Shyam Pandey wrote:
> > This driver was using the TX IRQ handler to perform all TX completion
> > tasks. Under heavy TX network load, this can cause significant irqs-off
> > latencies (found to be in the hundreds of microseconds using ftrace).
> > This can cause other issues, such as overrunning serial UART FIFOs when
> > using high baud rates with limited UART FIFO sizes.
> > 
> > Switch to using the NAPI poll handler to perform the TX completion work
> > to get this out of hard IRQ context and avoid the IRQ latency impact.  
> 
> Thanks for the patch. I assume for simulating heavy network load we
> are using netperf/iperf. Do we have some details on the benchmark
> before and after adding TX NAPI? I want to see the impact on
> throughput.

Seems like a reasonable ask, let's get the patch reposted 
with the numbers in the commit message.


* Re: [PATCH net-next] net: axienet: Use NAPI for TX completion path
  2022-05-05  2:20     ` Jakub Kicinski
@ 2022-05-05 17:33       ` Robert Hancock
  -1 siblings, 0 replies; 14+ messages in thread
From: Robert Hancock @ 2022-05-05 17:33 UTC (permalink / raw)
  To: radheys, kuba
  Cc: linux-arm-kernel, netdev, davem, michals, pabeni, edumazet, harinik

On Wed, 2022-05-04 at 19:20 -0700, Jakub Kicinski wrote:
> On Mon, 2 May 2022 19:30:51 +0000 Radhey Shyam Pandey wrote:
> > > This driver was using the TX IRQ handler to perform all TX completion
> > > tasks. Under heavy TX network load, this can cause significant irqs-off
> > > latencies (found to be in the hundreds of microseconds using ftrace).
> > > This can cause other issues, such as overrunning serial UART FIFOs when
> > > using high baud rates with limited UART FIFO sizes.
> > > 
> > > Switch to using the NAPI poll handler to perform the TX completion work
> > > to get this out of hard IRQ context and avoid the IRQ latency impact.  
> > 
> > Thanks for the patch. I assume for simulating heavy network load we
> > are using netperf/iperf. Do we have some details on the benchmark
> > before and after adding TX NAPI? I want to see the impact on
> > throughput.
> 
> Seems like a reasonable ask, let's get the patch reposted 
> with the numbers in the commit message.

Didn't mean to ignore that request, looks like I didn't get Radhey's email
directly, odd.

I did a test with iperf3 from the board (Xilinx MPSoC ZU9EG platform) connected
to a Linux PC via a switch at 1G link speed. With TX NAPI in place I saw about
942 Mbps for TX rate, with the previous code I saw 941 Mbps. RX speed was also
unchanged at 941 Mbps. So no real significant change either way. I can spin
another version of the patch that includes these numbers.

-- 
Robert Hancock
Senior Hardware Designer, Calian Advanced Technologies
www.calian.com


* Re: [PATCH net-next] net: axienet: Use NAPI for TX completion path
  2022-05-05 17:33       ` Robert Hancock
@ 2022-05-05 18:08         ` Jakub Kicinski
  -1 siblings, 0 replies; 14+ messages in thread
From: Jakub Kicinski @ 2022-05-05 18:08 UTC (permalink / raw)
  To: Robert Hancock
  Cc: radheys, linux-arm-kernel, netdev, davem, michals, pabeni,
	edumazet, harinik

On Thu, 5 May 2022 17:33:39 +0000 Robert Hancock wrote:
> On Wed, 2022-05-04 at 19:20 -0700, Jakub Kicinski wrote:
> > On Mon, 2 May 2022 19:30:51 +0000 Radhey Shyam Pandey wrote:  
> > > Thanks for the patch. I assume for simulating heavy network load we
> > > are using netperf/iperf. Do we have some details on the benchmark
> > > before and after adding TX NAPI? I want to see the impact on
> > > throughput.  
> > 
> > Seems like a reasonable ask, let's get the patch reposted 
> > with the numbers in the commit message.  
> 
> Didn't mean to ignore that request, looks like I didn't get Radhey's email
> directly, odd.
> 
> I did a test with iperf3 from the board (Xilinx MPSoC ZU9EG platform) connected
> to a Linux PC via a switch at 1G link speed. With TX NAPI in place I saw about
> 942 Mbps for TX rate, with the previous code I saw 941 Mbps. RX speed was also
> unchanged at 941 Mbps. So no real significant change either way. I can spin
> another version of the patch that includes these numbers.

Sounds like line rate, is there a difference in CPU utilization?


* Re: [PATCH net-next] net: axienet: Use NAPI for TX completion path
  2022-05-05 18:08         ` Jakub Kicinski
@ 2022-05-05 18:56           ` Robert Hancock
  -1 siblings, 0 replies; 14+ messages in thread
From: Robert Hancock @ 2022-05-05 18:56 UTC (permalink / raw)
  To: kuba
  Cc: harinik, michals, pabeni, edumazet, linux-arm-kernel, netdev,
	radheys, davem

On Thu, 2022-05-05 at 11:08 -0700, Jakub Kicinski wrote:
> On Thu, 5 May 2022 17:33:39 +0000 Robert Hancock wrote:
> > On Wed, 2022-05-04 at 19:20 -0700, Jakub Kicinski wrote:
> > > On Mon, 2 May 2022 19:30:51 +0000 Radhey Shyam Pandey wrote:  
> > > > Thanks for the patch. I assume for simulating heavy network load we
> > > > are using netperf/iperf. Do we have some details on the benchmark
> > > > before and after adding TX NAPI? I want to see the impact on
> > > > throughput.  
> > > 
> > > Seems like a reasonable ask, let's get the patch reposted 
> > > with the numbers in the commit message.  
> > 
> > Didn't mean to ignore that request, looks like I didn't get Radhey's email
> > directly, odd.
> > 
> > I did a test with iperf3 from the board (Xilinx MPSoC ZU9EG platform)
> > connected
> > to a Linux PC via a switch at 1G link speed. With TX NAPI in place I saw
> > about
> > 942 Mbps for TX rate, with the previous code I saw 941 Mbps. RX speed was
> > also
> > unchanged at 941 Mbps. So no real significant change either way. I can spin
> > another version of the patch that includes these numbers.
> 
> Sounds like line rate, is there a difference in CPU utilization?

Some measurements on that from the TX load case - in both cases the RX and TX
IRQs ended up being split across CPU0 and CPU3 due to irqbalance:

Before:

CPU0 (RX): 1% hard IRQ, 13% soft IRQ
CPU3 (TX): 12% hard IRQ, 30% soft IRQ

After:

CPU0 (RX): <1% hard IRQ, 29% soft IRQ
CPU3 (TX): <1% hard IRQ, 21% soft IRQ

The hard IRQ time is definitely lower, and the total CPU usage is lower as well
(56% down to 50%). It's interesting that so much of the CPU load ended up on
the CPU with the RX IRQ though, presumably because the RX and TX IRQs are
triggering the same NAPI poll operation. Since they're separate IRQs that can
be on separate CPUs, it might be a win to use separate NAPI poll structures
for RX and TX so that both CPUs aren't trying to hit the same rings (TX and
RX)?

-- 
Robert Hancock
Senior Hardware Designer, Calian Advanced Technologies
www.calian.com
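
The split Robert suggests would look roughly like the sketch below. The names
(napi_rx/napi_tx, axienet_rx_poll/axienet_tx_poll) are hypothetical, the
descriptor-cleanup bodies are elided, and netif_napi_add() is shown with the
weight argument it took in kernels of this vintage — a rough outline, not code
from any posted patch:

struct axienet_local {
        /* ... existing fields ... */
        struct napi_struct napi_rx;     /* scheduled from the RX IRQ */
        struct napi_struct napi_tx;     /* scheduled from the TX IRQ */
};

static int axienet_tx_poll(struct napi_struct *napi, int budget)
{
        struct axienet_local *lp = container_of(napi, struct axienet_local, napi_tx);
        int done = 0;

        /* ... clean up to 'budget' completed TX descriptors, counted in 'done' ... */

        if (done < budget && napi_complete_done(napi, done))
                axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr);

        return done;
}

static int axienet_rx_poll(struct napi_struct *napi, int budget)
{
        struct axienet_local *lp = container_of(napi, struct axienet_local, napi_rx);
        int done = 0;

        /* ... receive up to 'budget' packets, counted in 'done' ... */

        if (done < budget && napi_complete_done(napi, done))
                axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr);

        return done;
}

static void axienet_napi_add_split(struct net_device *ndev, struct axienet_local *lp)
{
        /* Hypothetical helper; in practice these calls would replace the
         * single existing NAPI registration.
         */
        netif_napi_add(ndev, &lp->napi_rx, axienet_rx_poll, NAPI_POLL_WEIGHT);
        netif_napi_add(ndev, &lp->napi_tx, axienet_tx_poll, NAPI_POLL_WEIGHT);
}

Each IRQ handler would then mask only its own completion interrupts and call
napi_schedule() on its own instance, so TX completion work stays on the CPU
that took the TX interrupt instead of bouncing to the CPU handling RX.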


* Re: [PATCH net-next] net: axienet: Use NAPI for TX completion path
  2022-05-05 18:56           ` Robert Hancock
@ 2022-05-05 20:15             ` Robert Hancock
  -1 siblings, 0 replies; 14+ messages in thread
From: Robert Hancock @ 2022-05-05 20:15 UTC (permalink / raw)
  To: kuba
  Cc: harinik, michals, pabeni, edumazet, linux-arm-kernel, netdev,
	radheys, davem

On Thu, 2022-05-05 at 12:56 -0600, Robert Hancock wrote:
> On Thu, 2022-05-05 at 11:08 -0700, Jakub Kicinski wrote:
> > On Thu, 5 May 2022 17:33:39 +0000 Robert Hancock wrote:
> > > On Wed, 2022-05-04 at 19:20 -0700, Jakub Kicinski wrote:
> > > > On Mon, 2 May 2022 19:30:51 +0000 Radhey Shyam Pandey wrote:  
> > > > > Thanks for the patch. I assume for simulating heavy network load we
> > > > > are using netperf/iperf. Do we have some details on the benchmark
> > > > > before and after adding TX NAPI? I want to see the impact on
> > > > > throughput.  
> > > > 
> > > > Seems like a reasonable ask, let's get the patch reposted 
> > > > with the numbers in the commit message.  
> > > 
> > > Didn't mean to ignore that request, looks like I didn't get Radhey's
> > > email
> > > directly, odd.
> > > 
> > > I did a test with iperf3 from the board (Xilinx MPSoC ZU9EG platform)
> > > connected
> > > to a Linux PC via a switch at 1G link speed. With TX NAPI in place I saw
> > > about
> > > 942 Mbps for TX rate, with the previous code I saw 941 Mbps. RX speed was
> > > also
> > > unchanged at 941 Mbps. So no real significant change either way. I can
> > > spin
> > > another version of the patch that includes these numbers.
> > 
> > Sounds like line rate, is there a difference in CPU utilization?
> 
> Some measurements on that from the TX load case - in both cases the RX and TX
> IRQs ended up being split across CPU0 and CPU3 due to irqbalance:
> 
> Before:
> 
> CPU0 (RX): 1% hard IRQ, 13% soft IRQ
> CPU3 (TX): 12% hard IRQ, 30% soft IRQ
> 
> After:
> 
> CPU0 (RX): <1% hard IRQ, 29% soft IRQ
> CPU3 (TX): <1% hard IRQ, 21% soft IRQ
> 
> The hard IRQ time is definitely lower, and the total CPU usage is lower as
> well
> (56% down to 50%). It's interesting that so much of the CPU load ended up on
> the CPU with the RX IRQ though, presumably because the RX and TX IRQs are
> triggering the same NAPI poll operation. Since they're separate IRQs that can
> be on separate CPUs, it might be a win to use separate NAPI poll structures
> for RX and TX so that both CPUs aren't trying to hit the same rings (TX and
> RX)?

Indeed, it appears that separate RX and TX NAPI polling lowers the CPU usage
overall by a few percent as well as keeping the TX work on the same CPU as the
TX IRQ. I'll submit a v3 with these changes and will include the softirq
numbers in the commit text.

> 
-- 
Robert Hancock
Senior Hardware Designer, Calian Advanced Technologies
www.calian.com
