All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] bnx2x: Fix error recovering in switch configuration
@ 2022-08-25 20:00 Thinh Tran
  2022-08-27  1:44 ` Jakub Kicinski
  2022-08-30  9:19 ` [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration Manish Chopra
  0 siblings, 2 replies; 36+ messages in thread
From: Thinh Tran @ 2022-08-25 20:00 UTC (permalink / raw)
  To: kuba; +Cc: netdev, aelior, davem, manishc, skalluru, Thinh Tran

As the BCM57810 and other I/O adapters are connected
through a PCIe switch, the bnx2x driver causes unexpected
system hang/crash while handling PCIe switch errors, if 
its error handler is called after other drivers' handlers.

In this case, after numbers of bnx2x_tx_timout(), the
bnx2x_nic_unload() is  called, frees up resources and
calls bnx2x_napi_disable(). Then when EEH calls its
error handler, the bnx2x_io_error_detected() and
bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
and freeing the resources.

This patch will:
- reduce the numbers of bnx2x_panic_dump() while in
  bnx2x_tx_timeout(), avoid filling up dmesg buffer.
- use checking new napi_enable flags to prevent calling 
  disable again which causing system hangs.
- cheking if fp->page_pool already freed avoid system
  crash.

Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  4 ++++
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 24 ++++++++++++++++++-
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 +++
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index dd5945c4bfec..7fa23d47907a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1509,6 +1509,10 @@ struct bnx2x {
 	bool			cnic_loaded;
 	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
 
+	bool			napi_enable;
+	bool			cnic_napi_enable;
+	int			tx_timeout_cnt;
+
 	/* Flag that indicates that we can start looking for FCoE L2 queue
 	 * completions in the default status block.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 712b5595bc39..bb8d91f44642 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1860,37 +1860,49 @@ static int bnx2x_setup_irqs(struct bnx2x *bp)
 static void bnx2x_napi_enable_cnic(struct bnx2x *bp)
 {
 	int i;
+	if (bp->cnic_napi_enable)
+		return;
 
 	for_each_rx_queue_cnic(bp, i) {
 		napi_enable(&bnx2x_fp(bp, i, napi));
 	}
+	bp->cnic_napi_enable = true;
 }
 
 static void bnx2x_napi_enable(struct bnx2x *bp)
 {
 	int i;
+	if (bp->napi_enable)
+		return;
 
 	for_each_eth_queue(bp, i) {
 		napi_enable(&bnx2x_fp(bp, i, napi));
 	}
+	bp->napi_enable = true;
 }
 
 static void bnx2x_napi_disable_cnic(struct bnx2x *bp)
 {
 	int i;
+	if (!bp->cnic_napi_enable)
+		return;
 
 	for_each_rx_queue_cnic(bp, i) {
 		napi_disable(&bnx2x_fp(bp, i, napi));
 	}
+	bp->cnic_napi_enable = false;
 }
 
 static void bnx2x_napi_disable(struct bnx2x *bp)
 {
 	int i;
+	if (!bp->napi_enable)
+		return;
 
 	for_each_eth_queue(bp, i) {
 		napi_disable(&bnx2x_fp(bp, i, napi));
 	}
+	bp->napi_enable = false;
 }
 
 void bnx2x_netif_start(struct bnx2x *bp)
@@ -2554,6 +2566,7 @@ int bnx2x_load_cnic(struct bnx2x *bp)
 	}
 
 	/* Add all CNIC NAPI objects */
+	bp->cnic_napi_enable = false;
 	bnx2x_add_all_napi_cnic(bp);
 	DP(NETIF_MSG_IFUP, "cnic napi added\n");
 	bnx2x_napi_enable_cnic(bp);
@@ -2701,7 +2714,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 	 */
 	bnx2x_setup_tc(bp->dev, bp->max_cos);
 
+	bp->tx_timeout_cnt = 0;
 	/* Add all NAPI objects */
+	bp->napi_enable = false;
 	bnx2x_add_all_napi(bp);
 	DP(NETIF_MSG_IFUP, "napi added\n");
 	bnx2x_napi_enable(bp);
@@ -4982,7 +4997,14 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
 	 */
 	if (!bp->panic)
 #ifndef BNX2X_STOP_ON_ERROR
-		bnx2x_panic_dump(bp, false);
+	{
+		if (++bp->tx_timeout_cnt > 3) {
+			bnx2x_panic_dump(bp, false);
+			bp->tx_timeout_cnt = 0;
+		} else {
+			netdev_err(bp->dev, "TX timeout %d times\n", bp->tx_timeout_cnt);
+		}
+	}
 #else
 		bnx2x_panic();
 #endif
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index d8b1824c334d..7e1d38a2c7ec 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -1018,6 +1018,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 	if (fp->mode == TPA_MODE_DISABLED)
 		return;
 
+	if (!fp->page_pool.page)
+		return;
+
 	for (i = 0; i < last; i++)
 		bnx2x_free_rx_sge(bp, fp, i);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH] bnx2x: Fix error recovering in switch configuration
  2022-08-25 20:00 [PATCH] bnx2x: Fix error recovering in switch configuration Thinh Tran
@ 2022-08-27  1:44 ` Jakub Kicinski
  2022-08-30 16:15   ` Thinh Tran
  2022-09-16 19:51   ` [PATCH v2] " Thinh Tran
  2022-08-30  9:19 ` [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration Manish Chopra
  1 sibling, 2 replies; 36+ messages in thread
From: Jakub Kicinski @ 2022-08-27  1:44 UTC (permalink / raw)
  To: Thinh Tran; +Cc: netdev, aelior, davem, manishc, skalluru

On Thu, 25 Aug 2022 20:00:29 +0000 Thinh Tran wrote:
> As the BCM57810 and other I/O adapters are connected
> through a PCIe switch, the bnx2x driver causes unexpected
> system hang/crash while handling PCIe switch errors, if 
> its error handler is called after other drivers' handlers.
> 
> In this case, after numbers of bnx2x_tx_timout(), the
> bnx2x_nic_unload() is  called, frees up resources and
> calls bnx2x_napi_disable(). Then when EEH calls its
> error handler, the bnx2x_io_error_detected() and
> bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
> and freeing the resources.
> 
> This patch will:
> - reduce the numbers of bnx2x_panic_dump() while in
>   bnx2x_tx_timeout(), avoid filling up dmesg buffer.
> - use checking new napi_enable flags to prevent calling 
>   disable again which causing system hangs.
> - cheking if fp->page_pool already freed avoid system
>   crash.

> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> index 712b5595bc39..bb8d91f44642 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> @@ -1860,37 +1860,49 @@ static int bnx2x_setup_irqs(struct bnx2x *bp)
>  static void bnx2x_napi_enable_cnic(struct bnx2x *bp)
>  {
>  	int i;
> +	if (bp->cnic_napi_enable)

empty line between variables and code, pls

> +		return;
>  
>  	for_each_rx_queue_cnic(bp, i) {
>  		napi_enable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->cnic_napi_enable = true;

The concept of not calling napi_enable() / disable()
feels a little wrong. It's the state of the driver,
not the NAPI that's the problem so perhaps you could
a appropriately named bool for that (IDK, maybe 
nic_stopped) and prevent coming into the NAPI handling
functions completely?

Is all other code in the driver on the path in question 
really idempotent?

>  }
>  
>  static void bnx2x_napi_enable(struct bnx2x *bp)
>  {
>  	int i;
> +	if (bp->napi_enable)
> +		return;
>  
>  	for_each_eth_queue(bp, i) {
>  		napi_enable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->napi_enable = true;
>  }
>  
>  static void bnx2x_napi_disable_cnic(struct bnx2x *bp)
>  {
>  	int i;
> +	if (!bp->cnic_napi_enable)
> +		return;
>  
>  	for_each_rx_queue_cnic(bp, i) {
>  		napi_disable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->cnic_napi_enable = false;
>  }
>  
>  static void bnx2x_napi_disable(struct bnx2x *bp)
>  {
>  	int i;
> +	if (!bp->napi_enable)
> +		return;
>  
>  	for_each_eth_queue(bp, i) {
>  		napi_disable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->napi_enable = false;
>  }
>  
>  void bnx2x_netif_start(struct bnx2x *bp)
> @@ -2554,6 +2566,7 @@ int bnx2x_load_cnic(struct bnx2x *bp)
>  	}
>  
>  	/* Add all CNIC NAPI objects */
> +	bp->cnic_napi_enable = false;
>  	bnx2x_add_all_napi_cnic(bp);
>  	DP(NETIF_MSG_IFUP, "cnic napi added\n");
>  	bnx2x_napi_enable_cnic(bp);
> @@ -2701,7 +2714,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
>  	 */
>  	bnx2x_setup_tc(bp->dev, bp->max_cos);
>  
> +	bp->tx_timeout_cnt = 0;
>  	/* Add all NAPI objects */
> +	bp->napi_enable = false;
>  	bnx2x_add_all_napi(bp);
>  	DP(NETIF_MSG_IFUP, "napi added\n");
>  	bnx2x_napi_enable(bp);
> @@ -4982,7 +4997,14 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
>  	 */
>  	if (!bp->panic)
>  #ifndef BNX2X_STOP_ON_ERROR
> -		bnx2x_panic_dump(bp, false);
> +	{
> +		if (++bp->tx_timeout_cnt > 3) {
> +			bnx2x_panic_dump(bp, false);
> +			bp->tx_timeout_cnt = 0;
> +		} else {
> +			netdev_err(bp->dev, "TX timeout %d times\n", bp->tx_timeout_cnt);
> +		}
> +	}

Please put this code in a helper function so that the oddly looking
brackets are not needed.

>  #else
>  		bnx2x_panic();
>  #endif
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
> index d8b1824c334d..7e1d38a2c7ec 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
> @@ -1018,6 +1018,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
>  	if (fp->mode == TPA_MODE_DISABLED)
>  		return;
>  
> +	if (!fp->page_pool.page)
> +		return;

See, another thing that's not idempotent. Better to bail higher up,
in the callee.

>  	for (i = 0; i < last; i++)
>  		bnx2x_free_rx_sge(bp, fp, i);
>  


^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration
  2022-08-25 20:00 [PATCH] bnx2x: Fix error recovering in switch configuration Thinh Tran
  2022-08-27  1:44 ` Jakub Kicinski
@ 2022-08-30  9:19 ` Manish Chopra
  2022-08-30 21:01   ` Thinh Tran
  1 sibling, 1 reply; 36+ messages in thread
From: Manish Chopra @ 2022-08-30  9:19 UTC (permalink / raw)
  To: Thinh Tran, kuba
  Cc: netdev, Ariel Elior, davem, Sudarsana Reddy Kalluru, Alok Prasad

> -----Original Message-----
> From: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> Sent: Friday, August 26, 2022 1:30 AM
> To: kuba@kernel.org
> Cc: netdev@vger.kernel.org; Ariel Elior <aelior@marvell.com>;
> davem@davemloft.net; Manish Chopra <manishc@marvell.com>; Sudarsana
> Reddy Kalluru <skalluru@marvell.com>; Thinh Tran
> <thinhtr@linux.vnet.ibm.com>
> Subject: [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration
> 
> External Email
> 
> ----------------------------------------------------------------------
> As the BCM57810 and other I/O adapters are connected through a PCIe
> switch, the bnx2x driver causes unexpected system hang/crash while handling
> PCIe switch errors, if its error handler is called after other drivers' handlers.
> 
> In this case, after numbers of bnx2x_tx_timout(), the
> bnx2x_nic_unload() is  called, frees up resources and calls
> bnx2x_napi_disable(). Then when EEH calls its error handler, the
> bnx2x_io_error_detected() and
> bnx2x_io_slot_reset() also calling bnx2x_napi_disable() and freeing the
> resources.
> 
> This patch will:
> - reduce the numbers of bnx2x_panic_dump() while in
>   bnx2x_tx_timeout(), avoid filling up dmesg buffer.
> - use checking new napi_enable flags to prevent calling
>   disable again which causing system hangs.
> - cheking if fp->page_pool already freed avoid system
>   crash.
> 
> Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> ---
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  4 ++++
>  .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 24 ++++++++++++++++++-
>  .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 +++
>  3 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> index dd5945c4bfec..7fa23d47907a 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> @@ -1509,6 +1509,10 @@ struct bnx2x {
>  	bool			cnic_loaded;
>  	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
> 
> +	bool			napi_enable;
> +	bool			cnic_napi_enable;
> +	int			tx_timeout_cnt;
> +
>  	/* Flag that indicates that we can start looking for FCoE L2 queue
>  	 * completions in the default status block.
>  	 */
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> index 712b5595bc39..bb8d91f44642 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> @@ -1860,37 +1860,49 @@ static int bnx2x_setup_irqs(struct bnx2x *bp)
> static void bnx2x_napi_enable_cnic(struct bnx2x *bp)  {
>  	int i;
> +	if (bp->cnic_napi_enable)
> +		return;
> 
>  	for_each_rx_queue_cnic(bp, i) {
>  		napi_enable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->cnic_napi_enable = true;
>  }
> 
>  static void bnx2x_napi_enable(struct bnx2x *bp)  {
>  	int i;
> +	if (bp->napi_enable)
> +		return;
> 
>  	for_each_eth_queue(bp, i) {
>  		napi_enable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->napi_enable = true;
>  }
> 
>  static void bnx2x_napi_disable_cnic(struct bnx2x *bp)  {
>  	int i;
> +	if (!bp->cnic_napi_enable)
> +		return;
> 
>  	for_each_rx_queue_cnic(bp, i) {
>  		napi_disable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->cnic_napi_enable = false;
>  }
> 
>  static void bnx2x_napi_disable(struct bnx2x *bp)  {
>  	int i;
> +	if (!bp->napi_enable)
> +		return;
> 
>  	for_each_eth_queue(bp, i) {
>  		napi_disable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->napi_enable = false;
>  }
> 
>  void bnx2x_netif_start(struct bnx2x *bp) @@ -2554,6 +2566,7 @@ int
> bnx2x_load_cnic(struct bnx2x *bp)
>  	}
> 
>  	/* Add all CNIC NAPI objects */
> +	bp->cnic_napi_enable = false;
>  	bnx2x_add_all_napi_cnic(bp);
>  	DP(NETIF_MSG_IFUP, "cnic napi added\n");
>  	bnx2x_napi_enable_cnic(bp);
> @@ -2701,7 +2714,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int
> load_mode)
>  	 */
>  	bnx2x_setup_tc(bp->dev, bp->max_cos);
> 
> +	bp->tx_timeout_cnt = 0;
>  	/* Add all NAPI objects */
> +	bp->napi_enable = false;
>  	bnx2x_add_all_napi(bp);
>  	DP(NETIF_MSG_IFUP, "napi added\n");
>  	bnx2x_napi_enable(bp);
> @@ -4982,7 +4997,14 @@ void bnx2x_tx_timeout(struct net_device *dev,
> unsigned int txqueue)
>  	 */
>  	if (!bp->panic)
>  #ifndef BNX2X_STOP_ON_ERROR
> -		bnx2x_panic_dump(bp, false);
> +	{
> +		if (++bp->tx_timeout_cnt > 3) {
> +			bnx2x_panic_dump(bp, false);
> +			bp->tx_timeout_cnt = 0;
> +		} else {
> +			netdev_err(bp->dev, "TX timeout %d times\n", bp-
> >tx_timeout_cnt);
> +		}
> +	

Hello Trinh,
bnx2x_panic_dump() dumps quite of some useful debug (fastpath, hw related) info to look at in the logs,
I think they should be dumped at very first occurrence of tx timeout rather than waiting on next few tx timeout
event (which may not even occur in some cases). One another way to prevent it could be by turning off the
carrier (netif_carrier_off()) at very first place in bnx2x_tx_timeout() to prevent such repeated occurrences of tx timeout on the netdev.

Thanks,
Manish

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] bnx2x: Fix error recovering in switch configuration
  2022-08-27  1:44 ` Jakub Kicinski
@ 2022-08-30 16:15   ` Thinh Tran
  2022-09-16 19:51   ` [PATCH v2] " Thinh Tran
  1 sibling, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2022-08-30 16:15 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, aelior, davem, manishc, skalluru

Thanks for reviewing it.

On 8/26/2022 8:44 PM, Jakub Kicinski wrote:
> On Thu, 25 Aug 2022 20:00:29 +0000 Thinh Tran wrote:
>> As the BCM57810 and other I/O adapters are connected
>> through a PCIe switch, the bnx2x driver causes unexpected
>> system hang/crash while handling PCIe switch errors, if
>> its error handler is called after other drivers' handlers.
>>
>> In this case, after numbers of bnx2x_tx_timout(), the
>> bnx2x_nic_unload() is  called, frees up resources and
>> calls bnx2x_napi_disable(). Then when EEH calls its
>> error handler, the bnx2x_io_error_detected() and
>> bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
>> and freeing the resources.
>>
>> This patch will:
>> - reduce the numbers of bnx2x_panic_dump() while in
>>    bnx2x_tx_timeout(), avoid filling up dmesg buffer.
>> - use checking new napi_enable flags to prevent calling
>>    disable again which causing system hangs.
>> - cheking if fp->page_pool already freed avoid system
>>    crash.
> 
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> index 712b5595bc39..bb8d91f44642 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> @@ -1860,37 +1860,49 @@ static int bnx2x_setup_irqs(struct bnx2x *bp)
>>   static void bnx2x_napi_enable_cnic(struct bnx2x *bp)
>>   {
>>   	int i;
>> +	if (bp->cnic_napi_enable)
> 
> empty line between variables and code, pls
> 
Will correct it.

>> +		return;
>>   
>>   	for_each_rx_queue_cnic(bp, i) {
>>   		napi_enable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->cnic_napi_enable = true;
> 
> The concept of not calling napi_enable() / disable()
> feels a little wrong. It's the state of the driver,
> not the NAPI that's the problem so perhaps you could
> a appropriately named bool for that (IDK, maybe
> nic_stopped) and prevent coming into the NAPI handling
> functions completely?
> > Is all other code in the driver on the path in question
> really idempotent?
> 

For the states of the driver, it already has bnx2x_netif_start()
and bnx2x_netif_stop() to handle NAPI functions.
But the bnx2x_nic_load() directly calls bnx2x_napi_enable() and 
_disable() in case of errors, and is being called from various functions 
while the driver in different states.

I will work with the suggestion.


>>   }
>>   
>>   static void bnx2x_napi_enable(struct bnx2x *bp)
>>   {
>>   	int i;
>> +	if (bp->napi_enable)
>> +		return;
>>   
>>   	for_each_eth_queue(bp, i) {
>>   		napi_enable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->napi_enable = true;
>>   }
>>   
>>   static void bnx2x_napi_disable_cnic(struct bnx2x *bp)
>>   {
>>   	int i;
>> +	if (!bp->cnic_napi_enable)
>> +		return;
>>   
>>   	for_each_rx_queue_cnic(bp, i) {
>>   		napi_disable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->cnic_napi_enable = false;
>>   }
>>   
>>   static void bnx2x_napi_disable(struct bnx2x *bp)
>>   {
>>   	int i;
>> +	if (!bp->napi_enable)
>> +		return;
>>   
>>   	for_each_eth_queue(bp, i) {
>>   		napi_disable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->napi_enable = false;
>>   }
>>   
>>   void bnx2x_netif_start(struct bnx2x *bp)
>> @@ -2554,6 +2566,7 @@ int bnx2x_load_cnic(struct bnx2x *bp)
>>   	}
>>   
>>   	/* Add all CNIC NAPI objects */
>> +	bp->cnic_napi_enable = false;
>>   	bnx2x_add_all_napi_cnic(bp);
>>   	DP(NETIF_MSG_IFUP, "cnic napi added\n");
>>   	bnx2x_napi_enable_cnic(bp);
>> @@ -2701,7 +2714,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
>>   	 */
>>   	bnx2x_setup_tc(bp->dev, bp->max_cos);
>>   
>> +	bp->tx_timeout_cnt = 0;
>>   	/* Add all NAPI objects */
>> +	bp->napi_enable = false;
>>   	bnx2x_add_all_napi(bp);
>>   	DP(NETIF_MSG_IFUP, "napi added\n");
>>   	bnx2x_napi_enable(bp);
>> @@ -4982,7 +4997,14 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
>>   	 */
>>   	if (!bp->panic)
>>   #ifndef BNX2X_STOP_ON_ERROR
>> -		bnx2x_panic_dump(bp, false);
>> +	{
>> +		if (++bp->tx_timeout_cnt > 3) {
>> +			bnx2x_panic_dump(bp, false);
>> +			bp->tx_timeout_cnt = 0;
>> +		} else {
>> +			netdev_err(bp->dev, "TX timeout %d times\n", bp->tx_timeout_cnt);
>> +		}
>> +	}
> 
> Please put this code in a helper function so that the oddly looking
> brackets are not needed.
> 
>>   #else
>>   		bnx2x_panic();
>>   #endif
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
>> index d8b1824c334d..7e1d38a2c7ec 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
>> @@ -1018,6 +1018,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
>>   	if (fp->mode == TPA_MODE_DISABLED)
>>   		return;
>>   
>> +	if (!fp->page_pool.page)
>> +		return;
> 
> See, another thing that's not idempotent. Better to bail higher up,
> in the callee.
>

Will correct it.

>>   	for (i = 0; i < last; i++)
>>   		bnx2x_free_rx_sge(bp, fp, i);
>>   
> 


Thinh Tran

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration
  2022-08-30  9:19 ` [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration Manish Chopra
@ 2022-08-30 21:01   ` Thinh Tran
  0 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2022-08-30 21:01 UTC (permalink / raw)
  To: Manish Chopra, kuba
  Cc: netdev, Ariel Elior, davem, Sudarsana Reddy Kalluru, Alok Prasad



On 8/30/2022 4:19 AM, Manish Chopra wrote:
>> -----Original Message-----
>> From: Thinh Tran <thinhtr@linux.vnet.ibm.com>
>> Sent: Friday, August 26, 2022 1:30 AM
>> To: kuba@kernel.org
>> Cc: netdev@vger.kernel.org; Ariel Elior <aelior@marvell.com>;
>> davem@davemloft.net; Manish Chopra <manishc@marvell.com>; Sudarsana
>> Reddy Kalluru <skalluru@marvell.com>; Thinh Tran
>> <thinhtr@linux.vnet.ibm.com>
>> Subject: [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> As the BCM57810 and other I/O adapters are connected through a PCIe
>> switch, the bnx2x driver causes unexpected system hang/crash while handling
>> PCIe switch errors, if its error handler is called after other drivers' handlers.
>>
>> In this case, after numbers of bnx2x_tx_timout(), the
>> bnx2x_nic_unload() is  called, frees up resources and calls
>> bnx2x_napi_disable(). Then when EEH calls its error handler, the
>> bnx2x_io_error_detected() and
>> bnx2x_io_slot_reset() also calling bnx2x_napi_disable() and freeing the
>> resources.
>>
>> This patch will:
>> - reduce the numbers of bnx2x_panic_dump() while in
>>    bnx2x_tx_timeout(), avoid filling up dmesg buffer.
>> - use checking new napi_enable flags to prevent calling
>>    disable again which causing system hangs.
>> - cheking if fp->page_pool already freed avoid system
>>    crash.
>>
>> Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
>> ---
>>   drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  4 ++++
>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 24 ++++++++++++++++++-
>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 +++
>>   3 files changed, 30 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> index dd5945c4bfec..7fa23d47907a 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> @@ -1509,6 +1509,10 @@ struct bnx2x {
>>   	bool			cnic_loaded;
>>   	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
>>
>> +	bool			napi_enable;
>> +	bool			cnic_napi_enable;
>> +	int			tx_timeout_cnt;
>> +
>>   	/* Flag that indicates that we can start looking for FCoE L2 queue
>>   	 * completions in the default status block.
>>   	 */
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> index 712b5595bc39..bb8d91f44642 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> @@ -1860,37 +1860,49 @@ static int bnx2x_setup_irqs(struct bnx2x *bp)
>> static void bnx2x_napi_enable_cnic(struct bnx2x *bp)  {
>>   	int i;
>> +	if (bp->cnic_napi_enable)
>> +		return;
>>
>>   	for_each_rx_queue_cnic(bp, i) {
>>   		napi_enable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->cnic_napi_enable = true;
>>   }
>>
>>   static void bnx2x_napi_enable(struct bnx2x *bp)  {
>>   	int i;
>> +	if (bp->napi_enable)
>> +		return;
>>
>>   	for_each_eth_queue(bp, i) {
>>   		napi_enable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->napi_enable = true;
>>   }
>>
>>   static void bnx2x_napi_disable_cnic(struct bnx2x *bp)  {
>>   	int i;
>> +	if (!bp->cnic_napi_enable)
>> +		return;
>>
>>   	for_each_rx_queue_cnic(bp, i) {
>>   		napi_disable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->cnic_napi_enable = false;
>>   }
>>
>>   static void bnx2x_napi_disable(struct bnx2x *bp)  {
>>   	int i;
>> +	if (!bp->napi_enable)
>> +		return;
>>
>>   	for_each_eth_queue(bp, i) {
>>   		napi_disable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->napi_enable = false;
>>   }
>>
>>   void bnx2x_netif_start(struct bnx2x *bp) @@ -2554,6 +2566,7 @@ int
>> bnx2x_load_cnic(struct bnx2x *bp)
>>   	}
>>
>>   	/* Add all CNIC NAPI objects */
>> +	bp->cnic_napi_enable = false;
>>   	bnx2x_add_all_napi_cnic(bp);
>>   	DP(NETIF_MSG_IFUP, "cnic napi added\n");
>>   	bnx2x_napi_enable_cnic(bp);
>> @@ -2701,7 +2714,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int
>> load_mode)
>>   	 */
>>   	bnx2x_setup_tc(bp->dev, bp->max_cos);
>>
>> +	bp->tx_timeout_cnt = 0;
>>   	/* Add all NAPI objects */
>> +	bp->napi_enable = false;
>>   	bnx2x_add_all_napi(bp);
>>   	DP(NETIF_MSG_IFUP, "napi added\n");
>>   	bnx2x_napi_enable(bp);
>> @@ -4982,7 +4997,14 @@ void bnx2x_tx_timeout(struct net_device *dev,
>> unsigned int txqueue)
>>   	 */
>>   	if (!bp->panic)
>>   #ifndef BNX2X_STOP_ON_ERROR
>> -		bnx2x_panic_dump(bp, false);
>> +	{
>> +		if (++bp->tx_timeout_cnt > 3) {
>> +			bnx2x_panic_dump(bp, false);
>> +			bp->tx_timeout_cnt = 0;
>> +		} else {
>> +			netdev_err(bp->dev, "TX timeout %d times\n", bp-
>>> tx_timeout_cnt);
>> +		}
>> +	
> 
> Hello Trinh,
> bnx2x_panic_dump() dumps quite of some useful debug (fastpath, hw related) info to look at in the logs,
> I think they should be dumped at very first occurrence of tx timeout rather than waiting on next few tx timeout
> event (which may not even occur in some cases). One another way to prevent it could be by turning off the
> carrier (netif_carrier_off()) at very first place in bnx2x_tx_timeout() to prevent such repeated occurrences of tx timeout on the netdev.
> 
> Thanks,
> Manish

will do

Thanks,
Thinh Tran

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v2] bnx2x: Fix error recovering in switch configuration
  2022-08-27  1:44 ` Jakub Kicinski
  2022-08-30 16:15   ` Thinh Tran
@ 2022-09-16 19:51   ` Thinh Tran
  2022-09-19 13:53     ` [EXT] " Manish Chopra
                       ` (2 more replies)
  1 sibling, 3 replies; 36+ messages in thread
From: Thinh Tran @ 2022-09-16 19:51 UTC (permalink / raw)
  To: kuba
  Cc: netdev, aelior, davem, manishc, skalluru, pabeni, edumazet, Thinh Tran

As the BCM57810 and other I/O adapters are connected
through a PCIe switch, the bnx2x driver causes unexpected
system hang/crash while handling PCIe switch errors, if
its error handler is called after other drivers' handlers.

In this case, after numbers of bnx2x_tx_timout(), the
bnx2x_nic_unload() is  called, frees up resources and
calls bnx2x_napi_disable(). Then when EEH calls its
error handler, the bnx2x_io_error_detected() and
bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
and freeing the resources.

Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>

 v2:
   - Check the state of the NIC before calling disable nappi
     and freeing the IRQ
   - Prevent recurrence of TX timeout by turning off the carrier,
     calling netif_carrier_off() in bnx2x_tx_timeout()
   - Check and bail out early if fp->page_pool already freed
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 +
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 27 +++++++++----
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 ++
 .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 38 +++++++++----------
 .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 17 +++++----
 5 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index dd5945c4bfec..11280f0eb75b 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1509,6 +1509,8 @@ struct bnx2x {
 	bool			cnic_loaded;
 	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
 
+	bool			nic_stopped;
+
 	/* Flag that indicates that we can start looking for FCoE L2 queue
 	 * completions in the default status block.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 712b5595bc39..7ba53ce1d09e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -2705,6 +2705,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 	bnx2x_add_all_napi(bp);
 	DP(NETIF_MSG_IFUP, "napi added\n");
 	bnx2x_napi_enable(bp);
+	bp->nic_stopped = false;
 
 	if (IS_PF(bp)) {
 		/* set pf load just before approaching the MCP */
@@ -2950,6 +2951,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 load_error1:
 	bnx2x_napi_disable(bp);
 	bnx2x_del_all_napi(bp);
+	bp->nic_stopped = true;
 
 	/* clear pf_load status, as it was already set */
 	if (IS_PF(bp))
@@ -3085,14 +3087,17 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
 		if (!CHIP_IS_E1x(bp))
 			bnx2x_pf_disable(bp);
 
-		/* Disable HW interrupts, NAPI */
-		bnx2x_netif_stop(bp, 1);
-		/* Delete all NAPI objects */
-		bnx2x_del_all_napi(bp);
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
-		/* Release IRQs */
-		bnx2x_free_irq(bp);
+		if (!bp->nic_stopped) {
+			/* Disable HW interrupts, NAPI */
+			bnx2x_netif_stop(bp, 1);
+			/* Delete all NAPI objects */
+			bnx2x_del_all_napi(bp);
+			if (CNIC_LOADED(bp))
+				bnx2x_del_all_napi_cnic(bp);
+			/* Release IRQs */
+			bnx2x_free_irq(bp);
+			bp->nic_stopped = true;
+		}
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, false);
@@ -4977,6 +4982,12 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
+	/* Immediately indicate link as down */
+	bp->link_vars.link_up = 0;
+	bp->force_link_down = true;
+	netif_carrier_off(dev);
+	BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
+
 	/* We want the information of the dump logged,
 	 * but calling bnx2x_panic() would kill all chances of recovery.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index d8b1824c334d..bc0dee25b804 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -1015,6 +1015,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 {
 	int i;
 
+	if (!fp->page_pool.page)
+		return;
+
 	if (fp->mode == TPA_MODE_DISABLED)
 		return;
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 962253db25b8..3129a3372f8b 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -9475,15 +9475,18 @@ void bnx2x_chip_cleanup(struct bnx2x *bp, int unload_mode, bool keep_link)
 		}
 	}
 
-	/* Disable HW interrupts, NAPI */
-	bnx2x_netif_stop(bp, 1);
-	/* Delete all NAPI objects */
-	bnx2x_del_all_napi(bp);
-	if (CNIC_LOADED(bp))
-		bnx2x_del_all_napi_cnic(bp);
+	if (!bp->nic_stopped) {
+		/* Disable HW interrupts, NAPI */
+		bnx2x_netif_stop(bp, 1);
+		/* Delete all NAPI objects */
+		bnx2x_del_all_napi(bp);
+		if (CNIC_LOADED(bp))
+			bnx2x_del_all_napi_cnic(bp);
 
-	/* Release IRQs */
-	bnx2x_free_irq(bp);
+		/* Release IRQs */
+		bnx2x_free_irq(bp);
+		bp->nic_stopped = true;
+	}
 
 	/* Reset the chip, unless PCI function is offline. If we reach this
 	 * point following a PCI error handling, it means device is really
@@ -10274,12 +10277,6 @@ static void bnx2x_sp_rtnl_task(struct work_struct *work)
 		bp->sp_rtnl_state = 0;
 		smp_mb();
 
-		/* Immediately indicate link as down */
-		bp->link_vars.link_up = 0;
-		bp->force_link_down = true;
-		netif_carrier_off(bp->dev);
-		BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
-
 		bnx2x_nic_unload(bp, UNLOAD_NORMAL, true);
 		/* When ret value shows failure of allocation failure,
 		 * the nic is rebooted again. If open still fails, a error
@@ -14256,13 +14253,16 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
 		}
 		bnx2x_drain_tx_queues(bp);
 		bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
-		bnx2x_netif_stop(bp, 1);
-		bnx2x_del_all_napi(bp);
+		if (!bp->nic_stopped) {
+			bnx2x_netif_stop(bp, 1);
+			bnx2x_del_all_napi(bp);
 
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
+			if (CNIC_LOADED(bp))
+				bnx2x_del_all_napi_cnic(bp);
 
-		bnx2x_free_irq(bp);
+			bnx2x_free_irq(bp);
+			bp->nic_stopped = true;
+		}
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, true);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index c9129b9ba446..1f94f7987470 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -529,13 +529,16 @@ void bnx2x_vfpf_close_vf(struct bnx2x *bp)
 	bnx2x_vfpf_finalize(bp, &req->first_tlv);
 
 free_irq:
-	/* Disable HW interrupts, NAPI */
-	bnx2x_netif_stop(bp, 0);
-	/* Delete all NAPI objects */
-	bnx2x_del_all_napi(bp);
-
-	/* Release IRQs */
-	bnx2x_free_irq(bp);
+	if (!bp->nic_stopped) {
+		/* Disable HW interrupts, NAPI */
+		bnx2x_netif_stop(bp, 0);
+		/* Delete all NAPI objects */
+		bnx2x_del_all_napi(bp);
+
+		/* Release IRQs */
+		bnx2x_free_irq(bp);
+		bp->nic_stopped = true;
+	}
 }
 
 static void bnx2x_leading_vfq_init(struct bnx2x *bp, struct bnx2x_virtf *vf,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* RE: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch configuration
  2022-09-16 19:51   ` [PATCH v2] " Thinh Tran
@ 2022-09-19 13:53     ` Manish Chopra
  2022-09-21 20:11       ` Thinh Tran
  2023-07-19 22:02     ` [Patch v3] " Thinh Tran
  2023-07-28 21:11     ` [PATCH v4] " Thinh Tran
  2 siblings, 1 reply; 36+ messages in thread
From: Manish Chopra @ 2022-09-19 13:53 UTC (permalink / raw)
  To: Thinh Tran, kuba
  Cc: netdev, Ariel Elior, davem, Sudarsana Reddy Kalluru, pabeni,
	edumazet, Alok Prasad

Hello Thinh,

> -----Original Message-----
> From: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> Sent: Saturday, September 17, 2022 1:21 AM
> To: kuba@kernel.org
> Cc: netdev@vger.kernel.org; Ariel Elior <aelior@marvell.com>;
> davem@davemloft.net; Manish Chopra <manishc@marvell.com>; Sudarsana
> Reddy Kalluru <skalluru@marvell.com>; pabeni@redhat.com;
> edumazet@google.com; Thinh Tran <thinhtr@linux.vnet.ibm.com>
> Subject: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch configuration
> 
> External Email
> 
> ----------------------------------------------------------------------
> As the BCM57810 and other I/O adapters are connected through a PCIe
> switch, the bnx2x driver causes unexpected system hang/crash while handling
> PCIe switch errors, if its error handler is called after other drivers' handlers.
> 
> In this case, after numbers of bnx2x_tx_timout(), the
> bnx2x_nic_unload() is  called, frees up resources and calls
> bnx2x_napi_disable(). Then when EEH calls its error handler, the
> bnx2x_io_error_detected() and
> bnx2x_io_slot_reset() also calling bnx2x_napi_disable() and freeing the
> resources.
> 
> Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> 
>  v2:
>    - Check the state of the NIC before calling disable nappi
>      and freeing the IRQ
>    - Prevent recurrence of TX timeout by turning off the carrier,
>      calling netif_carrier_off() in bnx2x_tx_timeout()
>    - Check and bail out early if fp->page_pool already freed
> ---
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 +
>  .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 27 +++++++++----
>  .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 ++
>  .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 38 +++++++++----------
> .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 17 +++++----
>  5 files changed, 53 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> index dd5945c4bfec..11280f0eb75b 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> @@ -1509,6 +1509,8 @@ struct bnx2x {
>  	bool			cnic_loaded;
>  	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
> 
> +	bool			nic_stopped;
> +

There is an already 'state' variable which holds the NIC state whether it was opened or closed.
Perhaps we could use that easily in bnx2x_io_slot_reset() to prevent multiple NAPI disablement
instead of adding a newer one. Please see below for ex -

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 962253db25b8..c8e9b47208ed 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -14256,13 +14256,16 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
                }
                bnx2x_drain_tx_queues(bp);
                bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
-               bnx2x_netif_stop(bp, 1);
-               bnx2x_del_all_napi(bp);

-               if (CNIC_LOADED(bp))
-                       bnx2x_del_all_napi_cnic(bp);
+               if (bp->state == BNX2X_STATE_OPEN) {
+                       bnx2x_netif_stop(bp, 1);
+                       bnx2x_del_all_napi(bp);

-               bnx2x_free_irq(bp);
+                       if (CNIC_LOADED(bp))
+                               bnx2x_del_all_napi_cnic(bp);
+
+                       bnx2x_free_irq(bp);
+               }

                /* Report UNLOAD_DONE to MCP */
                bnx2x_send_unload_done(bp, true);

Thanks,
Manish

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch configuration
  2022-09-19 13:53     ` [EXT] " Manish Chopra
@ 2022-09-21 20:11       ` Thinh Tran
  2022-09-28 21:33         ` Thinh Tran
  0 siblings, 1 reply; 36+ messages in thread
From: Thinh Tran @ 2022-09-21 20:11 UTC (permalink / raw)
  To: Manish Chopra, kuba
  Cc: netdev, Ariel Elior, davem, Sudarsana Reddy Kalluru, pabeni,
	edumazet, Alok Prasad

Hi Manish,
Thanks for reviewing the patch.


On 9/19/2022 8:53 AM, Manish Chopra wrote:
> Hello Thinh,
> 
>> -----Original Message-----
>> From: Thinh Tran <thinhtr@linux.vnet.ibm.com>
>> Sent: Saturday, September 17, 2022 1:21 AM
>> To: kuba@kernel.org
>> Cc: netdev@vger.kernel.org; Ariel Elior <aelior@marvell.com>;
>> davem@davemloft.net; Manish Chopra <manishc@marvell.com>; Sudarsana
>> Reddy Kalluru <skalluru@marvell.com>; pabeni@redhat.com;
>> edumazet@google.com; Thinh Tran <thinhtr@linux.vnet.ibm.com>
>> Subject: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch configuration
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> As the BCM57810 and other I/O adapters are connected through a PCIe
>> switch, the bnx2x driver causes unexpected system hang/crash while handling
>> PCIe switch errors, if its error handler is called after other drivers' handlers.
>>
>> In this case, after numbers of bnx2x_tx_timout(), the
>> bnx2x_nic_unload() is  called, frees up resources and calls
>> bnx2x_napi_disable(). Then when EEH calls its error handler, the
>> bnx2x_io_error_detected() and
>> bnx2x_io_slot_reset() also calling bnx2x_napi_disable() and freeing the
>> resources.
>>
>> Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
>>
>>   v2:
>>     - Check the state of the NIC before calling disable nappi
>>       and freeing the IRQ
>>     - Prevent recurrence of TX timeout by turning off the carrier,
>>       calling netif_carrier_off() in bnx2x_tx_timeout()
>>     - Check and bail out early if fp->page_pool already freed
>> ---
>>   drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 +
>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 27 +++++++++----
>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 ++
>>   .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 38 +++++++++----------
>> .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 17 +++++----
>>   5 files changed, 53 insertions(+), 34 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> index dd5945c4bfec..11280f0eb75b 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> @@ -1509,6 +1509,8 @@ struct bnx2x {
>>   	bool			cnic_loaded;
>>   	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
>>
>> +	bool			nic_stopped;
>> +
> 
> There is an already 'state' variable which holds the NIC state whether it was opened or closed. > Perhaps we could use that easily in bnx2x_io_slot_reset() to prevent 
multiple NAPI disablement
> instead of adding a newer one. Please see below for ex -
> 
In my early debug, I did checked for bp->state variable but I was unsure 
what the NIC state was safe to disable the NAPI.
Thanks for the hint.

> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
> index 962253db25b8..c8e9b47208ed 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
> @@ -14256,13 +14256,16 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
>                  }
>                  bnx2x_drain_tx_queues(bp);
>                  bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
> -               bnx2x_netif_stop(bp, 1);
> -               bnx2x_del_all_napi(bp);
> 
> -               if (CNIC_LOADED(bp))
> -                       bnx2x_del_all_napi_cnic(bp);
> +               if (bp->state == BNX2X_STATE_OPEN) {
> +                       bnx2x_netif_stop(bp, 1);
> +                       bnx2x_del_all_napi(bp);
> 
> -               bnx2x_free_irq(bp);
> +                       if (CNIC_LOADED(bp))
> +                               bnx2x_del_all_napi_cnic(bp);
> +
> +                       bnx2x_free_irq(bp);
> +               }
> 
>                  /* Report UNLOAD_DONE to MCP */
>                  bnx2x_send_unload_done(bp, true);
> 
> Thanks,
> Manish

It requires a specific system setup, I will test with v3 patch when the 
system is available.

Thanks,
Thinh Tran

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch configuration
  2022-09-21 20:11       ` Thinh Tran
@ 2022-09-28 21:33         ` Thinh Tran
  2022-10-10 22:01           ` Thinh Tran
  0 siblings, 1 reply; 36+ messages in thread
From: Thinh Tran @ 2022-09-28 21:33 UTC (permalink / raw)
  To: Manish Chopra, kuba
  Cc: netdev, Ariel Elior, davem, Sudarsana Reddy Kalluru, pabeni,
	edumazet, Alok Prasad

Hi Manish,

I think we need the extra variable, nic_stopped, or perhaps defining a 
different state to determine the disablement of NAPI and freed IRQs.
Since device gets the error, the NIC state is set to 
BNX2X_STATE_CLOSING_WAIT4_HALT (0x4000)

14132 static int bnx2x_eeh_nic_unload(struct bnx2x *bp)
14133 {
14134         bp->state = BNX2X_STATE_CLOSING_WAIT4_HALT;
14135

and stays in this state in EEH handler and driver reset code paths, when 
both trying to disabling the NAPI.

On 9/21/2022 3:11 PM, Thinh Tran wrote:
> Hi Manish,
> Thanks for reviewing the patch.
> 
> 
> On 9/19/2022 8:53 AM, Manish Chopra wrote:
>> Hello Thinh,
>>
>>> -----Original Message-----
>>> From: Thinh Tran <thinhtr@linux.vnet.ibm.com>
>>> Sent: Saturday, September 17, 2022 1:21 AM
>>> To: kuba@kernel.org
>>> Cc: netdev@vger.kernel.org; Ariel Elior <aelior@marvell.com>;
>>> davem@davemloft.net; Manish Chopra <manishc@marvell.com>; Sudarsana
>>> Reddy Kalluru <skalluru@marvell.com>; pabeni@redhat.com;
>>> edumazet@google.com; Thinh Tran <thinhtr@linux.vnet.ibm.com>
>>> Subject: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch 
>>> configuration
>>>
>>> External Email
>>>
>>> ----------------------------------------------------------------------
>>> As the BCM57810 and other I/O adapters are connected through a PCIe
>>> switch, the bnx2x driver causes unexpected system hang/crash while 
>>> handling
>>> PCIe switch errors, if its error handler is called after other 
>>> drivers' handlers.
>>>
>>> In this case, after numbers of bnx2x_tx_timout(), the
>>> bnx2x_nic_unload() is  called, frees up resources and calls
>>> bnx2x_napi_disable(). Then when EEH calls its error handler, the
>>> bnx2x_io_error_detected() and
>>> bnx2x_io_slot_reset() also calling bnx2x_napi_disable() and freeing the
>>> resources.
>>>
>>> Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
>>>
>>>   v2:
>>>     - Check the state of the NIC before calling disable nappi
>>>       and freeing the IRQ
>>>     - Prevent recurrence of TX timeout by turning off the carrier,
>>>       calling netif_carrier_off() in bnx2x_tx_timeout()
>>>     - Check and bail out early if fp->page_pool already freed
>>> ---
>>>   drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 +
>>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 27 +++++++++----
>>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 ++
>>>   .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 38 +++++++++----------
>>> .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 17 +++++----
>>>   5 files changed, 53 insertions(+), 34 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>>> index dd5945c4bfec..11280f0eb75b 100644
>>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>>> @@ -1509,6 +1509,8 @@ struct bnx2x {
>>>       bool            cnic_loaded;
>>>       struct cnic_eth_dev    *(*cnic_probe)(struct net_device *);
>>>
>>> +    bool            nic_stopped;
>>> +
>>
>> There is an already 'state' variable which holds the NIC state whether 
>> it was opened or closed. > Perhaps we could use that easily in 
>> bnx2x_io_slot_reset() to prevent 
> multiple NAPI disablement
>> instead of adding a newer one. Please see below for ex -
>>
> In my early debug, I did checked for bp->state variable but I was unsure 
> what the NIC state was safe to disable the NAPI.
> Thanks for the hint.
> 
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c 
>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
>> index 962253db25b8..c8e9b47208ed 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
>> @@ -14256,13 +14256,16 @@ static pci_ers_result_t 
>> bnx2x_io_slot_reset(struct pci_dev *pdev)
>>                  }
>>                  bnx2x_drain_tx_queues(bp);
>>                  bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
>> -               bnx2x_netif_stop(bp, 1);
>> -               bnx2x_del_all_napi(bp);
>>
>> -               if (CNIC_LOADED(bp))
>> -                       bnx2x_del_all_napi_cnic(bp);
>> +               if (bp->state == BNX2X_STATE_OPEN) {
>> +                       bnx2x_netif_stop(bp, 1);
>> +                       bnx2x_del_all_napi(bp);
>>
>> -               bnx2x_free_irq(bp);
>> +                       if (CNIC_LOADED(bp))
>> +                               bnx2x_del_all_napi_cnic(bp);
>> +
>> +                       bnx2x_free_irq(bp);
>> +               }
>>
>>                  /* Report UNLOAD_DONE to MCP */
>>                  bnx2x_send_unload_done(bp, true);
>>
Using the v2 patch, I checked the NIC state at places before and after 
disabling the NAPI,
         BNX2X_ERR("before state flag = 0x%x \n", bp->state);
         if (!bp->nic_stopped) {
                 BNX2X_ERR("after state flag = 0x%x \n", bp->state);
		bnx2x_netif_stop(bp, 1);
		bnx2x_del_all_napi(bp);
                 ....

the NIC state is always BNX2X_STATE_CLOSING_WAIT4_HALT (0x4000) when 
device get error injection.

1- When EEH handler code path was called first, the 
bnx2x_io_slot_reset() does disablement the NAPI and done.
    [28197.100795] bnx2x: [bnx2x_io_slot_reset:14212(eth3)]IO slot reset 
initializing...
    [28197.100959] bnx2x 0201:50:00.0: enabling device (0140 -> 0142)
    [28197.107249] bnx2x: [bnx2x_io_slot_reset:14228(eth3)]IO slot reset 
--> driver unload
    [28199.106281] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout 
waiting for queue[1]: txdata->tx_pkt_prod(7270) != txdata->tx_pkt_cons(7269)
    [28201.106281] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout 
waiting for queue[3]: txdata->tx_pkt_prod(10977) != 
txdata->tx_pkt_cons(10969)
    [28203.106280] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout 
waiting for queue[4]: txdata->tx_pkt_prod(9532) != txdata->tx_pkt_cons(9514)
    [28205.106284] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout 
waiting for queue[6]: txdata->tx_pkt_prod(2359) != txdata->tx_pkt_cons(2331)
    [28205.314280] bnx2x: [bnx2x_io_slot_reset:14246(eth3)]before -- 
state flag = 0x4000   <---- check NIC state
    [28205.314282] bnx2x: [bnx2x_io_slot_reset:14248(eth3)]after -- 
state flag = 0x4000         <---- check NIC state and disable the NAPI
    [28205.405009] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports: 
'recovered'
........ snip ......
    [28213.525014] EEH: Finished:'slot_reset' with aggregate recovery 
state:'recovered'
    [28213.525016] EEH: Notify device driver to resume
    [28213.525017] EEH: Beginning: 'resume'
    [28213.525018] PCI 0201:50:00.0#500000: EEH: Invoking bnx2x->resume()
    [28225.115287] bnx2x 0201:50:00.0 eth3: using MSI-X  IRQs: sp 101 
fp[0] 103 ... fp[7] 110
    [28225.696425] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports: 
'none'


2- When the driver reset code path was called first, the 
bnx2x_chip_cleanup() does the disablement the NAPI, but NIC is resumed 
after the EEH handler code path was called.
    [13376.774278] bnx2x: [bnx2x_state_wait:312(eth3)]timeout waiting 
for state 2
     [13376.774280] bnx2x: [bnx2x_func_stop:9118(eth3)]FUNC_STOP ramrod 
failed. Running a dry transaction
    [13376.774339] bnx2x: [bnx2x_chip_cleanup:9467(eth3)]before state 
flag = 0x4000
    [13376.774341] bnx2x: [bnx2x_chip_cleanup:9469(eth3)]after state 
flag = 0x4000
    [13376.774344] bnx2x: [bnx2x_igu_int_disable:891(eth3)]BUG! Proper 
val not read from IGU!
    [13391.774280] bnx2x: [bnx2x_fw_command:3044(eth3)]FW failed to respond!
    ........ snip .....
    [13442.118385] PCI 0201:50:00.0#500000: EEH: Invoking 
bnx2x->slot_reset()
    [13442.118388] bnx2x: [bnx2x_io_slot_reset:14212(eth3)]IO slot reset 
initializing...
    [13442.118509] bnx2x 0201:50:00.0: enabling device (0140 -> 0142)
    [13442.124825] bnx2x: [bnx2x_io_slot_reset:14228(eth3)]IO slot reset 
--> driver unload
    [13442.154290] bnx2x: [bnx2x_io_slot_reset:14246(eth3)]before -- 
state flag = 0x4000    <--- check NIC state
    [13442.274291] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports: 
'recovered'
    ....... snip .........
    [13442.424290] EEH: Finished:'slot_reset' with aggregate recovery 
state:'recovered'
    [13442.424291] EEH: Notify device driver to resume
    [13442.424292] EEH: Beginning: 'resume'
    [13442.424293] PCI 0201:50:00.0#500000: EEH: Invoking bnx2x->resume()
    [13443.015280] bnx2x 0201:50:00.0 eth3: using MSI-X  IRQs: sp 61 
fp[0] 63 ... fp[7] 75
    [13443.616423] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports: 
'none'


>> Thanks,
>> Manish
> 
> It requires a specific system setup, I will test with v3 patch when the 
> system is available.
> 
> Thanks,
> Thinh Tran

Any suggestions?
Thanks,
Thinh Tran

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch configuration
  2022-09-28 21:33         ` Thinh Tran
@ 2022-10-10 22:01           ` Thinh Tran
  2022-10-17 11:14             ` Manish Chopra
  0 siblings, 1 reply; 36+ messages in thread
From: Thinh Tran @ 2022-10-10 22:01 UTC (permalink / raw)
  To: Manish Chopra
  Cc: netdev, Ariel Elior, davem, Sudarsana Reddy Kalluru, pabeni,
	edumazet, Alok Prasad, kuba

Hi Manish,
Do you have any other suggestion or using the extra variable, 
nic_stopped, in v2 patch is efficient enough to fix the issue?

Thanks,
Thinh Tran

On 9/28/2022 4:33 PM, Thinh Tran wrote:
> Hi Manish,
> 
> I think we need the extra variable, nic_stopped, or perhaps defining a 
> different state to determine the disablement of NAPI and freed IRQs.
> Since device gets the error, the NIC state is set to 
> BNX2X_STATE_CLOSING_WAIT4_HALT (0x4000)
> 
> 14132 static int bnx2x_eeh_nic_unload(struct bnx2x *bp)
> 14133 {
> 14134         bp->state = BNX2X_STATE_CLOSING_WAIT4_HALT;
> 14135
> 
> and stays in this state in EEH handler and driver reset code paths, when 
> both trying to disabling the NAPI.
> 
> On 9/21/2022 3:11 PM, Thinh Tran wrote:
>> Hi Manish,
>> Thanks for reviewing the patch.
>>
>>
>> On 9/19/2022 8:53 AM, Manish Chopra wrote:
>>> Hello Thinh,
>>>
>>>> -----Original Message-----
>>>> From: Thinh Tran <thinhtr@linux.vnet.ibm.com>
>>>> Sent: Saturday, September 17, 2022 1:21 AM
>>>> To: kuba@kernel.org
>>>> Cc: netdev@vger.kernel.org; Ariel Elior <aelior@marvell.com>;
>>>> davem@davemloft.net; Manish Chopra <manishc@marvell.com>; Sudarsana
>>>> Reddy Kalluru <skalluru@marvell.com>; pabeni@redhat.com;
>>>> edumazet@google.com; Thinh Tran <thinhtr@linux.vnet.ibm.com>
>>>> Subject: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch 
>>>> configuration
>>>>
>>>> External Email
>>>>
>>>> ----------------------------------------------------------------------
>>>> As the BCM57810 and other I/O adapters are connected through a PCIe
>>>> switch, the bnx2x driver causes unexpected system hang/crash while 
>>>> handling
>>>> PCIe switch errors, if its error handler is called after other 
>>>> drivers' handlers.
>>>>
>>>> In this case, after numbers of bnx2x_tx_timout(), the
>>>> bnx2x_nic_unload() is  called, frees up resources and calls
>>>> bnx2x_napi_disable(). Then when EEH calls its error handler, the
>>>> bnx2x_io_error_detected() and
>>>> bnx2x_io_slot_reset() also calling bnx2x_napi_disable() and freeing the
>>>> resources.
>>>>
>>>> Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
>>>>
>>>>   v2:
>>>>     - Check the state of the NIC before calling disable nappi
>>>>       and freeing the IRQ
>>>>     - Prevent recurrence of TX timeout by turning off the carrier,
>>>>       calling netif_carrier_off() in bnx2x_tx_timeout()
>>>>     - Check and bail out early if fp->page_pool already freed
>>>> ---
>>>>   drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 +
>>>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 27 +++++++++----
>>>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 ++
>>>>   .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 38 
>>>> +++++++++----------
>>>> .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 17 +++++----
>>>>   5 files changed, 53 insertions(+), 34 deletions(-)
>>>>
>>>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>>>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>>>> index dd5945c4bfec..11280f0eb75b 100644
>>>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>>>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>>>> @@ -1509,6 +1509,8 @@ struct bnx2x {
>>>>       bool            cnic_loaded;
>>>>       struct cnic_eth_dev    *(*cnic_probe)(struct net_device *);
>>>>
>>>> +    bool            nic_stopped;
>>>> +
>>>
>>> There is an already 'state' variable which holds the NIC state 
>>> whether it was opened or closed. > Perhaps we could use that easily 
>>> in bnx2x_io_slot_reset() to prevent 
>> multiple NAPI disablement
>>> instead of adding a newer one. Please see below for ex -
>>>
>> In my early debug, I did checked for bp->state variable but I was 
>> unsure what the NIC state was safe to disable the NAPI.
>> Thanks for the hint.
>>
>>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c 
>>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
>>> index 962253db25b8..c8e9b47208ed 100644
>>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
>>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
>>> @@ -14256,13 +14256,16 @@ static pci_ers_result_t 
>>> bnx2x_io_slot_reset(struct pci_dev *pdev)
>>>                  }
>>>                  bnx2x_drain_tx_queues(bp);
>>>                  bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
>>> -               bnx2x_netif_stop(bp, 1);
>>> -               bnx2x_del_all_napi(bp);
>>>
>>> -               if (CNIC_LOADED(bp))
>>> -                       bnx2x_del_all_napi_cnic(bp);
>>> +               if (bp->state == BNX2X_STATE_OPEN) {
>>> +                       bnx2x_netif_stop(bp, 1);
>>> +                       bnx2x_del_all_napi(bp);
>>>
>>> -               bnx2x_free_irq(bp);
>>> +                       if (CNIC_LOADED(bp))
>>> +                               bnx2x_del_all_napi_cnic(bp);
>>> +
>>> +                       bnx2x_free_irq(bp);
>>> +               }
>>>
>>>                  /* Report UNLOAD_DONE to MCP */
>>>                  bnx2x_send_unload_done(bp, true);
>>>
> Using the v2 patch, I checked the NIC state at places before and after 
> disabling the NAPI,
>          BNX2X_ERR("before state flag = 0x%x \n", bp->state);
>          if (!bp->nic_stopped) {
>                  BNX2X_ERR("after state flag = 0x%x \n", bp->state);
>          bnx2x_netif_stop(bp, 1);
>          bnx2x_del_all_napi(bp);
>                  ....
> 
> the NIC state is always BNX2X_STATE_CLOSING_WAIT4_HALT (0x4000) when 
> device get error injection.
> 
> 1- When EEH handler code path was called first, the 
> bnx2x_io_slot_reset() does disablement the NAPI and done.
>     [28197.100795] bnx2x: [bnx2x_io_slot_reset:14212(eth3)]IO slot reset 
> initializing...
>     [28197.100959] bnx2x 0201:50:00.0: enabling device (0140 -> 0142)
>     [28197.107249] bnx2x: [bnx2x_io_slot_reset:14228(eth3)]IO slot reset 
> --> driver unload
>     [28199.106281] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout 
> waiting for queue[1]: txdata->tx_pkt_prod(7270) != 
> txdata->tx_pkt_cons(7269)
>     [28201.106281] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout 
> waiting for queue[3]: txdata->tx_pkt_prod(10977) != 
> txdata->tx_pkt_cons(10969)
>     [28203.106280] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout 
> waiting for queue[4]: txdata->tx_pkt_prod(9532) != 
> txdata->tx_pkt_cons(9514)
>     [28205.106284] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout 
> waiting for queue[6]: txdata->tx_pkt_prod(2359) != 
> txdata->tx_pkt_cons(2331)
>     [28205.314280] bnx2x: [bnx2x_io_slot_reset:14246(eth3)]before -- 
> state flag = 0x4000   <---- check NIC state
>     [28205.314282] bnx2x: [bnx2x_io_slot_reset:14248(eth3)]after -- 
> state flag = 0x4000         <---- check NIC state and disable the NAPI
>     [28205.405009] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports: 
> 'recovered'
> ........ snip ......
>     [28213.525014] EEH: Finished:'slot_reset' with aggregate recovery 
> state:'recovered'
>     [28213.525016] EEH: Notify device driver to resume
>     [28213.525017] EEH: Beginning: 'resume'
>     [28213.525018] PCI 0201:50:00.0#500000: EEH: Invoking bnx2x->resume()
>     [28225.115287] bnx2x 0201:50:00.0 eth3: using MSI-X  IRQs: sp 101 
> fp[0] 103 ... fp[7] 110
>     [28225.696425] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports: 
> 'none'
> 
> 
> 2- When the driver reset code path was called first, the 
> bnx2x_chip_cleanup() does the disablement the NAPI, but NIC is resumed 
> after the EEH handler code path was called.
>     [13376.774278] bnx2x: [bnx2x_state_wait:312(eth3)]timeout waiting 
> for state 2
>      [13376.774280] bnx2x: [bnx2x_func_stop:9118(eth3)]FUNC_STOP ramrod 
> failed. Running a dry transaction
>     [13376.774339] bnx2x: [bnx2x_chip_cleanup:9467(eth3)]before state 
> flag = 0x4000
>     [13376.774341] bnx2x: [bnx2x_chip_cleanup:9469(eth3)]after state 
> flag = 0x4000
>     [13376.774344] bnx2x: [bnx2x_igu_int_disable:891(eth3)]BUG! Proper 
> val not read from IGU!
>     [13391.774280] bnx2x: [bnx2x_fw_command:3044(eth3)]FW failed to 
> respond!
>     ........ snip .....
>     [13442.118385] PCI 0201:50:00.0#500000: EEH: Invoking 
> bnx2x->slot_reset()
>     [13442.118388] bnx2x: [bnx2x_io_slot_reset:14212(eth3)]IO slot reset 
> initializing...
>     [13442.118509] bnx2x 0201:50:00.0: enabling device (0140 -> 0142)
>     [13442.124825] bnx2x: [bnx2x_io_slot_reset:14228(eth3)]IO slot reset 
> --> driver unload
>     [13442.154290] bnx2x: [bnx2x_io_slot_reset:14246(eth3)]before -- 
> state flag = 0x4000    <--- check NIC state
>     [13442.274291] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports: 
> 'recovered'
>     ....... snip .........
>     [13442.424290] EEH: Finished:'slot_reset' with aggregate recovery 
> state:'recovered'
>     [13442.424291] EEH: Notify device driver to resume
>     [13442.424292] EEH: Beginning: 'resume'
>     [13442.424293] PCI 0201:50:00.0#500000: EEH: Invoking bnx2x->resume()
>     [13443.015280] bnx2x 0201:50:00.0 eth3: using MSI-X  IRQs: sp 61 
> fp[0] 63 ... fp[7] 75
>     [13443.616423] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports: 
> 'none'
> 
> 
>>> Thanks,
>>> Manish
>>
>> It requires a specific system setup, I will test with v3 patch when 
>> the system is available.
>>
>> Thanks,
>> Thinh Tran
> 
> Any suggestions?
> Thanks,
> Thinh Tran

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch configuration
  2022-10-10 22:01           ` Thinh Tran
@ 2022-10-17 11:14             ` Manish Chopra
  2022-10-20 19:12               ` Thinh Tran
  0 siblings, 1 reply; 36+ messages in thread
From: Manish Chopra @ 2022-10-17 11:14 UTC (permalink / raw)
  To: Thinh Tran
  Cc: netdev, Ariel Elior, davem, Sudarsana Reddy Kalluru, pabeni,
	edumazet, Alok Prasad, kuba

Hello Thinh,

> -----Original Message-----
> From: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> Sent: Tuesday, October 11, 2022 3:31 AM
> To: Manish Chopra <manishc@marvell.com>
> Cc: netdev@vger.kernel.org; Ariel Elior <aelior@marvell.com>;
> davem@davemloft.net; Sudarsana Reddy Kalluru <skalluru@marvell.com>;
> pabeni@redhat.com; edumazet@google.com; Alok Prasad
> <palok@marvell.com>; kuba@kernel.org
> Subject: Re: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch
> configuration
> 
> Hi Manish,
> Do you have any other suggestion or using the extra variable, nic_stopped, in
> v2 patch is efficient enough to fix the issue?

I am fine with this if using existing 'state' var is not a suitable option here. Thanks.

> 
> Thanks,
> Thinh Tran
> 
> On 9/28/2022 4:33 PM, Thinh Tran wrote:
> > Hi Manish,
> >
> > I think we need the extra variable, nic_stopped, or perhaps defining a
> > different state to determine the disablement of NAPI and freed IRQs.
> > Since device gets the error, the NIC state is set to
> > BNX2X_STATE_CLOSING_WAIT4_HALT (0x4000)
> >
> > 14132 static int bnx2x_eeh_nic_unload(struct bnx2x *bp)
> > 14133 {
> > 14134         bp->state = BNX2X_STATE_CLOSING_WAIT4_HALT;
> > 14135
> >
> > and stays in this state in EEH handler and driver reset code paths,
> > when both trying to disabling the NAPI.
> >
> > On 9/21/2022 3:11 PM, Thinh Tran wrote:
> >> Hi Manish,
> >> Thanks for reviewing the patch.
> >>
> >>
> >> On 9/19/2022 8:53 AM, Manish Chopra wrote:
> >>> Hello Thinh,
> >>>
> >>>> -----Original Message-----
> >>>> From: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> >>>> Sent: Saturday, September 17, 2022 1:21 AM
> >>>> To: kuba@kernel.org
> >>>> Cc: netdev@vger.kernel.org; Ariel Elior <aelior@marvell.com>;
> >>>> davem@davemloft.net; Manish Chopra <manishc@marvell.com>;
> Sudarsana
> >>>> Reddy Kalluru <skalluru@marvell.com>; pabeni@redhat.com;
> >>>> edumazet@google.com; Thinh Tran <thinhtr@linux.vnet.ibm.com>
> >>>> Subject: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch
> >>>> configuration
> >>>>
> >>>> External Email
> >>>>
> >>>> -------------------------------------------------------------------
> >>>> --- As the BCM57810 and other I/O adapters are connected through a
> >>>> PCIe switch, the bnx2x driver causes unexpected system hang/crash
> >>>> while handling PCIe switch errors, if its error handler is called
> >>>> after other drivers' handlers.
> >>>>
> >>>> In this case, after numbers of bnx2x_tx_timout(), the
> >>>> bnx2x_nic_unload() is  called, frees up resources and calls
> >>>> bnx2x_napi_disable(). Then when EEH calls its error handler, the
> >>>> bnx2x_io_error_detected() and
> >>>> bnx2x_io_slot_reset() also calling bnx2x_napi_disable() and freeing
> >>>> the resources.
> >>>>
> >>>> Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> >>>>
> >>>>   v2:
> >>>>     - Check the state of the NIC before calling disable nappi
> >>>>       and freeing the IRQ
> >>>>     - Prevent recurrence of TX timeout by turning off the carrier,
> >>>>       calling netif_carrier_off() in bnx2x_tx_timeout()
> >>>>     - Check and bail out early if fp->page_pool already freed
> >>>> ---
> >>>>   drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 +
> >>>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 27 +++++++++----
> >>>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 ++
> >>>>   .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 38
> >>>> +++++++++----------
> >>>> .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 17 +++++----
> >>>>   5 files changed, 53 insertions(+), 34 deletions(-)
> >>>>
> >>>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> >>>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> >>>> index dd5945c4bfec..11280f0eb75b 100644
> >>>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> >>>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> >>>> @@ -1509,6 +1509,8 @@ struct bnx2x {
> >>>>       bool            cnic_loaded;
> >>>>       struct cnic_eth_dev    *(*cnic_probe)(struct net_device *);
> >>>>
> >>>> +    bool            nic_stopped;
> >>>> +
> >>>
> >>> There is an already 'state' variable which holds the NIC state
> >>> whether it was opened or closed. > Perhaps we could use that easily
> >>> in bnx2x_io_slot_reset() to prevent
> >> multiple NAPI disablement
> >>> instead of adding a newer one. Please see below for ex -
> >>>
> >> In my early debug, I did checked for bp->state variable but I was
> >> unsure what the NIC state was safe to disable the NAPI.
> >> Thanks for the hint.
> >>
> >>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
> >>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
> >>> index 962253db25b8..c8e9b47208ed 100644
> >>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
> >>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
> >>> @@ -14256,13 +14256,16 @@ static pci_ers_result_t
> >>> bnx2x_io_slot_reset(struct pci_dev *pdev)
> >>>                  }
> >>>                  bnx2x_drain_tx_queues(bp);
> >>>                  bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
> >>> -               bnx2x_netif_stop(bp, 1);
> >>> -               bnx2x_del_all_napi(bp);
> >>>
> >>> -               if (CNIC_LOADED(bp))
> >>> -                       bnx2x_del_all_napi_cnic(bp);
> >>> +               if (bp->state == BNX2X_STATE_OPEN) {
> >>> +                       bnx2x_netif_stop(bp, 1);
> >>> +                       bnx2x_del_all_napi(bp);
> >>>
> >>> -               bnx2x_free_irq(bp);
> >>> +                       if (CNIC_LOADED(bp))
> >>> +                               bnx2x_del_all_napi_cnic(bp);
> >>> +
> >>> +                       bnx2x_free_irq(bp);
> >>> +               }
> >>>
> >>>                  /* Report UNLOAD_DONE to MCP */
> >>>                  bnx2x_send_unload_done(bp, true);
> >>>
> > Using the v2 patch, I checked the NIC state at places before and after
> > disabling the NAPI,
> >          BNX2X_ERR("before state flag = 0x%x \n", bp->state);
> >          if (!bp->nic_stopped) {
> >                  BNX2X_ERR("after state flag = 0x%x \n", bp->state);
> >          bnx2x_netif_stop(bp, 1);
> >          bnx2x_del_all_napi(bp);
> >                  ....
> >
> > the NIC state is always BNX2X_STATE_CLOSING_WAIT4_HALT (0x4000)
> when
> > device get error injection.
> >
> > 1- When EEH handler code path was called first, the
> > bnx2x_io_slot_reset() does disablement the NAPI and done.
> >     [28197.100795] bnx2x: [bnx2x_io_slot_reset:14212(eth3)]IO slot
> > reset initializing...
> >     [28197.100959] bnx2x 0201:50:00.0: enabling device (0140 -> 0142)
> >     [28197.107249] bnx2x: [bnx2x_io_slot_reset:14228(eth3)]IO slot
> > reset
> > --> driver unload
> >     [28199.106281] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout
> > waiting for queue[1]: txdata->tx_pkt_prod(7270) !=
> > txdata->tx_pkt_cons(7269)
> >     [28201.106281] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout
> > waiting for queue[3]: txdata->tx_pkt_prod(10977) !=
> > txdata->tx_pkt_cons(10969)
> >     [28203.106280] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout
> > waiting for queue[4]: txdata->tx_pkt_prod(9532) !=
> > txdata->tx_pkt_cons(9514)
> >     [28205.106284] bnx2x: [bnx2x_clean_tx_queue:1205(eth3)]timeout
> > waiting for queue[6]: txdata->tx_pkt_prod(2359) !=
> > txdata->tx_pkt_cons(2331)
> >     [28205.314280] bnx2x: [bnx2x_io_slot_reset:14246(eth3)]before --
> > state flag = 0x4000   <---- check NIC state
> >     [28205.314282] bnx2x: [bnx2x_io_slot_reset:14248(eth3)]after --
> > state flag = 0x4000         <---- check NIC state and disable the NAPI
> >     [28205.405009] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports:
> > 'recovered'
> > ........ snip ......
> >     [28213.525014] EEH: Finished:'slot_reset' with aggregate recovery
> > state:'recovered'
> >     [28213.525016] EEH: Notify device driver to resume
> >     [28213.525017] EEH: Beginning: 'resume'
> >     [28213.525018] PCI 0201:50:00.0#500000: EEH: Invoking
> > bnx2x->resume()
> >     [28225.115287] bnx2x 0201:50:00.0 eth3: using MSI-X  IRQs: sp 101
> > fp[0] 103 ... fp[7] 110
> >     [28225.696425] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports:
> > 'none'
> >
> >
> > 2- When the driver reset code path was called first, the
> > bnx2x_chip_cleanup() does the disablement the NAPI, but NIC is resumed
> > after the EEH handler code path was called.
> >     [13376.774278] bnx2x: [bnx2x_state_wait:312(eth3)]timeout waiting
> > for state 2
> >      [13376.774280] bnx2x: [bnx2x_func_stop:9118(eth3)]FUNC_STOP
> > ramrod failed. Running a dry transaction
> >     [13376.774339] bnx2x: [bnx2x_chip_cleanup:9467(eth3)]before state
> > flag = 0x4000
> >     [13376.774341] bnx2x: [bnx2x_chip_cleanup:9469(eth3)]after state
> > flag = 0x4000
> >     [13376.774344] bnx2x: [bnx2x_igu_int_disable:891(eth3)]BUG! Proper
> > val not read from IGU!
> >     [13391.774280] bnx2x: [bnx2x_fw_command:3044(eth3)]FW failed to
> > respond!
> >     ........ snip .....
> >     [13442.118385] PCI 0201:50:00.0#500000: EEH: Invoking
> > bnx2x->slot_reset()
> >     [13442.118388] bnx2x: [bnx2x_io_slot_reset:14212(eth3)]IO slot
> > reset initializing...
> >     [13442.118509] bnx2x 0201:50:00.0: enabling device (0140 -> 0142)
> >     [13442.124825] bnx2x: [bnx2x_io_slot_reset:14228(eth3)]IO slot
> > reset
> > --> driver unload
> >     [13442.154290] bnx2x: [bnx2x_io_slot_reset:14246(eth3)]before --
> > state flag = 0x4000    <--- check NIC state
> >     [13442.274291] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports:
> > 'recovered'
> >     ....... snip .........
> >     [13442.424290] EEH: Finished:'slot_reset' with aggregate recovery
> > state:'recovered'
> >     [13442.424291] EEH: Notify device driver to resume
> >     [13442.424292] EEH: Beginning: 'resume'
> >     [13442.424293] PCI 0201:50:00.0#500000: EEH: Invoking
> > bnx2x->resume()
> >     [13443.015280] bnx2x 0201:50:00.0 eth3: using MSI-X  IRQs: sp 61
> > fp[0] 63 ... fp[7] 75
> >     [13443.616423] PCI 0201:50:00.0#500000: EEH: bnx2x driver reports:
> > 'none'
> >
> >
> >>> Thanks,
> >>> Manish
> >>
> >> It requires a specific system setup, I will test with v3 patch when
> >> the system is available.
> >>
> >> Thanks,
> >> Thinh Tran
> >
> > Any suggestions?
> > Thanks,
> > Thinh Tran

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [EXT] [PATCH v2] bnx2x: Fix error recovering in switch configuration
  2022-10-17 11:14             ` Manish Chopra
@ 2022-10-20 19:12               ` Thinh Tran
  0 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2022-10-20 19:12 UTC (permalink / raw)
  To: Manish Chopra
  Cc: netdev, Ariel Elior, davem, Sudarsana Reddy Kalluru, pabeni,
	edumazet, Alok Prasad, kuba

Hi Manish,

On 10/17/2022 6:14 AM, Manish Chopra wrote:

> I am fine with this if using existing 'state' var is not a suitable option here. Thanks.
> 

Thanks for the review
Thinh Tran

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [Patch v3] bnx2x: Fix error recovering in switch configuration
  2022-09-16 19:51   ` [PATCH v2] " Thinh Tran
  2022-09-19 13:53     ` [EXT] " Manish Chopra
@ 2023-07-19 22:02     ` Thinh Tran
  2023-07-24 11:48       ` Simon Horman
  2023-07-24 22:50       ` Jakub Kicinski
  2023-07-28 21:11     ` [PATCH v4] " Thinh Tran
  2 siblings, 2 replies; 36+ messages in thread
From: Thinh Tran @ 2023-07-19 22:02 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru, drc,
	abdhalee, Thinh Tran


As the BCM57810 and other I/O adapters are connected
through a PCIe switch, the bnx2x driver causes unexpected
system hang/crash while handling PCIe switch errors, if
its error handler is called after other drivers' handlers.

In this case, after numbers of bnx2x_tx_timout(), the
bnx2x_nic_unload() is  called, frees up resources and
calls bnx2x_napi_disable(). Then when EEH calls its
error handler, the bnx2x_io_error_detected() and
bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
and freeing the resources.


Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
Reviewed-by: Manish Chopra <manishc@marvell.com>
Tested-by: Abdul Haleem <abdhalee@in.ibm.com>
Tested-by: David Christensen <drc@linux.vnet.ibm.com>

  v3:
    - no changes, just repatched to the latest driver level 
    - updated the reviewed-by Manish in October, 2022

  v2:
   - Check the state of the NIC before calling disable nappi
     and freeing the IRQ
   - Prevent recurrence of TX timeout by turning off the carrier,
     calling netif_carrier_off() in bnx2x_tx_timeout()
   - Check and bail out early if fp->page_pool already freed

---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 +
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 27 +++++++++----
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 ++
 .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 38 +++++++++----------
 .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 17 +++++----
 5 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 8bcde0a6e011..e2a4e1088b7f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1508,6 +1508,8 @@ struct bnx2x {
 	bool			cnic_loaded;
 	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
 
+	bool                    nic_stopped;
+
 	/* Flag that indicates that we can start looking for FCoE L2 queue
 	 * completions in the default status block.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 6ea5521074d3..97364089ff80 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -2715,6 +2715,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 	bnx2x_add_all_napi(bp);
 	DP(NETIF_MSG_IFUP, "napi added\n");
 	bnx2x_napi_enable(bp);
+	bp->nic_stopped = false;
 
 	if (IS_PF(bp)) {
 		/* set pf load just before approaching the MCP */
@@ -2960,6 +2961,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 load_error1:
 	bnx2x_napi_disable(bp);
 	bnx2x_del_all_napi(bp);
+	bp->nic_stopped = true;
 
 	/* clear pf_load status, as it was already set */
 	if (IS_PF(bp))
@@ -3095,14 +3097,17 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
 		if (!CHIP_IS_E1x(bp))
 			bnx2x_pf_disable(bp);
 
-		/* Disable HW interrupts, NAPI */
-		bnx2x_netif_stop(bp, 1);
-		/* Delete all NAPI objects */
-		bnx2x_del_all_napi(bp);
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
-		/* Release IRQs */
-		bnx2x_free_irq(bp);
+		if (!bp->nic_stopped) {
+			/* Disable HW interrupts, NAPI */
+			bnx2x_netif_stop(bp, 1);
+			/* Delete all NAPI objects */
+			bnx2x_del_all_napi(bp);
+			if (CNIC_LOADED(bp))
+				bnx2x_del_all_napi_cnic(bp);
+			/* Release IRQs */
+			bnx2x_free_irq(bp);
+			bp->nic_stopped = true;
+		}
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, false);
@@ -4987,6 +4992,12 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
+	/* Immediately indicate link as down */
+	bp->link_vars.link_up = 0;
+	bp->force_link_down = true;
+	netif_carrier_off(dev);
+	BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
+
 	/* We want the information of the dump logged,
 	 * but calling bnx2x_panic() would kill all chances of recovery.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index d8b1824c334d..bc0dee25b804 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -1015,6 +1015,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 {
 	int i;
 
+	if (!fp->page_pool.page)
+		return;
+
 	if (fp->mode == TPA_MODE_DISABLED)
 		return;
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 1e7a6f1d4223..adb1c7a2c367 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -9474,15 +9474,18 @@ void bnx2x_chip_cleanup(struct bnx2x *bp, int unload_mode, bool keep_link)
 		}
 	}
 
-	/* Disable HW interrupts, NAPI */
-	bnx2x_netif_stop(bp, 1);
-	/* Delete all NAPI objects */
-	bnx2x_del_all_napi(bp);
-	if (CNIC_LOADED(bp))
-		bnx2x_del_all_napi_cnic(bp);
+	if (!bp->nic_stopped) {
+		/* Disable HW interrupts, NAPI */
+		bnx2x_netif_stop(bp, 1);
+		/* Delete all NAPI objects */
+		bnx2x_del_all_napi(bp);
+		if (CNIC_LOADED(bp))
+			bnx2x_del_all_napi_cnic(bp);
 
-	/* Release IRQs */
-	bnx2x_free_irq(bp);
+		/* Release IRQs */
+		bnx2x_free_irq(bp);
+		bp->nic_stopped = true;
+	}
 
 	/* Reset the chip, unless PCI function is offline. If we reach this
 	 * point following a PCI error handling, it means device is really
@@ -10273,12 +10276,6 @@ static void bnx2x_sp_rtnl_task(struct work_struct *work)
 		bp->sp_rtnl_state = 0;
 		smp_mb();
 
-		/* Immediately indicate link as down */
-		bp->link_vars.link_up = 0;
-		bp->force_link_down = true;
-		netif_carrier_off(bp->dev);
-		BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
-
 		bnx2x_nic_unload(bp, UNLOAD_NORMAL, true);
 		/* When ret value shows failure of allocation failure,
 		 * the nic is rebooted again. If open still fails, a error
@@ -14238,13 +14235,16 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
 		}
 		bnx2x_drain_tx_queues(bp);
 		bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
-		bnx2x_netif_stop(bp, 1);
-		bnx2x_del_all_napi(bp);
+		if (!bp->nic_stopped) {
+			bnx2x_netif_stop(bp, 1);
+			bnx2x_del_all_napi(bp);
 
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
+			if (CNIC_LOADED(bp))
+				bnx2x_del_all_napi_cnic(bp);
 
-		bnx2x_free_irq(bp);
+			bnx2x_free_irq(bp);
+			bp->nic_stopped = true;
+		}
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, true);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index 0657a0f5170f..8946a931e87e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -529,13 +529,16 @@ void bnx2x_vfpf_close_vf(struct bnx2x *bp)
 	bnx2x_vfpf_finalize(bp, &req->first_tlv);
 
 free_irq:
-	/* Disable HW interrupts, NAPI */
-	bnx2x_netif_stop(bp, 0);
-	/* Delete all NAPI objects */
-	bnx2x_del_all_napi(bp);
-
-	/* Release IRQs */
-	bnx2x_free_irq(bp);
+	if (!bp->nic_stopped) {
+		/* Disable HW interrupts, NAPI */
+		bnx2x_netif_stop(bp, 0);
+		/* Delete all NAPI objects */
+		bnx2x_del_all_napi(bp);
+
+		/* Release IRQs */
+		bnx2x_free_irq(bp);
+		bp->nic_stopped = true;
+	}
 }
 
 static void bnx2x_leading_vfq_init(struct bnx2x *bp, struct bnx2x_virtf *vf,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [Patch v3] bnx2x: Fix error recovering in switch configuration
  2023-07-19 22:02     ` [Patch v3] " Thinh Tran
@ 2023-07-24 11:48       ` Simon Horman
  2023-07-24 22:50       ` Jakub Kicinski
  1 sibling, 0 replies; 36+ messages in thread
From: Simon Horman @ 2023-07-24 11:48 UTC (permalink / raw)
  To: Thinh Tran
  Cc: kuba, aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	drc, abdhalee

On Wed, Jul 19, 2023 at 10:02:01PM +0000, Thinh Tran wrote:
> 
> As the BCM57810 and other I/O adapters are connected
> through a PCIe switch, the bnx2x driver causes unexpected
> system hang/crash while handling PCIe switch errors, if
> its error handler is called after other drivers' handlers.
> 
> In this case, after numbers of bnx2x_tx_timout(), the
> bnx2x_nic_unload() is  called, frees up resources and
> calls bnx2x_napi_disable(). Then when EEH calls its
> error handler, the bnx2x_io_error_detected() and
> bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
> and freeing the resources.
> 
> 
> Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> Reviewed-by: Manish Chopra <manishc@marvell.com>
> Tested-by: Abdul Haleem <abdhalee@in.ibm.com>
> Tested-by: David Christensen <drc@linux.vnet.ibm.com>

Reviewed-by: Simon Horman <simon.horman@corigine.com>


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [Patch v3] bnx2x: Fix error recovering in switch configuration
  2023-07-19 22:02     ` [Patch v3] " Thinh Tran
  2023-07-24 11:48       ` Simon Horman
@ 2023-07-24 22:50       ` Jakub Kicinski
  2023-07-27 13:29         ` Thinh Tran
  1 sibling, 1 reply; 36+ messages in thread
From: Jakub Kicinski @ 2023-07-24 22:50 UTC (permalink / raw)
  To: Thinh Tran
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru, drc,
	abdhalee

On Wed, 19 Jul 2023 22:02:01 +0000 Thinh Tran wrote:
> -		/* Disable HW interrupts, NAPI */
> -		bnx2x_netif_stop(bp, 1);
> -		/* Delete all NAPI objects */
> -		bnx2x_del_all_napi(bp);
> -		if (CNIC_LOADED(bp))
> -			bnx2x_del_all_napi_cnic(bp);
> -		/* Release IRQs */
> -		bnx2x_free_irq(bp);
> +		if (!bp->nic_stopped) {
> +			/* Disable HW interrupts, NAPI */
> +			bnx2x_netif_stop(bp, 1);
> +			/* Delete all NAPI objects */
> +			bnx2x_del_all_napi(bp);
> +			if (CNIC_LOADED(bp))
> +				bnx2x_del_all_napi_cnic(bp);
> +			/* Release IRQs */
> +			bnx2x_free_irq(bp);
> +			bp->nic_stopped = true;
> +		}

You're sprinkling this if around the same piece of code in multiple
places. Please factor it out into a function, and add return early 
from it if needed. If possible please keep the code symmetric (i.e.
also factor out the inverse of this code for starting the NIC).
-- 
pw-bot: cr

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [Patch v3] bnx2x: Fix error recovering in switch configuration
  2023-07-24 22:50       ` Jakub Kicinski
@ 2023-07-27 13:29         ` Thinh Tran
  0 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2023-07-27 13:29 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru, drc,
	abdhalee



On 7/24/2023 5:50 PM, Jakub Kicinski wrote:
> 
> You're sprinkling this if around the same piece of code in multiple
> places. Please factor it out into a function, and add return early
> from it if needed. If possible please keep the code symmetric (i.e.
> also factor out the inverse of this code for starting the NIC).

Thank you for reviewing. I'll work for creating a separate function for 
it, and one for the inverse as well if it's possible.

Thinh Tran

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v4] bnx2x: Fix error recovering in switch configuration
  2022-09-16 19:51   ` [PATCH v2] " Thinh Tran
  2022-09-19 13:53     ` [EXT] " Manish Chopra
  2023-07-19 22:02     ` [Patch v3] " Thinh Tran
@ 2023-07-28 21:11     ` Thinh Tran
  2023-08-01  0:47       ` Jakub Kicinski
                         ` (2 more replies)
  2 siblings, 3 replies; 36+ messages in thread
From: Thinh Tran @ 2023-07-28 21:11 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru, drc,
	abdhalee, simon.horman, Thinh Tran

As the BCM57810 and other I/O adapters are connected
through a PCIe switch, the bnx2x driver causes unexpected
system hang/crash while handling PCIe switch errors, if
its error handler is called after other drivers' handlers.

In this case, after numbers of bnx2x_tx_timout(), the
bnx2x_nic_unload() is  called, frees up resources and
calls bnx2x_napi_disable(). Then when EEH calls its
error handler, the bnx2x_io_error_detected() and
bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
and freeing the resources.


Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
Reviewed-by: Manish Chopra <manishc@marvell.com>
Tested-by: Abdul Haleem <abdhalee@in.ibm.com>
Tested-by: David Christensen <drc@linux.vnet.ibm.com>

Reviewed-by: Simon Horman <simon.horman@corigine.com>

  v4:
   - factoring common code into new function bnx2x_stop_nic()
     that disables and releases IRQs and NAPIs 
  v3:
    - no changes, just repatched to the latest driver level
    - updated the reviewed-by Manish in October, 2022

  v2:
   - Check the state of the NIC before calling disable nappi
     and freeing the IRQ
   - Prevent recurrence of TX timeout by turning off the carrier,
     calling netif_carrier_off() in bnx2x_tx_timeout()
   - Check and bail out early if fp->page_pool already freed


---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 ++
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 18 +++++++------
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   | 18 +++++++++++++
 .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 25 +++----------------
 .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  |  9 ++-----
 5 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 8bcde0a6e011..e2a4e1088b7f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1508,6 +1508,8 @@ struct bnx2x {
 	bool			cnic_loaded;
 	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
 
+	bool                    nic_stopped;
+
 	/* Flag that indicates that we can start looking for FCoE L2 queue
 	 * completions in the default status block.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 6ea5521074d3..b4143c427936 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -2715,6 +2715,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 	bnx2x_add_all_napi(bp);
 	DP(NETIF_MSG_IFUP, "napi added\n");
 	bnx2x_napi_enable(bp);
+	bp->nic_stopped = false;
 
 	if (IS_PF(bp)) {
 		/* set pf load just before approaching the MCP */
@@ -2960,6 +2961,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 load_error1:
 	bnx2x_napi_disable(bp);
 	bnx2x_del_all_napi(bp);
+	bp->nic_stopped = true;
 
 	/* clear pf_load status, as it was already set */
 	if (IS_PF(bp))
@@ -3095,14 +3097,8 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
 		if (!CHIP_IS_E1x(bp))
 			bnx2x_pf_disable(bp);
 
-		/* Disable HW interrupts, NAPI */
-		bnx2x_netif_stop(bp, 1);
-		/* Delete all NAPI objects */
-		bnx2x_del_all_napi(bp);
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
-		/* Release IRQs */
-		bnx2x_free_irq(bp);
+		/* Disable HW interrupts, delete NAPIs, Release IRQs */
+		bnx2x_stop_nic(bp);
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, false);
@@ -4987,6 +4983,12 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
+	/* Immediately indicate link as down */
+	bp->link_vars.link_up = 0;
+	bp->force_link_down = true;
+	netif_carrier_off(dev);
+	BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
+
 	/* We want the information of the dump logged,
 	 * but calling bnx2x_panic() would kill all chances of recovery.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index d8b1824c334d..f5ecbe8d604a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -1015,6 +1015,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 {
 	int i;
 
+	if (!fp->page_pool.page)
+		return;
+
 	if (fp->mode == TPA_MODE_DISABLED)
 		return;
 
@@ -1399,5 +1402,20 @@ void bnx2x_set_os_driver_state(struct bnx2x *bp, u32 state);
  */
 int bnx2x_nvram_read(struct bnx2x *bp, u32 offset, u8 *ret_buf,
 		     int buf_size);
+static inline void bnx2x_stop_nic(struct bnx2x *bp)
+{
+	if (!bp->nic_stopped) {
+		/* Disable HW interrupts, NAPI */
+		bnx2x_netif_stop(bp, 1);
+		/* Delete all NAPI objects */
+		bnx2x_del_all_napi(bp);
+		if (CNIC_LOADED(bp))
+			bnx2x_del_all_napi_cnic(bp);
+		/* Release IRQs */
+		bnx2x_free_irq(bp);
+		bp->nic_stopped = true;
+	}
+}
+
 
 #endif /* BNX2X_CMN_H */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 1e7a6f1d4223..e40c83b8b32e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -9474,15 +9474,8 @@ void bnx2x_chip_cleanup(struct bnx2x *bp, int unload_mode, bool keep_link)
 		}
 	}
 
-	/* Disable HW interrupts, NAPI */
-	bnx2x_netif_stop(bp, 1);
-	/* Delete all NAPI objects */
-	bnx2x_del_all_napi(bp);
-	if (CNIC_LOADED(bp))
-		bnx2x_del_all_napi_cnic(bp);
-
-	/* Release IRQs */
-	bnx2x_free_irq(bp);
+	/* Disable HW interrupts, delete NAPIs, Release IRQs */
+	bnx2x_stop_nic(bp);
 
 	/* Reset the chip, unless PCI function is offline. If we reach this
 	 * point following a PCI error handling, it means device is really
@@ -10273,12 +10266,6 @@ static void bnx2x_sp_rtnl_task(struct work_struct *work)
 		bp->sp_rtnl_state = 0;
 		smp_mb();
 
-		/* Immediately indicate link as down */
-		bp->link_vars.link_up = 0;
-		bp->force_link_down = true;
-		netif_carrier_off(bp->dev);
-		BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
-
 		bnx2x_nic_unload(bp, UNLOAD_NORMAL, true);
 		/* When ret value shows failure of allocation failure,
 		 * the nic is rebooted again. If open still fails, a error
@@ -14238,13 +14225,9 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
 		}
 		bnx2x_drain_tx_queues(bp);
 		bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
-		bnx2x_netif_stop(bp, 1);
-		bnx2x_del_all_napi(bp);
-
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
 
-		bnx2x_free_irq(bp);
+		/* Disable HW interrupts, delete NAPIs, Release IRQs */
+		bnx2x_stop_nic(bp);
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, true);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index 0657a0f5170f..d8af7c453172 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -529,13 +529,8 @@ void bnx2x_vfpf_close_vf(struct bnx2x *bp)
 	bnx2x_vfpf_finalize(bp, &req->first_tlv);
 
 free_irq:
-	/* Disable HW interrupts, NAPI */
-	bnx2x_netif_stop(bp, 0);
-	/* Delete all NAPI objects */
-	bnx2x_del_all_napi(bp);
-
-	/* Release IRQs */
-	bnx2x_free_irq(bp);
+	/* Disable HW interrupts, delete NAPIs, Release IRQs */
+	bnx2x_stop_nic(bp);
 }
 
 static void bnx2x_leading_vfq_init(struct bnx2x *bp, struct bnx2x_virtf *vf,
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v4] bnx2x: Fix error recovering in switch configuration
  2023-07-28 21:11     ` [PATCH v4] " Thinh Tran
@ 2023-08-01  0:47       ` Jakub Kicinski
  2023-08-01  8:30         ` Paolo Abeni
  2023-08-07 21:08         ` Thinh Tran
  2023-08-11 20:15       ` [Patch v5 0/4] " Thinh Tran
  2023-08-18 16:14       ` [Patch v6 " Thinh Tran
  2 siblings, 2 replies; 36+ messages in thread
From: Jakub Kicinski @ 2023-08-01  0:47 UTC (permalink / raw)
  To: Thinh Tran
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru, drc,
	abdhalee, simon.horman

On Fri, 28 Jul 2023 16:11:33 -0500 Thinh Tran wrote:
> As the BCM57810 and other I/O adapters are connected
> through a PCIe switch, the bnx2x driver causes unexpected
> system hang/crash while handling PCIe switch errors, if
> its error handler is called after other drivers' handlers.
> 
> In this case, after numbers of bnx2x_tx_timout(), the
> bnx2x_nic_unload() is  called, frees up resources and
> calls bnx2x_napi_disable(). Then when EEH calls its
> error handler, the bnx2x_io_error_detected() and
> bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
> and freeing the resources.
> 
> 
> Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> Reviewed-by: Manish Chopra <manishc@marvell.com>
> Tested-by: Abdul Haleem <abdhalee@in.ibm.com>
> Tested-by: David Christensen <drc@linux.vnet.ibm.com>
> 
> Reviewed-by: Simon Horman <simon.horman@corigine.com>

nit: no empty lines between tags

There should be a "---" line between the tags and changelog.

>   v4:
>    - factoring common code into new function bnx2x_stop_nic()
>      that disables and releases IRQs and NAPIs 
>   v3:
>     - no changes, just repatched to the latest driver level
>     - updated the reviewed-by Manish in October, 2022
> 
>   v2:
>    - Check the state of the NIC before calling disable nappi
>      and freeing the IRQ
>    - Prevent recurrence of TX timeout by turning off the carrier,
>      calling netif_carrier_off() in bnx2x_tx_timeout()
>    - Check and bail out early if fp->page_pool already freed
>
> ---
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 ++

> @@ -3095,14 +3097,8 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
>  		if (!CHIP_IS_E1x(bp))
>  			bnx2x_pf_disable(bp);
>  
> -		/* Disable HW interrupts, NAPI */
> -		bnx2x_netif_stop(bp, 1);
> -		/* Delete all NAPI objects */
> -		bnx2x_del_all_napi(bp);
> -		if (CNIC_LOADED(bp))
> -			bnx2x_del_all_napi_cnic(bp);
> -		/* Release IRQs */
> -		bnx2x_free_irq(bp);

Could you split the change into two patches - one factoring out the
code into bnx2x_stop_nic() and the other adding the nic_stopped
variable? First one should be pure code refactoring with no functional
changes. That'd make the reviewing process easier.

> +		/* Disable HW interrupts, delete NAPIs, Release IRQs */
> +		bnx2x_stop_nic(bp);
>  
>  		/* Report UNLOAD_DONE to MCP */
>  		bnx2x_send_unload_done(bp, false);
> @@ -4987,6 +4983,12 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
>  {
>  	struct bnx2x *bp = netdev_priv(dev);
>  
> +	/* Immediately indicate link as down */
> +	bp->link_vars.link_up = 0;
> +	bp->force_link_down = true;
> +	netif_carrier_off(dev);
> +	BNX2X_ERR("Indicating link is down due to Tx-timeout\n");

Is this code move to make the shutdown more immediate?
That could also be a separate patch.

>  	/* We want the information of the dump logged,
>  	 * but calling bnx2x_panic() would kill all chances of recovery.
>  	 */
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
> index d8b1824c334d..f5ecbe8d604a 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
> @@ -1015,6 +1015,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
>  {
>  	int i;
>  
> +	if (!fp->page_pool.page)
> +		return;
> +
>  	if (fp->mode == TPA_MODE_DISABLED)
>  		return;
>  
> @@ -1399,5 +1402,20 @@ void bnx2x_set_os_driver_state(struct bnx2x *bp, u32 state);
>   */
>  int bnx2x_nvram_read(struct bnx2x *bp, u32 offset, u8 *ret_buf,
>  		     int buf_size);
> +static inline void bnx2x_stop_nic(struct bnx2x *bp)

can't it live in bnx2x_cmn.c ? Why make it a static inline?

> +{
> +	if (!bp->nic_stopped) {
> +		/* Disable HW interrupts, NAPI */
> +		bnx2x_netif_stop(bp, 1);
> +		/* Delete all NAPI objects */
> +		bnx2x_del_all_napi(bp);
> +		if (CNIC_LOADED(bp))
> +			bnx2x_del_all_napi_cnic(bp);
> +		/* Release IRQs */
> +		bnx2x_free_irq(bp);
> +		bp->nic_stopped = true;
> +	}
> +}
> +
>  

nit: double new line

> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
> @@ -529,13 +529,8 @@ void bnx2x_vfpf_close_vf(struct bnx2x *bp)
>  	bnx2x_vfpf_finalize(bp, &req->first_tlv);
>  
>  free_irq:
> -	/* Disable HW interrupts, NAPI */
> -	bnx2x_netif_stop(bp, 0);

This used to say 

	bnx2x_netif_stop(bp, 0);

but bnx2x_stop_nic() will do:

	bnx2x_netif_stop(bp, 1);

is it okay to shut down the HW here ? (whatever that entails)

> -	/* Delete all NAPI objects */
> -	bnx2x_del_all_napi(bp);
> -
> -	/* Release IRQs */
> -	bnx2x_free_irq(bp);
> +	/* Disable HW interrupts, delete NAPIs, Release IRQs */
> +	bnx2x_stop_nic(bp);
-- 
pw-bot: cr

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v4] bnx2x: Fix error recovering in switch configuration
  2023-08-01  0:47       ` Jakub Kicinski
@ 2023-08-01  8:30         ` Paolo Abeni
  2023-08-07 21:15           ` Thinh Tran
  2023-08-07 21:08         ` Thinh Tran
  1 sibling, 1 reply; 36+ messages in thread
From: Paolo Abeni @ 2023-08-01  8:30 UTC (permalink / raw)
  To: Jakub Kicinski, Thinh Tran
  Cc: aelior, davem, edumazet, manishc, netdev, skalluru, drc,
	abdhalee, simon.horman

On Mon, 2023-07-31 at 17:47 -0700, Jakub Kicinski wrote:
> On Fri, 28 Jul 2023 16:11:33 -0500 Thinh Tran wrote:
> > As the BCM57810 and other I/O adapters are connected
> > through a PCIe switch, the bnx2x driver causes unexpected
> > system hang/crash while handling PCIe switch errors, if
> > its error handler is called after other drivers' handlers.
> > 
> > In this case, after numbers of bnx2x_tx_timout(), the
> > bnx2x_nic_unload() is  called, frees up resources and
> > calls bnx2x_napi_disable(). Then when EEH calls its
> > error handler, the bnx2x_io_error_detected() and
> > bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
> > and freeing the resources.
> > 
> > 
> > Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> > Reviewed-by: Manish Chopra <manishc@marvell.com>
> > Tested-by: Abdul Haleem <abdhalee@in.ibm.com>
> > Tested-by: David Christensen <drc@linux.vnet.ibm.com>
> > 
> > Reviewed-by: Simon Horman <simon.horman@corigine.com>
> 
> nit: no empty lines between tags
> 
> There should be a "---" line between the tags and changelog.
> 
> >   v4:
> >    - factoring common code into new function bnx2x_stop_nic()
> >      that disables and releases IRQs and NAPIs 
> >   v3:
> >     - no changes, just repatched to the latest driver level
> >     - updated the reviewed-by Manish in October, 2022
> > 
> >   v2:
> >    - Check the state of the NIC before calling disable nappi
> >      and freeing the IRQ
> >    - Prevent recurrence of TX timeout by turning off the carrier,
> >      calling netif_carrier_off() in bnx2x_tx_timeout()
> >    - Check and bail out early if fp->page_pool already freed
> > 
> > ---
> >  drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 ++
> 
> > @@ -3095,14 +3097,8 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
> >  		if (!CHIP_IS_E1x(bp))
> >  			bnx2x_pf_disable(bp);
> >  
> > -		/* Disable HW interrupts, NAPI */
> > -		bnx2x_netif_stop(bp, 1);
> > -		/* Delete all NAPI objects */
> > -		bnx2x_del_all_napi(bp);
> > -		if (CNIC_LOADED(bp))
> > -			bnx2x_del_all_napi_cnic(bp);
> > -		/* Release IRQs */
> > -		bnx2x_free_irq(bp);
> 
> Could you split the change into two patches - one factoring out the
> code into bnx2x_stop_nic() and the other adding the nic_stopped
> variable? First one should be pure code refactoring with no functional
> changes. That'd make the reviewing process easier.
> 
> > +		/* Disable HW interrupts, delete NAPIs, Release IRQs */
> > +		bnx2x_stop_nic(bp);
> >  
> >  		/* Report UNLOAD_DONE to MCP */
> >  		bnx2x_send_unload_done(bp, false);
> > @@ -4987,6 +4983,12 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
> >  {
> >  	struct bnx2x *bp = netdev_priv(dev);
> >  
> > +	/* Immediately indicate link as down */
> > +	bp->link_vars.link_up = 0;
> > +	bp->force_link_down = true;
> > +	netif_carrier_off(dev);
> > +	BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
> 
> Is this code move to make the shutdown more immediate?
> That could also be a separate patch.

Note that the original code run under the rtnl lock and this is not
lockless, it that safe?

Cheers,

Paolo


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v4] bnx2x: Fix error recovering in switch configuration
  2023-08-01  0:47       ` Jakub Kicinski
  2023-08-01  8:30         ` Paolo Abeni
@ 2023-08-07 21:08         ` Thinh Tran
  2023-08-07 21:24           ` Jakub Kicinski
  2023-08-07 21:35           ` Jakub Kicinski
  1 sibling, 2 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-07 21:08 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru, drc,
	abdhalee, simon.horman

Thank you for thorough review.

On 7/31/2023 7:47 PM, Jakub Kicinski wrote:

> 
> Could you split the change into two patches - one factoring out the
> code into bnx2x_stop_nic() and the other adding the nic_stopped
> variable? First one should be pure code refactoring with no functional
> changes. That'd make the reviewing process easier.

Sorry, I misunderstood comments in the reviewing of v3 asking to factor 
the code.
Should I keep the changes I made, or should I summit a new patch with 
factored code?

> 
>> +		/* Disable HW interrupts, delete NAPIs, Release IRQs */
>> +		bnx2x_stop_nic(bp);
>>   
>>   		/* Report UNLOAD_DONE to MCP */
>>   		bnx2x_send_unload_done(bp, false);
>> @@ -4987,6 +4983,12 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
>>   {
>>   	struct bnx2x *bp = netdev_priv(dev);
>>   
>> +	/* Immediately indicate link as down */
>> +	bp->link_vars.link_up = 0;
>> +	bp->force_link_down = true;
>> +	netif_carrier_off(dev);
>> +	BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
> 
> Is this code move to make the shutdown more immediate?
> That could also be a separate patch.

Moving the code to here has the effect of disabling the link, preventing 
the excessive output of debug information from bnx2x_panic(). While 
bnx2x_panic() offers valuable debugging details, the excessive dumping 
overwhelms the dmesg buffer, they become unreadable.
I will exclude this part from the patch. A separate patch will be 
created to improve providing debug information

> 
>>   	/* We want the information of the dump logged,
>>   	 * but calling bnx2x_panic() would kill all chances of recovery.
>>   	 */
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
>> index d8b1824c334d..f5ecbe8d604a 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
>> @@ -1015,6 +1015,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
>>   {
>>   	int i;
>>   
>> +	if (!fp->page_pool.page)
>> +		return;
>> +
>>   	if (fp->mode == TPA_MODE_DISABLED)
>>   		return;
>>   
>> @@ -1399,5 +1402,20 @@ void bnx2x_set_os_driver_state(struct bnx2x *bp, u32 state);
>>    */
>>   int bnx2x_nvram_read(struct bnx2x *bp, u32 offset, u8 *ret_buf,
>>   		     int buf_size);
>> +static inline void bnx2x_stop_nic(struct bnx2x *bp)
> 
> can't it live in bnx2x_cmn.c ?
It's in common header file for also being used by bnx2x_vfpf.c.

  Why make it a static inline?
> 
Just make it inlined where it is called.
>> +{
>> +	if (!bp->nic_stopped) {
>> +		/* Disable HW interrupts, NAPI */
>> +		bnx2x_netif_stop(bp, 1);
>> +		/* Delete all NAPI objects */
>> +		bnx2x_del_all_napi(bp);
>> +		if (CNIC_LOADED(bp))
>> +			bnx2x_del_all_napi_cnic(bp);
>> +		/* Release IRQs */
>> +		bnx2x_free_irq(bp);
>> +		bp->nic_stopped = true;
>> +	}
>> +}
>> +
>>   
> 
> nit: double new line
> 
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
>> @@ -529,13 +529,8 @@ void bnx2x_vfpf_close_vf(struct bnx2x *bp)
>>   	bnx2x_vfpf_finalize(bp, &req->first_tlv);
>>   
>>   free_irq:
>> -	/* Disable HW interrupts, NAPI */
>> -	bnx2x_netif_stop(bp, 0);
> 
> This used to say
> 
> 	bnx2x_netif_stop(bp, 0);
> 
> but bnx2x_stop_nic() will do:
> 
> 	bnx2x_netif_stop(bp, 1);
> 
> is it okay to shut down the HW here ? (whatever that entails)
> 
My mistake, I didn't notice this before. The second parameter is for 
deciding if the hardware should stop sending interrupts or not. I'm not 
familiar with the virtual function code path, but I'll correct it to 
make sure things are consistent.

>> -	/* Delete all NAPI objects */
>> -	bnx2x_del_all_napi(bp);
>> -
>> -	/* Release IRQs */
>> -	bnx2x_free_irq(bp);
>> +	/* Disable HW interrupts, delete NAPIs, Release IRQs */
>> +	bnx2x_stop_nic(bp);

Thanks,
Thinh Tran

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v4] bnx2x: Fix error recovering in switch configuration
  2023-08-01  8:30         ` Paolo Abeni
@ 2023-08-07 21:15           ` Thinh Tran
  0 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-07 21:15 UTC (permalink / raw)
  To: Paolo Abeni, Jakub Kicinski
  Cc: aelior, davem, edumazet, manishc, netdev, skalluru, drc,
	abdhalee, simon.horman

Hi,
Thanks for the review.

On 8/1/2023 3:30 AM, Paolo Abeni wrote:
> On Mon, 2023-07-31 at 17:47 -0700, Jakub Kicinski wrote:

>>> @@ -4987,6 +4983,12 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
>>>   {
>>>   	struct bnx2x *bp = netdev_priv(dev);
>>>   
>>> +	/* Immediately indicate link as down */
>>> +	bp->link_vars.link_up = 0;
>>> +	bp->force_link_down = true;
>>> +	netif_carrier_off(dev);
>>> +	BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
>>
>> Is this code move to make the shutdown more immediate?
>> That could also be a separate patch.
> 
> Note that the original code run under the rtnl lock and this is not
> lockless, it that safe?
>
Yes, it is safe.
The caller, dev_watchdog() is a holding a spin_lock of the net_device.

> Cheers,
> 
> Paolo
> 

Thanks,
Thinh Tran

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v4] bnx2x: Fix error recovering in switch configuration
  2023-08-07 21:08         ` Thinh Tran
@ 2023-08-07 21:24           ` Jakub Kicinski
  2023-08-07 21:35           ` Jakub Kicinski
  1 sibling, 0 replies; 36+ messages in thread
From: Jakub Kicinski @ 2023-08-07 21:24 UTC (permalink / raw)
  To: Thinh Tran
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru, drc,
	abdhalee, simon.horman

On Mon, 7 Aug 2023 16:08:50 -0500 Thinh Tran wrote:
> > Could you split the change into two patches - one factoring out the
> > code into bnx2x_stop_nic() and the other adding the nic_stopped
> > variable? First one should be pure code refactoring with no functional
> > changes. That'd make the reviewing process easier.  
> 
> Sorry, I misunderstood comments in the reviewing of v3 asking to factor 
> the code.
> Should I keep the changes I made, or should I summit a new patch with 
> factored code?

I am not sure what you're asking.

In v5 I'm hoping to see 3 patches (as a single series!)
patch 1 - factor out the disabling into a helper
patch 2 - introduce the bp->nic_stopped checking
patch 3 - changes to bnx2x_tx_timeout()

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v4] bnx2x: Fix error recovering in switch configuration
  2023-08-07 21:08         ` Thinh Tran
  2023-08-07 21:24           ` Jakub Kicinski
@ 2023-08-07 21:35           ` Jakub Kicinski
  1 sibling, 0 replies; 36+ messages in thread
From: Jakub Kicinski @ 2023-08-07 21:35 UTC (permalink / raw)
  To: Thinh Tran
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru, drc,
	abdhalee, simon.horman

On Mon, 7 Aug 2023 16:08:50 -0500 Thinh Tran wrote:
> >> +static inline void bnx2x_stop_nic(struct bnx2x *bp)  
> > 
> > can't it live in bnx2x_cmn.c ?  
> It's in common header file for also being used by bnx2x_vfpf.c.

And bnx2x_cmn.c is the common source file. I don't see why it has 
to be a static inline.

>>> Why make it a static inline?
>    
> Just make it inlined where it is called.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [Patch v5 0/4] bnx2x: Fix error recovering in switch configuration
  2023-07-28 21:11     ` [PATCH v4] " Thinh Tran
  2023-08-01  0:47       ` Jakub Kicinski
@ 2023-08-11 20:15       ` Thinh Tran
  2023-08-11 20:15         ` [Patch v5 1/4] bnx2x: new the bp->nic_stopped variable for checking NIC status Thinh Tran
                           ` (4 more replies)
  2023-08-18 16:14       ` [Patch v6 " Thinh Tran
  2 siblings, 5 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-11 20:15 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Thinh Tran, Abdul Haleem, David Christensen,
	Simon Horman, Venkata Sai Duggi

As the BCM57810 and other I/O adapters are connected
through a PCIe switch, the bnx2x driver causes unexpected
system hang/crash while handling PCIe switch errors, if
its error handler is called after other drivers' handlers.

In this case, after numbers of bnx2x_tx_timout(), the
bnx2x_nic_unload() is  called, frees up resources and
calls bnx2x_napi_disable(). Then when EEH calls its
error handler, the bnx2x_io_error_detected() and
bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
and freeing the resources.


Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
Reviewed-by: Manish Chopra <manishc@marvell.com>
Tested-by: Abdul Haleem <abdhalee@in.ibm.com>
Tested-by: David Christensen <drc@linux.vnet.ibm.com>

Reviewed-by: Simon Horman <simon.horman@corigine.com>
Tested-by: Venkata Sai Duggi <venkata.sai.duggi@ibm.com>

  v5:
   - Breaking down into a series of individual patches
  v4:
   - factoring common code into new function bnx2x_stop_nic()
     that disables and releases IRQs and NAPIs
  v3:
   - no changes, just repatched to the latest driver level
   - updated the reviewed-by Manish in October, 2022
  v2:
   - Check the state of the NIC before calling disable nappi
     and freeing the IRQ
   - Prevent recurrence of TX timeout by turning off the carrier,
     calling netif_carrier_off() in bnx2x_tx_timeout()
   - Check and bail out early if fp->page_pool already freed


Thinh Tran (4):
  bnx2x: new the bp->nic_stopped variable for checking NIC status
  bnx2x: factor out common code to bnx2x_stop_nic()
  bnx2x: Prevent access to a freed page in page_pool
  bnx2x: prevent excessive debug information during a TX timeout

 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 ++
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 32 ++++++++++++++-----
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  4 +++
 .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 26 +++------------
 .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  |  9 ++----
 5 files changed, 36 insertions(+), 37 deletions(-)

-- 
2.27.0


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [Patch v5 1/4] bnx2x: new the bp->nic_stopped variable for checking NIC status
  2023-08-11 20:15       ` [Patch v5 0/4] " Thinh Tran
@ 2023-08-11 20:15         ` Thinh Tran
  2023-08-11 20:15         ` [Patch v5 2/4] bnx2x: factor out common code to bnx2x_stop_nic() Thinh Tran
                           ` (3 subsequent siblings)
  4 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-11 20:15 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Thinh Tran

Introducing 'nic_stopped' to verify the NIC's status before disabling
NAPI and freeing IRQs

Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 ++
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 21 +++++++-----
 .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 32 +++++++++++--------
 .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 17 ++++++----
 4 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index de24ba76bfba..a00f1e1316bb 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1509,6 +1509,8 @@ struct bnx2x {
 	bool			cnic_loaded;
 	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
 
+	bool                    nic_stopped;
+
 	/* Flag that indicates that we can start looking for FCoE L2 queue
 	 * completions in the default status block.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index f34e008e12d4..feb0c23788ab 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -2703,6 +2703,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 	bnx2x_add_all_napi(bp);
 	DP(NETIF_MSG_IFUP, "napi added\n");
 	bnx2x_napi_enable(bp);
+	bp->nic_stopped = false;
 
 	if (IS_PF(bp)) {
 		/* set pf load just before approaching the MCP */
@@ -2948,6 +2949,7 @@ load_error2:
 load_error1:
 	bnx2x_napi_disable(bp);
 	bnx2x_del_all_napi(bp);
+	bp->nic_stopped = true;
 
 	/* clear pf_load status, as it was already set */
 	if (IS_PF(bp))
@@ -3083,14 +3085,17 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
 		if (!CHIP_IS_E1x(bp))
 			bnx2x_pf_disable(bp);
 
-		/* Disable HW interrupts, NAPI */
-		bnx2x_netif_stop(bp, 1);
-		/* Delete all NAPI objects */
-		bnx2x_del_all_napi(bp);
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
-		/* Release IRQs */
-		bnx2x_free_irq(bp);
+		if (!bp->nic_stopped) {
+			/* Disable HW interrupts, NAPI */
+			bnx2x_netif_stop(bp, 1);
+			/* Delete all NAPI objects */
+			bnx2x_del_all_napi(bp);
+			if (CNIC_LOADED(bp))
+				bnx2x_del_all_napi_cnic(bp);
+			/* Release IRQs */
+			bnx2x_free_irq(bp);
+			bp->nic_stopped = true;
+		}
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, false);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 7516a2fb80ba..755e3bf8f44a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -9475,15 +9475,18 @@ unload_error:
 		}
 	}
 
-	/* Disable HW interrupts, NAPI */
-	bnx2x_netif_stop(bp, 1);
-	/* Delete all NAPI objects */
-	bnx2x_del_all_napi(bp);
-	if (CNIC_LOADED(bp))
-		bnx2x_del_all_napi_cnic(bp);
+	if (!bp->nic_stopped) {
+		/* Disable HW interrupts, NAPI */
+		bnx2x_netif_stop(bp, 1);
+		/* Delete all NAPI objects */
+		bnx2x_del_all_napi(bp);
+		if (CNIC_LOADED(bp))
+			bnx2x_del_all_napi_cnic(bp);
 
-	/* Release IRQs */
-	bnx2x_free_irq(bp);
+		/* Release IRQs */
+		bnx2x_free_irq(bp);
+		bp->nic_stopped = true;
+	}
 
 	/* Reset the chip, unless PCI function is offline. If we reach this
 	 * point following a PCI error handling, it means device is really
@@ -14256,13 +14259,16 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
 		}
 		bnx2x_drain_tx_queues(bp);
 		bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
-		bnx2x_netif_stop(bp, 1);
-		bnx2x_del_all_napi(bp);
+		if (!bp->nic_stopped) {
+			bnx2x_netif_stop(bp, 1);
+			bnx2x_del_all_napi(bp);
 
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
+			if (CNIC_LOADED(bp))
+				bnx2x_del_all_napi_cnic(bp);
 
-		bnx2x_free_irq(bp);
+			bnx2x_free_irq(bp);
+			bp->nic_stopped = true;
+		}
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, true);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index 7c96f943c6f3..0802462b4d16 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -529,13 +529,16 @@ void bnx2x_vfpf_close_vf(struct bnx2x *bp)
 	bnx2x_vfpf_finalize(bp, &req->first_tlv);
 
 free_irq:
-	/* Disable HW interrupts, NAPI */
-	bnx2x_netif_stop(bp, 0);
-	/* Delete all NAPI objects */
-	bnx2x_del_all_napi(bp);
-
-	/* Release IRQs */
-	bnx2x_free_irq(bp);
+	if (!bp->nic_stopped) {
+		/* Disable HW interrupts, NAPI */
+		bnx2x_netif_stop(bp, 0);
+		/* Delete all NAPI objects */
+		bnx2x_del_all_napi(bp);
+
+		/* Release IRQs */
+		bnx2x_free_irq(bp);
+		bp->nic_stopped = true;
+	}
 }
 
 static void bnx2x_leading_vfq_init(struct bnx2x *bp, struct bnx2x_virtf *vf,
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [Patch v5 2/4] bnx2x: factor out common code to bnx2x_stop_nic()
  2023-08-11 20:15       ` [Patch v5 0/4] " Thinh Tran
  2023-08-11 20:15         ` [Patch v5 1/4] bnx2x: new the bp->nic_stopped variable for checking NIC status Thinh Tran
@ 2023-08-11 20:15         ` Thinh Tran
  2023-08-11 22:08           ` Jakub Kicinski
  2023-08-11 20:15         ` [Patch v5 3/4] bnx2x: Prevent access to a freed page in page_pool Thinh Tran
                           ` (2 subsequent siblings)
  4 siblings, 1 reply; 36+ messages in thread
From: Thinh Tran @ 2023-08-11 20:15 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Thinh Tran

Refactoring common code into a new function named bnx2x_stop_nic()

Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
---
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 27 +++++++++++--------
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  1 +
 .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 26 +++---------------
 .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 12 ++-------
 4 files changed, 23 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index feb0c23788ab..5296f5b8426b 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3085,17 +3085,8 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
 		if (!CHIP_IS_E1x(bp))
 			bnx2x_pf_disable(bp);
 
-		if (!bp->nic_stopped) {
-			/* Disable HW interrupts, NAPI */
-			bnx2x_netif_stop(bp, 1);
-			/* Delete all NAPI objects */
-			bnx2x_del_all_napi(bp);
-			if (CNIC_LOADED(bp))
-				bnx2x_del_all_napi_cnic(bp);
-			/* Release IRQs */
-			bnx2x_free_irq(bp);
-			bp->nic_stopped = true;
-		}
+		/* Disable HW interrupts, delete NAPIs, Release IRQs */
+		bnx2x_stop_nic(bp, 1);
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, false);
@@ -5127,3 +5118,17 @@ void bnx2x_schedule_sp_rtnl(struct bnx2x *bp, enum sp_rtnl_flag flag,
 	   flag);
 	schedule_delayed_work(&bp->sp_rtnl_task, 0);
 }
+void bnx2x_stop_nic(struct bnx2x *bp, int disable_hw)
+{
+	if (!bp->nic_stopped) {
+		/* Disable HW interrupts, NAPI */
+		bnx2x_netif_stop(bp, disable_hw);
+		/* Delete all NAPI objects */
+		bnx2x_del_all_napi(bp);
+		if (CNIC_LOADED(bp))
+			bnx2x_del_all_napi_cnic(bp);
+		/* Release IRQs */
+		bnx2x_free_irq(bp);
+		bp->nic_stopped = true;
+	}
+}
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index d8b1824c334d..0ad879a5af95 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -553,6 +553,7 @@ void bnx2x_free_skbs(struct bnx2x *bp);
 void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw);
 void bnx2x_netif_start(struct bnx2x *bp);
 int bnx2x_load_cnic(struct bnx2x *bp);
+void bnx2x_stop_nic(struct bnx2x *bp, int disable_hw);
 
 /**
  * bnx2x_enable_msix - set msix configuration.
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 755e3bf8f44a..7add3a420534 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -9475,18 +9475,8 @@ unload_error:
 		}
 	}
 
-	if (!bp->nic_stopped) {
-		/* Disable HW interrupts, NAPI */
-		bnx2x_netif_stop(bp, 1);
-		/* Delete all NAPI objects */
-		bnx2x_del_all_napi(bp);
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
-
-		/* Release IRQs */
-		bnx2x_free_irq(bp);
-		bp->nic_stopped = true;
-	}
+	/* Disable HW interrupts, delete NAPIs, Release IRQs */
+	bnx2x_stop_nic(bp, 1);
 
 	/* Reset the chip, unless PCI function is offline. If we reach this
 	 * point following a PCI error handling, it means device is really
@@ -14259,16 +14249,8 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
 		}
 		bnx2x_drain_tx_queues(bp);
 		bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
-		if (!bp->nic_stopped) {
-			bnx2x_netif_stop(bp, 1);
-			bnx2x_del_all_napi(bp);
-
-			if (CNIC_LOADED(bp))
-				bnx2x_del_all_napi_cnic(bp);
-
-			bnx2x_free_irq(bp);
-			bp->nic_stopped = true;
-		}
+		/* Disable HW interrupts, delete NAPIs, Release IRQs */
+		bnx2x_stop_nic(bp,1);
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, true);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index 0802462b4d16..651bb40f3443 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -529,16 +529,8 @@ void bnx2x_vfpf_close_vf(struct bnx2x *bp)
 	bnx2x_vfpf_finalize(bp, &req->first_tlv);
 
 free_irq:
-	if (!bp->nic_stopped) {
-		/* Disable HW interrupts, NAPI */
-		bnx2x_netif_stop(bp, 0);
-		/* Delete all NAPI objects */
-		bnx2x_del_all_napi(bp);
-
-		/* Release IRQs */
-		bnx2x_free_irq(bp);
-		bp->nic_stopped = true;
-	}
+	/* Disable HW interrupts, delete NAPIs, Release IRQs */
+	bnx2x_stop_nic(bp, 0);
 }
 
 static void bnx2x_leading_vfq_init(struct bnx2x *bp, struct bnx2x_virtf *vf,
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [Patch v5 3/4] bnx2x: Prevent access to a freed page in page_pool
  2023-08-11 20:15       ` [Patch v5 0/4] " Thinh Tran
  2023-08-11 20:15         ` [Patch v5 1/4] bnx2x: new the bp->nic_stopped variable for checking NIC status Thinh Tran
  2023-08-11 20:15         ` [Patch v5 2/4] bnx2x: factor out common code to bnx2x_stop_nic() Thinh Tran
@ 2023-08-11 20:15         ` Thinh Tran
  2023-08-11 20:15         ` [Patch v5 4/4] bnx2x: prevent excessive debug information during a TX timeout Thinh Tran
  2023-08-11 22:09         ` [Patch v5 0/4] bnx2x: Fix error recovering in switch configuration Jakub Kicinski
  4 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-11 20:15 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Thinh Tran

Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index 0ad879a5af95..ca3ab06206b3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -1016,6 +1016,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 {
 	int i;
 
+	if (!fp->page_pool.page)
+		return;
+
 	if (fp->mode == TPA_MODE_DISABLED)
 		return;
 
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [Patch v5 4/4] bnx2x: prevent excessive debug information during a TX timeout
  2023-08-11 20:15       ` [Patch v5 0/4] " Thinh Tran
                           ` (2 preceding siblings ...)
  2023-08-11 20:15         ` [Patch v5 3/4] bnx2x: Prevent access to a freed page in page_pool Thinh Tran
@ 2023-08-11 20:15         ` Thinh Tran
  2023-08-11 22:09         ` [Patch v5 0/4] bnx2x: Fix error recovering in switch configuration Jakub Kicinski
  4 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-11 20:15 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Thinh Tran

Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c  | 6 ++++++
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 5296f5b8426b..814350d10b7a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -4971,6 +4971,12 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
+	/* Immediately indicate link as down */
+	bp->link_vars.link_up = 0;
+	bp->force_link_down = true;
+	netif_carrier_off(dev);
+	BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
+
 	/* We want the information of the dump logged,
 	 * but calling bnx2x_panic() would kill all chances of recovery.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 7add3a420534..5c1bde0f15f3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -10267,12 +10267,6 @@ static void bnx2x_sp_rtnl_task(struct work_struct *work)
 		bp->sp_rtnl_state = 0;
 		smp_mb();
 
-		/* Immediately indicate link as down */
-		bp->link_vars.link_up = 0;
-		bp->force_link_down = true;
-		netif_carrier_off(bp->dev);
-		BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
-
 		bnx2x_nic_unload(bp, UNLOAD_NORMAL, true);
 		/* When ret value shows failure of allocation failure,
 		 * the nic is rebooted again. If open still fails, a error
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [Patch v5 2/4] bnx2x: factor out common code to bnx2x_stop_nic()
  2023-08-11 20:15         ` [Patch v5 2/4] bnx2x: factor out common code to bnx2x_stop_nic() Thinh Tran
@ 2023-08-11 22:08           ` Jakub Kicinski
  0 siblings, 0 replies; 36+ messages in thread
From: Jakub Kicinski @ 2023-08-11 22:08 UTC (permalink / raw)
  To: Thinh Tran
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI

On Fri, 11 Aug 2023 15:15:10 -0500 Thinh Tran wrote:
>  }
> +void bnx2x_stop_nic(struct bnx2x *bp, int disable_hw)
> +{

nit: empty line between closing bracket of previous function and the
beginning of the new one, please (checkpatch --strict should hopefully
point that out, although I haven't confirmed).

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [Patch v5 0/4] bnx2x: Fix error recovering in switch configuration
  2023-08-11 20:15       ` [Patch v5 0/4] " Thinh Tran
                           ` (3 preceding siblings ...)
  2023-08-11 20:15         ` [Patch v5 4/4] bnx2x: prevent excessive debug information during a TX timeout Thinh Tran
@ 2023-08-11 22:09         ` Jakub Kicinski
  2023-08-18 13:24           ` Thinh Tran
  4 siblings, 1 reply; 36+ messages in thread
From: Jakub Kicinski @ 2023-08-11 22:09 UTC (permalink / raw)
  To: Thinh Tran
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Abdul Haleem, David Christensen, Simon Horman

On Fri, 11 Aug 2023 15:15:08 -0500 Thinh Tran wrote:
> As the BCM57810 and other I/O adapters are connected
> through a PCIe switch, the bnx2x driver causes unexpected
> system hang/crash while handling PCIe switch errors, if
> its error handler is called after other drivers' handlers.
> 
> In this case, after numbers of bnx2x_tx_timout(), the
> bnx2x_nic_unload() is  called, frees up resources and
> calls bnx2x_napi_disable(). Then when EEH calls its
> error handler, the bnx2x_io_error_detected() and
> bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
> and freeing the resources.

It's looking fairly reasonable, thanks!

We do need commit messages describing motivations and impact 
of each individual commit, tho, and those commits which are 
not refactoring / improvements but prevent crashes need to
have a Fixes tag pointing to the first commit in history where
problem may occur (sometimes it's the first commit of the entire
history).
-- 
pw-bot: cr

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [Patch v5 0/4] bnx2x: Fix error recovering in switch configuration
  2023-08-11 22:09         ` [Patch v5 0/4] bnx2x: Fix error recovering in switch configuration Jakub Kicinski
@ 2023-08-18 13:24           ` Thinh Tran
  0 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-18 13:24 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Abdul Haleem, David Christensen, Simon Horman



On 8/11/2023 5:09 PM, Jakub Kicinski wrote:

> 
> It's looking fairly reasonable, thanks!
> 
> We do need commit messages describing motivations and impact
> of each individual commit, tho, and those commits which are
> not refactoring / improvements but prevent crashes need to
> have a Fixes tag pointing to the first commit in history where
> problem may occur (sometimes it's the first commit of the entire
> history).
Thanks the review. I'll make the correction and resubmit them shortly.
Thinh Tran

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [Patch v6 0/4] bnx2x: Fix error recovering in switch configuration
  2023-07-28 21:11     ` [PATCH v4] " Thinh Tran
  2023-08-01  0:47       ` Jakub Kicinski
  2023-08-11 20:15       ` [Patch v5 0/4] " Thinh Tran
@ 2023-08-18 16:14       ` Thinh Tran
  2023-08-18 16:14         ` [Patch v6 1/4] bnx2x: new flag for track HW resource Thinh Tran
                           ` (3 more replies)
  2 siblings, 4 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-18 16:14 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Thinh Tran, Abdul Haleem, David Christensen,
	Simon Horman, Venkata Sai Duggi

While injecting PCIe errors to the upstream PCIe switch of
a BCM57810 NIC, system hangs/crashes were observed.

After several calls to bnx2x_tx_timout() complete,
bnx2x_nic_unload() is called to free up HW resources
and bnx2x_napi_disable() is called to release NAPI objects.
Later, when the EEH driver calls bnx2x_io_slot_reset() to
complete the recovery process, bnx2x attempts to disable
NAPI again by calling bnx2x_napi_disable() and freeing
resources which have already been freed, resulting in a
hang or crash.

This patch set introduces a new flag to track the HW 
resource and NAPI allocation state, refactor duplicated
code into a single function, check page pool allocation
status before freeing, and reduces debug output when
a TX timeout event occurs.


Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
Reviewed-by: Manish Chopra <manishc@marvell.com>
Tested-by: Abdul Haleem <abdhalee@in.ibm.com>
Tested-by: David Christensen <drc@linux.vnet.ibm.com>

Reviewed-by: Simon Horman <simon.horman@corigine.com>
Tested-by: Venkata Sai Duggi <venkata.sai.duggi@ibm.com>

  v6:
   - Clarifying and updating commit messages
  v5:
   - Breaking down into a series of individual patches
  v4:
   - factoring common code into new function bnx2x_stop_nic()
     that disables and releases IRQs and NAPIs
  v3:
   - no changes, just repatched to the latest driver level
   - updated the reviewed-by Manish in October, 2022
  v2:
   - Check the state of the NIC before calling disable nappi
     and freeing the IRQ
   - Prevent recurrence of TX timeout by turning off the carrier,
     calling netif_carrier_off() in bnx2x_tx_timeout()
   - Check and bail out early if fp->page_pool already freed


Thinh Tran (4):
  bnx2x: new the bp->nic_stopped variable for checking NIC status
  bnx2x: factor out common code to bnx2x_stop_nic()
  bnx2x: Prevent access to a freed page in page_pool
  bnx2x: prevent excessive debug information during a TX timeout

 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 ++
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 33 ++++++++++++++-----
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  4 +++
 .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 26 +++------------
 .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  |  9 ++---
 5 files changed, 37 insertions(+), 37 deletions(-)

-- 
2.27.0


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [Patch v6 1/4] bnx2x: new flag for track HW resource
  2023-08-18 16:14       ` [Patch v6 " Thinh Tran
@ 2023-08-18 16:14         ` Thinh Tran
  2023-08-18 16:14         ` [Patch v6 2/4] bnx2x: factor out common code to bnx2x_stop_nic() Thinh Tran
                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-18 16:14 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Thinh Tran


Add a new flag, nic_stopped, to track NIC resource usage (HW interrupts,
NAPI objection allocation).

Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  2 ++
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 21 +++++++-----
 .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 32 +++++++++++--------
 .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 17 ++++++----
 4 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 8bcde0a6e011..e2a4e1088b7f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1508,6 +1508,8 @@ struct bnx2x {
 	bool			cnic_loaded;
 	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
 
+	bool                    nic_stopped;
+
 	/* Flag that indicates that we can start looking for FCoE L2 queue
 	 * completions in the default status block.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 6ea5521074d3..e9c1e1bb5580 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -2715,6 +2715,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 	bnx2x_add_all_napi(bp);
 	DP(NETIF_MSG_IFUP, "napi added\n");
 	bnx2x_napi_enable(bp);
+	bp->nic_stopped = false;
 
 	if (IS_PF(bp)) {
 		/* set pf load just before approaching the MCP */
@@ -2960,6 +2961,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 load_error1:
 	bnx2x_napi_disable(bp);
 	bnx2x_del_all_napi(bp);
+	bp->nic_stopped = true;
 
 	/* clear pf_load status, as it was already set */
 	if (IS_PF(bp))
@@ -3095,14 +3097,17 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
 		if (!CHIP_IS_E1x(bp))
 			bnx2x_pf_disable(bp);
 
-		/* Disable HW interrupts, NAPI */
-		bnx2x_netif_stop(bp, 1);
-		/* Delete all NAPI objects */
-		bnx2x_del_all_napi(bp);
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
-		/* Release IRQs */
-		bnx2x_free_irq(bp);
+		if (!bp->nic_stopped) {
+			/* Disable HW interrupts, NAPI */
+			bnx2x_netif_stop(bp, 1);
+			/* Delete all NAPI objects */
+			bnx2x_del_all_napi(bp);
+			if (CNIC_LOADED(bp))
+				bnx2x_del_all_napi_cnic(bp);
+			/* Release IRQs */
+			bnx2x_free_irq(bp);
+			bp->nic_stopped = true;
+		}
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, false);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 1e7a6f1d4223..0d8e61c63c7c 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -9474,15 +9474,18 @@ void bnx2x_chip_cleanup(struct bnx2x *bp, int unload_mode, bool keep_link)
 		}
 	}
 
-	/* Disable HW interrupts, NAPI */
-	bnx2x_netif_stop(bp, 1);
-	/* Delete all NAPI objects */
-	bnx2x_del_all_napi(bp);
-	if (CNIC_LOADED(bp))
-		bnx2x_del_all_napi_cnic(bp);
+	if (!bp->nic_stopped) {
+		/* Disable HW interrupts, NAPI */
+		bnx2x_netif_stop(bp, 1);
+		/* Delete all NAPI objects */
+		bnx2x_del_all_napi(bp);
+		if (CNIC_LOADED(bp))
+			bnx2x_del_all_napi_cnic(bp);
 
-	/* Release IRQs */
-	bnx2x_free_irq(bp);
+		/* Release IRQs */
+		bnx2x_free_irq(bp);
+		bp->nic_stopped = true;
+	}
 
 	/* Reset the chip, unless PCI function is offline. If we reach this
 	 * point following a PCI error handling, it means device is really
@@ -14238,13 +14241,16 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
 		}
 		bnx2x_drain_tx_queues(bp);
 		bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
-		bnx2x_netif_stop(bp, 1);
-		bnx2x_del_all_napi(bp);
+		if (!bp->nic_stopped) {
+			bnx2x_netif_stop(bp, 1);
+			bnx2x_del_all_napi(bp);
 
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
+			if (CNIC_LOADED(bp))
+				bnx2x_del_all_napi_cnic(bp);
 
-		bnx2x_free_irq(bp);
+			bnx2x_free_irq(bp);
+			bp->nic_stopped = true;
+		}
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, true);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index 0657a0f5170f..8946a931e87e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -529,13 +529,16 @@ void bnx2x_vfpf_close_vf(struct bnx2x *bp)
 	bnx2x_vfpf_finalize(bp, &req->first_tlv);
 
 free_irq:
-	/* Disable HW interrupts, NAPI */
-	bnx2x_netif_stop(bp, 0);
-	/* Delete all NAPI objects */
-	bnx2x_del_all_napi(bp);
-
-	/* Release IRQs */
-	bnx2x_free_irq(bp);
+	if (!bp->nic_stopped) {
+		/* Disable HW interrupts, NAPI */
+		bnx2x_netif_stop(bp, 0);
+		/* Delete all NAPI objects */
+		bnx2x_del_all_napi(bp);
+
+		/* Release IRQs */
+		bnx2x_free_irq(bp);
+		bp->nic_stopped = true;
+	}
 }
 
 static void bnx2x_leading_vfq_init(struct bnx2x *bp, struct bnx2x_virtf *vf,
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [Patch v6 2/4] bnx2x: factor out common code to bnx2x_stop_nic()
  2023-08-18 16:14       ` [Patch v6 " Thinh Tran
  2023-08-18 16:14         ` [Patch v6 1/4] bnx2x: new flag for track HW resource Thinh Tran
@ 2023-08-18 16:14         ` Thinh Tran
  2023-08-18 16:14         ` [Patch v6 3/4] bnx2x: Prevent access to a freed page in page_pool Thinh Tran
  2023-08-18 16:14         ` [Patch v6 4/4] bnx2x: prevent excessive debug information during a TX timeout Thinh Tran
  3 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-18 16:14 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Thinh Tran

Refactor common code which disables and releases HW interrupts, deletes
NAPI objects, into a new bnx2x_stop_nic() function.

Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
---
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 28 +++++++++++--------
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  1 +
 .../net/ethernet/broadcom/bnx2x/bnx2x_main.c  | 26 +++--------------
 .../net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 12 ++------
 4 files changed, 24 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index e9c1e1bb5580..e34aff5fb782 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3097,17 +3097,8 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
 		if (!CHIP_IS_E1x(bp))
 			bnx2x_pf_disable(bp);
 
-		if (!bp->nic_stopped) {
-			/* Disable HW interrupts, NAPI */
-			bnx2x_netif_stop(bp, 1);
-			/* Delete all NAPI objects */
-			bnx2x_del_all_napi(bp);
-			if (CNIC_LOADED(bp))
-				bnx2x_del_all_napi_cnic(bp);
-			/* Release IRQs */
-			bnx2x_free_irq(bp);
-			bp->nic_stopped = true;
-		}
+		/* Disable HW interrupts, delete NAPIs, Release IRQs */
+		bnx2x_stop_nic(bp, 1);
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, false);
@@ -5139,3 +5130,18 @@ void bnx2x_schedule_sp_rtnl(struct bnx2x *bp, enum sp_rtnl_flag flag,
 	   flag);
 	schedule_delayed_work(&bp->sp_rtnl_task, 0);
 }
+
+void bnx2x_stop_nic(struct bnx2x *bp, int disable_hw)
+{
+	if (!bp->nic_stopped) {
+		/* Disable HW interrupts, NAPI */
+		bnx2x_netif_stop(bp, disable_hw);
+		/* Delete all NAPI objects */
+		bnx2x_del_all_napi(bp);
+		if (CNIC_LOADED(bp))
+			bnx2x_del_all_napi_cnic(bp);
+		/* Release IRQs */
+		bnx2x_free_irq(bp);
+		bp->nic_stopped = true;
+	}
+}
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index d8b1824c334d..0ad879a5af95 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -553,6 +553,7 @@ void bnx2x_free_skbs(struct bnx2x *bp);
 void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw);
 void bnx2x_netif_start(struct bnx2x *bp);
 int bnx2x_load_cnic(struct bnx2x *bp);
+void bnx2x_stop_nic(struct bnx2x *bp, int disable_hw);
 
 /**
  * bnx2x_enable_msix - set msix configuration.
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 0d8e61c63c7c..13d2a7761c24 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -9474,18 +9474,8 @@ void bnx2x_chip_cleanup(struct bnx2x *bp, int unload_mode, bool keep_link)
 		}
 	}
 
-	if (!bp->nic_stopped) {
-		/* Disable HW interrupts, NAPI */
-		bnx2x_netif_stop(bp, 1);
-		/* Delete all NAPI objects */
-		bnx2x_del_all_napi(bp);
-		if (CNIC_LOADED(bp))
-			bnx2x_del_all_napi_cnic(bp);
-
-		/* Release IRQs */
-		bnx2x_free_irq(bp);
-		bp->nic_stopped = true;
-	}
+	/* Disable HW interrupts, delete NAPIs, Release IRQs */
+	bnx2x_stop_nic(bp, 1);
 
 	/* Reset the chip, unless PCI function is offline. If we reach this
 	 * point following a PCI error handling, it means device is really
@@ -14241,16 +14231,8 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
 		}
 		bnx2x_drain_tx_queues(bp);
 		bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
-		if (!bp->nic_stopped) {
-			bnx2x_netif_stop(bp, 1);
-			bnx2x_del_all_napi(bp);
-
-			if (CNIC_LOADED(bp))
-				bnx2x_del_all_napi_cnic(bp);
-
-			bnx2x_free_irq(bp);
-			bp->nic_stopped = true;
-		}
+		/* Disable HW interrupts, delete NAPIs, Release IRQs */
+		bnx2x_stop_nic(bp, 1);
 
 		/* Report UNLOAD_DONE to MCP */
 		bnx2x_send_unload_done(bp, true);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index 8946a931e87e..e92e82423096 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -529,16 +529,8 @@ void bnx2x_vfpf_close_vf(struct bnx2x *bp)
 	bnx2x_vfpf_finalize(bp, &req->first_tlv);
 
 free_irq:
-	if (!bp->nic_stopped) {
-		/* Disable HW interrupts, NAPI */
-		bnx2x_netif_stop(bp, 0);
-		/* Delete all NAPI objects */
-		bnx2x_del_all_napi(bp);
-
-		/* Release IRQs */
-		bnx2x_free_irq(bp);
-		bp->nic_stopped = true;
-	}
+	/* Disable HW interrupts, delete NAPIs, Release IRQs */
+	bnx2x_stop_nic(bp, 0);
 }
 
 static void bnx2x_leading_vfq_init(struct bnx2x *bp, struct bnx2x_virtf *vf,
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [Patch v6 3/4] bnx2x: Prevent access to a freed page in page_pool
  2023-08-18 16:14       ` [Patch v6 " Thinh Tran
  2023-08-18 16:14         ` [Patch v6 1/4] bnx2x: new flag for track HW resource Thinh Tran
  2023-08-18 16:14         ` [Patch v6 2/4] bnx2x: factor out common code to bnx2x_stop_nic() Thinh Tran
@ 2023-08-18 16:14         ` Thinh Tran
  2023-08-18 16:14         ` [Patch v6 4/4] bnx2x: prevent excessive debug information during a TX timeout Thinh Tran
  3 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-18 16:14 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Thinh Tran

Verify page pool allocations before freeing. 
Fixes: Fixes: 523224a3b3cd ("bnx2x, cnic, bnx2i: use new FW/HSI")

Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index 0ad879a5af95..ca3ab06206b3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -1016,6 +1016,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 {
 	int i;
 
+	if (!fp->page_pool.page)
+		return;
+
 	if (fp->mode == TPA_MODE_DISABLED)
 		return;
 
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [Patch v6 4/4] bnx2x: prevent excessive debug information during a TX timeout
  2023-08-18 16:14       ` [Patch v6 " Thinh Tran
                           ` (2 preceding siblings ...)
  2023-08-18 16:14         ` [Patch v6 3/4] bnx2x: Prevent access to a freed page in page_pool Thinh Tran
@ 2023-08-18 16:14         ` Thinh Tran
  3 siblings, 0 replies; 36+ messages in thread
From: Thinh Tran @ 2023-08-18 16:14 UTC (permalink / raw)
  To: kuba
  Cc: aelior, davem, edumazet, manishc, netdev, pabeni, skalluru,
	VENKATA.SAI.DUGGI, Thinh Tran

Reduce debug output when a TX timeout occurs to avoid overflowing the
system log buffer.

Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c  | 6 ++++++
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index e34aff5fb782..73546497d978 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -4983,6 +4983,12 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
+	/* Immediately indicate link as down */
+	bp->link_vars.link_up = 0;
+	bp->force_link_down = true;
+	netif_carrier_off(dev);
+	BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
+
 	/* We want the information of the dump logged,
 	 * but calling bnx2x_panic() would kill all chances of recovery.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 13d2a7761c24..236cd40d7381 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -10266,12 +10266,6 @@ static void bnx2x_sp_rtnl_task(struct work_struct *work)
 		bp->sp_rtnl_state = 0;
 		smp_mb();
 
-		/* Immediately indicate link as down */
-		bp->link_vars.link_up = 0;
-		bp->force_link_down = true;
-		netif_carrier_off(bp->dev);
-		BNX2X_ERR("Indicating link is down due to Tx-timeout\n");
-
 		bnx2x_nic_unload(bp, UNLOAD_NORMAL, true);
 		/* When ret value shows failure of allocation failure,
 		 * the nic is rebooted again. If open still fails, a error
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2023-08-18 16:15 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-25 20:00 [PATCH] bnx2x: Fix error recovering in switch configuration Thinh Tran
2022-08-27  1:44 ` Jakub Kicinski
2022-08-30 16:15   ` Thinh Tran
2022-09-16 19:51   ` [PATCH v2] " Thinh Tran
2022-09-19 13:53     ` [EXT] " Manish Chopra
2022-09-21 20:11       ` Thinh Tran
2022-09-28 21:33         ` Thinh Tran
2022-10-10 22:01           ` Thinh Tran
2022-10-17 11:14             ` Manish Chopra
2022-10-20 19:12               ` Thinh Tran
2023-07-19 22:02     ` [Patch v3] " Thinh Tran
2023-07-24 11:48       ` Simon Horman
2023-07-24 22:50       ` Jakub Kicinski
2023-07-27 13:29         ` Thinh Tran
2023-07-28 21:11     ` [PATCH v4] " Thinh Tran
2023-08-01  0:47       ` Jakub Kicinski
2023-08-01  8:30         ` Paolo Abeni
2023-08-07 21:15           ` Thinh Tran
2023-08-07 21:08         ` Thinh Tran
2023-08-07 21:24           ` Jakub Kicinski
2023-08-07 21:35           ` Jakub Kicinski
2023-08-11 20:15       ` [Patch v5 0/4] " Thinh Tran
2023-08-11 20:15         ` [Patch v5 1/4] bnx2x: new the bp->nic_stopped variable for checking NIC status Thinh Tran
2023-08-11 20:15         ` [Patch v5 2/4] bnx2x: factor out common code to bnx2x_stop_nic() Thinh Tran
2023-08-11 22:08           ` Jakub Kicinski
2023-08-11 20:15         ` [Patch v5 3/4] bnx2x: Prevent access to a freed page in page_pool Thinh Tran
2023-08-11 20:15         ` [Patch v5 4/4] bnx2x: prevent excessive debug information during a TX timeout Thinh Tran
2023-08-11 22:09         ` [Patch v5 0/4] bnx2x: Fix error recovering in switch configuration Jakub Kicinski
2023-08-18 13:24           ` Thinh Tran
2023-08-18 16:14       ` [Patch v6 " Thinh Tran
2023-08-18 16:14         ` [Patch v6 1/4] bnx2x: new flag for track HW resource Thinh Tran
2023-08-18 16:14         ` [Patch v6 2/4] bnx2x: factor out common code to bnx2x_stop_nic() Thinh Tran
2023-08-18 16:14         ` [Patch v6 3/4] bnx2x: Prevent access to a freed page in page_pool Thinh Tran
2023-08-18 16:14         ` [Patch v6 4/4] bnx2x: prevent excessive debug information during a TX timeout Thinh Tran
2022-08-30  9:19 ` [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration Manish Chopra
2022-08-30 21:01   ` Thinh Tran

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.