netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] net/cxgb4: Avoid disabling PCI device for towice
@ 2014-01-20  3:05 Gavin Shan
  2014-01-20  3:05 ` [PATCH 2/2] net/cxgb4: Don't retrieve stats during recovery Gavin Shan
  0 siblings, 1 reply; 6+ messages in thread
From: Gavin Shan @ 2014-01-20  3:05 UTC (permalink / raw)
  To: netdev; +Cc: dm, Gavin Shan

If we have EEH error happens to the adapter and we have to remove
it from the system for some reasons (e.g. more than 5 EEH errors
detected from the device in last hour), the adapter will be disabled
for towice separately by eeh_err_detected() and remove_one(), which
will incur following unexpected backtrace. The patch tries to avoid
it.

WARNING: at drivers/pci/pci.c:1431
CPU: 12 PID: 121 Comm: eehd Not tainted 3.13.0-rc7+ #1
task: c0000001823a3780 ti: c00000018240c000 task.ti: c00000018240c000
NIP: c0000000003c1e40 LR: c0000000003c1e3c CTR: 0000000001764c5c
REGS: c00000018240f470 TRAP: 0700   Not tainted  (3.13.0-rc7+)
MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI>  CR: 28000024  XER: 00000004
CFAR: c000000000706528 SOFTE: 1
GPR00: c0000000003c1e3c c00000018240f6f0 c0000000010fe1f8 0000000000000035
GPR04: 0000000000000000 0000000000000000 00000000003ae509 0000000000000000
GPR08: 000000000000346f 0000000000000000 0000000000000000 0000000000003fef
GPR12: 0000000028000022 c00000000ec93000 c0000000000c11b0 c000000184ac3e40
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR24: 0000000000000000 c0000000009398d8 c00000000101f9c0 c0000001860ae000
GPR28: c000000182ba0000 00000000000001f0 c0000001860ae6f8 c0000001860ae000
NIP [c0000000003c1e40] .pci_disable_device+0xd0/0xf0
LR [c0000000003c1e3c] .pci_disable_device+0xcc/0xf0
Call Trace:
[c0000000003c1e3c] .pci_disable_device+0xcc/0xf0 (unreliable)
[d0000000073881c4] .remove_one+0x174/0x320 [cxgb4]
[c0000000003c57e0] .pci_device_remove+0x60/0x100
[c00000000046396c] .__device_release_driver+0x9c/0x120
[c000000000463a20] .device_release_driver+0x30/0x60
[c0000000003bcdb4] .pci_stop_bus_device+0x94/0xd0
[c0000000003bcf48] .pci_stop_and_remove_bus_device+0x18/0x30
[c00000000003f548] .pcibios_remove_pci_devices+0xa8/0x140
[c000000000035c00] .eeh_handle_normal_event+0xa0/0x3c0
[c000000000035f50] .eeh_handle_event+0x30/0x2b0
[c0000000000362c4] .eeh_event_handler+0xf4/0x1b0
[c0000000000c12b8] .kthread+0x108/0x130
[c00000000000a168] .ret_from_kernel_thread+0x5c/0x74

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      |    5 +++--
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |   23 ++++++++++++++++++-----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 56e0415..b4ca0a1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -387,8 +387,9 @@ struct work_struct;
 
 enum {                                 /* adapter flags */
 	FULL_INIT_DONE     = (1 << 0),
-	USING_MSI          = (1 << 1),
-	USING_MSIX         = (1 << 2),
+	DEV_ENABLED        = (1 << 1),
+	USING_MSI          = (1 << 2),
+	USING_MSIX         = (1 << 3),
 	FW_OK              = (1 << 4),
 	RSS_TNLALLLOOKUP   = (1 << 5),
 	USING_SOFT_PARAMS  = (1 << 6),
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index fff02ed..c8eafbf 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5505,7 +5505,10 @@ static pci_ers_result_t eeh_err_detected(struct pci_dev *pdev,
 	if (adap->flags & FULL_INIT_DONE)
 		cxgb_down(adap);
 	rtnl_unlock();
-	pci_disable_device(pdev);
+	if ((adap->flags & DEV_ENABLED)) {
+		pci_disable_device(pdev);
+		adap->flags &= ~DEV_ENABLED;
+	}
 out:	return state == pci_channel_io_perm_failure ?
 		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
 }
@@ -5522,9 +5525,13 @@ static pci_ers_result_t eeh_slot_reset(struct pci_dev *pdev)
 		return PCI_ERS_RESULT_RECOVERED;
 	}
 
-	if (pci_enable_device(pdev)) {
-		dev_err(&pdev->dev, "cannot reenable PCI device after reset\n");
-		return PCI_ERS_RESULT_DISCONNECT;
+	if (!(adap->flags & DEV_ENABLED)) {
+		if (pci_enable_device(pdev)) {
+			dev_err(&pdev->dev, "Cannot reenable PCI "
+					    "device after reset\n");
+			return PCI_ERS_RESULT_DISCONNECT;
+		}
+		adap->flags |= DEV_ENABLED;
 	}
 
 	pci_set_master(pdev);
@@ -5910,6 +5917,9 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto out_disable_device;
 	}
 
+	/* PCI device has been enabled */
+	adapter->flags |= DEV_ENABLED;
+
 	adapter->regs = pci_ioremap_bar(pdev, 0);
 	if (!adapter->regs) {
 		dev_err(&pdev->dev, "cannot map device registers\n");
@@ -6145,7 +6155,10 @@ static void remove_one(struct pci_dev *pdev)
 			iounmap(adapter->bar2);
 		kfree(adapter);
 		pci_disable_pcie_error_reporting(pdev);
-		pci_disable_device(pdev);
+		if ((adapter->flags & DEV_ENABLED)) {
+			pci_disable_device(pdev);
+			adapter->flags &= ~DEV_ENABLED;
+		}
 		pci_release_regions(pdev);
 	} else
 		pci_release_regions(pdev);
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/2] net/cxgb4: Don't retrieve stats during recovery
  2014-01-20  3:05 [PATCH 1/2] net/cxgb4: Avoid disabling PCI device for towice Gavin Shan
@ 2014-01-20  3:05 ` Gavin Shan
  2014-01-20  3:49   ` Ben Hutchings
                     ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Gavin Shan @ 2014-01-20  3:05 UTC (permalink / raw)
  To: netdev; +Cc: dm, Gavin Shan

We possiblly retrieve the adapter's statistics during EEH recovery
and that should be disallowed. Otherwise, it would possibly incur
replicate EEH error and EEH recovery is going to fail eventually.
The patch checks if the PCI device is off-line before statistic
retrieval.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |   11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index c8eafbf..b0e72fb 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -4288,6 +4288,17 @@ static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
 	struct port_info *p = netdev_priv(dev);
 	struct adapter *adapter = p->adapter;
 
+	/*
+	 * We possibly retrieve the statistics while the PCI
+	 * device is off-line. That would cause the recovery
+	 * on off-lined PCI device going to fail. So it's
+	 * reasonable to block it during the recovery period.
+	 */
+	if (pci_channel_offline(adapter->pdev)) {
+		memset(ns, 0, sizeof(*ns));
+		return ns;
+	}
+
 	spin_lock(&adapter->stats_lock);
 	t4_get_port_stats(adapter, p->tx_chan, &stats);
 	spin_unlock(&adapter->stats_lock);
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 2/2] net/cxgb4: Don't retrieve stats during recovery
  2014-01-20  3:05 ` [PATCH 2/2] net/cxgb4: Don't retrieve stats during recovery Gavin Shan
@ 2014-01-20  3:49   ` Ben Hutchings
  2014-01-20 14:35   ` Sergei Shtylyov
  2014-01-20 22:05   ` Dimitris Michailidis
  2 siblings, 0 replies; 6+ messages in thread
From: Ben Hutchings @ 2014-01-20  3:49 UTC (permalink / raw)
  To: Gavin Shan; +Cc: netdev, dm

[-- Attachment #1: Type: text/plain, Size: 1754 bytes --]

On Mon, 2014-01-20 at 11:05 +0800, Gavin Shan wrote:
> We possiblly retrieve the adapter's statistics during EEH recovery
> and that should be disallowed. Otherwise, it would possibly incur
> replicate EEH error and EEH recovery is going to fail eventually.
> The patch checks if the PCI device is off-line before statistic
> retrieval.
> 
> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> ---
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |   11 +++++++++++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> index c8eafbf..b0e72fb 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> @@ -4288,6 +4288,17 @@ static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
>  	struct port_info *p = netdev_priv(dev);
>  	struct adapter *adapter = p->adapter;
>  
> +	/*
> +	 * We possibly retrieve the statistics while the PCI
> +	 * device is off-line. That would cause the recovery
> +	 * on off-lined PCI device going to fail. So it's
> +	 * reasonable to block it during the recovery period.
> +	 */
> +	if (pci_channel_offline(adapter->pdev)) {
> +		memset(ns, 0, sizeof(*ns));
> +		return ns;
> +	}

The buffer is already zero-initialised so there's no need for this
memset().

>         spin_lock(&adapter->stats_lock);
>         t4_get_port_stats(adapter, p->tx_chan, &stats);
>         spin_unlock(&adapter->stats_lock);

Is there anything to stop this running just after pci_channel_offline()
becomes true?

Ben.

-- 
Ben Hutchings
One of the nice things about standards is that there are so many of them.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2/2] net/cxgb4: Don't retrieve stats during recovery
  2014-01-20  3:05 ` [PATCH 2/2] net/cxgb4: Don't retrieve stats during recovery Gavin Shan
  2014-01-20  3:49   ` Ben Hutchings
@ 2014-01-20 14:35   ` Sergei Shtylyov
  2014-01-20 22:05   ` Dimitris Michailidis
  2 siblings, 0 replies; 6+ messages in thread
From: Sergei Shtylyov @ 2014-01-20 14:35 UTC (permalink / raw)
  To: Gavin Shan, netdev; +Cc: dm

Hello.

On 20-01-2014 7:05, Gavin Shan wrote:

> We possiblly retrieve the adapter's statistics during EEH recovery

    Only "possibly".

> and that should be disallowed. Otherwise, it would possibly incur
> replicate EEH error and EEH recovery is going to fail eventually.
> The patch checks if the PCI device is off-line before statistic
> retrieval.

> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> ---
>   drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |   11 +++++++++++
>   1 file changed, 11 insertions(+)
>
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> index c8eafbf..b0e72fb 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> @@ -4288,6 +4288,17 @@ static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
>   	struct port_info *p = netdev_priv(dev);
>   	struct adapter *adapter = p->adapter;
>
> +	/*
> +	 * We possibly retrieve the statistics while the PCI
> +	 * device is off-line. That would cause the recovery
> +	 * on off-lined PCI device going to fail. So it's
> +	 * reasonable to block it during the recovery period.
> +	 */

    The multi-line comment style in the networking code is somewhat special:

/* bla
  * bla
  */

WBR, Sergei

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2/2] net/cxgb4: Don't retrieve stats during recovery
  2014-01-20  3:05 ` [PATCH 2/2] net/cxgb4: Don't retrieve stats during recovery Gavin Shan
  2014-01-20  3:49   ` Ben Hutchings
  2014-01-20 14:35   ` Sergei Shtylyov
@ 2014-01-20 22:05   ` Dimitris Michailidis
       [not found]     ` <20140121023404.GA6575@shangw.(null)>
  2 siblings, 1 reply; 6+ messages in thread
From: Dimitris Michailidis @ 2014-01-20 22:05 UTC (permalink / raw)
  To: Gavin Shan, netdev

On 01/19/2014 07:05 PM, Gavin Shan wrote:
> We possiblly retrieve the adapter's statistics during EEH recovery
> and that should be disallowed. Otherwise, it would possibly incur
> replicate EEH error and EEH recovery is going to fail eventually.
> The patch checks if the PCI device is off-line before statistic
> retrieval.

The net_devices are detached during EEH so I think netif_device_present 
is a better check than pci_channel_offline.  I am not sure such a test 
should be left to each driver though.  If you do end up putting it in 
the driver it needs better synchronization with the EEH handlers as Ben 
mentioned.

>
> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> ---
>   drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |   11 +++++++++++
>   1 file changed, 11 insertions(+)
>
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> index c8eafbf..b0e72fb 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> @@ -4288,6 +4288,17 @@ static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
>   	struct port_info *p = netdev_priv(dev);
>   	struct adapter *adapter = p->adapter;
>
> +	/*
> +	 * We possibly retrieve the statistics while the PCI
> +	 * device is off-line. That would cause the recovery
> +	 * on off-lined PCI device going to fail. So it's
> +	 * reasonable to block it during the recovery period.
> +	 */
> +	if (pci_channel_offline(adapter->pdev)) {
> +		memset(ns, 0, sizeof(*ns));
> +		return ns;
> +	}
> +
>   	spin_lock(&adapter->stats_lock);
>   	t4_get_port_stats(adapter, p->tx_chan, &stats);
>   	spin_unlock(&adapter->stats_lock);
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH 2/2] net/cxgb4: Don't retrieve stats during recovery
       [not found]       ` <20140123013134.GA5947@shangw.(null)>
@ 2014-01-23  2:17         ` Dimitrios Michailidis
  0 siblings, 0 replies; 6+ messages in thread
From: Dimitrios Michailidis @ 2014-01-23  2:17 UTC (permalink / raw)
  To: Gavin Shan; +Cc: netdev

On Wed, Jan 22, 2014 at 5:32 PM, Gavin Shan wrote:
> >On Mon, Jan 20, 2014 at 02:05:18PM -0800, Dimitris Michailidis wrote:
> >>On 01/19/2014 07:05 PM, Gavin Shan wrote:
> >>>We possiblly retrieve the adapter's statistics during EEH recovery
> >>>and that should be disallowed. Otherwise, it would possibly incur
> >>>replicate EEH error and EEH recovery is going to fail eventually.
> >>>The patch checks if the PCI device is off-line before statistic
> >>>retrieval.
> >>
> >>The net_devices are detached during EEH so I think
> >>netif_device_present is a better check than pci_channel_offline.  I
> >>am not sure such a test should be left to each driver though.  If you
> >>do end up putting it in the driver it needs better synchronization
> >>with the EEH handlers as Ben mentioned.
> >>
> >
> >Ok. I agree that netif_device_present() is better since the statistics
> >is more net_device specific (other than pci_dev). And it's more accurate
> >to use netif_device_present() based on what we have:
> >
> >	pci_channel_offline()	----------+
> >	eeh_err_detected()		  |
> >		!netif_device_present() --+-----+
> >	<EEH recovery>			  |	|
> >	!pci_channel_offline()	----------+	|
> >	eeh_slot_reset()			|
> >	eeh_resume()				|
> >		netif_device_present()	--------+
> >
> >For the syncrhonization, I think we can just reuse the "adap->stats_lock".
> >Something like this:
> >
> >static pci_ers_result_t eeh_err_detected(struct pci_dev *pdev,
> >					 pci_channel_state_t state)
> >{
> >	:
> >	spin_lock(&adap->stats_lock);
> >        for_each_port(adap, i) {
> >                struct net_device *dev = adap->port[i];
> >
> >                netif_device_detach(dev);
> >                netif_carrier_off(dev);
> >        }
> >	spin_unlock(&adap->stats_lock);
> >	:
> >}
> >
> >static void eeh_resume(struct pci_dev *pdev)
> >{
> >	:
> >	spin_lock(&adap->stats_lock);
> >        for_each_port(adap, i) {
> >                struct net_device *dev = adap->port[i];
> >
> >                if (netif_running(dev)) {
> >                        link_start(dev);
> >                        cxgb_set_rxmode(dev);
> >                }
> >                netif_device_attach(dev);
> >        }
> >	spin_unlock(&adap->stats_lock);
> >	:
> >}

Both link_start and cxgb_set_rxmode here issue blocking commands to FW, these two cannot be under a spinlock.  In fact I don't think you need locking here at all.  The devices can be attached asynchronously relative to the stats code, we don't care if it races.  On detach it matters but not here.

> >static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
> >                                                struct rtnl_link_stats64 *ns)
> >{
> >	:
> >        spin_lock(&adapter->stats_lock);
> >	if (!netif_device_present(dev)) {
> >		spin_unlock(&adapter->stats_lock);
> >		return ns;
> >	}
> >        t4_get_port_stats(adapter, p->tx_chan, &stats);
> >        spin_unlock(&adapter->stats_lock);
> >	:
> >}
> >
> 
> Dimitris, Any more comments on this? :-)
 
Just the above.  Thanks.

> If you think it's fine, I'm going to change it like this and send
> out "v2".
> 
> Thanks,
> Gavin
> 
> >>>
> >>>Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> >>>---
> >>>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |   11 +++++++++++
> >>>  1 file changed, 11 insertions(+)
> >>>
> >>>diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> >>>index c8eafbf..b0e72fb 100644
> >>>--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> >>>+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> >>>@@ -4288,6 +4288,17 @@ static struct rtnl_link_stats64
> *cxgb_get_stats(struct net_device *dev,
> >>>  	struct port_info *p = netdev_priv(dev);
> >>>  	struct adapter *adapter = p->adapter;
> >>>
> >>>+	/*
> >>>+	 * We possibly retrieve the statistics while the PCI
> >>>+	 * device is off-line. That would cause the recovery
> >>>+	 * on off-lined PCI device going to fail. So it's
> >>>+	 * reasonable to block it during the recovery period.
> >>>+	 */
> >>>+	if (pci_channel_offline(adapter->pdev)) {
> >>>+		memset(ns, 0, sizeof(*ns));
> >>>+		return ns;
> >>>+	}
> >>>+
> >>>  	spin_lock(&adapter->stats_lock);
> >>>  	t4_get_port_stats(adapter, p->tx_chan, &stats);
> >>>  	spin_unlock(&adapter->stats_lock);
> >>>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2014-01-23  2:42 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-20  3:05 [PATCH 1/2] net/cxgb4: Avoid disabling PCI device for towice Gavin Shan
2014-01-20  3:05 ` [PATCH 2/2] net/cxgb4: Don't retrieve stats during recovery Gavin Shan
2014-01-20  3:49   ` Ben Hutchings
2014-01-20 14:35   ` Sergei Shtylyov
2014-01-20 22:05   ` Dimitris Michailidis
     [not found]     ` <20140121023404.GA6575@shangw.(null)>
     [not found]       ` <20140123013134.GA5947@shangw.(null)>
2014-01-23  2:17         ` Dimitrios Michailidis

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).