linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] ppc/eeh: lock module while handling EEH event
@ 2012-09-17 14:34 Gavin Shan
  2012-09-17 14:34 ` [PATCH 2/2] ppc/eeh: fix crash on converting OF node to edev Gavin Shan
  0 siblings, 1 reply; 2+ messages in thread
From: Gavin Shan @ 2012-09-17 14:34 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan, stable

The EEH core is talking with the PCI device driver to determine the
action (purely reset, or PCI device removal). During the period, the
driver might be unloaded and in turn causes kernel crash as follows:

EEH: Detected PCI bus error on PHB#4-PE#10000
EEH: This PCI device has failed 3 times in the last hour
lpfc 0004:01:00.0: 0:2710 PCI channel disable preparing for reset
Unable to handle kernel paging request for data at address 0x00000490
Faulting instruction address: 0xd00000000e682c90
cpu 0x1: Vector: 300 (Data Access) at [c000000fc75ffa20]
    pc: d00000000e682c90: .lpfc_io_error_detected+0x30/0x240 [lpfc]
    lr: d00000000e682c8c: .lpfc_io_error_detected+0x2c/0x240 [lpfc]
    sp: c000000fc75ffca0
   msr: 8000000000009032
   dar: 490
 dsisr: 40000000
  current = 0xc000000fc79b88b0
  paca    = 0xc00000000edb0380	 softe: 0	 irq_happened: 0x00
    pid   = 3386, comm = eehd
enter ? for help
[c000000fc75ffca0] c000000fc75ffd30 (unreliable)
[c000000fc75ffd30] c00000000004fd3c .eeh_report_error+0x7c/0xf0
[c000000fc75ffdc0] c00000000004ee00 .eeh_pe_dev_traverse+0xa0/0x180
[c000000fc75ffe70] c00000000004ffd8 .eeh_handle_event+0x68/0x300
[c000000fc75fff00] c0000000000503a0 .eeh_event_handler+0x130/0x1a0
[c000000fc75fff90] c000000000020138 .kernel_thread+0x54/0x70
1:mon>

The patch increases the reference of the corresponding driver modules
while EEH core does the negotiation with PCI device driver so that the
corresponding driver modules can't be unloaded during the period and
we're safe to refer the callbacks. 

Cc: stable@vger.kernel.org
Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/eeh_driver.c |   91 ++++++++++++++++++++------
 1 files changed, 70 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index 37c2cf7..a3fefb6 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -25,6 +25,7 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
+#include <linux/module.h>
 #include <linux/pci.h>
 #include <asm/eeh.h>
 #include <asm/eeh_event.h>
@@ -47,6 +48,41 @@ static inline const char *eeh_pcid_name(struct pci_dev *pdev)
 	return "";
 }
 
+/**
+ * eeh_pcid_get - Get the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is used to retrieve the PCI device driver for
+ * the indicated PCI device. Besides, we will increase the reference
+ * of the PCI device driver to prevent that being unloaded on
+ * the fly. Otherwise, kernel crash would be seen.
+ */
+static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
+{
+	if (!pdev || !pdev->driver)
+		return NULL;
+
+	if (!try_module_get(pdev->driver->driver.owner))
+		return NULL;
+
+	return pdev->driver;
+}
+
+/**
+ * eeh_pcid_put - Dereference on the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is called to do dereference on the PCI device
+ * driver of the indicated PCI device.
+ */
+static inline void eeh_pcid_put(struct pci_dev *pdev)
+{
+	if (!pdev || !pdev->driver)
+		return;
+
+	module_put(pdev->driver->driver.owner);
+}
+
 #if 0
 static void print_device_node_tree(struct pci_dn *pdn, int dent)
 {
@@ -128,23 +164,24 @@ static void *eeh_report_error(void *data, void *userdata)
 	struct eeh_dev *edev = (struct eeh_dev *)data;
 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
 	enum pci_ers_result rc, *res = userdata;
-	struct pci_driver *driver = dev->driver;
+	struct pci_driver *driver;
 
 	/* We might not have the associated PCI device,
 	 * then we should continue for next one.
 	 */
 	if (!dev) return NULL;
-
 	dev->error_state = pci_channel_io_frozen;
 
-	if (!driver)
-		return NULL;
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
 
 	eeh_disable_irq(dev);
 
 	if (!driver->err_handler ||
-	    !driver->err_handler->error_detected)
+	    !driver->err_handler->error_detected) {
+		eeh_pcid_put(dev);
 		return NULL;
+	}
 
 	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
 
@@ -152,6 +189,7 @@ static void *eeh_report_error(void *data, void *userdata)
 	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
 
+	eeh_pcid_put(dev);
 	return NULL;
 }
 
@@ -171,12 +209,14 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
 	enum pci_ers_result rc, *res = userdata;
 	struct pci_driver *driver;
 
-	if (!dev) return NULL;
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
 
-	if (!(driver = dev->driver) ||
-	    !driver->err_handler ||
-	    !driver->err_handler->mmio_enabled)
+	if (!driver->err_handler ||
+	    !driver->err_handler->mmio_enabled) {
+		eeh_pcid_put(dev);
 		return NULL;
+	}
 
 	rc = driver->err_handler->mmio_enabled(dev);
 
@@ -184,6 +224,7 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
 	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
 
+	eeh_pcid_put(dev);
 	return NULL;
 }
 
@@ -204,16 +245,19 @@ static void *eeh_report_reset(void *data, void *userdata)
 	enum pci_ers_result rc, *res = userdata;
 	struct pci_driver *driver;
 
-	if (!dev || !(driver = dev->driver))
-		return NULL;
-
+	if (!dev) return NULL;
 	dev->error_state = pci_channel_io_normal;
 
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
+
 	eeh_enable_irq(dev);
 
 	if (!driver->err_handler ||
-	    !driver->err_handler->slot_reset)
+	    !driver->err_handler->slot_reset) {
+		eeh_pcid_put(dev);
 		return NULL;
+	}
 
 	rc = driver->err_handler->slot_reset(dev);
 	if ((*res == PCI_ERS_RESULT_NONE) ||
@@ -221,6 +265,7 @@ static void *eeh_report_reset(void *data, void *userdata)
 	if (*res == PCI_ERS_RESULT_DISCONNECT &&
 	     rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 
+	eeh_pcid_put(dev);
 	return NULL;
 }
 
@@ -240,20 +285,22 @@ static void *eeh_report_resume(void *data, void *userdata)
 	struct pci_driver *driver;
 
 	if (!dev) return NULL;
-
 	dev->error_state = pci_channel_io_normal;
 
-	if (!(driver = dev->driver))
-		return NULL;
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
 
 	eeh_enable_irq(dev);
 
 	if (!driver->err_handler ||
-	    !driver->err_handler->resume)
+	    !driver->err_handler->resume) {
+		eeh_pcid_put(dev);
 		return NULL;
+	}
 
 	driver->err_handler->resume(dev);
 
+	eeh_pcid_put(dev);
 	return NULL;
 }
 
@@ -272,20 +319,22 @@ static void *eeh_report_failure(void *data, void *userdata)
 	struct pci_driver *driver;
 
 	if (!dev) return NULL;
-
 	dev->error_state = pci_channel_io_perm_failure;
 
-	if (!(driver = dev->driver))
-		return NULL;
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
 
 	eeh_disable_irq(dev);
 
 	if (!driver->err_handler ||
-	    !driver->err_handler->error_detected)
+	    !driver->err_handler->error_detected) {
+		eeh_pcid_put(dev);
 		return NULL;
+	}
 
 	driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
 
+	eeh_pcid_put(dev);
 	return NULL;
 }
 
-- 
1.7.5.4

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* [PATCH 2/2] ppc/eeh: fix crash on converting OF node to edev
  2012-09-17 14:34 [PATCH 1/2] ppc/eeh: lock module while handling EEH event Gavin Shan
@ 2012-09-17 14:34 ` Gavin Shan
  0 siblings, 0 replies; 2+ messages in thread
From: Gavin Shan @ 2012-09-17 14:34 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan, stable

The kernel crash was reported by Alexy. He was testing some feature
with private kernel, in which Alexy added some code in pci_pm_reset()
to read the CSR after writting it. The bug could be reproduced on
Fiber Channel card (Fibre Channel: Emulex Corporation Saturn-X:
LightPulse Fibre Channel Host Adapter (rev 03)) by the following
commands.

	# echo 1 > /sys/devices/pci0004:01/0004:01:00.0/reset
	# rmmod lpfc
	# modprobe lpfc

The history behind the test case is that those additional config
space reading operations in pci_pm_reset() would cause EEH error,
but we didn't detect EEH error until "modprobe lpfc". For the case,
all the PCI devices on PCI bus (0004:01) were removed and added after
PE reset. Then the EEH devices would be figured out again based on
the OF nodes. Unfortunately, there were some child OF nodes under
PCI device (0004:01:00.0), but they didn't have attached PCI_DN since
they're invisible from PCI domain. However, we were still trying to
convert OF node to EEH device without checking on the attached PCI_DN.
Eventually, it caused the kernel crash as follows:

Unable to handle kernel paging request for data at address 0x00000030
Faulting instruction address: 0xc00000000004d888
cpu 0x0: Vector: 300 (Data Access) at [c000000fc797b950]
    pc: c00000000004d888: .eeh_add_device_tree_early+0x78/0x140
    lr: c00000000004d880: .eeh_add_device_tree_early+0x70/0x140
    sp: c000000fc797bbd0
   msr: 8000000000009032
   dar: 30
 dsisr: 40000000
  current = 0xc000000fc78d9f70
  paca    = 0xc00000000edb0000   softe: 0        irq_happened: 0x00
    pid   = 2951, comm = eehd
enter ? for help
[c000000fc797bc50] c00000000004d848 .eeh_add_device_tree_early+0x38/0x140
[c000000fc797bcd0] c00000000004d848 .eeh_add_device_tree_early+0x38/0x140
[c000000fc797bd50] c000000000051b54 .pcibios_add_pci_devices+0x34/0x190
[c000000fc797bde0] c00000000004fb10 .eeh_reset_device+0x100/0x160
[c000000fc797be70] c0000000000502dc .eeh_handle_event+0x19c/0x300
[c000000fc797bf00] c000000000050570 .eeh_event_handler+0x130/0x1a0
[c000000fc797bf90] c000000000020138 .kernel_thread+0x54/0x70

The patch changes of_node_to_eeh_dev() and just returns NULL if the
passed OF node doesn't have attached PCI_DN.

Cc: stable@vger.kernel.org
Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pci-bridge.h |    8 ++++++++
 arch/powerpc/platforms/pseries/eeh.c  |    2 +-
 2 files changed, 9 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index a059cb9..025a130 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -182,6 +182,14 @@ static inline int pci_device_from_OF_node(struct device_node *np,
 #if defined(CONFIG_EEH)
 static inline struct eeh_dev *of_node_to_eeh_dev(struct device_node *dn)
 {
+	/*
+	 * For those OF nodes whose parent isn't PCI bridge, they
+	 * don't have PCI_DN actually. So we have to skip them for
+	 * any EEH operations.
+	 */
+	if (!dn || !PCI_DN(dn))
+		return NULL;
+
 	return PCI_DN(dn)->edev;
 }
 #else
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 43f6ed4..9a04322 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -728,7 +728,7 @@ static void eeh_add_device_early(struct device_node *dn)
 {
 	struct pci_controller *phb;
 
-	if (!dn || !of_node_to_eeh_dev(dn))
+	if (!of_node_to_eeh_dev(dn))
 		return;
 	phb = of_node_to_eeh_dev(dn)->phb;
 
-- 
1.7.5.4

^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2012-09-17 14:34 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-17 14:34 [PATCH 1/2] ppc/eeh: lock module while handling EEH event Gavin Shan
2012-09-17 14:34 ` [PATCH 2/2] ppc/eeh: fix crash on converting OF node to edev Gavin Shan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).