All of lore.kernel.org
 help / color / mirror / Atom feed
From: Gavin Shan <shangw@linux.vnet.ibm.com>
To: linuxppc-dev@lists.ozlabs.org
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Subject: [PATCH 14/31] powerpc/eeh: EEH core to handle special event
Date: Tue, 18 Jun 2013 16:33:38 +0800	[thread overview]
Message-ID: <1371544435-4943-15-git-send-email-shangw@linux.vnet.ibm.com> (raw)
In-Reply-To: <1371544435-4943-1-git-send-email-shangw@linux.vnet.ibm.com>

On the PowerNV platform, an EEH event caused by an interrupt doesn't
have a binding PE. The patch enables the EEH core to handle that special
event. To avoid disturbing the existing logic, eeh_handle_event() is
renamed to eeh_handle_normal_event() and eeh_handle_special_event() is
introduced. eeh_handle_event() now dispatches to those two functions
according to its input parameter. Besides, a new backend "next_error"
is added to eeh_ops, and it's expected to have the following
return values:

	4 - Dead IOC           3 - Dead PHB
	2 - Fenced PHB         1 - Frozen PE
	0 - No error found

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h   |    2 +
 arch/powerpc/kernel/eeh_driver.c |  127 ++++++++++++++++++++++++++++++++-----
 2 files changed, 111 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 0c0ac93..a0b11fb 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -53,6 +53,7 @@ struct device_node;
 
 #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
 #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
+#define EEH_PE_PHB_DEAD		(1 << 2)	/* Dead PHB		*/
 
 struct eeh_pe {
 	int type;			/* PE type: PHB/Bus/Device	*/
@@ -145,6 +146,7 @@ struct eeh_ops {
 	int (*configure_bridge)(struct eeh_pe *pe);
 	int (*read_config)(struct device_node *dn, int where, int size, u32 *val);
 	int (*write_config)(struct device_node *dn, int where, int size, u32 val);
+	int (*next_error)(struct eeh_pe **pe);
 };
 
 extern struct eeh_ops *eeh_ops;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 69102b1..0974e13 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -399,24 +399,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
  */
 #define MAX_WAIT_FOR_RECOVERY 150
 
-/**
- * eeh_handle_event - Reset a PCI device after hard lockup.
- * @pe: EEH PE
- *
- * While PHB detects address or data parity errors on particular PCI
- * slot, the associated PE will be frozen. Besides, DMA's occurring
- * to wild addresses (which usually happen due to bugs in device
- * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
- * #PERR or other misc PCI-related errors also can trigger EEH errors.
- *
- * Recovery process consists of unplugging the device driver (which
- * generated hotplug events to userspace), then issuing a PCI #RST to
- * the device, then reconfiguring the PCI config space for all bridges
- * & devices under this slot, and then finally restarting the device
- * drivers (which cause a second set of hotplug events to go out to
- * userspace).
- */
-void eeh_handle_event(struct eeh_pe *pe)
+static void eeh_handle_normal_event(struct eeh_pe *pe)
 {
 	struct pci_bus *frozen_bus;
 	int rc = 0;
@@ -555,3 +538,111 @@ perm_error:
 		pcibios_remove_pci_devices(frozen_bus);
 }
 
+/**
+ * eeh_handle_special_event - Handle an EEH event that has no binding PE
+ *
+ * The error comes from eeh_ops->next_error(), which is only supported by
+ * PowerNV platform for now. So its return values aren't enumerated:
+ *
+ * 4 - Dead IOC           3 - Dead PHB
+ * 2 - Fenced PHB         1 - Frozen PE
+ * 0 - No error found
+ */
+static void eeh_handle_special_event(void)
+{
+	struct eeh_pe *pe = NULL, *phb_pe;
+	struct pci_bus *bus;
+	struct pci_controller *hose, *tmp;
+	unsigned long flags;
+	int rc;
+
+	rc = eeh_ops->next_error(&pe);
+	if (rc <= 0)
+		return;
+
+	switch (rc) {
+	case 4:
+		/* Mark all PHBs in dead state */
+		eeh_serialize_lock(&flags);
+		list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+			phb_pe = eeh_phb_pe_get(hose);
+			if (!phb_pe)
+				continue;
+
+			eeh_pe_state_mark(phb_pe,
+				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+		}
+		eeh_serialize_unlock(flags);
+
+		/* Purge all events */
+		eeh_remove_event(NULL);
+		break;
+	case 3:
+	case 2:
+	case 1:
+		/* Mark the PE in fenced state */
+		eeh_serialize_lock(&flags);
+		if (rc == 3)
+			eeh_pe_state_mark(pe,
+				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+		else
+			eeh_pe_state_mark(pe,
+				EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+		eeh_serialize_unlock(flags);
+
+		/* Purge all events of the PHB */
+		eeh_remove_event(pe);
+		break;
+	default:
+		pr_err("%s: Invalid value %d from next_error()\n",
+		       __func__, rc);
+		return;
+	}
+
+	/*
+	 * Fenced PHB and frozen PE are handled as normal events. For
+	 * dead PHB and IOC, the affected PHBs have to be removed.
+	 */
+	if (rc == 2 || rc == 1) {
+		eeh_handle_normal_event(pe);
+	} else {
+		list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+			phb_pe = eeh_phb_pe_get(hose);
+			if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
+				continue;
+
+			bus = eeh_pe_bus_get(phb_pe);
+			/*
+			 * Notify the devices of this dead PHB that they're
+			 * about to go down. "pe" might be NULL for dead IOC.
+			 */
+			eeh_pe_dev_traverse(phb_pe, eeh_report_failure, NULL);
+			pcibios_remove_pci_devices(bus);
+		}
+	}
+}
+
+/**
+ * eeh_handle_event - Dispatch an EEH event to the matching handler
+ * @pe: EEH PE hit by the error, or NULL if the event has no binding PE
+ *
+ * Events with a PE are normal events, those without are special ones.
+ *
+ * A PE gets frozen when the PHB detects address or data parity errors
+ * on a PCI slot. DMA to wild addresses (usually caused by bugs in
+ * device drivers or PCI adapter firmware), #SERR, #PERR and other
+ * misc PCI-related errors can trigger EEH errors as well.
+ *
+ * Recovery unplugs the device driver (which generates hotplug events
+ * to userspace), issues a PCI #RST to the device, reconfigures the
+ * PCI config space of all bridges & devices under the slot and at
+ * last restarts the device drivers (which sends a second set of
+ * hotplug events to userspace).
+ */
+void eeh_handle_event(struct eeh_pe *pe)
+{
+	if (!pe)
+		eeh_handle_special_event();
+	else
+		eeh_handle_normal_event(pe);
+}
-- 
1.7.5.4

  parent reply	other threads:[~2013-06-18  8:34 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-06-18  8:33 [PATCH v5 00/31] EEH Support for PowerNV platform Gavin Shan
2013-06-18  8:33 ` [PATCH 01/31] powerpc/eeh: Move common part to kernel directory Gavin Shan
2013-06-19  3:58   ` Michael Neuling
2013-06-19  6:11     ` Gavin Shan
2013-06-19  6:18       ` Gavin Shan
2013-06-19  7:29   ` Gavin Shan
2013-06-18  8:33 ` [PATCH 02/31] powerpc/eeh: Cleanup for EEH core Gavin Shan
2013-06-19  6:37   ` Gavin Shan
2013-06-18  8:33 ` [PATCH 03/31] powerpc/eeh: Make eeh_phb_pe_get() public Gavin Shan
2013-06-18  8:33 ` [PATCH 04/31] powerpc/eeh: Make eeh_pe_get() public Gavin Shan
2013-06-18  8:33 ` [PATCH 05/31] powerpc/eeh: Trace PCI bus from PE Gavin Shan
2013-06-19  7:21   ` Mike Qiu
2013-06-19  8:48     ` Gavin Shan
2013-06-19 10:20   ` Gavin Shan
2013-06-18  8:33 ` [PATCH 06/31] powerpc/eeh: Make eeh_init() public Gavin Shan
2013-06-18  8:33 ` [PATCH 07/31] powerpc/eeh: EEH post initialization operation Gavin Shan
2013-06-18  8:33 ` [PATCH 08/31] powerpc/eeh: Refactor eeh_reset_pe_once() Gavin Shan
2013-06-18  8:33 ` [PATCH 09/31] powerpc/eeh: Delay EEH probe during hotplug Gavin Shan
2013-06-18  8:33 ` [PATCH 10/31] powerpc/eeh: Single kthread to handle events Gavin Shan
2013-06-18  8:33 ` [PATCH 11/31] powerpc/eeh: Trace time on first error for PE Gavin Shan
2013-06-18  8:33 ` [PATCH 12/31] powerpc/eeh: Allow to purge EEH events Gavin Shan
2013-06-18  8:33 ` [PATCH 13/31] powerpc/eeh: Export confirm_error_lock Gavin Shan
2013-06-18  8:33 ` Gavin Shan [this message]
2013-06-19  6:19   ` [PATCH 14/31] powerpc/eeh: EEH core to handle special event Gavin Shan
2013-06-18  8:33 ` [PATCH 15/31] powerpc/eeh: Sync OPAL API with firmware Gavin Shan
2013-06-18  8:33 ` [PATCH 16/31] powerpc/eeh: EEH backend for P7IOC Gavin Shan
2013-06-18  8:33 ` [PATCH 17/31] powerpc/eeh: I/O chip post initialization Gavin Shan
2013-06-18  8:33 ` [PATCH 18/31] powerpc/eeh: I/O chip EEH enable option Gavin Shan
2013-06-18  8:33 ` [PATCH 19/31] powerpc/eeh: I/O chip EEH state retrieval Gavin Shan
2013-06-18  8:33 ` [PATCH 20/31] powerpc/eeh: I/O chip PE reset Gavin Shan
2013-06-18  8:33 ` [PATCH 21/31] powerpc/eeh: I/O chip PE log and bridge setup Gavin Shan
2013-06-18  8:33 ` [PATCH 22/31] powerpc/eeh: I/O chip next error Gavin Shan
2013-06-18  8:33 ` [PATCH 23/31] powerpc/eeh: PowerNV EEH backends Gavin Shan
2013-06-18  8:33 ` [PATCH 24/31] powerpc/eeh: Initialization for PowerNV Gavin Shan
2013-06-18  8:33 ` [PATCH 25/31] powerpc/eeh: Enable EEH check for config access Gavin Shan
2013-06-18  8:33 ` [PATCH 26/31] powerpc/eeh: Allow to check fenced PHB proactively Gavin Shan
2013-06-18  8:33 ` [PATCH 27/31] powernv/opal: Notifier for OPAL events Gavin Shan
2013-06-18  8:33 ` [PATCH 28/31] powernv/opal: Disable OPAL notifier upon poweroff Gavin Shan
2013-06-18  8:33 ` [PATCH 29/31] powerpc/eeh: Register OPAL notifier for PCI error Gavin Shan
2013-06-18  8:33 ` [PATCH 30/31] powerpc/powernv: Debugfs directory for PHB Gavin Shan
2013-06-18  8:33 ` [PATCH 31/31] powerpc/eeh: Debugfs for error injection Gavin Shan
2013-06-18  8:41 ` [PATCH v5 00/31] EEH Support for PowerNV platform Gavin Shan
2013-06-20  5:20 [PATCH v6 " Gavin Shan
2013-06-20  5:21 ` [PATCH 14/31] powerpc/eeh: EEH core to handle special event Gavin Shan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1371544435-4943-15-git-send-email-shangw@linux.vnet.ibm.com \
    --to=shangw@linux.vnet.ibm.com \
    --cc=linuxppc-dev@lists.ozlabs.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.