linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] powerpc/eeh: Clear frozen state for child PE
@ 2014-05-04 23:29 Gavin Shan
  2014-05-04 23:29 ` [PATCH 2/3] powerpc/eeh: Report frozen parent PE prior to " Gavin Shan
  2014-05-04 23:29 ` [PATCH 3/3] powerpc/powernv: Don't escalate non-existing frozen PE Gavin Shan
  0 siblings, 2 replies; 3+ messages in thread
From: Gavin Shan @ 2014-05-04 23:29 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev, Gavin Shan

Since commit cb523e09 ("powerpc/eeh: Avoid I/O access during PE
reset"), the PE is kept as frozen state on hardware level until
the PE reset is done completely. After that, we explicitly clear
the frozen state of the affected PE. However, there might have
frozen child PEs of the affected PE and we also need clear their
frozen state as well. Otherwise, the recovery is going to fail.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/eeh_driver.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 7100a5b..8bb40e7 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -447,8 +447,9 @@ static void *eeh_pe_detach_dev(void *data, void *userdata)
  * PE reset (for 3 times), we try to clear the frozen state
  * for 3 times as well.
  */
-static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
+static void *__eeh_clear_pe_frozen_state(void *data, void *flag)
 {
+	struct eeh_pe *pe = (struct eeh_pe *)data;
 	int i, rc;
 
 	for (i = 0; i < 3; i++) {
@@ -461,13 +462,24 @@ static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
 	}
 
 	/* The PE has been isolated, clear it */
-	if (rc)
+	if (rc) {
 		pr_warn("%s: Can't clear frozen PHB#%x-PE#%x (%d)\n",
 			__func__, pe->phb->global_number, pe->addr, rc);
-	else
+		return (void *)pe;
+	}
+
+	return NULL;
+}
+
+static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
+{
+	void *rc;
+
+	rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, NULL);
+	if (!rc)
 		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
 
-	return rc;
+	return rc ? -EIO : 0;
 }
 
 /**
-- 
1.8.3.2

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/3] powerpc/eeh: Report frozen parent PE prior to child PE
  2014-05-04 23:29 [PATCH 1/3] powerpc/eeh: Clear frozen state for child PE Gavin Shan
@ 2014-05-04 23:29 ` Gavin Shan
  2014-05-04 23:29 ` [PATCH 3/3] powerpc/powernv: Don't escalate non-existing frozen PE Gavin Shan
  1 sibling, 0 replies; 3+ messages in thread
From: Gavin Shan @ 2014-05-04 23:29 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev, Gavin Shan

When we have the corner case of frozen parent and child PE at the
same time, we have to handle the frozen parent PE prior to the
child. Without clearning the frozen state on parent PE, the child
PE can't be recovered successfully.

The patch searches the EEH PE hierarchy tree and returns the toppest
frozen PE to be handled. It ensures the frozen parent PE will be
handled prior to child PE.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/eeh.c                 | 27 ++++++++++++++++++++++++---
 arch/powerpc/platforms/powernv/eeh-ioda.c | 30 ++++++++++++++++++++++++++++--
 2 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9f8de75..33d683a 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -357,10 +357,11 @@ out:
 int eeh_dev_check_failure(struct eeh_dev *edev)
 {
 	int ret;
+	int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 	unsigned long flags;
 	struct device_node *dn;
 	struct pci_dev *dev;
-	struct eeh_pe *pe;
+	struct eeh_pe *pe, *parent_pe;
 	int rc = 0;
 	const char *location;
 
@@ -438,14 +439,34 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	 */
 	if ((ret < 0) ||
 	    (ret == EEH_STATE_NOT_SUPPORT) ||
-	    (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
-	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+	    ((ret & active_flags) == active_flags)) {
 		eeh_stats.false_positives++;
 		pe->false_positives++;
 		rc = 0;
 		goto dn_unlock;
 	}
 
+	/*
+	 * It should be corner case that the parent PE has been
+	 * put into frozen state as well. We should take care
+	 * that at first.
+	 */
+	parent_pe = pe->parent;
+	while (parent_pe) {
+		/* Hit the ceiling ? */
+		if (parent_pe->type & EEH_PE_PHB)
+			break;
+
+		/* Frozen parent PE ? */
+		ret = eeh_ops->get_state(parent_pe, NULL);
+		if (ret > 0 &&
+		    (ret & active_flags) != active_flags)
+			pe = parent_pe;
+
+		/* Next parent level */
+		parent_pe = parent_pe->parent;
+	}
+
 	eeh_stats.slot_resets++;
 
 	/* Avoid repeated reports of this failure, including problems
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index e5b88c5..65feec6 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -783,11 +783,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 {
 	struct pci_controller *hose;
 	struct pnv_phb *phb;
-	struct eeh_pe *phb_pe;
+	struct eeh_pe *phb_pe, *parent_pe;
 	u64 frozen_pe_no;
+	int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 	u16 err_type, severity;
 	long rc;
-	int ret = EEH_NEXT_ERR_NONE;
+	int state, ret = EEH_NEXT_ERR_NONE;
 
 	/*
 	 * While running here, it's safe to purge the event queue.
@@ -920,6 +921,31 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		}
 
 		/*
+		 * We probably have the frozen parent PE out there and
+		 * we need have to handle frozen parent PE firstly.
+		 */
+		if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+			parent_pe = (*pe)->parent;
+			while (parent_pe) {
+				/* Hit the ceiling ? */
+				if (parent_pe->type & EEH_PE_PHB)
+					break;
+
+				/* Frozen parent PE ? */
+				state = ioda_eeh_get_state(parent_pe);
+				if (state > 0 &&
+				    (state & active_flags) != active_flags)
+					*pe = parent_pe;
+
+				/* Next parent level */
+				parent_pe = parent_pe->parent;
+			}
+
+			/* We possibly migrate to another PE */
+			eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+		}
+
+		/*
 		 * If we have no errors on the specific PHB or only
 		 * informative error there, we continue poking it.
 		 * Otherwise, we need actions to be taken by upper
-- 
1.8.3.2

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 3/3] powerpc/powernv: Don't escalate non-existing frozen PE
  2014-05-04 23:29 [PATCH 1/3] powerpc/eeh: Clear frozen state for child PE Gavin Shan
  2014-05-04 23:29 ` [PATCH 2/3] powerpc/eeh: Report frozen parent PE prior to " Gavin Shan
@ 2014-05-04 23:29 ` Gavin Shan
  1 sibling, 0 replies; 3+ messages in thread
From: Gavin Shan @ 2014-05-04 23:29 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev, Gavin Shan

Commit cb5b242c ("powerpc/eeh: Escalate error on non-existing PE")
escalates the frozen state on non-existing PE to fenced PHB. It
was to improve kdump reliability. After that, commit 361f2a2a
("powrpc/powernv: Reset PHB in kdump kernel") was introduced to
issue complete reset on all PHBs to increase the reliability of
kdump kernel.

Commit cb5b242c becomes unuseful and it would be reverted.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-ioda.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 65feec6..1b5982f 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -884,13 +884,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 			 * it again.
 			 */
 			if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) {
-				*pe = phb_pe;
-				pr_err("EEH: Escalated frozen PHB#%x-"
-				       "PE#%llx (%s) detected\n",
-					hose->global_number,
-					frozen_pe_no,
-					eeh_pe_loc_get(phb_pe));
-				ret = EEH_NEXT_ERR_FENCED_PHB;
+				/* Try best to clear it */
+				pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
+					hose->global_number, frozen_pe_no);
+				opal_pci_eeh_freeze_clear(phb->opal_id, frozen_pe_no,
+					OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+				ret = EEH_NEXT_ERR_NONE;
 			} else if ((*pe)->state & EEH_PE_ISOLATED) {
 				ret = EEH_NEXT_ERR_NONE;
 			} else {
-- 
1.8.3.2

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2014-05-04 23:28 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-05-04 23:29 [PATCH 1/3] powerpc/eeh: Clear frozen state for child PE Gavin Shan
2014-05-04 23:29 ` [PATCH 2/3] powerpc/eeh: Report frozen parent PE prior to " Gavin Shan
2014-05-04 23:29 ` [PATCH 3/3] powerpc/powernv: Don't escalate non-existing frozen PE Gavin Shan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).