linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
From: Sam Bobroff <sbobroff@linux.ibm.com>
To: linuxppc-dev@lists.ozlabs.org
Cc: oohall@gmail.com
Subject: [PATCH RFC 09/15] powerpc/eeh: Sync eeh_handle_special_event(), pnv_eeh_get_pe(), pnv_eeh_next_error()
Date: Wed,  2 Oct 2019 16:02:47 +1000	[thread overview]
Message-ID: <67a2d7716989f7b4f82f11b12d1df1b4818c5a59.1569996166.git.sbobroff@linux.ibm.com> (raw)
In-Reply-To: <cover.1569996166.git.sbobroff@linux.ibm.com>

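Synchronize access to struct eeh_pe in eeh_handle_special_event(),
pnv_eeh_get_pe() and pnv_eeh_next_error(): take a reference on each PE
while it is in use (eeh_get_pe()/eeh_put_pe()), and read a PE's parent
pointer under eeh_lock_pes() or via eeh_pe_move_to_parent().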

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
---
 arch/powerpc/kernel/eeh_driver.c             | 15 +++++---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 38 ++++++++++++++++----
 2 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index c9d73070793e..bc5d58bf3904 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -1184,6 +1184,7 @@ void eeh_handle_special_event(void)
 
 
 	do {
+		/* Acquire ref if rc == _FROZEN_PE, _FENCED_PHB or _DEAD_PHB */
 		rc = eeh_ops->next_error(&pe);
 
 		switch (rc) {
@@ -1195,10 +1196,11 @@ void eeh_handle_special_event(void)
 			eeh_remove_event(NULL, true);
 
 			list_for_each_entry(hose, &hose_list, list_node) {
-				phb_pe = eeh_phb_pe_get(hose);
+				phb_pe = eeh_phb_pe_get(hose); /* Acquire ref */
 				if (!phb_pe) continue;
 
 				eeh_pe_mark_isolated(phb_pe);
+				eeh_put_pe(phb_pe); /* Release ref */
 			}
 
 			eeh_serialize_unlock(flags);
@@ -1236,15 +1238,17 @@ void eeh_handle_special_event(void)
 		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
 		    rc == EEH_NEXT_ERR_FENCED_PHB) {
 			eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
-			eeh_handle_normal_event(pe);
+			eeh_handle_normal_event(pe); /* Give ref */
 		} else {
 			pci_lock_rescan_remove();
 			list_for_each_entry(hose, &hose_list, list_node) {
-				phb_pe = eeh_phb_pe_get(hose);
+				phb_pe = eeh_phb_pe_get(hose); /* Acquire ref */
 				if (!phb_pe ||
 				    !(phb_pe->state & EEH_PE_ISOLATED) ||
-				    (phb_pe->state & EEH_PE_RECOVERING))
+				    (phb_pe->state & EEH_PE_RECOVERING)) {
+					eeh_put_pe(phb_pe); /* Release ref */
 					continue;
+				}
 
 				eeh_for_each_pe(pe, tmp_pe)
 					eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
@@ -1263,11 +1267,14 @@ void eeh_handle_special_event(void)
 					       __func__,
 					       pe->phb->global_number,
 					       pe->addr);
+					eeh_put_pe(phb_pe); /* Release ref */
 					break;
 				}
 				pci_hp_remove_devices(bus);
+				eeh_put_pe(phb_pe); /* Release ref */
 			}
 			pci_unlock_rescan_remove();
+			eeh_put_pe(pe); /* Release ref */
 		}
 
 		/*
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index e477e0b70968..c56a796dd894 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1404,6 +1404,7 @@ static void pnv_eeh_get_and_dump_hub_diag(struct pci_controller *hose)
 	}
 }
 
+/* A return of 0 indicates that *pe is set, and referenced. */
 static int pnv_eeh_get_pe(struct pci_controller *hose,
 			  u16 pe_no, struct eeh_pe **pe)
 {
@@ -1431,6 +1432,7 @@ static int pnv_eeh_get_pe(struct pci_controller *hose,
 
 	/* Freeze the (compound) PE */
 	*pe = dev_pe;
+	eeh_get_pe(*pe); /* Acquire ref */
 	if (!(dev_pe->state & EEH_PE_ISOLATED))
 		phb->freeze_pe(phb, pe_no);
 
@@ -1439,23 +1441,26 @@ static int pnv_eeh_get_pe(struct pci_controller *hose,
 	 * have been frozen. However, we still need poke until
 	 * hitting the frozen PE on top level.
 	 */
-	dev_pe = dev_pe->parent;
+	eeh_pe_move_to_parent(&dev_pe);
 	while (dev_pe && !(dev_pe->type & EEH_PE_PHB)) {
 		int ret;
 		ret = eeh_ops->get_state(dev_pe, NULL);
 		if (ret <= 0 || eeh_state_active(ret)) {
-			dev_pe = dev_pe->parent;
+			eeh_pe_move_to_parent(&dev_pe);
 			continue;
 		}
 
 		/* Frozen parent PE */
+		eeh_put_pe(*pe); /* Release ref */
 		*pe = dev_pe;
+		eeh_get_pe(*pe); /* Acquire ref */
 		if (!(dev_pe->state & EEH_PE_ISOLATED))
 			phb->freeze_pe(phb, dev_pe->addr);
 
 		/* Next one */
-		dev_pe = dev_pe->parent;
+		eeh_pe_move_to_parent(&dev_pe);
 	}
+	eeh_put_pe(dev_pe);
 
 	return 0;
 }
@@ -1469,6 +1474,8 @@ static int pnv_eeh_get_pe(struct pci_controller *hose,
  * OPAL APIs for next error to handle. The informational error is
  * handled internally by platform. However, the dead IOC, dead PHB,
  * fenced PHB and frozen PE should be handled by EEH core eventually.
+ * On return, *pe will be ref'd iff returning _FROZEN_PE, _FENCED_PHB or
+ * _DEAD_PHB.
  */
 static int pnv_eeh_next_error(struct eeh_pe **pe)
 {
@@ -1479,6 +1486,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 	__be16 err_type, severity;
 	long rc;
 	int state, ret = EEH_NEXT_ERR_NONE;
+	unsigned long flags;
 
 	/*
 	 * While running here, it's safe to purge the event queue. The
@@ -1493,9 +1501,11 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 		 * needn't take care of it any more.
 		 */
 		phb = hose->private_data;
-		phb_pe = eeh_phb_pe_get(hose);
-		if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
+		phb_pe = eeh_phb_pe_get(hose); /* Acquire ref */
+		if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED)) {
+			eeh_put_pe(phb_pe); /* Release ref */
 			continue;
+		}
 
 		rc = opal_pci_next_error(phb->opal_id,
 					 &frozen_pe_no, &err_type, &severity);
@@ -1503,6 +1513,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 			pr_devel("%s: Invalid return value on "
 				 "PHB#%x (0x%lx) from opal_pci_next_error",
 				 __func__, hose->global_number, rc);
+			eeh_put_pe(phb_pe); /* Release ref */
 			continue;
 		}
 
@@ -1511,6 +1522,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 		    be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) {
 			pr_devel("%s: No error found on PHB#%x\n",
 				 __func__, hose->global_number);
+			eeh_put_pe(phb_pe); /* Release ref */
 			continue;
 		}
 
@@ -1539,19 +1551,23 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 		case OPAL_EEH_PHB_ERROR:
 			if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) {
 				*pe = phb_pe;
+				eeh_get_pe(*pe); /* Acquire ref */
 				pr_err("EEH: dead PHB#%x detected, "
 				       "location: %s\n",
 					hose->global_number,
 					eeh_pe_loc_get(phb_pe));
 				ret = EEH_NEXT_ERR_DEAD_PHB;
+				/* Retain ref on pe */
 			} else if (be16_to_cpu(severity) ==
 				   OPAL_EEH_SEV_PHB_FENCED) {
 				*pe = phb_pe;
+				eeh_get_pe(*pe); /* Acquire ref */
 				pr_err("EEH: Fenced PHB#%x detected, "
 				       "location: %s\n",
 					hose->global_number,
 					eeh_pe_loc_get(phb_pe));
 				ret = EEH_NEXT_ERR_FENCED_PHB;
+				/* Retain ref on pe */
 			} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
 				pr_info("EEH: PHB#%x informative error "
 					"detected, location: %s\n",
@@ -1568,8 +1584,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 			 * If we can't find the corresponding PE, we
 			 * just try to unfreeze.
 			 */
+			/* Maybe acquire ref */
 			if (pnv_eeh_get_pe(hose,
 				be64_to_cpu(frozen_pe_no), pe)) {
+				/* 'pe' was not set by pnv_eeh_get_pe() */
 				pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
 					hose->global_number, be64_to_cpu(frozen_pe_no));
 				pr_info("EEH: PHB location: %s\n",
@@ -1589,6 +1607,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 				ret = EEH_NEXT_ERR_NONE;
 			} else if ((*pe)->state & EEH_PE_ISOLATED ||
 				   eeh_pe_passed(*pe)) {
+				eeh_put_pe(*pe); /* Release ref */
 				ret = EEH_NEXT_ERR_NONE;
 			} else {
 				pr_err("EEH: Frozen PE#%x "
@@ -1600,6 +1619,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 				       eeh_pe_loc_get(*pe),
 				       eeh_pe_loc_get(phb_pe));
 				ret = EEH_NEXT_ERR_FROZEN_PE;
+				/* Retain ref on pe */
 			}
 
 			break;
@@ -1631,7 +1651,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 		 * we need have to handle frozen parent PE firstly.
 		 */
 		if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+			eeh_lock_pes(&flags);
 			parent_pe = (*pe)->parent;
+			eeh_get_pe(parent_pe);
+			eeh_unlock_pes(flags);
 			while (parent_pe) {
 				/* Hit the ceiling ? */
 				if (parent_pe->type & EEH_PE_PHB)
@@ -1643,13 +1666,15 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 					*pe = parent_pe;
 
 				/* Next parent level */
-				parent_pe = parent_pe->parent;
+				eeh_pe_move_to_parent(&parent_pe);
 			}
+			eeh_put_pe(parent_pe); /* Release ref (for early-out) */
 
 			/* We possibly migrate to another PE */
 			eeh_pe_mark_isolated(*pe);
 		}
 
+		eeh_put_pe(phb_pe); /* Release ref */
 		/*
 		 * If we have no errors on the specific PHB or only
 		 * informative error there, we continue poking it.
@@ -1664,6 +1689,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 	if (ret == EEH_NEXT_ERR_NONE && eeh_enabled())
 		enable_irq(eeh_event_irq);
 
+	/* *pe may be ref'd, see above */
 	return ret;
 }
 
-- 
2.22.0.216.g00a2a96fc9


  parent reply	other threads:[~2019-10-02  6:11 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-02  6:02 [PATCH RFC 00/15] powerpc/eeh: Synchronize access to struct eeh_pe Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 01/15] powerpc/eeh: Introduce refcounting for " Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 02/15] powerpc/eeh: Rename eeh_pe_get() to eeh_pe_find() Sam Bobroff
2019-11-13  2:32   ` Oliver O'Halloran
2019-10-02  6:02 ` [PATCH RFC 03/15] powerpc/eeh: Track orphaned struct eeh_pe Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 04/15] powerpc/eeh: Sync eeh_pe_next(), eeh_pe_find() and early-out traversals Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 05/15] powerpc/eeh: Sync eeh_pe_get_parent() Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 06/15] powerpc/eeh: Sync eeh_phb_pe_get() Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 07/15] powerpc/eeh: Sync eeh_add_to_parent_pe() and eeh_rmv_from_parent_pe() Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 08/15] powerpc/eeh: Sync eeh_handle_normal_event() Sam Bobroff
2019-10-02  6:02 ` Sam Bobroff [this message]
2019-10-02  6:02 ` [PATCH RFC 10/15] powerpc/eeh: Sync eeh_phb_check_failure() Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 11/15] powerpc/eeh: Sync eeh_dev_check_failure() Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 12/15] powerpc/eeh: Sync eeh_pe_get_state() Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 13/15] powerpc/eeh: Sync pnv_eeh_ei_write() Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 14/15] powerpc/eeh: Sync eeh_force_recover_write() Sam Bobroff
2019-10-02  6:02 ` [PATCH RFC 15/15] powerpc/eeh: Sync pcibios_set_pcie_reset_state() Sam Bobroff

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=67a2d7716989f7b4f82f11b12d1df1b4818c5a59.1569996166.git.sbobroff@linux.ibm.com \
    --to=sbobroff@linux.ibm.com \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=oohall@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).