LinuxPPC-Dev Archive on lore.kernel.org
 help / Atom feed
* [RESEND PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock'
@ 2019-01-29 11:06 Vaibhav Jain
  2019-01-30 12:41 ` Michael Ellerman
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Vaibhav Jain @ 2019-01-29 11:06 UTC (permalink / raw)
  To: linuxppc-dev, Frederic Barrat
  Cc: Philippe Bergheaud, Christophe Lombard, stable,
	Alastair D'Silva, Andrew Donnellan, Vaibhav Jain,
	Christophe Lombard

Within cxl module, iteration over array 'adapter->afu' may be racy
at few points as it might be simultaneously read during an EEH and its
contents being set to NULL while driver is being unloaded or unbound
from the adapter. This might result in a NULL pointer to 'struct afu'
being de-referenced during an EEH thereby causing a kernel oops.

This patch fixes this by making sure that all access to the array
'adapter->afu' is wrapped within the context of spin-lock
'adapter->afu_list_lock'.

Cc: stable@vger.kernel.org
Fixes: 9e8df8a2196("cxl: EEH support")
Acked-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Acked-by: Frederic Barrat <fbarrat@linux.ibm.com>
Acked-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
---
Changelog:

Resend:
* Fixed the reference to 'adapter->afu' in patch description. [Andrew]
* Added the 'Fixes' tag and marked the patch to stable

v3:
* Updated a slice loop in cxl_pci_error_detectected() to ignore NULL
  slices [Fred]
* Added a NULL AFU check in cxl_pci_slot_reset() [Fred]

v2:
* Fixed a wrong comparison of non-null pointer [Fred]
* Moved a call to cxl_vphb_error_detected() within a branch that
  checks for not null AFU pointer in 'adapter->slices' [Fred]
* Removed a misleading comment in code.
---
 drivers/misc/cxl/guest.c |  2 ++
 drivers/misc/cxl/pci.c   | 39 ++++++++++++++++++++++++++++++---------
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
index 5d28d9e454f5..08f4a512afad 100644
--- a/drivers/misc/cxl/guest.c
+++ b/drivers/misc/cxl/guest.c
@@ -267,6 +267,7 @@ static int guest_reset(struct cxl *adapter)
 	int i, rc;
 
 	pr_devel("Adapter reset request\n");
+	spin_lock(&adapter->afu_list_lock);
 	for (i = 0; i < adapter->slices; i++) {
 		if ((afu = adapter->afu[i])) {
 			pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
@@ -283,6 +284,7 @@ static int guest_reset(struct cxl *adapter)
 			pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
 		}
 	}
+	spin_unlock(&adapter->afu_list_lock);
 	return rc;
 }
 
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index c79ba1c699ad..300531d6136f 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1805,7 +1805,7 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
 	/* There should only be one entry, but go through the list
 	 * anyway
 	 */
-	if (afu->phb == NULL)
+	if (afu == NULL || afu->phb == NULL)
 		return result;
 
 	list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
@@ -1832,7 +1832,8 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 {
 	struct cxl *adapter = pci_get_drvdata(pdev);
 	struct cxl_afu *afu;
-	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result;
+	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
+	pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
 	int i;
 
 	/* At this point, we could still have an interrupt pending.
@@ -1843,6 +1844,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 
 	/* If we're permanently dead, give up. */
 	if (state == pci_channel_io_perm_failure) {
+		spin_lock(&adapter->afu_list_lock);
 		for (i = 0; i < adapter->slices; i++) {
 			afu = adapter->afu[i];
 			/*
@@ -1851,6 +1853,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 			 */
 			cxl_vphb_error_detected(afu, state);
 		}
+		spin_unlock(&adapter->afu_list_lock);
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
 
@@ -1932,11 +1935,17 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 	 *     * In slot_reset, free the old resources and allocate new ones.
 	 *     * In resume, clear the flag to allow things to start.
 	 */
+
+	/* Make sure no one else changes the afu list */
+	spin_lock(&adapter->afu_list_lock);
+
 	for (i = 0; i < adapter->slices; i++) {
 		afu = adapter->afu[i];
 
-		afu_result = cxl_vphb_error_detected(afu, state);
+		if (afu == NULL)
+			continue;
 
+		afu_result = cxl_vphb_error_detected(afu, state);
 		cxl_context_detach_all(afu);
 		cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
 		pci_deconfigure_afu(afu);
@@ -1948,6 +1957,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 			 (result == PCI_ERS_RESULT_NEED_RESET))
 			result = PCI_ERS_RESULT_NONE;
 	}
+	spin_unlock(&adapter->afu_list_lock);
 
 	/* should take the context lock here */
 	if (cxl_adapter_context_lock(adapter) != 0)
@@ -1980,14 +1990,18 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
 	 */
 	cxl_adapter_context_unlock(adapter);
 
+	spin_lock(&adapter->afu_list_lock);
 	for (i = 0; i < adapter->slices; i++) {
 		afu = adapter->afu[i];
 
+		if (afu == NULL)
+			continue;
+
 		if (pci_configure_afu(afu, adapter, pdev))
-			goto err;
+			goto err_unlock;
 
 		if (cxl_afu_select_best_mode(afu))
-			goto err;
+			goto err_unlock;
 
 		if (afu->phb == NULL)
 			continue;
@@ -1999,16 +2013,16 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
 			ctx = cxl_get_context(afu_dev);
 
 			if (ctx && cxl_release_context(ctx))
-				goto err;
+				goto err_unlock;
 
 			ctx = cxl_dev_context_init(afu_dev);
 			if (IS_ERR(ctx))
-				goto err;
+				goto err_unlock;
 
 			afu_dev->dev.archdata.cxl_ctx = ctx;
 
 			if (cxl_ops->afu_check_and_enable(afu))
-				goto err;
+				goto err_unlock;
 
 			afu_dev->error_state = pci_channel_io_normal;
 
@@ -2029,8 +2043,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
 				result = PCI_ERS_RESULT_DISCONNECT;
 		}
 	}
+
+	spin_unlock(&adapter->afu_list_lock);
 	return result;
 
+err_unlock:
+	spin_unlock(&adapter->afu_list_lock);
+
 err:
 	/* All the bits that happen in both error_detected and cxl_remove
 	 * should be idempotent, so we don't need to worry about leaving a mix
@@ -2051,10 +2070,11 @@ static void cxl_pci_resume(struct pci_dev *pdev)
 	 * This is not the place to be checking if everything came back up
 	 * properly, because there's no return value: do that in slot_reset.
 	 */
+	spin_lock(&adapter->afu_list_lock);
 	for (i = 0; i < adapter->slices; i++) {
 		afu = adapter->afu[i];
 
-		if (afu->phb == NULL)
+		if (afu == NULL || afu->phb == NULL)
 			continue;
 
 		list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
@@ -2063,6 +2083,7 @@ static void cxl_pci_resume(struct pci_dev *pdev)
 				afu_dev->driver->err_handler->resume(afu_dev);
 		}
 	}
+	spin_unlock(&adapter->afu_list_lock);
 }
 
 static const struct pci_error_handlers cxl_err_handler = {
-- 
2.20.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RESEND PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock'
  2019-01-29 11:06 [RESEND PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock' Vaibhav Jain
@ 2019-01-30 12:41 ` Michael Ellerman
  2019-01-30 13:25   ` Vaibhav Jain
  2019-01-30 14:46 ` Sasha Levin
  2019-02-08 13:02 ` [RESEND, " Michael Ellerman
  2 siblings, 1 reply; 5+ messages in thread
From: Michael Ellerman @ 2019-01-30 12:41 UTC (permalink / raw)
  To: Vaibhav Jain, linuxppc-dev, Frederic Barrat
  Cc: Philippe Bergheaud, Christophe Lombard, stable,
	Alastair D'Silva, Andrew Donnellan, Vaibhav Jain,
	Christophe Lombard

Vaibhav Jain <vaibhav@linux.ibm.com> writes:

> Within cxl module, iteration over array 'adapter->afu' may be racy
> at few points as it might be simultaneously read during an EEH and its
> contents being set to NULL while driver is being unloaded or unbound
> from the adapter. This might result in a NULL pointer to 'struct afu'
> being de-referenced during an EEH thereby causing a kernel oops.
>
> This patch fixes this by making sure that all access to the array
> 'adapter->afu' is wrapped within the context of spin-lock
> 'adapter->afu_list_lock'.
>
> Cc: stable@vger.kernel.org
> Fixes: 9e8df8a2196("cxl: EEH support")
> Acked-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
> Acked-by: Frederic Barrat <fbarrat@linux.ibm.com>
> Acked-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
> ---
> Changelog:
>
> Resend:
> * Fixed the reference to 'adapter->afu' in patch description. [Andrew]
> * Added the 'Fixes' tag and marked the patch to stable

FYI RESEND means you didn't change anything, but you sent the patch
again for some other reason, like the Cc list was wrong or you thought
it had been ignored.

In this case you should have just sent a v4, updating the change log is
a perfectly valid reason for a new version of the patch.

I've applied it, no need to RESEND ;)

cheers

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RESEND PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock'
  2019-01-30 12:41 ` Michael Ellerman
@ 2019-01-30 13:25   ` Vaibhav Jain
  0 siblings, 0 replies; 5+ messages in thread
From: Vaibhav Jain @ 2019-01-30 13:25 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev, Frederic Barrat
  Cc: Philippe Bergheaud, Christophe Lombard, stable,
	Alastair D'Silva, Andrew Donnellan, Christophe Lombard

Michael Ellerman <mpe@ellerman.id.au> writes:

> FYI RESEND means you didn't change anything, but you sent the patch
> again for some other reason, like the Cc list was wrong or you thought
> it had been ignored.
>
> In this case you should have just sent a v4, updating the change log is
> a perfectly valid reason for a new version of the patch.
Thanks for pointing this out. Will take care of this in future.

>
> I've applied it, no need to RESEND ;)
Thanks.

-- 
Vaibhav Jain <vaibhav@linux.ibm.com>
Linux Technology Center, IBM India Pvt. Ltd.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RESEND PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock'
  2019-01-29 11:06 [RESEND PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock' Vaibhav Jain
  2019-01-30 12:41 ` Michael Ellerman
@ 2019-01-30 14:46 ` Sasha Levin
  2019-02-08 13:02 ` [RESEND, " Michael Ellerman
  2 siblings, 0 replies; 5+ messages in thread
From: Sasha Levin @ 2019-01-30 14:46 UTC (permalink / raw)
  To: Sasha Levin; +Cc: , Vaibhav Jain, stable

Hi,

[This is an automated email]

This commit has been processed because it contains a "Fixes:" tag,
fixing commit: .

The bot has tested the following trees: v4.20.5, v4.19.18, v4.14.96, v4.9.153, v4.4.172, v3.18.133.

v4.20.5: Build OK!
v4.19.18: Build OK!
v4.14.96: Build OK!
v4.9.153: Failed to apply! Possible dependencies:
    bb81733de28c ("cxl: Fix error handling in _cxl_pci_associate_default_context()")

v4.4.172: Failed to apply! Possible dependencies:
    14baf4d9c739 ("cxl: Add guest-specific code")
    2b04cf310ba8 ("cxl: Rename some bare-metal specific functions")
    5be587b11101 ("cxl: Introduce implementation-specific API")
    73d55c3b59f7 ("cxl: IRQ allocation for guests")
    8633186209e3 ("cxl: Move common code away from bare-metal-specific files")
    cbffa3a5146a ("cxl: Separate bare-metal fields in adapter and AFU data structures")
    d56d301b5174 ("cxl: Move bare-metal specific code to specialized files")
    ea2d1f95efd7 ("cxl: Isolate a few bare-metal-specific calls")

v3.18.133: Failed to apply! Possible dependencies:
    0520336afe5d ("cxl: Export file ops for use by API")
    0b3f9c757cab ("cxl: Drop commands if the PCI channel is not in normal state")
    14baf4d9c739 ("cxl: Add guest-specific code")
    27d4dc7116ee ("cxl: Implement an ioctl to fetch afu card-id, offset-id and mode")
    3d6b040e7338 ("cxl: sparse: Make declarations static")
    406e12ec0b64 ("cxl: Cleanup Makefile")
    456295e284be ("cxl: Fix leaking interrupts if attach process fails")
    588b34be20bc ("cxl: Convert MMIO read/write macros to inline functions")
    5be587b11101 ("cxl: Introduce implementation-specific API")
    62fa19d4b4fd ("cxl: Add ability to reset the card")
    6428832a7bfa ("cxl: Add cookie parameter to afu_release_irqs()")
    6f7f0b3df6d4 ("cxl: Add AFU virtual PHB and kernel API")
    73d55c3b59f7 ("cxl: IRQ allocation for guests")
    7bb5d91a4dda ("cxl: Rework context lifetimes")
    80c394fab896 ("cxl: Add explicit precision specifiers")
    80fa93fce37d ("cxl: Name interrupts in /proc/interrupt")
    8dde152ea348 ("cxl: fix leak of IRQ names in cxl_free_afu_irqs()")
    95bc11bcd142 ("cxl: Add image control to sysfs")
    9bcf28cdb28e ("cxl: Add tracepoints")
    b087e6190ddc ("cxl: Export optional AFU configuration record in sysfs")
    b12994fbfe93 ("cxl: cxl_afu_reset() -> __cxl_afu_reset()")
    bc78b05bb412 ("cxl: Return error to PSL if IRQ demultiplexing fails & print clearer warning")
    c358d84b4e57 ("cxl: Split afu_register_irqs() function")
    cbffa3a5146a ("cxl: Separate bare-metal fields in adapter and AFU data structures")
    de369538436a ("cxl: use more common format specifier")
    e36f6fe1f7aa ("cxl: Export AFU error buffer via sysfs")
    ea2d1f95efd7 ("cxl: Isolate a few bare-metal-specific calls")
    eda3693c842e ("cxl: Rework detach context functions")


How should we proceed with this patch?

--
Thanks,
Sasha

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RESEND, v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock'
  2019-01-29 11:06 [RESEND PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock' Vaibhav Jain
  2019-01-30 12:41 ` Michael Ellerman
  2019-01-30 14:46 ` Sasha Levin
@ 2019-02-08 13:02 ` " Michael Ellerman
  2 siblings, 0 replies; 5+ messages in thread
From: Michael Ellerman @ 2019-02-08 13:02 UTC (permalink / raw)
  To: Vaibhav Jain, linuxppc-dev, Frederic Barrat
  Cc: Philippe Bergheaud, Christophe Lombard, stable,
	Alastair D'Silva, Andrew Donnellan, Vaibhav Jain,
	Christophe Lombard

On Tue, 2019-01-29 at 11:06:18 UTC, Vaibhav Jain wrote:
> Within cxl module, iteration over array 'adapter->afu' may be racy
> at few points as it might be simultaneously read during an EEH and its
> contents being set to NULL while driver is being unloaded or unbound
> from the adapter. This might result in a NULL pointer to 'struct afu'
> being de-referenced during an EEH thereby causing a kernel oops.
> 
> This patch fixes this by making sure that all access to the array
> 'adapter->afu' is wrapped within the context of spin-lock
> 'adapter->afu_list_lock'.
> 
> Cc: stable@vger.kernel.org
> Fixes: 9e8df8a2196("cxl: EEH support")
> Acked-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
> Acked-by: Frederic Barrat <fbarrat@linux.ibm.com>
> Acked-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/edeb304f659792fb5bab90d7d6f3408b

cheers

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, back to index

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-29 11:06 [RESEND PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock' Vaibhav Jain
2019-01-30 12:41 ` Michael Ellerman
2019-01-30 13:25   ` Vaibhav Jain
2019-01-30 14:46 ` Sasha Levin
2019-02-08 13:02 ` [RESEND, " Michael Ellerman

LinuxPPC-Dev Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linuxppc-dev/0 linuxppc-dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linuxppc-dev linuxppc-dev/ https://lore.kernel.org/linuxppc-dev \
		linuxppc-dev@lists.ozlabs.org linuxppc-dev@ozlabs.org linuxppc-dev@archiver.kernel.org
	public-inbox-index linuxppc-dev


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.ozlabs.lists.linuxppc-dev


AGPL code for this site: git clone https://public-inbox.org/ public-inbox