All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock'
@ 2019-01-29  6:15 Vaibhav Jain
  2019-01-29  6:37 ` Andrew Donnellan
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Vaibhav Jain @ 2019-01-29  6:15 UTC (permalink / raw)
  To: linuxppc-dev, Frederic Barrat
  Cc: Philippe Bergheaud, Vaibhav Jain, Alastair D'Silva,
	Christophe Lombard, Andrew Donnellan

Within cxl module, iteration over array 'adapter->slices' may be racy
at few points as it might be simultaneously read during an EEH and its
contents being set to NULL while driver is being unloaded or unbound
from the adapter. This might result in a NULL pointer to 'struct afu'
being de-referenced during an EEH thereby causing a kernel oops.

This patch fixes this by making sure that all access to the array
'adapter->slices' is wrapped within the context of spin-lock
'adapter->afu_list_lock'.

Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
---
Changelog:

v3:
* Updated a slice loop in cxl_pci_error_detectected() to ignore NULL
  slices [Fred]
* Added a NULL AFU check in cxl_pci_slot_reset() [Fred]

v2:
* Fixed a wrong comparison of non-null pointer [Fred]
* Moved a call to cxl_vphb_error_detected() within a branch that
  checks for not null AFU pointer in 'adapter->slices' [Fred]
* Removed a misleading comment in code.
---
 drivers/misc/cxl/guest.c |  2 ++
 drivers/misc/cxl/pci.c   | 39 ++++++++++++++++++++++++++++++---------
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
index 5d28d9e454f5..08f4a512afad 100644
--- a/drivers/misc/cxl/guest.c
+++ b/drivers/misc/cxl/guest.c
@@ -267,6 +267,7 @@ static int guest_reset(struct cxl *adapter)
 	int i, rc;
 
 	pr_devel("Adapter reset request\n");
+	spin_lock(&adapter->afu_list_lock);
 	for (i = 0; i < adapter->slices; i++) {
 		if ((afu = adapter->afu[i])) {
 			pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
@@ -283,6 +284,7 @@ static int guest_reset(struct cxl *adapter)
 			pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
 		}
 	}
+	spin_unlock(&adapter->afu_list_lock);
 	return rc;
 }
 
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index c79ba1c699ad..300531d6136f 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1805,7 +1805,7 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
 	/* There should only be one entry, but go through the list
 	 * anyway
 	 */
-	if (afu->phb == NULL)
+	if (afu == NULL || afu->phb == NULL)
 		return result;
 
 	list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
@@ -1832,7 +1832,8 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 {
 	struct cxl *adapter = pci_get_drvdata(pdev);
 	struct cxl_afu *afu;
-	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result;
+	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
+	pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
 	int i;
 
 	/* At this point, we could still have an interrupt pending.
@@ -1843,6 +1844,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 
 	/* If we're permanently dead, give up. */
 	if (state == pci_channel_io_perm_failure) {
+		spin_lock(&adapter->afu_list_lock);
 		for (i = 0; i < adapter->slices; i++) {
 			afu = adapter->afu[i];
 			/*
@@ -1851,6 +1853,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 			 */
 			cxl_vphb_error_detected(afu, state);
 		}
+		spin_unlock(&adapter->afu_list_lock);
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
 
@@ -1932,11 +1935,17 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 	 *     * In slot_reset, free the old resources and allocate new ones.
 	 *     * In resume, clear the flag to allow things to start.
 	 */
+
+	/* Make sure no one else changes the afu list */
+	spin_lock(&adapter->afu_list_lock);
+
 	for (i = 0; i < adapter->slices; i++) {
 		afu = adapter->afu[i];
 
-		afu_result = cxl_vphb_error_detected(afu, state);
+		if (afu == NULL)
+			continue;
 
+		afu_result = cxl_vphb_error_detected(afu, state);
 		cxl_context_detach_all(afu);
 		cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
 		pci_deconfigure_afu(afu);
@@ -1948,6 +1957,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 			 (result == PCI_ERS_RESULT_NEED_RESET))
 			result = PCI_ERS_RESULT_NONE;
 	}
+	spin_unlock(&adapter->afu_list_lock);
 
 	/* should take the context lock here */
 	if (cxl_adapter_context_lock(adapter) != 0)
@@ -1980,14 +1990,18 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
 	 */
 	cxl_adapter_context_unlock(adapter);
 
+	spin_lock(&adapter->afu_list_lock);
 	for (i = 0; i < adapter->slices; i++) {
 		afu = adapter->afu[i];
 
+		if (afu == NULL)
+			continue;
+
 		if (pci_configure_afu(afu, adapter, pdev))
-			goto err;
+			goto err_unlock;
 
 		if (cxl_afu_select_best_mode(afu))
-			goto err;
+			goto err_unlock;
 
 		if (afu->phb == NULL)
 			continue;
@@ -1999,16 +2013,16 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
 			ctx = cxl_get_context(afu_dev);
 
 			if (ctx && cxl_release_context(ctx))
-				goto err;
+				goto err_unlock;
 
 			ctx = cxl_dev_context_init(afu_dev);
 			if (IS_ERR(ctx))
-				goto err;
+				goto err_unlock;
 
 			afu_dev->dev.archdata.cxl_ctx = ctx;
 
 			if (cxl_ops->afu_check_and_enable(afu))
-				goto err;
+				goto err_unlock;
 
 			afu_dev->error_state = pci_channel_io_normal;
 
@@ -2029,8 +2043,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
 				result = PCI_ERS_RESULT_DISCONNECT;
 		}
 	}
+
+	spin_unlock(&adapter->afu_list_lock);
 	return result;
 
+err_unlock:
+	spin_unlock(&adapter->afu_list_lock);
+
 err:
 	/* All the bits that happen in both error_detected and cxl_remove
 	 * should be idempotent, so we don't need to worry about leaving a mix
@@ -2051,10 +2070,11 @@ static void cxl_pci_resume(struct pci_dev *pdev)
 	 * This is not the place to be checking if everything came back up
 	 * properly, because there's no return value: do that in slot_reset.
 	 */
+	spin_lock(&adapter->afu_list_lock);
 	for (i = 0; i < adapter->slices; i++) {
 		afu = adapter->afu[i];
 
-		if (afu->phb == NULL)
+		if (afu == NULL || afu->phb == NULL)
 			continue;
 
 		list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
@@ -2063,6 +2083,7 @@ static void cxl_pci_resume(struct pci_dev *pdev)
 				afu_dev->driver->err_handler->resume(afu_dev);
 		}
 	}
+	spin_unlock(&adapter->afu_list_lock);
 }
 
 static const struct pci_error_handlers cxl_err_handler = {
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock'
  2019-01-29  6:15 [PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock' Vaibhav Jain
@ 2019-01-29  6:37 ` Andrew Donnellan
  2019-01-29 11:08   ` Vaibhav Jain
  2019-01-29  8:29 ` Frederic Barrat
  2019-01-29 10:08 ` christophe lombard
  2 siblings, 1 reply; 5+ messages in thread
From: Andrew Donnellan @ 2019-01-29  6:37 UTC (permalink / raw)
  To: Vaibhav Jain, linuxppc-dev, Frederic Barrat
  Cc: Philippe Bergheaud, Alastair D'Silva, Christophe Lombard

On 29/1/19 5:15 pm, Vaibhav Jain wrote:
> Within cxl module, iteration over array 'adapter->slices' may be racy

adapter->slices isn't an array, adapter->afu is the array.

> at few points as it might be simultaneously read during an EEH and its
> contents being set to NULL while driver is being unloaded or unbound
> from the adapter. This might result in a NULL pointer to 'struct afu'
> being de-referenced during an EEH thereby causing a kernel oops.
> 
> This patch fixes this by making sure that all access to the array
> 'adapter->slices' is wrapped within the context of spin-lock
> 'adapter->afu_list_lock'.
> 
> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>

Does this need to go to stable? (I'm guessing we've been hitting actual 
Oopses?)

Acked-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

> ---
> Changelog:
> 
> v3:
> * Updated a slice loop in cxl_pci_error_detectected() to ignore NULL
>    slices [Fred]
> * Added a NULL AFU check in cxl_pci_slot_reset() [Fred]
> 
> v2:
> * Fixed a wrong comparison of non-null pointer [Fred]
> * Moved a call to cxl_vphb_error_detected() within a branch that
>    checks for not null AFU pointer in 'adapter->slices' [Fred]
> * Removed a misleading comment in code.
> ---
>   drivers/misc/cxl/guest.c |  2 ++
>   drivers/misc/cxl/pci.c   | 39 ++++++++++++++++++++++++++++++---------
>   2 files changed, 32 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
> index 5d28d9e454f5..08f4a512afad 100644
> --- a/drivers/misc/cxl/guest.c
> +++ b/drivers/misc/cxl/guest.c
> @@ -267,6 +267,7 @@ static int guest_reset(struct cxl *adapter)
>   	int i, rc;
>   
>   	pr_devel("Adapter reset request\n");
> +	spin_lock(&adapter->afu_list_lock);
>   	for (i = 0; i < adapter->slices; i++) {
>   		if ((afu = adapter->afu[i])) {
>   			pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
> @@ -283,6 +284,7 @@ static int guest_reset(struct cxl *adapter)
>   			pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
>   		}
>   	}
> +	spin_unlock(&adapter->afu_list_lock);
>   	return rc;
>   }
>   
> diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
> index c79ba1c699ad..300531d6136f 100644
> --- a/drivers/misc/cxl/pci.c
> +++ b/drivers/misc/cxl/pci.c
> @@ -1805,7 +1805,7 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
>   	/* There should only be one entry, but go through the list
>   	 * anyway
>   	 */
> -	if (afu->phb == NULL)
> +	if (afu == NULL || afu->phb == NULL)
>   		return result;
>   
>   	list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
> @@ -1832,7 +1832,8 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>   {
>   	struct cxl *adapter = pci_get_drvdata(pdev);
>   	struct cxl_afu *afu;
> -	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result;
> +	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
> +	pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
>   	int i;
>   
>   	/* At this point, we could still have an interrupt pending.
> @@ -1843,6 +1844,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>   
>   	/* If we're permanently dead, give up. */
>   	if (state == pci_channel_io_perm_failure) {
> +		spin_lock(&adapter->afu_list_lock);
>   		for (i = 0; i < adapter->slices; i++) {
>   			afu = adapter->afu[i];
>   			/*
> @@ -1851,6 +1853,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>   			 */
>   			cxl_vphb_error_detected(afu, state);
>   		}
> +		spin_unlock(&adapter->afu_list_lock);
>   		return PCI_ERS_RESULT_DISCONNECT;
>   	}
>   
> @@ -1932,11 +1935,17 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>   	 *     * In slot_reset, free the old resources and allocate new ones.
>   	 *     * In resume, clear the flag to allow things to start.
>   	 */
> +
> +	/* Make sure no one else changes the afu list */
> +	spin_lock(&adapter->afu_list_lock);
> +
>   	for (i = 0; i < adapter->slices; i++) {
>   		afu = adapter->afu[i];
>   
> -		afu_result = cxl_vphb_error_detected(afu, state);
> +		if (afu == NULL)
> +			continue;
>   
> +		afu_result = cxl_vphb_error_detected(afu, state);
>   		cxl_context_detach_all(afu);
>   		cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
>   		pci_deconfigure_afu(afu);
> @@ -1948,6 +1957,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>   			 (result == PCI_ERS_RESULT_NEED_RESET))
>   			result = PCI_ERS_RESULT_NONE;
>   	}
> +	spin_unlock(&adapter->afu_list_lock);
>   
>   	/* should take the context lock here */
>   	if (cxl_adapter_context_lock(adapter) != 0)
> @@ -1980,14 +1990,18 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
>   	 */
>   	cxl_adapter_context_unlock(adapter);
>   
> +	spin_lock(&adapter->afu_list_lock);
>   	for (i = 0; i < adapter->slices; i++) {
>   		afu = adapter->afu[i];
>   
> +		if (afu == NULL)
> +			continue;
> +
>   		if (pci_configure_afu(afu, adapter, pdev))
> -			goto err;
> +			goto err_unlock;
>   
>   		if (cxl_afu_select_best_mode(afu))
> -			goto err;
> +			goto err_unlock;
>   
>   		if (afu->phb == NULL)
>   			continue;
> @@ -1999,16 +2013,16 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
>   			ctx = cxl_get_context(afu_dev);
>   
>   			if (ctx && cxl_release_context(ctx))
> -				goto err;
> +				goto err_unlock;
>   
>   			ctx = cxl_dev_context_init(afu_dev);
>   			if (IS_ERR(ctx))
> -				goto err;
> +				goto err_unlock;
>   
>   			afu_dev->dev.archdata.cxl_ctx = ctx;
>   
>   			if (cxl_ops->afu_check_and_enable(afu))
> -				goto err;
> +				goto err_unlock;
>   
>   			afu_dev->error_state = pci_channel_io_normal;
>   
> @@ -2029,8 +2043,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
>   				result = PCI_ERS_RESULT_DISCONNECT;
>   		}
>   	}
> +
> +	spin_unlock(&adapter->afu_list_lock);
>   	return result;
>   
> +err_unlock:
> +	spin_unlock(&adapter->afu_list_lock);
> +
>   err:
>   	/* All the bits that happen in both error_detected and cxl_remove
>   	 * should be idempotent, so we don't need to worry about leaving a mix
> @@ -2051,10 +2070,11 @@ static void cxl_pci_resume(struct pci_dev *pdev)
>   	 * This is not the place to be checking if everything came back up
>   	 * properly, because there's no return value: do that in slot_reset.
>   	 */
> +	spin_lock(&adapter->afu_list_lock);
>   	for (i = 0; i < adapter->slices; i++) {
>   		afu = adapter->afu[i];
>   
> -		if (afu->phb == NULL)
> +		if (afu == NULL || afu->phb == NULL)
>   			continue;
>   
>   		list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
> @@ -2063,6 +2083,7 @@ static void cxl_pci_resume(struct pci_dev *pdev)
>   				afu_dev->driver->err_handler->resume(afu_dev);
>   		}
>   	}
> +	spin_unlock(&adapter->afu_list_lock);
>   }
>   
>   static const struct pci_error_handlers cxl_err_handler = {
> 

-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan@au1.ibm.com  IBM Australia Limited


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock'
  2019-01-29  6:15 [PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock' Vaibhav Jain
  2019-01-29  6:37 ` Andrew Donnellan
@ 2019-01-29  8:29 ` Frederic Barrat
  2019-01-29 10:08 ` christophe lombard
  2 siblings, 0 replies; 5+ messages in thread
From: Frederic Barrat @ 2019-01-29  8:29 UTC (permalink / raw)
  To: Vaibhav Jain, linuxppc-dev
  Cc: Philippe Bergheaud, Alastair D'Silva, Christophe Lombard,
	Andrew Donnellan



Le 29/01/2019 à 07:15, Vaibhav Jain a écrit :
> Within cxl module, iteration over array 'adapter->slices' may be racy
> at few points as it might be simultaneously read during an EEH and its
> contents being set to NULL while driver is being unloaded or unbound
> from the adapter. This might result in a NULL pointer to 'struct afu'
> being de-referenced during an EEH thereby causing a kernel oops.
> 
> This patch fixes this by making sure that all access to the array
> 'adapter->slices' is wrapped within the context of spin-lock
> 'adapter->afu_list_lock'.
> 
> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
> ---

Thanks for fixing this!

Acked-by: Frederic Barrat <fbarrat@linux.ibm.com>



> Changelog:
> 
> v3:
> * Updated a slice loop in cxl_pci_error_detectected() to ignore NULL
>    slices [Fred]
> * Added a NULL AFU check in cxl_pci_slot_reset() [Fred]
> 
> v2:
> * Fixed a wrong comparison of non-null pointer [Fred]
> * Moved a call to cxl_vphb_error_detected() within a branch that
>    checks for not null AFU pointer in 'adapter->slices' [Fred]
> * Removed a misleading comment in code.
> ---
>   drivers/misc/cxl/guest.c |  2 ++
>   drivers/misc/cxl/pci.c   | 39 ++++++++++++++++++++++++++++++---------
>   2 files changed, 32 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
> index 5d28d9e454f5..08f4a512afad 100644
> --- a/drivers/misc/cxl/guest.c
> +++ b/drivers/misc/cxl/guest.c
> @@ -267,6 +267,7 @@ static int guest_reset(struct cxl *adapter)
>   	int i, rc;
>   
>   	pr_devel("Adapter reset request\n");
> +	spin_lock(&adapter->afu_list_lock);
>   	for (i = 0; i < adapter->slices; i++) {
>   		if ((afu = adapter->afu[i])) {
>   			pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
> @@ -283,6 +284,7 @@ static int guest_reset(struct cxl *adapter)
>   			pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
>   		}
>   	}
> +	spin_unlock(&adapter->afu_list_lock);
>   	return rc;
>   }
>   
> diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
> index c79ba1c699ad..300531d6136f 100644
> --- a/drivers/misc/cxl/pci.c
> +++ b/drivers/misc/cxl/pci.c
> @@ -1805,7 +1805,7 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
>   	/* There should only be one entry, but go through the list
>   	 * anyway
>   	 */
> -	if (afu->phb == NULL)
> +	if (afu == NULL || afu->phb == NULL)
>   		return result;
>   
>   	list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
> @@ -1832,7 +1832,8 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>   {
>   	struct cxl *adapter = pci_get_drvdata(pdev);
>   	struct cxl_afu *afu;
> -	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result;
> +	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
> +	pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
>   	int i;
>   
>   	/* At this point, we could still have an interrupt pending.
> @@ -1843,6 +1844,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>   
>   	/* If we're permanently dead, give up. */
>   	if (state == pci_channel_io_perm_failure) {
> +		spin_lock(&adapter->afu_list_lock);
>   		for (i = 0; i < adapter->slices; i++) {
>   			afu = adapter->afu[i];
>   			/*
> @@ -1851,6 +1853,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>   			 */
>   			cxl_vphb_error_detected(afu, state);
>   		}
> +		spin_unlock(&adapter->afu_list_lock);
>   		return PCI_ERS_RESULT_DISCONNECT;
>   	}
>   
> @@ -1932,11 +1935,17 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>   	 *     * In slot_reset, free the old resources and allocate new ones.
>   	 *     * In resume, clear the flag to allow things to start.
>   	 */
> +
> +	/* Make sure no one else changes the afu list */
> +	spin_lock(&adapter->afu_list_lock);
> +
>   	for (i = 0; i < adapter->slices; i++) {
>   		afu = adapter->afu[i];
>   
> -		afu_result = cxl_vphb_error_detected(afu, state);
> +		if (afu == NULL)
> +			continue;
>   
> +		afu_result = cxl_vphb_error_detected(afu, state);
>   		cxl_context_detach_all(afu);
>   		cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
>   		pci_deconfigure_afu(afu);
> @@ -1948,6 +1957,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>   			 (result == PCI_ERS_RESULT_NEED_RESET))
>   			result = PCI_ERS_RESULT_NONE;
>   	}
> +	spin_unlock(&adapter->afu_list_lock);
>   
>   	/* should take the context lock here */
>   	if (cxl_adapter_context_lock(adapter) != 0)
> @@ -1980,14 +1990,18 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
>   	 */
>   	cxl_adapter_context_unlock(adapter);
>   
> +	spin_lock(&adapter->afu_list_lock);
>   	for (i = 0; i < adapter->slices; i++) {
>   		afu = adapter->afu[i];
>   
> +		if (afu == NULL)
> +			continue;
> +
>   		if (pci_configure_afu(afu, adapter, pdev))
> -			goto err;
> +			goto err_unlock;
>   
>   		if (cxl_afu_select_best_mode(afu))
> -			goto err;
> +			goto err_unlock;
>   
>   		if (afu->phb == NULL)
>   			continue;
> @@ -1999,16 +2013,16 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
>   			ctx = cxl_get_context(afu_dev);
>   
>   			if (ctx && cxl_release_context(ctx))
> -				goto err;
> +				goto err_unlock;
>   
>   			ctx = cxl_dev_context_init(afu_dev);
>   			if (IS_ERR(ctx))
> -				goto err;
> +				goto err_unlock;
>   
>   			afu_dev->dev.archdata.cxl_ctx = ctx;
>   
>   			if (cxl_ops->afu_check_and_enable(afu))
> -				goto err;
> +				goto err_unlock;
>   
>   			afu_dev->error_state = pci_channel_io_normal;
>   
> @@ -2029,8 +2043,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
>   				result = PCI_ERS_RESULT_DISCONNECT;
>   		}
>   	}
> +
> +	spin_unlock(&adapter->afu_list_lock);
>   	return result;
>   
> +err_unlock:
> +	spin_unlock(&adapter->afu_list_lock);
> +
>   err:
>   	/* All the bits that happen in both error_detected and cxl_remove
>   	 * should be idempotent, so we don't need to worry about leaving a mix
> @@ -2051,10 +2070,11 @@ static void cxl_pci_resume(struct pci_dev *pdev)
>   	 * This is not the place to be checking if everything came back up
>   	 * properly, because there's no return value: do that in slot_reset.
>   	 */
> +	spin_lock(&adapter->afu_list_lock);
>   	for (i = 0; i < adapter->slices; i++) {
>   		afu = adapter->afu[i];
>   
> -		if (afu->phb == NULL)
> +		if (afu == NULL || afu->phb == NULL)
>   			continue;
>   
>   		list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
> @@ -2063,6 +2083,7 @@ static void cxl_pci_resume(struct pci_dev *pdev)
>   				afu_dev->driver->err_handler->resume(afu_dev);
>   		}
>   	}
> +	spin_unlock(&adapter->afu_list_lock);
>   }
>   
>   static const struct pci_error_handlers cxl_err_handler = {
> 


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock'
  2019-01-29  6:15 [PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock' Vaibhav Jain
  2019-01-29  6:37 ` Andrew Donnellan
  2019-01-29  8:29 ` Frederic Barrat
@ 2019-01-29 10:08 ` christophe lombard
  2 siblings, 0 replies; 5+ messages in thread
From: christophe lombard @ 2019-01-29 10:08 UTC (permalink / raw)
  To: Vaibhav Jain, linuxppc-dev, Frederic Barrat
  Cc: Philippe Bergheaud, Alastair D'Silva, Christophe Lombard,
	Andrew Donnellan

On 29/01/2019 07:15, Vaibhav Jain wrote:
> Within cxl module, iteration over array 'adapter->slices' may be racy
> at few points as it might be simultaneously read during an EEH and its
> contents being set to NULL while driver is being unloaded or unbound
> from the adapter. This might result in a NULL pointer to 'struct afu'
> being de-referenced during an EEH thereby causing a kernel oops.
> 
> This patch fixes this by making sure that all access to the array
> 'adapter->slices' is wrapped within the context of spin-lock
> 'adapter->afu_list_lock'.
> 
> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
> ---
> Changelog:
> 
> v3:
> * Updated a slice loop in cxl_pci_error_detectected() to ignore NULL
>    slices [Fred]
> * Added a NULL AFU check in cxl_pci_slot_reset() [Fred]
> 
> v2:
> * Fixed a wrong comparison of non-null pointer [Fred]
> * Moved a call to cxl_vphb_error_detected() within a branch that
>    checks for not null AFU pointer in 'adapter->slices' [Fred]
> * Removed a misleading comment in code.
> ---

Thanks

Acked-by: Christophe Lombard<clombard@linux.vnet.ibm.com>


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock'
  2019-01-29  6:37 ` Andrew Donnellan
@ 2019-01-29 11:08   ` Vaibhav Jain
  0 siblings, 0 replies; 5+ messages in thread
From: Vaibhav Jain @ 2019-01-29 11:08 UTC (permalink / raw)
  To: Andrew Donnellan, linuxppc-dev, Frederic Barrat
  Cc: Philippe Bergheaud, Alastair D'Silva, Christophe Lombard


Andrew Donnellan <andrew.donnellan@au1.ibm.com> writes:

> On 29/1/19 5:15 pm, Vaibhav Jain wrote:
>> Within cxl module, iteration over array 'adapter->slices' may be racy
>
> adapter->slices isn't an array, adapter->afu is the array.
Thanks for catching this. Have fixed the patch description in the resent patch.

>
> Does this need to go to stable? (I'm guessing we've been hitting actual 
> Oopses?)
Re-send patch marked to stable.

> Acked-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Thanks !!

-- 
Vaibhav Jain <vaibhav@linux.ibm.com>
Linux Technology Center, IBM India Pvt. Ltd.


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-01-29 11:11 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-29  6:15 [PATCH v3] cxl: Wrap iterations over afu slices inside 'afu_list_lock' Vaibhav Jain
2019-01-29  6:37 ` Andrew Donnellan
2019-01-29 11:08   ` Vaibhav Jain
2019-01-29  8:29 ` Frederic Barrat
2019-01-29 10:08 ` christophe lombard

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.