All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/6] mpi3mr:  Support PCIe Error Recovery
@ 2023-12-14 20:58 Ranjan Kumar
  2023-12-14 20:58 ` [PATCH v2 1/6] mpi3mr: Creation of helper function Ranjan Kumar
                   ` (5 more replies)
  0 siblings, 6 replies; 10+ messages in thread
From: Ranjan Kumar @ 2023-12-14 20:58 UTC (permalink / raw)
  To: linux-scsi, martin.petersen
  Cc: rajsekhar.chundru, sathya.prakash, sumit.saxena,
	chandrakanth.patil, prayas.patel, Ranjan Kumar

[-- Attachment #1: Type: text/plain, Size: 904 bytes --]

Patches are related to "Advance Error Recovery"
and are tightly coupled and the remaining two patches are defect fix.

v1->v2:
- AER patch split as suggested by Bjorn Helgaas
- Updated driver version to a new value

Ranjan Kumar (6):
  mpi3mr: Creation of helper function
  mpi3mr: Support PCIe Error Recovery callback handlers
  mpi3mr: Prevent PCI writes from driver during PCI error recovery
  mpi3mr: Improve Shutdown times when firmware has faulted
  mpi3mr: Reset stop_bsgs flag post controller reset failure
  mpi3mr: Update driver version to 8.6.1.0.50

 drivers/scsi/mpi3mr/mpi3mr.h           |  36 ++-
 drivers/scsi/mpi3mr/mpi3mr_app.c       |  64 ++---
 drivers/scsi/mpi3mr/mpi3mr_fw.c        |  65 +++--
 drivers/scsi/mpi3mr/mpi3mr_os.c        | 322 +++++++++++++++++++++----
 drivers/scsi/mpi3mr/mpi3mr_transport.c |  39 ++-
 5 files changed, 435 insertions(+), 91 deletions(-)

-- 
2.31.1


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4209 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2 1/6] mpi3mr: Creation of helper function
  2023-12-14 20:58 [PATCH v2 0/6] mpi3mr: Support PCIe Error Recovery Ranjan Kumar
@ 2023-12-14 20:58 ` Ranjan Kumar
  2024-01-04 23:11   ` Bjorn Helgaas
  2023-12-14 20:58 ` [PATCH v2 2/6] mpi3mr: Support PCIe Error Recovery callback handlers Ranjan Kumar
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 10+ messages in thread
From: Ranjan Kumar @ 2023-12-14 20:58 UTC (permalink / raw)
  To: linux-scsi, martin.petersen
  Cc: rajsekhar.chundru, sathya.prakash, sumit.saxena,
	chandrakanth.patil, prayas.patel, Ranjan Kumar

[-- Attachment #1: Type: text/plain, Size: 3562 bytes --]

Use of helper function to get controller and shost details.

Signed-off-by: Sathya Prakash <sathya.prakash@broadcom.com>
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
---
 drivers/scsi/mpi3mr/mpi3mr_os.c | 54 ++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c
index 1bffd629c124..76ba31a9517d 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_os.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_os.c
@@ -5230,6 +5230,35 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	return retval;
 }
 
+/**
+ * mpi3mr_get_shost_and_mrioc - get shost and ioc reference if
+ *                     they are valid
+ * @pdev: PCI device struct
+ * @shost: address to store scsi host reference
+ * @mrioc: address store HBA adapter reference
+ *
+ * Return: 0 if *shost and *ioc are not NULL otherwise -1.
+ */
+
+static int
+mpi3mr_get_shost_and_mrioc(struct pci_dev *pdev,
+	struct Scsi_Host **shost, struct mpi3mr_ioc **mrioc)
+{
+	*shost = pci_get_drvdata(pdev);
+	if (*shost == NULL) {
+		dev_err(&pdev->dev, "pdev's driver data is null\n");
+		return -ENXIO;
+	}
+
+	*mrioc = shost_priv(*shost);
+	if (*mrioc == NULL) {
+		dev_err(&pdev->dev, "shost's private data is null\n");
+		*shost = NULL;
+		return -ENXIO;
+		}
+	return 0;
+}
+
 /**
  * mpi3mr_remove - PCI remove callback
  * @pdev: PCI device instance
@@ -5242,7 +5271,7 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  */
 static void mpi3mr_remove(struct pci_dev *pdev)
 {
-	struct Scsi_Host *shost = pci_get_drvdata(pdev);
+	struct Scsi_Host *shost;
 	struct mpi3mr_ioc *mrioc;
 	struct workqueue_struct	*wq;
 	unsigned long flags;
@@ -5250,7 +5279,7 @@ static void mpi3mr_remove(struct pci_dev *pdev)
 	struct mpi3mr_hba_port *port, *hba_port_next;
 	struct mpi3mr_sas_node *sas_expander, *sas_expander_next;
 
-	if (!shost)
+	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc))
 		return;
 
 	mrioc = shost_priv(shost);
@@ -5328,12 +5357,12 @@ static void mpi3mr_remove(struct pci_dev *pdev)
  */
 static void mpi3mr_shutdown(struct pci_dev *pdev)
 {
-	struct Scsi_Host *shost = pci_get_drvdata(pdev);
+	struct Scsi_Host *shost;
 	struct mpi3mr_ioc *mrioc;
 	struct workqueue_struct	*wq;
 	unsigned long flags;
 
-	if (!shost)
+	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc))
 		return;
 
 	mrioc = shost_priv(shost);
@@ -5361,17 +5390,19 @@ static void mpi3mr_shutdown(struct pci_dev *pdev)
  * Change the power state to the given value and cleanup the IOC
  * by issuing MUR and shutdown notification
  *
- * Return: 0 always.
+ * Return: 0 on success, non-zero on failure
  */
 static int __maybe_unused
 mpi3mr_suspend(struct device *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
-	struct Scsi_Host *shost = pci_get_drvdata(pdev);
+	struct Scsi_Host *shost;
 	struct mpi3mr_ioc *mrioc;
+	int rc;
 
-	if (!shost)
-		return 0;
+	rc = mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc);
+	if (rc)
+		return rc;
 
 	mrioc = shost_priv(shost);
 	while (mrioc->reset_in_progress || mrioc->is_driver_loading)
@@ -5402,13 +5433,14 @@ static int __maybe_unused
 mpi3mr_resume(struct device *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
-	struct Scsi_Host *shost = pci_get_drvdata(pdev);
+	struct Scsi_Host *shost;
 	struct mpi3mr_ioc *mrioc;
 	pci_power_t device_state = pdev->current_state;
 	int r;
 
-	if (!shost)
-		return 0;
+	r = mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc);
+	if (r)
+		return r;
 
 	mrioc = shost_priv(shost);
 
-- 
2.31.1


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4209 bytes --]

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 2/6] mpi3mr: Support PCIe Error Recovery callback handlers
  2023-12-14 20:58 [PATCH v2 0/6] mpi3mr: Support PCIe Error Recovery Ranjan Kumar
  2023-12-14 20:58 ` [PATCH v2 1/6] mpi3mr: Creation of helper function Ranjan Kumar
@ 2023-12-14 20:58 ` Ranjan Kumar
  2024-01-04 23:49   ` Bjorn Helgaas
  2023-12-14 20:58 ` [PATCH v2 3/6] mpi3mr: Prevent PCI writes from driver during PCI error recovery Ranjan Kumar
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 10+ messages in thread
From: Ranjan Kumar @ 2023-12-14 20:58 UTC (permalink / raw)
  To: linux-scsi, martin.petersen
  Cc: rajsekhar.chundru, sathya.prakash, sumit.saxena,
	chandrakanth.patil, prayas.patel, Ranjan Kumar

[-- Attachment #1: Type: text/plain, Size: 7945 bytes --]

The driver has been upgraded to include support for the
PCIe error recovery callback handler which is crucial for
the recovery of the controllers. This feature is
necessary for addressing the errors reported by
the PCIe AER (Advanced Error Reporting) mechanism.

Signed-off-by: Sathya Prakash <sathya.prakash@broadcom.com>
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
---
 drivers/scsi/mpi3mr/mpi3mr.h    |   5 +
 drivers/scsi/mpi3mr/mpi3mr_os.c | 197 ++++++++++++++++++++++++++++++++
 2 files changed, 202 insertions(+)

diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h
index 3de1ee05c44e..25e6e3a09468 100644
--- a/drivers/scsi/mpi3mr/mpi3mr.h
+++ b/drivers/scsi/mpi3mr/mpi3mr.h
@@ -23,6 +23,7 @@
 #include <linux/miscdevice.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/aer.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -1055,6 +1056,8 @@ struct scmd_priv {
  * @ioctl_chain_sge: DMA buffer descriptor for IOCTL chain
  * @ioctl_resp_sge: DMA buffer descriptor for Mgmt cmd response
  * @ioctl_sges_allocated: Flag for IOCTL SGEs allocated or not
+ * @pcie_err_recovery: PCIe error recovery in progress
+ * @block_on_pcie_err: Block IO during PCI error recovery
  */
 struct mpi3mr_ioc {
 	struct list_head list;
@@ -1246,6 +1249,8 @@ struct mpi3mr_ioc {
 	struct dma_memory_desc ioctl_chain_sge;
 	struct dma_memory_desc ioctl_resp_sge;
 	bool ioctl_sges_allocated;
+	bool pcie_err_recovery;
+	bool block_on_pcie_err;
 };
 
 /**
diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c
index 76ba31a9517d..dea47ef53abb 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_os.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_os.c
@@ -5472,6 +5472,195 @@ mpi3mr_resume(struct device *dev)
 	return 0;
 }
 
+/**
+ * mpi3mr_pcierr_detected - PCI error detected callback
+ * @pdev: PCI device instance
+ * @state: channel state
+ *
+ * This function is called by the PCI error recovery driver and
+ * based on the state passed the driver decides what actions to
+ * be recommended back to PCI driver.
+ *
+ * For all of the states if there is no valid mrioc or scsi host
+ * references in the pci device then this function will retyrn
+ * the resul as disconnect.
+ *
+ * For normal state, this function will return the result as can
+ * recover.
+ *
+ * For frozen state, this function will block for any pennding
+ * controller initialization or re-initialization to complete,
+ * stop any new interactions with the controller and return
+ * status as reset required.
+ *
+ * For permanent failure state, this function will mark the
+ * controller as unrecoverable and return status as disconnect.
+ *
+ * Returns: PCI_ERS_RESULT_NEED_RESET or CAN_RECOVER or
+ * DISCONNECT based on the controller state.
+ */
+static pci_ers_result_t
+mpi3mr_pcierr_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	struct Scsi_Host *shost;
+	struct mpi3mr_ioc *mrioc;
+	pci_ers_result_t ret_val = PCI_ERS_RESULT_DISCONNECT;
+
+	dev_info(&pdev->dev, "%s: callback invoked state(%d)\n", __func__,
+	    state);
+
+	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc)) {
+		dev_err(&pdev->dev, "device not available\n");
+		return ret_val;
+	}
+
+	switch (state) {
+	case pci_channel_io_normal:
+		ret_val = PCI_ERS_RESULT_CAN_RECOVER;
+		break;
+	case pci_channel_io_frozen:
+		mrioc->pcie_err_recovery = true;
+		mrioc->block_on_pcie_err = true;
+		while (mrioc->reset_in_progress || mrioc->is_driver_loading)
+			ssleep(1);
+		scsi_block_requests(mrioc->shost);
+		mpi3mr_stop_watchdog(mrioc);
+		mpi3mr_cleanup_resources(mrioc);
+		mrioc->pdev = NULL;
+		ret_val = PCI_ERS_RESULT_NEED_RESET;
+		break;
+	case pci_channel_io_perm_failure:
+		mrioc->pcie_err_recovery = true;
+		mrioc->block_on_pcie_err = true;
+		mrioc->unrecoverable = 1;
+		mpi3mr_stop_watchdog(mrioc);
+		mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
+		ret_val = PCI_ERS_RESULT_DISCONNECT;
+		break;
+	default:
+		break;
+	}
+	return ret_val;
+}
+
+/**
+ * mpi3mr_pcierr_slot_reset_done - Post slot reset callback
+ * @pdev: PCI device instance
+ *
+ * This function is called by the PCI error recovery driver
+ * after a slot or link reset issued by it for the recovery, the
+ * driver is expected to bring back the controller and
+ * initialize it.
+ *
+ * This function restores pci state and reinitializes controller
+ * resoruces and the controller, this blocks for any pending
+ * reset to complete.
+ *
+ * Returns: PCI_ERS_RESULT_DISCONNECT on failure or
+ * PCI_ERS_RESULT_RECOVERED
+ */
+static pci_ers_result_t mpi3mr_pcierr_slot_reset_done(struct pci_dev *pdev)
+{
+	struct Scsi_Host *shost;
+	struct mpi3mr_ioc *mrioc;
+
+
+	dev_info(&pdev->dev, "%s: callback invoked\n", __func__);
+
+	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc)) {
+		dev_err(&pdev->dev, "device not available\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	while (mrioc->reset_in_progress)
+		ssleep(1);
+
+	mrioc->pdev = pdev;
+	pci_restore_state(pdev);
+
+	if (mpi3mr_setup_resources(mrioc)) {
+		ioc_err(mrioc, "setup resources failed\n");
+		goto out_failed;
+	}
+	mrioc->unrecoverable = 0;
+	mrioc->pcie_err_recovery = false;
+
+	if (mpi3mr_soft_reset_handler(mrioc, MPI3MR_RESET_FROM_FIRMWARE, 0))
+		goto out_failed;
+
+	return PCI_ERS_RESULT_RECOVERED;
+
+out_failed:
+	mrioc->unrecoverable = 1;
+	mrioc->block_on_pcie_err = false;
+	scsi_unblock_requests(shost);
+	mpi3mr_start_watchdog(mrioc);
+	return PCI_ERS_RESULT_DISCONNECT;
+}
+
+/**
+ * mpi3mr_pcierr_resume - PCI error recovery resume
+ * callback
+ * @pdev: PCI device instance
+ *
+ * This function enables all I/O and IOCTLs post reset issued as
+ * part of the PCI advacned error reporting and handling
+ *
+ * Return: Nothing.
+ */
+static void mpi3mr_pcierr_resume(struct pci_dev *pdev)
+{
+	struct Scsi_Host *shost;
+	struct mpi3mr_ioc *mrioc;
+
+	dev_info(&pdev->dev, "%s: callback invoked\n", __func__);
+
+	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc)) {
+		dev_err(&pdev->dev, "device not available\n");
+		return;
+	}
+
+	pci_aer_clear_nonfatal_status(pdev);
+
+	if (mrioc->block_on_pcie_err) {
+		mrioc->block_on_pcie_err = false;
+		scsi_unblock_requests(shost);
+		mpi3mr_start_watchdog(mrioc);
+	}
+
+}
+
+/**
+ * mpi3mr_pcierr_mmio_enabled - PCI error recovery callback
+ * @pdev: PCI device instance
+ *
+ * This is called only if _pcierr_error_detected returns
+ * PCI_ERS_RESULT_CAN_RECOVER.
+ *
+ * Return: PCI_ERS_RESULT_DISCONNECT when the controller is
+ * unrecoverable or when the shost/mnrioc reference cannot be
+ * found, else return PCI_ERS_RESULT_RECOVERED
+ */
+static pci_ers_result_t mpi3mr_pcierr_mmio_enabled(struct pci_dev *pdev)
+{
+
+	struct Scsi_Host *shost;
+	struct mpi3mr_ioc *mrioc;
+/*
+ *
+ */
+	dev_info(&pdev->dev, "%s: callback invoked\n", __func__);
+
+	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc)) {
+		dev_err(&pdev->dev, "device not available\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+	if (mrioc->unrecoverable)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
 static const struct pci_device_id mpi3mr_pci_id_table[] = {
 	{
 		PCI_DEVICE_SUB(MPI3_MFGPAGE_VENDORID_BROADCOM,
@@ -5489,6 +5678,13 @@ static const struct pci_device_id mpi3mr_pci_id_table[] = {
 };
 MODULE_DEVICE_TABLE(pci, mpi3mr_pci_id_table);
 
+static struct pci_error_handlers mpi3mr_err_handler = {
+	.error_detected = mpi3mr_pcierr_detected,
+	.mmio_enabled = mpi3mr_pcierr_mmio_enabled,
+	.slot_reset = mpi3mr_pcierr_slot_reset_done,
+	.resume = mpi3mr_pcierr_resume,
+};
+
 static SIMPLE_DEV_PM_OPS(mpi3mr_pm_ops, mpi3mr_suspend, mpi3mr_resume);
 
 static struct pci_driver mpi3mr_pci_driver = {
@@ -5497,6 +5693,7 @@ static struct pci_driver mpi3mr_pci_driver = {
 	.probe = mpi3mr_probe,
 	.remove = mpi3mr_remove,
 	.shutdown = mpi3mr_shutdown,
+	.err_handler = &mpi3mr_err_handler,
 	.driver.pm = &mpi3mr_pm_ops,
 };
 
-- 
2.31.1


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4209 bytes --]

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 3/6] mpi3mr: Prevent PCI writes from driver during PCI error recovery
  2023-12-14 20:58 [PATCH v2 0/6] mpi3mr: Support PCIe Error Recovery Ranjan Kumar
  2023-12-14 20:58 ` [PATCH v2 1/6] mpi3mr: Creation of helper function Ranjan Kumar
  2023-12-14 20:58 ` [PATCH v2 2/6] mpi3mr: Support PCIe Error Recovery callback handlers Ranjan Kumar
@ 2023-12-14 20:58 ` Ranjan Kumar
  2024-01-05  3:20   ` Bjorn Helgaas
  2023-12-14 20:58 ` [PATCH v2 4/6] mpi3mr: Improve Shutdown times when firmware has faulted Ranjan Kumar
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 10+ messages in thread
From: Ranjan Kumar @ 2023-12-14 20:58 UTC (permalink / raw)
  To: linux-scsi, martin.petersen
  Cc: rajsekhar.chundru, sathya.prakash, sumit.saxena,
	chandrakanth.patil, prayas.patel, Ranjan Kumar

[-- Attachment #1: Type: text/plain, Size: 18383 bytes --]

Prevent interacting with the hardware while
the error recovery in progress.

Signed-off-by: Sathya Prakash <sathya.prakash@broadcom.com>
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
---
 drivers/scsi/mpi3mr/mpi3mr.h           | 26 ++++++++++
 drivers/scsi/mpi3mr/mpi3mr_app.c       | 64 +++++++++++++----------
 drivers/scsi/mpi3mr/mpi3mr_fw.c        | 28 +++++++---
 drivers/scsi/mpi3mr/mpi3mr_os.c        | 71 +++++++++++++++-----------
 drivers/scsi/mpi3mr/mpi3mr_transport.c | 39 ++++++++++++--
 5 files changed, 158 insertions(+), 70 deletions(-)

diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h
index 25e6e3a09468..1ac7f88dc1cd 100644
--- a/drivers/scsi/mpi3mr/mpi3mr.h
+++ b/drivers/scsi/mpi3mr/mpi3mr.h
@@ -481,6 +481,7 @@ struct mpi3mr_throttle_group_info {
 
 /* HBA port flags */
 #define MPI3MR_HBA_PORT_FLAG_DIRTY	0x01
+#define MPI3MR_HBA_PORT_FLAG_NEW       0x02
 
 /* IOCTL data transfer sge*/
 #define MPI3MR_NUM_IOCTL_SGE		256
@@ -900,6 +901,29 @@ struct scmd_priv {
 	u8 mpi3mr_scsiio_req[MPI3MR_ADMIN_REQ_FRAME_SZ];
 };
 
+/**
+ * struct mpi3mr_pdevinfo - PCI device information
+ *
+ * @dev_id: PCI device ID of the adapter
+ * @dev_hw_rev: PCI revision of the adapter
+ * @subsys_dev_id: PCI subsystem device ID of the adapter
+ * @subsys_ven_id: PCI subsystem vendor ID of the adapter
+ * @dev: PCI device
+ * @func: PCI function
+ * @bus: PCI bus
+ * @seg_id: PCI segment ID
+ */
+struct mpi3mr_pdevinfo {
+	u16 id;
+	u16 ssid;
+	u16 ssvid;
+	u16 segment;
+	u8 dev:5;
+	u8 func:3;
+	u8 bus;
+	u8 revision;
+};
+
 /**
  * struct mpi3mr_ioc - Adapter anchor structure stored in shost
  * private data
@@ -1058,6 +1082,7 @@ struct scmd_priv {
  * @ioctl_sges_allocated: Flag for IOCTL SGEs allocated or not
  * @pcie_err_recovery: PCIe error recovery in progress
  * @block_on_pcie_err: Block IO during PCI error recovery
+ * @pdevinfo: PCI device information
  */
 struct mpi3mr_ioc {
 	struct list_head list;
@@ -1251,6 +1276,7 @@ struct mpi3mr_ioc {
 	bool ioctl_sges_allocated;
 	bool pcie_err_recovery;
 	bool block_on_pcie_err;
+	struct mpi3mr_pdevinfo pdevinfo;
 };
 
 /**
diff --git a/drivers/scsi/mpi3mr/mpi3mr_app.c b/drivers/scsi/mpi3mr/mpi3mr_app.c
index 4b93b7440da6..a11d6f026f0e 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_app.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_app.c
@@ -31,7 +31,7 @@ static int mpi3mr_bsg_pel_abort(struct mpi3mr_ioc *mrioc)
 		dprint_bsg_err(mrioc, "%s: reset in progress\n", __func__);
 		return -1;
 	}
-	if (mrioc->stop_bsgs) {
+	if (mrioc->stop_bsgs || mrioc->block_on_pcie_err) {
 		dprint_bsg_err(mrioc, "%s: bsgs are blocked\n", __func__);
 		return -1;
 	}
@@ -424,6 +424,9 @@ static long mpi3mr_bsg_adp_reset(struct mpi3mr_ioc *mrioc,
 		goto out;
 	}
 
+	if (mrioc->unrecoverable || mrioc->block_on_pcie_err)
+		return -EINVAL;
+
 	sg_copy_to_buffer(job->request_payload.sg_list,
 			  job->request_payload.sg_cnt,
 			  &adpreset, sizeof(adpreset));
@@ -470,25 +473,29 @@ static long mpi3mr_bsg_populate_adpinfo(struct mpi3mr_ioc *mrioc,
 
 	memset(&adpinfo, 0, sizeof(adpinfo));
 	adpinfo.adp_type = MPI3MR_BSG_ADPTYPE_AVGFAMILY;
-	adpinfo.pci_dev_id = mrioc->pdev->device;
-	adpinfo.pci_dev_hw_rev = mrioc->pdev->revision;
-	adpinfo.pci_subsys_dev_id = mrioc->pdev->subsystem_device;
-	adpinfo.pci_subsys_ven_id = mrioc->pdev->subsystem_vendor;
-	adpinfo.pci_bus = mrioc->pdev->bus->number;
-	adpinfo.pci_dev = PCI_SLOT(mrioc->pdev->devfn);
-	adpinfo.pci_func = PCI_FUNC(mrioc->pdev->devfn);
-	adpinfo.pci_seg_id = pci_domain_nr(mrioc->pdev->bus);
 	adpinfo.app_intfc_ver = MPI3MR_IOCTL_VERSION;
 
-	ioc_state = mpi3mr_get_iocstate(mrioc);
-	if (ioc_state == MRIOC_STATE_UNRECOVERABLE)
-		adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE;
-	else if ((mrioc->reset_in_progress) || (mrioc->stop_bsgs))
+	if (mrioc->reset_in_progress || mrioc->stop_bsgs ||
+	    mrioc->block_on_pcie_err)
 		adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_IN_RESET;
-	else if (ioc_state == MRIOC_STATE_FAULT)
-		adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_FAULT;
-	else
-		adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL;
+	else {
+		ioc_state = mpi3mr_get_iocstate(mrioc);
+		if (ioc_state == MRIOC_STATE_UNRECOVERABLE)
+			adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE;
+		else if (ioc_state == MRIOC_STATE_FAULT)
+			adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_FAULT;
+		else
+			adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL;
+	}
+
+	adpinfo.pci_dev_id = mrioc->pdevinfo.id;
+	adpinfo.pci_dev_hw_rev = mrioc->pdevinfo.revision;
+	adpinfo.pci_subsys_dev_id = mrioc->pdevinfo.ssid;
+	adpinfo.pci_subsys_ven_id = mrioc->pdevinfo.ssvid;
+	adpinfo.pci_bus = mrioc->pdevinfo.bus;
+	adpinfo.pci_dev = mrioc->pdevinfo.dev;
+	adpinfo.pci_func = mrioc->pdevinfo.func;
+	adpinfo.pci_seg_id = mrioc->pdevinfo.segment;
 
 	memcpy((u8 *)&adpinfo.driver_info, (u8 *)&mrioc->driver_info,
 	    sizeof(adpinfo.driver_info));
@@ -1495,7 +1502,7 @@ static long mpi3mr_bsg_process_mpt_cmds(struct bsg_job *job)
 		mutex_unlock(&mrioc->bsg_cmds.mutex);
 		goto out;
 	}
-	if (mrioc->stop_bsgs) {
+	if (mrioc->stop_bsgs || mrioc->block_on_pcie_err) {
 		dprint_bsg_err(mrioc, "%s: bsgs are blocked\n", __func__);
 		rval = -EAGAIN;
 		mutex_unlock(&mrioc->bsg_cmds.mutex);
@@ -2020,17 +2027,20 @@ adp_state_show(struct device *dev, struct device_attribute *attr,
 	enum mpi3mr_iocstate ioc_state;
 	uint8_t adp_state;
 
-	ioc_state = mpi3mr_get_iocstate(mrioc);
-	if (ioc_state == MRIOC_STATE_UNRECOVERABLE)
-		adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE;
-	else if ((mrioc->reset_in_progress) || (mrioc->stop_bsgs))
+	if (mrioc->reset_in_progress || mrioc->stop_bsgs ||
+		 mrioc->block_on_pcie_err)
 		adp_state = MPI3MR_BSG_ADPSTATE_IN_RESET;
-	else if (ioc_state == MRIOC_STATE_FAULT)
-		adp_state = MPI3MR_BSG_ADPSTATE_FAULT;
-	else
-		adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL;
+	else {
+		ioc_state = mpi3mr_get_iocstate(mrioc);
+		if (ioc_state == MRIOC_STATE_UNRECOVERABLE)
+			adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE;
+		else if (ioc_state == MRIOC_STATE_FAULT)
+			adp_state = MPI3MR_BSG_ADPSTATE_FAULT;
+		else
+			adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL;
+	}
 
-	return sysfs_emit(buf, "%u\n", adp_state);
+	return snprintf(buf, PAGE_SIZE, "%u\n", adp_state);
 }
 
 static DEVICE_ATTR_RO(adp_state);
diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c
index d8c57a0a518f..c50fc27231cd 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_fw.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c
@@ -595,7 +595,7 @@ int mpi3mr_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
 	mrioc = (struct mpi3mr_ioc *)shost->hostdata;
 
 	if ((mrioc->reset_in_progress || mrioc->prepare_for_reset ||
-	    mrioc->unrecoverable))
+	    mrioc->unrecoverable || mrioc->pcie_err_recovery))
 		return 0;
 
 	num_entries = mpi3mr_process_op_reply_q(mrioc,
@@ -1037,13 +1037,13 @@ enum mpi3mr_iocstate mpi3mr_get_iocstate(struct mpi3mr_ioc *mrioc)
 	u32 ioc_status, ioc_config;
 	u8 ready, enabled;
 
-	ioc_status = readl(&mrioc->sysif_regs->ioc_status);
-	ioc_config = readl(&mrioc->sysif_regs->ioc_configuration);
-
 	if (mrioc->unrecoverable)
 		return MRIOC_STATE_UNRECOVERABLE;
+	ioc_status = readl(&mrioc->sysif_regs->ioc_status);
+
 	if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)
 		return MRIOC_STATE_FAULT;
+	ioc_config = readl(&mrioc->sysif_regs->ioc_configuration);
 
 	ready = (ioc_status & MPI3_SYSIF_IOC_STATUS_READY);
 	enabled = (ioc_config & MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC);
@@ -1667,6 +1667,12 @@ int mpi3mr_admin_request_post(struct mpi3mr_ioc *mrioc, void *admin_req,
 		retval = -EAGAIN;
 		goto out;
 	}
+	if (mrioc->pcie_err_recovery) {
+		ioc_err(mrioc, "admin request queue submission failed due to pcie error recovery in progress\n");
+		retval = -EAGAIN;
+		goto out;
+	}
+
 	areq_entry = (u8 *)mrioc->admin_req_base +
 	    (areq_pi * MPI3MR_ADMIN_REQ_FRAME_SZ);
 	memset(areq_entry, 0, MPI3MR_ADMIN_REQ_FRAME_SZ);
@@ -2337,6 +2343,11 @@ int mpi3mr_op_request_post(struct mpi3mr_ioc *mrioc,
 		retval = -EAGAIN;
 		goto out;
 	}
+	if (mrioc->pcie_err_recovery) {
+		ioc_err(mrioc, "operational request queue submission failed due to pcie error recovery in progress\n");
+		retval = -EAGAIN;
+		goto out;
+	}
 
 	segment_base_addr = segments[pi / op_req_q->segment_qd].segment;
 	req_entry = (u8 *)segment_base_addr +
@@ -2585,7 +2596,7 @@ static void mpi3mr_watchdog_work(struct work_struct *work)
 	u32 fault, host_diagnostic, ioc_status;
 	u32 reset_reason = MPI3MR_RESET_FROM_FAULT_WATCH;
 
-	if (mrioc->reset_in_progress)
+	if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
 		return;
 
 	if (!mrioc->unrecoverable && !pci_device_is_present(mrioc->pdev)) {
@@ -4111,7 +4122,7 @@ int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume)
 		goto out_failed_noretry;
 	}
 
-	if (is_resume) {
+	if (is_resume || mrioc->block_on_pcie_err) {
 		dprint_reset(mrioc, "setting up single ISR\n");
 		retval = mpi3mr_setup_isr(mrioc, 1);
 		if (retval) {
@@ -4151,7 +4162,7 @@ int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume)
 		goto out_failed;
 	}
 
-	if (is_resume) {
+	if (is_resume || mrioc->block_on_pcie_err) {
 		dprint_reset(mrioc, "setting up multiple ISR\n");
 		retval = mpi3mr_setup_isr(mrioc, 0);
 		if (retval) {
@@ -4625,7 +4636,8 @@ void mpi3mr_cleanup_ioc(struct mpi3mr_ioc *mrioc)
 
 	ioc_state = mpi3mr_get_iocstate(mrioc);
 
-	if ((!mrioc->unrecoverable) && (!mrioc->reset_in_progress) &&
+	if (!mrioc->unrecoverable && !mrioc->reset_in_progress &&
+	    !mrioc->pcie_err_recovery &&
 	    (ioc_state == MRIOC_STATE_READY)) {
 		if (mpi3mr_issue_and_process_mur(mrioc,
 		    MPI3MR_RESET_FROM_CTLR_CLEANUP))
diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c
index dea47ef53abb..d61fd105dfad 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_os.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_os.c
@@ -919,7 +919,7 @@ static int mpi3mr_report_tgtdev_to_host(struct mpi3mr_ioc *mrioc,
 	int retval = 0;
 	struct mpi3mr_tgt_dev *tgtdev;
 
-	if (mrioc->reset_in_progress)
+	if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
 		return -1;
 
 	tgtdev = mpi3mr_get_tgtdev_by_perst_id(mrioc, perst_id);
@@ -2000,8 +2000,13 @@ static void mpi3mr_fwevt_bh(struct mpi3mr_ioc *mrioc,
 	}
 	case MPI3_EVENT_WAIT_FOR_DEVICES_TO_REFRESH:
 	{
-		while (mrioc->device_refresh_on)
+		while ((mrioc->device_refresh_on || mrioc->block_on_pcie_err) &&
+		    !mrioc->unrecoverable && !mrioc->pcie_err_recovery) {
 			msleep(500);
+		}
+
+		if (mrioc->unrecoverable || mrioc->pcie_err_recovery)
+			break;
 
 		dprint_event_bh(mrioc,
 		    "scan for non responding and newly added devices after soft reset started\n");
@@ -3680,6 +3685,13 @@ int mpi3mr_issue_tm(struct mpi3mr_ioc *mrioc, u8 tm_type,
 		mutex_unlock(&drv_cmd->mutex);
 		goto out;
 	}
+	if (mrioc->block_on_pcie_err) {
+		retval = -1;
+		dprint_tm(mrioc, "sending task management failed due to\n"
+				"pcie error recovery in progress\n");
+		mutex_unlock(&drv_cmd->mutex);
+		goto out;
+	}
 
 	drv_cmd->state = MPI3MR_CMD_PENDING;
 	drv_cmd->is_waiting = 1;
@@ -4073,12 +4085,19 @@ static int mpi3mr_eh_bus_reset(struct scsi_cmnd *scmd)
 	if (dev_type == MPI3_DEVICE_DEVFORM_VD) {
 		mpi3mr_wait_for_host_io(mrioc,
 			MPI3MR_RAID_ERRREC_RESET_TIMEOUT);
-		if (!mpi3mr_get_fw_pending_ios(mrioc))
+		if (!mpi3mr_get_fw_pending_ios(mrioc)) {
+			while (mrioc->reset_in_progress ||
+			       mrioc->prepare_for_reset ||
+			       mrioc->block_on_pcie_err)
+				ssleep(1);
 			retval = SUCCESS;
+			goto out;
+		}
 	}
 	if (retval == FAILED)
 		mpi3mr_print_pending_host_io(mrioc);
 
+out:
 	sdev_printk(KERN_INFO, scmd->device,
 		"Bus reset is %s for scmd(%p)\n",
 		((retval == SUCCESS) ? "SUCCESS" : "FAILED"), scmd);
@@ -4779,7 +4798,8 @@ static int mpi3mr_qcmd(struct Scsi_Host *shost,
 		goto out;
 	}
 
-	if (mrioc->reset_in_progress) {
+	if (mrioc->reset_in_progress || mrioc->prepare_for_reset
+	    || mrioc->block_on_pcie_err) {
 		retval = SCSI_MLQUEUE_HOST_BUSY;
 		goto out;
 	}
@@ -5123,6 +5143,14 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	mrioc->logging_level = logging_level;
 	mrioc->shost = shost;
 	mrioc->pdev = pdev;
+	mrioc->pdevinfo.id = pdev->device;
+	mrioc->pdevinfo.revision = pdev->revision;
+	mrioc->pdevinfo.ssid = pdev->subsystem_device;
+	mrioc->pdevinfo.ssvid = pdev->subsystem_vendor;
+	mrioc->pdevinfo.bus = pdev->bus->number;
+	mrioc->pdevinfo.dev = PCI_SLOT(pdev->devfn);
+	mrioc->pdevinfo.func = PCI_FUNC(pdev->devfn);
+	mrioc->pdevinfo.segment = pci_domain_nr(pdev->bus);
 	mrioc->stop_bsgs = 1;
 
 	mrioc->max_sgl_entries = max_sgl_entries;
@@ -5276,17 +5304,21 @@ static void mpi3mr_remove(struct pci_dev *pdev)
 	struct workqueue_struct	*wq;
 	unsigned long flags;
 	struct mpi3mr_tgt_dev *tgtdev, *tgtdev_next;
-	struct mpi3mr_hba_port *port, *hba_port_next;
-	struct mpi3mr_sas_node *sas_expander, *sas_expander_next;
 
 	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc))
 		return;
 
-	mrioc = shost_priv(shost);
 	while (mrioc->reset_in_progress || mrioc->is_driver_loading)
 		ssleep(1);
 
-	if (!pci_device_is_present(mrioc->pdev)) {
+	if (mrioc->block_on_pcie_err) {
+		mrioc->block_on_pcie_err = false;
+		scsi_unblock_requests(shost);
+		mrioc->unrecoverable = 1;
+	}
+
+	if (!pci_device_is_present(mrioc->pdev) ||
+	    mrioc->pcie_err_recovery) {
 		mrioc->unrecoverable = 1;
 		mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
 	}
@@ -5316,29 +5348,6 @@ static void mpi3mr_remove(struct pci_dev *pdev)
 	mpi3mr_cleanup_ioc(mrioc);
 	mpi3mr_free_mem(mrioc);
 	mpi3mr_cleanup_resources(mrioc);
-
-	spin_lock_irqsave(&mrioc->sas_node_lock, flags);
-	list_for_each_entry_safe_reverse(sas_expander, sas_expander_next,
-	    &mrioc->sas_expander_list, list) {
-		spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
-		mpi3mr_expander_node_remove(mrioc, sas_expander);
-		spin_lock_irqsave(&mrioc->sas_node_lock, flags);
-	}
-	list_for_each_entry_safe(port, hba_port_next, &mrioc->hba_port_table_list, list) {
-		ioc_info(mrioc,
-		    "removing hba_port entry: %p port: %d from hba_port list\n",
-		    port, port->port_id);
-		list_del(&port->list);
-		kfree(port);
-	}
-	spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
-
-	if (mrioc->sas_hba.num_phys) {
-		kfree(mrioc->sas_hba.phy);
-		mrioc->sas_hba.phy = NULL;
-		mrioc->sas_hba.num_phys = 0;
-	}
-
 	spin_lock(&mrioc_list_lock);
 	list_del(&mrioc->list);
 	spin_unlock(&mrioc_list_lock);
diff --git a/drivers/scsi/mpi3mr/mpi3mr_transport.c b/drivers/scsi/mpi3mr/mpi3mr_transport.c
index c0c8ab586957..8c8368104a27 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_transport.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_transport.c
@@ -149,6 +149,11 @@ static int mpi3mr_report_manufacture(struct mpi3mr_ioc *mrioc,
 		return -EFAULT;
 	}
 
+	if (mrioc->pcie_err_recovery) {
+		ioc_err(mrioc, "%s: pcie error recovery in progress!\n", __func__);
+		return -EFAULT;
+	}
+
 	data_out_sz = sizeof(struct rep_manu_request);
 	data_in_sz = sizeof(struct rep_manu_reply);
 	data_out = dma_alloc_coherent(&mrioc->pdev->dev,
@@ -792,6 +797,12 @@ static int mpi3mr_set_identify(struct mpi3mr_ioc *mrioc, u16 handle,
 		return -EFAULT;
 	}
 
+	if (mrioc->pcie_err_recovery) {
+		ioc_err(mrioc, "%s: pcie error recovery in progress!\n",
+		    __func__);
+		return -EFAULT;
+	}
+
 	if ((mpi3mr_cfg_get_dev_pg0(mrioc, &ioc_status, &device_pg0,
 	    sizeof(device_pg0), MPI3_DEVICE_PGAD_FORM_HANDLE, handle))) {
 		ioc_err(mrioc, "%s: device page0 read failed\n", __func__);
@@ -1009,6 +1020,9 @@ mpi3mr_alloc_hba_port(struct mpi3mr_ioc *mrioc, u16 port_id)
 	hba_port->port_id = port_id;
 	ioc_info(mrioc, "hba_port entry: %p, port: %d is added to hba_port list\n",
 	    hba_port, hba_port->port_id);
+	if (mrioc->reset_in_progress ||
+		mrioc->pcie_err_recovery)
+		hba_port->flags = MPI3MR_HBA_PORT_FLAG_NEW;
 	list_add_tail(&hba_port->list, &mrioc->hba_port_table_list);
 	return hba_port;
 }
@@ -1057,7 +1071,7 @@ void mpi3mr_update_links(struct mpi3mr_ioc *mrioc,
 	struct mpi3mr_sas_node *mr_sas_node;
 	struct mpi3mr_sas_phy *mr_sas_phy;
 
-	if (mrioc->reset_in_progress)
+	if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
 		return;
 
 	spin_lock_irqsave(&mrioc->sas_node_lock, flags);
@@ -1965,7 +1979,7 @@ int mpi3mr_expander_add(struct mpi3mr_ioc *mrioc, u16 handle)
 	if (!handle)
 		return -1;
 
-	if (mrioc->reset_in_progress)
+	if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
 		return -1;
 
 	if ((mpi3mr_cfg_get_sas_exp_pg0(mrioc, &ioc_status, &expander_pg0,
@@ -2171,7 +2185,7 @@ void mpi3mr_expander_node_remove(struct mpi3mr_ioc *mrioc,
 	/* remove sibling ports attached to this expander */
 	list_for_each_entry_safe(mr_sas_port, next,
 	   &sas_expander->sas_port_list, port_list) {
-		if (mrioc->reset_in_progress)
+		if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
 			return;
 		if (mr_sas_port->remote_identify.device_type ==
 		    SAS_END_DEVICE)
@@ -2221,7 +2235,7 @@ void mpi3mr_expander_remove(struct mpi3mr_ioc *mrioc, u64 sas_address,
 	struct mpi3mr_sas_node *sas_expander;
 	unsigned long flags;
 
-	if (mrioc->reset_in_progress)
+	if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
 		return;
 
 	if (!hba_port)
@@ -2532,6 +2546,11 @@ static int mpi3mr_get_expander_phy_error_log(struct mpi3mr_ioc *mrioc,
 		return -EFAULT;
 	}
 
+	if (mrioc->pcie_err_recovery) {
+		ioc_err(mrioc, "%s: pcie error recovery in progress!\n", __func__);
+		return -EFAULT;
+	}
+
 	data_out_sz = sizeof(struct phy_error_log_request);
 	data_in_sz = sizeof(struct phy_error_log_reply);
 	sz = data_out_sz + data_in_sz;
@@ -2791,6 +2810,12 @@ mpi3mr_expander_phy_control(struct mpi3mr_ioc *mrioc,
 		return -EFAULT;
 	}
 
+	if (mrioc->pcie_err_recovery) {
+		ioc_err(mrioc, "%s: pcie error recovery in progress!\n",
+		    __func__);
+		return -EFAULT;
+	}
+
 	data_out_sz = sizeof(struct phy_control_request);
 	data_in_sz = sizeof(struct phy_control_reply);
 	sz = data_out_sz + data_in_sz;
@@ -3214,6 +3239,12 @@ mpi3mr_transport_smp_handler(struct bsg_job *job, struct Scsi_Host *shost,
 		goto out;
 	}
 
+	if (mrioc->pcie_err_recovery) {
+		ioc_err(mrioc, "%s: pcie error recovery in progress!\n", __func__);
+		rc = -EFAULT;
+		goto out;
+	}
+
 	rc = mpi3mr_map_smp_buffer(&mrioc->pdev->dev, &job->request_payload,
 	    &dma_addr_out, &dma_len_out, &addr_out);
 	if (rc)
-- 
2.31.1


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4209 bytes --]

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 4/6] mpi3mr: Improve Shutdown times when firmware has faulted
  2023-12-14 20:58 [PATCH v2 0/6] mpi3mr: Support PCIe Error Recovery Ranjan Kumar
                   ` (2 preceding siblings ...)
  2023-12-14 20:58 ` [PATCH v2 3/6] mpi3mr: Prevent PCI writes from driver during PCI error recovery Ranjan Kumar
@ 2023-12-14 20:58 ` Ranjan Kumar
  2023-12-14 20:58 ` [PATCH v2 5/6] mpi3mr: Reset stop_bsgs flag post controller reset failure Ranjan Kumar
  2023-12-14 20:59 ` [PATCH v2 6/6] mpi3mr: Update driver version to 8.6.1.0.50 Ranjan Kumar
  5 siblings, 0 replies; 10+ messages in thread
From: Ranjan Kumar @ 2023-12-14 20:58 UTC (permalink / raw)
  To: linux-scsi, martin.petersen
  Cc: rajsekhar.chundru, sathya.prakash, sumit.saxena,
	chandrakanth.patil, prayas.patel, Ranjan Kumar

[-- Attachment #1: Type: text/plain, Size: 3942 bytes --]

The driver monitors the controller state periodically while
waiting for the shutdown notification MPI request to complete.
If the firmware is faulty, the driver resets the controller and
re-issues the shutdown notification. The driver will make three
attempts to complete the shutdown process and will not retry the
notification request if the controller reset is unsuccessful.

Signed-off-by: Prayas Patel <prayas.patel@broadcom.com>
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
---
 drivers/scsi/mpi3mr/mpi3mr.h    |  1 +
 drivers/scsi/mpi3mr/mpi3mr_fw.c | 36 +++++++++++++++++++++++++--------
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h
index 1ac7f88dc1cd..de953eb055d0 100644
--- a/drivers/scsi/mpi3mr/mpi3mr.h
+++ b/drivers/scsi/mpi3mr/mpi3mr.h
@@ -159,6 +159,7 @@ extern atomic64_t event_counter;
 /* Controller Reset related definitions */
 #define MPI3MR_HOSTDIAG_UNLOCK_RETRY_COUNT	5
 #define MPI3MR_MAX_RESET_RETRY_COUNT		3
+#define MPI3MR_MAX_SHUTDOWN_RETRY_COUNT		2
 
 /* ResponseCode definitions */
 #define MPI3MR_RI_MASK_RESPCODE		(0x000000FF)
diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c
index c50fc27231cd..491ef854fdba 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_fw.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c
@@ -4566,9 +4566,10 @@ void mpi3mr_free_mem(struct mpi3mr_ioc *mrioc)
  */
 static void mpi3mr_issue_ioc_shutdown(struct mpi3mr_ioc *mrioc)
 {
-	u32 ioc_config, ioc_status;
-	u8 retval = 1;
+	u32 ioc_config, ioc_status, shutdown_action;
+	u8 retval = 1, retry = 0;
 	u32 timeout = MPI3MR_DEFAULT_SHUTDOWN_TIME * 10;
+	u32 timeout_remaining = 0;
 
 	ioc_info(mrioc, "Issuing shutdown Notification\n");
 	if (mrioc->unrecoverable) {
@@ -4583,15 +4584,16 @@ static void mpi3mr_issue_ioc_shutdown(struct mpi3mr_ioc *mrioc)
 		return;
 	}
 
+	shutdown_action = MPI3_SYSIF_IOC_CONFIG_SHUTDOWN_NORMAL |
+	    MPI3_SYSIF_IOC_CONFIG_DEVICE_SHUTDOWN_SEND_REQ;
 	ioc_config = readl(&mrioc->sysif_regs->ioc_configuration);
-	ioc_config |= MPI3_SYSIF_IOC_CONFIG_SHUTDOWN_NORMAL;
-	ioc_config |= MPI3_SYSIF_IOC_CONFIG_DEVICE_SHUTDOWN_SEND_REQ;
+	ioc_config |= shutdown_action;
 
 	writel(ioc_config, &mrioc->sysif_regs->ioc_configuration);
 
 	if (mrioc->facts.shutdown_timeout)
 		timeout = mrioc->facts.shutdown_timeout * 10;
-
+	timeout_remaining = timeout;
 	do {
 		ioc_status = readl(&mrioc->sysif_regs->ioc_status);
 		if ((ioc_status & MPI3_SYSIF_IOC_STATUS_SHUTDOWN_MASK)
@@ -4599,8 +4601,26 @@ static void mpi3mr_issue_ioc_shutdown(struct mpi3mr_ioc *mrioc)
 			retval = 0;
 			break;
 		}
+		if (mrioc->unrecoverable)
+			break;
+		if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) {
+			mpi3mr_print_fault_info(mrioc);
+			if (retry >= MPI3MR_MAX_SHUTDOWN_RETRY_COUNT)
+				break;
+			if (mpi3mr_issue_reset(mrioc,
+			    MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET,
+			    MPI3MR_RESET_FROM_CTLR_CLEANUP))
+				break;
+			ioc_config =
+			    readl(&mrioc->sysif_regs->ioc_configuration);
+			ioc_config |= shutdown_action;
+			writel(ioc_config,
+			    &mrioc->sysif_regs->ioc_configuration);
+			timeout_remaining = timeout;
+			retry++;
+		}
 		msleep(100);
-	} while (--timeout);
+	} while (--timeout_remaining);
 
 	ioc_status = readl(&mrioc->sysif_regs->ioc_status);
 	ioc_config = readl(&mrioc->sysif_regs->ioc_configuration);
@@ -4609,11 +4629,11 @@ static void mpi3mr_issue_ioc_shutdown(struct mpi3mr_ioc *mrioc)
 		if ((ioc_status & MPI3_SYSIF_IOC_STATUS_SHUTDOWN_MASK)
 		    == MPI3_SYSIF_IOC_STATUS_SHUTDOWN_IN_PROGRESS)
 			ioc_warn(mrioc,
-			    "shutdown still in progress after timeout\n");
+			    "shutdown still in progress\n");
 	}
 
 	ioc_info(mrioc,
-	    "Base IOC Sts/Config after %s shutdown is (0x%x)/(0x%x)\n",
+	    "ioc_status/ioc_config after %s shutdown is (0x%x)/(0x%x)\n",
 	    (!retval) ? "successful" : "failed", ioc_status,
 	    ioc_config);
 }
-- 
2.31.1


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4209 bytes --]

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 5/6] mpi3mr: Reset stop_bsgs flag post controller reset failure
  2023-12-14 20:58 [PATCH v2 0/6] mpi3mr: Support PCIe Error Recovery Ranjan Kumar
                   ` (3 preceding siblings ...)
  2023-12-14 20:58 ` [PATCH v2 4/6] mpi3mr: Improve Shutdown times when firmware has faulted Ranjan Kumar
@ 2023-12-14 20:58 ` Ranjan Kumar
  2023-12-14 20:59 ` [PATCH v2 6/6] mpi3mr: Update driver version to 8.6.1.0.50 Ranjan Kumar
  5 siblings, 0 replies; 10+ messages in thread
From: Ranjan Kumar @ 2023-12-14 20:58 UTC (permalink / raw)
  To: linux-scsi, martin.petersen
  Cc: rajsekhar.chundru, sathya.prakash, sumit.saxena,
	chandrakanth.patil, prayas.patel, Ranjan Kumar

[-- Attachment #1: Type: text/plain, Size: 799 bytes --]

stop_bsgs flag blocks the ioctls during controller reset.
currently, it remains set after a controller failure, so
resetting it.

Signed-off-by: Sumit Saxena <sumit.saxena@broadcom.com>
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
---
 drivers/scsi/mpi3mr/mpi3mr_fw.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c
index 491ef854fdba..209920cd5ec2 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_fw.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c
@@ -5136,6 +5136,7 @@ int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc,
 		mrioc->device_refresh_on = 0;
 		mrioc->unrecoverable = 1;
 		mrioc->reset_in_progress = 0;
+		mrioc->stop_bsgs = 0;
 		retval = -1;
 		mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
 	}
-- 
2.31.1


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4209 bytes --]

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 6/6] mpi3mr: Update driver version to 8.6.1.0.50
  2023-12-14 20:58 [PATCH v2 0/6] mpi3mr: Support PCIe Error Recovery Ranjan Kumar
                   ` (4 preceding siblings ...)
  2023-12-14 20:58 ` [PATCH v2 5/6] mpi3mr: Reset stop_bsgs flag post controller reset failure Ranjan Kumar
@ 2023-12-14 20:59 ` Ranjan Kumar
  5 siblings, 0 replies; 10+ messages in thread
From: Ranjan Kumar @ 2023-12-14 20:59 UTC (permalink / raw)
  To: linux-scsi, martin.petersen
  Cc: rajsekhar.chundru, sathya.prakash, sumit.saxena,
	chandrakanth.patil, prayas.patel, Ranjan Kumar

[-- Attachment #1: Type: text/plain, Size: 755 bytes --]

Update driver version to 8.6.1.0.50

Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
---
 drivers/scsi/mpi3mr/mpi3mr.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h
index de953eb055d0..d6e43862221f 100644
--- a/drivers/scsi/mpi3mr/mpi3mr.h
+++ b/drivers/scsi/mpi3mr/mpi3mr.h
@@ -56,8 +56,8 @@ extern struct list_head mrioc_list;
 extern int prot_mask;
 extern atomic64_t event_counter;
 
-#define MPI3MR_DRIVER_VERSION	"8.5.1.0.0"
-#define MPI3MR_DRIVER_RELDATE	"5-December-2023"
+#define MPI3MR_DRIVER_VERSION	"8.6.1.0.50"
+#define MPI3MR_DRIVER_RELDATE	"6-December-2023"
 
 #define MPI3MR_DRIVER_NAME	"mpi3mr"
 #define MPI3MR_DRIVER_LICENSE	"GPL"
-- 
2.31.1


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4209 bytes --]

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 1/6] mpi3mr: Creation of helper function
  2023-12-14 20:58 ` [PATCH v2 1/6] mpi3mr: Creation of helper function Ranjan Kumar
@ 2024-01-04 23:11   ` Bjorn Helgaas
  0 siblings, 0 replies; 10+ messages in thread
From: Bjorn Helgaas @ 2024-01-04 23:11 UTC (permalink / raw)
  To: Ranjan Kumar
  Cc: linux-scsi, martin.petersen, rajsekhar.chundru, sathya.prakash,
	sumit.saxena, chandrakanth.patil, prayas.patel

I wasn't cc'd on this series, so apologies if it's already been
applied and these comments are moot.

BTW, the cover doesn't say what this applies to, and it doesn't apply
cleanly to v6.7-rc1.

On Fri, Dec 15, 2023 at 02:28:55AM +0530, Ranjan Kumar wrote:
> Use of helper function to get controller and shost details.
> 
> Signed-off-by: Sathya Prakash <sathya.prakash@broadcom.com>
> Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
> ---
>  drivers/scsi/mpi3mr/mpi3mr_os.c | 54 ++++++++++++++++++++++++++-------
>  1 file changed, 43 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c
> index 1bffd629c124..76ba31a9517d 100644
> --- a/drivers/scsi/mpi3mr/mpi3mr_os.c
> +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c
> @@ -5230,6 +5230,35 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	return retval;
>  }
>  
> +/**
> + * mpi3mr_get_shost_and_mrioc - get shost and ioc reference if
> + *                     they are valid
> + * @pdev: PCI device struct
> + * @shost: address to store scsi host reference
> + * @mrioc: address store HBA adapter reference
> + *
> + * Return: 0 if *shost and *ioc are not NULL otherwise -1.
> + */
> +

Spurious blank line (I see that mpi3mr_probe() has a blank line here,
but most functions in this file do not).

> +static int
> +mpi3mr_get_shost_and_mrioc(struct pci_dev *pdev,

Most functions in this file use this style:

  static int mpi3mr_get_shost_and_mrioc(struct pci_dev *pdev,

> +	struct Scsi_Host **shost, struct mpi3mr_ioc **mrioc)
> +{
> +	*shost = pci_get_drvdata(pdev);
> +	if (*shost == NULL) {
> +		dev_err(&pdev->dev, "pdev's driver data is null\n");
> +		return -ENXIO;
> +	}
> +
> +	*mrioc = shost_priv(*shost);
> +	if (*mrioc == NULL) {
> +		dev_err(&pdev->dev, "shost's private data is null\n");
> +		*shost = NULL;
> +		return -ENXIO;
> +		}
> +	return 0;
> +}
> +
>  /**
>   * mpi3mr_remove - PCI remove callback
>   * @pdev: PCI device instance
> @@ -5242,7 +5271,7 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>   */
>  static void mpi3mr_remove(struct pci_dev *pdev)
>  {
> -	struct Scsi_Host *shost = pci_get_drvdata(pdev);
> +	struct Scsi_Host *shost;
>  	struct mpi3mr_ioc *mrioc;
>  	struct workqueue_struct	*wq;
>  	unsigned long flags;
> @@ -5250,7 +5279,7 @@ static void mpi3mr_remove(struct pci_dev *pdev)
>  	struct mpi3mr_hba_port *port, *hba_port_next;
>  	struct mpi3mr_sas_node *sas_expander, *sas_expander_next;
>  
> -	if (!shost)
> +	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc))
>  		return;
>  
>  	mrioc = shost_priv(shost);

Maybe I'm missing something, but it looks like this patch should
remove "mrioc = shost_priv(shost)" every time it adds a call to
mpi3mr_get_shost_and_mrioc().

Bjorn

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 2/6] mpi3mr: Support PCIe Error Recovery callback handlers
  2023-12-14 20:58 ` [PATCH v2 2/6] mpi3mr: Support PCIe Error Recovery callback handlers Ranjan Kumar
@ 2024-01-04 23:49   ` Bjorn Helgaas
  0 siblings, 0 replies; 10+ messages in thread
From: Bjorn Helgaas @ 2024-01-04 23:49 UTC (permalink / raw)
  To: Ranjan Kumar
  Cc: linux-scsi, martin.petersen, rajsekhar.chundru, sathya.prakash,
	sumit.saxena, chandrakanth.patil, prayas.patel

On Fri, Dec 15, 2023 at 02:28:56AM +0530, Ranjan Kumar wrote:
> The driver has been upgraded to include support for the
> PCIe error recovery callback handler which is crucial for
> the recovery of the controllers. This feature is
> necessary for addressing the errors reported by
> the PCIe AER (Advanced Error Reporting) mechanism.

I think PCI error handling is not strictly limited to PCIe AER because
the powerpc EEH framework also uses the pci_error_handlers callbacks,
so I would refer to this as just "PCI error recovery" and maybe omit
the AER part.

Commit log and comments appear to be wrapped to 60 or 70 columns,
which is unnecessarily short.  I'd recommend 75 columns for commit
log, maybe 78 for comments.

> Signed-off-by: Sathya Prakash <sathya.prakash@broadcom.com>
> Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
> ---
>  drivers/scsi/mpi3mr/mpi3mr.h    |   5 +
>  drivers/scsi/mpi3mr/mpi3mr_os.c | 197 ++++++++++++++++++++++++++++++++
>  2 files changed, 202 insertions(+)
> 
> diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h
> index 3de1ee05c44e..25e6e3a09468 100644
> --- a/drivers/scsi/mpi3mr/mpi3mr.h
> +++ b/drivers/scsi/mpi3mr/mpi3mr.h
> @@ -23,6 +23,7 @@
>  #include <linux/miscdevice.h>
>  #include <linux/module.h>
>  #include <linux/pci.h>
> +#include <linux/aer.h>
>  #include <linux/poll.h>
>  #include <linux/sched.h>
>  #include <linux/slab.h>
> @@ -1055,6 +1056,8 @@ struct scmd_priv {
>   * @ioctl_chain_sge: DMA buffer descriptor for IOCTL chain
>   * @ioctl_resp_sge: DMA buffer descriptor for Mgmt cmd response
>   * @ioctl_sges_allocated: Flag for IOCTL SGEs allocated or not
> + * @pcie_err_recovery: PCIe error recovery in progress
> + * @block_on_pcie_err: Block IO during PCI error recovery

s/PCIe error recovery/PCI error recovery/ so they match.

>  struct mpi3mr_ioc {
>  	struct list_head list;
> @@ -1246,6 +1249,8 @@ struct mpi3mr_ioc {
>  	struct dma_memory_desc ioctl_chain_sge;
>  	struct dma_memory_desc ioctl_resp_sge;
>  	bool ioctl_sges_allocated;
> +	bool pcie_err_recovery;
> +	bool block_on_pcie_err;
>  };
>  
>  /**
> diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c
> index 76ba31a9517d..dea47ef53abb 100644
> --- a/drivers/scsi/mpi3mr/mpi3mr_os.c
> +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c
> @@ -5472,6 +5472,195 @@ mpi3mr_resume(struct device *dev)
>  	return 0;
>  }
>  
> +/**
> + * mpi3mr_pcierr_detected - PCI error detected callback
> + * @pdev: PCI device instance
> + * @state: channel state
> + *
> + * This function is called by the PCI error recovery driver and
> + * based on the state passed the driver decides what actions to
> + * be recommended back to PCI driver.
> + *
> + * For all of the states if there is no valid mrioc or scsi host
> + * references in the pci device then this function will retyrn
> + * the resul as disconnect.

s/pci device/PCI device/ to match other uses
s/retyrn/return/
s/resul/result/

> + * For normal state, this function will return the result as can
> + * recover.
> + *
> + * For frozen state, this function will block for any pennding

s/pennding/pending/

> + * controller initialization or re-initialization to complete,
> + * stop any new interactions with the controller and return
> + * status as reset required.
> + *
> + * For permanent failure state, this function will mark the
> + * controller as unrecoverable and return status as disconnect.
> + *
> + * Returns: PCI_ERS_RESULT_NEED_RESET or CAN_RECOVER or
> + * DISCONNECT based on the controller state.
> + */
> +static pci_ers_result_t
> +mpi3mr_pcierr_detected(struct pci_dev *pdev, pci_channel_state_t state)
> +{
> +	struct Scsi_Host *shost;
> +	struct mpi3mr_ioc *mrioc;
> +	pci_ers_result_t ret_val = PCI_ERS_RESULT_DISCONNECT;
> +
> +	dev_info(&pdev->dev, "%s: callback invoked state(%d)\n", __func__,
> +	    state);
> +
> +	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc)) {
> +		dev_err(&pdev->dev, "device not available\n");
> +		return ret_val;

Since you already know the return value here, use it explicitly:

  return PCI_ERS_RESULT_DISCONNECT;

> +	}
> +
> +	switch (state) {
> +	case pci_channel_io_normal:
> +		ret_val = PCI_ERS_RESULT_CAN_RECOVER;

Personally, I would just return directly from each case instead of
assigning "ret_val".

> +		break;
> +	case pci_channel_io_frozen:
> +		mrioc->pcie_err_recovery = true;
> +		mrioc->block_on_pcie_err = true;
> +		while (mrioc->reset_in_progress || mrioc->is_driver_loading)
> +			ssleep(1);

This looks like a potential hang.  What guarantees that one of these
is eventually set?

> +		scsi_block_requests(mrioc->shost);
> +		mpi3mr_stop_watchdog(mrioc);
> +		mpi3mr_cleanup_resources(mrioc);

Why are mpi3mr_cleanup_resources() and mpi3mr_setup_resources()
needed?

> +		mrioc->pdev = NULL;

Seems weird to clear mrioc->pdev and then restore it later.  What does
this accomplish?

> +		ret_val = PCI_ERS_RESULT_NEED_RESET;
> +		break;
> +	case pci_channel_io_perm_failure:
> +		mrioc->pcie_err_recovery = true;
> +		mrioc->block_on_pcie_err = true;
> +		mrioc->unrecoverable = 1;
> +		mpi3mr_stop_watchdog(mrioc);
> +		mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
> +		ret_val = PCI_ERS_RESULT_DISCONNECT;
> +		break;
> +	default:
> +		break;
> +	}
> +	return ret_val;

Then this could become "return PCI_ERS_RESULT_DISCONNECT;" and you
don't need "ret_val" at all.

> +}
> +
> +/**
> + * mpi3mr_pcierr_slot_reset_done - Post slot reset callback
> + * @pdev: PCI device instance
> + *
> + * This function is called by the PCI error recovery driver
> + * after a slot or link reset issued by it for the recovery, the
> + * driver is expected to bring back the controller and
> + * initialize it.
> + *
> + * This function restores pci state and reinitializes controller
> + * resoruces and the controller, this blocks for any pending
> + * reset to complete.

s/pci state/PCI state/ to match other uses
s/resoruces/resources/

> + * Returns: PCI_ERS_RESULT_DISCONNECT on failure or
> + * PCI_ERS_RESULT_RECOVERED
> + */
> +static pci_ers_result_t mpi3mr_pcierr_slot_reset_done(struct pci_dev *pdev)
> +{
> +	struct Scsi_Host *shost;
> +	struct mpi3mr_ioc *mrioc;
> +
> +

Spurious blank line.

> +	dev_info(&pdev->dev, "%s: callback invoked\n", __func__);
> +
> +	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc)) {
> +		dev_err(&pdev->dev, "device not available\n");
> +		return PCI_ERS_RESULT_DISCONNECT;
> +	}
> +
> +	while (mrioc->reset_in_progress)
> +		ssleep(1);

Another potential hang?

> +	mrioc->pdev = pdev;
> +	pci_restore_state(pdev);
> +
> +	if (mpi3mr_setup_resources(mrioc)) {
> +		ioc_err(mrioc, "setup resources failed\n");
> +		goto out_failed;
> +	}
> +	mrioc->unrecoverable = 0;
> +	mrioc->pcie_err_recovery = false;
> +
> +	if (mpi3mr_soft_reset_handler(mrioc, MPI3MR_RESET_FROM_FIRMWARE, 0))
> +		goto out_failed;
> +
> +	return PCI_ERS_RESULT_RECOVERED;
> +
> +out_failed:
> +	mrioc->unrecoverable = 1;
> +	mrioc->block_on_pcie_err = false;
> +	scsi_unblock_requests(shost);
> +	mpi3mr_start_watchdog(mrioc);

I'm not a SCSI person, but it seems odd to unblock requests and start
the watchdog here.  Isn't the device just dead at this point?

> +	return PCI_ERS_RESULT_DISCONNECT;
> +}
> +
> +/**
> + * mpi3mr_pcierr_resume - PCI error recovery resume
> + * callback
> + * @pdev: PCI device instance
> + *
> + * This function enables all I/O and IOCTLs post reset issued as
> + * part of the PCI advacned error reporting and handling

s/advacned/advanced/ (actually, I'd just omit "advanced", since this
covers more than just AER in the powerpc case).

> + * Return: Nothing.
> + */
> +static void mpi3mr_pcierr_resume(struct pci_dev *pdev)
> +{
> +	struct Scsi_Host *shost;
> +	struct mpi3mr_ioc *mrioc;
> +
> +	dev_info(&pdev->dev, "%s: callback invoked\n", __func__);
> +
> +	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc)) {
> +		dev_err(&pdev->dev, "device not available\n");
> +		return;
> +	}
> +
> +	pci_aer_clear_nonfatal_status(pdev);
> +
> +	if (mrioc->block_on_pcie_err) {
> +		mrioc->block_on_pcie_err = false;
> +		scsi_unblock_requests(shost);
> +		mpi3mr_start_watchdog(mrioc);
> +	}
> +
> +}
> +
> +/**
> + * mpi3mr_pcierr_mmio_enabled - PCI error recovery callback
> + * @pdev: PCI device instance
> + *
> + * This is called only if _pcierr_error_detected returns
> + * PCI_ERS_RESULT_CAN_RECOVER.

s/_pcierr_error_detected/mpi3mr_pcierr_detected()/ ?  (Or possible
different name as suggested below)

> + * Return: PCI_ERS_RESULT_DISCONNECT when the controller is
> + * unrecoverable or when the shost/mnrioc reference cannot be
> + * found, else return PCI_ERS_RESULT_RECOVERED

s/mnrioc/mrioc/

> +static pci_ers_result_t mpi3mr_pcierr_mmio_enabled(struct pci_dev *pdev)
> +{
> +
> +	struct Scsi_Host *shost;
> +	struct mpi3mr_ioc *mrioc;
> +/*
> + *
> + */

Spurious comment?

> +	dev_info(&pdev->dev, "%s: callback invoked\n", __func__);
> +
> +	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc)) {
> +		dev_err(&pdev->dev, "device not available\n");
> +		return PCI_ERS_RESULT_DISCONNECT;
> +	}
> +	if (mrioc->unrecoverable)
> +		return PCI_ERS_RESULT_DISCONNECT;
> +
> +	return PCI_ERS_RESULT_RECOVERED;
> +}
> +
>  static const struct pci_device_id mpi3mr_pci_id_table[] = {
>  	{
>  		PCI_DEVICE_SUB(MPI3_MFGPAGE_VENDORID_BROADCOM,
> @@ -5489,6 +5678,13 @@ static const struct pci_device_id mpi3mr_pci_id_table[] = {
>  };
>  MODULE_DEVICE_TABLE(pci, mpi3mr_pci_id_table);
>  
> +static struct pci_error_handlers mpi3mr_err_handler = {
> +	.error_detected = mpi3mr_pcierr_detected,
> +	.mmio_enabled = mpi3mr_pcierr_mmio_enabled,
> +	.slot_reset = mpi3mr_pcierr_slot_reset_done,

It would be helpful if you used "mpi3mr_pcierr_slot_reset()" to match
the function pointer name.  This would make it much easier to compare
.slot_reset() implementations across drivers.

I agree that "slot_reset_done()" is more descriptive and would
probably have been a better name for the function pointer, but that
ship has sailed.

I'd suggest "mpi3mr_pcierr_error_detected()" for the same reason.  Or
maybe remove "pcierr_" from all of them if it feels too redundant.

> +	.resume = mpi3mr_pcierr_resume,
> +};
> +
>  static SIMPLE_DEV_PM_OPS(mpi3mr_pm_ops, mpi3mr_suspend, mpi3mr_resume);
>  
>  static struct pci_driver mpi3mr_pci_driver = {
> @@ -5497,6 +5693,7 @@ static struct pci_driver mpi3mr_pci_driver = {
>  	.probe = mpi3mr_probe,
>  	.remove = mpi3mr_remove,
>  	.shutdown = mpi3mr_shutdown,
> +	.err_handler = &mpi3mr_err_handler,
>  	.driver.pm = &mpi3mr_pm_ops,
>  };
>  
> -- 
> 2.31.1
> 



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 3/6] mpi3mr: Prevent PCI writes from driver during PCI error recovery
  2023-12-14 20:58 ` [PATCH v2 3/6] mpi3mr: Prevent PCI writes from driver during PCI error recovery Ranjan Kumar
@ 2024-01-05  3:20   ` Bjorn Helgaas
  0 siblings, 0 replies; 10+ messages in thread
From: Bjorn Helgaas @ 2024-01-05  3:20 UTC (permalink / raw)
  To: Ranjan Kumar
  Cc: linux-scsi, martin.petersen, rajsekhar.chundru, sathya.prakash,
	sumit.saxena, chandrakanth.patil, prayas.patel

On Fri, Dec 15, 2023 at 02:28:57AM +0530, Ranjan Kumar wrote:
> Prevent interacting with the hardware while
> the error recovery in progress.
> 
> Signed-off-by: Sathya Prakash <sathya.prakash@broadcom.com>
> Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
> ---
>  drivers/scsi/mpi3mr/mpi3mr.h           | 26 ++++++++++
>  drivers/scsi/mpi3mr/mpi3mr_app.c       | 64 +++++++++++++----------
>  drivers/scsi/mpi3mr/mpi3mr_fw.c        | 28 +++++++---
>  drivers/scsi/mpi3mr/mpi3mr_os.c        | 71 +++++++++++++++-----------
>  drivers/scsi/mpi3mr/mpi3mr_transport.c | 39 ++++++++++++--
>  5 files changed, 158 insertions(+), 70 deletions(-)
> 
> diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h
> index 25e6e3a09468..1ac7f88dc1cd 100644
> --- a/drivers/scsi/mpi3mr/mpi3mr.h
> +++ b/drivers/scsi/mpi3mr/mpi3mr.h
> @@ -481,6 +481,7 @@ struct mpi3mr_throttle_group_info {
>  
>  /* HBA port flags */
>  #define MPI3MR_HBA_PORT_FLAG_DIRTY	0x01
> +#define MPI3MR_HBA_PORT_FLAG_NEW       0x02
>  
>  /* IOCTL data transfer sge*/
>  #define MPI3MR_NUM_IOCTL_SGE		256
> @@ -900,6 +901,29 @@ struct scmd_priv {
>  	u8 mpi3mr_scsiio_req[MPI3MR_ADMIN_REQ_FRAME_SZ];
>  };
>  
> +/**
> + * struct mpi3mr_pdevinfo - PCI device information
> + *
> + * @dev_id: PCI device ID of the adapter
> + * @dev_hw_rev: PCI revision of the adapter
> + * @subsys_dev_id: PCI subsystem device ID of the adapter
> + * @subsys_ven_id: PCI subsystem vendor ID of the adapter
> + * @dev: PCI device
> + * @func: PCI function
> + * @bus: PCI bus
> + * @seg_id: PCI segment ID

These names don't match the actual structure members, so I bet
scripts/kernel-doc is going to complain about them.

> + */
> +struct mpi3mr_pdevinfo {
> +	u16 id;
> +	u16 ssid;
> +	u16 ssvid;
> +	u16 segment;
> +	u8 dev:5;
> +	u8 func:3;
> +	u8 bus;
> +	u8 revision;
> +};
> +
>  /**
>   * struct mpi3mr_ioc - Adapter anchor structure stored in shost
>   * private data
> @@ -1058,6 +1082,7 @@ struct scmd_priv {
>   * @ioctl_sges_allocated: Flag for IOCTL SGEs allocated or not
>   * @pcie_err_recovery: PCIe error recovery in progress
>   * @block_on_pcie_err: Block IO during PCI error recovery
> + * @pdevinfo: PCI device information
>   */
>  struct mpi3mr_ioc {
>  	struct list_head list;
> @@ -1251,6 +1276,7 @@ struct mpi3mr_ioc {
>  	bool ioctl_sges_allocated;
>  	bool pcie_err_recovery;
>  	bool block_on_pcie_err;
> +	struct mpi3mr_pdevinfo pdevinfo;

I don't understand the reason to add "pdevinfo".  mpi3mr_probe() fills
pdevinfo from mrioc->pdev.  mpi3mr_bsg_populate_adpinfo() previously
filled adpinfo from mrioc->pdev, and now fills it from pdevinfo
instead.

It seems like the same data.  Is this related to the fact that
mpi3mr_pcierr_detected() set mrioc->pdev to NULL?  If so, this patch
would have to come first, otherwise if we bisect at the point between
patch [2/6] and this [3/6], there's a potential NULL pointer
dereference.  Bisection should work correctly at every commit.

I don't think setting mrioc->pdev to NULL is a good idea, so hopefully
pdevinfo can just go away.

>  };
>  
>  /**
> diff --git a/drivers/scsi/mpi3mr/mpi3mr_app.c b/drivers/scsi/mpi3mr/mpi3mr_app.c
> index 4b93b7440da6..a11d6f026f0e 100644
> --- a/drivers/scsi/mpi3mr/mpi3mr_app.c
> +++ b/drivers/scsi/mpi3mr/mpi3mr_app.c
> @@ -31,7 +31,7 @@ static int mpi3mr_bsg_pel_abort(struct mpi3mr_ioc *mrioc)
>  		dprint_bsg_err(mrioc, "%s: reset in progress\n", __func__);
>  		return -1;
>  	}
> -	if (mrioc->stop_bsgs) {
> +	if (mrioc->stop_bsgs || mrioc->block_on_pcie_err) {
>  		dprint_bsg_err(mrioc, "%s: bsgs are blocked\n", __func__);
>  		return -1;
>  	}
> @@ -424,6 +424,9 @@ static long mpi3mr_bsg_adp_reset(struct mpi3mr_ioc *mrioc,
>  		goto out;
>  	}
>  
> +	if (mrioc->unrecoverable || mrioc->block_on_pcie_err)
> +		return -EINVAL;
> +
>  	sg_copy_to_buffer(job->request_payload.sg_list,
>  			  job->request_payload.sg_cnt,
>  			  &adpreset, sizeof(adpreset));
> @@ -470,25 +473,29 @@ static long mpi3mr_bsg_populate_adpinfo(struct mpi3mr_ioc *mrioc,
>  
>  	memset(&adpinfo, 0, sizeof(adpinfo));
>  	adpinfo.adp_type = MPI3MR_BSG_ADPTYPE_AVGFAMILY;
> -	adpinfo.pci_dev_id = mrioc->pdev->device;
> -	adpinfo.pci_dev_hw_rev = mrioc->pdev->revision;
> -	adpinfo.pci_subsys_dev_id = mrioc->pdev->subsystem_device;
> -	adpinfo.pci_subsys_ven_id = mrioc->pdev->subsystem_vendor;
> -	adpinfo.pci_bus = mrioc->pdev->bus->number;
> -	adpinfo.pci_dev = PCI_SLOT(mrioc->pdev->devfn);
> -	adpinfo.pci_func = PCI_FUNC(mrioc->pdev->devfn);
> -	adpinfo.pci_seg_id = pci_domain_nr(mrioc->pdev->bus);
>  	adpinfo.app_intfc_ver = MPI3MR_IOCTL_VERSION;
>  
> -	ioc_state = mpi3mr_get_iocstate(mrioc);
> -	if (ioc_state == MRIOC_STATE_UNRECOVERABLE)
> -		adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE;
> -	else if ((mrioc->reset_in_progress) || (mrioc->stop_bsgs))
> +	if (mrioc->reset_in_progress || mrioc->stop_bsgs ||
> +	    mrioc->block_on_pcie_err)
>  		adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_IN_RESET;
> -	else if (ioc_state == MRIOC_STATE_FAULT)
> -		adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_FAULT;
> -	else
> -		adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL;
> +	else {
> +		ioc_state = mpi3mr_get_iocstate(mrioc);
> +		if (ioc_state == MRIOC_STATE_UNRECOVERABLE)
> +			adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE;
> +		else if (ioc_state == MRIOC_STATE_FAULT)
> +			adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_FAULT;
> +		else
> +			adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL;
> +	}
> +
> +	adpinfo.pci_dev_id = mrioc->pdevinfo.id;
> +	adpinfo.pci_dev_hw_rev = mrioc->pdevinfo.revision;
> +	adpinfo.pci_subsys_dev_id = mrioc->pdevinfo.ssid;
> +	adpinfo.pci_subsys_ven_id = mrioc->pdevinfo.ssvid;
> +	adpinfo.pci_bus = mrioc->pdevinfo.bus;
> +	adpinfo.pci_dev = mrioc->pdevinfo.dev;
> +	adpinfo.pci_func = mrioc->pdevinfo.func;
> +	adpinfo.pci_seg_id = mrioc->pdevinfo.segment;

I don't know if it's possible, but if the adpinfo changes could be
moved to their own patch (or even removed altogether), this patch
would be more about what the commit log advertises: preventing
interaction with the hardware during error recovery.

>  	memcpy((u8 *)&adpinfo.driver_info, (u8 *)&mrioc->driver_info,
>  	    sizeof(adpinfo.driver_info));
> @@ -1495,7 +1502,7 @@ static long mpi3mr_bsg_process_mpt_cmds(struct bsg_job *job)
>  		mutex_unlock(&mrioc->bsg_cmds.mutex);
>  		goto out;
>  	}
> -	if (mrioc->stop_bsgs) {
> +	if (mrioc->stop_bsgs || mrioc->block_on_pcie_err) {
>  		dprint_bsg_err(mrioc, "%s: bsgs are blocked\n", __func__);
>  		rval = -EAGAIN;
>  		mutex_unlock(&mrioc->bsg_cmds.mutex);
> @@ -2020,17 +2027,20 @@ adp_state_show(struct device *dev, struct device_attribute *attr,
>  	enum mpi3mr_iocstate ioc_state;
>  	uint8_t adp_state;
>  
> -	ioc_state = mpi3mr_get_iocstate(mrioc);
> -	if (ioc_state == MRIOC_STATE_UNRECOVERABLE)
> -		adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE;
> -	else if ((mrioc->reset_in_progress) || (mrioc->stop_bsgs))
> +	if (mrioc->reset_in_progress || mrioc->stop_bsgs ||
> +		 mrioc->block_on_pcie_err)
>  		adp_state = MPI3MR_BSG_ADPSTATE_IN_RESET;
> -	else if (ioc_state == MRIOC_STATE_FAULT)
> -		adp_state = MPI3MR_BSG_ADPSTATE_FAULT;
> -	else
> -		adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL;
> +	else {
> +		ioc_state = mpi3mr_get_iocstate(mrioc);
> +		if (ioc_state == MRIOC_STATE_UNRECOVERABLE)
> +			adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE;
> +		else if (ioc_state == MRIOC_STATE_FAULT)
> +			adp_state = MPI3MR_BSG_ADPSTATE_FAULT;
> +		else
> +			adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL;
> +	}
>  
> -	return sysfs_emit(buf, "%u\n", adp_state);
> +	return snprintf(buf, PAGE_SIZE, "%u\n", adp_state);
>  }
>  
>  static DEVICE_ATTR_RO(adp_state);
> diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c
> index d8c57a0a518f..c50fc27231cd 100644
> --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c
> +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c
> @@ -595,7 +595,7 @@ int mpi3mr_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
>  	mrioc = (struct mpi3mr_ioc *)shost->hostdata;
>  
>  	if ((mrioc->reset_in_progress || mrioc->prepare_for_reset ||
> -	    mrioc->unrecoverable))
> +	    mrioc->unrecoverable || mrioc->pcie_err_recovery))
>  		return 0;
>  
>  	num_entries = mpi3mr_process_op_reply_q(mrioc,
> @@ -1037,13 +1037,13 @@ enum mpi3mr_iocstate mpi3mr_get_iocstate(struct mpi3mr_ioc *mrioc)
>  	u32 ioc_status, ioc_config;
>  	u8 ready, enabled;
>  
> -	ioc_status = readl(&mrioc->sysif_regs->ioc_status);
> -	ioc_config = readl(&mrioc->sysif_regs->ioc_configuration);
> -
>  	if (mrioc->unrecoverable)
>  		return MRIOC_STATE_UNRECOVERABLE;
> +	ioc_status = readl(&mrioc->sysif_regs->ioc_status);
> +
>  	if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)
>  		return MRIOC_STATE_FAULT;
> +	ioc_config = readl(&mrioc->sysif_regs->ioc_configuration);

These reads look suspect because you cannot rely on
mrioc->unrecoverable to catch all cases where the device is
inaccessible.  For example, the device may be unplugged here:

  # device is operating normally

  if (mrioc->unrecoverable)
    return MRIOC_STATE_UNRECOVERABLE;

  # device is hot-unplugged here

  readl(&mrioc->sysif_regs->ioc_status);

  # this readl() returns ~0 because the MMIO access failed on PCI

>  	ready = (ioc_status & MPI3_SYSIF_IOC_STATUS_READY);
>  	enabled = (ioc_config & MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC);
> @@ -1667,6 +1667,12 @@ int mpi3mr_admin_request_post(struct mpi3mr_ioc *mrioc, void *admin_req,
>  		retval = -EAGAIN;
>  		goto out;
>  	}
> +	if (mrioc->pcie_err_recovery) {
> +		ioc_err(mrioc, "admin request queue submission failed due to pcie error recovery in progress\n");
> +		retval = -EAGAIN;
> +		goto out;
> +	}
> +
>  	areq_entry = (u8 *)mrioc->admin_req_base +
>  	    (areq_pi * MPI3MR_ADMIN_REQ_FRAME_SZ);
>  	memset(areq_entry, 0, MPI3MR_ADMIN_REQ_FRAME_SZ);
> @@ -2337,6 +2343,11 @@ int mpi3mr_op_request_post(struct mpi3mr_ioc *mrioc,
>  		retval = -EAGAIN;
>  		goto out;
>  	}
> +	if (mrioc->pcie_err_recovery) {
> +		ioc_err(mrioc, "operational request queue submission failed due to pcie error recovery in progress\n");
> +		retval = -EAGAIN;
> +		goto out;
> +	}
>  
>  	segment_base_addr = segments[pi / op_req_q->segment_qd].segment;
>  	req_entry = (u8 *)segment_base_addr +
> @@ -2585,7 +2596,7 @@ static void mpi3mr_watchdog_work(struct work_struct *work)
>  	u32 fault, host_diagnostic, ioc_status;
>  	u32 reset_reason = MPI3MR_RESET_FROM_FAULT_WATCH;
>  
> -	if (mrioc->reset_in_progress)
> +	if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
>  		return;
>  
>  	if (!mrioc->unrecoverable && !pci_device_is_present(mrioc->pdev)) {
> @@ -4111,7 +4122,7 @@ int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume)
>  		goto out_failed_noretry;
>  	}
>  
> -	if (is_resume) {
> +	if (is_resume || mrioc->block_on_pcie_err) {
>  		dprint_reset(mrioc, "setting up single ISR\n");
>  		retval = mpi3mr_setup_isr(mrioc, 1);
>  		if (retval) {
> @@ -4151,7 +4162,7 @@ int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume)
>  		goto out_failed;
>  	}
>  
> -	if (is_resume) {
> +	if (is_resume || mrioc->block_on_pcie_err) {
>  		dprint_reset(mrioc, "setting up multiple ISR\n");
>  		retval = mpi3mr_setup_isr(mrioc, 0);
>  		if (retval) {
> @@ -4625,7 +4636,8 @@ void mpi3mr_cleanup_ioc(struct mpi3mr_ioc *mrioc)
>  
>  	ioc_state = mpi3mr_get_iocstate(mrioc);
>  
> -	if ((!mrioc->unrecoverable) && (!mrioc->reset_in_progress) &&
> +	if (!mrioc->unrecoverable && !mrioc->reset_in_progress &&
> +	    !mrioc->pcie_err_recovery &&
>  	    (ioc_state == MRIOC_STATE_READY)) {
>  		if (mpi3mr_issue_and_process_mur(mrioc,
>  		    MPI3MR_RESET_FROM_CTLR_CLEANUP))
> diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c
> index dea47ef53abb..d61fd105dfad 100644
> --- a/drivers/scsi/mpi3mr/mpi3mr_os.c
> +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c
> @@ -919,7 +919,7 @@ static int mpi3mr_report_tgtdev_to_host(struct mpi3mr_ioc *mrioc,
>  	int retval = 0;
>  	struct mpi3mr_tgt_dev *tgtdev;
>  
> -	if (mrioc->reset_in_progress)
> +	if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
>  		return -1;
>  
>  	tgtdev = mpi3mr_get_tgtdev_by_perst_id(mrioc, perst_id);
> @@ -2000,8 +2000,13 @@ static void mpi3mr_fwevt_bh(struct mpi3mr_ioc *mrioc,
>  	}
>  	case MPI3_EVENT_WAIT_FOR_DEVICES_TO_REFRESH:
>  	{
> -		while (mrioc->device_refresh_on)
> +		while ((mrioc->device_refresh_on || mrioc->block_on_pcie_err) &&
> +		    !mrioc->unrecoverable && !mrioc->pcie_err_recovery) {
>  			msleep(500);
> +		}

Looks like a potential hang here.  I guess it was always a potential
hang; this just makes the conditional more complicated.  Usually you
want a timeout or other means of ensuring the loop exits.

> +
> +		if (mrioc->unrecoverable || mrioc->pcie_err_recovery)
> +			break;
>  
>  		dprint_event_bh(mrioc,
>  		    "scan for non responding and newly added devices after soft reset started\n");
> @@ -3680,6 +3685,13 @@ int mpi3mr_issue_tm(struct mpi3mr_ioc *mrioc, u8 tm_type,
>  		mutex_unlock(&drv_cmd->mutex);
>  		goto out;
>  	}
> +	if (mrioc->block_on_pcie_err) {
> +		retval = -1;
> +		dprint_tm(mrioc, "sending task management failed due to\n"
> +				"pcie error recovery in progress\n");
> +		mutex_unlock(&drv_cmd->mutex);
> +		goto out;
> +	}
>  
>  	drv_cmd->state = MPI3MR_CMD_PENDING;
>  	drv_cmd->is_waiting = 1;
> @@ -4073,12 +4085,19 @@ static int mpi3mr_eh_bus_reset(struct scsi_cmnd *scmd)
>  	if (dev_type == MPI3_DEVICE_DEVFORM_VD) {
>  		mpi3mr_wait_for_host_io(mrioc,
>  			MPI3MR_RAID_ERRREC_RESET_TIMEOUT);
> -		if (!mpi3mr_get_fw_pending_ios(mrioc))
> +		if (!mpi3mr_get_fw_pending_ios(mrioc)) {
> +			while (mrioc->reset_in_progress ||
> +			       mrioc->prepare_for_reset ||
> +			       mrioc->block_on_pcie_err)
> +				ssleep(1);

Another possible hang?

>  			retval = SUCCESS;
> +			goto out;
> +		}
>  	}
>  	if (retval == FAILED)
>  		mpi3mr_print_pending_host_io(mrioc);
>  
> +out:
>  	sdev_printk(KERN_INFO, scmd->device,
>  		"Bus reset is %s for scmd(%p)\n",
>  		((retval == SUCCESS) ? "SUCCESS" : "FAILED"), scmd);
> @@ -4779,7 +4798,8 @@ static int mpi3mr_qcmd(struct Scsi_Host *shost,
>  		goto out;
>  	}
>  
> -	if (mrioc->reset_in_progress) {
> +	if (mrioc->reset_in_progress || mrioc->prepare_for_reset
> +	    || mrioc->block_on_pcie_err) {
>  		retval = SCSI_MLQUEUE_HOST_BUSY;
>  		goto out;
>  	}
> @@ -5123,6 +5143,14 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	mrioc->logging_level = logging_level;
>  	mrioc->shost = shost;
>  	mrioc->pdev = pdev;
> +	mrioc->pdevinfo.id = pdev->device;
> +	mrioc->pdevinfo.revision = pdev->revision;
> +	mrioc->pdevinfo.ssid = pdev->subsystem_device;
> +	mrioc->pdevinfo.ssvid = pdev->subsystem_vendor;
> +	mrioc->pdevinfo.bus = pdev->bus->number;
> +	mrioc->pdevinfo.dev = PCI_SLOT(pdev->devfn);
> +	mrioc->pdevinfo.func = PCI_FUNC(pdev->devfn);
> +	mrioc->pdevinfo.segment = pci_domain_nr(pdev->bus);
>  	mrioc->stop_bsgs = 1;
>  
>  	mrioc->max_sgl_entries = max_sgl_entries;
> @@ -5276,17 +5304,21 @@ static void mpi3mr_remove(struct pci_dev *pdev)
>  	struct workqueue_struct	*wq;
>  	unsigned long flags;
>  	struct mpi3mr_tgt_dev *tgtdev, *tgtdev_next;
> -	struct mpi3mr_hba_port *port, *hba_port_next;
> -	struct mpi3mr_sas_node *sas_expander, *sas_expander_next;
>  
>  	if (mpi3mr_get_shost_and_mrioc(pdev, &shost, &mrioc))
>  		return;
>  
> -	mrioc = shost_priv(shost);
>  	while (mrioc->reset_in_progress || mrioc->is_driver_loading)
>  		ssleep(1);
>  
> -	if (!pci_device_is_present(mrioc->pdev)) {
> +	if (mrioc->block_on_pcie_err) {
> +		mrioc->block_on_pcie_err = false;
> +		scsi_unblock_requests(shost);
> +		mrioc->unrecoverable = 1;
> +	}
> +
> +	if (!pci_device_is_present(mrioc->pdev) ||
> +	    mrioc->pcie_err_recovery) {
>  		mrioc->unrecoverable = 1;
>  		mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
>  	}
> @@ -5316,29 +5348,6 @@ static void mpi3mr_remove(struct pci_dev *pdev)
>  	mpi3mr_cleanup_ioc(mrioc);
>  	mpi3mr_free_mem(mrioc);
>  	mpi3mr_cleanup_resources(mrioc);
> -
> -	spin_lock_irqsave(&mrioc->sas_node_lock, flags);
> -	list_for_each_entry_safe_reverse(sas_expander, sas_expander_next,
> -	    &mrioc->sas_expander_list, list) {
> -		spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
> -		mpi3mr_expander_node_remove(mrioc, sas_expander);
> -		spin_lock_irqsave(&mrioc->sas_node_lock, flags);
> -	}
> -	list_for_each_entry_safe(port, hba_port_next, &mrioc->hba_port_table_list, list) {
> -		ioc_info(mrioc,
> -		    "removing hba_port entry: %p port: %d from hba_port list\n",
> -		    port, port->port_id);
> -		list_del(&port->list);
> -		kfree(port);
> -	}
> -	spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
> -
> -	if (mrioc->sas_hba.num_phys) {
> -		kfree(mrioc->sas_hba.phy);
> -		mrioc->sas_hba.phy = NULL;
> -		mrioc->sas_hba.num_phys = 0;
> -	}
> -
>  	spin_lock(&mrioc_list_lock);
>  	list_del(&mrioc->list);
>  	spin_unlock(&mrioc_list_lock);
> diff --git a/drivers/scsi/mpi3mr/mpi3mr_transport.c b/drivers/scsi/mpi3mr/mpi3mr_transport.c
> index c0c8ab586957..8c8368104a27 100644
> --- a/drivers/scsi/mpi3mr/mpi3mr_transport.c
> +++ b/drivers/scsi/mpi3mr/mpi3mr_transport.c
> @@ -149,6 +149,11 @@ static int mpi3mr_report_manufacture(struct mpi3mr_ioc *mrioc,
>  		return -EFAULT;
>  	}
>  
> +	if (mrioc->pcie_err_recovery) {
> +		ioc_err(mrioc, "%s: pcie error recovery in progress!\n", __func__);
> +		return -EFAULT;
> +	}
> +
>  	data_out_sz = sizeof(struct rep_manu_request);
>  	data_in_sz = sizeof(struct rep_manu_reply);
>  	data_out = dma_alloc_coherent(&mrioc->pdev->dev,
> @@ -792,6 +797,12 @@ static int mpi3mr_set_identify(struct mpi3mr_ioc *mrioc, u16 handle,
>  		return -EFAULT;
>  	}
>  
> +	if (mrioc->pcie_err_recovery) {
> +		ioc_err(mrioc, "%s: pcie error recovery in progress!\n",
> +		    __func__);
> +		return -EFAULT;
> +	}
> +
>  	if ((mpi3mr_cfg_get_dev_pg0(mrioc, &ioc_status, &device_pg0,
>  	    sizeof(device_pg0), MPI3_DEVICE_PGAD_FORM_HANDLE, handle))) {
>  		ioc_err(mrioc, "%s: device page0 read failed\n", __func__);
> @@ -1009,6 +1020,9 @@ mpi3mr_alloc_hba_port(struct mpi3mr_ioc *mrioc, u16 port_id)
>  	hba_port->port_id = port_id;
>  	ioc_info(mrioc, "hba_port entry: %p, port: %d is added to hba_port list\n",
>  	    hba_port, hba_port->port_id);
> +	if (mrioc->reset_in_progress ||
> +		mrioc->pcie_err_recovery)

  if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)

would fit on one line and make this easier to read.

> +		hba_port->flags = MPI3MR_HBA_PORT_FLAG_NEW;
>  	list_add_tail(&hba_port->list, &mrioc->hba_port_table_list);
>  	return hba_port;
>  }
> @@ -1057,7 +1071,7 @@ void mpi3mr_update_links(struct mpi3mr_ioc *mrioc,
>  	struct mpi3mr_sas_node *mr_sas_node;
>  	struct mpi3mr_sas_phy *mr_sas_phy;
>  
> -	if (mrioc->reset_in_progress)
> +	if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
>  		return;
>  
>  	spin_lock_irqsave(&mrioc->sas_node_lock, flags);
> @@ -1965,7 +1979,7 @@ int mpi3mr_expander_add(struct mpi3mr_ioc *mrioc, u16 handle)
>  	if (!handle)
>  		return -1;
>  
> -	if (mrioc->reset_in_progress)
> +	if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
>  		return -1;
>  
>  	if ((mpi3mr_cfg_get_sas_exp_pg0(mrioc, &ioc_status, &expander_pg0,
> @@ -2171,7 +2185,7 @@ void mpi3mr_expander_node_remove(struct mpi3mr_ioc *mrioc,
>  	/* remove sibling ports attached to this expander */
>  	list_for_each_entry_safe(mr_sas_port, next,
>  	   &sas_expander->sas_port_list, port_list) {
> -		if (mrioc->reset_in_progress)
> +		if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
>  			return;
>  		if (mr_sas_port->remote_identify.device_type ==
>  		    SAS_END_DEVICE)
> @@ -2221,7 +2235,7 @@ void mpi3mr_expander_remove(struct mpi3mr_ioc *mrioc, u64 sas_address,
>  	struct mpi3mr_sas_node *sas_expander;
>  	unsigned long flags;
>  
> -	if (mrioc->reset_in_progress)
> +	if (mrioc->reset_in_progress || mrioc->pcie_err_recovery)
>  		return;
>  
>  	if (!hba_port)
> @@ -2532,6 +2546,11 @@ static int mpi3mr_get_expander_phy_error_log(struct mpi3mr_ioc *mrioc,
>  		return -EFAULT;
>  	}
>  
> +	if (mrioc->pcie_err_recovery) {
> +		ioc_err(mrioc, "%s: pcie error recovery in progress!\n", __func__);
> +		return -EFAULT;
> +	}
> +
>  	data_out_sz = sizeof(struct phy_error_log_request);
>  	data_in_sz = sizeof(struct phy_error_log_reply);
>  	sz = data_out_sz + data_in_sz;
> @@ -2791,6 +2810,12 @@ mpi3mr_expander_phy_control(struct mpi3mr_ioc *mrioc,
>  		return -EFAULT;
>  	}
>  
> +	if (mrioc->pcie_err_recovery) {
> +		ioc_err(mrioc, "%s: pcie error recovery in progress!\n",
> +		    __func__);
> +		return -EFAULT;
> +	}
> +
>  	data_out_sz = sizeof(struct phy_control_request);
>  	data_in_sz = sizeof(struct phy_control_reply);
>  	sz = data_out_sz + data_in_sz;
> @@ -3214,6 +3239,12 @@ mpi3mr_transport_smp_handler(struct bsg_job *job, struct Scsi_Host *shost,
>  		goto out;
>  	}
>  
> +	if (mrioc->pcie_err_recovery) {
> +		ioc_err(mrioc, "%s: pcie error recovery in progress!\n", __func__);
> +		rc = -EFAULT;
> +		goto out;
> +	}
> +
>  	rc = mpi3mr_map_smp_buffer(&mrioc->pdev->dev, &job->request_payload,
>  	    &dma_addr_out, &dma_len_out, &addr_out);
>  	if (rc)
> -- 
> 2.31.1
> 



^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-01-05  3:20 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-14 20:58 [PATCH v2 0/6] mpi3mr: Support PCIe Error Recovery Ranjan Kumar
2023-12-14 20:58 ` [PATCH v2 1/6] mpi3mr: Creation of helper function Ranjan Kumar
2024-01-04 23:11   ` Bjorn Helgaas
2023-12-14 20:58 ` [PATCH v2 2/6] mpi3mr: Support PCIe Error Recovery callback handlers Ranjan Kumar
2024-01-04 23:49   ` Bjorn Helgaas
2023-12-14 20:58 ` [PATCH v2 3/6] mpi3mr: Prevent PCI writes from driver during PCI error recovery Ranjan Kumar
2024-01-05  3:20   ` Bjorn Helgaas
2023-12-14 20:58 ` [PATCH v2 4/6] mpi3mr: Improve Shutdown times when firmware has faulted Ranjan Kumar
2023-12-14 20:58 ` [PATCH v2 5/6] mpi3mr: Reset stop_bsgs flag post controller reset failure Ranjan Kumar
2023-12-14 20:59 ` [PATCH v2 6/6] mpi3mr: Update driver version to 8.6.1.0.50 Ranjan Kumar

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.