All of lore.kernel.org
 help / color / mirror / Atom feed
From: Don Brace <don.brace@microchip.com>
To: <Kevin.Barnett@microchip.com>, <scott.teel@microchip.com>,
	<Justin.Lindley@microchip.com>, <scott.benesh@microchip.com>,
	<gerry.morong@microchip.com>, <mahesh.rajashekhara@microchip.com>,
	<mike.mcgowen@microchip.com>, <murthy.bhat@microchip.com>,
	<kumar.meiyappan@microchip.com>, <hch@infradead.org>,
	<jejb@linux.vnet.ibm.com>, <joseph.szczypek@hpe.com>,
	<POSWALD@suse.com>
Cc: <linux-scsi@vger.kernel.org>
Subject: [PATCH V2 06/16] smartpqi: fix PCI control linkdown system hang
Date: Fri, 8 Jul 2022 13:47:15 -0500	[thread overview]
Message-ID: <165730603578.177165.4699352086827187263.stgit@brunhilda> (raw)
In-Reply-To: <165730597930.177165.11663580730429681919.stgit@brunhilda>

From: Sagar Biradar <sagar.biradar@microchip.com>

Fail all outstanding requests after a PCI linkdown.

Block access to device SCSI attributes during the following conditions:
"Cable pull" is called PQI_CTRL_SURPRISE_REMOVAL.
"PCIe Link Down" is called PQI_CTRL_GRACEFUL_REMOVAL.

Block access to device SCSI attributes during and in rare instances when
the controller goes offline.

Either outstanding requests or the access of SCSI attributes post
linkdown can lead to a hang.

Post linkdown, driver does not fail the outstanding requests leading
to long wait time before all the IOs eventually fail.

Also access of the SCSI attributes by host applications can lead to a
system hang.

Reviewed-by: Scott Benesh <scott.benesh@microchip.com>
Reviewed-by: Scott Teel <scott.teel@microchip.com>
Reviewed-by: Mike McGowen <mike.mcgowen@microchip.com>
Reviewed-by: Kevin Barnett <kevin.barnett@microchip.com>
Signed-off-by: Sagar Biradar <sagar.biradar@microchip.com>
Signed-off-by: Don Brace <don.brace@microchip.com>
---
 drivers/scsi/smartpqi/smartpqi.h      |    7 +++++
 drivers/scsi/smartpqi/smartpqi_init.c |   48 ++++++++++++++++++++++++++++++---
 drivers/scsi/smartpqi/smartpqi_sis.c  |    2 +
 3 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/smartpqi/smartpqi.h b/drivers/scsi/smartpqi/smartpqi.h
index aa8663e1b98b..f1145ded843e 100644
--- a/drivers/scsi/smartpqi/smartpqi.h
+++ b/drivers/scsi/smartpqi/smartpqi.h
@@ -1269,6 +1269,12 @@ struct pqi_event {
 #define PQI_CTRL_PRODUCT_REVISION_A	0
 #define PQI_CTRL_PRODUCT_REVISION_B	1
 
+enum pqi_ctrl_removal_state {
+	PQI_CTRL_PRESENT = 0,
+	PQI_CTRL_GRACEFUL_REMOVAL,
+	PQI_CTRL_SURPRISE_REMOVAL
+};
+
 struct pqi_ctrl_info {
 	unsigned int	ctrl_id;
 	struct pci_dev	*pci_dev;
@@ -1389,6 +1395,7 @@ struct pqi_ctrl_info {
 	struct work_struct ofa_quiesce_work;
 	u32		ofa_bytes_requested;
 	u16		ofa_cancel_reason;
+	enum pqi_ctrl_removal_state ctrl_removal_state;
 };
 
 enum pqi_ctrl_mode {
diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c
index 11a8e224fe84..27b83db3e3dc 100644
--- a/drivers/scsi/smartpqi/smartpqi_init.c
+++ b/drivers/scsi/smartpqi/smartpqi_init.c
@@ -95,6 +95,7 @@ static void pqi_ofa_free_host_buffer(struct pqi_ctrl_info *ctrl_info);
 static int pqi_ofa_host_memory_update(struct pqi_ctrl_info *ctrl_info);
 static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info *ctrl_info,
 	struct pqi_scsi_dev *device, u8 lun, unsigned long timeout_msecs);
+static void pqi_fail_all_outstanding_requests(struct pqi_ctrl_info *ctrl_info);
 
 /* for flags argument to pqi_submit_raid_request_synchronous() */
 #define PQI_SYNC_FLAGS_INTERRUPTABLE	0x1
@@ -6157,9 +6158,11 @@ static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info *ctrl_info,
 	warning_timeout = (PQI_PENDING_IO_WARNING_TIMEOUT_SECS * HZ) + start_jiffies;
 
 	while ((cmds_outstanding = atomic_read(&device->scsi_cmds_outstanding[lun])) > 0) {
-		pqi_check_ctrl_health(ctrl_info);
-		if (pqi_ctrl_offline(ctrl_info))
-			return -ENXIO;
+		if (ctrl_info->ctrl_removal_state != PQI_CTRL_GRACEFUL_REMOVAL) {
+			pqi_check_ctrl_health(ctrl_info);
+			if (pqi_ctrl_offline(ctrl_info))
+				return -ENXIO;
+		}
 		msecs_waiting = jiffies_to_msecs(jiffies - start_jiffies);
 		if (msecs_waiting >= timeout_msecs) {
 			dev_err(&ctrl_info->pci_dev->dev,
@@ -6945,6 +6948,9 @@ static ssize_t pqi_unique_id_show(struct device *dev,
 	sdev = to_scsi_device(dev);
 	ctrl_info = shost_to_hba(sdev->host);
 
+	if (pqi_ctrl_offline(ctrl_info))
+		return -ENODEV;
+
 	spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags);
 
 	device = sdev->hostdata;
@@ -6981,6 +6987,9 @@ static ssize_t pqi_lunid_show(struct device *dev,
 	sdev = to_scsi_device(dev);
 	ctrl_info = shost_to_hba(sdev->host);
 
+	if (pqi_ctrl_offline(ctrl_info))
+		return -ENODEV;
+
 	spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags);
 
 	device = sdev->hostdata;
@@ -7016,6 +7025,9 @@ static ssize_t pqi_path_info_show(struct device *dev,
 	sdev = to_scsi_device(dev);
 	ctrl_info = shost_to_hba(sdev->host);
 
+	if (pqi_ctrl_offline(ctrl_info))
+		return -ENODEV;
+
 	spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags);
 
 	device = sdev->hostdata;
@@ -7093,6 +7105,9 @@ static ssize_t pqi_sas_address_show(struct device *dev,
 	sdev = to_scsi_device(dev);
 	ctrl_info = shost_to_hba(sdev->host);
 
+	if (pqi_ctrl_offline(ctrl_info))
+		return -ENODEV;
+
 	spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags);
 
 	device = sdev->hostdata;
@@ -7119,6 +7134,9 @@ static ssize_t pqi_ssd_smart_path_enabled_show(struct device *dev,
 	sdev = to_scsi_device(dev);
 	ctrl_info = shost_to_hba(sdev->host);
 
+	if (pqi_ctrl_offline(ctrl_info))
+		return -ENODEV;
+
 	spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags);
 
 	device = sdev->hostdata;
@@ -7148,6 +7166,9 @@ static ssize_t pqi_raid_level_show(struct device *dev,
 	sdev = to_scsi_device(dev);
 	ctrl_info = shost_to_hba(sdev->host);
 
+	if (pqi_ctrl_offline(ctrl_info))
+		return -ENODEV;
+
 	spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags);
 
 	device = sdev->hostdata;
@@ -7178,6 +7199,9 @@ static ssize_t pqi_raid_bypass_cnt_show(struct device *dev,
 	sdev = to_scsi_device(dev);
 	ctrl_info = shost_to_hba(sdev->host);
 
+	if (pqi_ctrl_offline(ctrl_info))
+		return -ENODEV;
+
 	spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags);
 
 	device = sdev->hostdata;
@@ -7205,6 +7229,9 @@ static ssize_t pqi_sas_ncq_prio_enable_show(struct device *dev,
 	sdev = to_scsi_device(dev);
 	ctrl_info = shost_to_hba(sdev->host);
 
+	if (pqi_ctrl_offline(ctrl_info))
+		return -ENODEV;
+
 	spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags);
 
 	device = sdev->hostdata;
@@ -8547,7 +8574,6 @@ static void pqi_free_interrupts(struct pqi_ctrl_info *ctrl_info)
 
 static void pqi_free_ctrl_resources(struct pqi_ctrl_info *ctrl_info)
 {
-	pqi_stop_heartbeat_timer(ctrl_info);
 	pqi_free_interrupts(ctrl_info);
 	if (ctrl_info->queue_memory_base)
 		dma_free_coherent(&ctrl_info->pci_dev->dev,
@@ -8572,8 +8598,15 @@ static void pqi_free_ctrl_resources(struct pqi_ctrl_info *ctrl_info)
 
 static void pqi_remove_ctrl(struct pqi_ctrl_info *ctrl_info)
 {
+	ctrl_info->controller_online = false;
+	pqi_stop_heartbeat_timer(ctrl_info);
+	pqi_ctrl_block_requests(ctrl_info);
 	pqi_cancel_rescan_worker(ctrl_info);
 	pqi_cancel_update_time_worker(ctrl_info);
+	if (ctrl_info->ctrl_removal_state == PQI_CTRL_SURPRISE_REMOVAL) {
+		pqi_fail_all_outstanding_requests(ctrl_info);
+		ctrl_info->pqi_mode_enabled = false;
+	}
 	pqi_remove_all_scsi_devices(ctrl_info);
 	pqi_unregister_scsi(ctrl_info);
 	if (ctrl_info->pqi_mode_enabled)
@@ -8914,11 +8947,18 @@ static int pqi_pci_probe(struct pci_dev *pci_dev,
 static void pqi_pci_remove(struct pci_dev *pci_dev)
 {
 	struct pqi_ctrl_info *ctrl_info;
+	u16 vendor_id;
 
 	ctrl_info = pci_get_drvdata(pci_dev);
 	if (!ctrl_info)
 		return;
 
+	pci_read_config_word(ctrl_info->pci_dev, PCI_SUBSYSTEM_VENDOR_ID, &vendor_id);
+	if (vendor_id == 0xffff)
+		ctrl_info->ctrl_removal_state = PQI_CTRL_SURPRISE_REMOVAL;
+	else
+		ctrl_info->ctrl_removal_state = PQI_CTRL_GRACEFUL_REMOVAL;
+
 	pqi_remove_ctrl(ctrl_info);
 }
 
diff --git a/drivers/scsi/smartpqi/smartpqi_sis.c b/drivers/scsi/smartpqi/smartpqi_sis.c
index 2b99b6e9cd71..59d9c2792371 100644
--- a/drivers/scsi/smartpqi/smartpqi_sis.c
+++ b/drivers/scsi/smartpqi/smartpqi_sis.c
@@ -138,7 +138,7 @@ bool sis_is_firmware_running(struct pqi_ctrl_info *ctrl_info)
 
 	status = readl(&ctrl_info->registers->sis_firmware_status);
 
-	if (status & SIS_CTRL_KERNEL_PANIC)
+	if (status != ~0 && (status & SIS_CTRL_KERNEL_PANIC))
 		running = false;
 	else
 		running = true;


  parent reply	other threads:[~2022-07-08 18:46 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-08 18:46 [PATCH V2 00/16] smartpqi updates Don Brace
2022-07-08 18:46 ` [PATCH V2 01/16] smartpqi: shorten drive visibility after removal Don Brace
2022-07-08 18:46 ` [PATCH V2 02/16] smartpqi: add controller fw version to console log Don Brace
2022-07-08 18:47 ` [PATCH V2 03/16] smartpqi: add PCI-IDs for ramaxel controllers Don Brace
2022-07-08 18:47 ` [PATCH V2 04/16] smartpqi: close write read holes Don Brace
2022-07-08 18:47 ` [PATCH V2 05/16] smartpqi: add driver support for multi-LUN devices Don Brace
2022-07-08 18:47 ` Don Brace [this message]
2022-07-08 18:47 ` [PATCH V2 07/16] smartpqi: add PCI-ID for Adaptec SmartHBA 2100-8i Don Brace
2022-07-08 18:47 ` [PATCH V2 08/16] smartpqi: add PCI-IDs for Lenovo controllers Don Brace
2022-07-08 18:47 ` [PATCH V2 09/16] smartpqi: stop logging spurious PQI reset failures Don Brace
2022-07-08 18:47 ` [PATCH V2 10/16] smartpqi: fix dma direction for RAID requests Don Brace
2022-07-08 18:47 ` [PATCH V2 11/16] smartpqi: fix RAID map race condition Don Brace
2022-07-08 18:47 ` [PATCH V2 12/16] smartpqi: add module param to disable managed ints Don Brace
2022-07-08 18:47 ` [PATCH V2 13/16] smartpqi: update deleting a LUN via sysfs Don Brace
2022-07-08 18:47 ` [PATCH V2 14/16] smartpqi: add ctrl ready timeout module parameter Don Brace
2022-07-08 18:48 ` [PATCH V2 15/16] smartpqi: update copyright to current year Don Brace
2022-07-08 18:48 ` [PATCH V2 16/16] smartpqi: update version to 2.1.18-045 Don Brace
2022-07-14  3:43 ` [PATCH V2 00/16] smartpqi updates Martin K. Petersen
2022-07-19  3:08 ` Martin K. Petersen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=165730603578.177165.4699352086827187263.stgit@brunhilda \
    --to=don.brace@microchip.com \
    --cc=Justin.Lindley@microchip.com \
    --cc=Kevin.Barnett@microchip.com \
    --cc=POSWALD@suse.com \
    --cc=gerry.morong@microchip.com \
    --cc=hch@infradead.org \
    --cc=jejb@linux.vnet.ibm.com \
    --cc=joseph.szczypek@hpe.com \
    --cc=kumar.meiyappan@microchip.com \
    --cc=linux-scsi@vger.kernel.org \
    --cc=mahesh.rajashekhara@microchip.com \
    --cc=mike.mcgowen@microchip.com \
    --cc=murthy.bhat@microchip.com \
    --cc=scott.benesh@microchip.com \
    --cc=scott.teel@microchip.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.