All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ahci: Add support for EEH error recovery
@ 2015-05-14  1:35 wenxiong
  2015-05-14 15:13 ` Tejun Heo
  0 siblings, 1 reply; 5+ messages in thread
From: wenxiong @ 2015-05-14  1:35 UTC (permalink / raw)
  To: tj; +Cc: jgarzik, linux-ide, bjking1, wenxiong, Wen Xiong

From: Wen Xiong <wenxiong@linux.vnet.ibm.com>

This patch adds the PCI error handler callbacks needed to support EEH
error recovery in the ahci driver. It also adds code to ahci_error_handler
that issues an MMIO load and then checks whether the device is in EEH. If
it is, ahci_error_handler will wait until EEH recovery is completed.

Signed-off-by: Wen Xiong <wenxiong@linux.vnet.ibm.com>
---
 drivers/ata/ahci.c    |   70 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/ata/ahci.h    |    3 ++
 drivers/ata/libahci.c |   11 +++++++
 3 files changed, 84 insertions(+), 0 deletions(-)

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 65ee944..0184677 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -96,6 +96,10 @@ static int ahci_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg);
 static int ahci_pci_device_resume(struct pci_dev *pdev);
 #endif
 
+static pci_ers_result_t ahci_pci_error_detected(struct pci_dev *pdev,
+					       pci_channel_state_t state);
+static pci_ers_result_t ahci_pci_slot_reset(struct pci_dev *pdev);
+
 static struct scsi_host_template ahci_sht = {
 	AHCI_SHT("ahci"),
 };
@@ -520,6 +524,10 @@ static const struct pci_device_id ahci_pci_tbl[] = {
 	{ }	/* terminate list */
 };
 
+static const struct pci_error_handlers ahci_err_handler = {
+	.error_detected = ahci_pci_error_detected,
+	.slot_reset = ahci_pci_slot_reset,
+};
 
 static struct pci_driver ahci_pci_driver = {
 	.name			= DRV_NAME,
@@ -530,6 +538,7 @@ static struct pci_driver ahci_pci_driver = {
 	.suspend		= ahci_pci_device_suspend,
 	.resume			= ahci_pci_device_resume,
 #endif
+	.err_handler		= &ahci_err_handler,
 };
 
 #if defined(CONFIG_PATA_MARVELL) || defined(CONFIG_PATA_MARVELL_MODULE)
@@ -813,6 +822,64 @@ static int ahci_pci_device_resume(struct pci_dev *pdev)
 }
 #endif
 
+/**
+ * ahci_pci_error_detected - Called when a PCI error is detected.
+ * @pdev:	PCI device struct
+ * @state:	PCI channel state
+ *
+ * Description: Called when a PCI error is detected.
+ *
+ * Return value:
+ * PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT
+ */
+static pci_ers_result_t ahci_pci_error_detected(struct pci_dev *pdev,
+					       pci_channel_state_t state)
+{
+	struct ata_host *host = pci_get_drvdata(pdev);
+	int i;
+
+	if (state == pci_channel_io_perm_failure)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	for (i = 0; i < host->n_ports; i++)
+		scsi_block_requests(host->ports[i]->scsi_host);
+
+	return PCI_ERS_RESULT_NEED_RESET;
+
+}
+
+/**
+ * ahci_pci_slot_reset - Called when PCI slot has been reset.
+ * @pdev:	PCI device struct
+ *
+ * Description: This routine is called by the pci error recovery
+ * code after the PCI slot has been reset, just before we
+ * should resume normal operations.
+ */
+static pci_ers_result_t ahci_pci_slot_reset(struct pci_dev *pdev)
+{
+	struct ata_host *host = pci_get_drvdata(pdev);
+	struct ahci_host_priv *hpriv = host->private_data;
+	int i, rc;
+
+	pci_restore_state(pdev);
+
+	pci_save_state(pdev);
+
+	rc = ahci_pci_reset_controller(host);
+	if (rc)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	ahci_pci_init_controller(host);
+
+	for (i = 0; i < host->n_ports; i++)
+		scsi_unblock_requests(host->ports[i]->scsi_host);
+
+	wake_up_all(&hpriv->eeh_wait_q);
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
 static int ahci_configure_dma_masks(struct pci_dev *pdev, int using_dac)
 {
 	int rc;
@@ -1439,6 +1506,7 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar];
 
+	init_waitqueue_head(&hpriv->eeh_wait_q);
 	/* must set flag prior to save config in order to take effect */
 	if (ahci_broken_devslp(pdev))
 		hpriv->flags |= AHCI_HFLAG_NO_DEVSLP;
@@ -1549,6 +1617,8 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	pci_set_master(pdev);
 
+	pci_save_state(pdev);
+
 	return ahci_host_activate(host, pdev->irq, &ahci_sht);
 }
 
diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h
index 71262e0..6bbf747 100644
--- a/drivers/ata/ahci.h
+++ b/drivers/ata/ahci.h
@@ -51,6 +51,8 @@
 #define EM_MSG_LED_VALUE_OFF          0xfff80000
 #define EM_MSG_LED_VALUE_ON           0x00010000
 
+#define AHCI_PCI_ERROR_RECOVERY_TIMEOUT	(120 * HZ)
+
 enum {
 	AHCI_MAX_PORTS		= 32,
 	AHCI_MAX_CLKS		= 5,
@@ -341,6 +343,7 @@ struct ahci_host_priv {
 	struct phy		**phys;
 	unsigned		nports;		/* Number of ports */
 	void			*plat_data;	/* Other platform data */
+	wait_queue_head_t	eeh_wait_q;
 	/*
 	 * Optional ahci_start_engine override, if not set this gets set to the
 	 * default ahci_start_engine during ahci_save_initial_config, this can
diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index 287c4ba..bd7422a 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -43,6 +43,7 @@
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_cmnd.h>
 #include <linux/libata.h>
+#include <linux/pci.h>
 #include "ahci.h"
 #include "libata.h"
 
@@ -1968,6 +1969,16 @@ static void ahci_thaw(struct ata_port *ap)
 void ahci_error_handler(struct ata_port *ap)
 {
 	struct ahci_host_priv *hpriv = ap->host->private_data;
+	void __iomem *mmio = hpriv->mmio;
+	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
+	u32 irq_stat;
+
+	irq_stat = readl(mmio + HOST_IRQ_STAT);
+
+	if (pci_channel_offline(pdev))
+		wait_event_timeout(hpriv->eeh_wait_q,
+				!pci_channel_offline(pdev),
+				AHCI_PCI_ERROR_RECOVERY_TIMEOUT);
 
 	if (!(ap->pflags & ATA_PFLAG_FROZEN)) {
 		/* restart engine */
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] ahci: Add support for EEH error recovery
  2015-05-14  1:35 [PATCH] ahci: Add support for EEH error recovery wenxiong
@ 2015-05-14 15:13 ` Tejun Heo
  2015-05-14 15:44   ` Brian King
  0 siblings, 1 reply; 5+ messages in thread
From: Tejun Heo @ 2015-05-14 15:13 UTC (permalink / raw)
  To: wenxiong; +Cc: jgarzik, linux-ide, bjking1, wenxiong

Hello, Wen.

On Wed, May 13, 2015 at 08:35:19PM -0500, wenxiong@linux.vnet.ibm.com wrote:
> From: Wen Xiong <wenxiong@linux.vnet.ibm.com>
> 
> This patch adds the callback functions to support EEH error
> recovery in ahci driver. Also adds the code in ahci_error_handler
> to issue an MMIO load then check if it is in EEH. If it is in EEH,
> ahci_error_handler will wait until EEH recovery is completed.

Can you please explain why we would want this?  What does it buy us?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ahci: Add support for EEH error recovery
  2015-05-14 15:13 ` Tejun Heo
@ 2015-05-14 15:44   ` Brian King
  2015-05-14 15:48     ` Tejun Heo
  0 siblings, 1 reply; 5+ messages in thread
From: Brian King @ 2015-05-14 15:44 UTC (permalink / raw)
  To: Tejun Heo, wenxiong; +Cc: jgarzik, linux-ide, Wen Xiong

On 05/14/2015 10:13 AM, Tejun Heo wrote:
> Hello, Wen.
> 
> On Wed, May 13, 2015 at 08:35:19PM -0500, wenxiong@linux.vnet.ibm.com wrote:
>> From: Wen Xiong <wenxiong@linux.vnet.ibm.com>
>>
>> This patch adds the callback functions to support EEH error
>> recovery in ahci driver. Also adds the code in ahci_error_handler
>> to issue an MMIO load then check if it is in EEH. If it is in EEH,
>> ahci_error_handler will wait until EEH recovery is completed.
> 
> Can you please explain why we would want this?  What does it buy us?

So, on the Power platform, the pci_error_handlers map to our EEH recovery.
In that case, without this patch, if we hit any sort of PCIe error, we
won't be able to recover and we'll lose all access to the ahci disks.
This could be the adapter trying to access an invalid DMA address due
to a transient hardware issue, or it could be due to a driver bug giving
the adapter an invalid address. It could also be other various PCIe
errors that cause our PCIe bridge chip to isolate the device and
place it into the EEH "frozen" state. When this occurs, if the driver
associated with the hardware does not have these handlers registered,
powerpc arch kernel code will hotplug remove the adapter, recover the
adapter, then hotplug add it back. This works OK for some devices,
but generally not so well for storage devices with mounted filesystems,
which would tend to go readonly in this case.

-Brian

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ahci: Add support for EEH error recovery
  2015-05-14 15:44   ` Brian King
@ 2015-05-14 15:48     ` Tejun Heo
  2015-05-14 16:09       ` Brian King
  0 siblings, 1 reply; 5+ messages in thread
From: Tejun Heo @ 2015-05-14 15:48 UTC (permalink / raw)
  To: Brian King; +Cc: wenxiong, jgarzik, linux-ide, Wen Xiong

Hello, Brian.

On Thu, May 14, 2015 at 10:44:18AM -0500, Brian King wrote:
> So, on the Power platform, the pci_error_handlers map to our EEH recovery.

What's EEH?

> In that case, without this patch, if we hit any sort of PCIe error, we
> won't be able to recover and we'll lose all access to the ahci disks.
> This could be the adapter trying to access an invalid DMA address due
> to a transient hardware issue, or it could be due to a driver bug giving
> the adapter an invalid address. It could also be other various PCIe
> errors that cause our PCIe bridge chip to isolate the device and
> place it into the EEH "frozen" state. When this occurs, if the driver
> associated with the hardware does not have these handlers registered,
> powerpc arch kernel code will hotplug remove the adapter, recover the
> adapter, then hotplug add it back. This works OK for some devices,
> but generally not so well for storage devices with mounted filesystems,
> which would tend to go readonly in this case.

I think the above, with more details on how the error handling
actually works (IOW what it does), should be in the patch description
and comments.  Wen, can you please update the patch with more
information?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ahci: Add support for EEH error recovery
  2015-05-14 15:48     ` Tejun Heo
@ 2015-05-14 16:09       ` Brian King
  0 siblings, 0 replies; 5+ messages in thread
From: Brian King @ 2015-05-14 16:09 UTC (permalink / raw)
  To: Tejun Heo; +Cc: wenxiong, jgarzik, linux-ide, Wen Xiong

On 05/14/2015 10:48 AM, Tejun Heo wrote:
> Hello, Brian.
> 
> On Thu, May 14, 2015 at 10:44:18AM -0500, Brian King wrote:
>> So, on the Power platform, the pci_error_handlers map to our EEH recovery.
> 
> What's EEH?

It stands for "Extended Error Handling". 

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/PCI/pci-error-recovery.txt
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/powerpc/eeh-pci-error-recovery.txt

> 
>> In that case, without this patch, if we hit any sort of PCIe error, we
>> won't be able to recover and we'll lose all access to the ahci disks.
>> This could be the adapter trying to access an invalid DMA address due
>> to a transient hardware issue, or it could be due to a driver bug giving
>> the adapter an invalid address. It could also be other various PCIe
>> errors that cause our PCIe bridge chip to isolate the device and
>> place it into the EEH "frozen" state. When this occurs, if the driver
>> associated with the hardware does not have these handlers registered,
>> powerpc arch kernel code will hotplug remove the adapter, recover the
>> adapter, then hotplug add it back. This works OK for some devices,
>> but generally not so well for storage devices with mounted filesystems,
>> which would tend to go readonly in this case.
> 
> I think the above, with more details on how the error handling
> actually works (IOW what it does), should be in the patch description
> and comments.  Wen, can you please update the patch with more
> information?

Agreed.

Thanks,

Brian

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center



^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-05-14 16:10 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-14  1:35 [PATCH] ahci: Add support for EEH error recovery wenxiong
2015-05-14 15:13 ` Tejun Heo
2015-05-14 15:44   ` Brian King
2015-05-14 15:48     ` Tejun Heo
2015-05-14 16:09       ` Brian King

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.