RE: [PATCH 8/8] cxl/pci: Add (hopeful) error handling support

From: Shiju Jose <shiju.jose@huawei.com>
To: Dan Williams <dan.j.williams@intel.com>,
	"linux-cxl@vger.kernel.org" <linux-cxl@vger.kernel.org>
Cc: "ben.widawsky@intel.com" <ben.widawsky@intel.com>,
	"vishal.l.verma@intel.com" <vishal.l.verma@intel.com>,
	"alison.schofield@intel.com" <alison.schofield@intel.com>,
	Jonathan Cameron <jonathan.cameron@huawei.com>,
	"ira.weiny@intel.com" <ira.weiny@intel.com>,
	"linux-pci@vger.kernel.org" <linux-pci@vger.kernel.org>
Subject: RE: [PATCH 8/8] cxl/pci: Add (hopeful) error handling support
Date: Fri, 18 Mar 2022 09:41:55 +0000	[thread overview]
Message-ID: <e9de480061ad425e9603cf71db5c610d@huawei.com> (raw)
In-Reply-To: <164740406489.3912056.8334546166826246693.stgit@dwillia2-desk3.amr.corp.intel.com>

Hi Dan,

>-----Original Message-----
>From: Dan Williams <dan.j.williams@intel.com>
>Sent: 16 March 2022 04:14
>To: linux-cxl@vger.kernel.org
>Cc: ben.widawsky@intel.com; vishal.l.verma@intel.com;
>alison.schofield@intel.com; Jonathan Cameron
><jonathan.cameron@huawei.com>; ira.weiny@intel.com; linux-
>pci@vger.kernel.org
>Subject: [PATCH 8/8] cxl/pci: Add (hopeful) error handling support
>
>Add nominal error handling that tears down CXL.mem in response to error
>notifications that imply a device reset. Given some CXL.mem may be
>operating as System RAM, there is a high likelihood that these error events
>are fatal. However, if the system survives the notification the expectation is
>that the driver behavior is equivalent to a hot-unplug and re-plug of an
>endpoint.
>
>Note that this does not change the mask values from the default. That awaits
>CXL _OSC support to determine whether platform firmware is in control of the
>mask registers.
>
>Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>---
> drivers/cxl/core/memdev.c |    1
> drivers/cxl/cxlmem.h      |    2 +
> drivers/cxl/pci.c         |  109
>+++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 112 insertions(+)
>
>diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index
>1f76b28f9826..223d512790e1 100644
>--- a/drivers/cxl/core/memdev.c
>+++ b/drivers/cxl/core/memdev.c
>@@ -341,6 +341,7 @@ struct cxl_memdev *devm_cxl_add_memdev(struct
>cxl_dev_state *cxlds)
> 	 * needed as this is ordered with cdev_add() publishing the device.
> 	 */
> 	cxlmd->cxlds = cxlds;
>+	cxlds->cxlmd = cxlmd;
>
> 	cdev = &cxlmd->cdev;
> 	rc = cdev_device_add(cdev, dev);
>diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index
>5d33ce24fe09..f58e16951414 100644
>--- a/drivers/cxl/cxlmem.h
>+++ b/drivers/cxl/cxlmem.h
>@@ -117,6 +117,7 @@ struct cxl_endpoint_dvsec_info {
>  * Currently only memory devices are represented.
>  *
>  * @dev: The device associated with this CXL state
>+ * @cxlmd: The device representing the CXL.mem capabilities of @dev
>  * @regs: Parsed register blocks
>  * @cxl_dvsec: Offset to the PCIe device DVSEC
>  * @payload_size: Size of space for payload @@ -148,6 +149,7 @@ struct
>cxl_endpoint_dvsec_info {
>  */
> struct cxl_dev_state {
> 	struct device *dev;
>+	struct cxl_memdev *cxlmd;
>
> 	struct cxl_regs regs;
> 	int cxl_dvsec;
>diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index
>bde8929450f0..823cbfa093fa 100644
>--- a/drivers/cxl/pci.c
>+++ b/drivers/cxl/pci.c
>@@ -8,6 +8,7 @@
> #include <linux/mutex.h>
> #include <linux/list.h>
> #include <linux/pci.h>
>+#include <linux/aer.h>
> #include <linux/io.h>
> #include "cxlmem.h"
> #include "cxlpci.h"
>@@ -533,6 +534,11 @@ static void cxl_dvsec_ranges(struct cxl_dev_state
>*cxlds)
> 	info->ranges = __cxl_dvsec_ranges(cxlds, info);  }
>
>+static void disable_aer(void *pdev)
>+{
>+	pci_disable_pcie_error_reporting(pdev);
>+}
>+
> static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>{
> 	struct cxl_register_map map;
>@@ -554,6 +560,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const
>struct pci_device_id *id)
> 	cxlds = cxl_dev_state_create(&pdev->dev);
> 	if (IS_ERR(cxlds))
> 		return PTR_ERR(cxlds);
>+	pci_set_drvdata(pdev, cxlds);
>
> 	cxlds->serial = pci_get_dsn(pdev);
> 	cxlds->cxl_dvsec = pci_find_dvsec_capability( @@ -610,6 +617,14 @@
>static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> 	if (IS_ERR(cxlmd))
> 		return PTR_ERR(cxlmd);
>
>+	if (cxlds->regs.ras) {
>+		pci_enable_pcie_error_reporting(pdev);
>+		rc = devm_add_action_or_reset(&pdev->dev, disable_aer,
>pdev);
>+		if (rc)
>+			return rc;
>+	}
>+	pci_save_state(pdev);
>+
> 	if (range_len(&cxlds->pmem_range) &&
>IS_ENABLED(CONFIG_CXL_PMEM))
> 		rc = devm_cxl_add_nvdimm(&pdev->dev, cxlmd);
>
>@@ -623,10 +638,104 @@ static const struct pci_device_id cxl_mem_pci_tbl[]
>= {  };  MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
>
>+/*
>+ * Log the state of the RAS status registers and prepare them to log
>+the
>+ * next error status.
>+ */
>+static void cxl_report_and_clear(struct cxl_dev_state *cxlds) {
>+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
>+	struct device *dev = &cxlmd->dev;
>+	void __iomem *addr;
>+	u32 status;
>+
>+	if (!cxlds->regs.ras)
>+		return;
In the cxl_error_detected () may need to return PCI_ERS_RESULT_NONE
for the following cases, if exist,
1. if (!cxlds->regs.ras),
2. if any errors would be reported during the dev initialization. 

>+
>+	addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
>+	status = readl(addr);
>+	if (status & CXL_RAS_UNCORRECTABLE_STATUS_MASK) {
>+		dev_warn(cxlds->dev, "%s: uncorrectable status: %#08x\n",
>+			 dev_name(dev), status);
>+		writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK,
>addr);
>+	}
For the uncorrectable non-fatal errors, if any, may need to return PCI_ERS_RESULT_NEED_RESET
to trigger the slot reset to prevent more serious issues later. For this case the state would be
"pci_channel_io_normal".

>+
>+	addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
>+	status = readl(addr);
>+	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
>+		dev_warn(cxlds->dev, "%s: correctable status: %#08x\n",
>+			 dev_name(dev), status);
>+		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK,
>addr);
>+	}
>+}
>+
>+static pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>+					   pci_channel_state_t state)
>+{
>+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
>+	struct device *dev = &cxlmd->dev;
>+
>+	/*
>+	 * A frozen channel indicates an impending reset which is fatal to
>+	 * CXL.mem operation, and will likely crash the system. On the off
>+	 * chance the situation is recoverable dump the status of the RAS
>+	 * capability registers and bounce the active state of the memdev.
>+	 */
>+	cxl_report_and_clear(cxlds);
>+
>+	switch (state) {
>+	case pci_channel_io_normal:
>+		return PCI_ERS_RESULT_CAN_RECOVER;
>+	case pci_channel_io_frozen:
>+		dev_warn(&pdev->dev,
>+			 "%s: frozen state error detected, disable
>CXL.mem\n",
>+			 dev_name(dev));
>+		device_release_driver(dev);
>+		return PCI_ERS_RESULT_NEED_RESET;
>+	case pci_channel_io_perm_failure:
>+		dev_warn(&pdev->dev,
>+			 "failure state error detected, request disconnect\n");
>+		return PCI_ERS_RESULT_DISCONNECT;
>+	}
>+	return PCI_ERS_RESULT_NEED_RESET;
>+}
>+
>+static pci_ers_result_t cxl_slot_reset(struct pci_dev *pdev) {
>+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
>+	struct device *dev = &cxlmd->dev;
>+
>+	dev_info(&pdev->dev, "%s: restart CXL.mem after slot reset\n",
>+		 dev_name(dev));
>+	pci_restore_state(pdev);
1. Do we need to call pci_save_state(pdev) here after the reset? though pci_save_state(pdev) is being invoked in the  
cxl_pci_probe().

>+	if (device_attach(dev) <= 0)
>+		return PCI_ERS_RESULT_DISCONNECT;
My understanding is that pci_disable_pcie_error_reporting(pdev) would be called
in the disable_aer () in the reset, 
pci_enable_pcie_error_reporting(pdev) may need to call here after the reset?

>+	return PCI_ERS_RESULT_RECOVERED;
>+}
>+
>+static void cxl_error_resume(struct pci_dev *pdev) {
>+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
>+	struct device *dev = &cxlmd->dev;
>+
>+	dev_info(&pdev->dev, "%s: error resume %s\n", dev_name(dev),
>+		 dev->driver ? "successful" : "failed"); }
>+
>+static const struct pci_error_handlers cxl_error_handlers = {
>+	.error_detected	= cxl_error_detected,
>+	.slot_reset	= cxl_slot_reset,
>+	.resume		= cxl_error_resume,
If the FLR (Function level reset) supported, please add the corresponding callback functions
reset_prepare(..) and reset_done(..).

>+};
>+
> static struct pci_driver cxl_pci_driver = {
> 	.name			= KBUILD_MODNAME,
> 	.id_table		= cxl_mem_pci_tbl,
> 	.probe			= cxl_pci_probe,
>+	.err_handler		= &cxl_error_handlers,
> 	.driver	= {
> 		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
> 	},

Thanks,
Shiju