All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] nvme-pci: Move two functions to avoid forward reference
@ 2022-06-01  4:12 Michael Kelley
  2022-06-01  4:12 ` [PATCH 2/2] nvme-pci: handle persistent internal error AER from NVMe controller Michael Kelley
  0 siblings, 1 reply; 5+ messages in thread
From: Michael Kelley @ 2022-06-01  4:12 UTC (permalink / raw)
  To: kbusch, axboe, hch, sagi, linux-nvme, linux-kernel
  Cc: mikelley, caroline.subramoney, riwurd, nathan.obr

Move nvme_should_reset() and nvme_warn_reset() earlier in
the source file to avoid forward references in a subsequent
patch. No functional change.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
---
 drivers/nvme/host/pci.c | 86 ++++++++++++++++++++++++-------------------------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5a98a7d..4dd87ac 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1076,6 +1076,49 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
+static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
+{
+	/* If true, indicates loss of adapter communication, possibly by a
+	 * NVMe Subsystem reset.
+	 */
+	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
+
+	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
+	switch (dev->ctrl.state) {
+	case NVME_CTRL_RESETTING:
+	case NVME_CTRL_CONNECTING:
+		return false;
+	default:
+		break;
+	}
+
+	/* We shouldn't reset unless the controller is on fatal error state
+	 * _or_ if we lost the communication with it.
+	 */
+	if (!(csts & NVME_CSTS_CFS) && !nssro)
+		return false;
+
+	return true;
+}
+
+static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
+{
+	/* Read a config register to help see what died. */
+	u16 pci_status;
+	int result;
+
+	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
+				      &pci_status);
+	if (result == PCIBIOS_SUCCESSFUL)
+		dev_warn(dev->ctrl.device,
+			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
+			 csts, pci_status);
+	else
+		dev_warn(dev->ctrl.device,
+			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
+			 csts, result);
+}
+
 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 				   struct io_comp_batch *iob, u16 idx)
 {
@@ -1293,49 +1336,6 @@ static void abort_endio(struct request *req, blk_status_t error)
 	blk_mq_free_request(req);
 }
 
-static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
-{
-	/* If true, indicates loss of adapter communication, possibly by a
-	 * NVMe Subsystem reset.
-	 */
-	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
-
-	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
-	switch (dev->ctrl.state) {
-	case NVME_CTRL_RESETTING:
-	case NVME_CTRL_CONNECTING:
-		return false;
-	default:
-		break;
-	}
-
-	/* We shouldn't reset unless the controller is on fatal error state
-	 * _or_ if we lost the communication with it.
-	 */
-	if (!(csts & NVME_CSTS_CFS) && !nssro)
-		return false;
-
-	return true;
-}
-
-static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
-{
-	/* Read a config register to help see what died. */
-	u16 pci_status;
-	int result;
-
-	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
-				      &pci_status);
-	if (result == PCIBIOS_SUCCESSFUL)
-		dev_warn(dev->ctrl.device,
-			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
-			 csts, pci_status);
-	else
-		dev_warn(dev->ctrl.device,
-			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
-			 csts, result);
-}
-
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/2] nvme-pci: handle persistent internal error AER from NVMe controller
  2022-06-01  4:12 [PATCH 1/2] nvme-pci: Move two functions to avoid forward reference Michael Kelley
@ 2022-06-01  4:12 ` Michael Kelley
  2022-06-01  7:35   ` Christoph Hellwig
  0 siblings, 1 reply; 5+ messages in thread
From: Michael Kelley @ 2022-06-01  4:12 UTC (permalink / raw)
  To: kbusch, axboe, hch, sagi, linux-nvme, linux-kernel
  Cc: mikelley, caroline.subramoney, riwurd, nathan.obr

In the NVM Express Revision 1.4 spec, Figure 145 describes possible
values for an AER with event type "Error" (value 000b). For a
Persistent Internal Error (value 03h), the host should perform a
controller reset.

Add support for this error using code that already exists for
doing a controller reset in response to a request timeout.

This new support was tested in a lab environment where we can
generate the persistent internal error on demand, and observe
both the Linux side and NVMe controller side to see that the
controller reset has been done.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
---

 drivers/nvme/host/pci.c | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/nvme.h    |  4 ++++
 2 files changed, 41 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4dd87ac..b2140e9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -131,6 +131,7 @@ struct nvme_dev {
 	void __iomem *bar;
 	unsigned long bar_mapped_size;
 	struct work_struct remove_work;
+	struct work_struct persistent_err_work;
 	struct mutex shutdown_lock;
 	bool subsystem;
 	u64 cmb_size;
@@ -1119,6 +1120,39 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 			 csts, result);
 }
 
+static void nvme_persistent_err_work(struct work_struct *work)
+{
+	struct nvme_dev *dev = container_of(work, struct nvme_dev,
+						persistent_err_work);
+
+	nvme_dev_disable(dev, false);
+	nvme_reset_ctrl(&dev->ctrl);
+}
+
+static bool nvme_check_aen_error(struct nvme_dev *dev,
+			__le16 status, volatile union nvme_result *res)
+{
+	u32 result = le32_to_cpu(res->u32);
+	u32 csts;
+
+	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
+		return false;
+
+	/* Currently only handle Persistent Internal Error */
+	if ((result & 0x07) != NVME_AER_ERROR ||
+	    ((result & 0xff00) >> 8) != NVME_AER_ERROR_PERSIST_INT_ERR)
+		return false;
+
+	/* NVMe Spec 1.4 says to reset the controller */
+	csts = readl(dev->bar + NVME_REG_CSTS);
+	if (!nvme_should_reset(dev, csts))
+		return false;
+
+	nvme_warn_reset(dev, csts);
+	queue_work(nvme_wq, &dev->persistent_err_work);
+	return true;
+}
+
 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 				   struct io_comp_batch *iob, u16 idx)
 {
@@ -1133,6 +1167,8 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 	 * for them but rather special case them here.
 	 */
 	if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) {
+		if (nvme_check_aen_error(nvmeq->dev, cqe->status, &cqe->result))
+			return;
 		nvme_complete_async_event(&nvmeq->dev->ctrl,
 				cqe->status, &cqe->result);
 		return;
@@ -3085,6 +3121,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
+	INIT_WORK(&dev->persistent_err_work, nvme_persistent_err_work);
 	mutex_init(&dev->shutdown_lock);
 
 	result = nvme_setup_prp_pools(dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 29ec3e3..8ced243 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -712,6 +712,10 @@ enum {
 };
 
 enum {
+	NVME_AER_ERROR_PERSIST_INT_ERR	= 0x03,
+};
+
+enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
 	NVME_AER_NOTICE_ANA		= 0x03,
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/2] nvme-pci: handle persistent internal error AER from NVMe controller
  2022-06-01  4:12 ` [PATCH 2/2] nvme-pci: handle persistent internal error AER from NVMe controller Michael Kelley
@ 2022-06-01  7:35   ` Christoph Hellwig
  2022-06-01 15:56     ` Michael Kelley (LINUX)
  0 siblings, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2022-06-01  7:35 UTC (permalink / raw)
  To: Michael Kelley
  Cc: kbusch, axboe, hch, sagi, linux-nvme, linux-kernel,
	caroline.subramoney, riwurd, nathan.obr

This really belongs in common code.  See the untested patch below
for how I'd do it.  The nvme_should_reset would be a separate prep
patch again.

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 72f7c955c7078..b8b8e9ee04120 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -171,6 +171,24 @@ static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
 	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 }
 
+bool nvme_should_reset(struct nvme_ctrl *ctrl, u32 csts)
+{
+	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
+	switch (ctrl->state) {
+	case NVME_CTRL_RESETTING:
+	case NVME_CTRL_CONNECTING:
+		return false;
+	default:
+		break;
+	}
+
+	/*
+	 * We shouldn't reset unless the controller is on fatal error state or
+	 * if we lost the communication with it.
+	 */
+	return (csts & NVME_CSTS_CFS) ||
+		(ctrl->subsystem && (csts & NVME_CSTS_NSSRO));
+}
 
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 {
@@ -4537,24 +4555,41 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 	}
 }
 
+static void nvme_handle_aen_persistent_error(struct nvme_ctrl *ctrl)
+{
+	u32 csts;
+
+	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts) < 0 ||
+	    nvme_should_reset(ctrl, csts)) {
+		dev_warn(ctrl->device, "resetting due to AEN\n");
+		nvme_reset_ctrl(ctrl);
+	}
+}
+
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		volatile union nvme_result *res)
 {
 	u32 result = le32_to_cpu(res->u32);
-	u32 aer_type = result & 0x07;
+	u32 aen_type = result & 0x07;
+	u32 aen_subtype = (result & 0xff00) >> 8;
 
 	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
 		return;
 
-	switch (aer_type) {
+	switch (aen_type) {
 	case NVME_AER_NOTICE:
 		nvme_handle_aen_notice(ctrl, result);
 		break;
 	case NVME_AER_ERROR:
+		if (aen_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
+			nvme_handle_aen_persistent_error(ctrl);
+			break;
+		}
+		fallthrough;
 	case NVME_AER_SMART:
 	case NVME_AER_CSS:
 	case NVME_AER_VS:
-		trace_nvme_async_event(ctrl, aer_type);
+		trace_nvme_async_event(ctrl, aen_type);
 		ctrl->aen_result = result;
 		break;
 	default:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9b72b6ecf33c9..0d7e9ac52d25a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -762,6 +762,7 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
 		      u32 *result);
 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
+bool nvme_should_reset(struct nvme_ctrl *ctrl, u32 csts);
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
 int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5a98a7de09642..c57023d98f8f3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1293,31 +1293,6 @@ static void abort_endio(struct request *req, blk_status_t error)
 	blk_mq_free_request(req);
 }
 
-static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
-{
-	/* If true, indicates loss of adapter communication, possibly by a
-	 * NVMe Subsystem reset.
-	 */
-	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
-
-	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
-	switch (dev->ctrl.state) {
-	case NVME_CTRL_RESETTING:
-	case NVME_CTRL_CONNECTING:
-		return false;
-	default:
-		break;
-	}
-
-	/* We shouldn't reset unless the controller is on fatal error state
-	 * _or_ if we lost the communication with it.
-	 */
-	if (!(csts & NVME_CSTS_CFS) && !nssro)
-		return false;
-
-	return true;
-}
-
 static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 {
 	/* Read a config register to help see what died. */
@@ -1355,7 +1330,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	/*
 	 * Reset immediately if the controller is failed
 	 */
-	if (nvme_should_reset(dev, csts)) {
+	if (nvme_should_reset(&dev->ctrl, csts)) {
 		nvme_warn_reset(dev, csts);
 		nvme_dev_disable(dev, false);
 		nvme_reset_ctrl(&dev->ctrl);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 29ec3e3481ff6..8ced2439f1f34 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -711,6 +711,10 @@ enum {
 	NVME_AER_VS			= 7,
 };
 
+enum {
+	NVME_AER_ERROR_PERSIST_INT_ERR	= 0x03,
+};
+
 enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* RE: [PATCH 2/2] nvme-pci: handle persistent internal error AER from NVMe controller
  2022-06-01  7:35   ` Christoph Hellwig
@ 2022-06-01 15:56     ` Michael Kelley (LINUX)
  2022-06-01 17:08       ` Christoph Hellwig
  0 siblings, 1 reply; 5+ messages in thread
From: Michael Kelley (LINUX) @ 2022-06-01 15:56 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: kbusch, axboe, sagi, linux-nvme, linux-kernel,
	Caroline Subramoney, Richard Wurdack, Nathan Obr

From: Christoph Hellwig <hch@lst.de>
> 
> This really belongs in common code.  See the untested patch below
> for how I'd do it.  The nvme_should_reset would be a separate prep
> patch again.

Indeed, that makes sense.  I had missed that execution gets from
the common code back to the PCI-specific code via the reset_work
function, so I unnecessarily did everything in pci.c.

If there is a persistent error that does a controller reset, it looks
like we should *not* queue async_event_work at the end of
nvme_complete_async_event().  The controller reset will
submit an AER on the admin queue, and so presumably
we don't want nvme_async_event_work() to also try to submit
another AER, which may or may not succeed depending on the
timing of when the controller state shows LIVE again.
Agreed?

Michael

> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 72f7c955c7078..b8b8e9ee04120 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -171,6 +171,24 @@ static inline void nvme_stop_failfast_work(struct nvme_ctrl
> *ctrl)
>  	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
>  }
> 
> +bool nvme_should_reset(struct nvme_ctrl *ctrl, u32 csts)
> +{
> +	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
> +	switch (ctrl->state) {
> +	case NVME_CTRL_RESETTING:
> +	case NVME_CTRL_CONNECTING:
> +		return false;
> +	default:
> +		break;
> +	}
> +
> +	/*
> +	 * We shouldn't reset unless the controller is on fatal error state or
> +	 * if we lost the communication with it.
> +	 */
> +	return (csts & NVME_CSTS_CFS) ||
> +		(ctrl->subsystem && (csts & NVME_CSTS_NSSRO));
> +}
> 
>  int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
>  {
> @@ -4537,24 +4555,41 @@ static void nvme_handle_aen_notice(struct nvme_ctrl
> *ctrl, u32 result)
>  	}
>  }
> 
> +static void nvme_handle_aen_persistent_error(struct nvme_ctrl *ctrl)
> +{
> +	u32 csts;
> +
> +	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts) < 0 ||
> +	    nvme_should_reset(ctrl, csts)) {
> +		dev_warn(ctrl->device, "resetting due to AEN\n");
> +		nvme_reset_ctrl(ctrl);
> +	}
> +}
> +
>  void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
>  		volatile union nvme_result *res)
>  {
>  	u32 result = le32_to_cpu(res->u32);
> -	u32 aer_type = result & 0x07;
> +	u32 aen_type = result & 0x07;
> +	u32 aen_subtype = (result & 0xff00) >> 8;
> 
>  	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
>  		return;
> 
> -	switch (aer_type) {
> +	switch (aen_type) {
>  	case NVME_AER_NOTICE:
>  		nvme_handle_aen_notice(ctrl, result);
>  		break;
>  	case NVME_AER_ERROR:
> +		if (aen_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
> +			nvme_handle_aen_persistent_error(ctrl);
> +			break;
> +		}
> +		fallthrough;
>  	case NVME_AER_SMART:
>  	case NVME_AER_CSS:
>  	case NVME_AER_VS:
> -		trace_nvme_async_event(ctrl, aer_type);
> +		trace_nvme_async_event(ctrl, aen_type);
>  		ctrl->aen_result = result;
>  		break;
>  	default:
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 9b72b6ecf33c9..0d7e9ac52d25a 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -762,6 +762,7 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
>  		      u32 *result);
>  int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
>  void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
> +bool nvme_should_reset(struct nvme_ctrl *ctrl, u32 csts);
>  int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
>  int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
>  int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 5a98a7de09642..c57023d98f8f3 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1293,31 +1293,6 @@ static void abort_endio(struct request *req, blk_status_t
> error)
>  	blk_mq_free_request(req);
>  }
> 
> -static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
> -{
> -	/* If true, indicates loss of adapter communication, possibly by a
> -	 * NVMe Subsystem reset.
> -	 */
> -	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
> -
> -	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
> -	switch (dev->ctrl.state) {
> -	case NVME_CTRL_RESETTING:
> -	case NVME_CTRL_CONNECTING:
> -		return false;
> -	default:
> -		break;
> -	}
> -
> -	/* We shouldn't reset unless the controller is on fatal error state
> -	 * _or_ if we lost the communication with it.
> -	 */
> -	if (!(csts & NVME_CSTS_CFS) && !nssro)
> -		return false;
> -
> -	return true;
> -}
> -
>  static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
>  {
>  	/* Read a config register to help see what died. */
> @@ -1355,7 +1330,7 @@ static enum blk_eh_timer_return nvme_timeout(struct
> request *req, bool reserved)
>  	/*
>  	 * Reset immediately if the controller is failed
>  	 */
> -	if (nvme_should_reset(dev, csts)) {
> +	if (nvme_should_reset(&dev->ctrl, csts)) {
>  		nvme_warn_reset(dev, csts);
>  		nvme_dev_disable(dev, false);
>  		nvme_reset_ctrl(&dev->ctrl);
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 29ec3e3481ff6..8ced2439f1f34 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -711,6 +711,10 @@ enum {
>  	NVME_AER_VS			= 7,
>  };
> 
> +enum {
> +	NVME_AER_ERROR_PERSIST_INT_ERR	= 0x03,
> +};
> +
>  enum {
>  	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
>  	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/2] nvme-pci: handle persistent internal error AER from NVMe controller
  2022-06-01 15:56     ` Michael Kelley (LINUX)
@ 2022-06-01 17:08       ` Christoph Hellwig
  0 siblings, 0 replies; 5+ messages in thread
From: Christoph Hellwig @ 2022-06-01 17:08 UTC (permalink / raw)
  To: Michael Kelley (LINUX)
  Cc: Christoph Hellwig, kbusch, axboe, sagi, linux-nvme, linux-kernel,
	Caroline Subramoney, Richard Wurdack, Nathan Obr

On Wed, Jun 01, 2022 at 03:56:59PM +0000, Michael Kelley (LINUX) wrote:
> If there is a persistent error that does a controller reset, it looks
> like we should *not* queue async_event_work at the end of
> nvme_complete_async_event().  The controller reset will
> submit an AER on the admin queue, and so presumably
> we don't want nvme_async_event_work() to also try to submit
> another AER, which may or may not succeed depending on the
> timing of when the controller state shows LIVE again.
> Agreed?

Yes, that makes sense.  I guess we can just check the return value
from nvme_reset_ctrl and propagate this to nvme_async_event_work
and skip the rearming for that case.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-06-01 17:08 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-01  4:12 [PATCH 1/2] nvme-pci: Move two functions to avoid forward reference Michael Kelley
2022-06-01  4:12 ` [PATCH 2/2] nvme-pci: handle persistent internal error AER from NVMe controller Michael Kelley
2022-06-01  7:35   ` Christoph Hellwig
2022-06-01 15:56     ` Michael Kelley (LINUX)
2022-06-01 17:08       ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.