From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
From: Ming Lei
To: Keith Busch
Cc: Jens Axboe, linux-block@vger.kernel.org, Ming Lei, James Smart,
	Jianchao Wang, Christoph Hellwig, Sagi Grimberg,
	linux-nvme@lists.infradead.org, Laurence Oberman
Subject: [PATCH V6 11/11] nvme: pci: support nested EH
Date: Wed, 16 May 2018 12:03:13 +0800
Message-Id: <20180516040313.13596-12-ming.lei@redhat.com>
In-Reply-To: <20180516040313.13596-1-ming.lei@redhat.com>
References: <20180516040313.13596-1-ming.lei@redhat.com>
List-ID:

When a request times out, nvme_timeout() currently handles it the
following way:

	nvme_dev_disable(dev, false);
	nvme_reset_ctrl(&dev->ctrl);
	return BLK_EH_HANDLED;

There are several issues with this approach:

1) IO may fail during resetting

An admin IO timeout may be triggered in nvme_reset_dev() when an error
happens, and a normal IO timeout may be triggered during
nvme_wait_freeze() in the reset path. When either kind of timeout
happens, the current reset mechanism can't work any more.

2) race between nvme_start_freeze() and nvme_wait_freeze() & nvme_unfreeze()

- nvme_dev_disable() and resetting the controller are both required for
recovering the controller, but the two are run from different contexts.
nvme_start_freeze() is called from nvme_dev_disable(), which runs in
timeout work context, and nvme_unfreeze() is run from reset work
context. Unfortunately, a timeout may be triggered while the controller
is being reset, so nvme_start_freeze() may be run several times.

- Also, two reset works may run one after another; this may cause
nvme_wait_freeze() to hang forever too.

3) every namespace's EH requires shutting down & resetting the controller

The block layer's timeout handler is per request queue, which means
each namespace's error handling may shut down & reset the whole
controller; the shutdown from one namespace may then quiesce queues
while a reset triggered from another namespace is in progress.

This patch fixes the above issues by using nested EH:

1) run controller shutdown (nvme_dev_disable()) and reset
(nvme_reset_dev()) from the same EH context, so race 2) above can be
fixed easily.

2) always start a new context for handling EH, and cancel all in-flight
requests (including the timed-out ones) in nvme_dev_disable() by
quiescing the timeout events before shutting down the controller.

3) limit the max number of nested EH; when the limit is reached, remove
the controller and fail all in-flight requests.

With this approach, blktests block/011 can be passed.
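
In outline, the recovery flow added by this patch is roughly the
following (simplified pseudo-code; the real implementation lives in
nvme_eh_schedule(), nvme_eh_work() and nvme_eh_done() in the diff
below, which also handle locking and the non-top-EH cases omitted
here):

	nvme_timeout()
		nvme_eh_schedule(dev);		/* only schedules EH work */
		return BLK_EH_RESET_TIMER;	/* request stays in-flight */

	nvme_eh_schedule()
		seq = dev->nested_eh++;
		if (dev->nested_eh >= NVME_MAX_NESTED_EH)
			nvme_eh_sched_fail_ctrl(dev);	/* give up, remove controller */
		else
			queue_work(nvme_reset_wq, &eh_work->work);

	nvme_eh_work()
		nvme_dev_disable(dev, false, true);	/* quiesce timeout, cancel all in-flight requests */
		nvme_reset_dev(dev);			/* top (outermost) EH forces NVME_CTRL_RESETTING */
		nvme_eh_done(eh_work, result);		/* wait until controller is LIVE/ADMIN_ONLY or removed */
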
Cc: James Smart
Cc: Jianchao Wang
Cc: Christoph Hellwig
Cc: Sagi Grimberg
Cc: linux-nvme@lists.infradead.org
Cc: Laurence Oberman
Signed-off-by: Ming Lei
---
 drivers/nvme/host/core.c |  22 +++++
 drivers/nvme/host/nvme.h |   2 +
 drivers/nvme/host/pci.c  | 252 +++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 256 insertions(+), 20 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9e51c3e1f534..264619dc81db 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3587,6 +3587,28 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_unquiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_quiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 715239226f4c..5ed7d7ddd597 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -412,6 +412,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		union nvme_result *res);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4236d79e3643..58e92c7c10e0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -71,6 +71,7 @@ struct nvme_queue;
 static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
 		freeze_queue);
+static int nvme_reset_dev(struct nvme_dev *dev);
 
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
@@ -113,6 +114,23 @@ struct nvme_dev {
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
+
+	/* EH handler */
+	spinlock_t eh_lock;
+	bool ctrl_shutdown_started;
+	bool ctrl_failed;
+	unsigned int nested_eh;
+	struct work_struct fail_ctrl_work;
+	wait_queue_head_t eh_wq;
+	struct list_head eh_head;
+};
+
+#define NVME_MAX_NESTED_EH 32
+struct nvme_eh_work {
+	struct work_struct work;
+	struct nvme_dev *dev;
+	int seq;
+	struct list_head list;
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1186,6 +1204,183 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 			 csts, result);
 }
 
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+{
+	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
+
+	nvme_get_ctrl(&dev->ctrl);
+	nvme_dev_disable(dev, false, false);
+	if (!queue_work(nvme_wq, &dev->remove_work))
+		nvme_put_ctrl(&dev->ctrl);
+}
+
+static void nvme_eh_fail_ctrl_work(struct work_struct *work)
+{
+	struct nvme_dev *dev =
+		container_of(work, struct nvme_dev, fail_ctrl_work);
+
+	dev_info(dev->ctrl.device, "EH: fail controller\n");
+	nvme_remove_dead_ctrl(dev, 0);
+}
+
+static void nvme_eh_mark_ctrl_shutdown(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	dev->ctrl_shutdown_started = false;
+	spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_sched_fail_ctrl(struct nvme_dev *dev)
+{
+	INIT_WORK(&dev->fail_ctrl_work, nvme_eh_fail_ctrl_work);
+	queue_work(nvme_reset_wq, &dev->fail_ctrl_work);
+}
+
+/* either controller is updated to LIVE or will be removed */
+static bool nvme_eh_reset_done(struct nvme_dev *dev)
+{
+	return dev->ctrl.state == NVME_CTRL_LIVE ||
+		dev->ctrl.state == NVME_CTRL_ADMIN_ONLY ||
+		dev->ctrl_failed;
+}
+
+static void nvme_eh_done(struct nvme_eh_work *eh_work, int result)
+{
+	struct nvme_dev *dev = eh_work->dev;
+	bool top_eh;
+
+	spin_lock(&dev->eh_lock);
+	top_eh = list_is_last(&eh_work->list, &dev->eh_head);
+
+	/* Fail controller if the top EH can't recover it */
+	if (!result)
+		wake_up_all(&dev->eh_wq);
+	else if (top_eh) {
+		dev->ctrl_failed = true;
+		nvme_eh_sched_fail_ctrl(dev);
+		wake_up_all(&dev->eh_wq);
+	}
+
+	list_del(&eh_work->list);
+	spin_unlock(&dev->eh_lock);
+
+	dev_info(dev->ctrl.device, "EH %d: state %d, eh_done %d, top eh %d\n",
+			eh_work->seq, dev->ctrl.state, result, top_eh);
+	wait_event(dev->eh_wq, nvme_eh_reset_done(dev));
+
+	/* release the EH seq, so outer EH can be allocated bigger seq No. */
+	spin_lock(&dev->eh_lock);
+	dev->nested_eh--;
+	spin_unlock(&dev->eh_lock);
+
+	/*
+	 * After controller is recovered in upper EH finally, we have to
+	 * unfreeze queues if reset failed in this EH, otherwise blk-mq
+	 * queues' freeze counter may be leaked.
+	 *
+	 * nvme_unfreeze() can only be called after controller state is
+	 * updated to LIVE.
+	 */
+	if (result && (dev->ctrl.state == NVME_CTRL_LIVE ||
+			dev->ctrl.state == NVME_CTRL_ADMIN_ONLY))
+		nvme_unfreeze(&dev->ctrl);
+}
+
+static void nvme_eh_work(struct work_struct *work)
+{
+	struct nvme_eh_work *eh_work =
+		container_of(work, struct nvme_eh_work, work);
+	struct nvme_dev *dev = eh_work->dev;
+	int result = -ENODEV;
+	bool top_eh;
+
+	dev_info(dev->ctrl.device, "EH %d: before shutdown\n",
+			eh_work->seq);
+	nvme_dev_disable(dev, false, true);
+
+	/* allow new EH to be created */
+	nvme_eh_mark_ctrl_shutdown(dev);
+
+	/*
+	 * nvme_dev_disable cancels all in-flight requests, and won't
+	 * cause timeout at all, so I am always the top EH now, but it
+	 * becomes not true after 'reset_lock' is held, so have to check
+	 * if I am still the top EH, and force to update to NVME_CTRL_RESETTING
+	 * if yes.
+	 */
+	mutex_lock(&dev->ctrl.reset_lock);
+	spin_lock(&dev->eh_lock);
+
+	/* allow top EH to preempt other inner EH */
+	top_eh = list_is_last(&eh_work->list, &dev->eh_head);
+	dev_info(dev->ctrl.device, "EH %d: after shutdown, top eh: %d\n",
+			eh_work->seq, top_eh);
+	if (!top_eh) {
+		if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+			spin_unlock(&dev->eh_lock);
+			goto done;
+		}
+	} else {
+		nvme_force_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
+		result = 0;
+	}
+
+	spin_unlock(&dev->eh_lock);
+	result = nvme_reset_dev(dev);
+done:
+	mutex_unlock(&dev->ctrl.reset_lock);
+	nvme_eh_done(eh_work, result);
+	dev_info(dev->ctrl.device, "EH %d: after recovery %d\n",
+			eh_work->seq, result);
+
+	kfree(eh_work);
+}
+
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+	bool need_sched = false;
+	bool fail_ctrl = false;
+	struct nvme_eh_work *eh_work;
+	int seq;
+
+	spin_lock(&dev->eh_lock);
+	if (!dev->ctrl_shutdown_started) {
+		need_sched = true;
+		seq = dev->nested_eh;
+		if (++dev->nested_eh >= NVME_MAX_NESTED_EH) {
+			if (!dev->ctrl_failed)
+				dev->ctrl_failed = fail_ctrl = true;
+			else
+				need_sched = false;
+		} else
+			dev->ctrl_shutdown_started = true;
+	}
+	spin_unlock(&dev->eh_lock);
+
+	if (!need_sched)
+		return;
+
+	if (fail_ctrl) {
+ fail_ctrl:
+		nvme_eh_sched_fail_ctrl(dev);
+		return;
+	}
+
+	eh_work = kzalloc(sizeof(*eh_work), GFP_NOIO);
+	if (!eh_work)
+		goto fail_ctrl;
+
+	eh_work->dev = dev;
+	eh_work->seq = seq;
+
+	spin_lock(&dev->eh_lock);
+	list_add_tail(&eh_work->list, &dev->eh_head);
+	spin_unlock(&dev->eh_lock);
+
+	INIT_WORK(&eh_work->work, nvme_eh_work);
+	queue_work(nvme_reset_wq, &eh_work->work);
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1207,9 +1402,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 */
 	if (nvme_should_reset(dev, csts)) {
 		nvme_warn_reset(dev, csts);
-		nvme_dev_disable(dev, false, true);
-		nvme_reset_ctrl(&dev->ctrl);
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	}
 
 	/*
@@ -1234,9 +1428,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false, false);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	default:
 		break;
 	}
@@ -1250,15 +1444,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, reset controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false, true);
-		nvme_reset_ctrl(&dev->ctrl);
-
 		/*
 		 * Mark the request as handled, since the inline shutdown
 		 * forces all outstanding requests to complete.
 		 */
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	}
 
 	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
@@ -2316,12 +2508,28 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
 	}
 	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
 		nvme_suspend_queue(&dev->queues[i]);
+	/*
+	 * safe to sync timeout after queues are quiesced, then all
+	 * requests (including the timed-out ones) will be canceled.
+	 */
+	nvme_quiesce_timeout(&dev->ctrl);
+	if (dev->ctrl.admin_q)
+		blk_quiesce_timeout(dev->ctrl.admin_q);
 	nvme_pci_disable(dev);
 
+	/*
+	 * Both timeout and interrupt handler have been drained, and all
+	 * in-flight requests will be canceled now.
+	 */
 	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
 	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
 
+	/* all requests have been canceled now, so enable timeout now */
+	nvme_unquiesce_timeout(&dev->ctrl);
+	if (dev->ctrl.admin_q)
+		blk_unquiesce_timeout(dev->ctrl.admin_q);
+
 	/*
 	 * The driver will not be starting up queues again if shutting down so
 	 * must flush all entered requests to their failed completion to avoid
@@ -2381,16 +2589,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	kfree(dev);
 }
 
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
-{
-	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
-
-	nvme_get_ctrl(&dev->ctrl);
-	nvme_dev_disable(dev, false, false);
-	if (!queue_work(nvme_wq, &dev->remove_work))
-		nvme_put_ctrl(&dev->ctrl);
-}
-
 static int nvme_reset_dev(struct nvme_dev *dev)
 {
 	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
@@ -2400,7 +2598,7 @@ static int nvme_reset_dev(struct nvme_dev *dev)
 
 	lockdep_assert_held(&dev->ctrl.reset_lock);
 
-	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
+	if (dev->ctrl.state != NVME_CTRL_RESETTING)
 		goto out;
 
 	/*
@@ -2486,6 +2684,10 @@ static int nvme_reset_dev(struct nvme_dev *dev)
 		unfreeze_queue = true;
 	}
 
+	/* controller state may have been updated already by inner EH */
+	if (dev->ctrl.state == new_state)
+		goto reset_done;
+
 	result = -ENODEV;
 	/*
 	 * If only admin queue live, keep it to do further investigation or
@@ -2499,6 +2701,7 @@ static int nvme_reset_dev(struct nvme_dev *dev)
 
 	nvme_start_ctrl(&dev->ctrl);
 
+ reset_done:
 	if (unfreeze_queue)
 		nvme_unfreeze(&dev->ctrl);
 	return 0;
@@ -2615,6 +2818,13 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
 	return 0;
 }
 
+static void nvme_eh_init(struct nvme_dev *dev)
+{
+	spin_lock_init(&dev->eh_lock);
+	init_waitqueue_head(&dev->eh_wq);
+	INIT_LIST_HEAD(&dev->eh_head);
+}
+
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	int node, result = -ENOMEM;
@@ -2659,6 +2869,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
+	nvme_eh_init(dev);
+
 	nvme_reset_ctrl(&dev->ctrl);
 
 	return 0;
-- 
2.9.5