All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Keith Busch <keith.busch@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>,
	linux-block@vger.kernel.org, Ming Lei <ming.lei@redhat.com>,
	James Smart <james.smart@broadcom.com>,
	Jianchao Wang <jianchao.w.wang@oracle.com>,
	Christoph Hellwig <hch@lst.de>, Sagi Grimberg <sagi@grimberg.me>,
	linux-nvme@lists.infradead.org,
	Laurence Oberman <loberman@redhat.com>
Subject: [PATCH V5 9/9] nvme: pci: support nested EH
Date: Fri, 11 May 2018 20:29:33 +0800	[thread overview]
Message-ID: <20180511122933.27155-10-ming.lei@redhat.com> (raw)
In-Reply-To: <20180511122933.27155-1-ming.lei@redhat.com>

When one req is timed out, now nvme_timeout() handles it by the
following way:

	nvme_dev_disable(dev, false);
	nvme_reset_ctrl(&dev->ctrl);
	return BLK_EH_HANDLED.

There are several issues about the above approach:

1) IO may fail during resetting

Admin IO timeout may be triggered in nvme_reset_dev() when error happens.
Normal IO timeout may be triggered too during nvme_wait_freeze() in reset
path. When the two kinds of timeout happen, the current reset mechanism
can't work any more.

2) race between nvme_start_freeze and nvme_wait_freeze() & nvme_unfreeze()

- nvme_dev_disable() and resetting controller are required for recovering
controller, but the two are run from different contexts. nvme_start_freeze()
is call from nvme_dev_disable() which is run timeout work context, and
nvme_unfreeze() is run from reset work context. Unfortunatley timeout may be
triggered during resetting controller, so nvme_start_freeze() may be run
several times.

- Also two reset work may run one by one, this may cause hang in
nvme_wait_freeze() forever too.

3) all namespace's EH require to shutdown & reset the controller

block's timeout handler is per-request-queue, that means each
namespace's error handling may shutdown & reset the whole controller,
then the shutdown from one namespace may quiese queues when resetting
from another namespace is in-progress.

This patch fixes the above issues by using nested EH:

1) run controller shutdown(nvme_dev_disable()) and resetting(nvme_reset_dev)
from one same EH context, so the above race 2) can be fixed easily.

2) always start a new context for handling EH, and cancel all in-flight
requests(include the timed-out ones) in nvme_dev_disable() by quiescing
timeout event before shutdown controller.

3) limit the max number of nested EH, when the limit is reached, removes
the controller and fails all in-flight request.

With this approach, blktest block/011 can be passed.

Cc: James Smart <james.smart@broadcom.com>
Cc: Jianchao Wang <jianchao.w.wang@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: linux-nvme@lists.infradead.org
Cc: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/nvme/host/core.c |  22 +++++
 drivers/nvme/host/nvme.h |   2 +
 drivers/nvme/host/pci.c  | 243 +++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 247 insertions(+), 20 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index af053309c610..1dec353388be 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3581,6 +3581,28 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_unquiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_quiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2a68df197c71..934cf98925f3 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -407,6 +407,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		union nvme_result *res);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b79c7f016489..4366c21e59b2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -71,6 +71,7 @@ struct nvme_queue;
 static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
 		freeze_queue);
+static int nvme_reset_dev(struct nvme_dev *dev);
 
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
@@ -113,6 +114,23 @@ struct nvme_dev {
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
+
+	/* EH handler */
+	spinlock_t	eh_lock;
+	bool		ctrl_shutdown_started;
+	bool		ctrl_failed;
+	unsigned int	nested_eh;
+	struct work_struct fail_ctrl_work;
+	wait_queue_head_t	eh_wq;
+	struct list_head	eh_head;
+};
+
+#define  NVME_MAX_NESTED_EH	32
+struct nvme_eh_work {
+	struct work_struct	work;
+	struct nvme_dev		*dev;
+	int			seq;
+	struct list_head	list;
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1177,6 +1195,176 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 			 csts, result);
 }
 
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+{
+	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
+
+	nvme_get_ctrl(&dev->ctrl);
+	nvme_dev_disable(dev, false, false);
+	if (!queue_work(nvme_wq, &dev->remove_work))
+		nvme_put_ctrl(&dev->ctrl);
+}
+
+static void nvme_eh_fail_ctrl_work(struct work_struct *work)
+{
+	struct nvme_dev *dev =
+		container_of(work, struct nvme_dev, fail_ctrl_work);
+
+	dev_info(dev->ctrl.device, "EH: fail controller\n");
+	nvme_remove_dead_ctrl(dev, 0);
+}
+
+static void nvme_eh_mark_ctrl_shutdown(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	dev->ctrl_shutdown_started = false;
+	spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_sched_fail_ctrl(struct nvme_dev *dev)
+{
+	INIT_WORK(&dev->fail_ctrl_work, nvme_eh_fail_ctrl_work);
+	queue_work(nvme_reset_wq, &dev->fail_ctrl_work);
+}
+
+/* either controller is updated to LIVE or will be removed */
+static bool nvme_eh_reset_done(struct nvme_dev *dev)
+{
+	return dev->ctrl.state == NVME_CTRL_LIVE || dev->ctrl_failed;
+}
+
+static void nvme_eh_done(struct nvme_eh_work *eh_work, int result)
+{
+	struct nvme_dev *dev = eh_work->dev;
+	bool top_eh;
+
+	spin_lock(&dev->eh_lock);
+	top_eh = list_is_last(&eh_work->list, &dev->eh_head);
+	dev->nested_eh--;
+
+	/* Fail controller if the top EH can't recover it */
+	if (!result)
+		wake_up_all(&dev->eh_wq);
+	else if (top_eh) {
+		dev->ctrl_failed = true;
+		nvme_eh_sched_fail_ctrl(dev);
+		wake_up_all(&dev->eh_wq);
+	}
+
+	list_del(&eh_work->list);
+	spin_unlock(&dev->eh_lock);
+
+	dev_info(dev->ctrl.device, "EH %d: state %d, eh_done %d, top eh %d\n",
+			eh_work->seq, dev->ctrl.state, result, top_eh);
+	wait_event(dev->eh_wq, nvme_eh_reset_done(dev));
+
+	/*
+	 * After controller is recovered in upper EH finally, we have to
+	 * unfreeze queues if reset failed in this EH, otherwise blk-mq
+	 * queues' freeze counter may be leaked.
+	 *
+	 * nvme_unfreeze() can only be called after controller state is
+	 * updated to LIVE.
+	 */
+	if (result && (dev->ctrl.state == NVME_CTRL_LIVE))
+		nvme_unfreeze(&dev->ctrl);
+}
+
+static void nvme_eh_work(struct work_struct *work)
+{
+	struct nvme_eh_work *eh_work =
+		container_of(work, struct nvme_eh_work, work);
+	struct nvme_dev *dev = eh_work->dev;
+	int result = -ENODEV;
+	bool top_eh;
+
+	dev_info(dev->ctrl.device, "EH %d: before shutdown\n",
+			eh_work->seq);
+	nvme_dev_disable(dev, false, true);
+
+	/* allow new EH to be created */
+	nvme_eh_mark_ctrl_shutdown(dev);
+
+	/*
+	 * nvme_dev_disable cancels all in-flight requests, and wont't
+	 * cause timout at all, so I am always the top EH now, but it
+	 * becomes not true after 'reset_lock' is held, so have to check
+	 * if I am still the top EH, and force to update to NVME_CTRL_RESETTING
+	 * if yes.
+	 */
+	mutex_lock(&dev->ctrl.reset_lock);
+	spin_lock(&dev->eh_lock);
+
+	/* allow top EH to preempt other inner EH */
+	top_eh = list_is_last(&eh_work->list, &dev->eh_head);
+	dev_info(dev->ctrl.device, "EH %d: after shutdown, top eh: %d\n",
+			eh_work->seq, top_eh);
+	if (!top_eh) {
+		if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+			spin_unlock(&dev->eh_lock);
+			goto done;
+		}
+	} else {
+		nvme_force_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
+		result = 0;
+	}
+
+	spin_unlock(&dev->eh_lock);
+	result = nvme_reset_dev(dev);
+done:
+	mutex_unlock(&dev->ctrl.reset_lock);
+	nvme_eh_done(eh_work, result);
+	dev_info(dev->ctrl.device, "EH %d: after recovery %d\n",
+			eh_work->seq, result);
+
+	kfree(eh_work);
+}
+
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+	bool need_sched = false;
+	bool fail_ctrl = false;
+	struct nvme_eh_work *eh_work;
+	int seq;
+
+	spin_lock(&dev->eh_lock);
+	if (!dev->ctrl_shutdown_started) {
+		need_sched = true;
+		seq = dev->nested_eh;
+		if (++dev->nested_eh >= NVME_MAX_NESTED_EH) {
+			if (!dev->ctrl_failed)
+				dev->ctrl_failed = fail_ctrl = true;
+			else
+				need_sched = false;
+		} else
+			dev->ctrl_shutdown_started = true;
+	}
+	spin_unlock(&dev->eh_lock);
+
+	if (!need_sched)
+		return;
+
+	if (fail_ctrl) {
+ fail_ctrl:
+		nvme_eh_sched_fail_ctrl(dev);
+		return;
+	}
+
+	eh_work = kzalloc(sizeof(*eh_work), GFP_NOIO);
+	if (!eh_work)
+		goto fail_ctrl;
+
+	eh_work->dev = dev;
+	eh_work->seq = seq;
+
+	spin_lock(&dev->eh_lock);
+	list_add_tail(&eh_work->list, &dev->eh_head);
+	spin_unlock(&dev->eh_lock);
+
+	INIT_WORK(&eh_work->work, nvme_eh_work);
+	queue_work(nvme_reset_wq, &eh_work->work);
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1198,9 +1386,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 */
 	if (nvme_should_reset(dev, csts)) {
 		nvme_warn_reset(dev, csts);
-		nvme_dev_disable(dev, false, true);
-		nvme_reset_ctrl(&dev->ctrl);
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	}
 
 	/*
@@ -1225,9 +1412,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false, false);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	default:
 		break;
 	}
@@ -1241,15 +1428,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, reset controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false, true);
-		nvme_reset_ctrl(&dev->ctrl);
-
 		/*
 		 * Mark the request as handled, since the inline shutdown
 		 * forces all outstanding requests to complete.
 		 */
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	}
 
 	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
@@ -2301,12 +2486,26 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
 	}
 	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
 		nvme_suspend_queue(&dev->queues[i]);
+	/*
+	 * safe to sync timeout after queues are quiesced, then all
+	 * requests(include the time-out ones) will be canceled.
+	 */
+	nvme_quiesce_timeout(&dev->ctrl);
+	blk_quiesce_timeout(dev->ctrl.admin_q);
 
 	nvme_pci_disable(dev);
 
+	/*
+	 * Both timeout and interrupt handler have been drained, and all
+	 * in-flight requests will be canceled now.
+	 */
 	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
 	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
 
+	/* all requests have been canceled now, so enable timeout now */
+	nvme_unquiesce_timeout(&dev->ctrl);
+	blk_unquiesce_timeout(dev->ctrl.admin_q);
+
 	/*
 	 * The driver will not be starting up queues again if shutting down so
 	 * must flush all entered requests to their failed completion to avoid
@@ -2355,16 +2554,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	kfree(dev);
 }
 
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
-{
-	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
-
-	nvme_get_ctrl(&dev->ctrl);
-	nvme_dev_disable(dev, false, false);
-	if (!queue_work(nvme_wq, &dev->remove_work))
-		nvme_put_ctrl(&dev->ctrl);
-}
-
 static int nvme_reset_dev(struct nvme_dev *dev)
 {
 	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
@@ -2374,7 +2563,7 @@ static int nvme_reset_dev(struct nvme_dev *dev)
 
 	lockdep_assert_held(&dev->ctrl.reset_lock);
 
-	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
+	if (dev->ctrl.state != NVME_CTRL_RESETTING)
 		goto out;
 
 	/*
@@ -2460,6 +2649,10 @@ static int nvme_reset_dev(struct nvme_dev *dev)
 		unfreeze_queue = true;
 	}
 
+	/* controller state may have been updated already by inner EH */
+	if (dev->ctrl.state == new_state)
+		goto reset_done;
+
 	result = -ENODEV;
 	/*
 	 * If only admin queue live, keep it to do further investigation or
@@ -2473,6 +2666,7 @@ static int nvme_reset_dev(struct nvme_dev *dev)
 
 	nvme_start_ctrl(&dev->ctrl);
 
+ reset_done:
 	if (unfreeze_queue)
 		nvme_unfreeze(&dev->ctrl);
 	return 0;
@@ -2589,6 +2783,13 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
 	return 0;
 }
 
+static void nvme_eh_init(struct nvme_dev *dev)
+{
+	spin_lock_init(&dev->eh_lock);
+	init_waitqueue_head(&dev->eh_wq);
+	INIT_LIST_HEAD(&dev->eh_head);
+}
+
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	int node, result = -ENOMEM;
@@ -2633,6 +2834,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
+	nvme_eh_init(dev);
+
 	nvme_reset_ctrl(&dev->ctrl);
 
 	return 0;
-- 
2.9.5

WARNING: multiple messages have this Message-ID (diff)
From: ming.lei@redhat.com (Ming Lei)
Subject: [PATCH V5 9/9] nvme: pci: support nested EH
Date: Fri, 11 May 2018 20:29:33 +0800	[thread overview]
Message-ID: <20180511122933.27155-10-ming.lei@redhat.com> (raw)
In-Reply-To: <20180511122933.27155-1-ming.lei@redhat.com>

When one req is timed out, now nvme_timeout() handles it by the
following way:

	nvme_dev_disable(dev, false);
	nvme_reset_ctrl(&dev->ctrl);
	return BLK_EH_HANDLED.

There are several issues about the above approach:

1) IO may fail during resetting

Admin IO timeout may be triggered in nvme_reset_dev() when error happens.
Normal IO timeout may be triggered too during nvme_wait_freeze() in reset
path. When the two kinds of timeout happen, the current reset mechanism
can't work any more.

2) race between nvme_start_freeze and nvme_wait_freeze() & nvme_unfreeze()

- nvme_dev_disable() and resetting controller are required for recovering
controller, but the two are run from different contexts. nvme_start_freeze()
is call from nvme_dev_disable() which is run timeout work context, and
nvme_unfreeze() is run from reset work context. Unfortunatley timeout may be
triggered during resetting controller, so nvme_start_freeze() may be run
several times.

- Also two reset work may run one by one, this may cause hang in
nvme_wait_freeze() forever too.

3) all namespace's EH require to shutdown & reset the controller

block's timeout handler is per-request-queue, that means each
namespace's error handling may shutdown & reset the whole controller,
then the shutdown from one namespace may quiese queues when resetting
from another namespace is in-progress.

This patch fixes the above issues by using nested EH:

1) run controller shutdown(nvme_dev_disable()) and resetting(nvme_reset_dev)
from one same EH context, so the above race 2) can be fixed easily.

2) always start a new context for handling EH, and cancel all in-flight
requests(include the timed-out ones) in nvme_dev_disable() by quiescing
timeout event before shutdown controller.

3) limit the max number of nested EH, when the limit is reached, removes
the controller and fails all in-flight request.

With this approach, blktest block/011 can be passed.

Cc: James Smart <james.smart at broadcom.com>
Cc: Jianchao Wang <jianchao.w.wang at oracle.com>
Cc: Christoph Hellwig <hch at lst.de>
Cc: Sagi Grimberg <sagi at grimberg.me>
Cc: linux-nvme at lists.infradead.org
Cc: Laurence Oberman <loberman at redhat.com>
Signed-off-by: Ming Lei <ming.lei at redhat.com>
---
 drivers/nvme/host/core.c |  22 +++++
 drivers/nvme/host/nvme.h |   2 +
 drivers/nvme/host/pci.c  | 243 +++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 247 insertions(+), 20 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index af053309c610..1dec353388be 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3581,6 +3581,28 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_unquiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_quiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2a68df197c71..934cf98925f3 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -407,6 +407,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		union nvme_result *res);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b79c7f016489..4366c21e59b2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -71,6 +71,7 @@ struct nvme_queue;
 static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
 		freeze_queue);
+static int nvme_reset_dev(struct nvme_dev *dev);
 
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
@@ -113,6 +114,23 @@ struct nvme_dev {
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
+
+	/* EH handler */
+	spinlock_t	eh_lock;
+	bool		ctrl_shutdown_started;
+	bool		ctrl_failed;
+	unsigned int	nested_eh;
+	struct work_struct fail_ctrl_work;
+	wait_queue_head_t	eh_wq;
+	struct list_head	eh_head;
+};
+
+#define  NVME_MAX_NESTED_EH	32
+struct nvme_eh_work {
+	struct work_struct	work;
+	struct nvme_dev		*dev;
+	int			seq;
+	struct list_head	list;
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1177,6 +1195,176 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 			 csts, result);
 }
 
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+{
+	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
+
+	nvme_get_ctrl(&dev->ctrl);
+	nvme_dev_disable(dev, false, false);
+	if (!queue_work(nvme_wq, &dev->remove_work))
+		nvme_put_ctrl(&dev->ctrl);
+}
+
+static void nvme_eh_fail_ctrl_work(struct work_struct *work)
+{
+	struct nvme_dev *dev =
+		container_of(work, struct nvme_dev, fail_ctrl_work);
+
+	dev_info(dev->ctrl.device, "EH: fail controller\n");
+	nvme_remove_dead_ctrl(dev, 0);
+}
+
+static void nvme_eh_mark_ctrl_shutdown(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	dev->ctrl_shutdown_started = false;
+	spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_sched_fail_ctrl(struct nvme_dev *dev)
+{
+	INIT_WORK(&dev->fail_ctrl_work, nvme_eh_fail_ctrl_work);
+	queue_work(nvme_reset_wq, &dev->fail_ctrl_work);
+}
+
+/* either controller is updated to LIVE or will be removed */
+static bool nvme_eh_reset_done(struct nvme_dev *dev)
+{
+	return dev->ctrl.state == NVME_CTRL_LIVE || dev->ctrl_failed;
+}
+
+static void nvme_eh_done(struct nvme_eh_work *eh_work, int result)
+{
+	struct nvme_dev *dev = eh_work->dev;
+	bool top_eh;
+
+	spin_lock(&dev->eh_lock);
+	top_eh = list_is_last(&eh_work->list, &dev->eh_head);
+	dev->nested_eh--;
+
+	/* Fail controller if the top EH can't recover it */
+	if (!result)
+		wake_up_all(&dev->eh_wq);
+	else if (top_eh) {
+		dev->ctrl_failed = true;
+		nvme_eh_sched_fail_ctrl(dev);
+		wake_up_all(&dev->eh_wq);
+	}
+
+	list_del(&eh_work->list);
+	spin_unlock(&dev->eh_lock);
+
+	dev_info(dev->ctrl.device, "EH %d: state %d, eh_done %d, top eh %d\n",
+			eh_work->seq, dev->ctrl.state, result, top_eh);
+	wait_event(dev->eh_wq, nvme_eh_reset_done(dev));
+
+	/*
+	 * After controller is recovered in upper EH finally, we have to
+	 * unfreeze queues if reset failed in this EH, otherwise blk-mq
+	 * queues' freeze counter may be leaked.
+	 *
+	 * nvme_unfreeze() can only be called after controller state is
+	 * updated to LIVE.
+	 */
+	if (result && (dev->ctrl.state == NVME_CTRL_LIVE))
+		nvme_unfreeze(&dev->ctrl);
+}
+
+static void nvme_eh_work(struct work_struct *work)
+{
+	struct nvme_eh_work *eh_work =
+		container_of(work, struct nvme_eh_work, work);
+	struct nvme_dev *dev = eh_work->dev;
+	int result = -ENODEV;
+	bool top_eh;
+
+	dev_info(dev->ctrl.device, "EH %d: before shutdown\n",
+			eh_work->seq);
+	nvme_dev_disable(dev, false, true);
+
+	/* allow new EH to be created */
+	nvme_eh_mark_ctrl_shutdown(dev);
+
+	/*
+	 * nvme_dev_disable cancels all in-flight requests, and wont't
+	 * cause timout at all, so I am always the top EH now, but it
+	 * becomes not true after 'reset_lock' is held, so have to check
+	 * if I am still the top EH, and force to update to NVME_CTRL_RESETTING
+	 * if yes.
+	 */
+	mutex_lock(&dev->ctrl.reset_lock);
+	spin_lock(&dev->eh_lock);
+
+	/* allow top EH to preempt other inner EH */
+	top_eh = list_is_last(&eh_work->list, &dev->eh_head);
+	dev_info(dev->ctrl.device, "EH %d: after shutdown, top eh: %d\n",
+			eh_work->seq, top_eh);
+	if (!top_eh) {
+		if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+			spin_unlock(&dev->eh_lock);
+			goto done;
+		}
+	} else {
+		nvme_force_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
+		result = 0;
+	}
+
+	spin_unlock(&dev->eh_lock);
+	result = nvme_reset_dev(dev);
+done:
+	mutex_unlock(&dev->ctrl.reset_lock);
+	nvme_eh_done(eh_work, result);
+	dev_info(dev->ctrl.device, "EH %d: after recovery %d\n",
+			eh_work->seq, result);
+
+	kfree(eh_work);
+}
+
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+	bool need_sched = false;
+	bool fail_ctrl = false;
+	struct nvme_eh_work *eh_work;
+	int seq;
+
+	spin_lock(&dev->eh_lock);
+	if (!dev->ctrl_shutdown_started) {
+		need_sched = true;
+		seq = dev->nested_eh;
+		if (++dev->nested_eh >= NVME_MAX_NESTED_EH) {
+			if (!dev->ctrl_failed)
+				dev->ctrl_failed = fail_ctrl = true;
+			else
+				need_sched = false;
+		} else
+			dev->ctrl_shutdown_started = true;
+	}
+	spin_unlock(&dev->eh_lock);
+
+	if (!need_sched)
+		return;
+
+	if (fail_ctrl) {
+ fail_ctrl:
+		nvme_eh_sched_fail_ctrl(dev);
+		return;
+	}
+
+	eh_work = kzalloc(sizeof(*eh_work), GFP_NOIO);
+	if (!eh_work)
+		goto fail_ctrl;
+
+	eh_work->dev = dev;
+	eh_work->seq = seq;
+
+	spin_lock(&dev->eh_lock);
+	list_add_tail(&eh_work->list, &dev->eh_head);
+	spin_unlock(&dev->eh_lock);
+
+	INIT_WORK(&eh_work->work, nvme_eh_work);
+	queue_work(nvme_reset_wq, &eh_work->work);
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1198,9 +1386,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 */
 	if (nvme_should_reset(dev, csts)) {
 		nvme_warn_reset(dev, csts);
-		nvme_dev_disable(dev, false, true);
-		nvme_reset_ctrl(&dev->ctrl);
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	}
 
 	/*
@@ -1225,9 +1412,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false, false);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	default:
 		break;
 	}
@@ -1241,15 +1428,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, reset controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false, true);
-		nvme_reset_ctrl(&dev->ctrl);
-
 		/*
 		 * Mark the request as handled, since the inline shutdown
 		 * forces all outstanding requests to complete.
 		 */
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	}
 
 	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
@@ -2301,12 +2486,26 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
 	}
 	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
 		nvme_suspend_queue(&dev->queues[i]);
+	/*
+	 * safe to sync timeout after queues are quiesced, then all
+	 * requests(include the time-out ones) will be canceled.
+	 */
+	nvme_quiesce_timeout(&dev->ctrl);
+	blk_quiesce_timeout(dev->ctrl.admin_q);
 
 	nvme_pci_disable(dev);
 
+	/*
+	 * Both timeout and interrupt handler have been drained, and all
+	 * in-flight requests will be canceled now.
+	 */
 	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
 	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
 
+	/* all requests have been canceled now, so enable timeout now */
+	nvme_unquiesce_timeout(&dev->ctrl);
+	blk_unquiesce_timeout(dev->ctrl.admin_q);
+
 	/*
 	 * The driver will not be starting up queues again if shutting down so
 	 * must flush all entered requests to their failed completion to avoid
@@ -2355,16 +2554,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	kfree(dev);
 }
 
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
-{
-	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
-
-	nvme_get_ctrl(&dev->ctrl);
-	nvme_dev_disable(dev, false, false);
-	if (!queue_work(nvme_wq, &dev->remove_work))
-		nvme_put_ctrl(&dev->ctrl);
-}
-
 static int nvme_reset_dev(struct nvme_dev *dev)
 {
 	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
@@ -2374,7 +2563,7 @@ static int nvme_reset_dev(struct nvme_dev *dev)
 
 	lockdep_assert_held(&dev->ctrl.reset_lock);
 
-	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
+	if (dev->ctrl.state != NVME_CTRL_RESETTING)
 		goto out;
 
 	/*
@@ -2460,6 +2649,10 @@ static int nvme_reset_dev(struct nvme_dev *dev)
 		unfreeze_queue = true;
 	}
 
+	/* controller state may have been updated already by inner EH */
+	if (dev->ctrl.state == new_state)
+		goto reset_done;
+
 	result = -ENODEV;
 	/*
 	 * If only admin queue live, keep it to do further investigation or
@@ -2473,6 +2666,7 @@ static int nvme_reset_dev(struct nvme_dev *dev)
 
 	nvme_start_ctrl(&dev->ctrl);
 
+ reset_done:
 	if (unfreeze_queue)
 		nvme_unfreeze(&dev->ctrl);
 	return 0;
@@ -2589,6 +2783,13 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
 	return 0;
 }
 
+static void nvme_eh_init(struct nvme_dev *dev)
+{
+	spin_lock_init(&dev->eh_lock);
+	init_waitqueue_head(&dev->eh_wq);
+	INIT_LIST_HEAD(&dev->eh_head);
+}
+
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	int node, result = -ENOMEM;
@@ -2633,6 +2834,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
+	nvme_eh_init(dev);
+
 	nvme_reset_ctrl(&dev->ctrl);
 
 	return 0;
-- 
2.9.5

  parent reply	other threads:[~2018-05-11 12:29 UTC|newest]

Thread overview: 64+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-05-11 12:29 [PATCH V5 0/9] nvme: pci: fix & improve timeout handling Ming Lei
2018-05-11 12:29 ` Ming Lei
2018-05-11 12:29 ` [PATCH V5 1/9] block: introduce blk_quiesce_timeout() and blk_unquiesce_timeout() Ming Lei
2018-05-11 12:29   ` Ming Lei
2018-05-11 12:29 ` [PATCH V5 2/9] nvme: pci: cover timeout for admin commands running in EH Ming Lei
2018-05-11 12:29   ` Ming Lei
2018-05-11 12:29 ` [PATCH V5 3/9] nvme: pci: only wait freezing if queue is frozen Ming Lei
2018-05-11 12:29   ` Ming Lei
2018-05-11 12:29 ` [PATCH V5 4/9] nvme: pci: freeze queue in nvme_dev_disable() in case of error recovery Ming Lei
2018-05-11 12:29   ` Ming Lei
2018-05-11 12:29 ` [PATCH V5 5/9] nvme: pci: prepare for supporting error recovery from resetting context Ming Lei
2018-05-11 12:29   ` Ming Lei
2018-05-11 12:29 ` [PATCH V5 6/9] nvme: pci: move error handling out of nvme_reset_dev() Ming Lei
2018-05-11 12:29   ` Ming Lei
2018-05-11 12:29 ` [PATCH V5 7/9] nvme: pci: don't unfreeze queue until controller state updating succeeds Ming Lei
2018-05-11 12:29   ` Ming Lei
2018-05-11 12:29 ` [PATCH V5 8/9] nvme: core: introduce nvme_force_change_ctrl_state() Ming Lei
2018-05-11 12:29   ` Ming Lei
2018-05-11 12:29 ` Ming Lei [this message]
2018-05-11 12:29   ` [PATCH V5 9/9] nvme: pci: support nested EH Ming Lei
2018-05-15 10:02   ` jianchao.wang
2018-05-15 10:02     ` jianchao.wang
2018-05-15 12:39     ` Ming Lei
2018-05-15 12:39       ` Ming Lei
2018-05-11 20:50 ` [PATCH V5 0/9] nvme: pci: fix & improve timeout handling Keith Busch
2018-05-11 20:50   ` Keith Busch
2018-05-12  0:21   ` Ming Lei
2018-05-12  0:21     ` Ming Lei
2018-05-14 15:18     ` Keith Busch
2018-05-14 15:18       ` Keith Busch
2018-05-14 23:47       ` Ming Lei
2018-05-14 23:47         ` Ming Lei
2018-05-15  0:33         ` Keith Busch
2018-05-15  0:33           ` Keith Busch
2018-05-15  9:08           ` Ming Lei
2018-05-15  9:08             ` Ming Lei
2018-05-16  4:31           ` Ming Lei
2018-05-16  4:31             ` Ming Lei
2018-05-16 15:18             ` Keith Busch
2018-05-16 15:18               ` Keith Busch
2018-05-16 22:18               ` Ming Lei
2018-05-16 22:18                 ` Ming Lei
2018-05-14  8:21 ` jianchao.wang
2018-05-14  8:21   ` jianchao.wang
2018-05-14  9:38   ` Ming Lei
2018-05-14  9:38     ` Ming Lei
2018-05-14 10:05     ` jianchao.wang
2018-05-14 10:05       ` jianchao.wang
2018-05-14 12:22       ` Ming Lei
2018-05-14 12:22         ` Ming Lei
2018-05-15  0:33         ` Ming Lei
2018-05-15  0:33           ` Ming Lei
2018-05-15  9:56           ` jianchao.wang
2018-05-15  9:56             ` jianchao.wang
2018-05-15 12:56             ` Ming Lei
2018-05-15 12:56               ` Ming Lei
2018-05-16  3:03               ` jianchao.wang
2018-05-16  3:03                 ` jianchao.wang
2018-05-16  2:04             ` Ming Lei
2018-05-16  2:04               ` Ming Lei
2018-05-16  2:09               ` Ming Lei
2018-05-16  2:09                 ` Ming Lei
2018-05-16  2:15                 ` jianchao.wang
2018-05-16  2:15                   ` jianchao.wang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180511122933.27155-10-ming.lei@redhat.com \
    --to=ming.lei@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=hch@lst.de \
    --cc=james.smart@broadcom.com \
    --cc=jianchao.w.wang@oracle.com \
    --cc=keith.busch@intel.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=loberman@redhat.com \
    --cc=sagi@grimberg.me \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.