* [PATCHv3 0/9] nvme timeout fixes, v3
@ 2018-05-24 20:34 Keith Busch
  2018-05-24 20:34 ` [PATCHv3 1/9] nvme: Sync request queues on reset Keith Busch
                   ` (9 more replies)
  0 siblings, 10 replies; 45+ messages in thread
From: Keith Busch @ 2018-05-24 20:34 UTC (permalink / raw)


v2 -> v3:

The main difference is getting rid of the bad idea that we could do
a CONNECTING -> RESETTING transition that breaks the state machine, so
that patch is dropped. The series maintains the concept of retrying a
reset when an init admin command times out. It just does so in the same
context instead of requeueing itself.

This series also includes two additional patches to handle other unlikely
errors on queue initialization and HMB teardown.

Keith Busch (9):
  nvme: Sync request queues on reset
  nvme-pci: Fix queue freeze criteria on reset
  nvme: Move all IO out of controller reset
  nvme-pci: Rate limit the nvme timeout warnings
  nvme-pci: End IO requests immediately in CONNECTING state
  nvme-pci: Unquiesce dead controller queues
  nvme-pci: Attempt reset retry for IO failures
  nvme-pci: Queue creation error handling
  nvme-pci: Don't wait for HMB completion on shutdown

 drivers/nvme/host/core.c |  23 ++++++-
 drivers/nvme/host/nvme.h |   2 +
 drivers/nvme/host/pci.c  | 173 ++++++++++++++++++++++++++++++++++-------------
 3 files changed, 150 insertions(+), 48 deletions(-)

-- 
2.14.3

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-24 20:34 [PATCHv3 0/9] nvme timeout fixes, v3 Keith Busch
@ 2018-05-24 20:34 ` Keith Busch
  2018-05-25 12:42   ` Christoph Hellwig
  2018-05-24 20:34 ` [PATCHv3 2/9] nvme-pci: Fix queue freeze criteria " Keith Busch
                   ` (8 subsequent siblings)
  9 siblings, 1 reply; 45+ messages in thread
From: Keith Busch @ 2018-05-24 20:34 UTC (permalink / raw)


This patch fixes races that occur with simultaneous controller
resets by synchronizing request queues prior to initializing the
controller. Without this, a timeout thread may attempt disabling a
controller at the same time as we're trying to enable it.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/core.c | 21 +++++++++++++++++++--
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  | 11 +++++++----
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index dc8aa2c1c22a..33034e469bbc 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3469,6 +3469,12 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 
+static void nvme_start_queue(struct nvme_ns *ns)
+{
+	blk_mq_unquiesce_queue(ns->queue);
+	blk_mq_kick_requeue_list(ns->queue);
+}
+
 /**
  * nvme_kill_queues(): Ends all namespace queues
  * @ctrl: the dead controller that needs to end
@@ -3497,7 +3503,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 		blk_set_queue_dying(ns->queue);
 
 		/* Forcibly unquiesce queues to avoid blocking dispatch */
-		blk_mq_unquiesce_queue(ns->queue);
+		nvme_start_queue(ns);
 	}
 	up_read(&ctrl->namespaces_rwsem);
 }
@@ -3567,11 +3573,22 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_unquiesce_queue(ns->queue);
+		nvme_start_queue(ns);
 	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_sync_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
+
 int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
 {
 	if (!ctrl->ops->reinit_request)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ec6e4acc4d48..4f43918cd902 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -403,6 +403,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		volatile union nvme_result *res);
 
+void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 917e1714f7d9..9da28e10d942 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2317,11 +2317,14 @@ static void nvme_reset_work(struct work_struct *work)
 		goto out;
 
 	/*
-	 * If we're called to reset a live controller first shut it down before
-	 * moving on.
+	 * Ensure there are no timeout work in progress prior to forcefully
+	 * disabling the queue. There is no harm in disabling the device even
+	 * when it was already disabled, as this will forcefully reclaim any
+	 * IOs that are stuck due to blk-mq's timeout handling that prevents
+	 * timed out requests from completing.
 	 */
-	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
-		nvme_dev_disable(dev, false);
+	nvme_sync_queues(&dev->ctrl);
+	nvme_dev_disable(dev, false);
 
 	/*
 	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCHv3 2/9] nvme-pci: Fix queue freeze criteria on reset
  2018-05-24 20:34 [PATCHv3 0/9] nvme timeout fixes, v3 Keith Busch
  2018-05-24 20:34 ` [PATCHv3 1/9] nvme: Sync request queues on reset Keith Busch
@ 2018-05-24 20:34 ` Keith Busch
  2018-05-25 12:43   ` Christoph Hellwig
  2018-05-30 23:36   ` Sagi Grimberg
  2018-05-24 20:34 ` [PATCHv3 3/9] nvme: Move all IO out of controller reset Keith Busch
                   ` (7 subsequent siblings)
  9 siblings, 2 replies; 45+ messages in thread
From: Keith Busch @ 2018-05-24 20:34 UTC (permalink / raw)


The driver had been relying on the pci_dev to maintain the state of
the pci device to know when starting a freeze would be appropriate. The
blktests block/011 however shows us that users may alter the state of
pci_dev out from under drivers and break the criteria we had been using.

This patch uses the private nvme controller struct to track the
enabling/disabling state. Since we're relying on that now, the reset
will unconditionally disable the admin queue on reset. This was already
being done anyway during admin bring-up, and there is no harm in doing it a
second time. Disabling the controller just ensures the controller enable
fields are toggled appropriately, so this patch moves that outside the
'dead' controller condition.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/pci.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9da28e10d942..bc2e377e029d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2206,39 +2206,38 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	mutex_lock(&dev->shutdown_lock);
-	if (pci_is_enabled(pdev)) {
+	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE &&
+	    (dev->ctrl.state == NVME_CTRL_LIVE ||
+	     dev->ctrl.state == NVME_CTRL_RESETTING)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-		if (dev->ctrl.state == NVME_CTRL_LIVE ||
-		    dev->ctrl.state == NVME_CTRL_RESETTING)
-			nvme_start_freeze(&dev->ctrl);
+		nvme_start_freeze(&dev->ctrl);
 		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
-			pdev->error_state  != pci_channel_io_normal);
+			pci_channel_offline(pdev) || !pci_is_enabled(pdev));
 	}
 
 	/*
 	 * Give the controller a chance to complete all entered requests if
 	 * doing a safe shutdown.
 	 */
-	if (!dead) {
-		if (shutdown)
-			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
-	}
+	if (!dead && shutdown)
+		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
 
 	nvme_stop_queues(&dev->ctrl);
 
-	if (!dead && dev->ctrl.queue_count > 0) {
+	if (!dead) {
 		/*
 		 * If the controller is still alive tell it to stop using the
-		 * host memory buffer.  In theory the shutdown / reset should
+		 * host memory buffer. In theory the shutdown / reset should
 		 * make sure that it doesn't access the host memoery anymore,
 		 * but I'd rather be safe than sorry..
 		 */
 		if (dev->host_mem_descs)
 			nvme_set_host_mem(dev, 0);
 		nvme_disable_io_queues(dev);
-		nvme_disable_admin_queue(dev, shutdown);
 	}
+	if (dev->ctrl.queue_count > 0)
+		nvme_disable_admin_queue(dev, shutdown);
 	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
 		nvme_suspend_queue(&dev->queues[i]);
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCHv3 3/9] nvme: Move all IO out of controller reset
  2018-05-24 20:34 [PATCHv3 0/9] nvme timeout fixes, v3 Keith Busch
  2018-05-24 20:34 ` [PATCHv3 1/9] nvme: Sync request queues on reset Keith Busch
  2018-05-24 20:34 ` [PATCHv3 2/9] nvme-pci: Fix queue freeze criteria " Keith Busch
@ 2018-05-24 20:34 ` Keith Busch
  2018-05-25 13:00   ` Christoph Hellwig
  2018-05-24 20:34 ` [PATCHv3 4/9] nvme-pci: Rate limit the nvme timeout warnings Keith Busch
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 45+ messages in thread
From: Keith Busch @ 2018-05-24 20:34 UTC (permalink / raw)


IO may be retryable, so don't wait for them in the reset path. These
commands may trigger a reset if that IO expires without a completion,
placing it on the requeue list, so waiting for these would deadlock the
reset handler.

To fix the theoretical deadlock, this patch unblocks IO submission from
the reset_work as before, but moves the waiting to the scan_work, where
waiting for IO is safe so that reset_work may proceed to completion. Since
unfreezing the queues now happens in the controller LIVE state, nvme_dev
now tracks if the queues were frozen now to prevent incorrect freeze
depths.

This patch is also renaming the function 'nvme_dev_add' to a
more appropriate name that describes what it's actually doing:
nvme_alloc_io_tags.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/core.c |  2 ++
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  | 46 +++++++++++++++++++++++++++++++---------------
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 33034e469bbc..0f0eb85c64b8 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3175,6 +3175,8 @@ static void nvme_scan_work(struct work_struct *work)
 	struct nvme_id_ctrl *id;
 	unsigned nn;
 
+	if (ctrl->ops->update_hw_ctx)
+		ctrl->ops->update_hw_ctx(ctrl);
 	if (ctrl->state != NVME_CTRL_LIVE)
 		return;
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 4f43918cd902..df4d634e0efd 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -316,6 +316,7 @@ struct nvme_ctrl_ops {
 	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
 	int (*reinit_request)(void *data, struct request *rq);
 	void (*stop_ctrl)(struct nvme_ctrl *ctrl);
+	void (*update_hw_ctx)(struct nvme_ctrl *ctrl);
 };
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index bc2e377e029d..243534139df7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -99,6 +99,7 @@ struct nvme_dev {
 	u32 cmbloc;
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
+	bool queues_froze;
 
 	/* shadow doorbell buffer support: */
 	u32 *dbbuf_dbs;
@@ -2075,10 +2076,32 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
 	}
 }
 
+static void nvme_pci_update_hw_ctx(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+	bool unfreeze;
+
+	mutex_lock(&dev->shutdown_lock);
+	unfreeze = dev->queues_froze;
+	mutex_unlock(&dev->shutdown_lock);
+
+	if (!unfreeze)
+		return;
+
+	nvme_wait_freeze(&dev->ctrl);
+	blk_mq_update_nr_hw_queues(ctrl->tagset, dev->online_queues - 1);
+	nvme_free_queues(dev, dev->online_queues);
+	nvme_unfreeze(&dev->ctrl);
+
+	mutex_lock(&dev->shutdown_lock);
+	dev->queues_froze = false;
+	mutex_unlock(&dev->shutdown_lock);
+}
+
 /*
  * return error value only when tagset allocation failed
  */
-static int nvme_dev_add(struct nvme_dev *dev)
+static int nvme_alloc_io_tags(struct nvme_dev *dev)
 {
 	int ret;
 
@@ -2106,13 +2129,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 		dev->ctrl.tagset = &dev->tagset;
 
 		nvme_dbbuf_set(dev);
-	} else {
-		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
-
-		/* Free previously allocated queues that are no longer usable */
-		nvme_free_queues(dev, dev->online_queues);
 	}
-
 	return 0;
 }
 
@@ -2211,7 +2228,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	     dev->ctrl.state == NVME_CTRL_RESETTING)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-		nvme_start_freeze(&dev->ctrl);
+		if (!dev->queues_froze)	{
+			nvme_start_freeze(&dev->ctrl);
+			dev->queues_froze = true;
+		}
 		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
 			pci_channel_offline(pdev) || !pci_is_enabled(pdev));
 	}
@@ -2388,13 +2408,8 @@ static void nvme_reset_work(struct work_struct *work)
 		nvme_kill_queues(&dev->ctrl);
 		nvme_remove_namespaces(&dev->ctrl);
 		new_state = NVME_CTRL_ADMIN_ONLY;
-	} else {
-		nvme_start_queues(&dev->ctrl);
-		nvme_wait_freeze(&dev->ctrl);
-		/* hit this only when allocate tagset fails */
-		if (nvme_dev_add(dev))
-			new_state = NVME_CTRL_ADMIN_ONLY;
-		nvme_unfreeze(&dev->ctrl);
+	} else if (nvme_alloc_io_tags(dev)) {
+		new_state = NVME_CTRL_ADMIN_ONLY;
 	}
 
 	/*
@@ -2459,6 +2474,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.reg_read64		= nvme_pci_reg_read64,
 	.free_ctrl		= nvme_pci_free_ctrl,
 	.submit_async_event	= nvme_pci_submit_async_event,
+	.update_hw_ctx		= nvme_pci_update_hw_ctx,
 	.get_address		= nvme_pci_get_address,
 };
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCHv3 4/9] nvme-pci: Rate limit the nvme timeout warnings
  2018-05-24 20:34 [PATCHv3 0/9] nvme timeout fixes, v3 Keith Busch
                   ` (2 preceding siblings ...)
  2018-05-24 20:34 ` [PATCHv3 3/9] nvme: Move all IO out of controller reset Keith Busch
@ 2018-05-24 20:34 ` Keith Busch
  2018-05-25 13:01   ` Christoph Hellwig
  2018-05-30  6:06   ` Christoph Hellwig
  2018-05-24 20:34 ` [PATCHv3 5/9] nvme-pci: End IO requests in CONNECTING state Keith Busch
                   ` (5 subsequent siblings)
  9 siblings, 2 replies; 45+ messages in thread
From: Keith Busch @ 2018-05-24 20:34 UTC (permalink / raw)


The block layer's timeout handling currently prevents drivers from
completing commands outside the timeout callback once blk-mq decides
they've expired. If a device breaks, this could potentially create many
thousands of timed out commands. There's nothing of value to be gleaned
from observing each of those messages, so this patch adds a rate limit
on them.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 243534139df7..6be88f662e7d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1228,7 +1228,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	switch (dev->ctrl.state) {
 	case NVME_CTRL_CONNECTING:
 	case NVME_CTRL_RESETTING:
-		dev_warn(dev->ctrl.device,
+		dev_warn_ratelimited(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
 		nvme_dev_disable(dev, false);
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCHv3 5/9] nvme-pci: End IO requests in CONNECTING state
  2018-05-24 20:34 [PATCHv3 0/9] nvme timeout fixes, v3 Keith Busch
                   ` (3 preceding siblings ...)
  2018-05-24 20:34 ` [PATCHv3 4/9] nvme-pci: Rate limit the nvme timeout warnings Keith Busch
@ 2018-05-24 20:34 ` Keith Busch
  2018-05-24 20:47   ` Christoph Hellwig
  2018-05-24 20:34 ` [PATCHv3 6/9] nvme-pci: Unquiesce dead controller queues Keith Busch
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 45+ messages in thread
From: Keith Busch @ 2018-05-24 20:34 UTC (permalink / raw)


IO is always quiesced in the CONNECTING state, so any timeout for an
IO command had already been completed.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/pci.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6be88f662e7d..54e22b964385 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1227,6 +1227,14 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 */
 	switch (dev->ctrl.state) {
 	case NVME_CTRL_CONNECTING:
+		/*
+		 * IO is never dispatched from the connecting state. If an IO
+		 * queue timed out here, the block layer missed the completion
+		 * the driver already requested, so return handled.
+		 */
+		if (nvmeq->qid)
+			return BLK_EH_HANDLED;
+		/* FALLTHRU */
 	case NVME_CTRL_RESETTING:
 		dev_warn_ratelimited(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCHv3 6/9] nvme-pci: Unquiesce dead controller queues
  2018-05-24 20:34 [PATCHv3 0/9] nvme timeout fixes, v3 Keith Busch
                   ` (4 preceding siblings ...)
  2018-05-24 20:34 ` [PATCHv3 5/9] nvme-pci: End IO requests in CONNECTING state Keith Busch
@ 2018-05-24 20:34 ` Keith Busch
  2018-05-25 13:03   ` Christoph Hellwig
  2018-05-24 20:34 ` [PATCHv3 7/9] nvme-pci: Attempt reset retry for IO failures Keith Busch
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 45+ messages in thread
From: Keith Busch @ 2018-05-24 20:34 UTC (permalink / raw)


This patch ensures the nvme namespace request queues are not quiesced
on a surprise removal. It's possible the queues were previously killed
in a failed reset, so the queues need to be unquiesced to ensure all
requests are flushed to completion.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 54e22b964385..40863ed759de 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2637,7 +2637,7 @@ static void nvme_remove(struct pci_dev *pdev)
 
 	if (!pci_device_is_present(pdev)) {
 		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
-		nvme_dev_disable(dev, false);
+		nvme_dev_disable(dev, true);
 	}
 
 	flush_work(&dev->ctrl.reset_work);
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCHv3 7/9] nvme-pci: Attempt reset retry for IO failures
  2018-05-24 20:34 [PATCHv3 0/9] nvme timeout fixes, v3 Keith Busch
                   ` (5 preceding siblings ...)
  2018-05-24 20:34 ` [PATCHv3 6/9] nvme-pci: Unquiesce dead controller queues Keith Busch
@ 2018-05-24 20:34 ` Keith Busch
  2018-05-25 13:04   ` Christoph Hellwig
  2018-05-30 23:40   ` Sagi Grimberg
  2018-05-24 20:34 ` [PATCHv3 8/9] nvme-pci: Queue creation error handling Keith Busch
                   ` (2 subsequent siblings)
  9 siblings, 2 replies; 45+ messages in thread
From: Keith Busch @ 2018-05-24 20:34 UTC (permalink / raw)


If the reset failed due to a non-fatal error, this patch will attempt
to reset the controller again, with a maximum of 4 attempts.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/pci.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 40863ed759de..7c8076411dbc 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -38,6 +38,8 @@
 
 #define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
 
+#define MAX_RESET_FAILURES 4
+
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
@@ -2324,7 +2326,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 
 static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
 {
-	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
+	dev_warn(dev->ctrl.device, "Removing after reset failure status:%d\n", status);
 
 	nvme_get_ctrl(&dev->ctrl);
 	nvme_dev_disable(dev, false);
@@ -2337,8 +2339,9 @@ static void nvme_reset_work(struct work_struct *work)
 	struct nvme_dev *dev =
 		container_of(work, struct nvme_dev, ctrl.reset_work);
 	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
-	int result = -ENODEV;
-	enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
+	int result = -ENODEV, reset_failures = 0;
+	enum nvme_ctrl_state new_state;
+
 
 	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
 		goto out;
@@ -2363,6 +2366,8 @@ static void nvme_reset_work(struct work_struct *work)
 		goto out;
 	}
 
+ retry:
+	new_state = NVME_CTRL_LIVE;
 	result = nvme_pci_enable(dev);
 	if (result)
 		goto out;
@@ -2427,6 +2432,7 @@ static void nvme_reset_work(struct work_struct *work)
 	if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
 		dev_warn(dev->ctrl.device,
 			"failed to mark controller state %d\n", new_state);
+		result = -ENODEV;
 		goto out;
 	}
 
@@ -2434,6 +2440,22 @@ static void nvme_reset_work(struct work_struct *work)
 	return;
 
  out:
+	reset_failures++;
+
+	/* IO and Interrupted Call may indicate a retryable error */
+	switch (result) {
+	case -EIO:
+	case -EINTR:
+		if (reset_failures < MAX_RESET_FAILURES &&
+		    dev->ctrl.state == NVME_CTRL_CONNECTING) {
+			dev_warn(dev->ctrl.device,
+				 "Reset failure status:%d, failures:%d\n",
+				 result, reset_failures);
+			nvme_dev_disable(dev, false);
+			goto retry;
+		}
+		break;
+	}
 	nvme_remove_dead_ctrl(dev, result);
 }
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCHv3 8/9] nvme-pci: Queue creation error handling
  2018-05-24 20:34 [PATCHv3 0/9] nvme timeout fixes, v3 Keith Busch
                   ` (6 preceding siblings ...)
  2018-05-24 20:34 ` [PATCHv3 7/9] nvme-pci: Attempt reset retry for IO failures Keith Busch
@ 2018-05-24 20:34 ` Keith Busch
  2018-05-25 12:35   ` Christoph Hellwig
  2018-05-30 23:37   ` Sagi Grimberg
  2018-05-24 20:35 ` [PATCHv3 9/9] nvme-pci: Don't wait for HMB completion on shutdown Keith Busch
  2018-07-13  0:48 ` [PATCHv3 0/9] nvme timeout fixes, v3 Ming Lei
  9 siblings, 2 replies; 45+ messages in thread
From: Keith Busch @ 2018-05-24 20:34 UTC (permalink / raw)


This patch sets the nvmeq's cq_vector only after the cq was successfully
created. This way a device reset doesn't mistakenly believe that the
vector is allocated. In case a reset does occur during queue creation,
this patch will return status immediately instead of trying to unwind
the created queues since the device won't be able to delete queues anyway.

This patch will also handle device reported failures correctly. These
errors are reported as positive nvme status codes, which were previously
not handled at all.

Based-on-patch-by: Jianchao Wang <jianchao.w.wang at oracle.com>
Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/pci.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7c8076411dbc..e12b4ee91254 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1079,7 +1079,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 }
 
 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
-						struct nvme_queue *nvmeq)
+			    struct nvme_queue *nvmeq, u16 vector)
 {
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
@@ -1094,7 +1094,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	c.create_cq.cqid = cpu_to_le16(qid);
 	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
 	c.create_cq.cq_flags = cpu_to_le16(flags);
-	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
+	c.create_cq.irq_vector = cpu_to_le16(vector);
 
 	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
 }
@@ -1477,6 +1477,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 {
 	struct nvme_dev *dev = nvmeq->dev;
 	int result;
+	u16 vector;
 
 	if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
 		unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth),
@@ -1489,16 +1490,20 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 	 * A queue's vector matches the queue identifier unless the controller
 	 * has only one vector available.
 	 */
-	nvmeq->cq_vector = dev->num_vecs == 1 ? 0 : qid;
-	result = adapter_alloc_cq(dev, qid, nvmeq);
-	if (result < 0)
-		goto release_vector;
+	vector = dev->num_vecs == 1 ? 0 : qid;
+	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
+	if (result)
+		return result;
 
 	result = adapter_alloc_sq(dev, qid, nvmeq);
 	if (result < 0)
+		return result;
+	else if (result)
 		goto release_cq;
 
 	nvme_init_queue(nvmeq, qid);
+
+	nvmeq->cq_vector = vector;
 	result = queue_request_irq(nvmeq);
 	if (result < 0)
 		goto release_sq;
@@ -1506,12 +1511,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 	return result;
 
  release_sq:
+	nvmeq->cq_vector = -1;
 	dev->online_queues--;
 	adapter_delete_sq(dev, qid);
  release_cq:
 	adapter_delete_cq(dev, qid);
- release_vector:
-	nvmeq->cq_vector = -1;
 	return result;
 }
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCHv3 9/9] nvme-pci: Don't wait for HMB completion on shutdown
  2018-05-24 20:34 [PATCHv3 0/9] nvme timeout fixes, v3 Keith Busch
                   ` (7 preceding siblings ...)
  2018-05-24 20:34 ` [PATCHv3 8/9] nvme-pci: Queue creation error handling Keith Busch
@ 2018-05-24 20:35 ` Keith Busch
  2018-05-24 20:45   ` Christoph Hellwig
  2018-07-13  0:48 ` [PATCHv3 0/9] nvme timeout fixes, v3 Ming Lei
  9 siblings, 1 reply; 45+ messages in thread
From: Keith Busch @ 2018-05-24 20:35 UTC (permalink / raw)



An nvme controller reset can't depend on the timeout handling to
complete timed out commands since we're already trying to disable the
controller. The HMB disabling is the only command in this path that was
not handling its own timeout, so this patch fixes that by putting a time
limit on how long it will wait for completion.

Based-on-patch-by: Ming Lei <ming.lei at redhat.com>
Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/pci.c | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e12b4ee91254..83fc5bfe20e8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1764,9 +1764,25 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
 	}
 }
 
-static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
+static void nvme_set_host_mem_end_io(struct request *rq, blk_status_t sts)
+{
+	struct completion *wait = rq->end_io_data;
+
+	rq->end_io_data = NULL;
+	blk_mq_free_request(rq);
+	complete(wait);
+}
+
+/*
+ * Use 'wait' when sending this command from a context where blocking on its
+ * completion would stall the reset handler, as required for device shutdown.
+ */
+static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits,
+			     struct completion *wait)
 {
 	u64 dma_addr = dev->host_mem_descs_dma;
+	struct request_queue *q = dev->ctrl.admin_q;
+	struct request *req;
 	struct nvme_command c;
 	int ret;
 
@@ -1780,7 +1796,19 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
 	c.features.dword14	= cpu_to_le32(upper_32_bits(dma_addr));
 	c.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);
 
-	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
+	if (!wait) {
+		ret = nvme_submit_sync_cmd(q, &c, NULL, 0);
+	} else {
+		req = nvme_alloc_request(q, &c, 0, NVME_QID_ANY);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+		req->timeout = ADMIN_TIMEOUT;
+		req->end_io_data = wait;
+		blk_execute_rq_nowait(q, NULL, req, false,
+				      nvme_set_host_mem_end_io);
+		ret = wait_for_completion_io_timeout(wait, ADMIN_TIMEOUT);
+	}
+
 	if (ret) {
 		dev_warn(dev->ctrl.device,
 			 "failed to set host mem (err %d, flags %#x).\n",
@@ -1934,7 +1962,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 			dev->host_mem_size >> ilog2(SZ_1M));
 	}
 
-	ret = nvme_set_host_mem(dev, enable_bits);
+	ret = nvme_set_host_mem(dev, enable_bits, NULL);
 	if (ret)
 		nvme_free_host_mem(dev);
 	return ret;
@@ -2235,6 +2263,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	int i;
 	bool dead = true;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
+	DECLARE_COMPLETION_ONSTACK(hmb_wait);
 
 	mutex_lock(&dev->shutdown_lock);
 	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE &&
@@ -2267,7 +2296,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 		 * but I'd rather be safe than sorry..
 		 */
 		if (dev->host_mem_descs)
-			nvme_set_host_mem(dev, 0);
+			nvme_set_host_mem(dev, 0, &hmb_wait);
 		nvme_disable_io_queues(dev);
 	}
 	if (dev->ctrl.queue_count > 0)
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCHv3 9/9] nvme-pci: Don't wait for HMB completion on shutdown
  2018-05-24 20:35 ` [PATCHv3 9/9] nvme-pci: Don't wait for HMB completion on shutdown Keith Busch
@ 2018-05-24 20:45   ` Christoph Hellwig
  2018-05-24 21:15     ` Keith Busch
  0 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-24 20:45 UTC (permalink / raw)


On Thu, May 24, 2018@02:35:00PM -0600, Keith Busch wrote:
> 
> An nvme controller reset can't depend on the timeout handling to
> complete timed out commands since we're already trying to disable the
> controller. The HMB disabling is the only command in this path that was
> not handling its own timeout, so this patch fixes that by putting a time
> limit on how long it will wait for completion.

What does 'did not handle its own timeout' mean?

> +static void nvme_set_host_mem_end_io(struct request *rq, blk_status_t sts)
> +{
> +	struct completion *wait = rq->end_io_data;
> +
> +	rq->end_io_data = NULL;
> +	blk_mq_free_request(rq);
> +	complete(wait);
> +}
> +
> +/*
> + * Use 'wait' when sending this command from a context where blocking on its
> + * completion would stall the reset handler, as required for device shutdown.
> + */
> +static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits,
> +			     struct completion *wait)
>  {
>  	u64 dma_addr = dev->host_mem_descs_dma;
> +	struct request_queue *q = dev->ctrl.admin_q;
> +	struct request *req;
>  	struct nvme_command c;
>  	int ret;
>  
> @@ -1780,7 +1796,19 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
>  	c.features.dword14	= cpu_to_le32(upper_32_bits(dma_addr));
>  	c.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);
>  
> -	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
> +	if (!wait) {
> +		ret = nvme_submit_sync_cmd(q, &c, NULL, 0);
> +	} else {
> +		req = nvme_alloc_request(q, &c, 0, NVME_QID_ANY);
> +		if (IS_ERR(req))
> +			return PTR_ERR(req);
> +		req->timeout = ADMIN_TIMEOUT;
> +		req->end_io_data = wait;
> +		blk_execute_rq_nowait(q, NULL, req, false,
> +				      nvme_set_host_mem_end_io);
> +		ret = wait_for_completion_io_timeout(wait, ADMIN_TIMEOUT);
> +	}
> +

None of this is intimately related to the HMB code.  If we really have
to we could handle this either in __nvme_submit_sync_cmd or all the
way down in blk_execute_rq, but why doesn't the block layer timeout
code kill the command once we've reached the timeout?

Also if we really need to reset the controller submitting any command
just doesn't seem very helpful.  We might as well just skip trying
to disable the HMB, as the controller needs to come up in a clean
state anyway.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 5/9] nvme-pci: End IO requests in CONNECTING state
  2018-05-24 20:34 ` [PATCHv3 5/9] nvme-pci: End IO requests in CONNECTING state Keith Busch
@ 2018-05-24 20:47   ` Christoph Hellwig
  2018-05-24 21:03     ` Keith Busch
  0 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-24 20:47 UTC (permalink / raw)


On Thu, May 24, 2018@02:34:56PM -0600, Keith Busch wrote:
> IO is always quiesced in the CONNECTING state, so any timeout for an
> IO command had already been completed.
> 
> Signed-off-by: Keith Busch <keith.busch at intel.com>
> ---
>  drivers/nvme/host/pci.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6be88f662e7d..54e22b964385 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1227,6 +1227,14 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>  	 */
>  	switch (dev->ctrl.state) {
>  	case NVME_CTRL_CONNECTING:
> +		/*
> +		 * IO is never dispatched from the connecting state. If an IO
> +		 * queue timed out here, the block layer missed the completion
> +		 * the driver already requested, so return handled.
> +		 */
> +		if (nvmeq->qid)
> +			return BLK_EH_HANDLED;

How can we hit this case?  This just looks a lot like papering
over the real issue..

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 5/9] nvme-pci: End IO requests in CONNECTING state
  2018-05-24 20:47   ` Christoph Hellwig
@ 2018-05-24 21:03     ` Keith Busch
  2018-05-25 12:31       ` Christoph Hellwig
  0 siblings, 1 reply; 45+ messages in thread
From: Keith Busch @ 2018-05-24 21:03 UTC (permalink / raw)


On Thu, May 24, 2018@10:47:22PM +0200, Christoph Hellwig wrote:
> On Thu, May 24, 2018@02:34:56PM -0600, Keith Busch wrote:
> > IO is always quiesced in the CONNECTING state, so any timeout for an
> > IO command had already been completed.
> > 
> > Signed-off-by: Keith Busch <keith.busch at intel.com>
> > ---
> >  drivers/nvme/host/pci.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> > 
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index 6be88f662e7d..54e22b964385 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -1227,6 +1227,14 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
> >  	 */
> >  	switch (dev->ctrl.state) {
> >  	case NVME_CTRL_CONNECTING:
> > +		/*
> > +		 * IO is never dispatched from the connecting state. If an IO
> > +		 * queue timed out here, the block layer missed the completion
> > +		 * the driver already requested, so return handled.
> > +		 */
> > +		if (nvmeq->qid)
> > +			return BLK_EH_HANDLED;
> 
> How can we hit this case?  This just looks a lot like papering
> over the real issue..

It'll most likely never really happen. The conditions are pretty obscure
timeout handling cases mixed with other errors that could theoretically
hit it, requiring two or more namespaces.

The real fix, IMO, is what the blk-mq timeout enhancements are working
toward, so I've no problem dropping this patch.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 9/9] nvme-pci: Don't wait for HMB completion on shutdown
  2018-05-24 20:45   ` Christoph Hellwig
@ 2018-05-24 21:15     ` Keith Busch
  2018-05-25  3:10       ` jianchao.wang
  2018-05-25 12:36       ` Christoph Hellwig
  0 siblings, 2 replies; 45+ messages in thread
From: Keith Busch @ 2018-05-24 21:15 UTC (permalink / raw)


On Thu, May 24, 2018@10:45:06PM +0200, Christoph Hellwig wrote:
> None of this is intimately related to the HMB code.  If we really have
> to we could handle this either in __nvme_submit_sync_cmd or all the
> way down in blk_execute_rq, 

Yeah, that's a better idea.

> but why doesn't the block layer timeout
> code kill the command once we've reached the timeout?

The shutdown_lock serializes resets, and we can't release a command until
the pci device is disabled under that lock. That requirement is mainly
for IO, as we can't assume a controller won't access transfer buffers
just because the command timed out.
 
> Also if we really need to reset the controller submitting any command
> just doesn't seem very helpful.  We might as well just skip trying
> to disable the HMB, as the controller needs to come up in a clean
> state anyway.

That's also fine with me.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 9/9] nvme-pci: Don't wait for HMB completion on shutdown
  2018-05-24 21:15     ` Keith Busch
@ 2018-05-25  3:10       ` jianchao.wang
  2018-05-25 15:09         ` Keith Busch
  2018-05-25 12:36       ` Christoph Hellwig
  1 sibling, 1 reply; 45+ messages in thread
From: jianchao.wang @ 2018-05-25  3:10 UTC (permalink / raw)


Hi Keith

On 05/25/2018 05:15 AM, Keith Busch wrote:
>> Also if we really need to reset the controller submitting any command
>> just doesn't seem very helpful.  We might as well just skip trying
>> to disable the HMB, as the controller needs to come up in a clean
>> state anyway.
> That's also fine with me.
> 

Does it mean we just need to disable the controller by transitioning CC.EN from '1' to '0',
without sending any admin commands, including deleting the cq/sq?
The specification indeed says that when a Controller Reset occurs (CC.EN transitions from
'1' to '0'):
 - The controller stops processing any outstanding Admin or I/O commands.
 - All I/O Submission Queues are deleted.
 - All I/O Completion Queues are deleted.
 - The controller is brought to an Idle state. When this is complete, CSTS.RDY is cleared to '0'.
 - The Admin Queue registers (AQA, ASQ, or ACQ) are not reset as part of a controller reset. All
other controller registers defined in section 3 and internal controller state are reset.

Then, for the timeout case at least: in the LIVE state, nvme_timeout could hand everything over
to reset_work and return BLK_EH_RESET_TIMER; when reset_work is ongoing, it could just disable
the controller by clearing CC.EN, disable the related pci things, and return BLK_EH_HANDLED to
complete the admin commands and wake up the reset_work (certainly with your patch to move
update_hw_nr to scan_work). nvme_timeout wouldn't need to invoke nvme_dev_disable any more and
life would be simpler :).
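Roughly like the below (an untested sketch only, to show the idea; nvme_disable_ctrl_directly()
is a hypothetical helper here that clears CC.EN and disables the pci device, it does not exist
today):

static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct nvme_dev *dev = iod->nvmeq->dev;

        switch (dev->ctrl.state) {
        case NVME_CTRL_LIVE:
                /* hand recovery over to reset_work, keep the request pending */
                nvme_reset_ctrl(&dev->ctrl);
                return BLK_EH_RESET_TIMER;
        case NVME_CTRL_RESETTING:
        case NVME_CTRL_CONNECTING:
                /*
                 * reset_work already owns the controller and may be waiting
                 * on this admin command: clear CC.EN, disable the pci device
                 * and complete the request so reset_work can make progress.
                 */
                nvme_disable_ctrl_directly(dev);        /* hypothetical helper */
                return BLK_EH_HANDLED;
        default:
                return BLK_EH_HANDLED;
        }
}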

Thanks
jianchao

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 5/9] nvme-pci: End IO requests in CONNECTING state
  2018-05-24 21:03     ` Keith Busch
@ 2018-05-25 12:31       ` Christoph Hellwig
  0 siblings, 0 replies; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-25 12:31 UTC (permalink / raw)


On Thu, May 24, 2018@03:03:13PM -0600, Keith Busch wrote:
> > How can we hit this case?  This just looks a lot like papering
> > over the real issue..
> 
> It'll most likely never really happen. The conditions are pretty obscure
> timeout handling cases mixed with other errors that could theoretically
> hit it, requiring two or more namespaces.
> 
> The real fix, IMO, is what the blk-mq timeout enhancements are working
> toward, so I've no problem dropping this patch.

Ok, lets fix the real issue then.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 8/9] nvme-pci: Queue creation error handling
  2018-05-24 20:34 ` [PATCHv3 8/9] nvme-pci: Queue creation error handling Keith Busch
@ 2018-05-25 12:35   ` Christoph Hellwig
  2018-06-05 16:28     ` Keith Busch
  2018-05-30 23:37   ` Sagi Grimberg
  1 sibling, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-25 12:35 UTC (permalink / raw)


On Thu, May 24, 2018@02:34:59PM -0600, Keith Busch wrote:
> This patch sets the nvmeq's cq_vector only after the cq was successfully
> created. This way a device reset doesn't mistakenly believe that the
> vector is allocated. In case a reset does occur during queue creation,
> this patch will return status immediately instead of trying to unwind
> the created queues since the device won't be able to delete queues anyway.
> 
> This patch will also handle device reported failures correctly. These
> errors are reported as positive nvme status codes, which were previously
> not handled at all.

I applied the original patch this morning with some minor fixup.  Can
you send a relative patch to the nvme-4.18-2 tree?

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 9/9] nvme-pci: Don't wait for HMB completion on shutdown
  2018-05-24 21:15     ` Keith Busch
  2018-05-25  3:10       ` jianchao.wang
@ 2018-05-25 12:36       ` Christoph Hellwig
  1 sibling, 0 replies; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-25 12:36 UTC (permalink / raw)


On Thu, May 24, 2018@03:15:39PM -0600, Keith Busch wrote:
> > Also if we really need to reset the controller submitting any command
> > just doesn't seem very helpful.  We might as well just skip trying
> > to disable the HMB, as the controller needs to come up in a clean
> > state anyway.
> 
> That's also fine with me.

So let's go for the most simple option..
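I.e. roughly the below in nvme_dev_disable() (a sketch only, not a tested patch): keep the HMB
disable for the clean shutdown case and skip it on reset, since the controller has to be
reinitialized anyway:

        if (!dead) {
                /*
                 * Only tell the controller to stop using the HMB on a clean
                 * shutdown; after a reset it is reinitialized and must come
                 * up in a clean state anyway.
                 */
                if (shutdown && dev->host_mem_descs)
                        nvme_set_host_mem(dev, 0);
                nvme_disable_io_queues(dev);
        }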

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-24 20:34 ` [PATCHv3 1/9] nvme: Sync request queues on reset Keith Busch
@ 2018-05-25 12:42   ` Christoph Hellwig
  2018-05-25 14:22     ` Keith Busch
  0 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-25 12:42 UTC (permalink / raw)


On Thu, May 24, 2018@02:34:52PM -0600, Keith Busch wrote:
> This patch fixes races that occur with simultaneous controller
> resets

Wait..  How do we end up with simultaneous controller resets?  We
not only have the NVME_CTRL_RESETTING resetting state, but also
execute all resets from ctrl->reset_work, so they are implicitly
single threaded.

> by synchronizing request queues prior to initializing the
> controller. Without this, a timeout thread may attempt disabling a
> controller at the same time as we're trying to enable it.
> 
> Signed-off-by: Keith Busch <keith.busch at intel.com>
> ---
>  drivers/nvme/host/core.c | 21 +++++++++++++++++++--
>  drivers/nvme/host/nvme.h |  1 +
>  drivers/nvme/host/pci.c  | 11 +++++++----
>  3 files changed, 27 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index dc8aa2c1c22a..33034e469bbc 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3469,6 +3469,12 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>  }
>  EXPORT_SYMBOL_GPL(nvme_init_ctrl);
>  
> +static void nvme_start_queue(struct nvme_ns *ns)
> +{
> +	blk_mq_unquiesce_queue(ns->queue);
> +	blk_mq_kick_requeue_list(ns->queue);
> +}
> +
>  /**
>   * nvme_kill_queues(): Ends all namespace queues
>   * @ctrl: the dead controller that needs to end
> @@ -3497,7 +3503,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
>  		blk_set_queue_dying(ns->queue);
>  
>  		/* Forcibly unquiesce queues to avoid blocking dispatch */
> -		blk_mq_unquiesce_queue(ns->queue);
> +		nvme_start_queue(ns);
>  	}
>  	up_read(&ctrl->namespaces_rwsem);
>  }
> @@ -3567,11 +3573,22 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
>  
>  	down_read(&ctrl->namespaces_rwsem);
>  	list_for_each_entry(ns, &ctrl->namespaces, list)
> -		blk_mq_unquiesce_queue(ns->queue);
> +		nvme_start_queue(ns);
>  	up_read(&ctrl->namespaces_rwsem);

The whole kick the requeue list when starting queues bit seems like
it should be a separate patch.

>  }
>  EXPORT_SYMBOL_GPL(nvme_start_queues);
>  
> +void nvme_sync_queues(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_ns *ns;
> +
> +	down_read(&ctrl->namespaces_rwsem);
> +	list_for_each_entry(ns, &ctrl->namespaces, list)
> +		blk_sync_queue(ns->queue);
> +	up_read(&ctrl->namespaces_rwsem);
> +}
> +EXPORT_SYMBOL_GPL(nvme_sync_queues);
> +
>  int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
>  {
>  	if (!ctrl->ops->reinit_request)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index ec6e4acc4d48..4f43918cd902 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -403,6 +403,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
>  void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
>  		volatile union nvme_result *res);
>  
> +void nvme_sync_queues(struct nvme_ctrl *ctrl);
>  void nvme_stop_queues(struct nvme_ctrl *ctrl);
>  void nvme_start_queues(struct nvme_ctrl *ctrl);
>  void nvme_kill_queues(struct nvme_ctrl *ctrl);
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 917e1714f7d9..9da28e10d942 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2317,11 +2317,14 @@ static void nvme_reset_work(struct work_struct *work)
>  		goto out;
>  
>  	/*
> -	 * If we're called to reset a live controller first shut it down before
> -	 * moving on.
> +	 * Ensure there are no timeout work in progress prior to forcefully
> +	 * disabling the queue. There is no harm in disabling the device even
> +	 * when it was already disabled, as this will forcefully reclaim any
> +	 * IOs that are stuck due to blk-mq's timeout handling that prevents
> +	 * timed out requests from completing.
>  	 */
> -	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
> -		nvme_dev_disable(dev, false);
> +	nvme_sync_queues(&dev->ctrl);
> +	nvme_dev_disable(dev, false);

And this part also makes sense to me, but I don't really understand
how it relates to the commit message.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 2/9] nvme-pci: Fix queue freeze criteria on reset
  2018-05-24 20:34 ` [PATCHv3 2/9] nvme-pci: Fix queue freeze criteria " Keith Busch
@ 2018-05-25 12:43   ` Christoph Hellwig
  2018-05-30 23:36   ` Sagi Grimberg
  1 sibling, 0 replies; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-25 12:43 UTC (permalink / raw)


> +	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE &&

Please throw in a pair of parentheses here.

> +	    (dev->ctrl.state == NVME_CTRL_LIVE ||
> +	     dev->ctrl.state == NVME_CTRL_RESETTING)) {
>  		u32 csts = readl(dev->bar + NVME_REG_CSTS);
>  
> -		if (dev->ctrl.state == NVME_CTRL_LIVE ||
> -		    dev->ctrl.state == NVME_CTRL_RESETTING)
> -			nvme_start_freeze(&dev->ctrl);
> +		nvme_start_freeze(&dev->ctrl);

should we sample csts after starting the freeze as that might take some
time?

Otherwise this looks fine to me.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 3/9] nvme: Move all IO out of controller reset
  2018-05-24 20:34 ` [PATCHv3 3/9] nvme: Move all IO out of controller reset Keith Busch
@ 2018-05-25 13:00   ` Christoph Hellwig
  2018-05-25 14:41     ` Keith Busch
  0 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-25 13:00 UTC (permalink / raw)


On Thu, May 24, 2018@02:34:54PM -0600, Keith Busch wrote:
> IO may be retryable, so don't wait for them in the reset path.

Can't parse this.


> These
> commands may trigger a reset if that IO expires without a completion,

What are "these commands"?

> placing it on the requeue list, so waiting for these would deadlock the
> reset handler.
> 
> To fix the theoretical deadlock, this patch unblocks IO submission from

How did you find it if it is theoretical?

> the reset_work as before, but moves the waiting to the scan_work, where
> waiting for IO is safe so that reset_work may proceed to completion. Since
> unfreezing the queues now happens in the controller LIVE state, nvme_dev
> now tracks if the queues were frozen now to prevent incorrect freeze
> depths.
> 
> This patch is also renaming the function 'nvme_dev_add' to a
> more appropriate name that describes what it's actually doing:
> nvme_alloc_io_tags.

Can you split this out into a separate patch?

> 
> Signed-off-by: Keith Busch <keith.busch at intel.com>
> ---
>  drivers/nvme/host/core.c |  2 ++
>  drivers/nvme/host/nvme.h |  1 +
>  drivers/nvme/host/pci.c  | 46 +++++++++++++++++++++++++++++++---------------
>  3 files changed, 34 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 33034e469bbc..0f0eb85c64b8 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3175,6 +3175,8 @@ static void nvme_scan_work(struct work_struct *work)
>  	struct nvme_id_ctrl *id;
>  	unsigned nn;
>  
> +	if (ctrl->ops->update_hw_ctx)
> +		ctrl->ops->update_hw_ctx(ctrl);

nvme_scan_work gets kicked from all kinds of places including
ioctls and AERs. I don't think the code you added below should
be called from all of them.

> +static void nvme_pci_update_hw_ctx(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_dev *dev = to_nvme_dev(ctrl);
> +	bool unfreeze;
> +
> +	mutex_lock(&dev->shutdown_lock);
> +	unfreeze = dev->queues_froze;
> +	mutex_unlock(&dev->shutdown_lock);

No need to take a mutex here if you sample a single <= register
sized value.

> +	if (!unfreeze)
> +		return;

But this whole scheme stinks to me.  For one we are adding more ad-hoc
state outside the state machine, second it all seems very "ad-hoc".

> +
> +	nvme_wait_freeze(&dev->ctrl);
> +	blk_mq_update_nr_hw_queues(ctrl->tagset, dev->online_queues - 1);
> +	nvme_free_queues(dev, dev->online_queues);
> +	nvme_unfreeze(&dev->ctrl);
> +
> +	mutex_lock(&dev->shutdown_lock);
> +	dev->queues_froze = false;
> +	mutex_unlock(&dev->shutdown_lock);

Same here.  Simple READ_ONCE/WRITE_ONCE will give you the right
memory barriers with no need for the lock.

Also except for the nvme_free_queues this all is generic code,
so I think we want this in the core.

And I wonder where this would fit better than the scan work, but I
can't think of anything else but an entirely new work_struct, which
isn't all that great either.

> @@ -2211,7 +2228,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>  	     dev->ctrl.state == NVME_CTRL_RESETTING)) {
>  		u32 csts = readl(dev->bar + NVME_REG_CSTS);
>  
> -		nvme_start_freeze(&dev->ctrl);
> +		if (!dev->queues_froze)	{
> +			nvme_start_freeze(&dev->ctrl);
> +			dev->queues_froze = true;
> +		}

And this sounds like another indicator for a new FROZEN state.  Once
the ctrl already is frozen we really shouldn't even end up in here
anymore.
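
Something along these lines in nvme.h, say (illustrative only; the FROZEN value and its
placement are hypothetical):

enum nvme_ctrl_state {
        NVME_CTRL_NEW,
        NVME_CTRL_LIVE,
        NVME_CTRL_ADMIN_ONLY,
        NVME_CTRL_RESETTING,
        NVME_CTRL_CONNECTING,
        NVME_CTRL_FROZEN,       /* queues frozen by a reset, not yet thawed */
        NVME_CTRL_DELETING,
        NVME_CTRL_DEAD,
};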

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 4/9] nvme-pci: Rate limit the nvme timeout warnings
  2018-05-24 20:34 ` [PATCHv3 4/9] nvme-pci: Rate limit the nvme timeout warnings Keith Busch
@ 2018-05-25 13:01   ` Christoph Hellwig
  2018-05-30  6:06   ` Christoph Hellwig
  1 sibling, 0 replies; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-25 13:01 UTC (permalink / raw)


On Thu, May 24, 2018@02:34:55PM -0600, Keith Busch wrote:
> The block layer's timeout handling currently prevents drivers from
> completing commands outside the timeout callback once blk-mq decides
> they've expired. If a device breaks, this could potentially create many
> thousands of timed out commands. There's nothing of value to be gleaned
> from observing each of those messages, so this patch adds a rate limit
> on them.

Looks good,

Reviewed-by: Christoph Hellwig <hch at lst.de>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 6/9] nvme-pci: Unquiesce dead controller queues
  2018-05-24 20:34 ` [PATCHv3 6/9] nvme-pci: Unquiesce dead controller queues Keith Busch
@ 2018-05-25 13:03   ` Christoph Hellwig
  0 siblings, 0 replies; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-25 13:03 UTC (permalink / raw)


On Thu, May 24, 2018@02:34:57PM -0600, Keith Busch wrote:
> This patch ensures the nvme namespace request queues are not quiesced
> on a surprise removal. It's possible the queues were previously killed
> in a failed reset, so the queues need to be unquiesced to ensure all
> requests are flushed to completion.
> 
> Signed-off-by: Keith Busch <keith.busch at intel.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch at lst.de>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 7/9] nvme-pci: Attempt reset retry for IO failures
  2018-05-24 20:34 ` [PATCHv3 7/9] nvme-pci: Attempt reset retry for IO failures Keith Busch
@ 2018-05-25 13:04   ` Christoph Hellwig
  2018-05-25 14:25     ` Keith Busch
  2018-05-30 23:40   ` Sagi Grimberg
  1 sibling, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-25 13:04 UTC (permalink / raw)


On Thu, May 24, 2018@02:34:58PM -0600, Keith Busch wrote:
> If the reset failed due to a non-fatal error, this patch will attempt
> to reset the controller again, with a maximum of 4 attempts.

What kind of non-fatal errors do you see that this tries to handle?

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-25 12:42   ` Christoph Hellwig
@ 2018-05-25 14:22     ` Keith Busch
  2018-05-25 14:32       ` Christoph Hellwig
  0 siblings, 1 reply; 45+ messages in thread
From: Keith Busch @ 2018-05-25 14:22 UTC (permalink / raw)


On Fri, May 25, 2018@02:42:09PM +0200, Christoph Hellwig wrote:
> On Thu, May 24, 2018@02:34:52PM -0600, Keith Busch wrote:
> > This patch fixes races that occur with simultaneous controller
> > resets
> 
> Wait..  How do we end up with simultaneous controller resets?  We
> not only have the NVME_CTRL_RESETTING resetting state, but also
> execute all resets from ctrl->reset_work, so they are implicitly
> single threaded.

Right, the bring up is single threaded, but the NVMe Controller Level
Reset (CC.EN 1 -> 0) can happen through a timeout. This patch is really
just working with the way blk-mq's timeout handler claims requests
and prevents the driver from completing them. The reset_work operates
under the assumption that there are no outstanding commands after
nvme_dev_disable, so this patch just ensures that's the case.
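
In code terms, the ordering patch 1 establishes at the top of reset_work is simply (repeating
the hunk here only to illustrate the race being closed):

        /* flush any nvme_timeout() work that may still be running */
        nvme_sync_queues(&dev->ctrl);
        /* reclaim everything outstanding, even if already disabled */
        nvme_dev_disable(dev, false);
        ...
        result = nvme_pci_enable(dev); /* nothing can disable us concurrently now */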

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 7/9] nvme-pci: Attempt reset retry for IO failures
  2018-05-25 13:04   ` Christoph Hellwig
@ 2018-05-25 14:25     ` Keith Busch
  0 siblings, 0 replies; 45+ messages in thread
From: Keith Busch @ 2018-05-25 14:25 UTC (permalink / raw)


On Fri, May 25, 2018@03:04:23PM +0200, Christoph Hellwig wrote:
> On Thu, May 24, 2018@02:34:58PM -0600, Keith Busch wrote:
> > If the reset failed due to a non-fatal error, this patch will attempt
> > to reset the controller again, with a maximum of 4 attempts.
> 
> What kind of non-fatal errors do you see that this tries to handle?

Like how a non-fatal uncorrectable error can trigger a reset. If we
happen to be executing an initialization admin command, it returns
-EINTR. The driver would normally just unbind from the controller if
any admin command fails, but retrying may succeed.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-25 14:22     ` Keith Busch
@ 2018-05-25 14:32       ` Christoph Hellwig
  2018-05-25 14:45         ` Keith Busch
  2018-05-25 15:56         ` James Smart
  0 siblings, 2 replies; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-25 14:32 UTC (permalink / raw)


On Fri, May 25, 2018@08:22:34AM -0600, Keith Busch wrote:
> On Fri, May 25, 2018@02:42:09PM +0200, Christoph Hellwig wrote:
> > On Thu, May 24, 2018@02:34:52PM -0600, Keith Busch wrote:
> > > This patch fixes races that occur with simultaneous controller
> > > resets
> > 
> > Wait..  How do we end up with simultaneous controller resets?  We
> > not only have the NVME_CTRL_RESETTING resetting state, but also
> > execute all resets from ctrl->reset_work, so they are implicitly
> > single threaded.
> 
> Right, the bring up is single threaded, but the NVMe Controller Level
> Reset (CC.EN 1 -> 0) can happen through a timeout. This patch is really
> just working with the way blk-mq's timeout handler claims requests
> and prevents the driver from completing them. The reset_work operates
> under the assumption that there are no outstanding commands after
> nvme_dev_disable, so this patch just ensures that's the case.

Ok, so we are talking about simultaneous nvme_dev_disable calls, which
makes more sense.

That being said I really like the idea that Jianchao floated about
always returning BLK_EH_RESET_TIMER and just letting the reset work
do the work.  I hope it actually works and doesn't have hidden
pitfalls..

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 3/9] nvme: Move all IO out of controller reset
  2018-05-25 13:00   ` Christoph Hellwig
@ 2018-05-25 14:41     ` Keith Busch
  0 siblings, 0 replies; 45+ messages in thread
From: Keith Busch @ 2018-05-25 14:41 UTC (permalink / raw)


On Fri, May 25, 2018@03:00:57PM +0200, Christoph Hellwig wrote:
> On Thu, May 24, 2018@02:34:54PM -0600, Keith Busch wrote:
> > IO may be retryable, so don't wait for them in the reset path.
> 
> Can't parse this.
> 
> 
> > These
> > commands may trigger a reset if that IO expires without a completion,
> 
> What are "these commands"?

Sorry, I'm referring to the failed requests that were requeued after the
controller was disabled. We've been dispatching them from reset_work and
just hoping they don't time out.
 
> > placing it on the requeue list, so waiting for these would deadlock the
> > reset handler.
> > 
> > To fix the theoretical deadlock, this patch unblocks IO submission from
> 
> How did you find it if it is theoretical?

Variants of the blktests block/011 can trigger this. I've never seen it
in real life, but it's not too much of a leap to imagine it can happen.
 
> > This patch is also renaming the function 'nvme_dev_add' to a
> > more appropriate name that describes what it's actually doing:
> > nvme_alloc_io_tags.
> 
> Can you split this out into a separate patch?

Sure thing.
 
> > @@ -3175,6 +3175,8 @@ static void nvme_scan_work(struct work_struct *work)
> >  	struct nvme_id_ctrl *id;
> >  	unsigned nn;
> >  
> > +	if (ctrl->ops->update_hw_ctx)
> > +		ctrl->ops->update_hw_ctx(ctrl);
> 
> nvme_scan_work gets kicked from all kinds of places including
> ioctls and AERs. I don't think the code you added below should
> be called from all of them.

True, most of the time nothing happens on this call. I'm trying to not
require another work_struct, and scan_work provides a safe context for
what this needs to accomplish, but I can try to find another way.

> > +static void nvme_pci_update_hw_ctx(struct nvme_ctrl *ctrl)
> > +{
> > +	struct nvme_dev *dev = to_nvme_dev(ctrl);
> > +	bool unfreeze;
> > +
> > +	mutex_lock(&dev->shutdown_lock);
> > +	unfreeze = dev->queues_froze;
> > +	mutex_unlock(&dev->shutdown_lock);
> 
> No need to take a mutex here if you sample a single register-sized
> (or smaller) value.
> 
> > +	if (!unfreeze)
> > +		return;
> 
> But this whole scheme stinks to me.  For one we are adding more ad-hoc
> state outside the state machine, second it all seems very "ad-hoc".
> 
> > +
> > +	nvme_wait_freeze(&dev->ctrl);
> > +	blk_mq_update_nr_hw_queues(ctrl->tagset, dev->online_queues - 1);
> > +	nvme_free_queues(dev, dev->online_queues);
> > +	nvme_unfreeze(&dev->ctrl);
> > +
> > +	mutex_lock(&dev->shutdown_lock);
> > +	dev->queues_froze = false;
> > +	mutex_unlock(&dev->shutdown_lock);
> 
> Same here.  Simple READ_ONCE/WRITE_ONCE will give you the right
> memory barriers with no need for the lock.

Good point.
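
A minimal sketch of that suggestion applied to the quoted hunk, assuming
queues_froze stays a plain bool:

static void nvme_pci_update_hw_ctx_sketch(struct nvme_ctrl *ctrl)
{
        struct nvme_dev *dev = to_nvme_dev(ctrl);

        /* a single aligned bool load needs no lock; READ_ONCE keeps the
         * compiler from tearing or caching the access */
        if (!READ_ONCE(dev->queues_froze))
                return;

        nvme_wait_freeze(&dev->ctrl);
        blk_mq_update_nr_hw_queues(ctrl->tagset, dev->online_queues - 1);
        nvme_free_queues(dev, dev->online_queues);
        nvme_unfreeze(&dev->ctrl);

        WRITE_ONCE(dev->queues_froze, false);
}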
 
> Also except for the nvme_free_queues this all is generic code,
> so I think we want this in the core.

That can be arranged.

> And I wonder where this would fit better than the scan work, but I
> can't think of anything else but an entirely new work_struct, which
> isn't all that great either.

Yeah, I was trying to avoid introducing yet another work_struct here.

> > @@ -2211,7 +2228,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
> >  	     dev->ctrl.state == NVME_CTRL_RESETTING)) {
> >  		u32 csts = readl(dev->bar + NVME_REG_CSTS);
> >  
> > -		nvme_start_freeze(&dev->ctrl);
> > +		if (!dev->queues_froze)	{
> > +			nvme_start_freeze(&dev->ctrl);
> > +			dev->queues_froze = true;
> > +		}
> 
> And this sounds like another indicator for a new FROZEN state.  Once
> the ctrl already is frozen we really shouldn't even end up in here
> anymore.

I'll have to think about this idea.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-25 14:32       ` Christoph Hellwig
@ 2018-05-25 14:45         ` Keith Busch
  2018-05-25 15:56         ` James Smart
  1 sibling, 0 replies; 45+ messages in thread
From: Keith Busch @ 2018-05-25 14:45 UTC (permalink / raw)


On Fri, May 25, 2018@04:32:53PM +0200, Christoph Hellwig wrote:
> Ok, so we are talking about simultaneous nvme_dev_disable calls, which
> makes more sense.
> 
> That being said I really like the idea that Jianchao floated about
> always returning BLK_EH_RESET_TIMER and just letting the reset work
> do the work.  I hope it actually works and doesn't have hidden
> pitfalls..

There are some pitfalls there. The reset_work won't queue if we're in the
DELETING state, for example, and we do still need something to disable
the controller if an admin command issued within reset_work times out. We
do the nvme_dev_disable inline with the timeout for those reasons.
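
Roughly, the fallback being described looks like this (a sketch; error
handling and status codes are simplified):

static enum blk_eh_timer_return nvme_timeout_fallback_sketch(struct request *req,
                                                             bool reserved)
{
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct nvme_dev *dev = iod->nvmeq->dev;

        if (nvme_reset_ctrl(&dev->ctrl)) {
                /*
                 * Can't hand off to reset_work (e.g. DELETING state, or the
                 * timed-out command came from reset_work itself): disable
                 * the controller inline and complete the request here.
                 */
                nvme_dev_disable(dev, false);
                nvme_req(req)->status = NVME_SC_ABORT_REQ;
                return BLK_EH_HANDLED;
        }
        return BLK_EH_RESET_TIMER;
}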

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 9/9] nvme-pci: Don't wait for HMB completion on shutdown
  2018-05-25  3:10       ` jianchao.wang
@ 2018-05-25 15:09         ` Keith Busch
  0 siblings, 0 replies; 45+ messages in thread
From: Keith Busch @ 2018-05-25 15:09 UTC (permalink / raw)


On Fri, May 25, 2018@11:10:39AM +0800, jianchao.wang wrote:
> Does it mean we just need to disable the controller by transitioning CC.EN from '1' to '0',
> without sending any admin commands, including deleting the cq/sq?
> The specification indeed says that when a Controller Reset occurs (CC.EN transitions from '1' to '0'):
>  - The controller stops processing any outstanding Admin or I/O commands.
>  - All I/O Submission Queues are deleted.
>  - All I/O Completion Queues are deleted.
>  - The controller is brought to an Idle state. When this is complete, CSTS.RDY is cleared to '0'.
>  - The Admin Queue registers (AQA, ASQ, or ACQ) are not reset as part of a controller reset. All
>    other controller registers defined in section 3 and internal controller state are reset.

Yeah, that's true. I'd still like to preserve the orderly shutdown as
recommended per spec, but maybe we don't need a single function to handle
both orderly and error cases.
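
For the error case, the bare Controller Reset the spec describes comes down to
roughly this register sequence (a sketch; the real disable path also quiesces
queues, bounds the wait by CAP.TO, and tears down interrupts):

static void nvme_hard_disable_sketch(struct nvme_dev *dev)
{
        u32 cc = readl(dev->bar + NVME_REG_CC);

        /* CC.EN 1 -> 0: no Delete SQ/CQ or other admin commands needed */
        writel(cc & ~NVME_CC_ENABLE, dev->bar + NVME_REG_CC);

        /* the controller signals completion by clearing CSTS.RDY */
        while (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_RDY)
                msleep(1);
}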

> Then for the timeout case at least, in the LIVE state, nvme_timeout could hand everything over to
> reset_work and return BLK_EH_RESET_TIMER. When reset_work is already ongoing, it could just disable
> the controller by clearing CC.EN, tear down the related pci things, and return BLK_EH_HANDLED to
> complete the admin commands and wake up the reset_work (certainly with your patch to move
> update_hw_nr to scan_work).
> nvme_timeout wouldn't need to invoke nvme_dev_disable any more and life would be simpler :).

I'll need to think about this a little more. I do like simpler. :)

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-25 14:32       ` Christoph Hellwig
  2018-05-25 14:45         ` Keith Busch
@ 2018-05-25 15:56         ` James Smart
  2018-05-25 16:24           ` Keith Busch
  2018-05-30 23:24           ` Sagi Grimberg
  1 sibling, 2 replies; 45+ messages in thread
From: James Smart @ 2018-05-25 15:56 UTC (permalink / raw)




On 5/25/2018 7:32 AM, Christoph Hellwig wrote:
> On Fri, May 25, 2018@08:22:34AM -0600, Keith Busch wrote:
>> On Fri, May 25, 2018@02:42:09PM +0200, Christoph Hellwig wrote:
>>> On Thu, May 24, 2018@02:34:52PM -0600, Keith Busch wrote:
>>>> This patch fixes races that occur with simultaneous controller
>>>> resets
>>> Wait..  How do we end up with simultaneous controller resets?  We
>>> not only have the NVME_CTRL_RESETTING resetting state, but also
>>> execute all resets from ctrl->reset_work, so they are implicitly
>>> single threaded.
>> Right, the bring up is single threaded, but the NVMe Controller Level
>> Reset (CC.EN 1 -> 0) can happen through a timeout. This patch is really
>> just working with the way blk-mq's timeout handler claims requests
>> and prevents the driver from completing them. The reset_work operates
>> under the assumption that there are no outstanding commands after
>> nvme_dev_disable, so this patch just ensures that's the case.
> Ok, so we are talking about simultaneous nvme_dev_disable calls, which
> makes more sense.
>
> That being said I really like the idea that Jianchao floated about
> always returning BLK_EH_RESET_TIMER and just letting the reset work
> do the work.  I hope it actually works and doesn't have hidden
> pitfalls..
>

I came to this same conclusion and this is how FC works.

-- james

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-25 15:56         ` James Smart
@ 2018-05-25 16:24           ` Keith Busch
  2018-05-25 18:04             ` James Smart
  2018-05-30 23:24           ` Sagi Grimberg
  1 sibling, 1 reply; 45+ messages in thread
From: Keith Busch @ 2018-05-25 16:24 UTC (permalink / raw)


On Fri, May 25, 2018@08:56:51AM -0700, James Smart wrote:
> 
> 
> On 5/25/2018 7:32 AM, Christoph Hellwig wrote:
> > On Fri, May 25, 2018@08:22:34AM -0600, Keith Busch wrote:
> > > On Fri, May 25, 2018@02:42:09PM +0200, Christoph Hellwig wrote:
> > > > On Thu, May 24, 2018@02:34:52PM -0600, Keith Busch wrote:
> > > > > This patch fixes races that occur with simultaneous controller
> > > > > resets
> > > > Wait..  How do we end up with simultaneous controller resets?  We
> > > > not only have the NVME_CTRL_RESETTING resetting state, but also
> > > > execute all resets from ctrl->reset_work, so they are implicitly
> > > > single threaded.
> > > Right, the bring up is single threaded, but the NVMe Controller Level
> > > Reset (CC.EN 1 -> 0) can happen through a timeout. This patch is really
> > > just working with the way blk-mq's timeout handler claims requests
> > > and prevents the driver from completing them. The reset_work operates
> > > under the assumption that there are no outstanding commands after
> > > nvme_dev_disable, so this patch just ensures that's the case.
> > Ok, so we are talking about simultaneous nvme_dev_disable calls, which
> > makes more sense.
> > 
> > That being said I really like the idea that Jianchao floated about
> > always returning BLK_EH_RESET_TIMER and just letting the reset work
> > do the work.  I hope it actually works and doesn't have hidden
> > pitfalls..
> > 
> 
> I came to this same conclusion and this is how FC works.

At least in the current blk-mq timeout handling, returning RESET_TIMER
presents other challenges for the reset handler: the timer may have
reclaimed the request that reset_work is trying to complete.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-25 16:24           ` Keith Busch
@ 2018-05-25 18:04             ` James Smart
  2018-05-25 18:30               ` Keith Busch
  0 siblings, 1 reply; 45+ messages in thread
From: James Smart @ 2018-05-25 18:04 UTC (permalink / raw)




On 5/25/2018 9:24 AM, Keith Busch wrote:
>
> At least in the current blk-mq timeout handling, returning RESET_TIMER
> presents other challenges for the reset handler: the timer may have
> reclaimed the request that reset_work is trying to complete.

why would that be true if BLK_EH_RESET_TIMER was returned ?

-- james

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-25 18:04             ` James Smart
@ 2018-05-25 18:30               ` Keith Busch
  2018-05-30 23:25                 ` Sagi Grimberg
  0 siblings, 1 reply; 45+ messages in thread
From: Keith Busch @ 2018-05-25 18:30 UTC (permalink / raw)


On Fri, May 25, 2018@11:04:47AM -0700, James Smart wrote:
> On 5/25/2018 9:24 AM, Keith Busch wrote:
> > 
> > At least in the current blk-mq timeout handling, returning RESET_TIMER
> > presents other challenges for the reset handler: the timer may have
> > reclaimed the request that reset_work is trying to complete.
> 
> why would that be true if BLK_EH_RESET_TIMER was returned ?

That return rearms the timer for triggering timeout handling, and
when that timeout handling triggers, your reset work won't be able to
complete the request. You're basically relying on the timer being high
enough that your reset work beats the timer to the completion. That may
be sufficient in practice most of the time.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 4/9] nvme-pci: Rate limit the nvme timeout warnings
  2018-05-24 20:34 ` [PATCHv3 4/9] nvme-pci: Rate limit the nvme timeout warnings Keith Busch
  2018-05-25 13:01   ` Christoph Hellwig
@ 2018-05-30  6:06   ` Christoph Hellwig
  1 sibling, 0 replies; 45+ messages in thread
From: Christoph Hellwig @ 2018-05-30  6:06 UTC (permalink / raw)


On Thu, May 24, 2018@02:34:55PM -0600, Keith Busch wrote:
> The block layer's timeout handling currently prevents drivers from
> completing commands outside the timeout callback once blk-mq decides
> they've expired. If a device breaks, this could potentially create many
> thousands of timed out commands. There's nothing of value to be gleaned
> from observing each of those messages, so this patch adds a rate limit
> on them.
> 
> Signed-off-by: Keith Busch <keith.busch at intel.com>

I've picked this for the nvme-4.18 tree.
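
The mechanism is just the kernel's standard ratelimited print helper; inside
nvme_timeout() the warning becomes, roughly (a sketch of the shape and message,
not the exact hunk):

        dev_warn_ratelimited(dev->ctrl.device,
                "I/O %d QID %d timeout, disable controller\n",
                req->tag, nvmeq->qid);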

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-25 15:56         ` James Smart
  2018-05-25 16:24           ` Keith Busch
@ 2018-05-30 23:24           ` Sagi Grimberg
  1 sibling, 0 replies; 45+ messages in thread
From: Sagi Grimberg @ 2018-05-30 23:24 UTC (permalink / raw)



>>> Right, the bring up is single threaded, but the NVMe Controller Level
>>> Reset (CC.EN 1 -> 0) can happen through a timeout. This patch is really
>>> just working with the way blk-mq's timeout handler claims requests
>>> and prevents the driver from completing them. The reset_work operates
>>> under the assumption that there are no outstanding commands after
>>> nvme_dev_disable, so this patch just ensures that's the case.
>> Ok, so we are talking about simultaneous nvme_dev_disable calls, which
>> makes more sense.
>>
>> That being said I really like the idea that Jianchao floated about
>> always returning BLK_EH_RESET_TIMER and just letting the reset work
>> do the work.  I hope it actually works and doesn't have hidden
>> pitfalls..
>>
> 
> I came to this same conclusion and this is how FC works.

and rdma saw a patch for it, but it was deferred until the block layer
complete/timeout races were resolved. Perhaps we should resurrect that.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-25 18:30               ` Keith Busch
@ 2018-05-30 23:25                 ` Sagi Grimberg
  2018-06-05 16:25                   ` Keith Busch
  0 siblings, 1 reply; 45+ messages in thread
From: Sagi Grimberg @ 2018-05-30 23:25 UTC (permalink / raw)



>>> At least in the current blk-mq timeout handling, returning RESET_TIMER
>>> presents other challenges for the reset handler: the timer may have
>>> reclaimed the request that reset_work is trying to complete.
>>
>> why would that be true if BLK_EH_RESET_TIMER was returned ?
> 
> That return rearms the timer for triggering timeout handling, and
> when that timeout handling triggers, your reset work won't be able to
> complete the request. You're basically relying on the timer being high
> enough that your reset work beats the timer to the completion. That may
> be sufficient in practice most of the time.

Why? The timeout handler would reset the timer yet again.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 2/9] nvme-pci: Fix queue freeze criteria on reset
  2018-05-24 20:34 ` [PATCHv3 2/9] nvme-pci: Fix queue freeze criteria " Keith Busch
  2018-05-25 12:43   ` Christoph Hellwig
@ 2018-05-30 23:36   ` Sagi Grimberg
  1 sibling, 0 replies; 45+ messages in thread
From: Sagi Grimberg @ 2018-05-30 23:36 UTC (permalink / raw)


Reviewed-by: Sagi Grimberg <sagi at grimberg.me>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 8/9] nvme-pci: Queue creation error handling
  2018-05-24 20:34 ` [PATCHv3 8/9] nvme-pci: Queue creation error handling Keith Busch
  2018-05-25 12:35   ` Christoph Hellwig
@ 2018-05-30 23:37   ` Sagi Grimberg
  1 sibling, 0 replies; 45+ messages in thread
From: Sagi Grimberg @ 2018-05-30 23:37 UTC (permalink / raw)


Reviewed-by: Sagi Grimberg <sagi at grimberg.me>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 7/9] nvme-pci: Attempt reset retry for IO failures
  2018-05-24 20:34 ` [PATCHv3 7/9] nvme-pci: Attempt reset retry for IO failures Keith Busch
  2018-05-25 13:04   ` Christoph Hellwig
@ 2018-05-30 23:40   ` Sagi Grimberg
  2018-06-04 22:46     ` Keith Busch
  1 sibling, 1 reply; 45+ messages in thread
From: Sagi Grimberg @ 2018-05-30 23:40 UTC (permalink / raw)


> If the reset failed due to a non-fatal error, this patch will attempt
> to reset the controller again, with a maximum of 4 attempts.

Curious, where did 4 come from?

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 7/9] nvme-pci: Attempt reset retry for IO failures
  2018-05-30 23:40   ` Sagi Grimberg
@ 2018-06-04 22:46     ` Keith Busch
  0 siblings, 0 replies; 45+ messages in thread
From: Keith Busch @ 2018-06-04 22:46 UTC (permalink / raw)


On Thu, May 31, 2018@02:40:26AM +0300, Sagi Grimberg wrote:
> > If the reset failed due to a non-fatal error, this patch will attempt
> > to reset the controller again, with a maximum of 4 attempts.
> 
> Curious, where did 4 come from?

It was a pretty arbitrary choice based on what passed a modified blktests
block/011. I don't really like having test code decide how a driver
should behave, so I'm okay with either changing the retry count or just
dropping this patch and having the driver unbind on init errors instead
of retrying.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 1/9] nvme: Sync request queues on reset
  2018-05-30 23:25                 ` Sagi Grimberg
@ 2018-06-05 16:25                   ` Keith Busch
  0 siblings, 0 replies; 45+ messages in thread
From: Keith Busch @ 2018-06-05 16:25 UTC (permalink / raw)


On Thu, May 31, 2018@02:25:54AM +0300, Sagi Grimberg wrote:
> > > > At least in the current blk-mq timeout handling, returning RESET_TIMER
> > > > presents other challenges for the reset handler: the timer may have
> > > > reclaimed the request that reset_work is trying to complete.
> > > 
> > > why would that be true if BLK_EH_RESET_TIMER was returned ?
> > 
> > That return rearms the timer for triggering timeout handling, and
> > when that timeout handling triggers, your reset work won't be able to
> > complete the request. You're basically relying on the timer being high
> > enough that your reset work beats the timer to the completion. That may
> > be sufficient in practice most of the time.
> 
> Why? The timeout handler would reset the timer yet again.

Right, the timeout handler resets the timer on a request the driver believes
it has already completed.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 8/9] nvme-pci: Queue creation error handling
  2018-05-25 12:35   ` Christoph Hellwig
@ 2018-06-05 16:28     ` Keith Busch
  0 siblings, 0 replies; 45+ messages in thread
From: Keith Busch @ 2018-06-05 16:28 UTC (permalink / raw)


On Fri, May 25, 2018@02:35:49PM +0200, Christoph Hellwig wrote:
> On Thu, May 24, 2018@02:34:59PM -0600, Keith Busch wrote:
> > This patch sets the nvmeq's cq_vector only after the cq was successfully
> > created. This way a device reset doesn't mistakenly believe that the
> > vector is allocated. In case a reset does occur during queue creation,
> > this patch will return status immediately instead of trying to unwind
> > the created queues since the device won't be able to delete queues anyway.
> > 
> > This patch will also handle device reported failures correctly. These
> > errors are reported as positive nvme status codes, which were previously
> > not handled at all.
> 
> I applied the original patch this morning with some minor fixup.  Can
> you send a relative patch to the nvme-4.18-2 tree?

Sounds good, working on that now.
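
The ordering idea, as a sketch (nvme_send_create_cq_sketch() is a hypothetical
stand-in for issuing the Create I/O CQ admin command; the applied patch differs
in detail):

static int nvme_create_cq_ordered_sketch(struct nvme_queue *nvmeq, int qid, s16 vector)
{
        int result;

        result = nvme_send_create_cq_sketch(nvmeq, qid, vector);
        if (result < 0)
                return result;          /* local/transport error */
        if (result > 0)
                return -EIO;            /* controller-reported status, previously ignored */

        /* publish only once the CQ actually exists, so a concurrent
         * reset never tries to free an unallocated vector */
        nvmeq->cq_vector = vector;
        return 0;
}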

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 0/9] nvme timeout fixes, v3
  2018-05-24 20:34 [PATCHv3 0/9] nvme timeout fixes, v3 Keith Busch
                   ` (8 preceding siblings ...)
  2018-05-24 20:35 ` [PATCHv3 9/9] nvme-pci: Don't wait for HMB completion on shutdown Keith Busch
@ 2018-07-13  0:48 ` Ming Lei
  2018-07-13 20:54   ` Keith Busch
  9 siblings, 1 reply; 45+ messages in thread
From: Ming Lei @ 2018-07-13  0:48 UTC (permalink / raw)


On Fri, May 25, 2018@4:34 AM, Keith Busch <keith.busch@intel.com> wrote:
> v2 -> v3:
>
> The main difference is getting rid of the bad idea that we could do
> a CONNECTING -> RESETTING transtion that breaks the state machine, so
> that patch is dropped. The series maintains the concept of retrying a
> reset when an init admin command times out. It just does so in the same
> context instead of requeueing itself.
>
> This series also includes two additional patches to handle other unlikely
> errors on queue initialization and HMB teardown.
>
> Keith Busch (9):
>   nvme: Sync request queues on reset
>   nvme-pci: Fix queue freeze criteria on reset
>   nvme: Move all IO out of controller reset
>   nvme-pci: Rate limit the nvme timeout warnings
>   nvme-pci: End IO requests immediately in CONNECTING state
>   nvme-pci: Unquiesce dead controller queues
>   nvme-pci: Attempt reset retry for IO failures
>   nvme-pci: Queue creation error handling
>   nvme-pci: Don't wait for HMB completion on shutdown
>
>  drivers/nvme/host/core.c |  23 ++++++-
>  drivers/nvme/host/nvme.h |   2 +
>  drivers/nvme/host/pci.c  | 173 ++++++++++++++++++++++++++++++++++-------------
>  3 files changed, 150 insertions(+), 48 deletions(-)

Hello Keith,

I guess there will be V4 for addressing the reported issue?

Thanks,
Ming Lei

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCHv3 0/9] nvme timeout fixes, v3
  2018-07-13  0:48 ` [PATCHv3 0/9] nvme timeout fixes, v3 Ming Lei
@ 2018-07-13 20:54   ` Keith Busch
  0 siblings, 0 replies; 45+ messages in thread
From: Keith Busch @ 2018-07-13 20:54 UTC (permalink / raw)


On Fri, Jul 13, 2018@08:48:24AM +0800, Ming Lei wrote:
> On Fri, May 25, 2018@4:34 AM, Keith Busch <keith.busch@intel.com> wrote:
> > Keith Busch (9):
> >   nvme: Sync request queues on reset
> >   nvme-pci: Fix queue freeze criteria on reset
> >   nvme: Move all IO out of controller reset
> >   nvme-pci: Rate limit the nvme timeout warnings
> >   nvme-pci: End IO requests immediately in CONNECTING state
> >   nvme-pci: Unquiesce dead controller queues
> >   nvme-pci: Attempt reset retry for IO failures
> >   nvme-pci: Queue creation error handling
> >   nvme-pci: Don't wait for HMB completion on shutdown
> >
> >  drivers/nvme/host/core.c |  23 ++++++-
> >  drivers/nvme/host/nvme.h |   2 +
> >  drivers/nvme/host/pci.c  | 173 ++++++++++++++++++++++++++++++++++-------------
> >  3 files changed, 150 insertions(+), 48 deletions(-)
> 
> Hello Keith,
> 
> I guess there will be V4 for addressing the reported issue?

Yes, I'll send that out in a few minutes.

The majority of the above are either already merged or no longer necessary,
but patches 1 and 3 still look useful. Patch 1 doesn't need to change, but
patch 3 from this series will be split into a series of prep patches and
made generic for the v4.

^ permalink raw reply	[flat|nested] 45+ messages in thread

end of thread, other threads:[~2018-07-13 20:54 UTC | newest]

Thread overview: 45+ messages
2018-05-24 20:34 [PATCHv3 0/9] nvme timeout fixes, v3 Keith Busch
2018-05-24 20:34 ` [PATCHv3 1/9] nvme: Sync request queues on reset Keith Busch
2018-05-25 12:42   ` Christoph Hellwig
2018-05-25 14:22     ` Keith Busch
2018-05-25 14:32       ` Christoph Hellwig
2018-05-25 14:45         ` Keith Busch
2018-05-25 15:56         ` James Smart
2018-05-25 16:24           ` Keith Busch
2018-05-25 18:04             ` James Smart
2018-05-25 18:30               ` Keith Busch
2018-05-30 23:25                 ` Sagi Grimberg
2018-06-05 16:25                   ` Keith Busch
2018-05-30 23:24           ` Sagi Grimberg
2018-05-24 20:34 ` [PATCHv3 2/9] nvme-pci: Fix queue freeze criteria " Keith Busch
2018-05-25 12:43   ` Christoph Hellwig
2018-05-30 23:36   ` Sagi Grimberg
2018-05-24 20:34 ` [PATCHv3 3/9] nvme: Move all IO out of controller reset Keith Busch
2018-05-25 13:00   ` Christoph Hellwig
2018-05-25 14:41     ` Keith Busch
2018-05-24 20:34 ` [PATCHv3 4/9] nvme-pci: Rate limit the nvme timeout warnings Keith Busch
2018-05-25 13:01   ` Christoph Hellwig
2018-05-30  6:06   ` Christoph Hellwig
2018-05-24 20:34 ` [PATCHv3 5/9] nvme-pci: End IO requests in CONNECTING state Keith Busch
2018-05-24 20:47   ` Christoph Hellwig
2018-05-24 21:03     ` Keith Busch
2018-05-25 12:31       ` Christoph Hellwig
2018-05-24 20:34 ` [PATCHv3 6/9] nvme-pci: Unquiesce dead controller queues Keith Busch
2018-05-25 13:03   ` Christoph Hellwig
2018-05-24 20:34 ` [PATCHv3 7/9] nvme-pci: Attempt reset retry for IO failures Keith Busch
2018-05-25 13:04   ` Christoph Hellwig
2018-05-25 14:25     ` Keith Busch
2018-05-30 23:40   ` Sagi Grimberg
2018-06-04 22:46     ` Keith Busch
2018-05-24 20:34 ` [PATCHv3 8/9] nvme-pci: Queue creation error handling Keith Busch
2018-05-25 12:35   ` Christoph Hellwig
2018-06-05 16:28     ` Keith Busch
2018-05-30 23:37   ` Sagi Grimberg
2018-05-24 20:35 ` [PATCHv3 9/9] nvme-pci: Don't wait for HMB completion on shutdown Keith Busch
2018-05-24 20:45   ` Christoph Hellwig
2018-05-24 21:15     ` Keith Busch
2018-05-25  3:10       ` jianchao.wang
2018-05-25 15:09         ` Keith Busch
2018-05-25 12:36       ` Christoph Hellwig
2018-07-13  0:48 ` [PATCHv3 0/9] nvme timeout fixes, v3 Ming Lei
2018-07-13 20:54   ` Keith Busch
