All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/6] nvme: Sync request queues on reset
@ 2018-05-18 16:38 ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)
  To: linux-nvme, linux-block, Ming Lei, Christoph Hellwig, Sagi Grimberg
  Cc: Jens Axboe, Laurence Oberman, James Smart, Johannes Thumshirn,
	Keith Busch

This patch fixes races that occur with simultaneous controller
resets by synchronizing request queues prior to initializing the
controller. Without this, a thread may attempt disabling a controller
at the same time as we're trying to enable it.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/nvme/host/core.c | 21 +++++++++++++++++++--
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  |  1 +
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 99b857e5a7a9..1de68b56b318 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3471,6 +3471,12 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 
+static void nvme_start_queue(struct nvme_ns *ns)
+{
+	blk_mq_unquiesce_queue(ns->queue);
+	blk_mq_kick_requeue_list(ns->queue);
+}
+
 /**
  * nvme_kill_queues(): Ends all namespace queues
  * @ctrl: the dead controller that needs to end
@@ -3499,7 +3505,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 		blk_set_queue_dying(ns->queue);
 
 		/* Forcibly unquiesce queues to avoid blocking dispatch */
-		blk_mq_unquiesce_queue(ns->queue);
+		nvme_start_queue(ns);
 	}
 	up_read(&ctrl->namespaces_rwsem);
 }
@@ -3569,11 +3575,22 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_unquiesce_queue(ns->queue);
+		nvme_start_queue(ns);
 	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_sync_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
+
 int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
 {
 	if (!ctrl->ops->reinit_request)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 17d2f7cf3fed..c15c2ee7f61a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -407,6 +407,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		union nvme_result *res);
 
+void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 17a0190bd88f..8da63402d474 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2312,6 +2312,7 @@ static void nvme_reset_work(struct work_struct *work)
 	 */
 	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
+	nvme_sync_queues(&dev->ctrl);
 
 	/*
 	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 1/6] nvme: Sync request queues on reset
@ 2018-05-18 16:38 ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)


This patch fixes races that occur with simultaneous controller
resets by synchronizing request queues prior to initializing the
controller. Without this, a thread may attempt disabling a controller
at the same time as we're trying to enable it.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/core.c | 21 +++++++++++++++++++--
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  |  1 +
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 99b857e5a7a9..1de68b56b318 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3471,6 +3471,12 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 
+static void nvme_start_queue(struct nvme_ns *ns)
+{
+	blk_mq_unquiesce_queue(ns->queue);
+	blk_mq_kick_requeue_list(ns->queue);
+}
+
 /**
  * nvme_kill_queues(): Ends all namespace queues
  * @ctrl: the dead controller that needs to end
@@ -3499,7 +3505,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 		blk_set_queue_dying(ns->queue);
 
 		/* Forcibly unquiesce queues to avoid blocking dispatch */
-		blk_mq_unquiesce_queue(ns->queue);
+		nvme_start_queue(ns);
 	}
 	up_read(&ctrl->namespaces_rwsem);
 }
@@ -3569,11 +3575,22 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_unquiesce_queue(ns->queue);
+		nvme_start_queue(ns);
 	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_sync_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
+
 int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
 {
 	if (!ctrl->ops->reinit_request)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 17d2f7cf3fed..c15c2ee7f61a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -407,6 +407,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		union nvme_result *res);
 
+void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 17a0190bd88f..8da63402d474 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2312,6 +2312,7 @@ static void nvme_reset_work(struct work_struct *work)
 	 */
 	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
+	nvme_sync_queues(&dev->ctrl);
 
 	/*
 	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 2/6] nvme-pci: Fix queue freeze criteria on reset
  2018-05-18 16:38 ` Keith Busch
@ 2018-05-18 16:38   ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)
  To: linux-nvme, linux-block, Ming Lei, Christoph Hellwig, Sagi Grimberg
  Cc: Jens Axboe, Laurence Oberman, James Smart, Johannes Thumshirn,
	Keith Busch

The driver had been relying on the pci_dev to maintain the state of
the pci device to know when starting a freeze would be appropriate. The
blktests block/011 however shows us that users may alter the state of
pci_dev out from under drivers and break the criteria we had been using.

This patch uses the private nvme controller struct to track the
enabling/disabling state. Since we're relying on that now, the reset will
unconditionally disable the device on reset. This is necessary anyway
on a controller failure reset, and was already being done in the reset
during admin bring up, and is not harmful to do a second time.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/nvme/host/pci.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8da63402d474..2bd9d84f58d0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2196,24 +2196,22 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	mutex_lock(&dev->shutdown_lock);
-	if (pci_is_enabled(pdev)) {
+	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE &&
+	    (dev->ctrl.state == NVME_CTRL_LIVE ||
+	     dev->ctrl.state == NVME_CTRL_RESETTING)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-		if (dev->ctrl.state == NVME_CTRL_LIVE ||
-		    dev->ctrl.state == NVME_CTRL_RESETTING)
-			nvme_start_freeze(&dev->ctrl);
+		nvme_start_freeze(&dev->ctrl);
 		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
-			pdev->error_state  != pci_channel_io_normal);
+			pci_channel_offline(pdev) || !pci_is_enabled(pdev));
 	}
 
 	/*
 	 * Give the controller a chance to complete all entered requests if
 	 * doing a safe shutdown.
 	 */
-	if (!dead) {
-		if (shutdown)
-			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
-	}
+	if (!dead && shutdown)
+		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
 
 	nvme_stop_queues(&dev->ctrl);
 
@@ -2227,8 +2225,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 		if (dev->host_mem_descs)
 			nvme_set_host_mem(dev, 0);
 		nvme_disable_io_queues(dev);
-		nvme_disable_admin_queue(dev, shutdown);
 	}
+	nvme_disable_admin_queue(dev, shutdown);
 	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
 		nvme_suspend_queue(&dev->queues[i]);
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 2/6] nvme-pci: Fix queue freeze criteria on reset
@ 2018-05-18 16:38   ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)


The driver had been relying on the pci_dev to maintain the state of
the pci device to know when starting a freeze would be appropriate. The
blktests block/011 however shows us that users may alter the state of
pci_dev out from under drivers and break the criteria we had been using.

This patch uses the private nvme controller struct to track the
enabling/disabling state. Since we're relying on that now, the reset will
unconditionally disable the device on reset. This is necessary anyway
on a controller failure reset, and was already being done in the reset
during admin bring up, and is not harmful to do a second time.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/pci.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8da63402d474..2bd9d84f58d0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2196,24 +2196,22 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	mutex_lock(&dev->shutdown_lock);
-	if (pci_is_enabled(pdev)) {
+	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE &&
+	    (dev->ctrl.state == NVME_CTRL_LIVE ||
+	     dev->ctrl.state == NVME_CTRL_RESETTING)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-		if (dev->ctrl.state == NVME_CTRL_LIVE ||
-		    dev->ctrl.state == NVME_CTRL_RESETTING)
-			nvme_start_freeze(&dev->ctrl);
+		nvme_start_freeze(&dev->ctrl);
 		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
-			pdev->error_state  != pci_channel_io_normal);
+			pci_channel_offline(pdev) || !pci_is_enabled(pdev));
 	}
 
 	/*
 	 * Give the controller a chance to complete all entered requests if
 	 * doing a safe shutdown.
 	 */
-	if (!dead) {
-		if (shutdown)
-			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
-	}
+	if (!dead && shutdown)
+		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
 
 	nvme_stop_queues(&dev->ctrl);
 
@@ -2227,8 +2225,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 		if (dev->host_mem_descs)
 			nvme_set_host_mem(dev, 0);
 		nvme_disable_io_queues(dev);
-		nvme_disable_admin_queue(dev, shutdown);
 	}
+	nvme_disable_admin_queue(dev, shutdown);
 	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
 		nvme_suspend_queue(&dev->queues[i]);
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
  2018-05-18 16:38 ` Keith Busch
@ 2018-05-18 16:38   ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)
  To: linux-nvme, linux-block, Ming Lei, Christoph Hellwig, Sagi Grimberg
  Cc: Jens Axboe, Laurence Oberman, James Smart, Johannes Thumshirn,
	Keith Busch

IO may be retryable, so don't wait for them in the reset path. These
commands may trigger a reset if that IO expires without a completion,
placing it on the requeue list. Waiting for these would then deadlock
the reset handler.

To fix the theoretical deadlock, this patch unblocks IO submission from
the reset_work as before, but moves the waiting to the IO safe scan_work
so that the reset_work may proceed to completion. Since the unfreezing
happens in the controller LIVE state, the nvme device has to track if
the queues were frozen now to prevent incorrect freeze depths.

This patch is also renaming the function 'nvme_dev_add' to a
more appropriate name that describes what it's actually doing:
nvme_alloc_io_tags.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/nvme/host/core.c |  3 +++
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  | 46 +++++++++++++++++++++++++++++++++-------------
 3 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1de68b56b318..34d7731f1419 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -214,6 +214,7 @@ static inline bool nvme_req_needs_retry(struct request *req)
 	if (blk_noretry_request(req))
 		return false;
 	if (nvme_req(req)->status & NVME_SC_DNR)
+
 		return false;
 	if (nvme_req(req)->retries >= nvme_max_retries)
 		return false;
@@ -3177,6 +3178,8 @@ static void nvme_scan_work(struct work_struct *work)
 	struct nvme_id_ctrl *id;
 	unsigned nn;
 
+	if (ctrl->ops->update_hw_ctx)
+		ctrl->ops->update_hw_ctx(ctrl);
 	if (ctrl->state != NVME_CTRL_LIVE)
 		return;
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index c15c2ee7f61a..230c5424b197 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -320,6 +320,7 @@ struct nvme_ctrl_ops {
 	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
 	int (*reinit_request)(void *data, struct request *rq);
 	void (*stop_ctrl)(struct nvme_ctrl *ctrl);
+	void (*update_hw_ctx)(struct nvme_ctrl *ctrl);
 };
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2bd9d84f58d0..6a7cbc631d92 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -99,6 +99,7 @@ struct nvme_dev {
 	u32 cmbloc;
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
+	bool queues_froze;
 
 	/* shadow doorbell buffer support: */
 	u32 *dbbuf_dbs;
@@ -2065,10 +2066,33 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
 	}
 }
 
+static void nvme_pci_update_hw_ctx(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+	bool unfreeze;
+
+	mutex_lock(&dev->shutdown_lock);
+	unfreeze = dev->queues_froze;
+	mutex_unlock(&dev->shutdown_lock);
+
+	if (unfreeze)
+		nvme_wait_freeze(&dev->ctrl);
+
+	blk_mq_update_nr_hw_queues(ctrl->tagset, dev->online_queues - 1);
+	nvme_free_queues(dev, dev->online_queues);
+
+	if (unfreeze)
+		nvme_unfreeze(&dev->ctrl);
+
+	mutex_lock(&dev->shutdown_lock);
+	dev->queues_froze = false;
+	mutex_unlock(&dev->shutdown_lock);
+}
+
 /*
  * return error value only when tagset allocation failed
  */
-static int nvme_dev_add(struct nvme_dev *dev)
+static int nvme_alloc_io_tags(struct nvme_dev *dev)
 {
 	int ret;
 
@@ -2097,10 +2121,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 
 		nvme_dbbuf_set(dev);
 	} else {
-		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
-
-		/* Free previously allocated queues that are no longer usable */
-		nvme_free_queues(dev, dev->online_queues);
+		nvme_start_queues(&dev->ctrl);
 	}
 
 	return 0;
@@ -2201,7 +2222,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	     dev->ctrl.state == NVME_CTRL_RESETTING)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-		nvme_start_freeze(&dev->ctrl);
+		if (!dev->queues_froze)	{
+			nvme_start_freeze(&dev->ctrl);
+			dev->queues_froze = true;
+		}
 		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
 			pci_channel_offline(pdev) || !pci_is_enabled(pdev));
 	}
@@ -2375,13 +2399,8 @@ static void nvme_reset_work(struct work_struct *work)
 		nvme_kill_queues(&dev->ctrl);
 		nvme_remove_namespaces(&dev->ctrl);
 		new_state = NVME_CTRL_ADMIN_ONLY;
-	} else {
-		nvme_start_queues(&dev->ctrl);
-		nvme_wait_freeze(&dev->ctrl);
-		/* hit this only when allocate tagset fails */
-		if (nvme_dev_add(dev))
-			new_state = NVME_CTRL_ADMIN_ONLY;
-		nvme_unfreeze(&dev->ctrl);
+	} else if (nvme_alloc_io_tags(dev)) {
+		new_state = NVME_CTRL_ADMIN_ONLY;
 	}
 
 	/*
@@ -2446,6 +2465,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.reg_read64		= nvme_pci_reg_read64,
 	.free_ctrl		= nvme_pci_free_ctrl,
 	.submit_async_event	= nvme_pci_submit_async_event,
+	.update_hw_ctx		= nvme_pci_update_hw_ctx,
 	.get_address		= nvme_pci_get_address,
 };
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
@ 2018-05-18 16:38   ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)


IO may be retryable, so don't wait for them in the reset path. These
commands may trigger a reset if that IO expires without a completion,
placing it on the requeue list. Waiting for these would then deadlock
the reset handler.

To fix the theoretical deadlock, this patch unblocks IO submission from
the reset_work as before, but moves the waiting to the IO safe scan_work
so that the reset_work may proceed to completion. Since the unfreezing
happens in the controller LIVE state, the nvme device has to track if
the queues were frozen now to prevent incorrect freeze depths.

This patch is also renaming the function 'nvme_dev_add' to a
more appropriate name that describes what it's actually doing:
nvme_alloc_io_tags.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/core.c |  3 +++
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  | 46 +++++++++++++++++++++++++++++++++-------------
 3 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1de68b56b318..34d7731f1419 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -214,6 +214,7 @@ static inline bool nvme_req_needs_retry(struct request *req)
 	if (blk_noretry_request(req))
 		return false;
 	if (nvme_req(req)->status & NVME_SC_DNR)
+
 		return false;
 	if (nvme_req(req)->retries >= nvme_max_retries)
 		return false;
@@ -3177,6 +3178,8 @@ static void nvme_scan_work(struct work_struct *work)
 	struct nvme_id_ctrl *id;
 	unsigned nn;
 
+	if (ctrl->ops->update_hw_ctx)
+		ctrl->ops->update_hw_ctx(ctrl);
 	if (ctrl->state != NVME_CTRL_LIVE)
 		return;
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index c15c2ee7f61a..230c5424b197 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -320,6 +320,7 @@ struct nvme_ctrl_ops {
 	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
 	int (*reinit_request)(void *data, struct request *rq);
 	void (*stop_ctrl)(struct nvme_ctrl *ctrl);
+	void (*update_hw_ctx)(struct nvme_ctrl *ctrl);
 };
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2bd9d84f58d0..6a7cbc631d92 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -99,6 +99,7 @@ struct nvme_dev {
 	u32 cmbloc;
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
+	bool queues_froze;
 
 	/* shadow doorbell buffer support: */
 	u32 *dbbuf_dbs;
@@ -2065,10 +2066,33 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
 	}
 }
 
+static void nvme_pci_update_hw_ctx(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+	bool unfreeze;
+
+	mutex_lock(&dev->shutdown_lock);
+	unfreeze = dev->queues_froze;
+	mutex_unlock(&dev->shutdown_lock);
+
+	if (unfreeze)
+		nvme_wait_freeze(&dev->ctrl);
+
+	blk_mq_update_nr_hw_queues(ctrl->tagset, dev->online_queues - 1);
+	nvme_free_queues(dev, dev->online_queues);
+
+	if (unfreeze)
+		nvme_unfreeze(&dev->ctrl);
+
+	mutex_lock(&dev->shutdown_lock);
+	dev->queues_froze = false;
+	mutex_unlock(&dev->shutdown_lock);
+}
+
 /*
  * return error value only when tagset allocation failed
  */
-static int nvme_dev_add(struct nvme_dev *dev)
+static int nvme_alloc_io_tags(struct nvme_dev *dev)
 {
 	int ret;
 
@@ -2097,10 +2121,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 
 		nvme_dbbuf_set(dev);
 	} else {
-		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
-
-		/* Free previously allocated queues that are no longer usable */
-		nvme_free_queues(dev, dev->online_queues);
+		nvme_start_queues(&dev->ctrl);
 	}
 
 	return 0;
@@ -2201,7 +2222,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	     dev->ctrl.state == NVME_CTRL_RESETTING)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-		nvme_start_freeze(&dev->ctrl);
+		if (!dev->queues_froze)	{
+			nvme_start_freeze(&dev->ctrl);
+			dev->queues_froze = true;
+		}
 		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
 			pci_channel_offline(pdev) || !pci_is_enabled(pdev));
 	}
@@ -2375,13 +2399,8 @@ static void nvme_reset_work(struct work_struct *work)
 		nvme_kill_queues(&dev->ctrl);
 		nvme_remove_namespaces(&dev->ctrl);
 		new_state = NVME_CTRL_ADMIN_ONLY;
-	} else {
-		nvme_start_queues(&dev->ctrl);
-		nvme_wait_freeze(&dev->ctrl);
-		/* hit this only when allocate tagset fails */
-		if (nvme_dev_add(dev))
-			new_state = NVME_CTRL_ADMIN_ONLY;
-		nvme_unfreeze(&dev->ctrl);
+	} else if (nvme_alloc_io_tags(dev)) {
+		new_state = NVME_CTRL_ADMIN_ONLY;
 	}
 
 	/*
@@ -2446,6 +2465,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.reg_read64		= nvme_pci_reg_read64,
 	.free_ctrl		= nvme_pci_free_ctrl,
 	.submit_async_event	= nvme_pci_submit_async_event,
+	.update_hw_ctx		= nvme_pci_update_hw_ctx,
 	.get_address		= nvme_pci_get_address,
 };
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 4/6] nvme: Allow reset from CONNECTING state
  2018-05-18 16:38 ` Keith Busch
@ 2018-05-18 16:38   ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)
  To: linux-nvme, linux-block, Ming Lei, Christoph Hellwig, Sagi Grimberg
  Cc: Jens Axboe, Laurence Oberman, James Smart, Johannes Thumshirn,
	Keith Busch

A failed connection may be retryable. This patch allows the connecting
state to initiate a reset so that it may try to connect again.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/nvme/host/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 34d7731f1419..bccc92206fba 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -293,6 +293,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		case NVME_CTRL_NEW:
 		case NVME_CTRL_LIVE:
 		case NVME_CTRL_ADMIN_ONLY:
+		case NVME_CTRL_CONNECTING:
 			changed = true;
 			/* FALLTHRU */
 		default:
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 4/6] nvme: Allow reset from CONNECTING state
@ 2018-05-18 16:38   ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)


A failed connection may be retryable. This patch allows the connecting
state to initiate a reset so that it may try to connect again.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 34d7731f1419..bccc92206fba 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -293,6 +293,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		case NVME_CTRL_NEW:
 		case NVME_CTRL_LIVE:
 		case NVME_CTRL_ADMIN_ONLY:
+		case NVME_CTRL_CONNECTING:
 			changed = true;
 			/* FALLTHRU */
 		default:
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 5/6] nvme-pci: Attempt reset retry for IO failures
  2018-05-18 16:38 ` Keith Busch
@ 2018-05-18 16:38   ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)
  To: linux-nvme, linux-block, Ming Lei, Christoph Hellwig, Sagi Grimberg
  Cc: Jens Axboe, Laurence Oberman, James Smart, Johannes Thumshirn,
	Keith Busch

If the reset failed due to a non-fatal error, this patch will attempt
to reset the controller again, with a maximum of 4 attempts.

Since the failed reset case has changed purpose, this patch provides a
more appropriate name and warning message for the reset failure.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/nvme/host/pci.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6a7cbc631d92..ddfeb186d129 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -37,6 +37,8 @@
 
 #define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
 
+#define MAX_RESET_FAILURES 4
+
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
@@ -101,6 +103,8 @@ struct nvme_dev {
 	struct completion ioq_wait;
 	bool queues_froze;
 
+	int reset_failures;
+
 	/* shadow doorbell buffer support: */
 	u32 *dbbuf_dbs;
 	dma_addr_t dbbuf_dbs_dma_addr;
@@ -2307,9 +2311,23 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	kfree(dev);
 }
 
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+static void nvme_reset_failure(struct nvme_dev *dev, int status)
 {
-	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
+	dev->reset_failures++;
+	dev_warn(dev->ctrl.device, "Reset failure status: %d, failures:%d\n",
+		status, dev->reset_failures);
+
+	/* IO and Interrupted Call may indicate a retryable error */
+	switch (status) {
+	case -EIO:
+	case -EINTR:
+		if (dev->reset_failures < MAX_RESET_FAILURES &&
+		    !nvme_reset_ctrl(&dev->ctrl))
+			return;
+		break;
+	default:
+		break;
+	}
 
 	nvme_get_ctrl(&dev->ctrl);
 	nvme_dev_disable(dev, false);
@@ -2410,14 +2428,16 @@ static void nvme_reset_work(struct work_struct *work)
 	if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
 		dev_warn(dev->ctrl.device,
 			"failed to mark controller state %d\n", new_state);
+		result = -ENODEV;
 		goto out;
 	}
 
+	dev->reset_failures = 0;
 	nvme_start_ctrl(&dev->ctrl);
 	return;
 
  out:
-	nvme_remove_dead_ctrl(dev, result);
+	nvme_reset_failure(dev, result);
 }
 
 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 5/6] nvme-pci: Attempt reset retry for IO failures
@ 2018-05-18 16:38   ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)


If the reset failed due to a non-fatal error, this patch will attempt
to reset the controller again, with a maximum of 4 attempts.

Since the failed reset case has changed purpose, this patch provides a
more appropriate name and warning message for the reset failure.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/pci.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6a7cbc631d92..ddfeb186d129 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -37,6 +37,8 @@
 
 #define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
 
+#define MAX_RESET_FAILURES 4
+
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
@@ -101,6 +103,8 @@ struct nvme_dev {
 	struct completion ioq_wait;
 	bool queues_froze;
 
+	int reset_failures;
+
 	/* shadow doorbell buffer support: */
 	u32 *dbbuf_dbs;
 	dma_addr_t dbbuf_dbs_dma_addr;
@@ -2307,9 +2311,23 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	kfree(dev);
 }
 
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+static void nvme_reset_failure(struct nvme_dev *dev, int status)
 {
-	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
+	dev->reset_failures++;
+	dev_warn(dev->ctrl.device, "Reset failure status: %d, failures:%d\n",
+		status, dev->reset_failures);
+
+	/* IO and Interrupted Call may indicate a retryable error */
+	switch (status) {
+	case -EIO:
+	case -EINTR:
+		if (dev->reset_failures < MAX_RESET_FAILURES &&
+		    !nvme_reset_ctrl(&dev->ctrl))
+			return;
+		break;
+	default:
+		break;
+	}
 
 	nvme_get_ctrl(&dev->ctrl);
 	nvme_dev_disable(dev, false);
@@ -2410,14 +2428,16 @@ static void nvme_reset_work(struct work_struct *work)
 	if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
 		dev_warn(dev->ctrl.device,
 			"failed to mark controller state %d\n", new_state);
+		result = -ENODEV;
 		goto out;
 	}
 
+	dev->reset_failures = 0;
 	nvme_start_ctrl(&dev->ctrl);
 	return;
 
  out:
-	nvme_remove_dead_ctrl(dev, result);
+	nvme_reset_failure(dev, result);
 }
 
 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 6/6] nvme-pci: Rate limit the nvme timeout warnings
  2018-05-18 16:38 ` Keith Busch
@ 2018-05-18 16:38   ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)
  To: linux-nvme, linux-block, Ming Lei, Christoph Hellwig, Sagi Grimberg
  Cc: Jens Axboe, Laurence Oberman, James Smart, Johannes Thumshirn,
	Keith Busch

The block layer's timeout handling currently refuses to let the driver
complete commands outside the timeout callback once blk-mq decides they've
expired. If a device breaks, this could potentially create many thousands
of timed out commands. There's nothing of value to be gleaned from
observing each of those messages, so this patch adds a ratelimit on them.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ddfeb186d129..e4b91c246e36 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1251,7 +1251,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
  	 * returned to the driver, or if this is the admin queue.
 	 */
 	if (!nvmeq->qid || iod->aborted) {
-		dev_warn(dev->ctrl.device,
+		dev_warn_ratelimited(dev->ctrl.device,
 			 "I/O %d QID %d timeout, reset controller\n",
 			 req->tag, nvmeq->qid);
 		nvme_dev_disable(dev, false);
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 6/6] nvme-pci: Rate limit the nvme timeout warnings
@ 2018-05-18 16:38   ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 16:38 UTC (permalink / raw)


The block layer's timeout handling currently refuses to let the driver
complete commands outside the timeout callback once blk-mq decides they've
expired. If a device breaks, this could potentially create many thousands
of timed out commands. There's nothing of value to be gleaned from
observing each of those messages, so this patch adds a ratelimit on them.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ddfeb186d129..e4b91c246e36 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1251,7 +1251,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
  	 * returned to the driver, or if this is the admin queue.
 	 */
 	if (!nvmeq->qid || iod->aborted) {
-		dev_warn(dev->ctrl.device,
+		dev_warn_ratelimited(dev->ctrl.device,
 			 "I/O %d QID %d timeout, reset controller\n",
 			 req->tag, nvmeq->qid);
 		nvme_dev_disable(dev, false);
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/6] nvme: Sync request queues on reset
  2018-05-18 16:38 ` Keith Busch
@ 2018-05-18 22:32   ` Ming Lei
  -1 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-18 22:32 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-nvme, linux-block, Christoph Hellwig, Sagi Grimberg,
	Jens Axboe, Laurence Oberman, James Smart, Johannes Thumshirn

On Fri, May 18, 2018 at 10:38:18AM -0600, Keith Busch wrote:
> This patch fixes races that occur with simultaneous controller
> resets by synchronizing request queues prior to initializing the
> controller. Without this, a thread may attempt disabling a controller
> at the same time as we're trying to enable it.
> 
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> ---
>  drivers/nvme/host/core.c | 21 +++++++++++++++++++--
>  drivers/nvme/host/nvme.h |  1 +
>  drivers/nvme/host/pci.c  |  1 +
>  3 files changed, 21 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 99b857e5a7a9..1de68b56b318 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3471,6 +3471,12 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>  }
>  EXPORT_SYMBOL_GPL(nvme_init_ctrl);
>  
> +static void nvme_start_queue(struct nvme_ns *ns)
> +{
> +	blk_mq_unquiesce_queue(ns->queue);
> +	blk_mq_kick_requeue_list(ns->queue);
> +}
> +
>  /**
>   * nvme_kill_queues(): Ends all namespace queues
>   * @ctrl: the dead controller that needs to end
> @@ -3499,7 +3505,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
>  		blk_set_queue_dying(ns->queue);
>  
>  		/* Forcibly unquiesce queues to avoid blocking dispatch */
> -		blk_mq_unquiesce_queue(ns->queue);
> +		nvme_start_queue(ns);
>  	}
>  	up_read(&ctrl->namespaces_rwsem);
>  }
> @@ -3569,11 +3575,22 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
>  
>  	down_read(&ctrl->namespaces_rwsem);
>  	list_for_each_entry(ns, &ctrl->namespaces, list)
> -		blk_mq_unquiesce_queue(ns->queue);
> +		nvme_start_queue(ns);
>  	up_read(&ctrl->namespaces_rwsem);
>  }
>  EXPORT_SYMBOL_GPL(nvme_start_queues);
>  
> +void nvme_sync_queues(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_ns *ns;
> +
> +	down_read(&ctrl->namespaces_rwsem);
> +	list_for_each_entry(ns, &ctrl->namespaces, list)
> +		blk_sync_queue(ns->queue);
> +	up_read(&ctrl->namespaces_rwsem);
> +}
> +EXPORT_SYMBOL_GPL(nvme_sync_queues);

This way can't sync timeout reliably, since timeout events can
come from two NS at the same time, and one may be handled as
RESET_TIMER, and another one can be handled as EH_HANDLED.


Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 1/6] nvme: Sync request queues on reset
@ 2018-05-18 22:32   ` Ming Lei
  0 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-18 22:32 UTC (permalink / raw)


On Fri, May 18, 2018@10:38:18AM -0600, Keith Busch wrote:
> This patch fixes races that occur with simultaneous controller
> resets by synchronizing request queues prior to initializing the
> controller. Without this, a thread may attempt disabling a controller
> at the same time as we're trying to enable it.
> 
> Signed-off-by: Keith Busch <keith.busch at intel.com>
> ---
>  drivers/nvme/host/core.c | 21 +++++++++++++++++++--
>  drivers/nvme/host/nvme.h |  1 +
>  drivers/nvme/host/pci.c  |  1 +
>  3 files changed, 21 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 99b857e5a7a9..1de68b56b318 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3471,6 +3471,12 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>  }
>  EXPORT_SYMBOL_GPL(nvme_init_ctrl);
>  
> +static void nvme_start_queue(struct nvme_ns *ns)
> +{
> +	blk_mq_unquiesce_queue(ns->queue);
> +	blk_mq_kick_requeue_list(ns->queue);
> +}
> +
>  /**
>   * nvme_kill_queues(): Ends all namespace queues
>   * @ctrl: the dead controller that needs to end
> @@ -3499,7 +3505,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
>  		blk_set_queue_dying(ns->queue);
>  
>  		/* Forcibly unquiesce queues to avoid blocking dispatch */
> -		blk_mq_unquiesce_queue(ns->queue);
> +		nvme_start_queue(ns);
>  	}
>  	up_read(&ctrl->namespaces_rwsem);
>  }
> @@ -3569,11 +3575,22 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
>  
>  	down_read(&ctrl->namespaces_rwsem);
>  	list_for_each_entry(ns, &ctrl->namespaces, list)
> -		blk_mq_unquiesce_queue(ns->queue);
> +		nvme_start_queue(ns);
>  	up_read(&ctrl->namespaces_rwsem);
>  }
>  EXPORT_SYMBOL_GPL(nvme_start_queues);
>  
> +void nvme_sync_queues(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_ns *ns;
> +
> +	down_read(&ctrl->namespaces_rwsem);
> +	list_for_each_entry(ns, &ctrl->namespaces, list)
> +		blk_sync_queue(ns->queue);
> +	up_read(&ctrl->namespaces_rwsem);
> +}
> +EXPORT_SYMBOL_GPL(nvme_sync_queues);

This way can't sync timeout reliably, since timeout events can
come from two NS at the same time, and one may be handled as
RESET_TIMER, and another one can be handled as EH_HANDLED.


Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 3/6] nvme: Move all IO out of controller reset
  2018-05-18 16:38   ` Keith Busch
@ 2018-05-18 23:03     ` Ming Lei
  -1 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-18 23:03 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-nvme, linux-block, Christoph Hellwig, Sagi Grimberg,
	Jens Axboe, Laurence Oberman, James Smart, Johannes Thumshirn

On Fri, May 18, 2018 at 10:38:20AM -0600, Keith Busch wrote:
> IO may be retryable, so don't wait for them in the reset path. These
> commands may trigger a reset if that IO expires without a completion,
> placing it on the requeue list. Waiting for these would then deadlock
> the reset handler.
> 
> To fix the theoretical deadlock, this patch unblocks IO submission from
> the reset_work as before, but moves the waiting to the IO safe scan_work
> so that the reset_work may proceed to completion. Since the unfreezing
> happens in the controller LIVE state, the nvme device has to track if
> the queues were frozen now to prevent incorrect freeze depths.
> 
> This patch is also renaming the function 'nvme_dev_add' to a
> more appropriate name that describes what it's actually doing:
> nvme_alloc_io_tags.
> 
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> ---
>  drivers/nvme/host/core.c |  3 +++
>  drivers/nvme/host/nvme.h |  1 +
>  drivers/nvme/host/pci.c  | 46 +++++++++++++++++++++++++++++++++-------------
>  3 files changed, 37 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 1de68b56b318..34d7731f1419 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -214,6 +214,7 @@ static inline bool nvme_req_needs_retry(struct request *req)
>  	if (blk_noretry_request(req))
>  		return false;
>  	if (nvme_req(req)->status & NVME_SC_DNR)
> +
>  		return false;
>  	if (nvme_req(req)->retries >= nvme_max_retries)
>  		return false;
> @@ -3177,6 +3178,8 @@ static void nvme_scan_work(struct work_struct *work)
>  	struct nvme_id_ctrl *id;
>  	unsigned nn;
>  
> +	if (ctrl->ops->update_hw_ctx)
> +		ctrl->ops->update_hw_ctx(ctrl);
>  	if (ctrl->state != NVME_CTRL_LIVE)
>  		return;
>  
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index c15c2ee7f61a..230c5424b197 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -320,6 +320,7 @@ struct nvme_ctrl_ops {
>  	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
>  	int (*reinit_request)(void *data, struct request *rq);
>  	void (*stop_ctrl)(struct nvme_ctrl *ctrl);
> +	void (*update_hw_ctx)(struct nvme_ctrl *ctrl);
>  };
>  
>  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 2bd9d84f58d0..6a7cbc631d92 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -99,6 +99,7 @@ struct nvme_dev {
>  	u32 cmbloc;
>  	struct nvme_ctrl ctrl;
>  	struct completion ioq_wait;
> +	bool queues_froze;
>  
>  	/* shadow doorbell buffer support: */
>  	u32 *dbbuf_dbs;
> @@ -2065,10 +2066,33 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
>  	}
>  }
>  
> +static void nvme_pci_update_hw_ctx(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_dev *dev = to_nvme_dev(ctrl);
> +	bool unfreeze;
> +
> +	mutex_lock(&dev->shutdown_lock);
> +	unfreeze = dev->queues_froze;
> +	mutex_unlock(&dev->shutdown_lock);

What if nvme_dev_disable() just sets the .queues_froze flag and
userspace sends a RESCAN command at the same time?

> +
> +	if (unfreeze)
> +		nvme_wait_freeze(&dev->ctrl);
> +

a timeout may come just before or during blk_mq_update_nr_hw_queues() or
the above nvme_wait_freeze(), then both may hang forever.

> +	blk_mq_update_nr_hw_queues(ctrl->tagset, dev->online_queues - 1);
> +	nvme_free_queues(dev, dev->online_queues);
> +
> +	if (unfreeze)
> +		nvme_unfreeze(&dev->ctrl);
> +
> +	mutex_lock(&dev->shutdown_lock);
> +	dev->queues_froze = false;
> +	mutex_unlock(&dev->shutdown_lock);

If the running scan work is triggered via user-space, the above code
may clear the .queues_froze flag wrong.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
@ 2018-05-18 23:03     ` Ming Lei
  0 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-18 23:03 UTC (permalink / raw)


On Fri, May 18, 2018@10:38:20AM -0600, Keith Busch wrote:
> IO may be retryable, so don't wait for them in the reset path. These
> commands may trigger a reset if that IO expires without a completion,
> placing it on the requeue list. Waiting for these would then deadlock
> the reset handler.
> 
> To fix the theoretical deadlock, this patch unblocks IO submission from
> the reset_work as before, but moves the waiting to the IO safe scan_work
> so that the reset_work may proceed to completion. Since the unfreezing
> happens in the controller LIVE state, the nvme device has to track if
> the queues were frozen now to prevent incorrect freeze depths.
> 
> This patch is also renaming the function 'nvme_dev_add' to a
> more appropriate name that describes what it's actually doing:
> nvme_alloc_io_tags.
> 
> Signed-off-by: Keith Busch <keith.busch at intel.com>
> ---
>  drivers/nvme/host/core.c |  3 +++
>  drivers/nvme/host/nvme.h |  1 +
>  drivers/nvme/host/pci.c  | 46 +++++++++++++++++++++++++++++++++-------------
>  3 files changed, 37 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 1de68b56b318..34d7731f1419 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -214,6 +214,7 @@ static inline bool nvme_req_needs_retry(struct request *req)
>  	if (blk_noretry_request(req))
>  		return false;
>  	if (nvme_req(req)->status & NVME_SC_DNR)
> +
>  		return false;
>  	if (nvme_req(req)->retries >= nvme_max_retries)
>  		return false;
> @@ -3177,6 +3178,8 @@ static void nvme_scan_work(struct work_struct *work)
>  	struct nvme_id_ctrl *id;
>  	unsigned nn;
>  
> +	if (ctrl->ops->update_hw_ctx)
> +		ctrl->ops->update_hw_ctx(ctrl);
>  	if (ctrl->state != NVME_CTRL_LIVE)
>  		return;
>  
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index c15c2ee7f61a..230c5424b197 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -320,6 +320,7 @@ struct nvme_ctrl_ops {
>  	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
>  	int (*reinit_request)(void *data, struct request *rq);
>  	void (*stop_ctrl)(struct nvme_ctrl *ctrl);
> +	void (*update_hw_ctx)(struct nvme_ctrl *ctrl);
>  };
>  
>  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 2bd9d84f58d0..6a7cbc631d92 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -99,6 +99,7 @@ struct nvme_dev {
>  	u32 cmbloc;
>  	struct nvme_ctrl ctrl;
>  	struct completion ioq_wait;
> +	bool queues_froze;
>  
>  	/* shadow doorbell buffer support: */
>  	u32 *dbbuf_dbs;
> @@ -2065,10 +2066,33 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
>  	}
>  }
>  
> +static void nvme_pci_update_hw_ctx(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_dev *dev = to_nvme_dev(ctrl);
> +	bool unfreeze;
> +
> +	mutex_lock(&dev->shutdown_lock);
> +	unfreeze = dev->queues_froze;
> +	mutex_unlock(&dev->shutdown_lock);

What if nvme_dev_disable() just sets the .queues_froze flag and
userspace sends a RESCAN command at the same time?

> +
> +	if (unfreeze)
> +		nvme_wait_freeze(&dev->ctrl);
> +

a timeout may come just before or during blk_mq_update_nr_hw_queues() or
the above nvme_wait_freeze(), then both may hang forever.

> +	blk_mq_update_nr_hw_queues(ctrl->tagset, dev->online_queues - 1);
> +	nvme_free_queues(dev, dev->online_queues);
> +
> +	if (unfreeze)
> +		nvme_unfreeze(&dev->ctrl);
> +
> +	mutex_lock(&dev->shutdown_lock);
> +	dev->queues_froze = false;
> +	mutex_unlock(&dev->shutdown_lock);

If the running scan work is triggered via user-space, the above code
may clear the .queues_froze flag wrong.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/6] nvme: Sync request queues on reset
  2018-05-18 22:32   ` Ming Lei
@ 2018-05-18 23:44     ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 23:44 UTC (permalink / raw)
  To: Ming Lei
  Cc: Keith Busch, Jens Axboe, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Sat, May 19, 2018 at 06:32:11AM +0800, Ming Lei wrote:
> This way can't sync timeout reliably, since timeout events can
> come from two NS at the same time, and one may be handled as
> RESET_TIMER, and another one can be handled as EH_HANDLED.

You keep saying that, but the controller state is global to the
controller. It doesn't matter which namespace request_queue started the
reset: every namespaces request queue sees the RESETTING controller state
from the point the syncing occurs, and they don't return RESET_TIMER,
and on top of that, the reset reclaims every single IO command no matter
what namespace request_queue initiated the reset.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 1/6] nvme: Sync request queues on reset
@ 2018-05-18 23:44     ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-18 23:44 UTC (permalink / raw)


On Sat, May 19, 2018@06:32:11AM +0800, Ming Lei wrote:
> This way can't sync timeout reliably, since timeout events can
> come from two NS at the same time, and one may be handled as
> RESET_TIMER, and another one can be handled as EH_HANDLED.

You keep saying that, but the controller state is global to the
controller. It doesn't matter which namespace request_queue started the
reset: every namespaces request queue sees the RESETTING controller state
from the point the syncing occurs, and they don't return RESET_TIMER,
and on top of that, the reset reclaims every single IO command no matter
what namespace request_queue initiated the reset.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/6] nvme: Sync request queues on reset
  2018-05-18 23:44     ` Keith Busch
@ 2018-05-19  0:01       ` Ming Lei
  -1 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-19  0:01 UTC (permalink / raw)
  To: Keith Busch
  Cc: Keith Busch, Jens Axboe, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Fri, May 18, 2018 at 05:44:08PM -0600, Keith Busch wrote:
> On Sat, May 19, 2018 at 06:32:11AM +0800, Ming Lei wrote:
> > This way can't sync timeout reliably, since timeout events can
> > come from two NS at the same time, and one may be handled as
> > RESET_TIMER, and another one can be handled as EH_HANDLED.
> 
> You keep saying that, but the controller state is global to the
> controller. It doesn't matter which namespace request_queue started the
> reset: every namespaces request queue sees the RESETTING controller state

When timeouts come, the global state of RESETTING may not be updated
yet, so all the timeouts may not observe the state.

Please see my previous explanation:

https://marc.info/?l=linux-block&m=152600464317808&w=2


Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 1/6] nvme: Sync request queues on reset
@ 2018-05-19  0:01       ` Ming Lei
  0 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-19  0:01 UTC (permalink / raw)


On Fri, May 18, 2018@05:44:08PM -0600, Keith Busch wrote:
> On Sat, May 19, 2018@06:32:11AM +0800, Ming Lei wrote:
> > This way can't sync timeout reliably, since timeout events can
> > come from two NS at the same time, and one may be handled as
> > RESET_TIMER, and another one can be handled as EH_HANDLED.
> 
> You keep saying that, but the controller state is global to the
> controller. It doesn't matter which namespace request_queue started the
> reset: every namespaces request queue sees the RESETTING controller state

When timeouts come, the global state of RESETTING may not be updated
yet, so all the timeouts may not observe the state.

Please see my previous explanation:

https://marc.info/?l=linux-block&m=152600464317808&w=2


Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/6] nvme: Sync request queues on reset
  2018-05-19  0:01       ` Ming Lei
@ 2018-05-21 14:04         ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 14:04 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, Keith Busch, Johannes Thumshirn,
	Christoph Hellwig

On Sat, May 19, 2018 at 08:01:42AM +0800, Ming Lei wrote:
> > You keep saying that, but the controller state is global to the
> > controller. It doesn't matter which namespace request_queue started the
> > reset: every namespaces request queue sees the RESETTING controller state
> 
> When timeouts come, the global state of RESETTING may not be updated
> yet, so all the timeouts may not observe the state.

Even prior to the RESETTING state, every single command, no matter
which namespace or request_queue it came on, is reclaimed by the driver.
There *should* be no requests to timeout after nvme_dev_disable is called
because the nvme driver returned control of all requests in the tagset
to blk-mq.

In any case, if blk-mq decides it won't complete those requests, we
can just swap the order in the reset_work: sync first, unconditionally
disable. Does the following snippet look more okay?

---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 17a0190bd88f..42af077ee07a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2307,11 +2307,14 @@ static void nvme_reset_work(struct work_struct *work)
 		goto out;
 
 	/*
-	 * If we're called to reset a live controller first shut it down before
-	 * moving on.
+	 * Ensure there are no timeout work in progress prior to forcefully
+	 * disabling the queue. There is no harm in disabling the device even
+	 * when it was already disabled, as this will forcefully reclaim any
+	 * IOs that are stuck due to blk-mq's timeout handling that prevents
+	 * timed out requests from completing.
 	 */
-	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
-		nvme_dev_disable(dev, false);
+	nvme_sync_queues(&dev->ctrl);
+	nvme_dev_disable(dev, false);
 
 	/*
 	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
--

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 1/6] nvme: Sync request queues on reset
@ 2018-05-21 14:04         ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 14:04 UTC (permalink / raw)


On Sat, May 19, 2018@08:01:42AM +0800, Ming Lei wrote:
> > You keep saying that, but the controller state is global to the
> > controller. It doesn't matter which namespace request_queue started the
> > reset: every namespaces request queue sees the RESETTING controller state
> 
> When timeouts come, the global state of RESETTING may not be updated
> yet, so all the timeouts may not observe the state.

Even prior to the RESETTING state, every single command, no matter
which namespace or request_queue it came on, is reclaimed by the driver.
There *should* be no requests to timeout after nvme_dev_disable is called
because the nvme driver returned control of all requests in the tagset
to blk-mq.

In any case, if blk-mq decides it won't complete those requests, we
can just swap the order in the reset_work: sync first, unconditionally
disable. Does the following snippet look more okay?

---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 17a0190bd88f..42af077ee07a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2307,11 +2307,14 @@ static void nvme_reset_work(struct work_struct *work)
 		goto out;
 
 	/*
-	 * If we're called to reset a live controller first shut it down before
-	 * moving on.
+	 * Ensure there are no timeout work in progress prior to forcefully
+	 * disabling the queue. There is no harm in disabling the device even
+	 * when it was already disabled, as this will forcefully reclaim any
+	 * IOs that are stuck due to blk-mq's timeout handling that prevents
+	 * timed out requests from completing.
 	 */
-	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
-		nvme_dev_disable(dev, false);
+	nvme_sync_queues(&dev->ctrl);
+	nvme_dev_disable(dev, false);
 
 	/*
 	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
--

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [PATCH 3/6] nvme: Move all IO out of controller reset
  2018-05-18 23:03     ` Ming Lei
@ 2018-05-21 14:22       ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 14:22 UTC (permalink / raw)
  To: Ming Lei
  Cc: Keith Busch, Jens Axboe, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Sat, May 19, 2018 at 07:03:58AM +0800, Ming Lei wrote:
> On Fri, May 18, 2018 at 10:38:20AM -0600, Keith Busch wrote:
> > +
> > +	if (unfreeze)
> > +		nvme_wait_freeze(&dev->ctrl);
> > +
> 
> timeout may comes just before&during blk_mq_update_nr_hw_queues() or
> the above nvme_wait_freeze(), then both two may hang forever.

Why would it hang forever? The scan_work doesn't stop a timeout from
triggering a reset to reclaim requests necessary to complete a freeze.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
@ 2018-05-21 14:22       ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 14:22 UTC (permalink / raw)


On Sat, May 19, 2018@07:03:58AM +0800, Ming Lei wrote:
> On Fri, May 18, 2018@10:38:20AM -0600, Keith Busch wrote:
> > +
> > +	if (unfreeze)
> > +		nvme_wait_freeze(&dev->ctrl);
> > +
> 
> timeout may comes just before&during blk_mq_update_nr_hw_queues() or
> the above nvme_wait_freeze(), then both two may hang forever.

Why would it hang forever? The scan_work doesn't stop a timeout from
triggering a reset to reclaim requests necessary to complete a freeze.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 3/6] nvme: Move all IO out of controller reset
  2018-05-21 14:22       ` Keith Busch
@ 2018-05-21 14:58         ` Ming Lei
  -1 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-21 14:58 UTC (permalink / raw)
  To: Keith Busch
  Cc: Keith Busch, Jens Axboe, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Mon, May 21, 2018 at 08:22:19AM -0600, Keith Busch wrote:
> On Sat, May 19, 2018 at 07:03:58AM +0800, Ming Lei wrote:
> > On Fri, May 18, 2018 at 10:38:20AM -0600, Keith Busch wrote:
> > > +
> > > +	if (unfreeze)
> > > +		nvme_wait_freeze(&dev->ctrl);
> > > +
> > 
> > timeout may comes just before&during blk_mq_update_nr_hw_queues() or
> > the above nvme_wait_freeze(), then both two may hang forever.
> 
> Why would it hang forever? The scan_work doesn't stop a timeout from
> triggering a reset to reclaim requests necessary to complete a freeze.

nvme_dev_disable() will quiesce queues, then nvme_wait_freeze() or
blk_mq_update_nr_hw_queues() may hang forever.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
@ 2018-05-21 14:58         ` Ming Lei
  0 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-21 14:58 UTC (permalink / raw)


On Mon, May 21, 2018@08:22:19AM -0600, Keith Busch wrote:
> On Sat, May 19, 2018@07:03:58AM +0800, Ming Lei wrote:
> > On Fri, May 18, 2018@10:38:20AM -0600, Keith Busch wrote:
> > > +
> > > +	if (unfreeze)
> > > +		nvme_wait_freeze(&dev->ctrl);
> > > +
> > 
> > timeout may comes just before&during blk_mq_update_nr_hw_queues() or
> > the above nvme_wait_freeze(), then both two may hang forever.
> 
> Why would it hang forever? The scan_work doesn't stop a timeout from
> triggering a reset to reclaim requests necessary to complete a freeze.

nvme_dev_disable() will quiesce queues, then nvme_wait_freeze() or
blk_mq_update_nr_hw_queues() may hang forever.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 3/6] nvme: Move all IO out of controller reset
  2018-05-21 14:58         ` Ming Lei
@ 2018-05-21 15:03           ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 15:03 UTC (permalink / raw)
  To: Ming Lei
  Cc: Keith Busch, Jens Axboe, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Mon, May 21, 2018 at 10:58:51PM +0800, Ming Lei wrote:
> On Mon, May 21, 2018 at 08:22:19AM -0600, Keith Busch wrote:
> > On Sat, May 19, 2018 at 07:03:58AM +0800, Ming Lei wrote:
> > > On Fri, May 18, 2018 at 10:38:20AM -0600, Keith Busch wrote:
> > > > +
> > > > +	if (unfreeze)
> > > > +		nvme_wait_freeze(&dev->ctrl);
> > > > +
> > > 
> > > timeout may comes just before&during blk_mq_update_nr_hw_queues() or
> > > the above nvme_wait_freeze(), then both two may hang forever.
> > 
> > Why would it hang forever? The scan_work doesn't stop a timeout from
> > triggering a reset to reclaim requests necessary to complete a freeze.
> 
> nvme_dev_disable() will quiesce queues, then nvme_wait_freeze() or
> blk_mq_update_nr_hw_queues() may hang forever.

nvme_dev_disable is just the first part of the timeout sequence. You
have to follow it through to the reset_work that either restarts or
kills the queues.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
@ 2018-05-21 15:03           ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 15:03 UTC (permalink / raw)


On Mon, May 21, 2018@10:58:51PM +0800, Ming Lei wrote:
> On Mon, May 21, 2018@08:22:19AM -0600, Keith Busch wrote:
> > On Sat, May 19, 2018@07:03:58AM +0800, Ming Lei wrote:
> > > On Fri, May 18, 2018@10:38:20AM -0600, Keith Busch wrote:
> > > > +
> > > > +	if (unfreeze)
> > > > +		nvme_wait_freeze(&dev->ctrl);
> > > > +
> > > 
> > > timeout may comes just before&during blk_mq_update_nr_hw_queues() or
> > > the above nvme_wait_freeze(), then both two may hang forever.
> > 
> > Why would it hang forever? The scan_work doesn't stop a timeout from
> > triggering a reset to reclaim requests necessary to complete a freeze.
> 
> nvme_dev_disable() will quiesce queues, then nvme_wait_freeze() or
> blk_mq_update_nr_hw_queues() may hang forever.

nvme_dev_disable is just the first part of the timeout sequence. You
have to follow it through to the reset_work that either restarts or
kills the queues.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/6] nvme: Sync request queues on reset
  2018-05-21 14:04         ` Keith Busch
@ 2018-05-21 15:25           ` Ming Lei
  -1 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-21 15:25 UTC (permalink / raw)
  To: Keith Busch
  Cc: Jens Axboe, Keith Busch, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Mon, May 21, 2018 at 08:04:13AM -0600, Keith Busch wrote:
> On Sat, May 19, 2018 at 08:01:42AM +0800, Ming Lei wrote:
> > > You keep saying that, but the controller state is global to the
> > > controller. It doesn't matter which namespace request_queue started the
> > > reset: every namespaces request queue sees the RESETTING controller state
> > 
> > When timeouts come, the global state of RESETTING may not be updated
> > yet, so all the timeouts may not observe the state.
> 
> Even prior to the RESETING state, every single command, no matter
> which namespace or request_queue it came on, is reclaimed by the driver.
> There *should* be no requests to timeout after nvme_dev_disable is called
> because the nvme driver returned control of all requests in the tagset
> to blk-mq.

The timed-out requests won't be canceled by nvme_dev_disable().

If the timed-out request is handled as RESET_TIMER, a new
timeout event may be triggered again.

> 
> In any case, if blk-mq decides it won't complete those requests, we
> can just swap the order in the reset_work: sync first, uncondintionally
> disable. Does the following snippet look more okay?
> 
> ---
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 17a0190bd88f..42af077ee07a 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2307,11 +2307,14 @@ static void nvme_reset_work(struct work_struct *work)
>  		goto out;
>  
>  	/*
> -	 * If we're called to reset a live controller first shut it down before
> -	 * moving on.
> +	 * Ensure there are no timeout work in progress prior to forcefully
> +	 * disabling the queue. There is no harm in disabling the device even
> +	 * when it was already disabled, as this will forcefully reclaim any
> +	 * IOs that are stuck due to blk-mq's timeout handling that prevents
> +	 * timed out requests from completing.
>  	 */
> -	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
> -		nvme_dev_disable(dev, false);
> +	nvme_sync_queues(&dev->ctrl);
> +	nvme_dev_disable(dev, false);

That may not work reliably too.

For example, request A from NS_0 is timed-out and handled as RESET_TIMER,
meantime request B from NS_1 is timed-out and handled as EH_HANDLED.

When the above reset work is run for handling timeout of req B,
new timeout event on request A may come just between the above
nvme_sync_queues() and nvme_dev_disable(), then nvme_dev_disable()
can't cover request A, and finally the timed-out event for req A
will call nvme_dev_disable() while the current reset is still in progress,
then this reset can't move on, and IO hang is caused.


Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 1/6] nvme: Sync request queues on reset
@ 2018-05-21 15:25           ` Ming Lei
  0 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-21 15:25 UTC (permalink / raw)


On Mon, May 21, 2018@08:04:13AM -0600, Keith Busch wrote:
> On Sat, May 19, 2018@08:01:42AM +0800, Ming Lei wrote:
> > > You keep saying that, but the controller state is global to the
> > > controller. It doesn't matter which namespace request_queue started the
> > > reset: every namespaces request queue sees the RESETTING controller state
> > 
> > When timeouts come, the global state of RESETTING may not be updated
> > yet, so all the timeouts may not observe the state.
> 
> > Even prior to the RESETTING state, every single command, no matter
> which namespace or request_queue it came on, is reclaimed by the driver.
> There *should* be no requests to timeout after nvme_dev_disable is called
> because the nvme driver returned control of all requests in the tagset
> to blk-mq.

The timed-out requests won't be canceled by nvme_dev_disable().

If the timed-out request is handled as RESET_TIMER, there may
be a new timeout event triggered again.

> 
> In any case, if blk-mq decides it won't complete those requests, we
> can just swap the order in the reset_work: sync first, unconditionally
> disable. Does the following snippet look more okay?
> 
> ---
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 17a0190bd88f..42af077ee07a 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2307,11 +2307,14 @@ static void nvme_reset_work(struct work_struct *work)
>  		goto out;
>  
>  	/*
> -	 * If we're called to reset a live controller first shut it down before
> -	 * moving on.
> +	 * Ensure there are no timeout work in progress prior to forcefully
> +	 * disabling the queue. There is no harm in disabling the device even
> +	 * when it was already disabled, as this will forcefully reclaim any
> +	 * IOs that are stuck due to blk-mq's timeout handling that prevents
> +	 * timed out requests from completing.
>  	 */
> -	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
> -		nvme_dev_disable(dev, false);
> +	nvme_sync_queues(&dev->ctrl);
> +	nvme_dev_disable(dev, false);

That may not work reliably too.

For example, request A from NS_0 is timed-out and handled as RESET_TIMER,
meantime request B from NS_1 is timed-out and handled as EH_HANDLED.

When the above reset work is run for handling timeout of req B,
new timeout event on request A may come just between the above
nvme_sync_queues() and nvme_dev_disable(), then nvme_dev_disable()
can't cover request A, and finally the timed-out event for req A
will run nvme_dev_disable() while the current reset is just in progress,
then this reset can't move on, and IO hang is caused.


Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 3/6] nvme: Move all IO out of controller reset
  2018-05-21 15:03           ` Keith Busch
@ 2018-05-21 15:34             ` Ming Lei
  -1 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-21 15:34 UTC (permalink / raw)
  To: Keith Busch
  Cc: Jens Axboe, linux-block, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, Keith Busch, Johannes Thumshirn,
	Christoph Hellwig

On Mon, May 21, 2018 at 09:03:31AM -0600, Keith Busch wrote:
> On Mon, May 21, 2018 at 10:58:51PM +0800, Ming Lei wrote:
> > On Mon, May 21, 2018 at 08:22:19AM -0600, Keith Busch wrote:
> > > On Sat, May 19, 2018 at 07:03:58AM +0800, Ming Lei wrote:
> > > > On Fri, May 18, 2018 at 10:38:20AM -0600, Keith Busch wrote:
> > > > > +
> > > > > +	if (unfreeze)
> > > > > +		nvme_wait_freeze(&dev->ctrl);
> > > > > +
> > > > 
> > > > timeout may come just before & during blk_mq_update_nr_hw_queues() or
> > > > the above nvme_wait_freeze(), then both two may hang forever.
> > > 
> > > Why would it hang forever? The scan_work doesn't stop a timeout from
> > > triggering a reset to reclaim requests necessary to complete a freeze.
> > 
> > nvme_dev_disable() will quiesce queues, then nvme_wait_freeze() or
> > blk_mq_update_nr_hw_queues() may hang forever.
> 
> nvme_dev_disable is just the first part of the timeout sequence. You
> have to follow it through to the reset_work that either restarts or
> kills the queues.

nvme_dev_disable() quiesces queues first before killing queues.

If queues are quiesced during or before nvme_wait_freeze() is run
from the 2nd part of reset, the 2nd part can't move on, and IO hang
is caused. Finally no reset can be scheduled at all.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
@ 2018-05-21 15:34             ` Ming Lei
  0 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-21 15:34 UTC (permalink / raw)


On Mon, May 21, 2018@09:03:31AM -0600, Keith Busch wrote:
> On Mon, May 21, 2018@10:58:51PM +0800, Ming Lei wrote:
> > On Mon, May 21, 2018@08:22:19AM -0600, Keith Busch wrote:
> > > On Sat, May 19, 2018@07:03:58AM +0800, Ming Lei wrote:
> > > > On Fri, May 18, 2018@10:38:20AM -0600, Keith Busch wrote:
> > > > > +
> > > > > +	if (unfreeze)
> > > > > +		nvme_wait_freeze(&dev->ctrl);
> > > > > +
> > > > 
> > > > timeout may come just before & during blk_mq_update_nr_hw_queues() or
> > > > the above nvme_wait_freeze(), then both two may hang forever.
> > > 
> > > Why would it hang forever? The scan_work doesn't stop a timeout from
> > > triggering a reset to reclaim requests necessary to complete a freeze.
> > 
> > nvme_dev_disable() will quiesce queues, then nvme_wait_freeze() or
> > blk_mq_update_nr_hw_queues() may hang forever.
> 
> nvme_dev_disable is just the first part of the timeout sequence. You
> have to follow it through to the reset_work that either restarts or
> kills the queues.

nvme_dev_disable() quiesces queues first before killing queues.

If queues are quiesced during or before nvme_wait_freeze() is run
from the 2nd part of reset, the 2nd part can't move on, and IO hang
is caused. Finally no reset can be scheduled at all.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 3/6] nvme: Move all IO out of controller reset
  2018-05-21 15:34             ` Ming Lei
@ 2018-05-21 15:44               ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 15:44 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Keith Busch, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Mon, May 21, 2018 at 11:34:27PM +0800, Ming Lei wrote:
> nvme_dev_disable() quiesces queues first before killing queues.
> 
> If queues are quiesced during or before nvme_wait_freeze() is run
> from the 2nd part of reset, the 2nd part can't move on, and IO hang
> is caused. Finally no reset can be scheduled at all.

But this patch moves nvme_wait_freeze outside the reset path, so I'm
afraid I'm unable to follow how you've concluded the wait freeze is
somehow part of the reset.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
@ 2018-05-21 15:44               ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 15:44 UTC (permalink / raw)


On Mon, May 21, 2018@11:34:27PM +0800, Ming Lei wrote:
> nvme_dev_disable() quiesces queues first before killing queues.
> 
> If queues are quiesced during or before nvme_wait_freeze() is run
> from the 2nd part of reset, the 2nd part can't move on, and IO hang
> is caused. Finally no reset can be scheduled at all.

But this patch moves nvme_wait_freeze outside the reset path, so I'm
afraid I'm unable to follow how you've concluded the wait freeze is
somehow part of the reset.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/6] nvme: Sync request queues on reset
  2018-05-21 15:25           ` Ming Lei
@ 2018-05-21 15:59             ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 15:59 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Keith Busch, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Mon, May 21, 2018 at 11:25:43PM +0800, Ming Lei wrote:
> On Mon, May 21, 2018 at 08:04:13AM -0600, Keith Busch wrote:
> > On Sat, May 19, 2018 at 08:01:42AM +0800, Ming Lei wrote:
> > > > You keep saying that, but the controller state is global to the
> > > > controller. It doesn't matter which namespace request_queue started the
> > > > reset: every namespaces request queue sees the RESETTING controller state
> > > 
> > > When timeouts come, the global state of RESETTING may not be updated
> > > yet, so all the timeouts may not observe the state.
> > 
> > Even prior to the RESETTING state, every single command, no matter
> > which namespace or request_queue it came on, is reclaimed by the driver.
> > There *should* be no requests to timeout after nvme_dev_disable is called
> > because the nvme driver returned control of all requests in the tagset
> > to blk-mq.
> 
> The timed-out requests won't be canceled by nvme_dev_disable().

??? nvme_dev_disable cancels all started requests. There are no exceptions.

> If the timed-out request is handled as RESET_TIMER, there may
> be a new timeout event triggered again.
> 
> > 
> > In any case, if blk-mq decides it won't complete those requests, we
> > can just swap the order in the reset_work: sync first, unconditionally
> > disable. Does the following snippet look more okay?
> > 
> > ---
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index 17a0190bd88f..42af077ee07a 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -2307,11 +2307,14 @@ static void nvme_reset_work(struct work_struct *work)
> >  		goto out;
> >  
> >  	/*
> > -	 * If we're called to reset a live controller first shut it down before
> > -	 * moving on.
> > +	 * Ensure there are no timeout work in progress prior to forcefully
> > +	 * disabling the queue. There is no harm in disabling the device even
> > +	 * when it was already disabled, as this will forcefully reclaim any
> > +	 * IOs that are stuck due to blk-mq's timeout handling that prevents
> > +	 * timed out requests from completing.
> >  	 */
> > -	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
> > -		nvme_dev_disable(dev, false);
> > +	nvme_sync_queues(&dev->ctrl);
> > +	nvme_dev_disable(dev, false);
> 
> That may not work reliably too.
> 
> For example, request A from NS_0 is timed-out and handled as RESET_TIMER,
> meantime request B from NS_1 is timed-out and handled as EH_HANDLED.

Meanwhile, request B's nvme_dev_disable prior to returning EH_HANDLED
cancels all requests, which includes request A.

> When the above reset work is run for handling timeout of req B,
> new timeout event on request A may come just between the above
> nvme_sync_queues() and nvme_dev_disable()

The sync queues either stops the timer from running, or waits for it to
complete. We are in the RESETTING state: if request A's timeout happens
to be running, we're not restarting the timer; we're returning EH_HANDLED.

> then nvme_dev_disable()
> can't cover request A, and finally the timed-out event for req A
> will run nvme_dev_disable() while the current reset is just in progress,
> then this reset can't move on, and IO hang is caused.

At no point in the nvme_reset are we waiting for any IO to complete.
Reset continues to make forward progress.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 1/6] nvme: Sync request queues on reset
@ 2018-05-21 15:59             ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 15:59 UTC (permalink / raw)


On Mon, May 21, 2018@11:25:43PM +0800, Ming Lei wrote:
> On Mon, May 21, 2018@08:04:13AM -0600, Keith Busch wrote:
> > On Sat, May 19, 2018@08:01:42AM +0800, Ming Lei wrote:
> > > > You keep saying that, but the controller state is global to the
> > > > controller. It doesn't matter which namespace request_queue started the
> > > > reset: every namespaces request queue sees the RESETTING controller state
> > > 
> > > When timeouts come, the global state of RESETTING may not be updated
> > > yet, so all the timeouts may not observe the state.
> > 
> > Even prior to the RESETTING state, every single command, no matter
> > which namespace or request_queue it came on, is reclaimed by the driver.
> > There *should* be no requests to timeout after nvme_dev_disable is called
> > because the nvme driver returned control of all requests in the tagset
> > to blk-mq.
> 
> The timed-out requests won't be canceled by nvme_dev_disable().

??? nvme_dev_disable cancels all started requests. There are no exceptions.

> If the timed-out request is handled as RESET_TIMER, there may
> be a new timeout event triggered again.
> 
> > 
> > In any case, if blk-mq decides it won't complete those requests, we
> > can just swap the order in the reset_work: sync first, unconditionally
> > disable. Does the following snippet look more okay?
> > 
> > ---
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index 17a0190bd88f..42af077ee07a 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -2307,11 +2307,14 @@ static void nvme_reset_work(struct work_struct *work)
> >  		goto out;
> >  
> >  	/*
> > -	 * If we're called to reset a live controller first shut it down before
> > -	 * moving on.
> > +	 * Ensure there are no timeout work in progress prior to forcefully
> > +	 * disabling the queue. There is no harm in disabling the device even
> > +	 * when it was already disabled, as this will forcefully reclaim any
> > +	 * IOs that are stuck due to blk-mq's timeout handling that prevents
> > +	 * timed out requests from completing.
> >  	 */
> > -	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
> > -		nvme_dev_disable(dev, false);
> > +	nvme_sync_queues(&dev->ctrl);
> > +	nvme_dev_disable(dev, false);
> 
> That may not work reliably too.
> 
> For example, request A from NS_0 is timed-out and handled as RESET_TIMER,
> meantime request B from NS_1 is timed-out and handled as EH_HANDLED.

Meanwhile, request B's nvme_dev_disable prior to returning EH_HANDLED
cancels all requests, which includes request A.

> When the above reset work is run for handling timeout of req B,
> new timeout event on request A may come just between the above
> nvme_sync_queues() and nvme_dev_disable()

The sync queues either stops the timer from running, or waits for it to
complete. We are in the RESETTING state: if request A's timeout happens
to be running, we're not restarting the timer; we're returning EH_HANDLED.

> then nvme_dev_disable()
> can't cover request A, and finally the timed-out event for req A
> will run nvme_dev_disable() while the current reset is just in progress,
> then this reset can't move on, and IO hang is caused.

At no point in the nvme_reset are we waiting for any IO to complete.
Reset continues to make forward progress.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 3/6] nvme: Move all IO out of controller reset
  2018-05-21 15:44               ` Keith Busch
@ 2018-05-21 16:04                 ` Ming Lei
  -1 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-21 16:04 UTC (permalink / raw)
  To: Keith Busch
  Cc: Jens Axboe, Keith Busch, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Mon, May 21, 2018 at 09:44:33AM -0600, Keith Busch wrote:
> On Mon, May 21, 2018 at 11:34:27PM +0800, Ming Lei wrote:
> > nvme_dev_disable() quiesces queues first before killing queues.
> > 
> > If queues are quiesced during or before nvme_wait_freeze() is run
> > from the 2nd part of reset, the 2nd part can't move on, and IO hang
> > is caused. Finally no reset can be scheduled at all.
> 
> But this patch moves nvme_wait_freeze outside the reset path, so I'm
> afraid I'm unable to follow how you've concluded the wait freeze is
> somehow part of the reset.

For example:

1) the 1st timeout event:

- nvme_dev_disable()
- reset
- scan_work

2) the 2nd timeout event:

nvme_dev_disable() may come just after nvme_start_queues() in
the above reset of the 1st timeout. And nvme_timeout() won't
schedule a new reset since the controller state is NVME_CTRL_CONNECTING.

Then scan_work in the 1st timeout may still hang forever.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
@ 2018-05-21 16:04                 ` Ming Lei
  0 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-21 16:04 UTC (permalink / raw)


On Mon, May 21, 2018@09:44:33AM -0600, Keith Busch wrote:
> On Mon, May 21, 2018@11:34:27PM +0800, Ming Lei wrote:
> > nvme_dev_disable() quiesces queues first before killing queues.
> > 
> > If queues are quiesced during or before nvme_wait_freeze() is run
> > from the 2nd part of reset, the 2nd part can't move on, and IO hang
> > is caused. Finally no reset can be scheduled at all.
> 
> But this patch moves nvme_wait_freeze outside the reset path, so I'm
> afraid I'm unable to follow how you've concluded the wait freeze is
> somehow part of the reset.

For example:

1) the 1st timeout event:

- nvme_dev_disable()
- reset
- scan_work

2) the 2nd timeout event:

nvme_dev_disable() may come just after nvme_start_queues() in
the above reset of the 1st timeout. And nvme_timeout() won't
schedule a new reset since the controller state is NVME_CTRL_CONNECTING.

Then scan_work in the 1st timeout may still hang forever.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/6] nvme: Sync request queues on reset
  2018-05-21 15:59             ` Keith Busch
@ 2018-05-21 16:08               ` Ming Lei
  -1 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-21 16:08 UTC (permalink / raw)
  To: Keith Busch
  Cc: Jens Axboe, linux-block, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, Keith Busch, Johannes Thumshirn,
	Christoph Hellwig

On Mon, May 21, 2018 at 09:59:09AM -0600, Keith Busch wrote:
> On Mon, May 21, 2018 at 11:25:43PM +0800, Ming Lei wrote:
> > On Mon, May 21, 2018 at 08:04:13AM -0600, Keith Busch wrote:
> > > On Sat, May 19, 2018 at 08:01:42AM +0800, Ming Lei wrote:
> > > > > You keep saying that, but the controller state is global to the
> > > > > controller. It doesn't matter which namespace request_queue started the
> > > > > reset: every namespaces request queue sees the RESETTING controller state
> > > > 
> > > > When timeouts come, the global state of RESETTING may not be updated
> > > > yet, so all the timeouts may not observe the state.
> > > 
> > > Even prior to the RESETTING state, every single command, no matter
> > > which namespace or request_queue it came on, is reclaimed by the driver.
> > > There *should* be no requests to timeout after nvme_dev_disable is called
> > > because the nvme driver returned control of all requests in the tagset
> > > to blk-mq.
> > 
> > The timed-out requests won't be canceled by nvme_dev_disable().
> 
> ??? nvme_dev_disable cancels all started requests. There are no exceptions.

Please take a look at blk_mq_complete_request(). Even with Bart's
change, the request still won't be completed by driver. The request can
only be completed by either driver or blk-mq, not both.

> 
> > If the timed-out request is handled as RESET_TIMER, there may
> > be a new timeout event triggered again.
> > 
> > > 
> > > In any case, if blk-mq decides it won't complete those requests, we
> > > can just swap the order in the reset_work: sync first, unconditionally
> > > disable. Does the following snippet look more okay?
> > > 
> > > ---
> > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > > index 17a0190bd88f..42af077ee07a 100644
> > > --- a/drivers/nvme/host/pci.c
> > > +++ b/drivers/nvme/host/pci.c
> > > @@ -2307,11 +2307,14 @@ static void nvme_reset_work(struct work_struct *work)
> > >  		goto out;
> > >  
> > >  	/*
> > > -	 * If we're called to reset a live controller first shut it down before
> > > -	 * moving on.
> > > +	 * Ensure there are no timeout work in progress prior to forcefully
> > > +	 * disabling the queue. There is no harm in disabling the device even
> > > +	 * when it was already disabled, as this will forcefully reclaim any
> > > +	 * IOs that are stuck due to blk-mq's timeout handling that prevents
> > > +	 * timed out requests from completing.
> > >  	 */
> > > -	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
> > > -		nvme_dev_disable(dev, false);
> > > +	nvme_sync_queues(&dev->ctrl);
> > > +	nvme_dev_disable(dev, false);
> > 
> > That may not work reliably too.
> > 
> > For example, request A from NS_0 is timed-out and handled as RESET_TIMER,
> > meantime request B from NS_1 is timed-out and handled as EH_HANDLED.
> 
> Meanwhile, request B's nvme_dev_disable prior to returning EH_HANDLED
> cancels all requests, which includes request A.

No, please see my above comment.


Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 1/6] nvme: Sync request queues on reset
@ 2018-05-21 16:08               ` Ming Lei
  0 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-21 16:08 UTC (permalink / raw)


On Mon, May 21, 2018@09:59:09AM -0600, Keith Busch wrote:
> On Mon, May 21, 2018@11:25:43PM +0800, Ming Lei wrote:
> > On Mon, May 21, 2018@08:04:13AM -0600, Keith Busch wrote:
> > > On Sat, May 19, 2018@08:01:42AM +0800, Ming Lei wrote:
> > > > > You keep saying that, but the controller state is global to the
> > > > > controller. It doesn't matter which namespace request_queue started the
> > > > > reset: every namespaces request queue sees the RESETTING controller state
> > > > 
> > > > When timeouts come, the global state of RESETTING may not be updated
> > > > yet, so all the timeouts may not observe the state.
> > > 
> > > Even prior to the RESETTING state, every single command, no matter
> > > which namespace or request_queue it came on, is reclaimed by the driver.
> > > There *should* be no requests to timeout after nvme_dev_disable is called
> > > because the nvme driver returned control of all requests in the tagset
> > > to blk-mq.
> > 
> > The timed-out requests won't be canceled by nvme_dev_disable().
> 
> ??? nvme_dev_disable cancels all started requests. There are no exceptions.

Please take a look at blk_mq_complete_request(). Even with Bart's
change, the request still won't be completed by driver. The request can
only be completed by either driver or blk-mq, not both.

> 
> > If the timed-out request is handled as RESET_TIMER, there may
> > be a new timeout event triggered again.
> > 
> > > 
> > > In any case, if blk-mq decides it won't complete those requests, we
> > > can just swap the order in the reset_work: sync first, unconditionally
> > > disable. Does the following snippet look more okay?
> > > 
> > > ---
> > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > > index 17a0190bd88f..42af077ee07a 100644
> > > --- a/drivers/nvme/host/pci.c
> > > +++ b/drivers/nvme/host/pci.c
> > > @@ -2307,11 +2307,14 @@ static void nvme_reset_work(struct work_struct *work)
> > >  		goto out;
> > >  
> > >  	/*
> > > -	 * If we're called to reset a live controller first shut it down before
> > > -	 * moving on.
> > > +	 * Ensure there are no timeout work in progress prior to forcefully
> > > +	 * disabling the queue. There is no harm in disabling the device even
> > > +	 * when it was already disabled, as this will forcefully reclaim any
> > > +	 * IOs that are stuck due to blk-mq's timeout handling that prevents
> > > +	 * timed out requests from completing.
> > >  	 */
> > > -	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
> > > -		nvme_dev_disable(dev, false);
> > > +	nvme_sync_queues(&dev->ctrl);
> > > +	nvme_dev_disable(dev, false);
> > 
> > That may not work reliably too.
> > 
> > For example, request A from NS_0 is timed-out and handled as RESET_TIMER,
> > meantime request B from NS_1 is timed-out and handled as EH_HANDLED.
> 
> Meanwhile, request B's nvme_dev_disable prior to returning EH_HANDLED
> cancels all requests, which includes request A.

No, please see my above comment.


Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 3/6] nvme: Move all IO out of controller reset
  2018-05-21 16:04                 ` Ming Lei
@ 2018-05-21 16:23                   ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 16:23 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Keith Busch, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Tue, May 22, 2018 at 12:04:53AM +0800, Ming Lei wrote:
> On Mon, May 21, 2018 at 09:44:33AM -0600, Keith Busch wrote:
> > On Mon, May 21, 2018 at 11:34:27PM +0800, Ming Lei wrote:
> > > nvme_dev_disable() quiesces queues first before killing queues.
> > > 
> > > If queues are quiesced during or before nvme_wait_freeze() is run
> > > from the 2nd part of reset, the 2nd part can't move on, and IO hang
> > > is caused. Finally no reset can be scheduled at all.
> > 
> > But this patch moves nvme_wait_freeze outside the reset path, so I'm
> > afraid I'm unable to follow how you've concluded the wait freeze is
> > somehow part of the reset.
> 
> For example:
> 
> 1) the 1st timeout event:
> 
> - nvme_dev_disable()
> - reset
> - scan_work
> 
> 2) the 2nd timeout event:
> 
> nvme_dev_disable() may come just after nvme_start_queues() in
> the above reset of the 1st timeout. And nvme_timeout() won't
> schedule a new reset since the controller state is NVME_CTRL_CONNECTING.

Let me get this straight -- you're saying nvme_start_queues is going
to somehow immediately trigger timeout work? I can't see how that could
possibly happen in real life, but we can just remove it and use the existing
nvme_start_ctrl to handle that in the LIVE state.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
@ 2018-05-21 16:23                   ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 16:23 UTC (permalink / raw)


On Tue, May 22, 2018@12:04:53AM +0800, Ming Lei wrote:
> On Mon, May 21, 2018@09:44:33AM -0600, Keith Busch wrote:
> > On Mon, May 21, 2018@11:34:27PM +0800, Ming Lei wrote:
> > > nvme_dev_disable() quiesces queues first before killing queues.
> > > 
> > > If queues are quiesced during or before nvme_wait_freeze() is run
> > > from the 2nd part of reset, the 2nd part can't move on, and IO hang
> > > is caused. Finally no reset can be scheduled at all.
> > 
> > But this patch moves nvme_wait_freeze outside the reset path, so I'm
> > afraid I'm unable to follow how you've concluded the wait freeze is
> > somehow part of the reset.
> 
> For example:
> 
> 1) the 1st timeout event:
> 
> - nvme_dev_disable()
> - reset
> - scan_work
> 
> 2) the 2nd timeout event:
> 
> nvme_dev_disable() may come just after nvme_start_queues() in
> the above reset of the 1st timeout. And nvme_timeout() won't
> schedule a new reset since the controller state is NVME_CTRL_CONNECTING.

Let me get this straight -- you're saying nvme_start_queues is going
to somehow immediately trigger timeout work? I can't see how that could
possibly happen in real life, but we can just remove it and use the existing
nvme_start_ctrl to handle that in the LIVE state.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/6] nvme: Sync request queues on reset
  2018-05-21 16:08               ` Ming Lei
@ 2018-05-21 16:25                 ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 16:25 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Keith Busch, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Tue, May 22, 2018 at 12:08:37AM +0800, Ming Lei wrote:
> Please take a look at blk_mq_complete_request(). Even with Bart's
> change, the request still won't be completed by driver. The request can
> only be completed by either driver or blk-mq, not both.

So you're saying blk-mq can't complete a request the driver returned to
blk-mq to complete. And that's the nvme driver's problem to fix?

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 1/6] nvme: Sync request queues on reset
@ 2018-05-21 16:25                 ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-21 16:25 UTC (permalink / raw)


On Tue, May 22, 2018@12:08:37AM +0800, Ming Lei wrote:
> Please take a look at blk_mq_complete_request(). Even with Bart's
> change, the request still won't be completed by driver. The request can
> only be completed by either driver or blk-mq, not both.

So you're saying blk-mq can't complete a request the driver returned to
blk-mq to complete. And that's the nvme driver's problem to fix?

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 3/6] nvme: Move all IO out of controller reset
  2018-05-21 16:23                   ` Keith Busch
@ 2018-05-22  1:46                     ` Ming Lei
  -1 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-22  1:46 UTC (permalink / raw)
  To: Keith Busch
  Cc: Jens Axboe, Keith Busch, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, linux-block, Johannes Thumshirn,
	Christoph Hellwig

On Mon, May 21, 2018 at 10:23:55AM -0600, Keith Busch wrote:
> On Tue, May 22, 2018 at 12:04:53AM +0800, Ming Lei wrote:
> > On Mon, May 21, 2018 at 09:44:33AM -0600, Keith Busch wrote:
> > > On Mon, May 21, 2018 at 11:34:27PM +0800, Ming Lei wrote:
> > > > nvme_dev_disable() quiesces queues first before killing queues.
> > > > 
> > > > If queues are quiesced during or before nvme_wait_freeze() is run
> > > > from the 2nd part of reset, the 2nd part can't move on, and IO hang
> > > > is caused. Finally no reset can be scheduled at all.
> > > 
> > > But this patch moves nvme_wait_freeze outside the reset path, so I'm
> > > afraid I'm unable to follow how you've concluded the wait freeze is
> > > somehow part of the reset.
> > 
> > For example:
> > 
> > 1) the 1st timeout event:
> > 
> > - nvme_dev_disable()
> > - reset
> > - scan_work
> > 
> > 2) the 2nd timeout event:
> > 
> > nvme_dev_disable() may come just after nvme_start_queues() in
> > the above reset of the 1st timeout. And nvme_timeout() won't
> > schedule a new reset since the controller state is NVME_CTRL_CONNECTING.
> 
> Let me get this straight -- you're saying nvme_start_queues is going
> to somehow immediately trigger timeout work? I can't see how that could

It may be difficult to trigger in reality, but won't be impossible
since the timeout value can be adjusted via module parameter, and any
schedule delay can be introduced in one busy system.

It isn't a good practice to rely on timing for avoiding race, IMO.

> possibly happen in real life, but we can just remove it and use the existing
> nvme_start_ctrl to handle that in the LIVE state.

OK, please fix other issues mentioned in the following comment together,
then I will review further and see if it can work well.

https://marc.info/?l=linux-block&m=152668465710591&w=2

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
@ 2018-05-22  1:46                     ` Ming Lei
  0 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-22  1:46 UTC (permalink / raw)


On Mon, May 21, 2018 at 10:23:55AM -0600, Keith Busch wrote:
> On Tue, May 22, 2018 at 12:04:53AM +0800, Ming Lei wrote:
> > On Mon, May 21, 2018 at 09:44:33AM -0600, Keith Busch wrote:
> > > On Mon, May 21, 2018 at 11:34:27PM +0800, Ming Lei wrote:
> > > > nvme_dev_disable() quiesces queues first before killing queues.
> > > > 
> > > > If queues are quiesced during or before nvme_wait_freeze() is run
> > > > from the 2nd part of reset, the 2nd part can't move on, and IO hang
> > > > is caused. Finally no reset can be scheduled at all.
> > > 
> > > But this patch moves nvme_wait_freeze outside the reset path, so I'm
> > > afraid I'm unable to follow how you've concluded the wait freeze is
> > > somehow part of the reset.
> > 
> > For example:
> > 
> > 1) the 1st timeout event:
> > 
> > - nvme_dev_disable()
> > - reset
> > - scan_work
> > 
> > 2) the 2nd timeout event:
> > 
> > nvme_dev_disable() may come just after nvme_start_queues() in
> > the above reset of the 1st timeout. And nvme_timeout() won't
> > schedule a new reset since the controller state is NVME_CTRL_CONNECTING.
> 
> Let me get this straight -- you're saying nvme_start_queues is going
> to somehow immediately trigger timeout work? I can't see how that could

It may be difficult to trigger in reality, but won't be impossible
since the timeout value can be adjusted via module parameter, and any
schedule delay can be introduced in one busy system.

It isn't a good practice to rely on timing for avoiding race, IMO.

> possibly happen in real life, but we can just remove it and use the existing
> nvme_start_ctrl to handle that in the LIVE state.

OK, please fix other issues mentioned in the following comment together,
then I will review further and see if it can work well.

https://marc.info/?l=linux-block&m=152668465710591&w=2

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/6] nvme: Sync request queues on reset
  2018-05-21 16:25                 ` Keith Busch
@ 2018-05-22  1:56                   ` Ming Lei
  -1 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-22  1:56 UTC (permalink / raw)
  To: Keith Busch
  Cc: Jens Axboe, linux-block, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, Keith Busch, Johannes Thumshirn,
	Christoph Hellwig

On Mon, May 21, 2018 at 10:25:17AM -0600, Keith Busch wrote:
> On Tue, May 22, 2018 at 12:08:37AM +0800, Ming Lei wrote:
> > Please take a look at blk_mq_complete_request(). Even with Bart's
> > change, the request still won't be completed by driver. The request can
> > only be completed by either driver or blk-mq, not both.
> 
> So you're saying blk-mq can't complete a request the driver returned to
> blk-mq to complete. And that's the nvme driver's problem to fix?

For avoiding use-after-free, one request can only be completed by
one path, either by the timeout path or normal completion (irq or cancel)
from the driver.

So before handling this req's timeout, this request has to be
marked as completed by blk-mq timeout code already, then nvme_cancel_request()
can't cover this timed-out request.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 1/6] nvme: Sync request queues on reset
@ 2018-05-22  1:56                   ` Ming Lei
  0 siblings, 0 replies; 50+ messages in thread
From: Ming Lei @ 2018-05-22  1:56 UTC (permalink / raw)


On Mon, May 21, 2018 at 10:25:17AM -0600, Keith Busch wrote:
> On Tue, May 22, 2018 at 12:08:37AM +0800, Ming Lei wrote:
> > Please take a look at blk_mq_complete_request(). Even with Bart's
> > change, the request still won't be completed by driver. The request can
> > only be completed by either driver or blk-mq, not both.
> 
> So you're saying blk-mq can't complete a request the driver returned to
> blk-mq to complete. And that's the nvme driver's problem to fix?

For avoiding use-after-free, one request can only be completed by
one path, either by the timeout path or normal completion (irq or cancel)
from the driver.

So before handling this req's timeout, this request has to be
marked as completed by blk-mq timeout code already, then nvme_cancel_request()
can't cover this timed-out request.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 3/6] nvme: Move all IO out of controller reset
  2018-05-22  1:46                     ` Ming Lei
@ 2018-05-22 14:03                       ` Keith Busch
  -1 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-22 14:03 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Laurence Oberman, Sagi Grimberg,
	James Smart, linux-nvme, Keith Busch, Johannes Thumshirn,
	Christoph Hellwig

On Tue, May 22, 2018 at 09:46:32AM +0800, Ming Lei wrote:
> It isn't a good practice to rely on timing for avoiding race, IMO.

Sure thing, and it's easy enough to avoid this one.
 
> OK, please fix other issues mentioned in the following comment together,
> then I will review further and see if it can work well.
> 
> https://marc.info/?l=linux-block&m=152668465710591&w=2

Will do, v2 coming up by end of today.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 3/6] nvme: Move all IO out of controller reset
@ 2018-05-22 14:03                       ` Keith Busch
  0 siblings, 0 replies; 50+ messages in thread
From: Keith Busch @ 2018-05-22 14:03 UTC (permalink / raw)


On Tue, May 22, 2018 at 09:46:32AM +0800, Ming Lei wrote:
> It isn't a good practice to rely on timing for avoiding race, IMO.

Sure thing, and it's easy enough to avoid this one.
 
> OK, please fix other issues mentioned in the following comment together,
> then I will review further and see if it can work well.
> 
> https://marc.info/?l=linux-block&m=152668465710591&w=2

Will do, v2 coming up by end of today.

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2018-05-22 14:03 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-18 16:38 [PATCH 1/6] nvme: Sync request queues on reset Keith Busch
2018-05-18 16:38 ` Keith Busch
2018-05-18 16:38 ` [PATCH 2/6] nvme-pci: Fix queue freeze criteria " Keith Busch
2018-05-18 16:38   ` Keith Busch
2018-05-18 16:38 ` [PATCH 3/6] nvme: Move all IO out of controller reset Keith Busch
2018-05-18 16:38   ` Keith Busch
2018-05-18 23:03   ` Ming Lei
2018-05-18 23:03     ` Ming Lei
2018-05-21 14:22     ` Keith Busch
2018-05-21 14:22       ` Keith Busch
2018-05-21 14:58       ` Ming Lei
2018-05-21 14:58         ` Ming Lei
2018-05-21 15:03         ` Keith Busch
2018-05-21 15:03           ` Keith Busch
2018-05-21 15:34           ` Ming Lei
2018-05-21 15:34             ` Ming Lei
2018-05-21 15:44             ` Keith Busch
2018-05-21 15:44               ` Keith Busch
2018-05-21 16:04               ` Ming Lei
2018-05-21 16:04                 ` Ming Lei
2018-05-21 16:23                 ` Keith Busch
2018-05-21 16:23                   ` Keith Busch
2018-05-22  1:46                   ` Ming Lei
2018-05-22  1:46                     ` Ming Lei
2018-05-22 14:03                     ` Keith Busch
2018-05-22 14:03                       ` Keith Busch
2018-05-18 16:38 ` [PATCH 4/6] nvme: Allow reset from CONNECTING state Keith Busch
2018-05-18 16:38   ` Keith Busch
2018-05-18 16:38 ` [PATCH 5/6] nvme-pci: Attempt reset retry for IO failures Keith Busch
2018-05-18 16:38   ` Keith Busch
2018-05-18 16:38 ` [PATCH 6/6] nvme-pci: Rate limit the nvme timeout warnings Keith Busch
2018-05-18 16:38   ` Keith Busch
2018-05-18 22:32 ` [PATCH 1/6] nvme: Sync request queues on reset Ming Lei
2018-05-18 22:32   ` Ming Lei
2018-05-18 23:44   ` Keith Busch
2018-05-18 23:44     ` Keith Busch
2018-05-19  0:01     ` Ming Lei
2018-05-19  0:01       ` Ming Lei
2018-05-21 14:04       ` Keith Busch
2018-05-21 14:04         ` Keith Busch
2018-05-21 15:25         ` Ming Lei
2018-05-21 15:25           ` Ming Lei
2018-05-21 15:59           ` Keith Busch
2018-05-21 15:59             ` Keith Busch
2018-05-21 16:08             ` Ming Lei
2018-05-21 16:08               ` Ming Lei
2018-05-21 16:25               ` Keith Busch
2018-05-21 16:25                 ` Keith Busch
2018-05-22  1:56                 ` Ming Lei
2018-05-22  1:56                   ` Ming Lei

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.