* nvme abort and reset fixes
@ 2015-10-22 12:03 Christoph Hellwig
  2015-10-22 12:03 ` [PATCH 1/9] nvme: only add a controller to dev_list after it's been fully initialized Christoph Hellwig
                   ` (8 more replies)
  0 siblings, 9 replies; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 12:03 UTC (permalink / raw)


This series allows my controller, which doesn't like the small but aligned
discard requests, to recover slowly but safely.  To get there I unfortunately
had to turn most of the abort and reset path upside down.

This is on top of Keith's master branch with all the nvme driver split work in it.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 1/9] nvme: only add a controller to dev_list after it's been fully initialized
  2015-10-22 12:03 nvme abort and reset fixes Christoph Hellwig
@ 2015-10-22 12:03 ` Christoph Hellwig
  2015-10-22 12:03 ` [PATCH 2/9] nvme: don't take the I/O queue q_lock in nvme_timeout Christoph Hellwig
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 12:03 UTC (permalink / raw)


Without this we can easily get bad dereferences on nvmeq->q_db when the nvme
kthread tries to poll the CQs of controllers that are in a half-initialized
state.
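
The crash mode is the polling kthread walking dev_list and processing
completions on queues that aren't fully set up yet; roughly (a sketch of the
poller of that era, with field names assumed, not part of this patch):

	list_for_each_entry(dev, &dev_list, node) {
		for (i = 0; i < dev->queue_count; i++) {
			struct nvme_queue *nvmeq = dev->queues[i];

			spin_lock_irq(&nvmeq->q_lock);
			nvme_process_cq(nvmeq);	/* rings nvmeq->q_db */
			spin_unlock_irq(&nvmeq->q_lock);
		}
	}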

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/nvme/host/pci.c | 51 +++++++++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 21 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index cc0177c..fd53420 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2050,6 +2050,30 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
 	kthread_stop(kworker_task);
 }
 
+static int nvme_dev_list_add(struct nvme_dev *dev)
+{
+	bool start_thread = false;
+
+	spin_lock(&dev_list_lock);
+	if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
+		start_thread = true;
+		nvme_thread = NULL;
+	}
+	list_add(&dev->node, &dev_list);
+	spin_unlock(&dev_list_lock);
+
+	if (start_thread) {
+		nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
+		wake_up_all(&nvme_kthread_wait);
+	} else
+		wait_event_killable(nvme_kthread_wait, nvme_thread);
+
+	if (IS_ERR_OR_NULL(nvme_thread))
+		return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
+
+	return 0;
+}
+
 /*
 * Remove the node from the device list and check
 * for whether or not we need to stop the nvme_thread.
@@ -2165,7 +2189,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 static void nvme_probe_work(struct work_struct *work)
 {
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
-	bool start_thread = false;
 	int result;
 
 	result = nvme_dev_map(dev);
@@ -2176,25 +2199,6 @@ static void nvme_probe_work(struct work_struct *work)
 	if (result)
 		goto unmap;
 
-	spin_lock(&dev_list_lock);
-	if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
-		start_thread = true;
-		nvme_thread = NULL;
-	}
-	list_add(&dev->node, &dev_list);
-	spin_unlock(&dev_list_lock);
-
-	if (start_thread) {
-		nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
-		wake_up_all(&nvme_kthread_wait);
-	} else
-		wait_event_killable(nvme_kthread_wait, nvme_thread);
-
-	if (IS_ERR_OR_NULL(nvme_thread)) {
-		result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
-		goto disable;
-	}
-
 	nvme_init_queue(dev->queues[0], 0);
 	result = nvme_alloc_admin_tags(dev);
 	if (result)
@@ -2210,6 +2214,10 @@ static void nvme_probe_work(struct work_struct *work)
 
 	dev->ctrl.event_limit = 1;
 
+	result = nvme_dev_list_add(dev);
+	if (result)
+		goto remove;
+
 	/*
 	 * Keep the controller around but remove all namespaces if we don't have
 	 * any working I/O queue.
@@ -2224,6 +2232,8 @@ static void nvme_probe_work(struct work_struct *work)
 
 	return;
 
+ remove:
+	nvme_dev_list_remove(dev);
  free_tags:
 	nvme_dev_remove_admin(dev);
 	blk_put_queue(dev->ctrl.admin_q);
@@ -2231,7 +2241,6 @@ static void nvme_probe_work(struct work_struct *work)
 	dev->queues[0]->tags = NULL;
  disable:
 	nvme_disable_queue(dev, 0);
-	nvme_dev_list_remove(dev);
  unmap:
 	nvme_dev_unmap(dev);
  out:
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 2/9] nvme: don't take the I/O queue q_lock in nvme_timeout
  2015-10-22 12:03 nvme abort and reset fixes Christoph Hellwig
  2015-10-22 12:03 ` [PATCH 1/9] nvme: only add a controller to dev_list after it's been fully initialized Christoph Hellwig
@ 2015-10-22 12:03 ` Christoph Hellwig
  2015-10-22 12:03 ` [PATCH 3/9] nvme: merge nvme_abort_req and nvme_timeout Christoph Hellwig
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 12:03 UTC (permalink / raw)


There is nothing it protects, but it makes lockdep unhappy in many different
ways.

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/nvme/host/pci.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fd53420..11bba95 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1057,13 +1057,13 @@ static void nvme_abort_req(struct request *req)
 	struct nvme_command cmd;
 
 	if (!nvmeq->qid || cmd_rq->aborted) {
-		spin_lock(&dev_list_lock);
+		spin_lock_irq(&dev_list_lock);
 		if (!__nvme_reset(dev)) {
 			dev_warn(dev->dev,
 				 "I/O %d QID %d timeout, reset controller\n",
 				 req->tag, nvmeq->qid);
 		}
-		spin_unlock(&dev_list_lock);
+		spin_unlock_irq(&dev_list_lock);
 		return;
 	}
 
@@ -1127,9 +1127,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 
 	dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
 							nvmeq->qid);
-	spin_lock_irq(&nvmeq->q_lock);
 	nvme_abort_req(req);
-	spin_unlock_irq(&nvmeq->q_lock);
 
 	/*
 	 * The aborted req will be completed on receiving the abort req.
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 3/9] nvme: merge nvme_abort_req and nvme_timeout
  2015-10-22 12:03 nvme abort and reset fixes Christoph Hellwig
  2015-10-22 12:03 ` [PATCH 1/9] nvme: only add a controller to dev_list after it's been fully initialized Christoph Hellwig
  2015-10-22 12:03 ` [PATCH 2/9] nvme: don't take the I/O queue q_lock in nvme_timeout Christoph Hellwig
@ 2015-10-22 12:03 ` Christoph Hellwig
  2015-10-22 12:03 ` [PATCH 4/9] nvme: do not restart the request timeout if we're resetting the controller Christoph Hellwig
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 12:03 UTC (permalink / raw)


We want to be able to return better error values from nvme_timeout, which
is significantly easier if the two functions are merged.  Also clean up and
reduce the printk spew so that we only get one message per abort.

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/nvme/host/pci.c | 47 ++++++++++++++++++-----------------------------
 1 file changed, 18 insertions(+), 29 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 11bba95..644256c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1041,13 +1041,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
 	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 }
 
-/**
- * nvme_abort_req - Attempt aborting a request
- *
- * Schedule controller reset if the command was already aborted once before and
- * still hasn't been returned to the driver, or if this is the admin queue.
- */
-static void nvme_abort_req(struct request *req)
+static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = cmd_rq->nvmeq;
@@ -1056,6 +1050,11 @@ static void nvme_abort_req(struct request *req)
 	struct nvme_cmd_info *abort_cmd;
 	struct nvme_command cmd;
 
+	/*
+	 * Schedule controller reset if the command was already aborted once
+	 * before and still hasn't been returned to the driver, or if this is
+	 * the admin queue.
+	 */
 	if (!nvmeq->qid || cmd_rq->aborted) {
 		spin_lock_irq(&dev_list_lock);
 		if (!__nvme_reset(dev)) {
@@ -1064,16 +1063,16 @@ static void nvme_abort_req(struct request *req)
 				 req->tag, nvmeq->qid);
 		}
 		spin_unlock_irq(&dev_list_lock);
-		return;
+		return BLK_EH_RESET_TIMER;
 	}
 
 	if (!dev->ctrl.abort_limit)
-		return;
+		return BLK_EH_RESET_TIMER;
 
 	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, GFP_ATOMIC,
 									false);
 	if (IS_ERR(abort_req))
-		return;
+		return BLK_EH_RESET_TIMER;
 
 	abort_cmd = blk_mq_rq_to_pdu(abort_req);
 	nvme_set_info(abort_cmd, abort_req, abort_completion);
@@ -1087,9 +1086,16 @@ static void nvme_abort_req(struct request *req)
 	--dev->ctrl.abort_limit;
 	cmd_rq->aborted = 1;
 
-	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
-							nvmeq->qid);
+	dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n",
+				 req->tag, nvmeq->qid);
 	nvme_submit_cmd(dev->queues[0], &cmd);
+
+	/*
+	 * The aborted req will be completed on receiving the abort req.
+	 * We enable the timer again. If hit twice, it'll cause a device reset,
+	 * as the device then is in a faulty state.
+	 */
+	return BLK_EH_RESET_TIMER;
 }
 
 static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
@@ -1120,23 +1126,6 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved
 	fn(nvmeq, ctx, &cqe);
 }
 
-static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
-{
-	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
-	struct nvme_queue *nvmeq = cmd->nvmeq;
-
-	dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
-							nvmeq->qid);
-	nvme_abort_req(req);
-
-	/*
-	 * The aborted req will be completed on receiving the abort req.
-	 * We enable the timer again. If hit twice, it'll cause a device reset,
-	 * as the device then is in a faulty state.
-	 */
-	return BLK_EH_RESET_TIMER;
-}
-
 static void nvme_free_queue(struct nvme_queue *nvmeq)
 {
 	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 4/9] nvme: do not restart the request timeout if we're resetting the controller
  2015-10-22 12:03 nvme abort and reset fixes Christoph Hellwig
                   ` (2 preceding siblings ...)
  2015-10-22 12:03 ` [PATCH 3/9] nvme: merge nvme_abort_req and nvme_timeout Christoph Hellwig
@ 2015-10-22 12:03 ` Christoph Hellwig
  2015-10-22 16:27   ` Busch, Keith
  2015-10-22 12:03 ` [PATCH 5/9] nvme: simplify resets Christoph Hellwig
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 12:03 UTC (permalink / raw)


Otherwise we're never going to complete a command when it is restarted just
after we completed all other outstanding commands in nvme_clear_queue.
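
For reference, the blk-mq timeout verdicts involved here roughly mean the
following (paraphrased from the block layer of that era, not from this patch):

	/*
	 *  BLK_EH_HANDLED     - the driver has completed (or will complete)
	 *                       the request itself
	 *  BLK_EH_RESET_TIMER - rearm the timer and keep waiting for the
	 *                       command to come back
	 *  BLK_EH_NOT_HANDLED - fall back to the block layer's default
	 *                       handling
	 */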

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/nvme/host/pci.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 644256c..a3250b8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1063,7 +1063,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 				 req->tag, nvmeq->qid);
 		}
 		spin_unlock_irq(&dev_list_lock);
-		return BLK_EH_RESET_TIMER;
+
+		req->errors = -EIO;
+		return BLK_EH_HANDLED;
 	}
 
 	if (!dev->ctrl.abort_limit)
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 5/9] nvme: simplify resets
  2015-10-22 12:03 nvme abort and reset fixes Christoph Hellwig
                   ` (3 preceding siblings ...)
  2015-10-22 12:03 ` [PATCH 4/9] nvme: do not restart the request timeout if we're resetting the controller Christoph Hellwig
@ 2015-10-22 12:03 ` Christoph Hellwig
  2015-10-22 12:03 ` [PATCH 6/9] nvme: abort requests on the requeue list when shutting down a controller Christoph Hellwig
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 12:03 UTC (permalink / raw)


Don't delete the controller from dev_list before queuing a reset; instead
just check in the polling kthread whether it is being reset.  This allows us
to remove the dev_list_lock in various places, and in addition we can simply
rely on the queue_work return value to see whether we could reset a controller.

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/nvme/host/pci.c | 39 +++++++++++++--------------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a3250b8..fb44100 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -75,7 +75,6 @@ static wait_queue_head_t nvme_kthread_wait;
 struct nvme_dev;
 struct nvme_queue;
 
-static int __nvme_reset(struct nvme_dev *dev);
 static int nvme_reset(struct nvme_dev *dev);
 static int nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_dead_ctrl(struct nvme_dev *dev);
@@ -1056,13 +1055,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 * the admin queue.
 	 */
 	if (!nvmeq->qid || cmd_rq->aborted) {
-		spin_lock_irq(&dev_list_lock);
-		if (!__nvme_reset(dev)) {
+		if (queue_work(nvme_workq, &dev->reset_work)) {
 			dev_warn(dev->dev,
 				 "I/O %d QID %d timeout, reset controller\n",
 				 req->tag, nvmeq->qid);
 		}
-		spin_unlock_irq(&dev_list_lock);
 
 		req->errors = -EIO;
 		return BLK_EH_HANDLED;
@@ -1555,9 +1552,15 @@ static int nvme_kthread(void *data)
 			int i;
 			u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
+			/*
+			 * Skip controllers currently under reset.
+			 */
+			if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work))
+				continue;
+
 			if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
 							csts & NVME_CSTS_CFS) {
-				if (!__nvme_reset(dev)) {
+				if (queue_work(nvme_workq, &dev->reset_work)) {
 					dev_warn(dev->dev,
 						"Failed status: %x, reset controller\n",
 						readl(dev->bar + NVME_REG_CSTS));
@@ -2282,33 +2285,17 @@ static void nvme_reset_work(struct work_struct *ws)
 	schedule_work(&dev->probe_work);
 }
 
-static int __nvme_reset(struct nvme_dev *dev)
-{
-	if (work_pending(&dev->reset_work))
-		return -EBUSY;
-	list_del_init(&dev->node);
-	queue_work(nvme_workq, &dev->reset_work);
-	return 0;
-}
-
 static int nvme_reset(struct nvme_dev *dev)
 {
-	int ret;
-
 	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
 		return -ENODEV;
 
-	spin_lock(&dev_list_lock);
-	ret = __nvme_reset(dev);
-	spin_unlock(&dev_list_lock);
-
-	if (!ret) {
-		flush_work(&dev->reset_work);
-		flush_work(&dev->probe_work);
-		return 0;
-	}
+	if (!queue_work(nvme_workq, &dev->reset_work))
+		return -EBUSY;
 
-	return ret;
+	flush_work(&dev->reset_work);
+	flush_work(&dev->probe_work);
+	return 0;
 }
 
 static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 6/9] nvme: abort requests on the requeue list when shutting down a controller
  2015-10-22 12:03 nvme abort and reset fixes Christoph Hellwig
                   ` (4 preceding siblings ...)
  2015-10-22 12:03 ` [PATCH 5/9] nvme: simplify resets Christoph Hellwig
@ 2015-10-22 12:03 ` Christoph Hellwig
  2015-10-22 14:44   ` Busch, Keith
  2015-10-22 12:03 ` [PATCH 7/9] nvme: merge probe_work and reset_work Christoph Hellwig
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 12:03 UTC (permalink / raw)


Otherwise we might never complete them as nvme_clear_queue won't find these
requests.

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/nvme/host/pci.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fb44100..73d015d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2116,11 +2116,15 @@ static void nvme_unfreeze_queues(struct nvme_dev *dev)
 
 static void nvme_dev_shutdown(struct nvme_dev *dev)
 {
+	struct nvme_ns *ns;
 	int i;
 	u32 csts = -1;
 
 	nvme_dev_list_remove(dev);
 
+	list_for_each_entry(ns, &dev->ctrl.namespaces, list)
+		blk_mq_abort_requeue_list(ns->queue);
+
 	if (dev->bar) {
 		nvme_freeze_queues(dev);
 		csts = readl(dev->bar + NVME_REG_CSTS);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 7/9] nvme: merge probe_work and reset_work
  2015-10-22 12:03 nvme abort and reset fixes Christoph Hellwig
                   ` (5 preceding siblings ...)
  2015-10-22 12:03 ` [PATCH 6/9] nvme: abort requests on the requeue list when shutting down a controller Christoph Hellwig
@ 2015-10-22 12:03 ` Christoph Hellwig
  2015-10-22 12:03 ` [PATCH 8/9] nvme: remove dead controllers from a work item Christoph Hellwig
  2015-10-22 12:03 ` [PATCH 9/9] nvme: switch abort_limit to an atomic_t Christoph Hellwig
  8 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 12:03 UTC (permalink / raw)


If we're using two work queues we're always going to run into races where
one item is tearing down what the other one is initializing.  So instead
merge the two work queues, and let the old probe_work also tear the
controller down first if it was alive.  Together with the better detection
of the probe path using a flag this gives us a properly serialized
reset/probe path that also doesn't accidentally trigger when two commands
time out and the second one tries to reset the controller while the first
reset is still in progress.

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/nvme/host/pci.c | 59 ++++++++++++++++++++++---------------------------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 73d015d..696749d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -109,7 +109,6 @@ struct nvme_dev {
 	struct msix_entry *entry;
 	void __iomem *bar;
 	struct work_struct reset_work;
-	struct work_struct probe_work;
 	struct work_struct scan_work;
 	bool subsystem;
 	u32 page_size;
@@ -117,6 +116,8 @@ struct nvme_dev {
 	dma_addr_t cmb_dma_addr;
 	u64 cmb_size;
 	u32 cmbsz;
+	unsigned long flags;
+#define NVME_CTRL_RESETTING	0
 
 	struct nvme_ctrl ctrl;
 };
@@ -2182,11 +2183,29 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	kfree(dev);
 }
 
-static void nvme_probe_work(struct work_struct *work)
+static void nvme_reset_work(struct work_struct *work)
 {
-	struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
+	struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
 	int result;
 
+	/*
+	 * Fail this device if reset occured during probe to avoid infinite
+	 * initialization loops.
+	 */
+	if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
+		nvme_dead_ctrl(dev);
+		return;
+	}
+
+	/*
+	 * If we're called to reset a live controller first shut it down before
+	 * moving on.
+	 */
+	if (dev->bar)
+		nvme_dev_shutdown(dev);
+
+	set_bit(NVME_CTRL_RESETTING, &dev->flags);
+
 	result = nvme_dev_map(dev);
 	if (result)
 		goto out;
@@ -2226,6 +2245,7 @@ static void nvme_probe_work(struct work_struct *work)
 		nvme_dev_add(dev);
 	}
 
+	clear_bit(NVME_CTRL_RESETTING, &dev->flags);
 	return;
 
  remove:
@@ -2240,7 +2260,7 @@ static void nvme_probe_work(struct work_struct *work)
  unmap:
 	nvme_dev_unmap(dev);
  out:
-	if (!work_busy(&dev->reset_work))
+	if (!work_pending(&dev->reset_work))
 		nvme_dead_ctrl(dev);
 }
 
@@ -2267,28 +2287,6 @@ static void nvme_dead_ctrl(struct nvme_dev *dev)
 	}
 }
 
-static void nvme_reset_work(struct work_struct *ws)
-{
-	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
-	bool in_probe = work_busy(&dev->probe_work);
-
-	nvme_dev_shutdown(dev);
-
-	/* Synchronize with device probe so that work will see failure status
-	 * and exit gracefully without trying to schedule another reset */
-	flush_work(&dev->probe_work);
-
-	/* Fail this device if reset occured during probe to avoid
-	 * infinite initialization loops. */
-	if (in_probe) {
-		nvme_dead_ctrl(dev);
-		return;
-	}
-	/* Schedule device resume asynchronously so the reset work is available
-	 * to cleanup errors that may occur during reinitialization */
-	schedule_work(&dev->probe_work);
-}
-
 static int nvme_reset(struct nvme_dev *dev)
 {
 	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
@@ -2298,7 +2296,6 @@ static int nvme_reset(struct nvme_dev *dev)
 		return -EBUSY;
 
 	flush_work(&dev->reset_work);
-	flush_work(&dev->probe_work);
 	return 0;
 }
 
@@ -2367,7 +2364,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	INIT_LIST_HEAD(&dev->node);
 	INIT_WORK(&dev->scan_work, nvme_dev_scan);
-	INIT_WORK(&dev->probe_work, nvme_probe_work);
 	INIT_WORK(&dev->reset_work, nvme_reset_work);
 
 	result = nvme_setup_prp_pools(dev);
@@ -2379,7 +2375,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result)
 		goto release_pools;
 
-	schedule_work(&dev->probe_work);
+	schedule_work(&dev->reset_work);
 	return 0;
 
  release_pools:
@@ -2400,7 +2396,7 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
 	if (prepare)
 		nvme_dev_shutdown(dev);
 	else
-		schedule_work(&dev->probe_work);
+		schedule_work(&dev->reset_work);
 }
 
 static void nvme_shutdown(struct pci_dev *pdev)
@@ -2418,7 +2414,6 @@ static void nvme_remove(struct pci_dev *pdev)
 	spin_unlock(&dev_list_lock);
 
 	pci_set_drvdata(pdev, NULL);
-	flush_work(&dev->probe_work);
 	flush_work(&dev->reset_work);
 	flush_work(&dev->scan_work);
 	nvme_remove_namespaces(&dev->ctrl);
@@ -2452,7 +2447,7 @@ static int nvme_resume(struct device *dev)
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct nvme_dev *ndev = pci_get_drvdata(pdev);
 
-	schedule_work(&ndev->probe_work);
+	schedule_work(&ndev->reset_work);
 	return 0;
 }
 #endif
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 8/9] nvme: remove dead controllers from a work item
  2015-10-22 12:03 nvme abort and reset fixes Christoph Hellwig
                   ` (6 preceding siblings ...)
  2015-10-22 12:03 ` [PATCH 7/9] nvme: merge probe_work and reset_work Christoph Hellwig
@ 2015-10-22 12:03 ` Christoph Hellwig
  2015-10-22 18:10   ` Busch, Keith
  2015-10-22 12:03 ` [PATCH 9/9] nvme: switch abort_limit to an atomic_t Christoph Hellwig
  8 siblings, 1 reply; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 12:03 UTC (permalink / raw)


Compared to the kthread this gives us multiple call prevention for free,
since a work item that is already pending won't be queued a second time.

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/nvme/host/pci.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 696749d..e1fe093 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -77,7 +77,7 @@ struct nvme_queue;
 
 static int nvme_reset(struct nvme_dev *dev);
 static int nvme_process_cq(struct nvme_queue *nvmeq);
-static void nvme_dead_ctrl(struct nvme_dev *dev);
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
 
 struct async_cmd_info {
 	struct kthread_work work;
@@ -110,6 +110,7 @@ struct nvme_dev {
 	void __iomem *bar;
 	struct work_struct reset_work;
 	struct work_struct scan_work;
+	struct work_struct remove_work;
 	bool subsystem;
 	u32 page_size;
 	void __iomem *cmb;
@@ -2193,7 +2194,7 @@ static void nvme_reset_work(struct work_struct *work)
 	 * initialization loops.
 	 */
 	if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
-		nvme_dead_ctrl(dev);
+		nvme_remove_dead_ctrl(dev);
 		return;
 	}
 
@@ -2260,31 +2261,25 @@ static void nvme_reset_work(struct work_struct *work)
  unmap:
 	nvme_dev_unmap(dev);
  out:
-	if (!work_pending(&dev->reset_work))
-		nvme_dead_ctrl(dev);
+	nvme_remove_dead_ctrl(dev);
 }
 
-static int nvme_remove_dead_ctrl(void *arg)
+static void nvme_remove_dead_ctrl_work(struct work_struct *work)
 {
-	struct nvme_dev *dev = (struct nvme_dev *)arg;
+	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	if (pci_get_drvdata(pdev))
 		pci_stop_and_remove_bus_device_locked(pdev);
 	nvme_put_ctrl(&dev->ctrl);
-	return 0;
 }
 
-static void nvme_dead_ctrl(struct nvme_dev *dev)
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
 {
-	dev_warn(dev->dev, "Device failed to resume\n");
+	dev_warn(dev->dev, "Removing after probe failure\n");
 	kref_get(&dev->ctrl.kref);
-	if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
-						dev->ctrl.instance))) {
-		dev_err(dev->dev,
-			"Failed to start controller remove task\n");
+	if (!schedule_work(&dev->remove_work))
 		nvme_put_ctrl(&dev->ctrl);
-	}
 }
 
 static int nvme_reset(struct nvme_dev *dev)
@@ -2365,6 +2360,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	INIT_LIST_HEAD(&dev->node);
 	INIT_WORK(&dev->scan_work, nvme_dev_scan);
 	INIT_WORK(&dev->reset_work, nvme_reset_work);
+	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
 
 	result = nvme_setup_prp_pools(dev);
 	if (result)
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 9/9] nvme: switch abort_limit to an atomic_t
  2015-10-22 12:03 nvme abort and reset fixes Christoph Hellwig
                   ` (7 preceding siblings ...)
  2015-10-22 12:03 ` [PATCH 8/9] nvme: remove dead controllers from a work item Christoph Hellwig
@ 2015-10-22 12:03 ` Christoph Hellwig
  8 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 12:03 UTC (permalink / raw)


There is no lock to synchronize access to the abort_limit field of
struct nvme_ctrl, so switch it to an atomic_t.

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/nvme/host/core.c | 2 +-
 drivers/nvme/host/nvme.h | 2 +-
 drivers/nvme/host/pci.c  | 9 +++++----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index edfc75e..f4075f1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -625,7 +625,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	}
 
 	ctrl->oncs = le16_to_cpup(&id->oncs);
-	ctrl->abort_limit = id->acl + 1;
+	atomic_set(&ctrl->abort_limit, id->acl + 1);
 	ctrl->vwc = id->vwc;
 	memcpy(ctrl->serial, id->sn, sizeof(id->sn));
 	memcpy(ctrl->model, id->mn, sizeof(id->mn));
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index da63835..1658048 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -56,7 +56,7 @@ struct nvme_ctrl {
 	u32 stripe_size;
 	u32 page_size;
 	u16 oncs;
-	u16 abort_limit;
+	atomic_t abort_limit;
 	u8 event_limit;
 	u8 vwc;
 	u32 vs;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e1fe093..794ca2c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -384,7 +384,7 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
 	blk_mq_free_request(req);
 
 	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
-	++nvmeq->dev->ctrl.abort_limit;
+	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
 }
 
 static void async_completion(struct nvme_queue *nvmeq, void *ctx,
@@ -1067,13 +1067,15 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		return BLK_EH_HANDLED;
 	}
 
-	if (!dev->ctrl.abort_limit)
+	if (atomic_dec_and_test(&dev->ctrl.abort_limit))
 		return BLK_EH_RESET_TIMER;
 
 	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, GFP_ATOMIC,
 									false);
-	if (IS_ERR(abort_req))
+	if (IS_ERR(abort_req)) {
+		atomic_inc(&dev->ctrl.abort_limit);
 		return BLK_EH_RESET_TIMER;
+	}
 
 	abort_cmd = blk_mq_rq_to_pdu(abort_req);
 	nvme_set_info(abort_cmd, abort_req, abort_completion);
@@ -1084,7 +1086,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
 	cmd.abort.command_id = abort_req->tag;
 
-	--dev->ctrl.abort_limit;
 	cmd_rq->aborted = 1;
 
 	dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n",
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 6/9] nvme: abort requests on the requeue list when shutting down a controller
  2015-10-22 12:03 ` [PATCH 6/9] nvme: abort requests on the requeue list when shutting down a controller Christoph Hellwig
@ 2015-10-22 14:44   ` Busch, Keith
  2015-10-22 14:58     ` Christoph Hellwig
  0 siblings, 1 reply; 24+ messages in thread
From: Busch, Keith @ 2015-10-22 14:44 UTC (permalink / raw)


On Thu, Oct 22, 2015@02:03:38PM +0200, Christoph Hellwig wrote:
> Otherwise we might never complete them as nvme_clear_queue won't find these
> requests.

With this change, requests will be ended in failure even though they
may have succeeded immediately after a reset. Nvme's unfreeze kicks the
namespaces' requeue_list just for that reason.

We shouldn't want to abort the requeue list unless we're freeing the
request_queue too, right?

I'm still trying to figure out what gap this is filling.

>  static void nvme_dev_shutdown(struct nvme_dev *dev)
>  {
> +	struct nvme_ns *ns;
>  	int i;
>  	u32 csts = -1;
>  
>  	nvme_dev_list_remove(dev);
>  
> +	list_for_each_entry(ns, &dev->ctrl.namespaces, list)
> +		blk_mq_abort_requeue_list(ns->queue);
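
A minimal sketch of the distinction being drawn here, reusing the loop from
the patch and the existing blk-mq helpers (an illustration, not code from
either tree): only abort the requeue list when the queue itself is going
away, and otherwise kick it so the requests get resubmitted after the reset.

	list_for_each_entry(ns, &dev->ctrl.namespaces, list) {
		if (blk_queue_dying(ns->queue))
			blk_mq_abort_requeue_list(ns->queue);
		else
			blk_mq_kick_requeue_list(ns->queue);
	}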

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 6/9] nvme: abort requests on the requeue list when shutting down a controller
  2015-10-22 14:44   ` Busch, Keith
@ 2015-10-22 14:58     ` Christoph Hellwig
  2015-10-22 15:16       ` Busch, Keith
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 14:58 UTC (permalink / raw)


On Thu, Oct 22, 2015@02:44:19PM +0000, Busch, Keith wrote:
> With this change, requests will be ended in failure even though they
> may have succeeded immediately after a reset. Nvme's unfreeze kicks the
> namespaces' requeue_list just for that reason.
> 
> We shouldn't want to abort the requeue list unless we're freeing the
> request_queue too, right?

We're aborting all active commands through nvme_dev_shutdown ->
nvme_clear_queue.  Why would we skip commands that were active and are
going to be active again ASAP?  But this part actually was just something
I suspected to be an issue at first; the ->timeout return value change
turned out to fix the actual bug I was seeing.  So feel free to skip it
for now, but I suspect we'll eventually need something similar.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 6/9] nvme: abort requests on the requeue list when shutting down a controller
  2015-10-22 14:58     ` Christoph Hellwig
@ 2015-10-22 15:16       ` Busch, Keith
  2015-10-22 16:27         ` Christoph Hellwig
  0 siblings, 1 reply; 24+ messages in thread
From: Busch, Keith @ 2015-10-22 15:16 UTC (permalink / raw)


On Thu, Oct 22, 2015@04:58:42PM +0200, Christoph Hellwig wrote:
> We're aborting all active commands through nvme_dev_shutdown ->
> nvme_clear_queue.  Why would we skip commands that were active and are
> going to be active again ASAP? 

This is a bit subtle. nvme_clear_queue at the end of shutdown allows
active commands being aborted to requeue based on the state of the
namespace's request_queue: if it isn't "dying", nvme_clear_queue won't
set NVMe CQE "do-not-retry" (DNR) bit, so req_completion may requeue.

In the reset scenario, this change would fail requests that happen to be
on the requeue_list, but active commands being cancelled will be allowed
to retry. That's not fair to requests that got on the list early. :)

But it sounds like you may have found the real fix to the gap in a
different patch, so I'll skip this as suggested and continue with the
rest of the series.
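
The decision being described lives in the cancel path and looks roughly like
this (a sketch from memory of the driver of that era, not a verbatim quote):

	if (blk_queue_dying(req->q))
		cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
	else
		cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);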

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 4/9] nvme: do not restart the request timeout if we're resetting the controller
  2015-10-22 12:03 ` [PATCH 4/9] nvme: do not restart the request timeout if we're resetting the controller Christoph Hellwig
@ 2015-10-22 16:27   ` Busch, Keith
  2015-10-22 16:30     ` Christoph Hellwig
  0 siblings, 1 reply; 24+ messages in thread
From: Busch, Keith @ 2015-10-22 16:27 UTC (permalink / raw)


On Thu, Oct 22, 2015@02:03:36PM +0200, Christoph Hellwig wrote:
> Otherwise we're never going to complete a command when it is restarted just
> after we completed all other outstanding commands in nvme_clear_queue.

Returning BLK_EH_HANDLED to block mq's timeout handler has the block layer
complete the request. However, the reset work is going to force cancel
the request anyway, so the same command will be completed twice. That
can't be right ... right?

> @@ -1063,7 +1063,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>  				 req->tag, nvmeq->qid);
>  		}
>  		spin_unlock_irq(&dev_list_lock);
> -		return BLK_EH_RESET_TIMER;
> +
> +		req->errors = -EIO;
> +		return BLK_EH_HANDLED;
>  	}

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 6/9] nvme: abort requests on the requeue list when shutting down a controller
  2015-10-22 15:16       ` Busch, Keith
@ 2015-10-22 16:27         ` Christoph Hellwig
  0 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 16:27 UTC (permalink / raw)


On Thu, Oct 22, 2015@03:16:26PM +0000, Busch, Keith wrote:
> This is a bit subtle. nvme_clear_queue at the end of shutdown allows
> active commands being aborted to requeue based on the state of the
> namespace's request_queue: if it isn't "dying", nvme_clear_queue won't
> set NVMe CQE "do-not-retry" (DNR) bit, so req_completion may requeue.
> 
> In the reset scenario, this change would fail requests that happen to be
> on the requeue_list, but active commands being cancelled will be allowed
> to retry. That's not fair to requests that got on the list early. :)
> 
> But it sounds like you may have found the real fix to the gap in a
> different patch, so I'll skip this as suggested and continue with the
> rest of the series.

Oh, so you want to resubmit them after the reset.  I guess this makes
sense in general, although in my particular case it delays recovery
significantly.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 4/9] nvme: do not restart the request timeout if we're resetting the controller
  2015-10-22 16:27   ` Busch, Keith
@ 2015-10-22 16:30     ` Christoph Hellwig
  2015-10-22 17:15       ` Busch, Keith
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 16:30 UTC (permalink / raw)


On Thu, Oct 22, 2015@04:27:03PM +0000, Busch, Keith wrote:
> On Thu, Oct 22, 2015@02:03:36PM +0200, Christoph Hellwig wrote:
> > Otherwise we're never going to complete a command when it is restarted just
> > after we completed all other outstanding commands in nvme_clear_queue.
> 
> Returning BLK_EH_HANDLED to block mq's timeout handler has the block layer
> complete the request. However, the reset work is going to force cancel
> the request anyway, so the same command will be completed twice. That
> can't be right ... right?

No, it's not going to complete it if it already was completed, check the
logic around the REQ_ATOM_COMPLETE flag.
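
The exclusion in question is a test_and_set_bit() race between the timeout
path and the normal completion path: whichever marks the request complete
first gets to finish it.  A sketch of the idea (not the exact block layer
code):

	if (!test_and_set_bit(REQ_ATOM_COMPLETE, &req->atomic_flags))
		__blk_mq_complete_request(req);	/* first caller wins */
	/* the second caller backs off, so the request completes only once */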

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 4/9] nvme: do not restart the request timeout if we're resetting the controller
  2015-10-22 16:30     ` Christoph Hellwig
@ 2015-10-22 17:15       ` Busch, Keith
  2015-10-22 18:17         ` Christoph Hellwig
  0 siblings, 1 reply; 24+ messages in thread
From: Busch, Keith @ 2015-10-22 17:15 UTC (permalink / raw)


But the request is available for reuse after completion. Can we rely on REQ_ATOM_COMPLETE when the req is allocated for a new command?

> -----Original Message-----
> From: Christoph Hellwig [mailto:hch at lst.de]
> Sent: Thursday, October 22, 2015 10:31 AM
> To: Busch, Keith
> Cc: Christoph Hellwig; axboe at fb.com; linux-nvme at lists.infradead.org
> Subject: Re: [PATCH 4/9] nvme: do not restart the request timeout if we're resetting the
> controller
> 
> On Thu, Oct 22, 2015@04:27:03PM +0000, Busch, Keith wrote:
> > On Thu, Oct 22, 2015@02:03:36PM +0200, Christoph Hellwig wrote:
> > > Otherwise we're never going to complete a command when it is restarted just
> > > after we completed all other outstanding commands in nvme_clear_queue.
> >
> > Returning BLK_EH_HANDLED to block mq's timeout handler has the block layer
> > complete the request. However, the reset work is going to force cancel
> > the request anyway, so the same command will be completed twice. That
> > can't be right ... right?
> 
> No, it's not going to complete it if it already was completed, check the
> logic around the REQ_ATOM_COMPLETE flag.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 8/9] nvme: remove dead controllers from a work item
  2015-10-22 12:03 ` [PATCH 8/9] nvme: remove dead controllers from a work item Christoph Hellwig
@ 2015-10-22 18:10   ` Busch, Keith
  2015-10-22 18:12     ` Christoph Hellwig
  0 siblings, 1 reply; 24+ messages in thread
From: Busch, Keith @ 2015-10-22 18:10 UTC (permalink / raw)


On Thu, Oct 22, 2015@02:03:40PM +0200, Christoph Hellwig wrote:
> Compared to the kthread this gives us multiple call prevention for free.

My apologies in advance, I usually try to be more helpful debugging
new issues, but I'm running low on time today. I might be able to dig
into this more tomorrow, but if you could, please verify what happens
if an admin command times out during initialization. The reset_work
gets queued twice on two different work_queues and causes trouble. I
get one of several crashes or warnings after applying the whole series,
but this is the most frequent I'm seeing so far:

[  348.845441] BUG: unable to handle kernel NULL pointer dereference at 0000000000000014
[  348.846615] IP: [<ffffffffa02c1a48>] nvme_dev_shutdown+0x303/0x3e8 [nvme]
[  348.847615] PGD b8749067 PUD b8748067 PMD 0
[  348.848328] Oops: 0002 [#1] SMP
[  348.848826] Modules linked in: nvme bnep rfcomm bluetooth rfkill nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc dm_round_robin loop dm_multipath parport_pc parport evdev psmouse pcspkr serio_raw processor button ext4 crc16 jbd2 mbcache sg sr_mod sd_mod cdrom ata_generic floppy e1000 ata_piix libata scsi_mod
[  348.849428] CPU: 0 PID: 3415 Comm: kworker/0:0 Not tainted 4.3.0-rc4+ #4
[  348.849428] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
[  348.849428] Workqueue: events nvme_remove_dead_ctrl_work [nvme]
[  348.849428] task: ffff88013afe6ec0 ti: ffff88013afe8000 task.ti: ffff88013afe8000
[  348.849428] RIP: 0010:[<ffffffffa02c1a48>]  [<ffffffffa02c1a48>] nvme_dev_shutdown+0x303/0x3e8 [nvme]
[  348.849428] RSP: 0018:ffff88013afebcd8  EFLAGS: 00010202
[  348.849428] RAX: 0000000000464001 RBX: ffff8800369c2800 RCX: 0000005138ced189
[  348.849428] RDX: 0000000000000000 RSI: ffffffff81409bf0 RDI: ffff88013ab53f10
[  348.849428] RBP: ffff88013afe6ec0 R08: ffff88013afe8000 R09: 0000000000000000
[  348.849428] R10: 0000000000000000 R11: ffff88013fc142c0 R12: 0000000000000000
[  348.849428] R13: ffff880036f0c540 R14: ffff88013afebcf8 R15: 0000000000000000
[  348.849428] FS:  0000000000000000(0000) GS:ffff88013fc00000(0000) knlGS:0000000000000000
[  348.849428] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[  348.849428] CR2: 0000000000000014 CR3: 00000000b8cb2000 CR4: 00000000000006f0
[  348.849428] Stack:
[  348.849428]  ffff8800b8e556b8 0000000000000000 ffff8800b8e55640 0000000000000082
[  348.849428]  ffff88013afe6ec0 ffff88013afebd10 ffff880100000000 0000000000000000
[  348.849428]  ffff88013afebd18 ffff88013afebd18 0000000000000000 0000000000000000
[  348.849428] Call Trace:
[  348.849428]  [<ffffffffa02c1bd6>] ? nvme_remove+0x6f/0xc0 [nvme]
[  348.849428]  [<ffffffff8121b67a>] ? pci_device_remove+0x43/0x8e
[  348.849428]  [<ffffffff812bb882>] ? __device_release_driver+0x91/0x109
[  348.849428]  [<ffffffff812bb90b>] ? device_release_driver+0x11/0x1a
[  348.849428]  [<ffffffff812165e2>] ? pci_stop_bus_device+0x62/0x85
[  348.849428]  [<ffffffff81216798>] ? pci_stop_and_remove_bus_device+0x9/0x12
[  348.849428]  [<ffffffff812167b9>] ? pci_stop_and_remove_bus_device_locked+0x18/0x21
[  348.849428]  [<ffffffffa02bfd44>] ? nvme_remove_dead_ctrl_work+0x1e/0x28 [nvme]
[  348.849428]  [<ffffffff810560b6>] ? process_one_work+0x177/0x27b
[  348.849428]  [<ffffffff81056394>] ? worker_thread+0x1b4/0x28b
[  348.849428]  [<ffffffff810561e0>] ? process_scheduled_works+0x26/0x26
[  348.849428]  [<ffffffff8105a4b6>] ? kthread+0x99/0xa1
[  348.849428]  [<ffffffff8105a41d>] ? kthread_parkme+0x16/0x16
[  348.849428]  [<ffffffff813c9d6f>] ? ret_from_fork+0x3f/0x70
[  348.849428]  [<ffffffff8105a41d>] ? kthread_parkme+0x16/0x16
[  348.849428] Code: 48 89 44 24 18 48 8b 44 24 18 4c 89 ef e8 ce 88 d9 e0 8b 83 2c 01 00 00 48 8b 93 38 01 00 00 80 e4 3f 80 cc 40 89 83 2c 01 00 00 <89> 42 14 0f b6 2d fe 46 00 00 48 8b 05 a7 85 34 e1 65 4c 8b 24
[  348.849428] RIP  [<ffffffffa02c1a48>] nvme_dev_shutdown+0x303/0x3e8 [nvme]
[  348.849428]  RSP <ffff88013afebcd8>
[  348.849428] CR2: 0000000000000014
[  348.849428] ---[ end trace f11db05b8d65bab0 ]---

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 8/9] nvme: remove dead controllers from a work item
  2015-10-22 18:10   ` Busch, Keith
@ 2015-10-22 18:12     ` Christoph Hellwig
  2015-10-22 20:36       ` Busch, Keith
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 18:12 UTC (permalink / raw)


On Thu, Oct 22, 2015@06:10:15PM +0000, Busch, Keith wrote:
> On Thu, Oct 22, 2015@02:03:40PM +0200, Christoph Hellwig wrote:
> > Compared to the kthread this gives us multiple call prevention for free.
> 
> My apologies in advance, I usually try to be more helpful debugging
> new issues, but I'm running low on time today. I might be able to dig
> into this more tomorrow, but if you could, please verify what happens
> if an admin command times out during initialization. The reset_work
> gets queued twice on two different work_queues and causes trouble. I
> get one of several crashes or warnings after applying the whole series,
> but this is the most frequent I'm seeing so far:

Oops - I'm queuing it both on the system workqueue and nvme_workq, which
is clearly broken.  I'll fix it in the next resend.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 4/9] nvme: do not restart the request timeout if we're resetting the controller
  2015-10-22 17:15       ` Busch, Keith
@ 2015-10-22 18:17         ` Christoph Hellwig
  0 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-22 18:17 UTC (permalink / raw)


On Thu, Oct 22, 2015@05:15:18PM +0000, Busch, Keith wrote:
> But the request is available for reuse after completion. Can we
> rely on REQ_ATOM_COMPLETE when the req is allocated for a new command?

We can't.  What I intended to rely on is a controller reset killing all
I/O queue state, but we still have a small race window between the return from
->timeout and the actual controller reset.  I'll have to figure out a way
to mitigate that.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 8/9] nvme: remove dead controllers from a work item
  2015-10-22 18:12     ` Christoph Hellwig
@ 2015-10-22 20:36       ` Busch, Keith
  2015-10-23  5:57         ` Christoph Hellwig
  0 siblings, 1 reply; 24+ messages in thread
From: Busch, Keith @ 2015-10-22 20:36 UTC (permalink / raw)


On Thu, Oct 22, 2015@08:12:40PM +0200, Christoph Hellwig wrote:
> Oops - I'm queuing it both on the system workqueue and nvme_workq, which
> is clearly broken.  I'll fix it in the next resend.

Ha, let's just get rid of nvme_workq.

I've got that plus a couple other fixups below. I noticed that earlier
in the recent changes (18/18 from the previous series) we lost the
"device_remove_file()" on the reset controller attribute. We also don't
want to leave the nvme management handle available when we're trying to
release it, so we need to delete the sysfs and char dev immediately on
removal instead of after all references are released.

This fixes a WARN in fs/sysfs/group.c line 222.

Here's all my "fixes", also pushed to my linux-nvme master:
---
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f4075f1..ebeb797 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -919,18 +919,22 @@ static void nvme_release_instance(struct nvme_ctrl *ctrl)
 	spin_unlock(&dev_list_lock);
 }
 
-static void nvme_free_ctrl(struct kref *kref)
+void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
 {
-	struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
+	device_remove_file(ctrl->device, &dev_attr_reset_controller);
+	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
 
 	spin_lock(&dev_list_lock);
 	list_del(&ctrl->node);
 	spin_unlock(&dev_list_lock);
+}
+
+static void nvme_free_ctrl(struct kref *kref)
+{
+	struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
 
 	put_device(ctrl->device);
 	nvme_release_instance(ctrl);
-	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
-
 	ctrl->ops->free_ctrl(ctrl);
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1658048..a879f9e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -189,6 +189,7 @@ static inline int nvme_error_status(u16 status)
 	}
 }
 
+void nvme_uninit_ctrl(struct nvme_ctrl *ctrl);
 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 		const struct nvme_ctrl_ops *ops, u16 vendor,
 		unsigned long quirks);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 437c3dc..e20952d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -69,7 +69,6 @@ MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
 
 static LIST_HEAD(dev_list);
 static struct task_struct *nvme_thread;
-static struct workqueue_struct *nvme_workq;
 static wait_queue_head_t nvme_kthread_wait;
 
 struct nvme_dev;
@@ -1057,12 +1056,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 * the admin queue.
 	 */
 	if (!nvmeq->qid || cmd_rq->aborted) {
-		if (queue_work(nvme_workq, &dev->reset_work)) {
+		if (schedule_work(&dev->reset_work))
 			dev_warn(dev->dev,
 				 "I/O %d QID %d timeout, reset controller\n",
 				 req->tag, nvmeq->qid);
-		}
-
 		req->errors = -EIO;
 		return BLK_EH_HANDLED;
 	}
@@ -1563,7 +1560,7 @@ static int nvme_kthread(void *data)
 
 			if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
 							csts & NVME_CSTS_CFS) {
-				if (queue_work(nvme_workq, &dev->reset_work)) {
+				if (schedule_work(&dev->reset_work)) {
 					dev_warn(dev->dev,
 						"Failed status: %x, reset controller\n",
 						readl(dev->bar + NVME_REG_CSTS));
@@ -2284,7 +2281,7 @@ static int nvme_reset(struct nvme_dev *dev)
 	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
 		return -ENODEV;
 
-	if (!queue_work(nvme_workq, &dev->reset_work))
+	if (!schedule_work(&dev->reset_work))
 		return -EBUSY;
 
 	flush_work(&dev->reset_work);
@@ -2412,6 +2409,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_remove_namespaces(&dev->ctrl);
 	nvme_dev_shutdown(dev);
 	nvme_dev_remove_admin(dev);
+	nvme_uninit_ctrl(&dev->ctrl);
 	nvme_free_queues(dev, 0);
 	nvme_release_cmb(dev);
 	nvme_release_prp_pools(dev);
@@ -2485,13 +2483,9 @@ static int __init nvme_init(void)
 
 	init_waitqueue_head(&nvme_kthread_wait);
 
-	nvme_workq = create_singlethread_workqueue("nvme");
-	if (!nvme_workq)
-		return -ENOMEM;
-
 	result = nvme_core_init();
 	if (result < 0)
-		goto kill_workq;
+		return result;
 
 	result = pci_register_driver(&nvme_driver);
 	if (result)
@@ -2500,8 +2494,6 @@ static int __init nvme_init(void)
 
  core_exit:
 	nvme_core_exit();
- kill_workq:
-	destroy_workqueue(nvme_workq);
 	return result;
 }
 
@@ -2509,7 +2501,6 @@ static void __exit nvme_exit(void)
 {
 	pci_unregister_driver(&nvme_driver);
 	nvme_core_exit();
-	destroy_workqueue(nvme_workq);
 	BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
 	_nvme_check_size();
 }
--

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 8/9] nvme: remove dead controllers from a work item
  2015-10-22 20:36       ` Busch, Keith
@ 2015-10-23  5:57         ` Christoph Hellwig
  2015-10-23 14:51           ` Busch, Keith
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Hellwig @ 2015-10-23  5:57 UTC (permalink / raw)


On Thu, Oct 22, 2015@08:36:50PM +0000, Busch, Keith wrote:
> On Thu, Oct 22, 2015@08:12:40PM +0200, Christoph Hellwig wrote:
> > Oops - I'm queuing it both on the system workqueue and nvme_workq, which
> > is clearly broken.  I'll fix it in the next resend.
> 
> Ha, let's just get rid of nvme_workq.

I thought about this a bit and I'm not sure it's the right thing -
as we reset controllers as part of error handling we'd really need a
WQ_MEM_RECLAIM workqueue here to be guaranteed to make progress.  Although
hopefully not a single-threaded one.
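
Concretely that would mean allocating the driver workqueue with something
like (a sketch, not part of any patch in this thread):

	/* rescuer-backed and not single-threaded, so resets can make
	 * progress under memory pressure */
	nvme_workq = alloc_workqueue("nvme", WQ_MEM_RECLAIM, 0);
	if (!nvme_workq)
		return -ENOMEM;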

> I've got that plus a couple other fixups below. I noticed that earlier
> in the recent changes (18/18 from the previous series) we lost the
> "device_remove_file()" on the reset controller attribute.

Indeed.  For some reason I assumed we don't need the device_remove_file
when dropping the device and it didn't complain in my testing either.

> We also don't
> want to leave the nvme management handle available when we're trying to
> release it, so we need to delete the sysfs and char dev immediately on
> removal instead of after all references are released.
> 
> This fixes a WARN in fs/sysfs/group.c line 222.

I guess we need to sort out this lifetime model in more detail later
as the existing one you restored here looks a bit odd to me.  Nothing
I'll be able to finish this week, though.

> Here's all my "fixes", also pushed to my linux-nvme master:

Thanks.  So what's the plan for the patches in it?  They'll clash a bit
with the PR API and integrity branches due to all the moves.  But those
are also the reason why I'd really like to see it go into 4.4.  Do you
need help with a rebase?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 8/9] nvme: remove dead controllers from a work item
  2015-10-23  5:57         ` Christoph Hellwig
@ 2015-10-23 14:51           ` Busch, Keith
  2015-10-23 19:31             ` Busch, Keith
  0 siblings, 1 reply; 24+ messages in thread
From: Busch, Keith @ 2015-10-23 14:51 UTC (permalink / raw)


On Fri, Oct 23, 2015@07:57:15AM +0200, Christoph Hellwig wrote:
> I guess we need to sort out this lifetime model in more detail later
> as the existing one you restored here looks a bit odd to me.  Nothing
> I'll be able to finish this week, though.

We just don't want the character device to be available to accept new
opens after the physical device was removed, so I think it has to be in
the "remove" path rather than after the reference count drops to 0.

> Thanks.  So what's the plan for the patches in it?  They'll clash a bit
> with the PR API and integrity branches due to all the moves.  But those
> are also the reason why I'd really like to see it go into 4.4.  Do you
> need help with a rebase?

No worries, I'll take care of the rebase and resend for consideration.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 8/9] nvme: remove dead controllers from a work item
  2015-10-23 14:51           ` Busch, Keith
@ 2015-10-23 19:31             ` Busch, Keith
  0 siblings, 0 replies; 24+ messages in thread
From: Busch, Keith @ 2015-10-23 19:31 UTC (permalink / raw)


On Fri, Oct 23, 2015@02:51:59PM +0000, Busch, Keith wrote:
> No worries, I'll take care of the rebase and resend for consideration.

Only a little painful. :)

I merged the linux-block for-4.4/integrity, for-4.4/reservations,
for-linus, and for-next branches, plus all of Christoph's recent NVMe
patches with some merge conflict fixups, plus a few of my own:

http://git.infradead.org/users/kbusch/linux-nvme.git

The tree is just a convenience for me, but I'm linking it here in hopes
that it's useful. It was a forced push, so abandon older clones if
they exist.

My first merge like this. I hope I got the process "right", but I wouldn't
be surprised if there was a smarter way to do it.

I will send out separate patches to the list for the ones that haven't
been reviewed yet. I'll also try to apply some of the other patches
from the list that seem to have been forgotten; the PPC IOMMU issue is
pretty important.

^ permalink raw reply	[flat|nested] 24+ messages in thread

Thread overview: 24+ messages
2015-10-22 12:03 nvme abort and reset fixes Christoph Hellwig
2015-10-22 12:03 ` [PATCH 1/9] nvme: only add a controller to dev_list after it's been fully initialized Christoph Hellwig
2015-10-22 12:03 ` [PATCH 2/9] nvme: don't take the I/O queue q_lock in nvme_timeout Christoph Hellwig
2015-10-22 12:03 ` [PATCH 3/9] nvme: merge nvme_abort_req and nvme_timeout Christoph Hellwig
2015-10-22 12:03 ` [PATCH 4/9] nvme: do not restart the request timeout if we're resetting the controller Christoph Hellwig
2015-10-22 16:27   ` Busch, Keith
2015-10-22 16:30     ` Christoph Hellwig
2015-10-22 17:15       ` Busch, Keith
2015-10-22 18:17         ` Christoph Hellwig
2015-10-22 12:03 ` [PATCH 5/9] nvme: simplify resets Christoph Hellwig
2015-10-22 12:03 ` [PATCH 6/9] nvme: abort requests on the requeue list when shutting down a controller Christoph Hellwig
2015-10-22 14:44   ` Busch, Keith
2015-10-22 14:58     ` Christoph Hellwig
2015-10-22 15:16       ` Busch, Keith
2015-10-22 16:27         ` Christoph Hellwig
2015-10-22 12:03 ` [PATCH 7/9] nvme: merge probe_work and reset_work Christoph Hellwig
2015-10-22 12:03 ` [PATCH 8/9] nvme: remove dead controllers from a work item Christoph Hellwig
2015-10-22 18:10   ` Busch, Keith
2015-10-22 18:12     ` Christoph Hellwig
2015-10-22 20:36       ` Busch, Keith
2015-10-23  5:57         ` Christoph Hellwig
2015-10-23 14:51           ` Busch, Keith
2015-10-23 19:31             ` Busch, Keith
2015-10-22 12:03 ` [PATCH 9/9] nvme: switch abort_limit to an atomic_t Christoph Hellwig
