* [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue
@ 2015-11-24 18:35 Keith Busch
  2015-11-24 18:35 ` [PATCHv2 2/2] NVMe: Shutdown fixes Keith Busch
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Keith Busch @ 2015-11-24 18:35 UTC (permalink / raw)


This patch removes the synchronization against the timeout work so that
the timer can start a freeze on its own queue. The timeout work now enters
the queue itself, so timer context can start a freeze, but it must not
wait for the queue to become frozen.
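
For illustration (not part of this patch, and the function name is made
up), this is what a driver's ->timeout handler can and cannot do now that
the timeout work holds its own queue reference:

static enum blk_eh_timer_return demo_timeout(struct request *req, bool reserved)
{
	struct request_queue *q = req->q;

	/*
	 * Fine from timer context: only bumps mq_freeze_depth and, on the
	 * first freeze, kills q_usage_counter.
	 */
	blk_mq_freeze_queue_start(q);

	/*
	 * blk_mq_freeze_queue(q) would be wrong here: it waits for
	 * q_usage_counter to drain, but the timeout work itself holds a
	 * reference via blk_queue_enter(), so the wait could never finish.
	 */
	return BLK_EH_HANDLED;
}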

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 block/blk-mq.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5142219..2714c9c2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -85,7 +85,6 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
 	if (freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
-		cancel_work_sync(&q->timeout_work);
 		blk_mq_run_hw_queues(q, false);
 	}
 }
@@ -628,6 +627,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	};
 	int i;
 
+	if (blk_queue_enter(q, true))
+		return;
+
 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 
 	if (data.next_set) {
@@ -642,6 +644,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
 				blk_mq_tag_idle(hctx);
 		}
 	}
+	blk_queue_exit(q);
 }
 
 /*
-- 
2.6.2.307.g37023ba

* [PATCHv2 2/2] NVMe: Shutdown fixes
  2015-11-24 18:35 [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue Keith Busch
@ 2015-11-24 18:35 ` Keith Busch
  2015-11-24 19:42   ` Jens Axboe
  2015-11-24 19:42 ` [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue Jens Axboe
  2015-11-24 19:47 ` Christoph Hellwig
  2 siblings, 1 reply; 8+ messages in thread
From: Keith Busch @ 2015-11-24 18:35 UTC (permalink / raw)


The controller must be disabled prior to completing a presumed lost
command. This patch makes the shutdown safe against simultaneous
invocations, and calls it directly from the timeout handler if a timeout
occurs during initialization.

blk-mq marks requests as completed in its timeout handler, though the
driver still owns the request. To propagate the appropriate status
to the caller, the driver must directly set req->errors. This also
happens to fix a race where the controller being reset actually completes
the request that timed out.
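
For illustration, the pattern (this is what the nvme_end_req() helper
added below does; the name here is just a stand-in) looks like:

static inline void demo_end_req(struct request *req, int error)
{
	/*
	 * If blk-mq already marked this request complete from its timeout
	 * handler, the blk_mq_complete_request() below is a no-op and the
	 * request is finished from the timeout path instead, so the status
	 * has to be stored in req->errors first.
	 */
	req->errors = error;
	blk_mq_complete_request(req, error);
}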

A side effect of this patch is that a controller that times out during
IO queue creation will be failed. The driver silently proceeded before,
so this is an improvement. There haven't been any reports of such
a condition actually occurring, though.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
v1 -> v2:

  Get rid of the struct work for shutdown. Make shutdown robust to
  multiple invocations instead.

 drivers/nvme/host/pci.c | 56 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ff253c8..cf9722f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -86,6 +86,7 @@ struct nvme_queue;
 static int nvme_reset(struct nvme_dev *dev);
 static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
+static void nvme_dev_shutdown(struct nvme_dev *dev);
 
 struct async_cmd_info {
 	struct kthread_work work;
@@ -117,6 +118,7 @@ struct nvme_dev {
 	struct work_struct reset_work;
 	struct work_struct scan_work;
 	struct work_struct remove_work;
+	wait_queue_head_t shutdown_wait;
 	bool subsystem;
 	u32 page_size;
 	void __iomem *cmb;
@@ -125,7 +127,7 @@ struct nvme_dev {
 	u32 cmbsz;
 	unsigned long flags;
 #define NVME_CTRL_RESETTING	0
-
+#define NVME_CTRL_SHUTDOWN	1
 	struct nvme_ctrl ctrl;
 };
 
@@ -723,6 +725,19 @@ static void nvme_complete_rq(struct request *req)
 	blk_mq_end_request(req, error);
 }
 
+static inline void nvme_end_req(struct request *req, int error)
+{
+	/*
+	 * The request may already be marked "complete" by blk-mq if
+	 * completion occurs during a timeout. We set req->errors
+	 * before telling blk-mq in case this is the target request
+	 * of a timeout handler. This is safe since this driver never
+	 * completes the same command twice.
+	 */
+	req->errors = error;
+	blk_mq_complete_request(req, req->errors);
+}
+
 static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 {
 	u16 head, phase;
@@ -770,8 +785,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 			u32 result = le32_to_cpu(cqe.result);
 			req->special = (void *)(uintptr_t)result;
 		}
-		blk_mq_complete_request(req, status >> 1);
-
+		nvme_end_req(req, status >> 1);
 	}
 
 	/* If the controller ignores the cq head doorbell and continuously
@@ -938,13 +952,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	struct nvme_command cmd;
 
 	/*
-	 * Just return an error to reset_work if we're already called from
-	 * probe context.  We don't worry about request / tag reuse as
-	 * reset_work will immeditalely unwind and remove the controller
-	 * once we fail a command.
+	 * Shutdown immediately if controller times out while starting. The
+	 * reset work will see the pci device disabled when it gets the forced
+	 * cancellation error. All outstanding requests are completed on
+	 * shutdown, so we return BLK_EH_HANDLED.
 	 */
 	if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
-		req->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+		dev_warn(dev->dev,
+				"I/O %d QID %d timeout, disable controller\n",
+				 req->tag, nvmeq->qid);
+		nvme_dev_shutdown(dev);
 		return BLK_EH_HANDLED;
 	}
 
@@ -1015,7 +1032,8 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved
 
 	if (blk_queue_dying(req->q))
 		status |= NVME_SC_DNR;
-	blk_mq_complete_request(req, status);
+
+	nvme_end_req(req, status);
 }
 
 static void nvme_free_queue(struct nvme_queue *nvmeq)
@@ -1510,9 +1528,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
 
 	status = nvme_set_features(&dev->ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
 								&result);
-	if (status < 0)
-		return status;
-	if (status > 0) {
+	if (status) {
 		dev_err(dev->dev, "Could not set queue count (%d)\n", status);
 		return 0;
 	}
@@ -2023,6 +2039,11 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 
 	nvme_dev_list_remove(dev);
 
+	if (test_and_set_bit(NVME_CTRL_SHUTDOWN, &dev->flags)) {
+		wait_event(dev->shutdown_wait, !test_bit(NVME_CTRL_SHUTDOWN,
+								&dev->flags));
+		return;
+	}
 	if (dev->bar) {
 		nvme_freeze_queues(dev);
 		csts = readl(dev->bar + NVME_REG_CSTS);
@@ -2041,6 +2062,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 
 	for (i = dev->queue_count - 1; i >= 0; i--)
 		nvme_clear_queue(dev->queues[i]);
+
+	clear_bit(NVME_CTRL_SHUTDOWN, &dev->flags);
+	wake_up_all(&dev->shutdown_wait);
 }
 
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
@@ -2115,7 +2139,12 @@ static void nvme_reset_work(struct work_struct *work)
 		goto free_tags;
 
 	result = nvme_setup_io_queues(dev);
-	if (result)
+
+	/*
+	 * The pci device will be disabled if a timeout occurs while setting up
+	 * IO queues, but return 0 for "result", so we have to check both.
+	 */
+	if (result || !pci_is_enabled(to_pci_dev(dev->dev)))
 		goto free_tags;
 
 	dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
@@ -2251,6 +2280,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	INIT_WORK(&dev->scan_work, nvme_dev_scan);
 	INIT_WORK(&dev->reset_work, nvme_reset_work);
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
+	init_waitqueue_head(&dev->shutdown_wait);
 
 	result = nvme_setup_prp_pools(dev);
 	if (result)
-- 
2.6.2.307.g37023ba

* [PATCHv2 2/2] NVMe: Shutdown fixes
  2015-11-24 18:35 ` [PATCHv2 2/2] NVMe: Shutdown fixes Keith Busch
@ 2015-11-24 19:42   ` Jens Axboe
  2015-11-24 20:20     ` Keith Busch
  0 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2015-11-24 19:42 UTC (permalink / raw)


On 11/24/2015 11:35 AM, Keith Busch wrote:
> The controller must be disabled prior to completing a presumed lost
> command. This patch makes the shutdown safe against simultaneous
> invocations, and calls it directly from the timeout handler if a timeout
> occurs during initialization.
>
> blk-mq marks requests as completed in its timeout handler, though the
> driver still owns the request. To propagate the appropriate status
> to the caller, the driver must directly set req->errors. This also
> happens to fix a race where the controller being reset actually completes
> the request that timed out.
>
> A side effect of this patch is that a controller that times out during
> IO queue creation will be failed. The driver silently proceeded before,
> so this is an improvement. There haven't been any reports of such
> a condition actually occurring, though.
>
> Signed-off-by: Keith Busch <keith.busch at intel.com>

> @@ -2023,6 +2039,11 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
>
>   	nvme_dev_list_remove(dev);
>
> +	if (test_and_set_bit(NVME_CTRL_SHUTDOWN, &dev->flags)) {
> +		wait_event(dev->shutdown_wait, !test_bit(NVME_CTRL_SHUTDOWN,
> +								&dev->flags));
> +		return;
> +	}
>   	if (dev->bar) {
>   		nvme_freeze_queues(dev);
>   		csts = readl(dev->bar + NVME_REG_CSTS);
> @@ -2041,6 +2062,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
>
>   	for (i = dev->queue_count - 1; i >= 0; i--)
>   		nvme_clear_queue(dev->queues[i]);
> +
> +	clear_bit(NVME_CTRL_SHUTDOWN, &dev->flags);
> +	wake_up_all(&dev->shutdown_wait);
>   }

I don't like this. You're essentially protecting the code here, not the 
data structure. There must be a cleaner way to solve this.

Is this hiding missing references somewhere else?

-- 
Jens Axboe

* [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue
  2015-11-24 18:35 [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue Keith Busch
  2015-11-24 18:35 ` [PATCHv2 2/2] NVMe: Shutdown fixes Keith Busch
@ 2015-11-24 19:42 ` Jens Axboe
  2015-11-24 19:47 ` Christoph Hellwig
  2 siblings, 0 replies; 8+ messages in thread
From: Jens Axboe @ 2015-11-24 19:42 UTC (permalink / raw)


On 11/24/2015 11:35 AM, Keith Busch wrote:
> This patch removes the synchronization against the timeout work so that
> the timer can start a freeze on its own queue. The timeout work now enters
> the queue itself, so timer context can start a freeze, but it must not
> wait for the queue to become frozen.

This looks good.

-- 
Jens Axboe

* [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue
  2015-11-24 18:35 [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue Keith Busch
  2015-11-24 18:35 ` [PATCHv2 2/2] NVMe: Shutdown fixes Keith Busch
  2015-11-24 19:42 ` [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue Jens Axboe
@ 2015-11-24 19:47 ` Christoph Hellwig
  2 siblings, 0 replies; 8+ messages in thread
From: Christoph Hellwig @ 2015-11-24 19:47 UTC (permalink / raw)


Looks good,

Reviewed-by: Christoph Hellwig <hch at lst.de>

* [PATCHv2 2/2] NVMe: Shutdown fixes
  2015-11-24 19:42   ` Jens Axboe
@ 2015-11-24 20:20     ` Keith Busch
  2015-11-24 20:26       ` Jens Axboe
  0 siblings, 1 reply; 8+ messages in thread
From: Keith Busch @ 2015-11-24 20:20 UTC (permalink / raw)


On Tue, Nov 24, 2015 at 12:42:50PM -0700, Jens Axboe wrote:
> On 11/24/2015 11:35 AM, Keith Busch wrote:
> >@@ -2023,6 +2039,11 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
> >
> >  	nvme_dev_list_remove(dev);
> >
> >+	if (test_and_set_bit(NVME_CTRL_SHUTDOWN, &dev->flags)) {
> >+		wait_event(dev->shutdown_wait, !test_bit(NVME_CTRL_SHUTDOWN,
> >+								&dev->flags));
> >+		return;
> >+	}
> >  	if (dev->bar) {
> >  		nvme_freeze_queues(dev);
> >  		csts = readl(dev->bar + NVME_REG_CSTS);
> >@@ -2041,6 +2062,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
> >
> >  	for (i = dev->queue_count - 1; i >= 0; i--)
> >  		nvme_clear_queue(dev->queues[i]);
> >+
> >+	clear_bit(NVME_CTRL_SHUTDOWN, &dev->flags);
> >+	wake_up_all(&dev->shutdown_wait);
> >  }
> 
> I don't like this. You're essentially protecting the code here, not
> the data structure. There must be a cleaner way to solve this.

Sure, I'm all for a better solution. We just need to prevent two threads
from hitting the same controller registers, and return when queues are
cleared. Is there a better way to control this?

The driver's queue structures are already protected, but the existing
implementation allows a second thread to get past the first one while it
is still waiting for orderly queue deletion.

> Is this hiding missing references somewhere else?

We can get here through an explicit user request, error handling, or device
removal. Any of these implies device references are held.

* [PATCHv2 2/2] NVMe: Shutdown fixes
  2015-11-24 20:20     ` Keith Busch
@ 2015-11-24 20:26       ` Jens Axboe
  2015-11-24 20:57         ` Keith Busch
  0 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2015-11-24 20:26 UTC (permalink / raw)


On 11/24/2015 01:20 PM, Keith Busch wrote:
> On Tue, Nov 24, 2015 at 12:42:50PM -0700, Jens Axboe wrote:
>> On 11/24/2015 11:35 AM, Keith Busch wrote:
>>> @@ -2023,6 +2039,11 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
>>>
>>>   	nvme_dev_list_remove(dev);
>>>
>>> +	if (test_and_set_bit(NVME_CTRL_SHUTDOWN, &dev->flags)) {
>>> +		wait_event(dev->shutdown_wait, !test_bit(NVME_CTRL_SHUTDOWN,
>>> +								&dev->flags));
>>> +		return;
>>> +	}
>>>   	if (dev->bar) {
>>>   		nvme_freeze_queues(dev);
>>>   		csts = readl(dev->bar + NVME_REG_CSTS);
>>> @@ -2041,6 +2062,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
>>>
>>>   	for (i = dev->queue_count - 1; i >= 0; i--)
>>>   		nvme_clear_queue(dev->queues[i]);
>>> +
>>> +	clear_bit(NVME_CTRL_SHUTDOWN, &dev->flags);
>>> +	wake_up_all(&dev->shutdown_wait);
>>>   }
>>
>> I don't like this. You're essentially protecting the code here, not
>> the data structure. There must be a cleaner way to solve this.
>
> Sure, I'm all for a better solution. We just need to prevent two threads
> from hitting the same controller registers, and return when queues are
> cleared. Is there a better way to control this?
>
> The driver's queue structures are already protected, but the existing
> implementation allows a second thread to get past the first one while it
> is still waiting for orderly queue deletion.

Just use a mutex? Check shutdown state after having held the lock, exit 
if we don't need to do anything.
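
Something along these lines (untested sketch; shutdown_lock would be a
new mutex in struct nvme_dev, initialized in nvme_probe()):

static void nvme_dev_shutdown(struct nvme_dev *dev)
{
	int i;

	nvme_dev_list_remove(dev);

	mutex_lock(&dev->shutdown_lock);
	if (pci_is_enabled(to_pci_dev(dev->dev))) {
		/*
		 * Existing teardown: freeze queues, read CSTS, suspend or
		 * delete the IO queues, disable the controller. A second
		 * caller that blocked on the mutex finds the device already
		 * disabled and skips all of this.
		 */
	}
	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_clear_queue(dev->queues[i]);
	mutex_unlock(&dev->shutdown_lock);
}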

-- 
Jens Axboe

* [PATCHv2 2/2] NVMe: Shutdown fixes
  2015-11-24 20:26       ` Jens Axboe
@ 2015-11-24 20:57         ` Keith Busch
  0 siblings, 0 replies; 8+ messages in thread
From: Keith Busch @ 2015-11-24 20:57 UTC (permalink / raw)


On Tue, Nov 24, 2015 at 01:26:12PM -0700, Jens Axboe wrote:
> On 11/24/2015 01:20 PM, Keith Busch wrote:
> >Sure, I'm all for a better solution. We just need to prevent two threads
> >from hitting the same controller registers, and return when queues are
> >cleared. Is there a better way to control this?
> >
> >The driver's queue structures are already protected, but the existing
> >implementation allows a second thread to get past the first one while it
> >is still waiting for orderly queue deletion.
> 
> Just use a mutex? Check shutdown state after having held the lock,
> exit if we don't need to do anything.

Got it, thanks. Will send an update.
