* [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue
@ 2015-11-24 18:35 Keith Busch
2015-11-24 18:35 ` [PATCHv2 2/2] NVMe: Shutdown fixes Keith Busch
` (2 more replies)
0 siblings, 3 replies; 8+ messages in thread
From: Keith Busch @ 2015-11-24 18:35 UTC (permalink / raw)
This patch removes the synchronous cancel of the timeout work from
blk_mq_freeze_queue_start() so that the timer can start a freeze on its
own queue. The timeout work itself enters the queue, so timer context
can only start a freeze; it cannot wait for the queue to become frozen.
Signed-off-by: Keith Busch <keith.busch at intel.com>
---
block/blk-mq.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5142219..2714c9c2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -85,7 +85,6 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
if (freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
- cancel_work_sync(&q->timeout_work);
blk_mq_run_hw_queues(q, false);
}
}
@@ -628,6 +627,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
};
int i;
+ if (blk_queue_enter(q, true))
+ return;
+
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
if (data.next_set) {
@@ -642,6 +644,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
blk_mq_tag_idle(hctx);
}
}
+ blk_queue_exit(q);
}
/*
--
2.6.2.307.g37023ba
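
For reference, the deadlock removed above looks like the following
illustrative sketch (not verbatim kernel code; the blk_queue_enter()/
blk_queue_exit() calls mirror the hunks in this patch):

#include <linux/blkdev.h>

/*
 * Before this patch, a driver freezing its own queue from timeout
 * context could deadlock:
 *
 *   blk_mq_timeout_work()                       runs as q->timeout_work
 *     -> driver ->timeout() handler
 *       -> blk_mq_freeze_queue_start(q)
 *         -> cancel_work_sync(&q->timeout_work) waits on the very work
 *                                               item that is calling it
 *
 * With the cancel_work_sync() removed, the timeout work instead pins
 * the queue with a reference for the duration of the scan:
 */
static void timeout_work_sketch(struct request_queue *q)
{
        /* true == nowait: bail out if the queue is already freezing */
        if (blk_queue_enter(q, true))
                return;

        /*
         * Scan for expired requests here. A driver ->timeout() handler
         * may call blk_mq_freeze_queue_start(), which only kills
         * q_usage_counter and returns; it never waits for the drain,
         * so the reference held by this work item is harmless.
         */

        blk_queue_exit(q);      /* drop the reference; the freeze can drain */
}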
* [PATCHv2 2/2] NVMe: Shutdown fixes
2015-11-24 18:35 [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue Keith Busch
@ 2015-11-24 18:35 ` Keith Busch
2015-11-24 19:42 ` Jens Axboe
2015-11-24 19:42 ` [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue Jens Axboe
2015-11-24 19:47 ` Christoph Hellwig
2 siblings, 1 reply; 8+ messages in thread
From: Keith Busch @ 2015-11-24 18:35 UTC (permalink / raw)
The controller must be disabled prior to completing a presumed lost
command. This patch makes the shutdown safe against simultaneous
invocations and calls it directly from the timeout handler if a timeout
occurs during initialization.

blk-mq marks requests as completed in its timeout handler, though the
driver still owns the request. To propagate the appropriate status
to the caller, the driver must set req->errors directly. This also
happens to fix a race where a controller being reset actually completes
the request that timed out.

A side effect of this patch is that a controller that times out during
IO queue creation will be failed. The driver silently proceeded before,
so this sounds like an improvement. There haven't been any reports of
such a condition actually occurring, though.
Signed-off-by: Keith Busch <keith.busch at intel.com>
---
v1 -> v2:
Get rid of the struct work for shutdown. Make shutdown robust to
multiple invocations instead.
drivers/nvme/host/pci.c | 56 +++++++++++++++++++++++++++++++++++++------------
1 file changed, 43 insertions(+), 13 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ff253c8..cf9722f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -86,6 +86,7 @@ struct nvme_queue;
static int nvme_reset(struct nvme_dev *dev);
static void nvme_process_cq(struct nvme_queue *nvmeq);
static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
+static void nvme_dev_shutdown(struct nvme_dev *dev);
struct async_cmd_info {
struct kthread_work work;
@@ -117,6 +118,7 @@ struct nvme_dev {
struct work_struct reset_work;
struct work_struct scan_work;
struct work_struct remove_work;
+ wait_queue_head_t shutdown_wait;
bool subsystem;
u32 page_size;
void __iomem *cmb;
@@ -125,7 +127,7 @@ struct nvme_dev {
u32 cmbsz;
unsigned long flags;
#define NVME_CTRL_RESETTING 0
-
+#define NVME_CTRL_SHUTDOWN 1
struct nvme_ctrl ctrl;
};
@@ -723,6 +725,19 @@ static void nvme_complete_rq(struct request *req)
blk_mq_end_request(req, error);
}
+static inline void nvme_end_req(struct request *req, int error)
+{
+ /*
+ * The request may already be marked "complete" by blk-mq if
+ * completion occurs during a timeout. We set req->errors
+ * before telling blk-mq in case this is the target request
+ * of a timeout handler. This is safe since this driver never
+ * completes the same command twice.
+ */
+ req->errors = error;
+ blk_mq_complete_request(req, req->errors);
+}
+
static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
{
u16 head, phase;
@@ -770,8 +785,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
u32 result = le32_to_cpu(cqe.result);
req->special = (void *)(uintptr_t)result;
}
- blk_mq_complete_request(req, status >> 1);
-
+ nvme_end_req(req, status >> 1);
}
/* If the controller ignores the cq head doorbell and continuously
@@ -938,13 +952,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
struct nvme_command cmd;
/*
- * Just return an error to reset_work if we're already called from
- * probe context. We don't worry about request / tag reuse as
- * reset_work will immeditalely unwind and remove the controller
- * once we fail a command.
+ * Shutdown immediately if controller times out while starting. The
+ * reset work will see the pci device disabled when it gets the forced
+ * cancellation error. All outstanding requests are completed on
+ * shutdown, so we return BLK_EH_HANDLED.
*/
if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
- req->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+ dev_warn(dev->dev,
+ "I/O %d QID %d timeout, disable controller\n",
+ req->tag, nvmeq->qid);
+ nvme_dev_shutdown(dev);
return BLK_EH_HANDLED;
}
@@ -1015,7 +1032,8 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved
if (blk_queue_dying(req->q))
status |= NVME_SC_DNR;
- blk_mq_complete_request(req, status);
+
+ nvme_end_req(req, status);
}
static void nvme_free_queue(struct nvme_queue *nvmeq)
@@ -1510,9 +1528,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
status = nvme_set_features(&dev->ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
&result);
- if (status < 0)
- return status;
- if (status > 0) {
+ if (status) {
dev_err(dev->dev, "Could not set queue count (%d)\n", status);
return 0;
}
@@ -2023,6 +2039,11 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
nvme_dev_list_remove(dev);
+ if (test_and_set_bit(NVME_CTRL_SHUTDOWN, &dev->flags)) {
+ wait_event(dev->shutdown_wait, !test_bit(NVME_CTRL_SHUTDOWN,
+ &dev->flags));
+ return;
+ }
if (dev->bar) {
nvme_freeze_queues(dev);
csts = readl(dev->bar + NVME_REG_CSTS);
@@ -2041,6 +2062,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
for (i = dev->queue_count - 1; i >= 0; i--)
nvme_clear_queue(dev->queues[i]);
+
+ clear_bit(NVME_CTRL_SHUTDOWN, &dev->flags);
+ wake_up_all(&dev->shutdown_wait);
}
static int nvme_setup_prp_pools(struct nvme_dev *dev)
@@ -2115,7 +2139,12 @@ static void nvme_reset_work(struct work_struct *work)
goto free_tags;
result = nvme_setup_io_queues(dev);
- if (result)
+
+ /*
+ * The pci device will be disabled if a timeout occurs while setting up
+ * IO queues, but return 0 for "result", so we have to check both.
+ */
+ if (result || !pci_is_enabled(to_pci_dev(dev->dev)))
goto free_tags;
dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
@@ -2251,6 +2280,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
INIT_WORK(&dev->scan_work, nvme_dev_scan);
INIT_WORK(&dev->reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
+ init_waitqueue_head(&dev->shutdown_wait);
result = nvme_setup_prp_pools(dev);
if (result)
--
2.6.2.307.g37023ba
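
Condensed, the shutdown synchronization added above reduces to a
"first caller does the work, later callers wait" pattern. The helper
name below is hypothetical and the sketch omits the actual teardown;
the flag and waitqueue fields are the ones declared in the hunks above:

#include <linux/bitops.h>
#include <linux/wait.h>

static void nvme_dev_shutdown_once(struct nvme_dev *dev)
{
        if (test_and_set_bit(NVME_CTRL_SHUTDOWN, &dev->flags)) {
                /* A shutdown is already in flight; sleep until done. */
                wait_event(dev->shutdown_wait,
                           !test_bit(NVME_CTRL_SHUTDOWN, &dev->flags));
                return;
        }

        /* ... disable the controller and clear all queues ... */

        clear_bit(NVME_CTRL_SHUTDOWN, &dev->flags);
        wake_up_all(&dev->shutdown_wait);       /* release any waiters */
}

Either way a caller returns only after the queues have been cleared,
which is what the timeout path needs; the review below questions
whether a flag plus waitqueue is the right primitive for that.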
* [PATCHv2 2/2] NVMe: Shutdown fixes
2015-11-24 18:35 ` [PATCHv2 2/2] NVMe: Shutdown fixes Keith Busch
@ 2015-11-24 19:42 ` Jens Axboe
2015-11-24 20:20 ` Keith Busch
0 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2015-11-24 19:42 UTC (permalink / raw)
On 11/24/2015 11:35 AM, Keith Busch wrote:
> The controller must be disabled prior to completing a presumed lost
> command. This patch makes the shutdown safe against simultaneous
> invocations and calls it directly from the timeout handler if a timeout
> occurs during initialization.
>
> blk-mq marks requests as completed in its timeout handler, though the
> driver still owns the request. To propagate the appropriate status
> to the caller, the driver must set req->errors directly. This also
> happens to fix a race where a controller being reset actually completes
> the request that timed out.
>
> A side effect of this patch is that a controller that times out during
> IO queue creation will be failed. The driver silently proceeded before,
> so this sounds like an improvement. There haven't been any reports of
> such a condition actually occurring, though.
>
> Signed-off-by: Keith Busch <keith.busch at intel.com>
> @@ -2023,6 +2039,11 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
>
> nvme_dev_list_remove(dev);
>
> + if (test_and_set_bit(NVME_CTRL_SHUTDOWN, &dev->flags)) {
> + wait_event(dev->shutdown_wait, !test_bit(NVME_CTRL_SHUTDOWN,
> + &dev->flags));
> + return;
> + }
> if (dev->bar) {
> nvme_freeze_queues(dev);
> csts = readl(dev->bar + NVME_REG_CSTS);
> @@ -2041,6 +2062,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
>
> for (i = dev->queue_count - 1; i >= 0; i--)
> nvme_clear_queue(dev->queues[i]);
> +
> + clear_bit(NVME_CTRL_SHUTDOWN, &dev->flags);
> + wake_up_all(&dev->shutdown_wait);
> }
I don't like this. You're essentially protecting the code here, not the
data structure. There must be a cleaner way to solve this.
Is this hiding missing references somewhere else?
--
Jens Axboe
* [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue
2015-11-24 18:35 [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue Keith Busch
2015-11-24 18:35 ` [PATCHv2 2/2] NVMe: Shutdown fixes Keith Busch
@ 2015-11-24 19:42 ` Jens Axboe
2015-11-24 19:47 ` Christoph Hellwig
2 siblings, 0 replies; 8+ messages in thread
From: Jens Axboe @ 2015-11-24 19:42 UTC (permalink / raw)
On 11/24/2015 11:35 AM, Keith Busch wrote:
> This patch removes the synchronous cancel of the timeout work from
> blk_mq_freeze_queue_start() so that the timer can start a freeze on its
> own queue. The timeout work itself enters the queue, so timer context
> can only start a freeze; it cannot wait for the queue to become frozen.
This looks good.
--
Jens Axboe
* [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue
2015-11-24 18:35 [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue Keith Busch
2015-11-24 18:35 ` [PATCHv2 2/2] NVMe: Shutdown fixes Keith Busch
2015-11-24 19:42 ` [PATCHv2 1/2] blk-mq: allow timer to freeze its own queue Jens Axboe
@ 2015-11-24 19:47 ` Christoph Hellwig
2 siblings, 0 replies; 8+ messages in thread
From: Christoph Hellwig @ 2015-11-24 19:47 UTC (permalink / raw)
Looks good,
Reviewed-by: Christoph Hellwig <hch at lst.de>
* [PATCHv2 2/2] NVMe: Shutdown fixes
2015-11-24 19:42 ` Jens Axboe
@ 2015-11-24 20:20 ` Keith Busch
2015-11-24 20:26 ` Jens Axboe
0 siblings, 1 reply; 8+ messages in thread
From: Keith Busch @ 2015-11-24 20:20 UTC (permalink / raw)
On Tue, Nov 24, 2015@12:42:50PM -0700, Jens Axboe wrote:
> On 11/24/2015 11:35 AM, Keith Busch wrote:
> >@@ -2023,6 +2039,11 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
> >
> > nvme_dev_list_remove(dev);
> >
> >+ if (test_and_set_bit(NVME_CTRL_SHUTDOWN, &dev->flags)) {
> >+ wait_event(dev->shutdown_wait, !test_bit(NVME_CTRL_SHUTDOWN,
> >+ &dev->flags));
> >+ return;
> >+ }
> > if (dev->bar) {
> > nvme_freeze_queues(dev);
> > csts = readl(dev->bar + NVME_REG_CSTS);
> >@@ -2041,6 +2062,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
> >
> > for (i = dev->queue_count - 1; i >= 0; i--)
> > nvme_clear_queue(dev->queues[i]);
> >+
> >+ clear_bit(NVME_CTRL_SHUTDOWN, &dev->flags);
> >+ wake_up_all(&dev->shutdown_wait);
> > }
>
> I don't like this. You're essentially protecting the code here, not
> the data structure. There must be a cleaner way to solve this.
Sure, I'm all for a better solution. We just need to prevent two threads
from hitting the same controller registers, and not return until the
queues are cleared. Is there a better way to control this?

The driver's queue structures are already protected, but the existing
implementation allows one thread to race past another that is waiting
for orderly queue deletion.
> Is this hiding missing references somewhere else?
We can get here through explicit user request, error handling, or device
removal. Any of these implies that device references are held.
* [PATCHv2 2/2] NVMe: Shutdown fixes
2015-11-24 20:20 ` Keith Busch
@ 2015-11-24 20:26 ` Jens Axboe
2015-11-24 20:57 ` Keith Busch
0 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2015-11-24 20:26 UTC (permalink / raw)
On 11/24/2015 01:20 PM, Keith Busch wrote:
> On Tue, Nov 24, 2015@12:42:50PM -0700, Jens Axboe wrote:
>> On 11/24/2015 11:35 AM, Keith Busch wrote:
>>> @@ -2023,6 +2039,11 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
>>>
>>> nvme_dev_list_remove(dev);
>>>
>>> + if (test_and_set_bit(NVME_CTRL_SHUTDOWN, &dev->flags)) {
>>> + wait_event(dev->shutdown_wait, !test_bit(NVME_CTRL_SHUTDOWN,
>>> + &dev->flags));
>>> + return;
>>> + }
>>> if (dev->bar) {
>>> nvme_freeze_queues(dev);
>>> csts = readl(dev->bar + NVME_REG_CSTS);
>>> @@ -2041,6 +2062,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
>>>
>>> for (i = dev->queue_count - 1; i >= 0; i--)
>>> nvme_clear_queue(dev->queues[i]);
>>> +
>>> + clear_bit(NVME_CTRL_SHUTDOWN, &dev->flags);
>>> + wake_up_all(&dev->shutdown_wait);
>>> }
>>
>> I don't like this. You're essentially protecting the code here, not
>> the data structure. There must be a cleaner way to solve this.
>
> Sure, I'm all for a better solution. We just need to prevent two threads
> from hitting the same controller registers, and not return until the
> queues are cleared. Is there a better way to control this?
>
> The driver's queue structures are already protected, but the existing
> implementation allows one thread to race past another that is waiting
> for orderly queue deletion.
Just use a mutex? Check the shutdown state after taking the lock, and
exit if there's nothing to do.
--
Jens Axboe
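
A minimal sketch of the mutex approach suggested here, assuming a new
struct nvme_dev field named shutdown_lock initialized in nvme_probe()
(the field name and the pci_is_enabled() re-check are illustrative,
not necessarily what was ultimately merged):

#include <linux/mutex.h>
#include <linux/pci.h>

static void nvme_dev_shutdown_locked(struct nvme_dev *dev)
{
        mutex_lock(&dev->shutdown_lock);

        /*
         * Re-check the shutdown state under the lock: a concurrent
         * caller may already have disabled the device, in which case
         * there is nothing left to do.
         */
        if (pci_is_enabled(to_pci_dev(dev->dev))) {
                /* ... quiesce queues and disable the controller ... */
        }

        mutex_unlock(&dev->shutdown_lock);
}

Unlike the flag-plus-waitqueue scheme, the mutex serializes callers
directly: a later caller simply blocks until the first one has finished
tearing the device down, and the data it protects is explicit.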
* [PATCHv2 2/2] NVMe: Shutdown fixes
2015-11-24 20:26 ` Jens Axboe
@ 2015-11-24 20:57 ` Keith Busch
0 siblings, 0 replies; 8+ messages in thread
From: Keith Busch @ 2015-11-24 20:57 UTC (permalink / raw)
On Tue, Nov 24, 2015@01:26:12PM -0700, Jens Axboe wrote:
> On 11/24/2015 01:20 PM, Keith Busch wrote:
> >Sure, I'm all for a better solution. We just need to prevent two threads
> >from hitting the same controller registers, and not return until the
> >queues are cleared. Is there a better way to control this?
> >
> >The driver's queue structures are already protected, but the existing
> >implementation allows one thread to race past another that is waiting
> >for orderly queue deletion.
>
> Just use a mutex? Check shutdown state after having held the lock,
> exit if we don't need to do anything.
Got it, thanks. Will send an update.