* [PATCH] NVMe: Shutdown fixes
@ 2015-11-24 23:30 Keith Busch
  2015-11-25  1:39 ` Jens Axboe
  2015-11-25  8:55 ` Christoph Hellwig
  0 siblings, 2 replies; 17+ messages in thread
From: Keith Busch @ 2015-11-24 23:30 UTC (permalink / raw)


The controller must be disabled prior to completing a presumed lost
command. This patch makes the shutdown safe against simultaneous
invocations by using a mutex, and calls shutdown directly from the
timeout handler when a timeout occurs.

The driver must directly set req->errors to get the appropriate status
propagated to the caller, since the timeout handler already set the
targeted request to "complete". The shutdown unconditionally clears
all outstanding commands, so it is safe to return "EH_HANDLED" from the
timeout handler and assume the command was completed.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/pci.c | 69 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 23 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ff253c8..e4c014f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -31,6 +31,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/poison.h>
 #include <linux/ptrace.h>
@@ -86,6 +87,7 @@ struct nvme_queue;
 static int nvme_reset(struct nvme_dev *dev);
 static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
+static void nvme_dev_shutdown(struct nvme_dev *dev);
 
 struct async_cmd_info {
 	struct kthread_work work;
@@ -117,6 +119,7 @@ struct nvme_dev {
 	struct work_struct reset_work;
 	struct work_struct scan_work;
 	struct work_struct remove_work;
+	struct mutex shutdown_lock;
 	bool subsystem;
 	u32 page_size;
 	void __iomem *cmb;
@@ -723,6 +726,19 @@ static void nvme_complete_rq(struct request *req)
 	blk_mq_end_request(req, error);
 }
 
+static inline void nvme_end_req(struct request *req, int error)
+{
+	/*
+	 * The request may already be marked "complete" by blk-mq if
+	 * completion occurs during a timeout. We set req->errors
+	 * before telling blk-mq in case this is the target request
+	 * of a timeout handler. This is safe since this driver never
+	 * completes the same command twice.
+	 */
+	req->errors = error;
+	blk_mq_complete_request(req, req->errors);
+}
+
 static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 {
 	u16 head, phase;
@@ -770,8 +786,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 			u32 result = le32_to_cpu(cqe.result);
 			req->special = (void *)(uintptr_t)result;
 		}
-		blk_mq_complete_request(req, status >> 1);
-
+		nvme_end_req(req, status >> 1);
 	}
 
 	/* If the controller ignores the cq head doorbell and continuously
@@ -938,34 +953,35 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	struct nvme_command cmd;
 
 	/*
-	 * Just return an error to reset_work if we're already called from
-	 * probe context.  We don't worry about request / tag reuse as
-	 * reset_work will immeditalely unwind and remove the controller
-	 * once we fail a command.
+	 * Shutdown immediately if controller times out while starting. The
+	 * reset work will see the pci device disabled when it gets the forced
+	 * cancellation error. All outstanding requests are completed on
+	 * shutdown, so we return BLK_EH_HANDLED.
 	 */
 	if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
-		req->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+		dev_warn(dev->dev,
+				"I/O %d QID %d timeout, disable controller\n",
+				 req->tag, nvmeq->qid);
+		nvme_dev_shutdown(dev);
 		return BLK_EH_HANDLED;
 	}
 
 	/*
-	 * Schedule controller reset if the command was already aborted once
-	 * before and still hasn't been returned to the driver, or if this is
-	 * the admin queue.
+	 * Shutdown controller if the command was already aborted once
+	 * before and still hasn't been returned to the driver, or if this
+	 * is * the admin queue, and then schedule restart.
 	 */
 	if (!nvmeq->qid || iod->aborted) {
-		if (queue_work(nvme_workq, &dev->reset_work)) {
-			dev_warn(dev->dev,
-				 "I/O %d QID %d timeout, reset controller\n",
+		dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n",
 				 req->tag, nvmeq->qid);
-		}
+		nvme_dev_shutdown(dev);
+		queue_work(nvme_workq, &dev->reset_work);
 
 		/*
-		 * Mark the request as quiesced, that is don't do anything with
-		 * it until the error completion from the controller reset
-		 * comes in.
+		 * Mark the request as handled, since the inline shutdown
+		 * forces all outstanding requests to complete.
 		 */
-		return BLK_EH_QUIESCED;
+		return BLK_EH_HANDLED;
 	}
 
 	iod->aborted = 1;
@@ -1015,7 +1031,8 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved
 
 	if (blk_queue_dying(req->q))
 		status |= NVME_SC_DNR;
-	blk_mq_complete_request(req, status);
+
+	nvme_end_req(req, status);
 }
 
 static void nvme_free_queue(struct nvme_queue *nvmeq)
@@ -1510,9 +1527,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
 
 	status = nvme_set_features(&dev->ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
 								&result);
-	if (status < 0)
-		return status;
-	if (status > 0) {
+	if (status) {
 		dev_err(dev->dev, "Could not set queue count (%d)\n", status);
 		return 0;
 	}
@@ -2023,6 +2038,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 
 	nvme_dev_list_remove(dev);
 
+	mutex_lock(&dev->shutdown_lock);
 	if (dev->bar) {
 		nvme_freeze_queues(dev);
 		csts = readl(dev->bar + NVME_REG_CSTS);
@@ -2041,6 +2057,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 
 	for (i = dev->queue_count - 1; i >= 0; i--)
 		nvme_clear_queue(dev->queues[i]);
+	mutex_unlock(&dev->shutdown_lock);
 }
 
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
@@ -2115,7 +2132,12 @@ static void nvme_reset_work(struct work_struct *work)
 		goto free_tags;
 
 	result = nvme_setup_io_queues(dev);
-	if (result)
+
+	/*
+	 * The pci device will be disabled if a timeout occurs while setting up
+	 * IO queues, but return 0 for "result", so we have to check both.
+	 */
+	if (result || !pci_is_enabled(to_pci_dev(dev->dev)))
 		goto free_tags;
 
 	dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
@@ -2251,6 +2273,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	INIT_WORK(&dev->scan_work, nvme_dev_scan);
 	INIT_WORK(&dev->reset_work, nvme_reset_work);
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
+	mutex_init(&dev->shutdown_lock);
 
 	result = nvme_setup_prp_pools(dev);
 	if (result)
-- 
2.6.2.307.g37023ba


* [PATCH] NVMe: Shutdown fixes
  2015-11-24 23:30 [PATCH] NVMe: Shutdown fixes Keith Busch
@ 2015-11-25  1:39 ` Jens Axboe
  2015-11-25  8:55 ` Christoph Hellwig
  1 sibling, 0 replies; 17+ messages in thread
From: Jens Axboe @ 2015-11-25  1:39 UTC (permalink / raw)


On 11/24/2015 04:30 PM, Keith Busch wrote:
> The controller must be disabled prior to completing a presumed lost
> command. This patch makes the shutdown safe against simultaneous
> invocations by using a mutex, and calls shutdown directly from the
> timeout handler when a timeout occurs.
>
> The driver must directly set req->errors to get the appropriate status
> propagated to the caller, since the timeout handler already set the
> targeted request to "complete". The shutdown unconditionally clears
> all outstanding commands, so it is safe to return "EH_HANDLED" from the
> timeout handler and assume the command was completed.

Much better, looks good to me.

-- 
Jens Axboe


* [PATCH] NVMe: Shutdown fixes
  2015-11-24 23:30 [PATCH] NVMe: Shutdown fixes Keith Busch
  2015-11-25  1:39 ` Jens Axboe
@ 2015-11-25  8:55 ` Christoph Hellwig
  2015-11-25 16:05   ` Keith Busch
  1 sibling, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2015-11-25  8:55 UTC (permalink / raw)


> +static inline void nvme_end_req(struct request *req, int error)
> +{
> +	/*
> +	 * The request may already be marked "complete" by blk-mq if
> +	 * completion occurs during a timeout. We set req->errors
> +	 * before telling blk-mq in case this is the target request
> +	 * of a timeout handler. This is safe since this driver never
> +	 * completes the same command twice.
> +	 */
> +	req->errors = error;
> +	blk_mq_complete_request(req, req->errors);


I don't think overwriting req->errors on an already completed request
is a good idea - it's what we used to do earlier, but I specifically changed
the blk_mq_complete_request so that we don't do that.  Can you explain
when you want to overwrite the value exactly and why?

>  	/*
> -	 * Just return an error to reset_work if we're already called from
> -	 * probe context.  We don't worry about request / tag reuse as
> -	 * reset_work will immeditalely unwind and remove the controller
> -	 * once we fail a command.
> +	 * Shutdown immediately if controller times out while starting. The
> +	 * reset work will see the pci device disabled when it gets the forced
> +	 * cancellation error. All outstanding requests are completed on
> +	 * shutdown, so we return BLK_EH_HANDLED.
>  	 */
>  	if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
> -		req->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
> +		dev_warn(dev->dev,
> +				"I/O %d QID %d timeout, disable controller\n",
> +				 req->tag, nvmeq->qid);
> +		nvme_dev_shutdown(dev);
>  		return BLK_EH_HANDLED;

So here you return BLK_EH_HANDLED without setting req->errors, which
sounds like it might be related to the above issue.

>  	/*
> +	 * Shutdown controller if the command was already aborted once
> +	 * before and still hasn't been returned to the driver, or if this
> +	 * is * the admin queue, and then schedule restart.

Small formatting error: there's a stray '* ' after the 'is' above.

>  	 */
>  	if (!nvmeq->qid || iod->aborted) {
> +		dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n",
>  				 req->tag, nvmeq->qid);
> +		nvme_dev_shutdown(dev);
> +		queue_work(nvme_workq, &dev->reset_work);

So now we will get a call to nvme_dev_shutdown for every I/O that times
out.  I'll see how my controller handles the discard of death with that.

>  
>  		/*
> +		 * Mark the request as handled, since the inline shutdown
> +		 * forces all outstanding requests to complete.
>  		 */
> -		return BLK_EH_QUIESCED;
> +		return BLK_EH_HANDLED;

Again we're not setting req->errors here.

> -	if (status > 0) {
> +	if (status) {
>  		dev_err(dev->dev, "Could not set queue count (%d)\n", status);
>  		return 0;
>  	}
> @@ -2023,6 +2038,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
>  
>  	nvme_dev_list_remove(dev);
>  
> +	mutex_lock(&dev->shutdown_lock);

Shouldn't we also have a state check that skips all the work here if
the controller has already been shut down?

> +
> +	/*
> +	 * The pci device will be disabled if a timeout occurs while setting up
> +	 * IO queues, but return 0 for "result", so we have to check both.
> +	 */
> +	if (result || !pci_is_enabled(to_pci_dev(dev->dev)))
>  		goto free_tags;

I don't think we should return zero here in that case - that seems to
be a bug in setting up req->errors in the timeout handler.


* [PATCH] NVMe: Shutdown fixes
  2015-11-25  8:55 ` Christoph Hellwig
@ 2015-11-25 16:05   ` Keith Busch
  2015-11-25 16:42     ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2015-11-25 16:05 UTC (permalink / raw)


On Wed, Nov 25, 2015@12:55:37AM -0800, Christoph Hellwig wrote:
> > +static inline void nvme_end_req(struct request *req, int error)
> > +{
> > +	/*
> > +	 * The request may already be marked "complete" by blk-mq if
> > +	 * completion occurs during a timeout. We set req->errors
> > +	 * before telling blk-mq in case this is the target request
> > +	 * of a timeout handler. This is safe since this driver never
> > +	 * completes the same command twice.
> > +	 */
> > +	req->errors = error;
> > +	blk_mq_complete_request(req, req->errors);
> 
> I don't think overwriting req->errors on an already completed request
> is a good idea - it's what we used to do earlier, but I specifically changed
> the blk_mq_complete_request so that we don't do that.  Can you explain
> when you want to overwrite the value exactly and why?

Explanation follows:

> >  	if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
> > -		req->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
> > +		dev_warn(dev->dev,
> > +				"I/O %d QID %d timeout, disable controller\n",
> > +				 req->tag, nvmeq->qid);
> > +		nvme_dev_shutdown(dev);
> >  		return BLK_EH_HANDLED;
> 
> So here you return BLK_EH_HANDLED without setting req->errors, which
> sounds like it might be related to the above issue.

Right, we don't want to set req->errors here. nvme_dev_shutdown
unconditionally reaps every request known to the driver, so it's safe to
assume the timed out request is completed and return EH_HANDLED.

Executing the nvme_timeout code path more often than not means the
controller isn't going to return that command, but we can't count on that.
So there are two possibilities:

  1. request completes through nvme_process_cq
  2. request completes through nvme_cancel_queue_ios

In either case, req->errors is set through the new nvme_end_req, as you
surmised. We don't want to set req->errors here directly in case the
request completes through process_cq. We should prefer the controller's
status rather than make one up.

> > +		 * Mark the request as handled, since the inline shutdown
> > +		 * forces all outstanding requests to complete.
> >  		 */
> > -		return BLK_EH_QUIESCED;
> > +		return BLK_EH_HANDLED;
> 
> Again we're not setting req->errors here.

Yep, same reasoning as above. This also fixes this race:

        CPU 0                           CPU 1

  blk_mq_check_expired                nvme_irq
    set_bit(REQ_ATOM_COMPLETE)          nvme_process_cq
    nvme_timeout                          blk_mq_complete_request
                                            if (!test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags) ||
	                                         test_and_clear_bit(REQ_ATOM_QUIESCED, &rq->atomic_flags))
      return BLK_EH_QUIESCED
    set_bit REQ_ATOM_QUIESCED

Both checks on blk_mq_complete_request will fail since the timeout
handler hasn't returned "quiesce" yet, and the request will be orphaned.

Instead we can let nvme_dev_shutdown reap the request and set req->errors
directly through one of the two paths described above, then return
BLK_EH_HANDLED.
 
> > @@ -2023,6 +2038,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
> >  
> >  	nvme_dev_list_remove(dev);
> >  
> > +	mutex_lock(&dev->shutdown_lock);
> 
> Shouldn't we also have a state check that skips all the work here if
> the controller has already been shut down?

This is implied through dev->bar != NULL, and the nvmeq's cq_vector value.

> > +	 * The pci device will be disabled if a timeout occurs while setting up
> > +	 * IO queues, but return 0 for "result", so we have to check both.
> > +	 */
> > +	if (result || !pci_is_enabled(to_pci_dev(dev->dev)))
> >  		goto free_tags;
>
> I don't think we should return zero here in that case - that seems to
> be a bug in setting up req->errors in the timeout handler.

To set an appropriate non-zero result, we'd need to add pci_is_enabled()
checks in nvme_setup_io_queues to distinguish a non-fatal command error
vs timeout. Is that preferable to the check where I have it?
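
FWIW, the alternative I have in mind would look roughly like the untested
sketch below; the exact placement inside nvme_setup_io_queues and the
-ENODEV return value are only illustrative:

	/*
	 * Hypothetical: bail out of queue setup if the timeout handler
	 * disabled the device underneath us, instead of checking
	 * pci_is_enabled() later in nvme_reset_work.
	 */
	result = set_queue_count(dev, nr_io_queues);
	if (!pci_is_enabled(to_pci_dev(dev->dev)))
		return -ENODEV;		/* shut down by nvme_timeout */
	if (result <= 0)
		return result;		/* zero queues or error: handled as before */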


* [PATCH] NVMe: Shutdown fixes
  2015-11-25 16:05   ` Keith Busch
@ 2015-11-25 16:42     ` Christoph Hellwig
  2015-11-25 16:57       ` Keith Busch
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2015-11-25 16:42 UTC (permalink / raw)


On Wed, Nov 25, 2015@04:05:45PM +0000, Keith Busch wrote:
> > >  	if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
> > > -		req->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
> > > +		dev_warn(dev->dev,
> > > +				"I/O %d QID %d timeout, disable controller\n",
> > > +				 req->tag, nvmeq->qid);
> > > +		nvme_dev_shutdown(dev);
> > >  		return BLK_EH_HANDLED;
> > 
> > So here you return BLK_EH_HANDLED without setting req->errors, which
> > sounds like it might be related to the above issue.
> 
> Right, we don't want to set req->errors here. nvme_dev_shutdown
> unconditionally reaps every request known to the driver,

No, it doesn't.  nvme_dev_shutdown through nvme_clear_queue and
nvme_cancel_queue_ios calls blk_mq_complete_request on every started
request.  But blk_mq_complete_request is a no-op if REQ_ATOM_COMPLETE
is already set, which it is for a request in the timeout handler.

> so it's safe to
> assume the timed out request is completed and return EH_HANDLED.

Returning EH_HANDLED after you did the shutdown does indeed look
safe to me, but only because an EH_HANDLED makes blk_mq_rq_timed_out
call __blk_mq_complete_request after we returned.  And for that
we want to set req->errors.  Now your nvme_end_req ends up setting
a return value for it as well, but I'd much prefer not to do that
unconditional assignment and set a proper value here when we have ownership of
the request.
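
Something like this untested sketch is what I have in mind (reusing the
status value the old code assigned; the exact error code is debatable):

	if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
		dev_warn(dev->dev,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_shutdown(dev);

		/*
		 * We own the request now; blk_mq_rq_timed_out() will
		 * complete it with this value once we return.
		 */
		req->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
		return BLK_EH_HANDLED;
	}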

> In either case, req->errors is set through the new nvme_end_req, as you
> surmised. We don't want to set req->errors here directly in case the
> request completes through process_cq. We should prefer the controller's
> status rather than make one up.

If the controller returns the command after we already started the
shutdown it's too late, and we have to retry it or fence off the
controller.  Note that the unconditional assignment in
nvme_cancel_queue_ios can also race the other way: a successfully
completed command may get req->errors overwritten with NVME_SC_ABORT_REQ
before it's been returned to the caller (that window is very small,
though).

> Yep, same reasoning as above. This also fixes this race:
> 
>         CPU 0                           CPU 1
> 
>   blk_mq_check_expired                nvme_irq
>     set_bit(REQ_ATOM_COMPLETE)          nvme_process_cq
>     nvme_timeout                          blk_mq_complete_request
>                                             if (!test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags) ||
> 	                                         test_and_clear_bit(REQ_ATOM_QUIESCED, &rq->atomic_flags))
>       return BLK_EH_QUIESCED
>     set_bit REQ_ATOM_QUIESCED
>
> Both checks on blk_mq_complete_request will fail since the timeout
> handler hasn't returned "quiesce" yet, and the request will be orphaned.

BLK_EH_QUIESCED is never returned with your patch applied..

> Instead we can let nvme_dev_shutdown reap the request and set req->errors
> directly through one of the two paths described above, then return
> BLK_EH_HANDLED.

nvme_dev_shutdown will not complete the command that we timed out,
blk_mq_complete_request will skip it because REQ_ATOM_COMPLETE is set,
and blk_mq_rq_timed_out will complete after we returned from the timeout
handler.

> > > @@ -2023,6 +2038,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
> > >  
> > >  	nvme_dev_list_remove(dev);
> > >  
> > > +	mutex_lock(&dev->shutdown_lock);
> > 
> Shouldn't we also have a state check that skips all the work here if
> > the controller has already been shut down?
> 
> This is implied through dev->bar != NULL, and the nvmeq's cq_vector value.

Well, we're not doing anything harmful in the low-level functions at the
moment (given the checks in nvme_dev_unmap, and that requests with
REQ_ATOM_COMPLETE set are skipped), but it still seems rather fragile to
rely on these low level functions deep down the stack to get everything
right.

> > > +	 * The pci device will be disabled if a timeout occurs while setting up
> > > +	 * IO queues, but return 0 for "result", so we have to check both.
> > > +	 */
> > > +	if (result || !pci_is_enabled(to_pci_dev(dev->dev)))
> > >  		goto free_tags;
> >
> > I don't think we should return zero here in that case - that seems to
> > be a bug in setting up req->errors in the timeout handler.
> 
> To set an appropriate non-zero result, we'd need to add pci_is_enabled()
> checks in nvme_setup_io_queues to distinguish a non-fatal command error
> vs timeout. Is that preferable to the check where I have it?

No, that's not my preference.  My preference is to figure out why you
get a zero req->errors on a timed-out request.  That really shouldn't happen
to start with.


* [PATCH] NVMe: Shutdown fixes
  2015-11-25 16:42     ` Christoph Hellwig
@ 2015-11-25 16:57       ` Keith Busch
  2015-11-25 17:08         ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2015-11-25 16:57 UTC (permalink / raw)


On Wed, Nov 25, 2015@08:42:57AM -0800, Christoph Hellwig wrote:
> >         CPU 0                           CPU 1
> > 
> >   blk_mq_check_expired                nvme_irq
> >     set_bit(REQ_ATOM_COMPLETE)          nvme_process_cq
> >     nvme_timeout                          blk_mq_complete_request
> >                                             if (!test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags) ||
> > 	                                         test_and_clear_bit(REQ_ATOM_QUIESCED, &rq->atomic_flags))
> >       return BLK_EH_QUIESCED
> >     set_bit REQ_ATOM_QUIESCED
> >
> > Both checks on blk_mq_complete_request will fail since the timeout
> > handler hasn't returned "quiesce" yet, and the request will be orphaned.
> 
> BLK_EH_QUIESCED is never returned with your patch applied..

I'm just showing why it was broken before. My patch fixes it since we
reap the command and return BLK_EH_HANDLED.
 
> > Instead we can let nvme_dev_shutdown reap the request and set req->errors
> > directly through one of the two paths described above, then return
> > BLK_EH_HANDLED.
> 
> nvme_dev_shutdown will not complete the command that we timed out,
> blk_mq_complete_request will skip it because REQ_ATOM_COMPLETE is set,
> and blk_mq_rq_timed_out will complete after we returned from the timeout
> handler.

I'm not saying nvme_dev_shutdown "completes" the command. I'm just
saying it reaps it and sets an appropriate req->errors. The actual blk-mq
completion happens as you described through nvme_timeout's return code
of BLK_EH_HANDLED.

> > This is implied through dev->bar != NULL, and the nvmeq's cq_vector value.
> 
> Well, we're not doing anything harmful in the low-level functions at the
> moment (given the checks in nvme_dev_unmap, and that requests with
> REQ_ATOM_COMPLETE set are skipped), but it still seems rather fragile to
> rely on these low level functions deep down the stack to get everything
> right.

Agreed.
 
> > To set an appropriate non-zero result, we'd need to add pci_is_enabled()
> > checks in nvme_setup_io_queues to distinguish a non-fatal command error
> > vs timeout. Is that preferable to the check where I have it?
> 
> No, that's not my preference.  My preference is to figure out why you
> get a zero req->errors on a timed-out request.  That really shouldn't happen
> to start with.

req->errors wouldn't be 0, but nvme_set_queue_count returns "0" queues
if req->errors is non-zero. It's not a fatal error to have 0 queues so
we don't want to propagate that error to the initialization unless it's
a timeout.

We also don't return error codes on create sq/cq commands because an error
here isn't fatal either unless, of course, it's a timeout.


* [PATCH] NVMe: Shutdown fixes
  2015-11-25 16:57       ` Keith Busch
@ 2015-11-25 17:08         ` Christoph Hellwig
  2015-11-25 17:28           ` Keith Busch
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2015-11-25 17:08 UTC (permalink / raw)


On Wed, Nov 25, 2015@04:57:03PM +0000, Keith Busch wrote:
> > nvme_dev_shutdown will not complete the command that we timed out,
> > blk_mq_complete_request will skip it because REQ_ATOM_COMPLETE is set,
> > and blk_mq_rq_timed_out will complete after we returned from the timeout
> > handler.
> 
> I'm not saying nvme_dev_shutdown "completes" the command. I'm just
> saying it reaps it and sets an appropriate req->errors. The actual blk-mq
> completion happens as you described through nvme_timeout's return code
> of BLK_EH_HANDLED.

So let's get rid of the nvme_end_req thing and return the real errors.

> > > To set an appropriate non-zero result, we'd need to add pci_is_enabled()
> > > checks in nvme_setup_io_queues to distinguish a non-fatal command error
> > > vs timeout. Is that preferable to the check where I have it?
> > 
> > No, that's not my preference.  My preference is to figure out why you
> > get a zero req->errors on a timed-out request.  That really shouldn't happen
> > to start with.
> 
> req->errors wouldn't be 0, but nvme_set_queue_count returns "0" queues
> if req->errors is non-zero. It's not a fatal error to have 0 queues so
> we don't want to propogate that error to the initialization unless it's
> a timeout.

Ok, step back here, I think that set_queue_count beast is getting too
confusing.  What do your 'failing' devices return in the NVMe CQE status
and result fields?  Do we get a failure in status, or do we get a zero
status and a zero result?

> We also don't return error codes on create sq/cq commands because an error
> here isn't fatal either unless, of course, it's a timeout.

So make sure that we a) don't mix up the status and result return values
by making the count parameter of set_queue_count a pointer and thus
keeping it separate from the status.
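
As a rough, untested sketch of that interface change (hypothetical
signature, body adapted from the existing helper):

	static int set_queue_count(struct nvme_dev *dev, int *count)
	{
		u32 result;
		u32 q_count = (*count - 1) | ((*count - 1) << 16);
		int status;

		status = nvme_set_features(&dev->ctrl, NVME_FEAT_NUM_QUEUES,
						q_count, 0, &result);
		if (status)
			return status;

		/* negotiated count goes out through the pointer... */
		*count = min(result & 0xffff, result >> 16) + 1;
		/* ...and the return value only ever carries the status */
		return 0;
	}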

And then explicitly check for a timeout on create sq/cq.  And for
set_queue_count I disagree that it shouldn't be non-fatal: it's how
the hardware tells you how many queues are supported.  If some
hardware gets that wrong we'll have to handle it for better or worse,
but we should document that it's buggy and probably even log a message.

And a failing create sq/cq actually is fatal for us except for the
initial probe, as blk-mq will keep using it once we set up the
queue count.


* [PATCH] NVMe: Shutdown fixes
  2015-11-25 17:08         ` Christoph Hellwig
@ 2015-11-25 17:28           ` Keith Busch
  2015-11-25 17:55             ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2015-11-25 17:28 UTC (permalink / raw)


On Wed, Nov 25, 2015@09:08:42AM -0800, Christoph Hellwig wrote:
> On Wed, Nov 25, 2015@04:57:03PM +0000, Keith Busch wrote:
> > I'm not saying nvme_dev_shutdown "completes" the command. I'm just
> > saying it reaps it and sets an appropriate req->errors. The actual blk-mq
> > completion happens as you described through nvme_timeout's return code
> > of BLK_EH_HANDLED.
> 
> So let's get rid of the nvme_end_req thing and return the real errors.

nvme_end_req _does_ set the real error.

We can't set it in the timeout handler. How would you even know to
set an error? Maybe the controller posted success status, so we should
let nvme_dev_shutdown reap the command and set req->errors. It sets
the real status observed through process_cq, or a fake status through
cancel_queue_ios. Either way, req->errors is set and the req is safe to
complete after nvme_dev_shutdown.

> > req->errors wouldn't be 0, but nvme_set_queue_count returns "0" queues
> > if req->errors is non-zero. It's not a fatal error to have 0 queues so
> > we don't want to propagate that error to the initialization unless it's
> > a timeout.
> 
> Ok, step back here, I think that set_queue_count beast is getting too
> confusing.  What do your 'failing' devices return in the NVMe CQE status
> and result fields?  Do we get a failure in status, or do we get a zero
> status and a zero result?

Such devices return status 6, internal device error, and the scenario
requires user management to rectify.
 
> > We also don't return error codes on create sq/cq commands because an error
> > here isn't fatal either unless, of course, it's a timeout.
> 
> So make sure that we a) don't mix up the status and result return values
> by making the count parameter of set_queue_count a pointer and thus
> keeping it separate from the status.

> And then explicitly check for a timeout on create sq/cq.

How do you explicitly check for a timeout? We used to have negative
error codes for these, but you changed that to fake NVMe status codes.

As far as I can tell, pci_is_enabled is the only way to see if a timeout
happened in this path, as it implies the device was shut down by the
timeout handler.

> And for
> set_queue_count I disagree that it shouldn't be non-fatal: it's how
> the hardware tells you how many queues are supported.  If some
> hardware gets that wrong we'll have to handle it for better or worse,
> but we should document that it's buggy and probably even log a message.

It's not "wrong" to return an appropriate error code to a command if the
device is in a state that can't support it. The h/w may be in a non-optimal
condition, but we handle that correctly today.

> And a failing create sq/cq actually is fatal for us except for the
> initial probe, as blk-mq will keep using it once we set up the
> queue count.

That's true, blk-mq does not let us change nr_hw_queues, but that's a
different issue we can fix.


* [PATCH] NVMe: Shutdown fixes
  2015-11-25 17:28           ` Keith Busch
@ 2015-11-25 17:55             ` Christoph Hellwig
  2015-11-25 18:24               ` Keith Busch
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2015-11-25 17:55 UTC (permalink / raw)


On Wed, Nov 25, 2015@05:28:45PM +0000, Keith Busch wrote:
> > So let's get rid of the nvme_end_req thing and return the real errors.
> 
> nvme_end_req _does_ set the real error.
> 
> We can't set it in the timeout handler. How would you even know to
> set an error?

Because the command timed out, which means we're done with it.  That's
the whole point of the timeout - after that the command is fenced off
and we are done with it.

> Maybe the controller posted success status, so we should
> let nvme_dev_shutdown reap the command and set req->errors.

If the controller had posted success status in time we would not have
timed out the command.

What you basically seem to care about is handling the race where the
controller still completes the command after the timeout handler is
running, and you want the controller to win over the timeout handler.

But as I said above the race also works the other way around: the
controller completed the command just before we shut down the
controller, and now we overwrite req->errors with the wrong value
in nvme_cancel_queue_ios.

The controller had plenty of time to complete the command properly,
once we are in the timeout handler we decided to cut it off, and now we're
going to retry after a reset.

> > Ok, step back here, I think that set_queue_count beast is getting too
> > confusing.  What do your 'failing' devices return in the NVMe CQE status
> > and result fields?  Do we get a failure in status, or do we get a zero
> > status and a zero result?
> 
> Such devices return status 6, internal device error, and the scenario
> requires user management to rectify.

Eww, ok.  So what do we want to do with the devices?  Just ensure the
admin queue is up to be able to send vendor-specific admin commands to
fix it?

> How do you explicitly check for a timeout? We used to have negative
> error codes for these, but you changed that to fake NVMe status codes.

The aborted one is pretty obvious for NVMe.  Anyway, I'm finally
understanding what the magic negative values are for.  Without the long
back story (that you actually want to support broken devices that need
to come up without I/O queues, and thus we have to ignore the Set
Features field) there was absolutely no way to figure them out.

I'm perfectly happy to add back a:

+       /* Linux driver specific value: */
+       NVME_SC_TIMEOUT                 = -1;

and use it specifically for this case.  But only as a trade-in for
documenting in detail why we even bother with all this special casing.
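
i.e. something like restoring the old two-way check, now with comments
explaining why (sketch only):

	status = nvme_set_features(&dev->ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
								&result);
	if (status < 0)		/* NVME_SC_TIMEOUT: controller was shut down */
		return status;
	if (status > 0) {
		/* genuine NVMe error: non-fatal, come up without I/O queues */
		dev_err(dev->dev, "Could not set queue count (%d)\n", status);
		return 0;
	}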

> > set_queue_count I disagree that it shouldn't be non-fatal: it's how
> > the hardware tells you how many queues are supported.  If some
> > hardware gets that wrong we'll have to handle it for better or worse,
> > but we should document that it's buggy and probably even log a message.
> 
> It's not "wrong" to return an appropriate error code to a command if the
> device is in a state that can't support it. The h/w may be in a non-optimal
> condition, but we handle that correctly today.

I assume "correctly" means the admin queue is up, and no I/O queues are
working.  That sounds reasonable to me, but it's absolutely not obvious
from the code that such a case matters.  Comments, comments, comments!


* [PATCH] NVMe: Shutdown fixes
  2015-11-25 17:55             ` Christoph Hellwig
@ 2015-11-25 18:24               ` Keith Busch
  2015-11-25 19:38                 ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2015-11-25 18:24 UTC (permalink / raw)


On Wed, Nov 25, 2015@09:55:12AM -0800, Christoph Hellwig wrote:
> On Wed, Nov 25, 2015@05:28:45PM +0000, Keith Busch wrote:
> > > So let's get rid of the nvme_end_req thing and return the real errors.
> > 
> > nvme_end_req _does_ set the real error.
> > 
> > We can't set it in the timeout handler. How would you even know to
> > set an error?
> 
> Because the command timed out, which means we're done with it.  That's
> the whole point of the timeout - after that the command is fenced off
> and we are done with it.

I suppose some users with their erasure codes prefer to see a failure
status quickly rather than wait for a slower success.

> > Maybe the controller posted success status, so we should
> > let nvme_dev_shutdown reap the command and set req->errors.
> 
> If the controller had posted success status in time we would not have
> timed out the command.

Right, I'm just concerned with the race. There's no reason the controller
couldn't post completion at the same time as our completely arbitrary
timeout kicks in.

> What you basically seem to care about is handling the race where the
> controller still completes the command after the timeout handler is
> running, and you want the controller to win over the timeout handler.

Yes, preferring not to fake a status if a real one is available.
 
> But as I said above the race also works the other way around: the
> controller completed the command just before we shut down the
> controller, and now we overwrite req->errors with the wrong value
> in nvme_cancel_queue_ios.

Oh I see. I thought the tag wouldn't be seen by nvme_clear_queue's busy
iterator if the request's CQ entry was seen through process_cq, but I
see now that I was wrong about that if it's the timed out request.

Okay, so the status is overridden to "abort" in my patch. I think that's
the status you wanted to see, but it's far removed from where we're returning
"BLK_EH_HANDLED".

> > Such devices return status 6, internal device error, and the scenario
> > requires user management to rectify.
> 
> Eww, ok.  So what do we want to do with the devices?  Just ensure the
> > admin queue is up to be able to send vendor-specific admin commands to
> fix it?

Yep.

> > How do you explicitly check for a timeout? We used to have negative
> > error codes for these, but you changed that to fake NVMe status codes.
> 
> The aborted one is pretty obvious for NVMe.  Anyway, I'm finally
> understanding what the magic negative values are for.  Without the long
> back story (that you actually want to support broken devices that need
> to come up without I/O queues, and thus we have to ignore the Set
> Features field) there was absolutely no way to figure them out.
> 
> I'm perfectly happy to add back a:
> 
> +       /* Linux driver specific value: */
> +       NVME_SC_TIMEOUT                 = -1;
> 
> and use it specifically for this case.  But only as a trade-in for
> documenting in detail why we even bother with all this special casing.

Cool, sounds good.


* [PATCH] NVMe: Shutdown fixes
  2015-11-25 18:24               ` Keith Busch
@ 2015-11-25 19:38                 ` Christoph Hellwig
  2015-11-30  8:47                   ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2015-11-25 19:38 UTC (permalink / raw)


I'll give this a spin the next day.  I assume you and Jens will be out
for a long weekend starting tomorrow, so I should have a new series
ready for review.  I'll fold a modified version of your patch in, as I
think we should get this right from the start.


* [PATCH] NVMe: Shutdown fixes
  2015-11-25 19:38                 ` Christoph Hellwig
@ 2015-11-30  8:47                   ` Christoph Hellwig
  2015-11-30 21:42                     ` Keith Busch
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2015-11-30  8:47 UTC (permalink / raw)


On Wed, Nov 25, 2015@11:38:51AM -0800, Christoph Hellwig wrote:
> I'll give this a spin the next day.  I assume you and Jens will be out
> for a long weekend starting tomorrow, so I should have a new series
> ready for review.  I'll fold a modified version of your patch in, as I
> think we should get this right from the start.

Can you review the code in
http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/nvme-req.11
and see if this fits your intention?

I've only sent out the hopefully uncontroversial first part for now
to limit the amount of patch traffic on the list.


* [PATCH] NVMe: Shutdown fixes
  2015-11-30  8:47                   ` Christoph Hellwig
@ 2015-11-30 21:42                     ` Keith Busch
  2015-12-01 12:36                       ` Christoph Hellwig
  2015-12-07 14:12                       ` Christoph Hellwig
  0 siblings, 2 replies; 17+ messages in thread
From: Keith Busch @ 2015-11-30 21:42 UTC (permalink / raw)


On Mon, Nov 30, 2015@12:47:18AM -0800, Christoph Hellwig wrote:
> On Wed, Nov 25, 2015@11:38:51AM -0800, Christoph Hellwig wrote:
> > I'll give this a spin the next day.  I assume you and Jens will be out
> > for a long weekend starting tomorrow, so I should have a new series
> > ready for review.  I'll fold a modified version of your patch in, as I
> > think we should get this right from the start.
> 
> Can you review the code in
> http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/nvme-req.11
> and see if this fits your intention?

Looking good so far, passing all tests.

For something new not related to this work, I was recently copied on an
issue and wanted to check with you for your opinion on how to fix it.

We probe in a workqueue to improve startup time, but not blocking probe
messes up some user space software looking for expected mount
points. I was comparing how SCSI works, and it looks like it relies on
async_schedule and wait_for_device_probe.

We didn't use async_schedule in nvme because nvme_remove can't synchronize
its own work. It doesn't look like much trouble to make "remove" not require
flushing startup, so I'm thinking we can just remove all the work queues
and just use the async API instead. Does that sound reasonable, or any
other ideas?
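
Roughly what I'm picturing (untested sketch; nvme_async_probe is a made-up
helper name, not something that exists today):

	#include <linux/async.h>

	/* hypothetical helper run from the async domain instead of nvme_workq */
	static void nvme_async_probe(void *data, async_cookie_t cookie)
	{
		struct nvme_dev *dev = data;

		nvme_reset_work(&dev->reset_work);	/* slow controller bring-up */
	}

	/* in nvme_probe(), replacing queue_work(nvme_workq, &dev->reset_work): */
	async_schedule(nvme_async_probe, dev);

	/* anyone who needs the namespaces to exist can then wait with: */
	wait_for_device_probe();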
 
> I've only sent out the hopefully uncontroversial first part for now
> to limit the amount of patch traffic on the list.


* [PATCH] NVMe: Shutdown fixes
  2015-11-30 21:42                     ` Keith Busch
@ 2015-12-01 12:36                       ` Christoph Hellwig
  2015-12-07 14:12                       ` Christoph Hellwig
  1 sibling, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2015-12-01 12:36 UTC (permalink / raw)


On Mon, Nov 30, 2015@09:42:10PM +0000, Keith Busch wrote:
> > Can you review the code in
> > http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/nvme-req.11
> > and see if this fits your intention?
> 
> Looking good so far, passing all tests.

Thanks.  I'll send it out once I got your Ack and Jens has picked up
the currently posted batch.

> For something new not related to this work, I was recently copied on an
> issue and wanted to check with you for your opinion on how to fix.
> 
> We probe in a workqueue to improve startup time, but not blocking probe
> messes up some user space software looking for expected mount
> points. I was comparing how SCSI works, and it looks like it relies on
> async_schedule and wait_for_device_probe.
> 
> We didn't use async_schedule in nvme because nvme_remove can't synchronize
> its own work. It doesn't look like much trouble to make "remove" not require
> flushing startup, so I'm thinking we can just remove all the work queues
> and just use the async API instead. Does that sound reasonable, or any
> other ideas?

The async mechanisms are a huge nightmare in SCSI and I would strongly
suggest not to introduce any new users.  Code mounting file systems
must wait for the device node to appear and not rely on the modprobe
return anyway.


* [PATCH] NVMe: Shutdown fixes
  2015-11-30 21:42                     ` Keith Busch
  2015-12-01 12:36                       ` Christoph Hellwig
@ 2015-12-07 14:12                       ` Christoph Hellwig
  2015-12-07 22:19                         ` Keith Busch
  1 sibling, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2015-12-07 14:12 UTC (permalink / raw)


On Mon, Nov 30, 2015@09:42:10PM +0000, Keith Busch wrote:
> > Can you review the code in
> > http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/nvme-req.11
> > and see if this fits your intention?
> 
> Looking good so far, passing all tests.

Can you Ack the remaining patches on the list?  I'd like to get them
into the safe harbour of Jens' tree before starting on the round of
updates.


* [PATCH] NVMe: Shutdown fixes
  2015-12-07 14:12                       ` Christoph Hellwig
@ 2015-12-07 22:19                         ` Keith Busch
  2015-12-09 18:34                           ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2015-12-07 22:19 UTC (permalink / raw)


On Mon, Dec 07, 2015@06:12:47AM -0800, Christoph Hellwig wrote:
> On Mon, Nov 30, 2015@09:42:10PM +0000, Keith Busch wrote:
> > > Can you review the code in
> > > http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/nvme-req.11
> > > and see if this fits your intention?
> > 
> > Looking good so far, passing all tests.
> 
> Can you Ack the remaining patches on the list?  I'd like to get them
> into the safe harbour of Jens' tree before starting on the round of
> updates.

Looks good, thanks! For the series:

Acked-by: Keith Busch <keith.busch at intel.com>


* [PATCH] NVMe: Shutdown fixes
  2015-12-07 22:19                         ` Keith Busch
@ 2015-12-09 18:34                           ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2015-12-09 18:34 UTC (permalink / raw)


On Mon, Dec 07, 2015@10:19:03PM +0000, Keith Busch wrote:
> Looks good, thanks! For the series:
> 
> Acked-by: Keith Busch <keith.busch at intel.com>

Jens, can you pull
http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/nvme-req.11
in with this ACK?

