* RFC: what to do about abort?
From: Christoph Hellwig @ 2016-05-04 10:03 UTC


I've recently spent some time on the Abort implementation for
the Fabrics host and target drivers, and given how vaguely Abort
is defined in the spec, I fail to see how sending an abort actually
buys us anything.  The controller doesn't even have to respond, and
in fact many don't, so we simply end up escalating to a reset after
the abort command times out.  Similarly the number of abort commands
is severely limited, and I've not seen a controller exceed the
recommended limit of 4 concurrent abort commands, leading to severe
pressure on the error handling code for the typical case of a somehow
stuck controller that has multiple outstanding commands beyond the
allowed timeout.

Based on that I came to the conclusion that we'd be much better off
to just use the 60 second timeout for I/O commands as well, and simply
avoid ever sending the abort command.  RFC patch below:

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2de248b..21b10ed 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -38,7 +38,7 @@ module_param(admin_timeout, byte, 0644);
 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
 EXPORT_SYMBOL_GPL(admin_timeout);
 
-unsigned char nvme_io_timeout = 30;
+unsigned char nvme_io_timeout = 60;
 module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
 EXPORT_SYMBOL_GPL(nvme_io_timeout);
@@ -1100,7 +1100,6 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 
 	ctrl->vid = le16_to_cpu(id->vid);
 	ctrl->oncs = le16_to_cpup(&id->oncs);
-	atomic_set(&ctrl->abort_limit, id->acl + 1);
 	ctrl->vwc = id->vwc;
 	ctrl->cntlid = le16_to_cpup(&id->cntlid);
 	memcpy(ctrl->serial, id->sn, sizeof(id->sn));
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 114b928..4ea0914 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -102,7 +102,6 @@ struct nvme_ctrl {
 	u32 stripe_size;
 	u16 oncs;
 	u16 vid;
-	atomic_t abort_limit;
 	u8 event_limit;
 	u8 vwc;
 	u32 vs;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fb741d0..3ac42d3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -141,7 +141,6 @@ struct nvme_queue {
  */
 struct nvme_iod {
 	struct nvme_queue *nvmeq;
-	int aborted;
 	int npages;		/* In the PRP list. 0 means small pool in use */
 	int nents;		/* Used in scatterlist */
 	int length;		/* Of data, in bytes */
@@ -306,7 +305,6 @@ static int nvme_init_iod(struct request *rq, unsigned size,
 		iod->sg = iod->inline_sg;
 	}
 
-	iod->aborted = 0;
 	iod->npages = -1;
 	iod->nents = 0;
 	iod->length = size;
@@ -633,12 +631,6 @@ static void nvme_complete_rq(struct request *req)
 			error = nvme_error_status(req->errors);
 	}
 
-	if (unlikely(iod->aborted)) {
-		dev_warn(dev->ctrl.device,
-			"completing aborted command with status: %04x\n",
-			req->errors);
-	}
-
 	blk_mq_end_request(req, error);
 }
 
@@ -830,93 +822,26 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
 	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 }
 
-static void abort_endio(struct request *req, int error)
-{
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	struct nvme_queue *nvmeq = iod->nvmeq;
-	u16 status = req->errors;
-
-	dev_warn(nvmeq->dev->ctrl.device, "Abort status: 0x%x", status);
-	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
-	blk_mq_free_request(req);
-}
-
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = iod->nvmeq;
 	struct nvme_dev *dev = nvmeq->dev;
-	struct request *abort_req;
-	struct nvme_command cmd;
-
-	/*
-	 * Shutdown immediately if controller times out while starting. The
-	 * reset work will see the pci device disabled when it gets the forced
-	 * cancellation error. All outstanding requests are completed on
-	 * shutdown, so we return BLK_EH_HANDLED.
-	 */
-	if (dev->ctrl.state == NVME_CTRL_RESETTING) {
-		dev_warn(dev->ctrl.device,
-			 "I/O %d QID %d timeout, disable controller\n",
-			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false);
-		req->errors = NVME_SC_CANCELLED;
-		return BLK_EH_HANDLED;
-	}
-
-	/*
- 	 * Shutdown the controller immediately and schedule a reset if the
- 	 * command was already aborted once before and still hasn't been
- 	 * returned to the driver, or if this is the admin queue.
-	 */
-	if (!nvmeq->qid || iod->aborted) {
-		dev_warn(dev->ctrl.device,
-			 "I/O %d QID %d timeout, reset controller\n",
-			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false);
-		queue_work(nvme_workq, &dev->reset_work);
+	enum nvme_ctrl_state state = dev->ctrl.state;
 
-		/*
-		 * Mark the request as handled, since the inline shutdown
-		 * forces all outstanding requests to complete.
-		 */
-		req->errors = NVME_SC_CANCELLED;
-		return BLK_EH_HANDLED;
-	}
-
-	iod->aborted = 1;
-
-	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
-		atomic_inc(&dev->ctrl.abort_limit);
-		return BLK_EH_RESET_TIMER;
-	}
-
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.abort.opcode = nvme_admin_abort_cmd;
-	cmd.abort.cid = req->tag;
-	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
-
-	dev_warn(nvmeq->dev->ctrl.device,
-		"I/O %d QID %d timeout, aborting\n",
+	dev_warn(dev->ctrl.device,
+		 "I/O %d QID %d timeout, reset controller\n",
 		 req->tag, nvmeq->qid);
-
-	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
-			BLK_MQ_REQ_NOWAIT);
-	if (IS_ERR(abort_req)) {
-		atomic_inc(&dev->ctrl.abort_limit);
-		return BLK_EH_RESET_TIMER;
-	}
-
-	abort_req->timeout = ADMIN_TIMEOUT;
-	abort_req->end_io_data = NULL;
-	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);
+	nvme_dev_disable(dev, false);
+	if (state != NVME_CTRL_RESETTING)
+		queue_work(nvme_workq, &dev->reset_work);
 
 	/*
-	 * The aborted req will be completed on receiving the abort req.
-	 * We enable the timer again. If hit twice, it'll cause a device reset,
-	 * as the device then is in a faulty state.
+	 * Mark the request as handled, since the inline shutdown
+	 * forces all outstanding requests to complete.
 	 */
-	return BLK_EH_RESET_TIMER;
+	req->errors = NVME_SC_CANCELLED;
+	return BLK_EH_HANDLED;
 }
 
 static void nvme_cancel_io(struct request *req, void *data, bool reserved)


* RFC: what to do about abort?
From: Hannes Reinecke @ 2016-05-04 10:19 UTC


On 05/04/2016 12:03 PM, Christoph Hellwig wrote:
> I've recently spent some time on the Abort implementation for
> the Fabrics host and target drivers, and given how vaguely Abort
> is defined in the spec, I fail to see how sending an abort actually
> buys us anything.  The controller doesn't even have to respond, and
> in fact many don't, so we simply end up escalating to a reset after
> the abort command times out.  Similarly the number of abort commands
> is severely limited, and I've not seen a controller exceed the
> recommended limit of 4 concurrent abort commands, leading to severe
> pressure on the error handling code for the typical case of a somehow
> stuck controller that has multiple outstanding commands beyond the
> allowed timeout.
> 
Failure to respond to an abort command?
I.e. you send the abort and never ever get a completion for the abort?
Uh-oh.
How would we be able to figure out if the card has been able to
process the abort?

> Based on that I came to the conclusion that we'd be much better off
> to just use the 60 second timeout for I/O commands as well, and simply
> avoid ever sending the abort command.  RFC patch below:
> 
Doesn't really help if the command has been dropped into a black
hole somewhere on the way, right?
In general you _do_ want to handle aborts; there might be
long-running commands or the array might be genuinely stuck.
In which case you do want to send an abort to inform the array that
it should stop processing the command.

If and how the aborts are handled from the initiator side is another
story, but for the transport you do need them.
And increasing the timeout is just deferring the issue to another
time; why should any command return within the increased timeout, if
it already failed to return within the original timeout?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare at suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


* RFC: what to do about abort?
From: Christoph Hellwig @ 2016-05-04 10:58 UTC


On Wed, May 04, 2016 at 12:19:53PM +0200, Hannes Reinecke wrote:
> Failure to respond to an abort command?
> I.e. you send the abort and never ever get a completion for the abort?
> Uh-oh.

The NVMe spec is extremely vague about Abort doing anything:

"An Abort command is a best effort command; the command to abort may
 have already completed, currently be in execution, or may be deeply
 queued. It is implementation specific if/when a controller chooses
 to complete the command when the command to abort is not found."

And various controllers make full use of the ambiguity offered.

> How would we be able to figure out if the card has been able to
> process the abort?

We don't.  Once the abort command itself times out we finally reset
the controller.  And I've not seen an Abort command doing anything but
timing out in the wild.
> 
> > Based on that I came to the conclusion that we'd be much better off
> > to just use the 60 second timeout for I/O commands as well, and simply
> > avoid ever sending the abort command.  RFC patch below:
> > 
> Doesn't really help if the command has been dropped into a black
> hole somewhere on the way, right?

It does.  We still end up resetting the controller about 60 seconds
after the black hole appeared, but we put a whole lot less stress
on the host and controller in the meantime.

> In general you _do_ want to handle aborts; there might be
> long-running commands or the array might be genuinely stuck.
> In which case you do want to send an abort to inform the array that
> it should stop processing the command.

There might be special cases where we want to abort a long running
command without touching the rest of the controller state.  However,
for the current NVMe driver there are no such long running commands
except for the asynchronous event requests (which never respond to
aborts anyway in practice), and there is no infrastructure to abort
commands except from the timeout handler.  This mail and patch
should not be interpreted as blocking that use case of abort in the
long run.

> 
> If and how the aborts are handled from the initiator side is another
> story, but for the transport you do need them.

Which transport?  NVMe aborts are protocol level aborts that are just
another command as far as the transport is concerned.

> And increasing the timeout is just deferring the issue to another
> time; why should any command return within the increased timeout, if
> it already failed to return within the original timeout?

No good reason.  I'm just trying to keep existing behavior as much
as possible, and the existing behavior is:

 60 second timeouts for admin commands, then reset the controller
 30 second timeouts for I/O commands, then:

    a) send an abort command if under the abort limit
    b) reset the timer for another 30 seconds

The I/O command behavior essentially is a 60 to 90 second timeout due to
the way aborts are actually implemented.  Setting a consistent 60 second
timeout and doing the deterministic reset will provide us with much more
consistent behavior, and put a lot less stress on the host and
controller in the event of a timeout.
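
Expressed as a rough sketch (illustrative pseudocode only, not driver
code; nvme_reset() and nvme_send_abort() are made-up shorthands for the
disable + reset_work path and the abort submission, and struct io_state
just stands in for the per-request aborted flag):

/*
 * current: 30 second I/O timeout, best-effort abort, reset on the
 * second expiration, i.e. effectively 60+ seconds until the reset
 */
static enum blk_eh_timer_return current_io_timeout(struct request *req,
						   struct io_state *st)
{
	if (st->aborted) {		/* second 30 second window expired */
		nvme_reset();
		return BLK_EH_HANDLED;
	}
	st->aborted = 1;
	nvme_send_abort(req);		/* best effort, often never completed */
	return BLK_EH_RESET_TIMER;	/* rearm for another 30 seconds */
}

/* proposed: a single 60 second window, then a deterministic reset */
static enum blk_eh_timer_return proposed_io_timeout(struct request *req)
{
	nvme_reset();
	return BLK_EH_HANDLED;
}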


* RFC: what to do about abort?
From: Busch, Keith @ 2016-05-04 14:59 UTC


> -----Original Message-----
> From: Linux-nvme [mailto:linux-nvme-bounces at lists.infradead.org] On Behalf Of Christoph Hellwig
> Sent: Wednesday, May 04, 2016 4:58 AM
> To: Hannes Reinecke
> Cc: linux-nvme at lists.infradead.org
> Subject: Re: RFC: what to do about abort?


> The NVMe spec is extremely vague about Abort doing anything:
> 
> "An Abort command is a best effort command; the command to abort may
>  have already completed, currently be in execution, or may be deeply
>  queued. It is implementation specific if/when a controller chooses
>  to complete the command when the command to abort is not found."
> 
> And various controllers make full use of the ambiguity offered.

That's an unfortunate way for the spec to describe this command. All my controllers immediately post success status with the CQE's DWORD0 set to 0; abort is essentially a no-op.

Even though it's a no-op for me, the main benefit is that abort makes for an awesome trigger for analyzers and firmware. It's been a valuable tool for debugging issues, and I'd hate to lose it.

Can we propose changing the spec instead?


* RFC: what to do about abort?
From: Christoph Hellwig @ 2016-05-05 14:11 UTC


On Wed, May 04, 2016 at 02:59:11PM +0000, Busch, Keith wrote:
> That's an unfortunate way for the spec to describe this command. All my
> controllers immediately post success status with the CQE's DWORD0 set
> to 0; abort is essentially a no-op.

That's actually one of the most common behaviors.

> Even though it's a no-op for me, the main benefit is that abort makes
> for an awesome trigger for analyzers and firmware. It's been a valuable
> tool for debugging issues, and I'd hate to lose it.

You're the maintainer, so it's up to you - it just doesn't seem very
useful to me from both the host and device perspective.

> Can we propose changing the spec instead?

Good luck changing existing NVMe behavior.  We could probably introduce
an optional well defined abort, but I'm not sure lots of vendors would
even bother to implement it unless there is a hard requirement for it.


* RFC: what to do about abort?
From: Keith Busch @ 2016-05-05 21:56 UTC


On Thu, May 05, 2016 at 07:11:18AM -0700, Christoph Hellwig wrote:
> You're the maintainer, so it's up to you - it just doesn't seem very
> useful to me from both the host and device perspective.

If you don't find a deterministic event on failure to be useful for the
device side, you must have better visibility into your internal device
state than most others do. :)

The spec defines abort so badly, though, that I have to agree you're
right to remove it.  Since having a dependable event greatly helps
debugging field issues when device makers desperately want to blame the
driver for command timeouts, does it sound okay to have a module param
toggle its use, defaulting to off?
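
A rough sketch of what such a toggle could look like (hypothetical only,
the parameter name is made up):

static bool nvme_use_abort;	/* hypothetical toggle; default off */
module_param_named(use_abort, nvme_use_abort, bool, 0644);
MODULE_PARM_DESC(use_abort,
	"send an Abort command on I/O timeout before escalating to reset");

nvme_timeout() would then only take the abort path when nvme_use_abort
is set.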


* RFC: what to do about abort?
From: Christoph Hellwig @ 2016-05-08  9:04 UTC


On Thu, May 05, 2016 at 05:56:33PM -0400, Keith Busch wrote:
> If you don't find a deterministic event on failure to be useful for the
> device side, you must have better visibility into your internal device
> state than most others do. :)

I get a deterministic error: the controller reset.

> The spec defines abort so badly, though, that I have to agree you're
> right to remove it.  Since having a dependable event greatly helps
> debugging field issues when device makers desperately want to blame the
> driver for command timeouts, does it sound okay to have a module param
> toggle its use, defaulting to off?

Let's just keep it.  I don't think it's helpful, but it's not that bad
that we have to get rid of it.

