All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHv3] nvme-pci: enhance timeout kernel log
@ 2024-01-05 18:20 Keith Busch
  2024-01-07  0:26 ` Max Gurtovoy
  0 siblings, 1 reply; 4+ messages in thread
From: Keith Busch @ 2024-01-05 18:20 UTC (permalink / raw)
  To: linux-nvme; +Cc: hch, sagi, axboe, chaitanyak, mgurtovoy, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Kernel configs don't necessarily have opcode decoding, and some opcodes
are not even decodable. It is still interesting for debugging SSD issues
to know what opcode is timing out, what request type it came from, and
the data size (if applicable).

Also print the command_id along side blk-mq's tag to help match commands
with protocol wire traces and firmware logs,

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
v2->v3: formatting updates and print out the command_id too

 drivers/nvme/host/pci.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f272026807419..698c6e497a22e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1284,6 +1284,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 	struct request *abort_req;
 	struct nvme_command cmd = { };
 	u32 csts = readl(dev->bar + NVME_REG_CSTS);
+	u8 opcode;
 
 	/* If PCI error recovery process is happening, we cannot reset or
 	 * the recovery mechanism will surely fail.
@@ -1310,8 +1311,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 
 	if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) {
 		dev_warn(dev->ctrl.device,
-			 "I/O %d QID %d timeout, completion polled\n",
-			 req->tag, nvmeq->qid);
+			 "I/O tag %d (%04x) QID %d timeout, completion polled\n",
+			 req->tag, nvme_cid(req), nvmeq->qid);
 		return BLK_EH_DONE;
 	}
 
@@ -1327,8 +1328,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 		fallthrough;
 	case NVME_CTRL_DELETING:
 		dev_warn_ratelimited(dev->ctrl.device,
-			 "I/O %d QID %d timeout, disable controller\n",
-			 req->tag, nvmeq->qid);
+			 "I/O tag %d (%04x) QID %d timeout, disable controller\n",
+			 req->tag, nvme_cid(req), nvmeq->qid);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 		nvme_dev_disable(dev, true);
 		return BLK_EH_DONE;
@@ -1343,10 +1344,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 	 * command was already aborted once before and still hasn't been
 	 * returned to the driver, or if this is the admin queue.
 	 */
+	opcode = nvme_req(req)->cmd->common.opcode;
 	if (!nvmeq->qid || iod->aborted) {
 		dev_warn(dev->ctrl.device,
-			 "I/O %d QID %d timeout, reset controller\n",
-			 req->tag, nvmeq->qid);
+			 "I/O tag %d (%04x) opcode %s %x QID %d timeout, reset controller\n",
+			 req->tag, nvme_cid(req),
+			 nvme_opcode_str(nvmeq->qid, opcode, 0), opcode,
+			 nvmeq->qid);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 		goto disable;
 	}
@@ -1362,10 +1366,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
 
 	dev_warn(nvmeq->dev->ctrl.device,
-		"I/O %d (%s) QID %d timeout, aborting\n",
-		 req->tag,
-		 nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode),
-		 nvmeq->qid);
+		 "I/O tag %d (%04x) opcode %s %x QID %d timeout, aborting req_op:%s(%u) size:%u\n",
+		 req->tag, nvme_cid(req), nvme_get_opcode_str(opcode), opcode,
+		 nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
+		 blk_rq_bytes(req));
 
 	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
 					 BLK_MQ_REQ_NOWAIT);
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCHv3] nvme-pci: enhance timeout kernel log
  2024-01-05 18:20 [PATCHv3] nvme-pci: enhance timeout kernel log Keith Busch
@ 2024-01-07  0:26 ` Max Gurtovoy
  2024-01-08  8:26   ` Christoph Hellwig
  2024-01-08 16:17   ` Keith Busch
  0 siblings, 2 replies; 4+ messages in thread
From: Max Gurtovoy @ 2024-01-07  0:26 UTC (permalink / raw)
  To: Keith Busch, linux-nvme; +Cc: hch, sagi, axboe, chaitanyak, Keith Busch



On 05/01/2024 20:20, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Kernel configs don't necessarily have opcode decoding, and some opcodes
> are not even decodable. It is still interesting for debugging SSD issues
> to know what opcode is timing out, what request type it came from, and
> the data size (if applicable).
> 
> Also print the command_id along side blk-mq's tag to help match commands
> with protocol wire traces and firmware logs,
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> v2->v3: formatting updates and print out the command_id too
> 
>   drivers/nvme/host/pci.c | 24 ++++++++++++++----------
>   1 file changed, 14 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index f272026807419..698c6e497a22e 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1284,6 +1284,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
>   	struct request *abort_req;
>   	struct nvme_command cmd = { };
>   	u32 csts = readl(dev->bar + NVME_REG_CSTS);
> +	u8 opcode;
>   
>   	/* If PCI error recovery process is happening, we cannot reset or
>   	 * the recovery mechanism will surely fail.
> @@ -1310,8 +1311,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
>   
>   	if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) {
>   		dev_warn(dev->ctrl.device,
> -			 "I/O %d QID %d timeout, completion polled\n",
> -			 req->tag, nvmeq->qid);
> +			 "I/O tag %d (%04x) QID %d timeout, completion polled\n",
> +			 req->tag, nvme_cid(req), nvmeq->qid);
>   		return BLK_EH_DONE;
>   	}
>   
> @@ -1327,8 +1328,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
>   		fallthrough;
>   	case NVME_CTRL_DELETING:
>   		dev_warn_ratelimited(dev->ctrl.device,
> -			 "I/O %d QID %d timeout, disable controller\n",
> -			 req->tag, nvmeq->qid);
> +			 "I/O tag %d (%04x) QID %d timeout, disable controller\n",
> +			 req->tag, nvme_cid(req), nvmeq->qid);
>   		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
>   		nvme_dev_disable(dev, true);
>   		return BLK_EH_DONE;
> @@ -1343,10 +1344,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
>   	 * command was already aborted once before and still hasn't been
>   	 * returned to the driver, or if this is the admin queue.
>   	 */
> +	opcode = nvme_req(req)->cmd->common.opcode;
>   	if (!nvmeq->qid || iod->aborted) {
>   		dev_warn(dev->ctrl.device,
> -			 "I/O %d QID %d timeout, reset controller\n",
> -			 req->tag, nvmeq->qid);
> +			 "I/O tag %d (%04x) opcode %s %x QID %d timeout, reset controller\n",

maybe:

"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",

I'm sending patches to align rdma and tcp to print tag and cid as well.

otherwise looks good,

Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>

> +			 req->tag, nvme_cid(req),
> +			 nvme_opcode_str(nvmeq->qid, opcode, 0), opcode,
> +			 nvmeq->qid);
>   		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
>   		goto disable;
>   	}
> @@ -1362,10 +1366,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
>   	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
>   
>   	dev_warn(nvmeq->dev->ctrl.device,
> -		"I/O %d (%s) QID %d timeout, aborting\n",
> -		 req->tag,
> -		 nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode),
> -		 nvmeq->qid);
> +		 "I/O tag %d (%04x) opcode %s %x QID %d timeout, aborting req_op:%s(%u) size:%u\n",
> +		 req->tag, nvme_cid(req), nvme_get_opcode_str(opcode), opcode,
> +		 nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
> +		 blk_rq_bytes(req));
>   
>   	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
>   					 BLK_MQ_REQ_NOWAIT);


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCHv3] nvme-pci: enhance timeout kernel log
  2024-01-07  0:26 ` Max Gurtovoy
@ 2024-01-08  8:26   ` Christoph Hellwig
  2024-01-08 16:17   ` Keith Busch
  1 sibling, 0 replies; 4+ messages in thread
From: Christoph Hellwig @ 2024-01-08  8:26 UTC (permalink / raw)
  To: Max Gurtovoy
  Cc: Keith Busch, linux-nvme, hch, sagi, axboe, chaitanyak, Keith Busch

On Sun, Jan 07, 2024 at 02:26:54AM +0200, Max Gurtovoy wrote:
> "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
>
> I'm sending patches to align rdma and tcp to print tag and cid as well.

Yes, that sounds good to me.



^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCHv3] nvme-pci: enhance timeout kernel log
  2024-01-07  0:26 ` Max Gurtovoy
  2024-01-08  8:26   ` Christoph Hellwig
@ 2024-01-08 16:17   ` Keith Busch
  1 sibling, 0 replies; 4+ messages in thread
From: Keith Busch @ 2024-01-08 16:17 UTC (permalink / raw)
  To: Max Gurtovoy; +Cc: Keith Busch, linux-nvme, hch, sagi, axboe, chaitanyak

On Sun, Jan 07, 2024 at 02:26:54AM +0200, Max Gurtovoy wrote:
> On 05/01/2024 20:20, Keith Busch wrote:
> >   	 * returned to the driver, or if this is the admin queue.
> >   	 */
> > +	opcode = nvme_req(req)->cmd->common.opcode;
> >   	if (!nvmeq->qid || iod->aborted) {
> >   		dev_warn(dev->ctrl.device,
> > -			 "I/O %d QID %d timeout, reset controller\n",
> > -			 req->tag, nvmeq->qid);
> > +			 "I/O tag %d (%04x) opcode %s %x QID %d timeout, reset controller\n",
> 
> maybe:
> 
> "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
> 
> I'm sending patches to align rdma and tcp to print tag and cid as well.
> 
> otherwise looks good,
> 
> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>

Thanks for the suggestion. Applied to nvme-6.8.


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-01-08 16:17 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-05 18:20 [PATCHv3] nvme-pci: enhance timeout kernel log Keith Busch
2024-01-07  0:26 ` Max Gurtovoy
2024-01-08  8:26   ` Christoph Hellwig
2024-01-08 16:17   ` Keith Busch

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.