All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V3 0/1] nvme: Add verbose error logging
@ 2022-01-05 18:32 Alan Adamson
  2022-01-05 18:32 ` [PATCH V3 1/1] " Alan Adamson
  0 siblings, 1 reply; 3+ messages in thread
From: Alan Adamson @ 2022-01-05 18:32 UTC (permalink / raw)
  To: linux-nvme; +Cc: alan.adamson, kbusch, hch, sagi

V3:
- Don't populate nvme_errors[] array with NULL strings. 

V2:
- Change nvme_errors[] to a sparse array where the status is used as an index into the array.

- Change pr_err() to pr_err_ratelimited().

- By enabling CONFIG_NVME_VERBOSE_ERRORS, the verbose status error is displayed.  If it is not
enabled, then a message will still be displayed, but without the verbose status.

- Remove call to nvme_error_status() when determining whether to call nvme_error_log().  Speeds
up the fast path just a bit.


This patch improves logging for NVMe errors.  Currently, we only get a
a vague idea as to why commands fail since only the block layer status
is captured on error. This patch allows us to see why a command was failed
by the controller. This is very useful when debugging problems in the field.

An example of an improved logged error:

[ 4652.812867] nvme0n1: Read @ LBA 1536, 1 blocks, Unrecovered Read Error (sct 0x2 / sc 0x81) DNR 


Alan Adamson (1):
  nvme: Add verbose error logging

 drivers/nvme/host/Kconfig  |   8 ++
 drivers/nvme/host/Makefile |   2 +-
 drivers/nvme/host/core.c   |   3 +
 drivers/nvme/host/errors.c | 181 +++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h   |   2 +
 include/linux/nvme.h       |   1 +
 6 files changed, 196 insertions(+), 1 deletion(-)
 create mode 100644 drivers/nvme/host/errors.c

-- 
2.27.0



^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH V3 1/1] nvme: Add verbose error logging
  2022-01-05 18:32 [PATCH V3 0/1] nvme: Add verbose error logging Alan Adamson
@ 2022-01-05 18:32 ` Alan Adamson
  2022-01-11 20:25   ` Chaitanya Kulkarni
  0 siblings, 1 reply; 3+ messages in thread
From: Alan Adamson @ 2022-01-05 18:32 UTC (permalink / raw)
  To: linux-nvme; +Cc: alan.adamson, kbusch, hch, sagi

Improves logging of NVMe errors.  If NVME_VERBOSE_ERRORS is configured,
a verbose description of the error is logged, otherwise only status codes/bits
is logged.

Verbose logging example:
nvme0n1: Read @ LBA 2304, 1 blocks, Unrecovered Read Error (sct 0x2 / sc 0x81) DNR

Non-verbose logging example:
nvme0n1: Read @ LBA 2304, 1 blocks, NVME Error (sct 0x2 / sc 0x81) DNR

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Alan Adamson <alan.adamson@oracle.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
---
 drivers/nvme/host/Kconfig  |   8 ++
 drivers/nvme/host/Makefile |   2 +-
 drivers/nvme/host/core.c   |   3 +
 drivers/nvme/host/errors.c | 181 +++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h   |   2 +
 include/linux/nvme.h       |   1 +
 6 files changed, 196 insertions(+), 1 deletion(-)
 create mode 100644 drivers/nvme/host/errors.c

diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index dc0450ca23a3..d6d056963c06 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -24,6 +24,14 @@ config NVME_MULTIPATH
 	   /dev/nvmeXnY device will show up for each NVMe namespace,
 	   even if it is accessible through multiple controllers.
 
+config NVME_VERBOSE_ERRORS
+	bool "NVMe verbose error reporting"
+	depends on NVME_CORE
+	help
+	   This option enables verbose reporting for NVMe errors. The
+	   error translation table will grow the kernel image size by
+	   about 4 KB.
+
 config NVME_HWMON
 	bool "NVMe hardware monitoring"
 	depends on (NVME_CORE=y && HWMON=y) || (NVME_CORE=m && HWMON)
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index dfaacd472e5d..ea3d702feb51 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_NVME_RDMA)			+= nvme-rdma.o
 obj-$(CONFIG_NVME_FC)			+= nvme-fc.o
 obj-$(CONFIG_NVME_TCP)			+= nvme-tcp.o
 
-nvme-core-y				:= core.o ioctl.o
+nvme-core-y				:= core.o ioctl.o errors.o
 nvme-core-$(CONFIG_TRACING)		+= trace.o
 nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
 nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1af8a4513708..fda029a55436 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -354,6 +354,9 @@ void nvme_complete_rq(struct request *req)
 
 	switch (nvme_decide_disposition(req)) {
 	case COMPLETE:
+		if (unlikely(nvme_req(req)->status != NVME_SC_SUCCESS))
+			nvme_error_log(req);
+
 		nvme_end_req(req);
 		return;
 	case RETRY:
diff --git a/drivers/nvme/host/errors.c b/drivers/nvme/host/errors.c
new file mode 100644
index 000000000000..2668d2f584e2
--- /dev/null
+++ b/drivers/nvme/host/errors.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVM Express device driver verbose errors
+ * Copyright (c) 2021, Oracle and/or its affiliates
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/blkdev.h>
+#include "nvme.h"
+
+struct nvme_string_table {
+	unsigned int code;
+	const unsigned char *string;
+};
+
+static const struct nvme_string_table nvme_ops[] = {
+	{ REQ_OP_READ,			"Read" },
+	{ REQ_OP_WRITE,			"Write" },
+	{ REQ_OP_FLUSH,			"Flush" },
+	{ REQ_OP_DISCARD,		"Deallocate (DSM)" },
+	{ REQ_OP_WRITE_ZEROES,		"Write Zeroes" },
+	{ REQ_OP_ZONE_RESET_ALL,	"Zone Reset All" },
+	{ REQ_OP_ZONE_RESET,		"Zone Reset" },
+	{ REQ_OP_ZONE_OPEN,		"Zone Open" },
+	{ REQ_OP_ZONE_CLOSE,		"Zone Close" },
+	{ REQ_OP_ZONE_FINISH,		"Zone Finish" },
+	{ REQ_OP_ZONE_APPEND,		"Zone Append" },
+};
+#define NVME_OPS_SIZE ARRAY_SIZE(nvme_ops)
+
+#ifdef CONFIG_NVME_VERBOSE_ERRORS
+static const char * const nvme_errors[] = {
+	[NVME_SC_SUCCESS] = "Success",
+	[NVME_SC_INVALID_OPCODE] = "Invalid Command Opcode",
+	[NVME_SC_INVALID_FIELD] = "Invalid Field in Command",
+	[NVME_SC_CMDID_CONFLICT] = "Command ID Conflict",
+	[NVME_SC_DATA_XFER_ERROR] = "Data Transfer Error",
+	[NVME_SC_POWER_LOSS] = "Commands Aborted due to Power Loss Notification",
+	[NVME_SC_INTERNAL] = "Internal Error",
+	[NVME_SC_ABORT_REQ] = "Command Abort Requested",
+	[NVME_SC_ABORT_QUEUE] = "Command Aborted due to SQ Deletion",
+	[NVME_SC_FUSED_FAIL] = "Command Aborted due to Failed Fused Command",
+	[NVME_SC_FUSED_MISSING] = "Command Aborted due to Missing Fused Command",
+	[NVME_SC_INVALID_NS] = "Invalid Namespace or Format",
+	[NVME_SC_CMD_SEQ_ERROR] = "Command Sequence Error",
+	[NVME_SC_SGL_INVALID_LAST] = "Invalid SGL Segment Descriptor",
+	[NVME_SC_SGL_INVALID_COUNT] = "Invalid Number of SGL Descriptors",
+	[NVME_SC_SGL_INVALID_DATA] = "Data SGL Length Invalid",
+	[NVME_SC_SGL_INVALID_METADATA] = "Metadata SGL Length Invalid",
+	[NVME_SC_SGL_INVALID_TYPE] = "SGL Descriptor Type Invalid",
+	[NVME_SC_CMB_INVALID_USE] = "Invalid Use of Controller Memory Buffer",
+	[NVME_SC_PRP_INVALID_OFFSET] = "PRP Offset Invalid",
+	[NVME_SC_ATOMIC_WU_EXCEEDED] = "Atomic Write Unit Exceeded",
+	[NVME_SC_OP_DENIED] = "Operation Denied",
+	[NVME_SC_SGL_INVALID_OFFSET] = "SGL Offset Invalid",
+	[NVME_SC_RESERVED] = "Reserved",
+	[NVME_SC_HOST_ID_INCONSIST] = "Host Identifier Inconsistent Format",
+	[NVME_SC_KA_TIMEOUT_EXPIRED] = "Keep Alive Timeout Expired",
+	[NVME_SC_KA_TIMEOUT_INVALID] = "Keep Alive Timeout Invalid",
+	[NVME_SC_ABORTED_PREEMPT_ABORT] = "Command Aborted due to Preempt and Abort",
+	[NVME_SC_SANITIZE_FAILED] = "Sanitize Failed",
+	[NVME_SC_SANITIZE_IN_PROGRESS] = "Sanitize In Progress",
+	[NVME_SC_SGL_INVALID_GRANULARITY] = "SGL Data Block Granularity Invalid",
+	[NVME_SC_CMD_NOT_SUP_CMB_QUEUE] = "Command Not Supported for Queue in CMB",
+	[NVME_SC_NS_WRITE_PROTECTED] = "Namespace is Write Protected",
+	[NVME_SC_CMD_INTERRUPTED] = "Command Interrupted",
+	[NVME_SC_TRANSIENT_TR_ERR] = "Transient Transport Error",
+	[NVME_SC_INVALID_IO_CMD_SET] = "Invalid IO Command Set",
+	[NVME_SC_LBA_RANGE] = "LBA Out of Range",
+	[NVME_SC_CAP_EXCEEDED] = "Capacity Exceeded",
+	[NVME_SC_NS_NOT_READY] = "Namespace Not Ready",
+	[NVME_SC_RESERVATION_CONFLICT] = "Reservation Conflict",
+	[NVME_SC_FORMAT_IN_PROGRESS] = "Format In Progress",
+	[NVME_SC_CQ_INVALID] = "Completion Queue Invalid",
+	[NVME_SC_QID_INVALID] = "Invalid Queue Identifier",
+	[NVME_SC_QUEUE_SIZE] = "Invalid Queue Size",
+	[NVME_SC_ABORT_LIMIT] = "Abort Command Limit Exceeded",
+	[NVME_SC_ABORT_MISSING] = "Reserved", /* XXX */
+	[NVME_SC_ASYNC_LIMIT] = "Asynchronous Event Request Limit Exceeded",
+	[NVME_SC_FIRMWARE_SLOT] = "Invalid Firmware Slot",
+	[NVME_SC_FIRMWARE_IMAGE] = "Invalid Firmware Image",
+	[NVME_SC_INVALID_VECTOR] = "Invalid Interrupt Vector",
+	[NVME_SC_INVALID_LOG_PAGE] = "Invalid Log Page",
+	[NVME_SC_INVALID_FORMAT] = "Invalid Format",
+	[NVME_SC_FW_NEEDS_CONV_RESET] = "Firmware Activation Requires Conventional Reset",
+	[NVME_SC_INVALID_QUEUE] = "Invalid Queue Deletion",
+	[NVME_SC_FEATURE_NOT_SAVEABLE] = "Feature Identifier Not Saveable",
+	[NVME_SC_FEATURE_NOT_CHANGEABLE] = "Feature Not Changeable",
+	[NVME_SC_FEATURE_NOT_PER_NS] = "Feature Not Namespace Specific",
+	[NVME_SC_FW_NEEDS_SUBSYS_RESET] = "Firmware Activation Requires NVM Subsystem Reset",
+	[NVME_SC_FW_NEEDS_RESET] = "Firmware Activation Requires Reset",
+	[NVME_SC_FW_NEEDS_MAX_TIME] = "Firmware Activation Requires Maximum Time Violation",
+	[NVME_SC_FW_ACTIVATE_PROHIBITED] = "Firmware Activation Prohibited",
+	[NVME_SC_OVERLAPPING_RANGE] = "Overlapping Range",
+	[NVME_SC_NS_INSUFFICIENT_CAP] = "Namespace Insufficient Capacity",
+	[NVME_SC_NS_ID_UNAVAILABLE] = "Namespace Identifier Unavailable",
+	[NVME_SC_NS_ALREADY_ATTACHED] = "Namespace Already Attached",
+	[NVME_SC_NS_IS_PRIVATE] = "Namespace Is Private",
+	[NVME_SC_NS_NOT_ATTACHED] = "Namespace Not Attached",
+	[NVME_SC_THIN_PROV_NOT_SUPP] = "Thin Provisioning Not Supported",
+	[NVME_SC_CTRL_LIST_INVALID] = "Controller List Invalid",
+	[NVME_SC_SELT_TEST_IN_PROGRESS] = "Device Self-test In Progress",
+	[NVME_SC_BP_WRITE_PROHIBITED] = "Boot Partition Write Prohibited",
+	[NVME_SC_CTRL_ID_INVALID] = "Invalid Controller Identifier",
+	[NVME_SC_SEC_CTRL_STATE_INVALID] = "Invalid Secondary Controller State",
+	[NVME_SC_CTRL_RES_NUM_INVALID] = "Invalid Number of Controller Resources",
+	[NVME_SC_RES_ID_INVALID] = "Invalid Resource Identifier",
+	[NVME_SC_PMR_SAN_PROHIBITED] = "Sanitize Prohibited",
+	[NVME_SC_ANA_GROUP_ID_INVALID] = "ANA Group Identifier Invalid",
+	[NVME_SC_ANA_ATTACH_FAILED] = "ANA Attach Failed",
+	[NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes",
+	[NVME_SC_INVALID_PI] = "Invalid Protection Information",
+	[NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range",
+	[NVME_SC_ONCS_NOT_SUPPORTED] = "ONCS Not Supported",
+	[NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error",
+	[NVME_SC_ZONE_FULL] = "Zone Is Full",
+	[NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only",
+	[NVME_SC_ZONE_OFFLINE] = "Zone Is Offline",
+	[NVME_SC_ZONE_INVALID_WRITE] = "Zone Invalid Write",
+	[NVME_SC_ZONE_TOO_MANY_ACTIVE] = "Too Many Active Zones",
+	[NVME_SC_ZONE_TOO_MANY_OPEN] = "Too Many Open Zones",
+	[NVME_SC_ZONE_INVALID_TRANSITION] = "Invalid Zone State Transition",
+	[NVME_SC_WRITE_FAULT] = "Write Fault",
+	[NVME_SC_READ_ERROR] = "Unrecovered Read Error",
+	[NVME_SC_GUARD_CHECK] = "End-to-end Guard Check Error",
+	[NVME_SC_APPTAG_CHECK] = "End-to-end Application Tag Check Error",
+	[NVME_SC_REFTAG_CHECK] = "End-to-end Reference Tag Check Error",
+	[NVME_SC_COMPARE_FAILED] = "Compare Failure",
+	[NVME_SC_ACCESS_DENIED] = "Access Denied",
+	[NVME_SC_UNWRITTEN_BLOCK] = "Deallocated or Unwritten Logical Block",
+	[NVME_SC_ANA_PERSISTENT_LOSS] = "Asymmetric Access Persistent Loss",
+	[NVME_SC_ANA_INACCESSIBLE] = "Asymmetric Access Inaccessible",
+	[NVME_SC_ANA_TRANSITION] = "Asymmetric Access Transition",
+	[NVME_SC_HOST_PATH_ERROR] = "Host Pathing Error",
+};
+#define NVME_ERRORS_SIZE ARRAY_SIZE(nvme_errors)
+#endif /* CONFIG_NVME_VERBOSE_ERRORS */
+
+void nvme_error_log(struct request *req)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+	struct nvme_request *nr = nvme_req(req);
+	const struct nvme_string_table *entry;
+	const unsigned char *op_str  = "Unknown";
+	const unsigned char *err_str = NULL;
+	unsigned int i;
+
+	if (!ns)
+		return;
+
+	for (i = 0, entry = nvme_ops ; i < NVME_OPS_SIZE ; i++)
+		if (entry[i].code == (req->cmd_flags & REQ_OP_MASK))
+			op_str = entry[i].string;
+
+#ifdef CONFIG_NVME_VERBOSE_ERRORS
+	if ((nr->status & 0x7ff) <= NVME_ERRORS_SIZE)
+		err_str = nvme_errors[nr->status & 0x7ff];
+#endif
+	if (err_str == NULL)
+		err_str =  "NVME Error";
+
+	pr_err_ratelimited("%s: %s @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
+	       req->rq_disk ? req->rq_disk->disk_name : "?",
+	       op_str,
+	       (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
+	       (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
+	       err_str,
+	       nr->status >> 8 & 7, /* Status Code Type */
+	       nr->status & 0xff,    /* Status Code */
+	       nr->status & NVME_SC_MORE ? "MORE " : "",
+	       nr->status & NVME_SC_DNR  ? "DNR "  : "");
+}
+EXPORT_SYMBOL_GPL(nvme_error_log);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9b095ee01364..7edd67b92a6b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -922,4 +922,6 @@ static inline bool nvme_multi_css(struct nvme_ctrl *ctrl)
 	return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
 }
 
+extern void nvme_error_log(struct request *req);
+
 #endif /* _NVME_H */
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 855dd9b3e84b..1f946e5bf7c1 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1636,6 +1636,7 @@ enum {
 	NVME_SC_HOST_ABORTED_CMD	= 0x371,
 
 	NVME_SC_CRD			= 0x1800,
+	NVME_SC_MORE			= 0x2000,
 	NVME_SC_DNR			= 0x4000,
 };
 
-- 
2.27.0



^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH V3 1/1] nvme: Add verbose error logging
  2022-01-05 18:32 ` [PATCH V3 1/1] " Alan Adamson
@ 2022-01-11 20:25   ` Chaitanya Kulkarni
  0 siblings, 0 replies; 3+ messages in thread
From: Chaitanya Kulkarni @ 2022-01-11 20:25 UTC (permalink / raw)
  To: Alan Adamson, linux-nvme; +Cc: kbusch, hch, sagi

On 1/5/22 10:32 AM, Alan Adamson wrote:
> Improves logging of NVMe errors.  If NVME_VERBOSE_ERRORS is configured,
> a verbose description of the error is logged, otherwise only status codes/bits
> is logged.
> 
> Verbose logging example:
> nvme0n1: Read @ LBA 2304, 1 blocks, Unrecovered Read Error (sct 0x2 / sc 0x81) DNR
> 
> Non-verbose logging example:
> nvme0n1: Read @ LBA 2304, 1 blocks, NVME Error (sct 0x2 / sc 0x81) DNR
> 
> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
> Signed-off-by: Alan Adamson <alan.adamson@oracle.com>
> Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
> ---
>   drivers/nvme/host/Kconfig  |   8 ++
>   drivers/nvme/host/Makefile |   2 +-
>   drivers/nvme/host/core.c   |   3 +
>   drivers/nvme/host/errors.c | 181 +++++++++++++++++++++++++++++++++++++
>   drivers/nvme/host/nvme.h   |   2 +
>   include/linux/nvme.h       |   1 +
>   6 files changed, 196 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/nvme/host/errors.c
> 
> diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
> index dc0450ca23a3..d6d056963c06 100644
> --- a/drivers/nvme/host/Kconfig
> +++ b/drivers/nvme/host/Kconfig
> @@ -24,6 +24,14 @@ config NVME_MULTIPATH
>   	   /dev/nvmeXnY device will show up for each NVMe namespace,
>   	   even if it is accessible through multiple controllers.
>   
> +config NVME_VERBOSE_ERRORS
> +	bool "NVMe verbose error reporting"
> +	depends on NVME_CORE
> +	help
> +	   This option enables verbose reporting for NVMe errors. The
> +	   error translation table will grow the kernel image size by
> +	   about 4 KB.
> +
>   config NVME_HWMON
>   	bool "NVMe hardware monitoring"
>   	depends on (NVME_CORE=y && HWMON=y) || (NVME_CORE=m && HWMON)
> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
> index dfaacd472e5d..ea3d702feb51 100644
> --- a/drivers/nvme/host/Makefile
> +++ b/drivers/nvme/host/Makefile
> @@ -9,7 +9,7 @@ obj-$(CONFIG_NVME_RDMA)			+= nvme-rdma.o
>   obj-$(CONFIG_NVME_FC)			+= nvme-fc.o
>   obj-$(CONFIG_NVME_TCP)			+= nvme-tcp.o
>   
> -nvme-core-y				:= core.o ioctl.o
> +nvme-core-y				:= core.o ioctl.o errors.o
>   nvme-core-$(CONFIG_TRACING)		+= trace.o
>   nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
>   nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 1af8a4513708..fda029a55436 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -354,6 +354,9 @@ void nvme_complete_rq(struct request *req)
>   
>   	switch (nvme_decide_disposition(req)) {
>   	case COMPLETE:
> +		if (unlikely(nvme_req(req)->status != NVME_SC_SUCCESS))
> +			nvme_error_log(req);
> +
>   		nvme_end_req(req);
>   		return;
>   	case RETRY:
> diff --git a/drivers/nvme/host/errors.c b/drivers/nvme/host/errors.c
> new file mode 100644
> index 000000000000..2668d2f584e2
> --- /dev/null
> +++ b/drivers/nvme/host/errors.c
> @@ -0,0 +1,181 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * NVM Express device driver verbose errors
> + * Copyright (c) 2021, Oracle and/or its affiliates
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + */
> +
> +#include <linux/blkdev.h>
> +#include "nvme.h"
> +
> +struct nvme_string_table {
> +	unsigned int code;
> +	const unsigned char *string;
> +};
> +
> +static const struct nvme_string_table nvme_ops[] = {
> +	{ REQ_OP_READ,			"Read" },
> +	{ REQ_OP_WRITE,			"Write" },
> +	{ REQ_OP_FLUSH,			"Flush" },
> +	{ REQ_OP_DISCARD,		"Deallocate (DSM)" },
> +	{ REQ_OP_WRITE_ZEROES,		"Write Zeroes" },
> +	{ REQ_OP_ZONE_RESET_ALL,	"Zone Reset All" },
> +	{ REQ_OP_ZONE_RESET,		"Zone Reset" },
> +	{ REQ_OP_ZONE_OPEN,		"Zone Open" },
> +	{ REQ_OP_ZONE_CLOSE,		"Zone Close" },
> +	{ REQ_OP_ZONE_FINISH,		"Zone Finish" },
> +	{ REQ_OP_ZONE_APPEND,		"Zone Append" },
> +};
> +#define NVME_OPS_SIZE ARRAY_SIZE(nvme_ops)
> +

Do we really need above macro ? since it is only in one place
I think direct use of ARRAY_SIZE(nvme_ops) is pretty readable.

> +#ifdef CONFIG_NVME_VERBOSE_ERRORS
> +static const char * const nvme_errors[] = {
> +	[NVME_SC_SUCCESS] = "Success",
> +	[NVME_SC_INVALID_OPCODE] = "Invalid Command Opcode",
> +	[NVME_SC_INVALID_FIELD] = "Invalid Field in Command",
> +	[NVME_SC_CMDID_CONFLICT] = "Command ID Conflict",
> +	[NVME_SC_DATA_XFER_ERROR] = "Data Transfer Error",
> +	[NVME_SC_POWER_LOSS] = "Commands Aborted due to Power Loss Notification",
> +	[NVME_SC_INTERNAL] = "Internal Error",
> +	[NVME_SC_ABORT_REQ] = "Command Abort Requested",
> +	[NVME_SC_ABORT_QUEUE] = "Command Aborted due to SQ Deletion",
> +	[NVME_SC_FUSED_FAIL] = "Command Aborted due to Failed Fused Command",
> +	[NVME_SC_FUSED_MISSING] = "Command Aborted due to Missing Fused Command",
> +	[NVME_SC_INVALID_NS] = "Invalid Namespace or Format",
> +	[NVME_SC_CMD_SEQ_ERROR] = "Command Sequence Error",
> +	[NVME_SC_SGL_INVALID_LAST] = "Invalid SGL Segment Descriptor",
> +	[NVME_SC_SGL_INVALID_COUNT] = "Invalid Number of SGL Descriptors",
> +	[NVME_SC_SGL_INVALID_DATA] = "Data SGL Length Invalid",
> +	[NVME_SC_SGL_INVALID_METADATA] = "Metadata SGL Length Invalid",
> +	[NVME_SC_SGL_INVALID_TYPE] = "SGL Descriptor Type Invalid",
> +	[NVME_SC_CMB_INVALID_USE] = "Invalid Use of Controller Memory Buffer",
> +	[NVME_SC_PRP_INVALID_OFFSET] = "PRP Offset Invalid",
> +	[NVME_SC_ATOMIC_WU_EXCEEDED] = "Atomic Write Unit Exceeded",
> +	[NVME_SC_OP_DENIED] = "Operation Denied",
> +	[NVME_SC_SGL_INVALID_OFFSET] = "SGL Offset Invalid",
> +	[NVME_SC_RESERVED] = "Reserved",
> +	[NVME_SC_HOST_ID_INCONSIST] = "Host Identifier Inconsistent Format",
> +	[NVME_SC_KA_TIMEOUT_EXPIRED] = "Keep Alive Timeout Expired",
> +	[NVME_SC_KA_TIMEOUT_INVALID] = "Keep Alive Timeout Invalid",
> +	[NVME_SC_ABORTED_PREEMPT_ABORT] = "Command Aborted due to Preempt and Abort",
> +	[NVME_SC_SANITIZE_FAILED] = "Sanitize Failed",
> +	[NVME_SC_SANITIZE_IN_PROGRESS] = "Sanitize In Progress",
> +	[NVME_SC_SGL_INVALID_GRANULARITY] = "SGL Data Block Granularity Invalid",
> +	[NVME_SC_CMD_NOT_SUP_CMB_QUEUE] = "Command Not Supported for Queue in CMB",
> +	[NVME_SC_NS_WRITE_PROTECTED] = "Namespace is Write Protected",
> +	[NVME_SC_CMD_INTERRUPTED] = "Command Interrupted",
> +	[NVME_SC_TRANSIENT_TR_ERR] = "Transient Transport Error",
> +	[NVME_SC_INVALID_IO_CMD_SET] = "Invalid IO Command Set",
> +	[NVME_SC_LBA_RANGE] = "LBA Out of Range",
> +	[NVME_SC_CAP_EXCEEDED] = "Capacity Exceeded",
> +	[NVME_SC_NS_NOT_READY] = "Namespace Not Ready",
> +	[NVME_SC_RESERVATION_CONFLICT] = "Reservation Conflict",
> +	[NVME_SC_FORMAT_IN_PROGRESS] = "Format In Progress",
> +	[NVME_SC_CQ_INVALID] = "Completion Queue Invalid",
> +	[NVME_SC_QID_INVALID] = "Invalid Queue Identifier",
> +	[NVME_SC_QUEUE_SIZE] = "Invalid Queue Size",
> +	[NVME_SC_ABORT_LIMIT] = "Abort Command Limit Exceeded",
> +	[NVME_SC_ABORT_MISSING] = "Reserved", /* XXX */
> +	[NVME_SC_ASYNC_LIMIT] = "Asynchronous Event Request Limit Exceeded",
> +	[NVME_SC_FIRMWARE_SLOT] = "Invalid Firmware Slot",
> +	[NVME_SC_FIRMWARE_IMAGE] = "Invalid Firmware Image",
> +	[NVME_SC_INVALID_VECTOR] = "Invalid Interrupt Vector",
> +	[NVME_SC_INVALID_LOG_PAGE] = "Invalid Log Page",
> +	[NVME_SC_INVALID_FORMAT] = "Invalid Format",
> +	[NVME_SC_FW_NEEDS_CONV_RESET] = "Firmware Activation Requires Conventional Reset",
> +	[NVME_SC_INVALID_QUEUE] = "Invalid Queue Deletion",
> +	[NVME_SC_FEATURE_NOT_SAVEABLE] = "Feature Identifier Not Saveable",
> +	[NVME_SC_FEATURE_NOT_CHANGEABLE] = "Feature Not Changeable",
> +	[NVME_SC_FEATURE_NOT_PER_NS] = "Feature Not Namespace Specific",
> +	[NVME_SC_FW_NEEDS_SUBSYS_RESET] = "Firmware Activation Requires NVM Subsystem Reset",
> +	[NVME_SC_FW_NEEDS_RESET] = "Firmware Activation Requires Reset",
> +	[NVME_SC_FW_NEEDS_MAX_TIME] = "Firmware Activation Requires Maximum Time Violation",
> +	[NVME_SC_FW_ACTIVATE_PROHIBITED] = "Firmware Activation Prohibited",
> +	[NVME_SC_OVERLAPPING_RANGE] = "Overlapping Range",
> +	[NVME_SC_NS_INSUFFICIENT_CAP] = "Namespace Insufficient Capacity",
> +	[NVME_SC_NS_ID_UNAVAILABLE] = "Namespace Identifier Unavailable",
> +	[NVME_SC_NS_ALREADY_ATTACHED] = "Namespace Already Attached",
> +	[NVME_SC_NS_IS_PRIVATE] = "Namespace Is Private",
> +	[NVME_SC_NS_NOT_ATTACHED] = "Namespace Not Attached",
> +	[NVME_SC_THIN_PROV_NOT_SUPP] = "Thin Provisioning Not Supported",
> +	[NVME_SC_CTRL_LIST_INVALID] = "Controller List Invalid",
> +	[NVME_SC_SELT_TEST_IN_PROGRESS] = "Device Self-test In Progress",
> +	[NVME_SC_BP_WRITE_PROHIBITED] = "Boot Partition Write Prohibited",
> +	[NVME_SC_CTRL_ID_INVALID] = "Invalid Controller Identifier",
> +	[NVME_SC_SEC_CTRL_STATE_INVALID] = "Invalid Secondary Controller State",
> +	[NVME_SC_CTRL_RES_NUM_INVALID] = "Invalid Number of Controller Resources",
> +	[NVME_SC_RES_ID_INVALID] = "Invalid Resource Identifier",
> +	[NVME_SC_PMR_SAN_PROHIBITED] = "Sanitize Prohibited",
> +	[NVME_SC_ANA_GROUP_ID_INVALID] = "ANA Group Identifier Invalid",
> +	[NVME_SC_ANA_ATTACH_FAILED] = "ANA Attach Failed",
> +	[NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes",
> +	[NVME_SC_INVALID_PI] = "Invalid Protection Information",
> +	[NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range",
> +	[NVME_SC_ONCS_NOT_SUPPORTED] = "ONCS Not Supported",
> +	[NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error",
> +	[NVME_SC_ZONE_FULL] = "Zone Is Full",
> +	[NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only",
> +	[NVME_SC_ZONE_OFFLINE] = "Zone Is Offline",
> +	[NVME_SC_ZONE_INVALID_WRITE] = "Zone Invalid Write",
> +	[NVME_SC_ZONE_TOO_MANY_ACTIVE] = "Too Many Active Zones",
> +	[NVME_SC_ZONE_TOO_MANY_OPEN] = "Too Many Open Zones",
> +	[NVME_SC_ZONE_INVALID_TRANSITION] = "Invalid Zone State Transition",
> +	[NVME_SC_WRITE_FAULT] = "Write Fault",
> +	[NVME_SC_READ_ERROR] = "Unrecovered Read Error",
> +	[NVME_SC_GUARD_CHECK] = "End-to-end Guard Check Error",
> +	[NVME_SC_APPTAG_CHECK] = "End-to-end Application Tag Check Error",
> +	[NVME_SC_REFTAG_CHECK] = "End-to-end Reference Tag Check Error",
> +	[NVME_SC_COMPARE_FAILED] = "Compare Failure",
> +	[NVME_SC_ACCESS_DENIED] = "Access Denied",
> +	[NVME_SC_UNWRITTEN_BLOCK] = "Deallocated or Unwritten Logical Block",
> +	[NVME_SC_ANA_PERSISTENT_LOSS] = "Asymmetric Access Persistent Loss",
> +	[NVME_SC_ANA_INACCESSIBLE] = "Asymmetric Access Inaccessible",
> +	[NVME_SC_ANA_TRANSITION] = "Asymmetric Access Transition",
> +	[NVME_SC_HOST_PATH_ERROR] = "Host Pathing Error",
> +};
> +#define NVME_ERRORS_SIZE ARRAY_SIZE(nvme_errors)
> +#endif /* CONFIG_NVME_VERBOSE_ERRORS */
> +
> +void nvme_error_log(struct request *req)
> +{
> +	struct nvme_ns *ns = req->q->queuedata;
> +	struct nvme_request *nr = nvme_req(req);
> +	const struct nvme_string_table *entry;
> +	const unsigned char *op_str  = "Unknown";
> +	const unsigned char *err_str = NULL;
> +	unsigned int i;
> +
> +	if (!ns)
> +		return;
> +

We really need to track down the admin command errors as they are
very important and mostly be more useful in general.

if we are not handling admin command errors thenthe array
nvme_errors needs to be trimmed down to only I/O commands errors
as admin command codes will never be used.

> +	for (i = 0, entry = nvme_ops ; i < NVME_OPS_SIZE ; i++)
> +		if (entry[i].code == (req->cmd_flags & REQ_OP_MASK))
> +			op_str = entry[i].string;
> +
> +#ifdef CONFIG_NVME_VERBOSE_ERRORS
> +	if ((nr->status & 0x7ff) <= NVME_ERRORS_SIZE)
> +		err_str = nvme_errors[nr->status & 0x7ff];
> +#endif
> +	if (err_str == NULL)
> +		err_str =  "NVME Error";
> +
> +	pr_err_ratelimited("%s: %s @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
> +	       req->rq_disk ? req->rq_disk->disk_name : "?",
> +	       op_str,
> +	       (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
> +	       (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
> +	       err_str,
> +	       nr->status >> 8 & 7, /* Status Code Type */
> +	       nr->status & 0xff,    /* Status Code */
> +	       nr->status & NVME_SC_MORE ? "MORE " : "",
> +	       nr->status & NVME_SC_DNR  ? "DNR "  : "");
> +}
> +EXPORT_SYMBOL_GPL(nvme_error_log);
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 9b095ee01364..7edd67b92a6b 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -922,4 +922,6 @@ static inline bool nvme_multi_css(struct nvme_ctrl *ctrl)
>   	return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
>   }
>   
> +extern void nvme_error_log(struct request *req);
> +

maybe we can drop extern keyword ?

>   #endif /* _NVME_H */
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 855dd9b3e84b..1f946e5bf7c1 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -1636,6 +1636,7 @@ enum {
>   	NVME_SC_HOST_ABORTED_CMD	= 0x371,
>   
>   	NVME_SC_CRD			= 0x1800,
> +	NVME_SC_MORE			= 0x2000,
>   	NVME_SC_DNR			= 0x4000,
>   };
>   
> 


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2022-01-11 20:40 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-05 18:32 [PATCH V3 0/1] nvme: Add verbose error logging Alan Adamson
2022-01-05 18:32 ` [PATCH V3 1/1] " Alan Adamson
2022-01-11 20:25   ` Chaitanya Kulkarni

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.