linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER
@ 2022-10-07 21:17 Smita Koralahalli
  2022-10-07 21:17 ` [PATCH 1/2] efi/cper, cxl: Decode CXL Protocol Error Section Smita Koralahalli
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Smita Koralahalli @ 2022-10-07 21:17 UTC (permalink / raw)
  To: linux-cxl, linux-kernel
  Cc: Alison Schofield, Vishal Verma, Ira Weiny, Ben Widawsky,
	Dan Williams, Robert Richter, Yazen Ghannam, Terry Bowman,
	Smita Koralahalli

This series adds decoding for the CXL Protocol Errors Common Platform
Error Record.

Smita Koralahalli (2):
  efi/cper, cxl: Decode CXL Protocol Error Section
  efi/cper, cxl: Decode CXL Error Log

 drivers/firmware/efi/Makefile   |   2 +-
 drivers/firmware/efi/cper.c     |   9 +++
 drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
 drivers/firmware/efi/cper_cxl.h |  58 +++++++++++++++++
 include/linux/cxl_err.h         |  21 +++++++
 5 files changed, 197 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/efi/cper_cxl.c
 create mode 100644 drivers/firmware/efi/cper_cxl.h
 create mode 100644 include/linux/cxl_err.h

-- 
2.17.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/2] efi/cper, cxl: Decode CXL Protocol Error Section
  2022-10-07 21:17 [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER Smita Koralahalli
@ 2022-10-07 21:17 ` Smita Koralahalli
  2022-10-10 14:24   ` Jonathan Cameron
  2022-10-07 21:17 ` [PATCH 2/2] efi/cper, cxl: Decode CXL Error Log Smita Koralahalli
  2022-10-21 22:18 ` [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER Dan Williams
  2 siblings, 1 reply; 16+ messages in thread
From: Smita Koralahalli @ 2022-10-07 21:17 UTC (permalink / raw)
  To: linux-cxl, linux-kernel
  Cc: Alison Schofield, Vishal Verma, Ira Weiny, Ben Widawsky,
	Dan Williams, Robert Richter, Yazen Ghannam, Terry Bowman,
	Smita Koralahalli

Add support for decoding CXL Protocol Error Section as defined in UEFI 2.9
Section N.2.13.

Do the section decoding in a new cper_cxl.c file. This new file will be
used in the future for more CXL CPERs decode support. Add this to the
existing UEFI_CPER config.

Decode only the fields that are useful to parse the error.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
 drivers/firmware/efi/Makefile   |  2 +-
 drivers/firmware/efi/cper.c     |  9 ++++
 drivers/firmware/efi/cper_cxl.c | 87 +++++++++++++++++++++++++++++++++
 drivers/firmware/efi/cper_cxl.h | 58 ++++++++++++++++++++++
 4 files changed, 155 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/efi/cper_cxl.c
 create mode 100644 drivers/firmware/efi/cper_cxl.h

diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index 8d151e332584..4f332de54173 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -19,7 +19,7 @@ endif
 obj-$(CONFIG_EFI_PARAMS_FROM_FDT)	+= fdtparams.o
 obj-$(CONFIG_EFI_ESRT)			+= esrt.o
 obj-$(CONFIG_EFI_VARS_PSTORE)		+= efi-pstore.o
-obj-$(CONFIG_UEFI_CPER)			+= cper.o
+obj-$(CONFIG_UEFI_CPER)			+= cper.o cper_cxl.o
 obj-$(CONFIG_EFI_RUNTIME_MAP)		+= runtime-map.o
 obj-$(CONFIG_EFI_RUNTIME_WRAPPERS)	+= runtime-wrappers.o
 subdir-$(CONFIG_EFI_STUB)		+= libstub
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index e4e5ea7ce910..181deb9af527 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -24,6 +24,7 @@
 #include <linux/bcd.h>
 #include <acpi/ghes.h>
 #include <ras/ras_event.h>
+#include "cper_cxl.h"
 
 /*
  * CPER record ID need to be unique even after reboot, because record
@@ -595,6 +596,14 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
 			cper_print_fw_err(newpfx, gdata, fw_err);
 		else
 			goto err_section_too_small;
+	} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+		struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+
+		printk("%ssection_type: CXL Protocol Error\n", newpfx);
+		if (gdata->error_data_length >= sizeof(*prot_err))
+			cper_print_prot_err(newpfx, prot_err);
+		else
+			goto err_section_too_small;
 	} else {
 		const void *err = acpi_hest_get_payload(gdata);
 
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
new file mode 100644
index 000000000000..e5f48f0de1a4
--- /dev/null
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * UEFI Common Platform Error Record (CPER) support for CXL Section.
+ *
+ * Copyright (C) 2022 Advanced Micro Devices, Inc.
+ *
+ * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
+ */
+
+#include <linux/cper.h>
+#include "cper_cxl.h"
+
+#define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
+#define PROT_ERR_VALID_AGENT_ADDRESS		BIT_ULL(1)
+#define PROT_ERR_VALID_DEVICE_ID		BIT_ULL(2)
+#define PROT_ERR_VALID_SERIAL_NUMBER		BIT_ULL(3)
+#define PROT_ERR_VALID_CAPABILITY		BIT_ULL(4)
+#define PROT_ERR_VALID_DVSEC			BIT_ULL(5)
+
+static const char * const prot_err_agent_type_strs[] = {
+	"Restricted CXL Device",
+	"Restricted CXL Host DP",
+};
+
+enum {
+	RCD,
+	RCH_DP,
+};
+
+void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
+{
+	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
+		pr_info("%s agent_type: %d, %s\n", pfx, prot_err->agent_type,
+			prot_err->agent_type < ARRAY_SIZE(prot_err_agent_type_strs)
+			? prot_err_agent_type_strs[prot_err->agent_type]
+			: "unknown");
+
+	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS) {
+		switch (prot_err->agent_type) {
+		case RCD:
+			pr_info("%s agent_address: %04x:%02x:%02x.%x\n",
+				pfx, prot_err->segment, prot_err->bus,
+				prot_err->device, prot_err->function);
+			break;
+		case RCH_DP:
+			pr_info("%s rcrb_base_address: 0x%016llx\n", pfx,
+				prot_err->agent_addr);
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID) {
+		const __u8 *class_code;
+
+		switch (prot_err->agent_type) {
+		case RCD:
+			pr_info("%s slot: %d\n", pfx,
+				prot_err->device_id.slot >> CPER_PCIE_SLOT_SHIFT);
+			pr_info("%s vendor_id: 0x%04x, device_id: 0x%04x\n",
+				pfx, prot_err->device_id.vendor_id,
+				prot_err->device_id.device_id);
+			pr_info("%s sub_vendor_id: 0x%04x, sub_device_id: 0x%04x\n",
+				pfx, prot_err->device_id.sub_vendor_id,
+				prot_err->device_id.sub_device_id);
+			class_code = prot_err->device_id.class_code;
+			pr_info("%s class_code: %02x%02x\n", pfx,
+				class_code[1], class_code[0]);
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) {
+		switch (prot_err->agent_type) {
+		case RCD:
+			pr_info("%s lower_dw: 0x%08x, upper_dw: 0x%08x\n", pfx,
+				prot_err->dev_serial_num.lower_dw,
+				prot_err->dev_serial_num.upper_dw);
+			break;
+		default:
+			break;
+		}
+	}
+}
diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
new file mode 100644
index 000000000000..f924d96e4bb2
--- /dev/null
+++ b/drivers/firmware/efi/cper_cxl.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * UEFI Common Platform Error Record (CPER) support for CXL Section.
+ *
+ * Copyright (C) 2022 Advanced Micro Devices, Inc.
+ *
+ * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
+ */
+
+#ifndef LINUX_CPER_CXL_H
+#define LINUX_CPER_CXL_H
+
+/* CXL Protocol Error Section */
+#define CPER_SEC_CXL_PROT_ERR						\
+	GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78,	\
+		  0x4B, 0x77, 0x10, 0x48)
+
+#pragma pack(1)
+
+/* Compute Express Link Protocol Error Section, UEFI v2.9 sec N.2.13 */
+struct cper_sec_prot_err {
+	u64			valid_bits;
+	u8			agent_type;
+	u8			reserved[7];
+	union {
+		u64		agent_addr;
+		struct {
+			u8	function;
+			u8	device;
+			u8	bus;
+			u16	segment;
+			u8	reserved_1[3];
+		};
+	};
+	struct {
+		u16		vendor_id;
+		u16		device_id;
+		u16		sub_vendor_id;
+		u16		sub_device_id;
+		u8		class_code[2];
+		u16		slot;
+		u8		reserved_1[4];
+	}			device_id;
+	struct {
+		u32		lower_dw;
+		u32		upper_dw;
+	}			dev_serial_num;
+	u8			capability[60];
+	u16			dvsec_len;
+	u16			err_len;
+	u8			reserved_2[4];
+};
+
+#pragma pack()
+
+void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err);
+
+#endif //__CPER_CXL_
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/2] efi/cper, cxl: Decode CXL Error Log
  2022-10-07 21:17 [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER Smita Koralahalli
  2022-10-07 21:17 ` [PATCH 1/2] efi/cper, cxl: Decode CXL Protocol Error Section Smita Koralahalli
@ 2022-10-07 21:17 ` Smita Koralahalli
  2022-10-10 14:34   ` Jonathan Cameron
  2022-10-21 22:18 ` [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER Dan Williams
  2 siblings, 1 reply; 16+ messages in thread
From: Smita Koralahalli @ 2022-10-07 21:17 UTC (permalink / raw)
  To: linux-cxl, linux-kernel
  Cc: Alison Schofield, Vishal Verma, Ira Weiny, Ben Widawsky,
	Dan Williams, Robert Richter, Yazen Ghannam, Terry Bowman,
	Smita Koralahalli

Print the CXL Error Log field as found in CXL Protocol Error Section.

The CXL RAS Capability structure will be reused by OS First Handling
and the duplication/appropriate placement will be addressed eventually.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
 drivers/firmware/efi/cper_cxl.c | 21 +++++++++++++++++++++
 include/linux/cxl_err.h         | 21 +++++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 include/linux/cxl_err.h

diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index e5f48f0de1a4..c3d1d0770aef 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/cper.h>
+#include <linux/cxl_err.h>
 #include "cper_cxl.h"
 
 #define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
@@ -16,6 +17,7 @@
 #define PROT_ERR_VALID_SERIAL_NUMBER		BIT_ULL(3)
 #define PROT_ERR_VALID_CAPABILITY		BIT_ULL(4)
 #define PROT_ERR_VALID_DVSEC			BIT_ULL(5)
+#define PROT_ERR_VALID_ERROR_LOG		BIT_ULL(6)
 
 static const char * const prot_err_agent_type_strs[] = {
 	"Restricted CXL Device",
@@ -84,4 +86,23 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
 			break;
 		}
 	}
+
+	if (prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG) {
+		size_t size = sizeof(*prot_err) + prot_err->dvsec_len;
+		struct ras_capability_regs *cxl_ras;
+
+		pr_info("%s Error log length: 0x%04x\n", pfx, prot_err->err_len);
+
+		pr_info("%s CXL Error Log:\n", pfx);
+		cxl_ras = (struct ras_capability_regs *)((long)prot_err + size);
+		pr_info("%s cxl_ras_uncor_status: 0x%08x, cxl_ras_uncor_mask: 0x%08x\n",
+			pfx, cxl_ras->uncor_status, cxl_ras->uncor_mask);
+		pr_info("%s cxl_ras_uncor_severity: 0x%08x\n", pfx,
+			cxl_ras->uncor_severity);
+		pr_info("%s cxl_ras_cor_status: 0x%08x, cxl_ras_cor_mask: 0x%08x\n",
+			pfx, cxl_ras->cor_status, cxl_ras->cor_mask);
+		pr_info("%s Header Log Registers:\n", pfx);
+		print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, cxl_ras->header_log,
+			       sizeof(cxl_ras->header_log), 0);
+	}
 }
diff --git a/include/linux/cxl_err.h b/include/linux/cxl_err.h
new file mode 100644
index 000000000000..c89dbb6c286f
--- /dev/null
+++ b/include/linux/cxl_err.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 Advanced Micro Devices, Inc.
+ *
+ * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
+ */
+
+#ifndef LINUX_CXL_ERR_H
+#define LINUX_CXL_ERR_H
+
+struct ras_capability_regs {
+	u32 uncor_status;
+	u32 uncor_mask;
+	u32 uncor_severity;
+	u32 cor_status;
+	u32 cor_mask;
+	u32 cap_control;
+	u32 header_log[16];
+};
+
+#endif //__CXL_ERR_
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] efi/cper, cxl: Decode CXL Protocol Error Section
  2022-10-07 21:17 ` [PATCH 1/2] efi/cper, cxl: Decode CXL Protocol Error Section Smita Koralahalli
@ 2022-10-10 14:24   ` Jonathan Cameron
  2022-10-11 23:13     ` Smita Koralahalli
  2022-10-28 20:10     ` Smita Koralahalli
  0 siblings, 2 replies; 16+ messages in thread
From: Jonathan Cameron @ 2022-10-10 14:24 UTC (permalink / raw)
  To: Smita Koralahalli
  Cc: linux-cxl, linux-kernel, Alison Schofield, Vishal Verma,
	Ira Weiny, Ben Widawsky, Dan Williams, Robert Richter,
	Yazen Ghannam, Terry Bowman

On Fri, 7 Oct 2022 21:17:13 +0000
Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:

> Add support for decoding CXL Protocol Error Section as defined in UEFI 2.9
> Section N.2.13.
> 
> Do the section decoding in a new cper_cxl.c file. This new file will be
> used in the future for more CXL CPERs decode support. Add this to the
> existing UEFI_CPER config.
> 
> Decode only the fields that are useful to parse the error.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>

Hi Smita,

A few comments inline, but this looks pretty good to me though lots to
build on top of this (trace events, any in kernel handling necessary etc).

Jonathan

> ---
>  drivers/firmware/efi/Makefile   |  2 +-
>  drivers/firmware/efi/cper.c     |  9 ++++
>  drivers/firmware/efi/cper_cxl.c | 87 +++++++++++++++++++++++++++++++++
>  drivers/firmware/efi/cper_cxl.h | 58 ++++++++++++++++++++++
>  4 files changed, 155 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/firmware/efi/cper_cxl.c
>  create mode 100644 drivers/firmware/efi/cper_cxl.h
> 
> diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
> index 8d151e332584..4f332de54173 100644
> --- a/drivers/firmware/efi/Makefile
> +++ b/drivers/firmware/efi/Makefile
> @@ -19,7 +19,7 @@ endif
>  obj-$(CONFIG_EFI_PARAMS_FROM_FDT)	+= fdtparams.o
>  obj-$(CONFIG_EFI_ESRT)			+= esrt.o
>  obj-$(CONFIG_EFI_VARS_PSTORE)		+= efi-pstore.o
> -obj-$(CONFIG_UEFI_CPER)			+= cper.o
> +obj-$(CONFIG_UEFI_CPER)			+= cper.o cper_cxl.o
>  obj-$(CONFIG_EFI_RUNTIME_MAP)		+= runtime-map.o
>  obj-$(CONFIG_EFI_RUNTIME_WRAPPERS)	+= runtime-wrappers.o
>  subdir-$(CONFIG_EFI_STUB)		+= libstub
> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
> index e4e5ea7ce910..181deb9af527 100644
> --- a/drivers/firmware/efi/cper.c
> +++ b/drivers/firmware/efi/cper.c
> @@ -24,6 +24,7 @@
>  #include <linux/bcd.h>
>  #include <acpi/ghes.h>
>  #include <ras/ras_event.h>
> +#include "cper_cxl.h"
>  
>  /*
>   * CPER record ID need to be unique even after reboot, because record
> @@ -595,6 +596,14 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
>  			cper_print_fw_err(newpfx, gdata, fw_err);
>  		else
>  			goto err_section_too_small;
> +	} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> +		struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> +
> +		printk("%ssection_type: CXL Protocol Error\n", newpfx);
> +		if (gdata->error_data_length >= sizeof(*prot_err))
> +			cper_print_prot_err(newpfx, prot_err);
> +		else
> +			goto err_section_too_small;
>  	} else {
>  		const void *err = acpi_hest_get_payload(gdata);
>  
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> new file mode 100644
> index 000000000000..e5f48f0de1a4
> --- /dev/null
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -0,0 +1,87 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * UEFI Common Platform Error Record (CPER) support for CXL Section.
> + *
> + * Copyright (C) 2022 Advanced Micro Devices, Inc.
> + *
> + * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> + */
> +
> +#include <linux/cper.h>
> +#include "cper_cxl.h"
> +
> +#define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
> +#define PROT_ERR_VALID_AGENT_ADDRESS		BIT_ULL(1)
> +#define PROT_ERR_VALID_DEVICE_ID		BIT_ULL(2)
> +#define PROT_ERR_VALID_SERIAL_NUMBER		BIT_ULL(3)
> +#define PROT_ERR_VALID_CAPABILITY		BIT_ULL(4)
> +#define PROT_ERR_VALID_DVSEC			BIT_ULL(5)

Bit 6 has been defined by 2.10 so would be good to add that here as well.
As below, could be a follow up patch.

> +
> +static const char * const prot_err_agent_type_strs[] = {
> +	"Restricted CXL Device",
> +	"Restricted CXL Host DP",

Values up to 7 now defined, so good to list the ones we know...
https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#compute-express-link-cxl-protocol-error-section
even if they aren't going to be generated by your firmware.

I've hacked qemu to poke out appropriate records in the past
(for CCIX a while back) and it would be easy enough to add
them for these cases if needed (obviously that's a dirty hack
but it works for testing these flows ;)

I guess I can send a follow up patch if you aren't happy jumping
directly to the 2.10 version of these structures.

> +};
> +
> +enum {
> +	RCD,
> +	RCH_DP,
> +};
> +
> +void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
> +{
> +	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> +		pr_info("%s agent_type: %d, %s\n", pfx, prot_err->agent_type,
> +			prot_err->agent_type < ARRAY_SIZE(prot_err_agent_type_strs)
> +			? prot_err_agent_type_strs[prot_err->agent_type]
> +			: "unknown");
> +
> +	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS) {
> +		switch (prot_err->agent_type) {
> +		case RCD:
> +			pr_info("%s agent_address: %04x:%02x:%02x.%x\n",
> +				pfx, prot_err->segment, prot_err->bus,
> +				prot_err->device, prot_err->function);
> +			break;
> +		case RCH_DP:
> +			pr_info("%s rcrb_base_address: 0x%016llx\n", pfx,
> +				prot_err->agent_addr);
> +			break;
> +		default:
> +			break;
> +		}
> +	}
> +
> +	if (prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID) {
> +		const __u8 *class_code;
> +
> +		switch (prot_err->agent_type) {
> +		case RCD:
> +			pr_info("%s slot: %d\n", pfx,
> +				prot_err->device_id.slot >> CPER_PCIE_SLOT_SHIFT);
> +			pr_info("%s vendor_id: 0x%04x, device_id: 0x%04x\n",
> +				pfx, prot_err->device_id.vendor_id,
> +				prot_err->device_id.device_id);
> +			pr_info("%s sub_vendor_id: 0x%04x, sub_device_id: 0x%04x\n",
> +				pfx, prot_err->device_id.sub_vendor_id,
> +				prot_err->device_id.sub_device_id);
> +			class_code = prot_err->device_id.class_code;
> +			pr_info("%s class_code: %02x%02x\n", pfx,
> +				class_code[1], class_code[0]);
> +			break;
> +		default:
> +			break;
> +		}
> +	}
> +
> +	if (prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) {
> +		switch (prot_err->agent_type) {
> +		case RCD:
> +			pr_info("%s lower_dw: 0x%08x, upper_dw: 0x%08x\n", pfx,
> +				prot_err->dev_serial_num.lower_dw,
> +				prot_err->dev_serial_num.upper_dw);
> +			break;
> +		default:
> +			break;
> +		}
> +	}

Nice to pretty print the cap structure and appropriate DVSEC and Error logs as well
if valid, but that could be a follow up patch.

> +}
> diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
> new file mode 100644
> index 000000000000..f924d96e4bb2
> --- /dev/null
> +++ b/drivers/firmware/efi/cper_cxl.h
> @@ -0,0 +1,58 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * UEFI Common Platform Error Record (CPER) support for CXL Section.
> + *
> + * Copyright (C) 2022 Advanced Micro Devices, Inc.
> + *
> + * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> + */
> +
> +#ifndef LINUX_CPER_CXL_H
> +#define LINUX_CPER_CXL_H
> +
> +/* CXL Protocol Error Section */
> +#define CPER_SEC_CXL_PROT_ERR						\
> +	GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78,	\
> +		  0x4B, 0x77, 0x10, 0x48)
> +
> +#pragma pack(1)
> +
> +/* Compute Express Link Protocol Error Section, UEFI v2.9 sec N.2.13 */

Given 2.10 is out an expands on a few corners of this. Perhaps update
an switch the reference over to that to cover that.
I haven't checked, but assume the additional Agent Types were added in 2.10.

> +struct cper_sec_prot_err {
> +	u64			valid_bits;
> +	u8			agent_type;
> +	u8			reserved[7];
> +	union {
> +		u64		agent_addr;

Perhaps useful to add a few comments to say when the different union
elements are relevant.  Perhaps also name the field as per the spec
which would give the overall union the agent_address.
I admit that is a little confusing with the union element having
the same name for when it's treated as an address.
Perhaps call the union element rcrb_base_addr?

> +		struct {
> +			u8	function;
> +			u8	device;
> +			u8	bus;
> +			u16	segment;
> +			u8	reserved_1[3];
> +		};
> +	};
> +	struct {
> +		u16		vendor_id;	
> +		u16		device_id;
> +		u16		sub_vendor_id;
> +		u16		sub_device_id;

This is always fun.  Far as I can tell not all PCI elements
have subsystem IDs - so who knows what goes in these..
Also, there is wonderfully no such thing as a PCI subsystem device ID.
It's just called subsystem ID in the PCI spec.

> +		u8		class_code[2];

Why treat class code as two u8s?  If doing so, shall
we name them?  base_class_code, sub_class_code?

> +		u16		slot;
> +		u8		reserved_1[4];
> +	}			device_id;

I would not align device-id structure name with the contents.
I'm not personally a big fan of aligning structure elements
because it often goes wrong, but if you want to do it, drop
the alignment of device_id by a few levels.

	} device_id;

> +	struct {
> +		u32		lower_dw;
> +		u32		upper_dw;
> +	}			dev_serial_num;
> +	u8			capability[60];
> +	u16			dvsec_len;
> +	u16			err_len;
> +	u8			reserved_2[4];

You could add a [] array on the end to make it clear there are more elements.
That's not a perfect solution though given there are two different variable length
fields on the end.  They aren't that variable (as defined by the structures
in the CXL spec) however the complexity comes from the fact that they may not
be valid / lengths will be 0 (I assume lengths will be 0
if not valid anyway, the spec doesn't seem to say either way...)

Either way, adding a variable array element gives a good way to address the first
one, and provides a good location for a comment on what else might be here.

> +};
> +
> +#pragma pack()

I would push the structure definition down into the c file and provide a forwards
definition only in the header.

struct cper_sec_prot_err;


> +
> +void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err);
> +
> +#endif //__CPER_CXL_


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] efi/cper, cxl: Decode CXL Error Log
  2022-10-07 21:17 ` [PATCH 2/2] efi/cper, cxl: Decode CXL Error Log Smita Koralahalli
@ 2022-10-10 14:34   ` Jonathan Cameron
  2022-10-11 23:17     ` Smita Koralahalli
  0 siblings, 1 reply; 16+ messages in thread
From: Jonathan Cameron @ 2022-10-10 14:34 UTC (permalink / raw)
  To: Smita Koralahalli
  Cc: linux-cxl, linux-kernel, Alison Schofield, Vishal Verma,
	Ira Weiny, Ben Widawsky, Dan Williams, Robert Richter,
	Yazen Ghannam, Terry Bowman

On Fri, 7 Oct 2022 21:17:14 +0000
Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:

> Print the CXL Error Log field as found in CXL Protocol Error Section.
> 
> The CXL RAS Capability structure will be reused by OS First Handling
> and the duplication/appropriate placement will be addressed eventually.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>

Ah. This clearly answers at least a few comments from my patch one review.
I should have read on!

> ---
>  drivers/firmware/efi/cper_cxl.c | 21 +++++++++++++++++++++
>  include/linux/cxl_err.h         | 21 +++++++++++++++++++++
>  2 files changed, 42 insertions(+)
>  create mode 100644 include/linux/cxl_err.h
> 
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index e5f48f0de1a4..c3d1d0770aef 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
>   */
>  
>  #include <linux/cper.h>
> +#include <linux/cxl_err.h>
>  #include "cper_cxl.h"
>  
>  #define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
> @@ -16,6 +17,7 @@
>  #define PROT_ERR_VALID_SERIAL_NUMBER		BIT_ULL(3)
>  #define PROT_ERR_VALID_CAPABILITY		BIT_ULL(4)
>  #define PROT_ERR_VALID_DVSEC			BIT_ULL(5)
> +#define PROT_ERR_VALID_ERROR_LOG		BIT_ULL(6)
>  
>  static const char * const prot_err_agent_type_strs[] = {
>  	"Restricted CXL Device",
> @@ -84,4 +86,23 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
>  			break;
>  		}
>  	}
> +
> +	if (prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG) {
> +		size_t size = sizeof(*prot_err) + prot_err->dvsec_len;
> +		struct ras_capability_regs *cxl_ras;
> +
> +		pr_info("%s Error log length: 0x%04x\n", pfx, prot_err->err_len);
> +
> +		pr_info("%s CXL Error Log:\n", pfx);
> +		cxl_ras = (struct ras_capability_regs *)((long)prot_err + size);
> +		pr_info("%s cxl_ras_uncor_status: 0x%08x, cxl_ras_uncor_mask: 0x%08x\n",
> +			pfx, cxl_ras->uncor_status, cxl_ras->uncor_mask);
Is it worth splitting these up, so that we get a human readable line with the
individual fields broken out?

> +		pr_info("%s cxl_ras_uncor_severity: 0x%08x\n", pfx,
> +			cxl_ras->uncor_severity);
> +		pr_info("%s cxl_ras_cor_status: 0x%08x, cxl_ras_cor_mask: 0x%08x\n",
> +			pfx, cxl_ras->cor_status, cxl_ras->cor_mask);

Not outputting the cap_control register?  Some of that might be useful.

> +		pr_info("%s Header Log Registers:\n", pfx);
> +		print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, cxl_ras->header_log,
> +			       sizeof(cxl_ras->header_log), 0);
> +	}
>  }
> diff --git a/include/linux/cxl_err.h b/include/linux/cxl_err.h
> new file mode 100644
> index 000000000000..c89dbb6c286f
> --- /dev/null
> +++ b/include/linux/cxl_err.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2022 Advanced Micro Devices, Inc.
> + *
> + * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> + */
> +
> +#ifndef LINUX_CXL_ERR_H
> +#define LINUX_CXL_ERR_H
> +
> +struct ras_capability_regs {

CXL r3.0 Spec reference plus prefix it with cxl_ 

Agreed with your comment at the top. Some discussion needed on where to
put this - or whether to delay figuring that out until a later stage.

> +	u32 uncor_status;
> +	u32 uncor_mask;
> +	u32 uncor_severity;
> +	u32 cor_status;
> +	u32 cor_mask;
> +	u32 cap_control;
> +	u32 header_log[16];
> +};
> +
> +#endif //__CXL_ERR_


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] efi/cper, cxl: Decode CXL Protocol Error Section
  2022-10-10 14:24   ` Jonathan Cameron
@ 2022-10-11 23:13     ` Smita Koralahalli
  2022-10-12 13:04       ` Jonathan Cameron
  2022-10-28 20:10     ` Smita Koralahalli
  1 sibling, 1 reply; 16+ messages in thread
From: Smita Koralahalli @ 2022-10-11 23:13 UTC (permalink / raw)
  To: Jonathan Cameron
  Cc: linux-cxl, linux-kernel, Alison Schofield, Vishal Verma,
	Ira Weiny, Ben Widawsky, Dan Williams, Robert Richter,
	Yazen Ghannam, Terry Bowman

On 10/10/2022 7:24 AM, Jonathan Cameron wrote:
> On Fri, 7 Oct 2022 21:17:13 +0000
> Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
>
>> Add support for decoding CXL Protocol Error Section as defined in UEFI 2.9
>> Section N.2.13.
>>
>> Do the section decoding in a new cper_cxl.c file. This new file will be
>> used in the future for more CXL CPERs decode support. Add this to the
>> existing UEFI_CPER config.
>>
>> Decode only the fields that are useful to parse the error.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Hi Smita,
>
> A few comments inline, but this looks pretty good to me though lots to
> build on top of this (trace events, any in kernel handling necessary etc).
>
> Jonathan

Hi Jonathan,

Thanks for the review comments. Please find my responses/answers inline.

>
>> +#include <linux/cper.h>
>> +#include "cper_cxl.h"
>> +
>> +#define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
>> +#define PROT_ERR_VALID_AGENT_ADDRESS		BIT_ULL(1)
>> +#define PROT_ERR_VALID_DEVICE_ID		BIT_ULL(2)
>> +#define PROT_ERR_VALID_SERIAL_NUMBER		BIT_ULL(3)
>> +#define PROT_ERR_VALID_CAPABILITY		BIT_ULL(4)
>> +#define PROT_ERR_VALID_DVSEC			BIT_ULL(5)
> Bit 6 has been defined by 2.10 so would be good to add that here as well.
> As below, could be a follow up patch.

Yes, as you may have noticed, I have added this in the next patch of the 
series.

>
>> +
>> +static const char * const prot_err_agent_type_strs[] = {
>> +	"Restricted CXL Device",
>> +	"Restricted CXL Host DP",
> Values up to 7 now defined, so good to list the ones we know...
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fuefi.org%2Fspecs%2FUEFI%2F2.10%2FApx_N_Common_Platform_Error_Record.html%23compute-express-link-cxl-protocol-error-section&amp;data=05%7C01%7CSmita.KoralahalliChannabasappa%40amd.com%7Cbbc0e9642a314f875d9b08daaacb283f%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638010086801792139%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=ZM0zOEVtdxsQtvAxHYrTEW%2FnIBttTK48v2x%2F5VdfQ0U%3D&amp;reserved=0
> even if they aren't going to be generated by your firmware.

Yes, I'm sorry that I missed it. I started my development with the old 
spec..
and did not pay heed to go back and check the latest spec again.

Will add the remaining agent types in the next revision.
Do you recommend adding all of them?
>
> I've hacked qemu to poke out appropriate records in the past
> (for CCIX a while back) and it would be easy enough to add
> them for these cases if needed (obviously that's a dirty hack
> but it works for testing these flows ;)

Yes that would be of a great help to verify all the fields I think.

With our hardware support, I have just verified for CXL 1.1
Downstream Port.

So, I haven't really gotten a chance to verify how Device ID and
Device Serial Number would look like..

>
> I guess I can send a follow up patch if you aren't happy jumping
> directly to the 2.10 version of these structures.

I think, I can add them in the next revision. But I don't think I can 
test them
though :(

>
>> +};
>> +
>> +enum {
>> +	RCD,
>> +	RCH_DP,
>> +};
>> +
>> +void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
>> +{
>> +	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
>> +		pr_info("%s agent_type: %d, %s\n", pfx, prot_err->agent_type,
>> +			prot_err->agent_type < ARRAY_SIZE(prot_err_agent_type_strs)
>> +			? prot_err_agent_type_strs[prot_err->agent_type]
>> +			: "unknown");
>> +
>> +	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS) {
>> +		switch (prot_err->agent_type) {
>> +		case RCD:
>> +			pr_info("%s agent_address: %04x:%02x:%02x.%x\n",
>> +				pfx, prot_err->segment, prot_err->bus,
>> +				prot_err->device, prot_err->function);
>> +			break;
>> +		case RCH_DP:
>> +			pr_info("%s rcrb_base_address: 0x%016llx\n", pfx,
>> +				prot_err->agent_addr);
>> +			break;
>> +		default:
>> +			break;
>> +		}
>> +	}
>> +
>> +	if (prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID) {
>> +		const __u8 *class_code;
>> +
>> +		switch (prot_err->agent_type) {
>> +		case RCD:
>> +			pr_info("%s slot: %d\n", pfx,
>> +				prot_err->device_id.slot >> CPER_PCIE_SLOT_SHIFT);
>> +			pr_info("%s vendor_id: 0x%04x, device_id: 0x%04x\n",
>> +				pfx, prot_err->device_id.vendor_id,
>> +				prot_err->device_id.device_id);
>> +			pr_info("%s sub_vendor_id: 0x%04x, sub_device_id: 0x%04x\n",
>> +				pfx, prot_err->device_id.sub_vendor_id,
>> +				prot_err->device_id.sub_device_id);
>> +			class_code = prot_err->device_id.class_code;
>> +			pr_info("%s class_code: %02x%02x\n", pfx,
>> +				class_code[1], class_code[0]);
>> +			break;
>> +		default:
>> +			break;
>> +		}
>> +	}
>> +
>> +	if (prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) {
>> +		switch (prot_err->agent_type) {
>> +		case RCD:
>> +			pr_info("%s lower_dw: 0x%08x, upper_dw: 0x%08x\n", pfx,
>> +				prot_err->dev_serial_num.lower_dw,
>> +				prot_err->dev_serial_num.upper_dw);
>> +			break;
>> +		default:
>> +			break;
>> +		}
>> +	}
> Nice to pretty print the cap structure and appropriate DVSEC and Error logs as well
> if valid, but that could be a follow up patch.

Hmm, I have added the Error log decoding support in the next patch.

But, I did not include cap structure and DVSEC though as I initially 
thought it might not be
that important to parse the error. Do you recommend adding them?

>
>> +}
>> diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
>> new file mode 100644
>> index 000000000000..f924d96e4bb2
>> --- /dev/null
>> +++ b/drivers/firmware/efi/cper_cxl.h
>> @@ -0,0 +1,58 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * UEFI Common Platform Error Record (CPER) support for CXL Section.
>> + *
>> + * Copyright (C) 2022 Advanced Micro Devices, Inc.
>> + *
>> + * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> + */
>> +
>> +#ifndef LINUX_CPER_CXL_H
>> +#define LINUX_CPER_CXL_H
>> +
>> +/* CXL Protocol Error Section */
>> +#define CPER_SEC_CXL_PROT_ERR						\
>> +	GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78,	\
>> +		  0x4B, 0x77, 0x10, 0x48)
>> +
>> +#pragma pack(1)
>> +
>> +/* Compute Express Link Protocol Error Section, UEFI v2.9 sec N.2.13 */
> Given 2.10 is out an expands on a few corners of this. Perhaps update
> an switch the reference over to that to cover that.
> I haven't checked, but assume the additional Agent Types were added in 2.10.

Right, will fix the comment to refer to 2.10.

>
>> +struct cper_sec_prot_err {
>> +	u64			valid_bits;
>> +	u8			agent_type;
>> +	u8			reserved[7];
>> +	union {
>> +		u64		agent_addr;
> Perhaps useful to add a few comments to say when the different union
> elements are relevant.  Perhaps also name the field as per the spec
> which would give the overall union the agent_address.
> I admit that is a little confusing with the union element having
> the same name for when it's treated as an address.
> Perhaps call the union element rcrb_base_addr?

Okay, probably something like this?

     union {
                     u64                rcrb_base_addr;
                     struct {
                                     u8    function;
                                     u8    device;
                                     u8    bus;
                                     u16  segment;
                                     u8    reserved_1[3];
                     };
     } agent_addr;

And change printing from prot_err->segment to prot_err->agent_addr.segment
and so on.. Please ignore my indentation/alignments here..

>
>> +		struct {
>> +			u8	function;
>> +			u8	device;
>> +			u8	bus;
>> +			u16	segment;
>> +			u8	reserved_1[3];
>> +		};
>> +	};
>> +	struct {
>> +		u16		vendor_id;	
>> +		u16		device_id;
>> +		u16		sub_vendor_id;
>> +		u16		sub_device_id;
> This is always fun.  Far as I can tell not all PCI elements
> have subsystem IDs - so who knows what goes in these..
> Also, there is wonderfully no such thing as a PCI subsystem device ID.
> It's just called subsystem ID in the PCI spec.

Thanks for correcting this. Will fix.

>
>> +		u8		class_code[2];
> Why treat class code as two u8s?  If doing so, shall
> we name them?  base_class_code, sub_class_code?

Just followed the PCI CPER decoding here.. Will name them if that makes
more sense..

>
>> +		u16		slot;
>> +		u8		reserved_1[4];
>> +	}			device_id;
> I would not align device-id structure name with the contents.
> I'm not personally a big fan of aligning structure elements
> because it often goes wrong, but if you want to do it, drop
> the alignment of device_id by a few levels.

Thats true actually. Will just drop the alignment for all elements in next
revision.

>
> 	} device_id;
>
>> +	struct {
>> +		u32		lower_dw;
>> +		u32		upper_dw;
>> +	}			dev_serial_num;
>> +	u8			capability[60];
>> +	u16			dvsec_len;
>> +	u16			err_len;
>> +	u8			reserved_2[4];
> You could add a [] array on the end to make it clear there are more elements.
> That's not a perfect solution though given there are two different variable length
> fields on the end.  They aren't that variable (as defined by the structures
> in the CXL spec) however the complexity comes from the fact that they may not
> be valid / lengths will be 0 (I assume lengths will be 0
> if not valid anyway, the spec doesn't seem to say either way...)

Hmm, I probably got the idea to do this way by referring to "efi/cper-x86.c"
(N.2.4.2 IA32/X64 Processor Error Section in UEFI spec) and probably at few
places under the APEI tables.

Also to note, defining the new variable array member the parsing needs 
to be changed
accordingly in the next patch.  Add dvsec length to this new variable array
member to get to the Error Log field. Am I right?

>
> Either way, adding a variable array element gives a good way to address the first
> one, and provides a good location for a comment on what else might be here.
>
>> +};
>> +
>> +#pragma pack()
> I would push the structure definition down into the c file and provide a forwards
> definition only in the header.
>
> struct cper_sec_prot_err;

Okay will do that!

Thanks,
Smita

>
>
>> +
>> +void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err);
>> +
>> +#endif //__CPER_CXL_



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] efi/cper, cxl: Decode CXL Error Log
  2022-10-10 14:34   ` Jonathan Cameron
@ 2022-10-11 23:17     ` Smita Koralahalli
  0 siblings, 0 replies; 16+ messages in thread
From: Smita Koralahalli @ 2022-10-11 23:17 UTC (permalink / raw)
  To: Jonathan Cameron
  Cc: linux-cxl, linux-kernel, Alison Schofield, Vishal Verma,
	Ira Weiny, Ben Widawsky, Dan Williams, Robert Richter,
	Yazen Ghannam, Terry Bowman

On 10/10/2022 7:34 AM, Jonathan Cameron wrote:
> On Fri, 7 Oct 2022 21:17:14 +0000
> Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
>
>
>> +
>> +	if (prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG) {
>> +		size_t size = sizeof(*prot_err) + prot_err->dvsec_len;
>> +		struct ras_capability_regs *cxl_ras;
>> +
>> +		pr_info("%s Error log length: 0x%04x\n", pfx, prot_err->err_len);
>> +
>> +		pr_info("%s CXL Error Log:\n", pfx);
>> +		cxl_ras = (struct ras_capability_regs *)((long)prot_err + size);
>> +		pr_info("%s cxl_ras_uncor_status: 0x%08x, cxl_ras_uncor_mask: 0x%08x\n",
>> +			pfx, cxl_ras->uncor_status, cxl_ras->uncor_mask);
> Is it worth splitting these up, so that we get a human readable line with the
> individual fields broken out?

Will do.

>
>> +		pr_info("%s cxl_ras_uncor_severity: 0x%08x\n", pfx,
>> +			cxl_ras->uncor_severity);
>> +		pr_info("%s cxl_ras_cor_status: 0x%08x, cxl_ras_cor_mask: 0x%08x\n",
>> +			pfx, cxl_ras->cor_status, cxl_ras->cor_mask);
> Not outputting the cap_control register?  Some of that might be useful.

Will add.

>
>> +		pr_info("%s Header Log Registers:\n", pfx);
>> +		print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, cxl_ras->header_log,
>> +			       sizeof(cxl_ras->header_log), 0);
>> +	}
>>   }
>> diff --git a/include/linux/cxl_err.h b/include/linux/cxl_err.h
>> new file mode 100644
>> index 000000000000..c89dbb6c286f
>> --- /dev/null
>> +++ b/include/linux/cxl_err.h
>> @@ -0,0 +1,21 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Copyright (C) 2022 Advanced Micro Devices, Inc.
>> + *
>> + * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> + */
>> +
>> +#ifndef LINUX_CXL_ERR_H
>> +#define LINUX_CXL_ERR_H
>> +
>> +struct ras_capability_regs {
> CXL r3.0 Spec reference plus prefix it with cxl_

Okay.

>
> Agreed with your comment at the top. Some discussion needed on where to
> put this - or whether to delay figuring that out until a later stage.

Yes, more discussion needed here though!

Thanks,
Smita

>
>> +	u32 uncor_status;
>> +	u32 uncor_mask;
>> +	u32 uncor_severity;
>> +	u32 cor_status;
>> +	u32 cor_mask;
>> +	u32 cap_control;
>> +	u32 header_log[16];
>> +};
>> +
>> +#endif //__CXL_ERR_



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] efi/cper, cxl: Decode CXL Protocol Error Section
  2022-10-11 23:13     ` Smita Koralahalli
@ 2022-10-12 13:04       ` Jonathan Cameron
  0 siblings, 0 replies; 16+ messages in thread
From: Jonathan Cameron @ 2022-10-12 13:04 UTC (permalink / raw)
  To: Smita Koralahalli
  Cc: linux-cxl, linux-kernel, Alison Schofield, Vishal Verma,
	Ira Weiny, Ben Widawsky, Dan Williams, Robert Richter,
	Yazen Ghannam, Terry Bowman

> >> +	if (prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) {
> >> +		switch (prot_err->agent_type) {
> >> +		case RCD:
> >> +			pr_info("%s lower_dw: 0x%08x, upper_dw: 0x%08x\n", pfx,
> >> +				prot_err->dev_serial_num.lower_dw,
> >> +				prot_err->dev_serial_num.upper_dw);
> >> +			break;
> >> +		default:
> >> +			break;
> >> +		}
> >> +	}  
> > Nice to pretty print the cap structure and appropriate DVSEC and Error logs as well
> > if valid, but that could be a follow up patch.  
> 
> Hmm, I have added the Error log decoding support in the next patch.
> 
> But, I did not include cap structure and DVSEC though as I initially 
> thought it might not be
> that important to parse the error. Do you recommend adding them?

Given it would be straight forward to do, probably better to supply slightly too
much info than miss one particular chunk out.
> >  
> >> +struct cper_sec_prot_err {
> >> +	u64			valid_bits;
> >> +	u8			agent_type;
> >> +	u8			reserved[7];
> >> +	union {
> >> +		u64		agent_addr;  
> > Perhaps useful to add a few comments to say when the different union
> > elements are relevant.  Perhaps also name the field as per the spec
> > which would give the overall union the agent_address.
> > I admit that is a little confusing with the union element having
> > the same name for when it's treated as an address.
> > Perhaps call the union element rcrb_base_addr?  
> 
> Okay, probably something like this?
> 
>      union {
>                      u64                rcrb_base_addr;
>                      struct {
>                                      u8    function;
>                                      u8    device;
>                                      u8    bus;
>                                      u16  segment;
>                                      u8    reserved_1[3];
>                      };
>      } agent_addr;
> 
> And change printing from prot_err->segment to prot_err->agent_addr.segment
> and so on.. Please ignore my indentation/alignments here..

Looks good to me.


> 
> >  
> >> +		struct {
> >> +			u8	function;
> >> +			u8	device;
> >> +			u8	bus;
> >> +			u16	segment;
> >> +			u8	reserved_1[3];
> >> +		};
> >> +	};
> >> +	struct {
> >> +		u16		vendor_id;	
> >> +		u16		device_id;
> >> +		u16		sub_vendor_id;
> >> +		u16		sub_device_id;  
> > This is always fun.  Far as I can tell not all PCI elements
> > have subsystem IDs - so who knows what goes in these..
> > Also, there is wonderfully no such thing as a PCI subsystem device ID.
> > It's just called subsystem ID in the PCI spec.  
> 
> Thanks for correcting this. Will fix.

UEFI spec is inconsistent with PCIe :( 


> 
> >  
> >> +		u8		class_code[2];  
> > Why treat class code as two u8s?  If doing so, shall
> > we name them?  base_class_code, sub_class_code?  
> 
> Just followed the PCI CPER decoding here.. Will name them if that makes
> more sense..
> 

ok. Fine as is.


> 
> >
> > 	} device_id;
> >  
> >> +	struct {
> >> +		u32		lower_dw;
> >> +		u32		upper_dw;
> >> +	}			dev_serial_num;
> >> +	u8			capability[60];
> >> +	u16			dvsec_len;
> >> +	u16			err_len;
> >> +	u8			reserved_2[4];  
> > You could add a [] array on the end to make it clear there are more elements.
> > That's not a perfect solution though given there are two different variable length
> > fields on the end.  They aren't that variable (as defined by the structures
> > in the CXL spec) however the complexity comes from the fact that they may not
> > be valid / lengths will be 0 (I assume lengths will be 0
> > if not valid anyway, the spec doesn't seem to say either way...)  
> 
> Hmm, I probably got the idea to do this way by referring to "efi/cper-x86.c"
> (N.2.4.2 IA32/X64 Processor Error Section in UEFI spec) and probably at few
> places under the APEI tables.

Ah. I didn't check for precedence.  Down to maintainer preference then.

> 
> Also to note, defining the new variable array member the parsing needs 
> to be changed
> accordingly in the next patch.  Add dvsec length to this new variable array
> member to get to the Error Log field. Am I right?

Yes.

Jonathan


^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER
  2022-10-07 21:17 [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER Smita Koralahalli
  2022-10-07 21:17 ` [PATCH 1/2] efi/cper, cxl: Decode CXL Protocol Error Section Smita Koralahalli
  2022-10-07 21:17 ` [PATCH 2/2] efi/cper, cxl: Decode CXL Error Log Smita Koralahalli
@ 2022-10-21 22:18 ` Dan Williams
  2022-10-25 23:55   ` Smita Koralahalli
  2 siblings, 1 reply; 16+ messages in thread
From: Dan Williams @ 2022-10-21 22:18 UTC (permalink / raw)
  To: Smita Koralahalli, linux-cxl, linux-kernel
  Cc: Alison Schofield, Vishal Verma, Ira Weiny, Ben Widawsky,
	Dan Williams, Robert Richter, Yazen Ghannam, Terry Bowman,
	Smita Koralahalli, Ard Biesheuvel

Hi Smita,

Smita Koralahalli wrote:
> This series adds decoding for the CXL Protocol Errors Common Platform
> Error Record.

Be sure to copy Ard Biesheuvel <ardb@kernel.org>, added, on
drivers/firmware/efi/ patches.

Along those lines, drivers/cxl/ developers have an idea of what is
contained in the new CXL protocol error records and why Linux might want
to decode them, others from outside drivers/cxl/ might not. It always
helps to have a small summary of the benefit to end users of the
motivation to apply a patch set.

> 
> Smita Koralahalli (2):
>   efi/cper, cxl: Decode CXL Protocol Error Section
>   efi/cper, cxl: Decode CXL Error Log
> 
>  drivers/firmware/efi/Makefile   |   2 +-
>  drivers/firmware/efi/cper.c     |   9 +++
>  drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
>  drivers/firmware/efi/cper_cxl.h |  58 +++++++++++++++++
>  include/linux/cxl_err.h         |  21 +++++++
>  5 files changed, 197 insertions(+), 1 deletion(-)

I notice no updates for the trace events in ghes_do_proc(), is that next
in your queue? That's ok to be a follow-on after v2.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER
  2022-10-21 22:18 ` [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER Dan Williams
@ 2022-10-25 23:55   ` Smita Koralahalli
  2022-10-26  0:11     ` Dan Williams
  0 siblings, 1 reply; 16+ messages in thread
From: Smita Koralahalli @ 2022-10-25 23:55 UTC (permalink / raw)
  To: Dan Williams, linux-cxl, linux-kernel
  Cc: Alison Schofield, Vishal Verma, Ira Weiny, Ben Widawsky,
	Robert Richter, Yazen Ghannam, Terry Bowman, Ard Biesheuvel

Hi Dan,

On 10/21/2022 3:18 PM, Dan Williams wrote:
> Hi Smita,
>
> Smita Koralahalli wrote:
>> This series adds decoding for the CXL Protocol Errors Common Platform
>> Error Record.
> Be sure to copy Ard Biesheuvel <ardb@kernel.org>, added, on
> drivers/firmware/efi/ patches.
>
> Along those lines, drivers/cxl/ developers have an idea of what is
> contained in the new CXL protocol error records and why Linux might want
> to decode them, others from outside drivers/cxl/ might not. It always
> helps to have a small summary of the benefit to end users of the
> motivation to apply a patch set.

Sure, will include in my v2.

>
>> Smita Koralahalli (2):
>>    efi/cper, cxl: Decode CXL Protocol Error Section
>>    efi/cper, cxl: Decode CXL Error Log
>>
>>   drivers/firmware/efi/Makefile   |   2 +-
>>   drivers/firmware/efi/cper.c     |   9 +++
>>   drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
>>   drivers/firmware/efi/cper_cxl.h |  58 +++++++++++++++++
>>   include/linux/cxl_err.h         |  21 +++++++
>>   5 files changed, 197 insertions(+), 1 deletion(-)
> I notice no updates for the trace events in ghes_do_proc(), is that next
> in your queue? That's ok to be a follow-on after v2.

Sorry, if I haven't understood this right. Are you implying about the 
"handling"
of cxl memory errors in ghes_do_proc() or is it just copying of CPER 
entries to
tracepoints?

Thanks,
Smita



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER
  2022-10-25 23:55   ` Smita Koralahalli
@ 2022-10-26  0:11     ` Dan Williams
  2022-10-26 19:31       ` Smita Koralahalli
  0 siblings, 1 reply; 16+ messages in thread
From: Dan Williams @ 2022-10-26  0:11 UTC (permalink / raw)
  To: Smita Koralahalli, Dan Williams, linux-cxl, linux-kernel
  Cc: Alison Schofield, Vishal Verma, Ira Weiny, Ben Widawsky,
	Robert Richter, Yazen Ghannam, Terry Bowman, Ard Biesheuvel

Smita Koralahalli wrote:
> Hi Dan,
> 
> On 10/21/2022 3:18 PM, Dan Williams wrote:
> > Hi Smita,
> >
> > Smita Koralahalli wrote:
> >> This series adds decoding for the CXL Protocol Errors Common Platform
> >> Error Record.
> > Be sure to copy Ard Biesheuvel <ardb@kernel.org>, added, on
> > drivers/firmware/efi/ patches.
> >
> > Along those lines, drivers/cxl/ developers have an idea of what is
> > contained in the new CXL protocol error records and why Linux might want
> > to decode them, others from outside drivers/cxl/ might not. It always
> > helps to have a small summary of the benefit to end users of the
> > motivation to apply a patch set.
> 
> Sure, will include in my v2.
> 
> >
> >> Smita Koralahalli (2):
> >>    efi/cper, cxl: Decode CXL Protocol Error Section
> >>    efi/cper, cxl: Decode CXL Error Log
> >>
> >>   drivers/firmware/efi/Makefile   |   2 +-
> >>   drivers/firmware/efi/cper.c     |   9 +++
> >>   drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
> >>   drivers/firmware/efi/cper_cxl.h |  58 +++++++++++++++++
> >>   include/linux/cxl_err.h         |  21 +++++++
> >>   5 files changed, 197 insertions(+), 1 deletion(-)
> > I notice no updates for the trace events in ghes_do_proc(), is that next
> > in your queue? That's ok to be a follow-on after v2.
> 
> Sorry, if I haven't understood this right. Are you implying about the 
> "handling"
> of cxl memory errors in ghes_do_proc() or is it just copying of CPER 
> entries to
> tracepoints?

Right now ghes_do_proc() will let the CXL CPER records fall through to
log_non_standard_event(). Are you planning to add trace event decode
there for CPER_SEC_CXL_PROT_ERR records?

I am not sure if the CXL CPER to trace record conversion belongs there,
or somewhere closer to trace_aer_event() invocations since the CXL
protocol errors are effectively an extenstion of PCI AER events.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER
  2022-10-26  0:11     ` Dan Williams
@ 2022-10-26 19:31       ` Smita Koralahalli
  2022-10-28  3:07         ` Jonathan Zhang (Infra)
  0 siblings, 1 reply; 16+ messages in thread
From: Smita Koralahalli @ 2022-10-26 19:31 UTC (permalink / raw)
  To: Dan Williams, linux-cxl, linux-kernel
  Cc: Alison Schofield, Vishal Verma, Ira Weiny, Ben Widawsky,
	Robert Richter, Yazen Ghannam, Terry Bowman, Ard Biesheuvel

On 10/25/2022 5:11 PM, Dan Williams wrote:
> Smita Koralahalli wrote:
>> Hi Dan,
>>
>> On 10/21/2022 3:18 PM, Dan Williams wrote:
>>> Hi Smita,
>>>
>>> Smita Koralahalli wrote:
>>>> This series adds decoding for the CXL Protocol Errors Common Platform
>>>> Error Record.
>>> Be sure to copy Ard Biesheuvel <ardb@kernel.org>, added, on
>>> drivers/firmware/efi/ patches.
>>>
>>> Along those lines, drivers/cxl/ developers have an idea of what is
>>> contained in the new CXL protocol error records and why Linux might want
>>> to decode them, others from outside drivers/cxl/ might not. It always
>>> helps to have a small summary of the benefit to end users of the
>>> motivation to apply a patch set.
>> Sure, will include in my v2.
>>
>>>> Smita Koralahalli (2):
>>>>     efi/cper, cxl: Decode CXL Protocol Error Section
>>>>     efi/cper, cxl: Decode CXL Error Log
>>>>
>>>>    drivers/firmware/efi/Makefile   |   2 +-
>>>>    drivers/firmware/efi/cper.c     |   9 +++
>>>>    drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
>>>>    drivers/firmware/efi/cper_cxl.h |  58 +++++++++++++++++
>>>>    include/linux/cxl_err.h         |  21 +++++++
>>>>    5 files changed, 197 insertions(+), 1 deletion(-)
>>> I notice no updates for the trace events in ghes_do_proc(), is that next
>>> in your queue? That's ok to be a follow-on after v2.
>> Sorry, if I haven't understood this right. Are you implying about the
>> "handling"
>> of cxl memory errors in ghes_do_proc() or is it just copying of CPER
>> entries to
>> tracepoints?
> Right now ghes_do_proc() will let the CXL CPER records fall through to
> log_non_standard_event(). Are you planning to add trace event decode
> there for CPER_SEC_CXL_PROT_ERR records?

Thanks! Yeah its a good idea to add. I did not think about this before.
I will send this as a separate patchset after v2.

I think with this cxl cper trace event support and Ira's patchset which 
traces
specific event record types via Get Event Record, we can start the userspace
handling probably in rasdaemon?

>
> I am not sure if the CXL CPER to trace record conversion belongs there,
> or somewhere closer to trace_aer_event() invocations since the CXL
> protocol errors are effectively an extenstion of PCI AER events.

Right, I will keep it simple in v1 and get the comments about the 
placement..

Thanks,
Smita



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER
  2022-10-26 19:31       ` Smita Koralahalli
@ 2022-10-28  3:07         ` Jonathan Zhang (Infra)
  2022-10-28 20:46           ` Dan Williams
  0 siblings, 1 reply; 16+ messages in thread
From: Jonathan Zhang (Infra) @ 2022-10-28  3:07 UTC (permalink / raw)
  To: Smita Koralahalli
  Cc: Dan Williams, linux-cxl, linux-kernel, Alison Schofield,
	Vishal Verma, Ira Weiny, Ben Widawsky, Robert Richter,
	Yazen Ghannam, Terry Bowman, Ard Biesheuvel



> On Oct 26, 2022, at 12:31 PM, Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
> 
> On 10/25/2022 5:11 PM, Dan Williams wrote:
>> Smita Koralahalli wrote:
>>> Hi Dan,
>>> 
>>> On 10/21/2022 3:18 PM, Dan Williams wrote:
>>>> Hi Smita,
>>>> 
>>>> Smita Koralahalli wrote:
>>>>> This series adds decoding for the CXL Protocol Errors Common Platform
>>>>> Error Record.
>>>> Be sure to copy Ard Biesheuvel <ardb@kernel.org>, added, on
>>>> drivers/firmware/efi/ patches.
>>>> 
>>>> Along those lines, drivers/cxl/ developers have an idea of what is
>>>> contained in the new CXL protocol error records and why Linux might want
>>>> to decode them, others from outside drivers/cxl/ might not. It always
>>>> helps to have a small summary of the benefit to end users of the
>>>> motivation to apply a patch set.
>>> Sure, will include in my v2.
>>> 
>>>>> Smita Koralahalli (2):
>>>>>    efi/cper, cxl: Decode CXL Protocol Error Section
>>>>>    efi/cper, cxl: Decode CXL Error Log
>>>>> 
>>>>>   drivers/firmware/efi/Makefile   |   2 +-
>>>>>   drivers/firmware/efi/cper.c     |   9 +++
>>>>>   drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
>>>>>   drivers/firmware/efi/cper_cxl.h |  58 +++++++++++++++++
>>>>>   include/linux/cxl_err.h         |  21 +++++++
>>>>>   5 files changed, 197 insertions(+), 1 deletion(-)
>>>> I notice no updates for the trace events in ghes_do_proc(), is that next
>>>> in your queue? That's ok to be a follow-on after v2.
>>> Sorry, if I haven't understood this right. Are you implying about the
>>> "handling"
>>> of cxl memory errors in ghes_do_proc() or is it just copying of CPER
>>> entries to
>>> tracepoints?
>> Right now ghes_do_proc() will let the CXL CPER records fall through to
>> log_non_standard_event(). Are you planning to add trace event decode
>> there for CPER_SEC_CXL_PROT_ERR records?
> 
> Thanks! Yeah its a good idea to add. I did not think about this before.
> I will send this as a separate patchset after v2.
> 
> I think with this cxl cper trace event support and Ira's patchset which traces
> specific event record types via Get Event Record, we can start the userspace
> handling probably in rasdaemon?
Yes, I think this makes sense. rasdaemon could aggregate data and provide user
with full picture:
* Memory errors from both processor attached memory and CXL memory.
* CXL protocol errors.
* CXL device errors.
Such errors may be handled either firmware first or OS first.

> 
>> 
>> I am not sure if the CXL CPER to trace record conversion belongs there,
>> or somewhere closer to trace_aer_event() invocations since the CXL
>> protocol errors are effectively an extenstion of PCI AER events.
> 
> Right, I will keep it simple in v1 and get the comments about the placement..
> 
> Thanks,
> Smita
> 
> 
> 


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] efi/cper, cxl: Decode CXL Protocol Error Section
  2022-10-10 14:24   ` Jonathan Cameron
  2022-10-11 23:13     ` Smita Koralahalli
@ 2022-10-28 20:10     ` Smita Koralahalli
  1 sibling, 0 replies; 16+ messages in thread
From: Smita Koralahalli @ 2022-10-28 20:10 UTC (permalink / raw)
  To: Jonathan Cameron
  Cc: linux-cxl, linux-kernel, Alison Schofield, Vishal Verma,
	Ira Weiny, Ben Widawsky, Dan Williams, Robert Richter,
	Yazen Ghannam, Terry Bowman

On 10/10/2022 7:24 AM, Jonathan Cameron wrote:
> On Fri, 7 Oct 2022 21:17:13 +0000
> Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
>
>> Add support for decoding CXL Protocol Error Section as defined in UEFI 2.9
>> Section N.2.13.
>>
>> Do the section decoding in a new cper_cxl.c file. This new file will be
>> used in the future for more CXL CPERs decode support. Add this to the
>> existing UEFI_CPER config.
>>
>> Decode only the fields that are useful to parse the error.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Hi Smita,
>
> A few comments inline, but this looks pretty good to me though lots to
> build on top of this (trace events, any in kernel handling necessary etc).
>
> Jonathan
>
>
>> +};
>> +
>> +#pragma pack()
> I would push the structure definition down into the c file and provide a forwards
> definition only in the header.
>
> struct cper_sec_prot_err;
>

I haven't done this in my v2. In efi/cper.c, I'm calculating the size of 
the struct. I think forward
structure cannot be laid out there as size isn't known. Please correct 
me if I'm wrong. I can
incorporate changes accordingly in my v3.

Thanks,
Smita

>> +
>> +void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err);
>> +
>> +#endif //__CPER_CXL_



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER
  2022-10-28  3:07         ` Jonathan Zhang (Infra)
@ 2022-10-28 20:46           ` Dan Williams
  2022-11-03 16:58             ` Jonathan Cameron
  0 siblings, 1 reply; 16+ messages in thread
From: Dan Williams @ 2022-10-28 20:46 UTC (permalink / raw)
  To: Jonathan Zhang (Infra), Smita Koralahalli
  Cc: Dan Williams, linux-cxl, linux-kernel, Alison Schofield,
	Vishal Verma, Ira Weiny, Ben Widawsky, Robert Richter,
	Yazen Ghannam, Terry Bowman, Ard Biesheuvel

Jonathan Zhang (Infra) wrote:
> 
> 
> > On Oct 26, 2022, at 12:31 PM, Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
> > 
> > On 10/25/2022 5:11 PM, Dan Williams wrote:
> >> Smita Koralahalli wrote:
> >>> Hi Dan,
> >>> 
> >>> On 10/21/2022 3:18 PM, Dan Williams wrote:
> >>>> Hi Smita,
> >>>> 
> >>>> Smita Koralahalli wrote:
> >>>>> This series adds decoding for the CXL Protocol Errors Common Platform
> >>>>> Error Record.
> >>>> Be sure to copy Ard Biesheuvel <ardb@kernel.org>, added, on
> >>>> drivers/firmware/efi/ patches.
> >>>> 
> >>>> Along those lines, drivers/cxl/ developers have an idea of what is
> >>>> contained in the new CXL protocol error records and why Linux might want
> >>>> to decode them, others from outside drivers/cxl/ might not. It always
> >>>> helps to have a small summary of the benefit to end users of the
> >>>> motivation to apply a patch set.
> >>> Sure, will include in my v2.
> >>> 
> >>>>> Smita Koralahalli (2):
> >>>>>    efi/cper, cxl: Decode CXL Protocol Error Section
> >>>>>    efi/cper, cxl: Decode CXL Error Log
> >>>>> 
> >>>>>   drivers/firmware/efi/Makefile   |   2 +-
> >>>>>   drivers/firmware/efi/cper.c     |   9 +++
> >>>>>   drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
> >>>>>   drivers/firmware/efi/cper_cxl.h |  58 +++++++++++++++++
> >>>>>   include/linux/cxl_err.h         |  21 +++++++
> >>>>>   5 files changed, 197 insertions(+), 1 deletion(-)
> >>>> I notice no updates for the trace events in ghes_do_proc(), is that next
> >>>> in your queue? That's ok to be a follow-on after v2.
> >>> Sorry, if I haven't understood this right. Are you implying about the
> >>> "handling"
> >>> of cxl memory errors in ghes_do_proc() or is it just copying of CPER
> >>> entries to
> >>> tracepoints?
> >> Right now ghes_do_proc() will let the CXL CPER records fall through to
> >> log_non_standard_event(). Are you planning to add trace event decode
> >> there for CPER_SEC_CXL_PROT_ERR records?
> > 
> > Thanks! Yeah its a good idea to add. I did not think about this before.
> > I will send this as a separate patchset after v2.
> > 
> > I think with this cxl cper trace event support and Ira's patchset which traces
> > specific event record types via Get Event Record, we can start the userspace
> > handling probably in rasdaemon?
> Yes, I think this makes sense. rasdaemon could aggregate data and provide user
> with full picture:
> * Memory errors from both processor attached memory and CXL memory.
> * CXL protocol errors.
> * CXL device errors.
> Such errors may be handled either firmware first or OS first.

I have no concerns about rasdaemon subscribing to CXL RAS events, but
the nice thing about trace-events is that any number of subscribers can
attach to the event stream. So I expect cxl-cli to have a monitor of
these CXL specific events and that does not preclude rasdaemon from
also incorporating CXL events into its event list.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER
  2022-10-28 20:46           ` Dan Williams
@ 2022-11-03 16:58             ` Jonathan Cameron
  0 siblings, 0 replies; 16+ messages in thread
From: Jonathan Cameron @ 2022-11-03 16:58 UTC (permalink / raw)
  To: Dan Williams
  Cc: Jonathan Zhang (Infra),
	Smita Koralahalli, linux-cxl, linux-kernel, Alison Schofield,
	Vishal Verma, Ira Weiny, Ben Widawsky, Robert Richter,
	Yazen Ghannam, Terry Bowman, Ard Biesheuvel

On Fri, 28 Oct 2022 13:46:22 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> Jonathan Zhang (Infra) wrote:
> > 
> >   
> > > On Oct 26, 2022, at 12:31 PM, Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
> > > 
> > > On 10/25/2022 5:11 PM, Dan Williams wrote:  
> > >> Smita Koralahalli wrote:  
> > >>> Hi Dan,
> > >>> 
> > >>> On 10/21/2022 3:18 PM, Dan Williams wrote:  
> > >>>> Hi Smita,
> > >>>> 
> > >>>> Smita Koralahalli wrote:  
> > >>>>> This series adds decoding for the CXL Protocol Errors Common Platform
> > >>>>> Error Record.  
> > >>>> Be sure to copy Ard Biesheuvel <ardb@kernel.org>, added, on
> > >>>> drivers/firmware/efi/ patches.
> > >>>> 
> > >>>> Along those lines, drivers/cxl/ developers have an idea of what is
> > >>>> contained in the new CXL protocol error records and why Linux might want
> > >>>> to decode them, others from outside drivers/cxl/ might not. It always
> > >>>> helps to have a small summary of the benefit to end users of the
> > >>>> motivation to apply a patch set.  
> > >>> Sure, will include in my v2.
> > >>>   
> > >>>>> Smita Koralahalli (2):
> > >>>>>    efi/cper, cxl: Decode CXL Protocol Error Section
> > >>>>>    efi/cper, cxl: Decode CXL Error Log
> > >>>>> 
> > >>>>>   drivers/firmware/efi/Makefile   |   2 +-
> > >>>>>   drivers/firmware/efi/cper.c     |   9 +++
> > >>>>>   drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
> > >>>>>   drivers/firmware/efi/cper_cxl.h |  58 +++++++++++++++++
> > >>>>>   include/linux/cxl_err.h         |  21 +++++++
> > >>>>>   5 files changed, 197 insertions(+), 1 deletion(-)  
> > >>>> I notice no updates for the trace events in ghes_do_proc(), is that next
> > >>>> in your queue? That's ok to be a follow-on after v2.  
> > >>> Sorry, if I haven't understood this right. Are you implying about the
> > >>> "handling"
> > >>> of cxl memory errors in ghes_do_proc() or is it just copying of CPER
> > >>> entries to
> > >>> tracepoints?  
> > >> Right now ghes_do_proc() will let the CXL CPER records fall through to
> > >> log_non_standard_event(). Are you planning to add trace event decode
> > >> there for CPER_SEC_CXL_PROT_ERR records?  
> > > 
> > > Thanks! Yeah its a good idea to add. I did not think about this before.
> > > I will send this as a separate patchset after v2.
> > > 
> > > I think with this cxl cper trace event support and Ira's patchset which traces
> > > specific event record types via Get Event Record, we can start the userspace
> > > handling probably in rasdaemon?  
> > Yes, I think this makes sense. rasdaemon could aggregate data and provide user
> > with full picture:
> > * Memory errors from both processor attached memory and CXL memory.
> > * CXL protocol errors.
> > * CXL device errors.
> > Such errors may be handled either firmware first or OS first.  
> 
> I have no concerns about rasdaemon subscribing to CXL RAS events, but
> the nice thing about trace-events is that any number of subscribers can
> attach to the event stream. So I expect cxl-cli to have a monitor of
> these CXL specific events and that does not preclude rasdaemon from
> also incorporating CXL events into its event list.

FYI, we posted some poison list RAS daemon patches a while back. 
https://lore.kernel.org/all/20220622122021.1986-1-shiju.jose@huawei.com/

Absolutely agree we'll want all the rest of these once kernel patches are in
(and hence we know the tracepoints definitions are stable)

If anyone is working on the RASdaemon side of things, shout on the list as
I'd rather not see duplication of effort.

Jonathan


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2022-11-03 16:58 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-07 21:17 [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER Smita Koralahalli
2022-10-07 21:17 ` [PATCH 1/2] efi/cper, cxl: Decode CXL Protocol Error Section Smita Koralahalli
2022-10-10 14:24   ` Jonathan Cameron
2022-10-11 23:13     ` Smita Koralahalli
2022-10-12 13:04       ` Jonathan Cameron
2022-10-28 20:10     ` Smita Koralahalli
2022-10-07 21:17 ` [PATCH 2/2] efi/cper, cxl: Decode CXL Error Log Smita Koralahalli
2022-10-10 14:34   ` Jonathan Cameron
2022-10-11 23:17     ` Smita Koralahalli
2022-10-21 22:18 ` [PATCH 0/2] efi/cper, cxl: Decode CXL Protocol Errors CPER Dan Williams
2022-10-25 23:55   ` Smita Koralahalli
2022-10-26  0:11     ` Dan Williams
2022-10-26 19:31       ` Smita Koralahalli
2022-10-28  3:07         ` Jonathan Zhang (Infra)
2022-10-28 20:46           ` Dan Williams
2022-11-03 16:58             ` Jonathan Cameron

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).