linux-edac.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v5 0/2] Adds additional information to ARM RAS errors
@ 2024-03-21 22:55 Daniel Ferguson
  2024-03-21 22:55 ` [PATCH v5 1/2] RAS: ACPI: APEI: add conditional compilation to ARM specific error reporting routines Daniel Ferguson
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Daniel Ferguson @ 2024-03-21 22:55 UTC (permalink / raw)
  To: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck, Borislav Petkov
  Cc: linux-acpi, linux-kernel, linux-edac, Daniel Ferguson,
	Shengwei Luo, Jason Tian

The patch by Shengwei includes most of the justification for this series
in its commit message. The only thing I've done is add a few conditional
compilation directives based on feedback from a previous submission attempt.
This series adds:

	1) Conditional compilation directives around ARM-specific RAS error
	   handling routines, so that non-ARM platforms are not unnecessarily bloated.
	2) The ARM Processor Error Section (as defined by UEFI 2.9, N.2.4) to
	   tracepoints for userspace consumption. This particular patch is a RESEND.

Originally:
	I did a RESEND of Shengwei's V3. I didn't make
        changes, and I didn't rev.
	The original patch: https://lore.kernel.org/lkml/20220214030813.135766-1-lostway@zju.edu.cn/
Changes since v3:
        Added conditional compilation directives
	previous submission(RESEND): https://lore.kernel.org/lkml/20231214232330.306526-1-danielf@os.amperecomputing.com/
Changes since v4:
        Rebased on latest linux master.
	No functional changes.
	previous submission: https://lore.kernel.org/linux-kernel/20240226-b4-arm-ras-error-vendor-info-v4-rc3-v4-0-08e0f168fec1@os.amperecomputing.com/

Signed-off-by: Daniel Ferguson <danielf@os.amperecomputing.com>
---
Daniel Ferguson (1):
      RAS: ACPI: APEI: add conditional compilation to ARM specific error reporting routines.

Shengwei Luo (1):
      RAS: Report ARM processor information to userspace

 drivers/acpi/apei/ghes.c |  7 +++++--
 drivers/ras/ras.c        | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/ras.h      | 20 ++++++++++++++++----
 include/ras/ras_event.h  | 48 +++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 110 insertions(+), 13 deletions(-)
---
base-commit: 2ac2b1665d3fbec6ca709dd6ef3ea05f4a51ee4c
change-id: 20240321-b4-arm-ras-error-vendor-info-v5-rc3-b1dc428f519c

Best regards,
-- 
Daniel Ferguson <danielf@os.amperecomputing.com>


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v5 1/2] RAS: ACPI: APEI: add conditional compilation to ARM specific error reporting routines.
  2024-03-21 22:55 [PATCH v5 0/2] Adds additional information to ARM RAS errors Daniel Ferguson
@ 2024-03-21 22:55 ` Daniel Ferguson
  2024-03-21 22:55 ` [PATCH v5 2/2] RAS: Report ARM processor information to userspace Daniel Ferguson
  2024-04-11 20:43 ` [PATCH v5 0/2] Adds additional information to ARM RAS errors Daniel Ferguson
  2 siblings, 0 replies; 5+ messages in thread
From: Daniel Ferguson @ 2024-03-21 22:55 UTC (permalink / raw)
  To: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck, Borislav Petkov
  Cc: linux-acpi, linux-kernel, linux-edac, Daniel Ferguson

This prevents the unnecessary inclusion of ARM-specific RAS error handling
routines on non-ARM platforms.

Signed-off-by: Daniel Ferguson <danielf@os.amperecomputing.com>
---
 drivers/acpi/apei/ghes.c | 4 ++++
 drivers/ras/ras.c        | 2 ++
 include/linux/ras.h      | 5 ++++-
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 512067cac170..58014558b8e0 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -525,6 +525,7 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
 	return false;
 }
 
+#if defined(CONFIG_ARM) || defined (CONFIG_ARM64)
 static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
 				       int sev, bool sync)
 {
@@ -570,6 +571,7 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
 
 	return queued;
 }
+#endif
 
 /*
  * PCIe AER errors need to be sent to the AER driver for reporting and
@@ -704,9 +706,11 @@ static bool ghes_do_proc(struct ghes *ghes,
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
 			ghes_handle_aer(gdata);
+#if defined(CONFIG_ARM) || defined (CONFIG_ARM64)
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
 			queued = ghes_handle_arm_hw_error(gdata, sev, sync);
+#endif
 		} else {
 			void *err = acpi_hest_get_payload(gdata);
 
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index a6e4792a1b2e..249dce21a738 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -52,10 +52,12 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id,
 	trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len);
 }
 
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
 void log_arm_hw_error(struct cper_sec_proc_arm *err)
 {
 	trace_arm_event(err);
 }
+#endif
 
 static int __init ras_init(void)
 {
diff --git a/include/linux/ras.h b/include/linux/ras.h
index a64182bc72ad..811feb9d8160 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -24,17 +24,20 @@ int __init parse_cec_param(char *str);
 void log_non_standard_event(const guid_t *sec_type,
 			    const guid_t *fru_id, const char *fru_text,
 			    const u8 sev, const u8 *err, const u32 len);
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
 void log_arm_hw_error(struct cper_sec_proc_arm *err);
-
+#endif
 #else
 static inline void
 log_non_standard_event(const guid_t *sec_type,
 		       const guid_t *fru_id, const char *fru_text,
 		       const u8 sev, const u8 *err, const u32 len)
 { return; }
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
 static inline void
 log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
 #endif
+#endif
 
 struct atl_err {
 	u64 addr;

-- 
2.43.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH v5 2/2] RAS: Report ARM processor information to userspace
  2024-03-21 22:55 [PATCH v5 0/2] Adds additional information to ARM RAS errors Daniel Ferguson
  2024-03-21 22:55 ` [PATCH v5 1/2] RAS: ACPI: APEI: add conditional compilation to ARM specific error reporting routines Daniel Ferguson
@ 2024-03-21 22:55 ` Daniel Ferguson
  2024-04-26 11:45   ` Shiju Jose
  2024-04-11 20:43 ` [PATCH v5 0/2] Adds additional information to ARM RAS errors Daniel Ferguson
  2 siblings, 1 reply; 5+ messages in thread
From: Daniel Ferguson @ 2024-03-21 22:55 UTC (permalink / raw)
  To: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck, Borislav Petkov
  Cc: linux-acpi, linux-kernel, linux-edac, Daniel Ferguson,
	Shengwei Luo, Jason Tian

From: Shengwei Luo <luoshengwei@huawei.com>

The original arm_event trace code only traces out ARM processor error
information data. That is not enough for the user to take appropriate action.

According to the UEFI 2.9 specification, section N.2.4.4, the ARM processor
error section includes several ARM processor error information structures,
several ARM processor context information structures and several vendor
specific error information structures. In addition to these, there are the
error severity and the logical CPU index of the event. Report all of this
information to userspace via the perf interface, so that the user can do CPU
core isolation according to error severity and other info.

Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
Signed-off-by: Jason Tian <jason@os.amperecomputing.com>
Signed-off-by: Daniel Ferguson <danielf@os.amperecomputing.com>
---
 drivers/acpi/apei/ghes.c |  3 +--
 drivers/ras/ras.c        | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/ras.h      | 15 ++++++++++++---
 include/ras/ras_event.h  | 48 +++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 100 insertions(+), 12 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 58014558b8e0..a93c80fe1bab 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -535,9 +535,8 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
 	int sec_sev, i;
 	char *p;
 
-	log_arm_hw_error(err);
-
 	sec_sev = ghes_severity(gdata->error_severity);
+	log_arm_hw_error(err, sec_sev);
 	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
 		return false;
 
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 249dce21a738..3e2beed2db07 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -53,9 +53,51 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id,
 }
 
 #if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
-void log_arm_hw_error(struct cper_sec_proc_arm *err)
+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
 {
-	trace_arm_event(err);
+	u32 pei_len;
+	u32 ctx_len = 0;
+	s32 vsei_len;
+	u8 *pei_err;
+	u8 *ctx_err;
+	u8 *ven_err_data;
+	struct cper_arm_err_info *err_info;
+	struct cper_arm_ctx_info *ctx_info;
+	int n, sz;
+	int cpu;
+
+	pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num;
+	pei_err = (u8 *)err + sizeof(struct cper_sec_proc_arm);
+
+	err_info = (struct cper_arm_err_info *)(err + 1);
+	ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
+	ctx_err = (u8 *)ctx_info;
+	for (n = 0; n < err->context_info_num; n++) {
+		sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
+		ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
+		ctx_len += sz;
+	}
+
+	vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) +
+						pei_len + ctx_len);
+	if (vsei_len < 0) {
+		pr_warn(FW_BUG
+			"section length: %d\n", err->section_length);
+		pr_warn(FW_BUG
+			"section length is too small\n");
+		pr_warn(FW_BUG
+			"firmware-generated error record is incorrect\n");
+		vsei_len = 0;
+	}
+	ven_err_data = (u8 *)ctx_info;
+
+	cpu = GET_LOGICAL_INDEX(err->mpidr);
+	/* when return value is invalid, set cpu index to -1 */
+	if (cpu < 0)
+		cpu = -1;
+
+	trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len,
+			ven_err_data, (u32)vsei_len, sev, cpu);
 }
 #endif
 
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 811feb9d8160..2070e4ae0626 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -25,7 +25,7 @@ void log_non_standard_event(const guid_t *sec_type,
 			    const guid_t *fru_id, const char *fru_text,
 			    const u8 sev, const u8 *err, const u32 len);
 #if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
-void log_arm_hw_error(struct cper_sec_proc_arm *err);
+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev);
 #endif
 #else
 static inline void
@@ -35,7 +35,7 @@ log_non_standard_event(const guid_t *sec_type,
 { return; }
 #if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
 static inline void
-log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
+log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return; }
 #endif
 #endif
 
@@ -55,5 +55,14 @@ static inline void amd_retire_dram_row(struct atl_err *err) { }
 static inline unsigned long
 amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
 #endif /* CONFIG_AMD_ATL */
-
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+#include <asm/smp_plat.h>
+/*
+ * Include ARM specific SMP header which provides a function mapping mpidr to
+ * cpu logical index.
+ */
+#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr & MPIDR_HWID_BITMASK)
+#else
+#define GET_LOGICAL_INDEX(mpidr) -EINVAL
+#endif /* CONFIG_ARM || CONFIG_ARM64 */
 #endif /* __RAS_H__ */
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index c011ea236e9b..a7d7b6e717b6 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -168,11 +168,24 @@ TRACE_EVENT(mc_event,
  * This event is generated when hardware detects an ARM processor error
  * has occurred. UEFI 2.6 spec section N.2.4.4.
  */
+#define APEIL "ARM Processor Err Info data len"
+#define APEID "ARM Processor Err Info raw data"
+#define APECIL "ARM Processor Err Context Info data len"
+#define APECID "ARM Processor Err Context Info raw data"
+#define VSEIL "Vendor Specific Err Info data len"
+#define VSEID "Vendor Specific Err Info raw data"
 TRACE_EVENT(arm_event,
 
-	TP_PROTO(const struct cper_sec_proc_arm *proc),
+	TP_PROTO(const struct cper_sec_proc_arm *proc, const u8 *pei_err,
+			const u32 pei_len,
+			const u8 *ctx_err,
+			const u32 ctx_len,
+			const u8 *oem,
+			const u32 oem_len,
+			u8 sev,
+			int cpu),
 
-	TP_ARGS(proc),
+	TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev, cpu),
 
 	TP_STRUCT__entry(
 		__field(u64, mpidr)
@@ -180,6 +193,14 @@ TRACE_EVENT(arm_event,
 		__field(u32, running_state)
 		__field(u32, psci_state)
 		__field(u8, affinity)
+		__field(u32, pei_len)
+		__dynamic_array(u8, buf, pei_len)
+		__field(u32, ctx_len)
+		__dynamic_array(u8, buf1, ctx_len)
+		__field(u32, oem_len)
+		__dynamic_array(u8, buf2, oem_len)
+		__field(u8, sev)
+		__field(int, cpu)
 	),
 
 	TP_fast_assign(
@@ -199,12 +220,29 @@ TRACE_EVENT(arm_event,
 			__entry->running_state = ~0;
 			__entry->psci_state = ~0;
 		}
+		__entry->pei_len = pei_len;
+		memcpy(__get_dynamic_array(buf), pei_err, pei_len);
+		__entry->ctx_len = ctx_len;
+		memcpy(__get_dynamic_array(buf1), ctx_err, ctx_len);
+		__entry->oem_len = oem_len;
+		memcpy(__get_dynamic_array(buf2), oem, oem_len);
+		__entry->sev = sev;
+		__entry->cpu = cpu;
 	),
 
-	TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
-		  "running state: %d; PSCI state: %d",
+	TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
+		  "running state: %d; PSCI state: %d; "
+		  "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s",
+		  __entry->cpu,
+		  __entry->sev,
 		  __entry->affinity, __entry->mpidr, __entry->midr,
-		  __entry->running_state, __entry->psci_state)
+		  __entry->running_state, __entry->psci_state,
+		  APEIL, __entry->pei_len, APEID,
+		  __print_hex(__get_dynamic_array(buf), __entry->pei_len),
+		  APECIL, __entry->ctx_len, APECID,
+		  __print_hex(__get_dynamic_array(buf1), __entry->ctx_len),
+		  VSEIL, __entry->oem_len, VSEID,
+		  __print_hex(__get_dynamic_array(buf2), __entry->oem_len))
 );
 
 /*

-- 
2.43.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v5 0/2] Adds additional information to ARM RAS errors
  2024-03-21 22:55 [PATCH v5 0/2] Adds additional information to ARM RAS errors Daniel Ferguson
  2024-03-21 22:55 ` [PATCH v5 1/2] RAS: ACPI: APEI: add conditional compilation to ARM specific error reporting routines Daniel Ferguson
  2024-03-21 22:55 ` [PATCH v5 2/2] RAS: Report ARM processor information to userspace Daniel Ferguson
@ 2024-04-11 20:43 ` Daniel Ferguson
  2 siblings, 0 replies; 5+ messages in thread
From: Daniel Ferguson @ 2024-04-11 20:43 UTC (permalink / raw)
  To: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck, Borislav Petkov
  Cc: linux-acpi, linux-kernel, linux-edac, Shengwei Luo, Jason Tian


On 3/21/2024 3:55 PM, Daniel Ferguson wrote:
> The patch by Shengwei includes most of the justification for this series
> in the commit message. The only thing i've done is add a few conditional compilation
> directives based on feedback from a previous submission attempt. This series adds:
> 
> 	1) Conditional compilation directives around ARM specific RAS error
>          handling routines, so non-ARM platforms are not unnecessarily bloated.
>       2) ARM Processor error section (As defined by UEFI 2.9 N2.4) to tracepoints for userspace
>          consumption. This particular patch is a RESEND.
> 
> Originally:
> 	I did a RESEND of Shengwei's V3. I didn't make
>       changes, and I didn't rev.
> 	The original patch: https://lore.kernel.org/lkml/20220214030813.135766-1-lostway@zju.edu.cn/
> Changes since v3:
>       Added conditional compilation directives
> 	previous submission(RESEND): https://lore.kernel.org/lkml/20231214232330.306526-1-danielf@os.amperecomputing.com/
> Changes since v4:
>       Rebased on latest linux master.
> 	No functional changes.
> 	previous submission: https://lore.kernel.org/linux-kernel/20240226-b4-arm-ras-error-vendor-info-v4-rc3-v4-0-08e0f168fec1@os.amperecomputing.com/
> 

Hi Tony,
Thank you for your original comments several months ago for my v3
resend. I'm hoping I can get you to take another peek at this.

Daniel

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH v5 2/2] RAS: Report ARM processor information to userspace
  2024-03-21 22:55 ` [PATCH v5 2/2] RAS: Report ARM processor information to userspace Daniel Ferguson
@ 2024-04-26 11:45   ` Shiju Jose
  0 siblings, 0 replies; 5+ messages in thread
From: Shiju Jose @ 2024-04-26 11:45 UTC (permalink / raw)
  To: Daniel Ferguson, Rafael J. Wysocki, Len Brown, James Morse,
	Tony Luck, Borislav Petkov
  Cc: linux-acpi, linux-kernel, linux-edac, luoshengwei, Jason Tian

Tested-by: Shiju Jose <shiju.jose@huawei.com>

The CPU core isolation feature in rasdaemon depends on this kernel patch.

Thanks,
Shiju
>-----Original Message-----
>From: Daniel Ferguson <danielf@os.amperecomputing.com>
>Sent: 21 March 2024 22:56
>To: Rafael J. Wysocki <rafael@kernel.org>; Len Brown <lenb@kernel.org>;
>James Morse <james.morse@arm.com>; Tony Luck <tony.luck@intel.com>;
>Borislav Petkov <bp@alien8.de>
>Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
>edac@vger.kernel.org; Daniel Ferguson <danielf@os.amperecomputing.com>;
>luoshengwei <luoshengwei@huawei.com>; Jason Tian
><jason@os.amperecomputing.com>
>Subject: [PATCH v5 2/2] RAS: Report ARM processor information to userspace
>
>From: Shengwei Luo <luoshengwei@huawei.com>
>
>The original arm_event trace code only traces out ARM processor error
>information data. It's not enough for user to take appropriate action.
>
>According to UEFI_2_9 specification chapter N2.4.4, the ARM processor error
>section includes several ARM processor error information, several ARM
>processor context information and several vendor specific error information
>structures. In addition to these info, there are error severity and cpu logical
>index about the event. Report all of these information to userspace via perf i/f.
>So that the user can do cpu core isolation according to error severity and other
>info.
>
>Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
>Signed-off-by: Jason Tian <jason@os.amperecomputing.com>
>Signed-off-by: Daniel Ferguson <danielf@os.amperecomputing.com>
>---
> drivers/acpi/apei/ghes.c |  3 +--
> drivers/ras/ras.c        | 46
>++++++++++++++++++++++++++++++++++++++++++++--
> include/linux/ras.h      | 15 ++++++++++++---
> include/ras/ras_event.h  | 48
>+++++++++++++++++++++++++++++++++++++++++++-----
> 4 files changed, 100 insertions(+), 12 deletions(-)
>
>diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index
>58014558b8e0..a93c80fe1bab 100644
>--- a/drivers/acpi/apei/ghes.c
>+++ b/drivers/acpi/apei/ghes.c
>@@ -535,9 +535,8 @@ static bool ghes_handle_arm_hw_error(struct
>acpi_hest_generic_data *gdata,
> 	int sec_sev, i;
> 	char *p;
>
>-	log_arm_hw_error(err);
>-
> 	sec_sev = ghes_severity(gdata->error_severity);
>+	log_arm_hw_error(err, sec_sev);
> 	if (sev != GHES_SEV_RECOVERABLE || sec_sev !=
>GHES_SEV_RECOVERABLE)
> 		return false;
>
>diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index
>249dce21a738..3e2beed2db07 100644
>--- a/drivers/ras/ras.c
>+++ b/drivers/ras/ras.c
>@@ -53,9 +53,51 @@ void log_non_standard_event(const guid_t *sec_type,
>const guid_t *fru_id,  }
>
> #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void
>log_arm_hw_error(struct cper_sec_proc_arm *err)
>+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
> {
>-	trace_arm_event(err);
>+	u32 pei_len;
>+	u32 ctx_len = 0;
>+	s32 vsei_len;
>+	u8 *pei_err;
>+	u8 *ctx_err;
>+	u8 *ven_err_data;
>+	struct cper_arm_err_info *err_info;
>+	struct cper_arm_ctx_info *ctx_info;
>+	int n, sz;
>+	int cpu;
>+
>+	pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num;
>+	pei_err = (u8 *)err + sizeof(struct cper_sec_proc_arm);
>+
>+	err_info = (struct cper_arm_err_info *)(err + 1);
>+	ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
>+	ctx_err = (u8 *)ctx_info;
>+	for (n = 0; n < err->context_info_num; n++) {
>+		sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
>+		ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
>+		ctx_len += sz;
>+	}
>+
>+	vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) +
>+						pei_len + ctx_len);
>+	if (vsei_len < 0) {
>+		pr_warn(FW_BUG
>+			"section length: %d\n", err->section_length);
>+		pr_warn(FW_BUG
>+			"section length is too small\n");
>+		pr_warn(FW_BUG
>+			"firmware-generated error record is incorrect\n");
>+		vsei_len = 0;
>+	}
>+	ven_err_data = (u8 *)ctx_info;
>+
>+	cpu = GET_LOGICAL_INDEX(err->mpidr);
>+	/* when return value is invalid, set cpu index to -1 */
>+	if (cpu < 0)
>+		cpu = -1;
>+
>+	trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len,
>+			ven_err_data, (u32)vsei_len, sev, cpu);
> }
> #endif
>
>diff --git a/include/linux/ras.h b/include/linux/ras.h index
>811feb9d8160..2070e4ae0626 100644
>--- a/include/linux/ras.h
>+++ b/include/linux/ras.h
>@@ -25,7 +25,7 @@ void log_non_standard_event(const guid_t *sec_type,
> 			    const guid_t *fru_id, const char *fru_text,
> 			    const u8 sev, const u8 *err, const u32 len);  #if
>defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void
>log_arm_hw_error(struct cper_sec_proc_arm *err);
>+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev);
> #endif
> #else
> static inline void
>@@ -35,7 +35,7 @@ log_non_standard_event(const guid_t *sec_type,  { return;
>}  #if defined(CONFIG_ARM) || defined(CONFIG_ARM64)  static inline void -
>log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
>+log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return;
>+}
> #endif
> #endif
>
>@@ -55,5 +55,14 @@ static inline void amd_retire_dram_row(struct atl_err
>*err) { }  static inline unsigned long
>amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL;
>}  #endif /* CONFIG_AMD_ATL */
>-
>+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) #include
>+<asm/smp_plat.h>
>+/*
>+ * Include ARM specific SMP header which provides a function mapping
>+mpidr to
>+ * cpu logical index.
>+ */
>+#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr &
>+MPIDR_HWID_BITMASK) #else #define GET_LOGICAL_INDEX(mpidr) -EINVAL
>+#endif /* CONFIG_ARM || CONFIG_ARM64 */
> #endif /* __RAS_H__ */
>diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index
>c011ea236e9b..a7d7b6e717b6 100644
>--- a/include/ras/ras_event.h
>+++ b/include/ras/ras_event.h
>@@ -168,11 +168,24 @@ TRACE_EVENT(mc_event,
>  * This event is generated when hardware detects an ARM processor error
>  * has occurred. UEFI 2.6 spec section N.2.4.4.
>  */
>+#define APEIL "ARM Processor Err Info data len"
>+#define APEID "ARM Processor Err Info raw data"
>+#define APECIL "ARM Processor Err Context Info data len"
>+#define APECID "ARM Processor Err Context Info raw data"
>+#define VSEIL "Vendor Specific Err Info data len"
>+#define VSEID "Vendor Specific Err Info raw data"
> TRACE_EVENT(arm_event,
>
>-	TP_PROTO(const struct cper_sec_proc_arm *proc),
>+	TP_PROTO(const struct cper_sec_proc_arm *proc, const u8 *pei_err,
>+			const u32 pei_len,
>+			const u8 *ctx_err,
>+			const u32 ctx_len,
>+			const u8 *oem,
>+			const u32 oem_len,
>+			u8 sev,
>+			int cpu),
>
>-	TP_ARGS(proc),
>+	TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev,
>+cpu),
>
> 	TP_STRUCT__entry(
> 		__field(u64, mpidr)
>@@ -180,6 +193,14 @@ TRACE_EVENT(arm_event,
> 		__field(u32, running_state)
> 		__field(u32, psci_state)
> 		__field(u8, affinity)
>+		__field(u32, pei_len)
>+		__dynamic_array(u8, buf, pei_len)
>+		__field(u32, ctx_len)
>+		__dynamic_array(u8, buf1, ctx_len)
>+		__field(u32, oem_len)
>+		__dynamic_array(u8, buf2, oem_len)
>+		__field(u8, sev)
>+		__field(int, cpu)
> 	),
>
> 	TP_fast_assign(
>@@ -199,12 +220,29 @@ TRACE_EVENT(arm_event,
> 			__entry->running_state = ~0;
> 			__entry->psci_state = ~0;
> 		}
>+		__entry->pei_len = pei_len;
>+		memcpy(__get_dynamic_array(buf), pei_err, pei_len);
>+		__entry->ctx_len = ctx_len;
>+		memcpy(__get_dynamic_array(buf1), ctx_err, ctx_len);
>+		__entry->oem_len = oem_len;
>+		memcpy(__get_dynamic_array(buf2), oem, oem_len);
>+		__entry->sev = sev;
>+		__entry->cpu = cpu;
> 	),
>
>-	TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
>-		  "running state: %d; PSCI state: %d",
>+	TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR:
>%016llx; "
>+		  "running state: %d; PSCI state: %d; "
>+		  "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s",
>+		  __entry->cpu,
>+		  __entry->sev,
> 		  __entry->affinity, __entry->mpidr, __entry->midr,
>-		  __entry->running_state, __entry->psci_state)
>+		  __entry->running_state, __entry->psci_state,
>+		  APEIL, __entry->pei_len, APEID,
>+		  __print_hex(__get_dynamic_array(buf), __entry->pei_len),
>+		  APECIL, __entry->ctx_len, APECID,
>+		  __print_hex(__get_dynamic_array(buf1), __entry->ctx_len),
>+		  VSEIL, __entry->oem_len, VSEID,
>+		  __print_hex(__get_dynamic_array(buf2), __entry->oem_len))
> );
>
> /*
>
>--
>2.43.0
>


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-04-26 11:46 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-21 22:55 [PATCH v5 0/2] Adds additional information to ARM RAS errors Daniel Ferguson
2024-03-21 22:55 ` [PATCH v5 1/2] RAS: ACPI: APEI: add conditional compilation to ARM specific error reporting routines Daniel Ferguson
2024-03-21 22:55 ` [PATCH v5 2/2] RAS: Report ARM processor information to userspace Daniel Ferguson
2024-04-26 11:45   ` Shiju Jose
2024-04-11 20:43 ` [PATCH v5 0/2] Adds additional information to ARM RAS errors Daniel Ferguson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).