From: Yazen Ghannam <yazen.ghannam@amd.com>
To: <linux-edac@vger.kernel.org>
Cc: <linux-kernel@vger.kernel.org>, <tony.luck@intel.com>,
<x86@kernel.org>, <Avadhut.Naik@amd.com>, <John.Allen@amd.com>,
Yazen Ghannam <yazen.ghannam@amd.com>
Subject: [PATCH v2 14/16] x86/mce, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers
Date: Thu, 4 Apr 2024 10:13:57 -0500 [thread overview]
Message-ID: <20240404151359.47970-15-yazen.ghannam@amd.com> (raw)
In-Reply-To: <20240404151359.47970-1-yazen.ghannam@amd.com>
From: Avadhut Naik <Avadhut.Naik@amd.com>
AMD's Scalable MCA systems viz. Genoa will include two new registers:
MCA_SYND1 and MCA_SYND2.
These registers will include supplemental error information in addition
to the existing MCA_SYND register. The data within the registers is
considered valid if MCA_STATUS[SyndV] is set.
Add fields for these registers as vendor-specific error information
in struct mce_hw_err. Save and print these registers wherever
MCA_STATUS[SyndV]/MCA_SYND is currently used.
Also, modify the mce_record tracepoint to export these new registers
through __dynamic_array. While the sizeof() operator has been used to
determine the size of this __dynamic_array, the same, if needed in the
future can be substituted by caching the size of vendor-specific error
information as part of struct mce_hw_err.
Signed-off-by: Avadhut Naik <Avadhut.Naik@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
Notes:
Link:
https://lkml.kernel.org/r/20231118193248.1296798-19-yazen.ghannam@amd.com
v1->v2:
* Rebase on upstream changes for MCE trace event. (Avadhut)
arch/x86/include/asm/mce.h | 12 ++++++++++++
arch/x86/kernel/cpu/mce/core.c | 26 ++++++++++++++++++--------
drivers/edac/mce_amd.c | 10 +++++++---
include/trace/events/mce.h | 9 +++++++--
4 files changed, 44 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index e4ad9807b3e3..a701290f80a1 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -118,6 +118,9 @@
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
+/* Registers MISC2 to MISC4 are at offsets B to D. */
+#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e
+#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f
#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
@@ -128,6 +131,8 @@
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
+#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x))
#define XEC(x, mask) (((x) >> 16) & mask)
@@ -185,6 +190,13 @@ enum mce_notifier_prios {
struct mce_hw_err {
struct mce m;
+
+ union vendor_info {
+ struct {
+ u64 synd1;
+ u64 synd2;
+ } amd;
+ } vi;
};
struct notifier_block;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index fef025bda2af..aa27729f7899 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -201,6 +201,10 @@ static void __print_mce(struct mce_hw_err *err)
if (mce_flags.smca) {
if (m->synd)
pr_cont("SYND %llx ", m->synd);
+ if (err->vi.amd.synd1)
+ pr_cont("SYND1 %llx ", err->vi.amd.synd1);
+ if (err->vi.amd.synd2)
+ pr_cont("SYND2 %llx ", err->vi.amd.synd2);
if (m->ipid)
pr_cont("IPID %llx ", m->ipid);
}
@@ -651,8 +655,10 @@ static struct notifier_block mce_default_nb = {
/*
* Read ADDR and MISC registers.
*/
-static noinstr void mce_read_aux(struct mce *m, int i)
+static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
{
+ struct mce *m = &err->m;
+
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
@@ -674,8 +680,11 @@ static noinstr void mce_read_aux(struct mce *m, int i)
if (mce_flags.smca) {
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
- if (m->status & MCI_STATUS_SYNDV)
+ if (m->status & MCI_STATUS_SYNDV) {
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
+ err->vi.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i));
+ err->vi.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i));
+ }
}
}
@@ -751,7 +760,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
/* If this entry is not valid, ignore it */
if (!(m->status & MCI_STATUS_VAL)) {
if (smca_destat_is_valid(i)) {
- mce_read_aux(m, i);
+ mce_read_aux(&err, i);
goto clear_it;
}
@@ -801,7 +810,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (flags & MCP_DONTLOG)
goto clear_it;
- mce_read_aux(m, i);
+ mce_read_aux(&err, i);
m->severity = mce_severity(m, NULL, NULL, false);
/*
@@ -943,9 +952,10 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
-static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp,
struct pt_regs *regs)
{
+ struct mce *m = &err->m;
char *tmp = *msg;
int i;
@@ -963,7 +973,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo
m->bank = i;
if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
- mce_read_aux(m, i);
+ mce_read_aux(err, i);
*msg = tmp;
return 1;
}
@@ -1361,7 +1371,7 @@ __mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, struct mce *final,
if (severity == MCE_NO_SEVERITY)
continue;
- mce_read_aux(m, i);
+ mce_read_aux(err, i);
/* assuming valid severity level != 0 */
m->severity = severity;
@@ -1562,7 +1572,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
final = this_cpu_ptr(&hw_errs_seen);
final->m = *m;
- no_way_out = mce_no_way_out(m, &msg, valid_banks, regs);
+ no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs);
barrier();
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index e02af5da1ec2..32bf4cc564a3 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -792,7 +792,8 @@ static const char *decode_error_status(struct mce *m)
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
- struct mce *m = (struct mce *)data;
+ struct mce_hw_err *err = (struct mce_hw_err *)data;
+ struct mce *m = &err->m;
unsigned int fam = x86_family(m->cpuid);
int ecc;
@@ -850,8 +851,11 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
if (boot_cpu_has(X86_FEATURE_SMCA)) {
pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
- if (m->status & MCI_STATUS_SYNDV)
- pr_cont(", Syndrome: 0x%016llx", m->synd);
+ if (m->status & MCI_STATUS_SYNDV) {
+ pr_cont(", Syndrome: 0x%016llx\n", m->synd);
+ pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx",
+ err->vi.amd.synd1, err->vi.amd.synd2);
+ }
pr_cont("\n");
diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h
index 65aba1afcd07..43e8ecc11881 100644
--- a/include/trace/events/mce.h
+++ b/include/trace/events/mce.h
@@ -43,6 +43,8 @@ TRACE_EVENT(mce_record,
__field( u8, bank )
__field( u8, cpuvendor )
__field( u32, microcode )
+ __field( u8, len )
+ __dynamic_array(u8, v_data, sizeof(err->vi))
),
TP_fast_assign(
@@ -65,9 +67,11 @@ TRACE_EVENT(mce_record,
__entry->bank = err->m.bank;
__entry->cpuvendor = err->m.cpuvendor;
__entry->microcode = err->m.microcode;
+ __entry->len = sizeof(err->vi);
+ memcpy(__get_dynamic_array(v_data), &err->vi, sizeof(err->vi));
),
- TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x",
+ TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016llx, IPID: %016llx, ADDR: %016llx, MISC: %016llx, SYND: %016llx, RIP: %02x:<%016llx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x, vendor data: %s",
__entry->cpu,
__entry->mcgcap, __entry->mcgstatus,
__entry->bank, __entry->status,
@@ -83,7 +87,8 @@ TRACE_EVENT(mce_record,
__entry->walltime,
__entry->socketid,
__entry->apicid,
- __entry->microcode)
+ __entry->microcode,
+ __print_array(__get_dynamic_array(v_data), __entry->len / 8, 8))
);
#endif /* _TRACE_MCE_H */
--
2.34.1
next prev parent reply other threads:[~2024-04-04 15:14 UTC|newest]
Thread overview: 53+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-04-04 15:13 [PATCH v2 00/16] MCA Updates Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 01/16] x86/mce: Define mce_setup() helpers for common and per-CPU fields Yazen Ghannam
2024-04-16 10:02 ` Borislav Petkov
2024-04-17 13:50 ` Yazen Ghannam
2024-04-22 8:13 ` Borislav Petkov
2024-04-04 15:13 ` [PATCH v2 02/16] x86/mce: Use mce_setup() helpers for apei_smca_report_x86_error() Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 03/16] x86/mce/amd: Use fixed bank number for quirks Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 04/16] x86/mce/amd: Look up bank type by IPID Yazen Ghannam
2024-04-23 17:06 ` Borislav Petkov
2024-04-23 19:16 ` Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 05/16] x86/mce/amd: Clean up SMCA configuration Yazen Ghannam
2024-04-23 19:06 ` Borislav Petkov
2024-04-23 19:32 ` Yazen Ghannam
2024-04-24 2:29 ` Borislav Petkov
2024-04-24 13:44 ` Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 06/16] x86/mce/amd: Prep DFR handler before enabling banks Yazen Ghannam
2024-04-24 18:34 ` Borislav Petkov
2024-04-25 13:31 ` Yazen Ghannam
2024-04-29 12:38 ` Borislav Petkov
2024-04-29 13:22 ` Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 07/16] x86/mce/amd: Simplify DFR handler setup Yazen Ghannam
2024-04-24 19:06 ` Borislav Petkov
2024-04-25 14:12 ` Yazen Ghannam
2024-04-29 12:59 ` Borislav Petkov
2024-04-29 13:56 ` Yazen Ghannam
2024-04-29 14:12 ` Borislav Petkov
2024-04-29 14:25 ` Yazen Ghannam
2024-04-30 13:47 ` Borislav Petkov
2024-04-29 18:34 ` Robert Richter
2024-04-30 18:06 ` Borislav Petkov
2024-05-02 16:02 ` Yazen Ghannam
2024-05-02 18:48 ` Robert Richter
2024-05-04 14:37 ` Borislav Petkov
2024-04-04 15:13 ` [PATCH v2 08/16] x86/mce/amd: Clean up enable_deferred_error_interrupt() Yazen Ghannam
2024-04-29 13:12 ` Borislav Petkov
2024-04-29 14:18 ` Yazen Ghannam
2024-05-04 14:41 ` Borislav Petkov
2024-04-04 15:13 ` [PATCH v2 09/16] x86/mce: Unify AMD THR handler with MCA Polling Yazen Ghannam
2024-04-29 13:40 ` Borislav Petkov
2024-04-29 14:36 ` Yazen Ghannam
2024-05-04 14:52 ` Borislav Petkov
2024-05-07 16:25 ` Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 10/16] x86/mce: Unify AMD DFR " Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 11/16] x86/mce: Skip AMD threshold init if no threshold banks found Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 12/16] x86/mce/amd: Support SMCA Corrected Error Interrupt Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 13/16] x86/mce: Add wrapper for struct mce to export vendor specific info Yazen Ghannam
2024-04-04 15:13 ` Yazen Ghannam [this message]
2024-04-04 15:13 ` [PATCH v2 15/16] x86/mce/apei: Handle variable register array size Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 16/16] EDAC/mce_amd: Add support for FRU Text in MCA Yazen Ghannam
2024-04-05 16:06 ` Luck, Tony
2024-04-07 13:19 ` Yazen Ghannam
2024-04-08 19:47 ` Naik, Avadhut
2024-04-08 19:57 ` Luck, Tony
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240404151359.47970-15-yazen.ghannam@amd.com \
--to=yazen.ghannam@amd.com \
--cc=Avadhut.Naik@amd.com \
--cc=John.Allen@amd.com \
--cc=linux-edac@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=tony.luck@intel.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).