linux-edac.vger.kernel.org archive mirror
From: Yazen Ghannam <yazen.ghannam@amd.com>
To: <linux-edac@vger.kernel.org>
Cc: <linux-kernel@vger.kernel.org>, <tony.luck@intel.com>,
	<x86@kernel.org>, <Avadhut.Naik@amd.com>, <John.Allen@amd.com>,
	Yazen Ghannam <yazen.ghannam@amd.com>
Subject: [PATCH v2 10/16] x86/mce: Unify AMD DFR handler with MCA Polling
Date: Thu, 4 Apr 2024 10:13:53 -0500
Message-ID: <20240404151359.47970-11-yazen.ghannam@amd.com>
In-Reply-To: <20240404151359.47970-1-yazen.ghannam@amd.com>

AMD systems optionally support a Deferred error interrupt. The interrupt
should be used as another signal to trigger MCA polling. This is similar
to how other MCA interrupts are handled.

Deferred errors do not require any special handling related to the
interrupt itself, such as resetting or rearming it.

However, Scalable MCA systems include a pair of registers, MCA_DESTAT
and MCA_DEADDR, that should be checked for valid errors. This check
should be done whenever MCA registers are polled. Currently, the
Deferred error interrupt does this check, but the MCA polling function
does not.

Call the MCA polling function when handling the Deferred error
interrupt. This keeps all "polling" cases in a common function.

Call the polling function only for banks that have the Deferred error
interrupt enabled.

Add a "SMCA DFR handler" for Deferred errors to the AMD vendor-specific
error handler callback. This will do the same status check, register
clearing, and logging that the interrupt handler has done. And it
extends the common polling flow to find AMD Deferred errors.

Remove old code whose functionality is already covered in the common MCA
code.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---

Notes:
    Link:
    https://lkml.kernel.org/r/20231118193248.1296798-15-yazen.ghannam@amd.com
    
    v1->v2:
    * Keep separate interrupt entry points. (Yazen)
    * Move DFR error setup for MCA_CONFIG to a helper. (Yazen)
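
    For reference, below is a minimal stand-alone sketch (user-space C,
    not kernel code) of the flow this patch sets up: the Deferred error
    interrupt simply polls the DFR-enabled banks, and a bank whose
    MCA_STATUS is invalid is still handled when MCA_DESTAT holds a valid
    error. Register accesses are mocked with plain arrays, the bank count
    and the values in main() are made up for illustration, and details
    such as mce_read_aux() and storm tracking are omitted. Function names
    are reused from the patch only for readability.

/*
 * Stand-alone model of the unified flow (mock registers, user space).
 * Only the control flow mirrors the kernel changes; everything else is
 * illustrative.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_BANKS	4
#define STATUS_VAL	(1ULL << 63)	/* MCI_STATUS_VAL */
#define STATUS_DEFERRED	(1ULL << 44)	/* MCI_STATUS_DEFERRED */

/* Mock per-bank registers standing in for MCA_{STATUS,DESTAT,DEADDR}. */
static uint64_t mca_status[NUM_BANKS];
static uint64_t mca_destat[NUM_BANKS];
static uint64_t mca_deaddr[NUM_BANKS];

/* Banks with the Deferred error interrupt enabled (mce_dfr_int_banks). */
static bool dfr_int_bank[NUM_BANKS];

static void log_error(unsigned int bank, uint64_t status, uint64_t addr)
{
	printf("bank %u: status=%#llx addr=%#llx\n", bank,
	       (unsigned long long)status, (unsigned long long)addr);
}

/* Vendor callback: scenarios 2) and 3) from the comment in amd.c. */
static void handle_smca_dfr_error(unsigned int bank, uint64_t status)
{
	if (!(status & STATUS_DEFERRED)) {
		/* No deferred error in MCA_STATUS, so check MCA_DESTAT. */
		if (!(mca_destat[bank] & STATUS_VAL))
			return;

		log_error(bank, mca_destat[bank], mca_deaddr[bank]);
	}

	/* Clear MCA_DESTAT once the deferred error has been handled. */
	mca_destat[bank] = 0;
}

/* Simplified machine_check_poll(): walk only the banks set in @banks. */
static void machine_check_poll(const bool *banks)
{
	for (unsigned int i = 0; i < NUM_BANKS; i++) {
		if (!banks[i])
			continue;

		/*
		 * New behavior: a bank with an invalid MCA_STATUS is no
		 * longer skipped if MCA_DESTAT holds a valid error.
		 */
		if (!(mca_status[i] & STATUS_VAL) &&
		    !(mca_destat[i] & STATUS_VAL))
			continue;

		if (mca_status[i] & STATUS_VAL)
			log_error(i, mca_status[i], 0);

		handle_smca_dfr_error(i, mca_status[i]);
		mca_status[i] = 0;
	}
}

/* The Deferred error interrupt is now just another polling trigger. */
static void amd_deferred_error_interrupt(void)
{
	machine_check_poll(dfr_int_bank);
}

int main(void)
{
	/* Bank 1 reports a deferred error only through MCA_DESTAT. */
	dfr_int_bank[1] = true;
	mca_destat[1] = STATUS_VAL;
	mca_deaddr[1] = 0x1234;

	amd_deferred_error_interrupt();
	return 0;
}

    Running this prints only the deferred error that bank 1 reports
    through MCA_DESTAT, i.e. the case the new smca_destat_is_valid()
    check in machine_check_poll() exists to catch.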

 arch/x86/kernel/cpu/mce/amd.c  | 155 +++++++++++++--------------------
 arch/x86/kernel/cpu/mce/core.c |  16 +++-
 2 files changed, 76 insertions(+), 95 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 75195d6fe971..40912c5e35d1 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -62,11 +62,13 @@
 #define CFG_MCAX_EN			BIT_ULL(32)
 #define CFG_LSB_IN_STATUS		BIT_ULL(8)
 #define CFG_DFR_INT_SUPP		BIT_ULL(5)
+#define CFG_DFR_LOG_SUPP		BIT_ULL(2)
 
 /* Threshold LVT offset is at MSR0xC0000410[15:12] */
 #define SMCA_THR_LVT_OFF	0xF000
 
 static bool thresholding_irq_en;
+static DEFINE_PER_CPU(mce_banks_t, mce_dfr_int_banks);
 
 static const char * const th_names[] = {
 	"load_store",
@@ -350,6 +352,28 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
 
 }
 
+/* SMCA sets the Deferred Error Interrupt type per bank. */
+static void configure_smca_dfr(unsigned int bank, u64 *mca_config)
+{
+	/* Nothing to do if the bank doesn't support deferred error logging. */
+	if (!FIELD_GET(CFG_DFR_LOG_SUPP, *mca_config))
+		return;
+
+	/* Nothing to do if the bank doesn't support setting the interrupt type. */
+	if (!FIELD_GET(CFG_DFR_INT_SUPP, *mca_config))
+		return;
+
+	/*
+	 * Nothing to do if the interrupt type is already set. Either it was set by
+	 * the OS already. Or it was set by firmware, and the OS should leave it as-is.
+	 */
+	if (FIELD_GET(CFG_DFR_INT_TYPE, *mca_config))
+		return;
+
+	*mca_config |= FIELD_PREP(CFG_DFR_INT_TYPE, INTR_TYPE_APIC);
+	set_bit(bank, (void *)this_cpu_ptr(&mce_dfr_int_banks));
+}
+
 /* Set appropriate bits in MCA_CONFIG. */
 static void configure_smca(unsigned int bank)
 {
@@ -370,18 +394,7 @@ static void configure_smca(unsigned int bank)
 	 */
 	mca_config |= FIELD_PREP(CFG_MCAX_EN, 0x1);
 
-	/*
-	 * SMCA sets the Deferred Error Interrupt type per bank.
-	 *
-	 * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
-	 * if the DeferredIntType bit field is available.
-	 *
-	 * MCA_CONFIG[DeferredIntType] is bits [38:37]. OS should set
-	 * this to 0x1 to enable APIC based interrupt. First, check that
-	 * no interrupt has been set.
-	 */
-	if (FIELD_GET(CFG_DFR_INT_SUPP, mca_config) && !FIELD_GET(CFG_DFR_INT_TYPE, mca_config))
-		mca_config |= FIELD_PREP(CFG_DFR_INT_TYPE, INTR_TYPE_APIC);
+	configure_smca_dfr(bank, &mca_config);
 
 	if (FIELD_GET(CFG_LSB_IN_STATUS, mca_config))
 		this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = true;
@@ -872,33 +885,6 @@ bool amd_mce_usable_address(struct mce *m)
 	return false;
 }
 
-static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
-{
-	struct mce m;
-
-	mce_setup(&m);
-
-	m.status = status;
-	m.misc   = misc;
-	m.bank   = bank;
-	m.tsc	 = rdtsc();
-
-	if (m.status & MCI_STATUS_ADDRV) {
-		m.addr = addr;
-
-		smca_extract_err_addr(&m);
-	}
-
-	if (mce_flags.smca) {
-		rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
-
-		if (m.status & MCI_STATUS_SYNDV)
-			rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
-	}
-
-	mce_log(&m);
-}
-
 DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
 {
 	trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
@@ -908,75 +894,46 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
 	apic_eoi();
 }
 
-/*
- * Returns true if the logged error is deferred. False, otherwise.
- */
-static inline bool
-_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
-{
-	u64 status, addr = 0;
-
-	rdmsrl(msr_stat, status);
-	if (!(status & MCI_STATUS_VAL))
-		return false;
-
-	if (status & MCI_STATUS_ADDRV)
-		rdmsrl(msr_addr, addr);
-
-	__log_error(bank, status, addr, misc);
-
-	wrmsrl(msr_stat, 0);
-
-	return status & MCI_STATUS_DEFERRED;
-}
-
-static bool _log_error_deferred(unsigned int bank, u32 misc)
-{
-	if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
-			     mca_msr_reg(bank, MCA_ADDR), misc))
-		return false;
-
-	/*
-	 * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers.
-	 * Return true here to avoid accessing these registers.
-	 */
-	if (!mce_flags.smca)
-		return true;
-
-	/* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */
-	wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
-	return true;
-}
-
 /*
  * We have three scenarios for checking for Deferred errors:
  *
  * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ *    This is already handled in machine_check_poll().
  * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
  *    clear MCA_DESTAT.
  * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
  *    log it.
  */
-static void log_error_deferred(unsigned int bank)
+static void handle_smca_dfr_error(struct mce *m)
 {
-	if (_log_error_deferred(bank, 0))
+	struct mce m_dfr;
+	u64 mca_destat;
+
+	/* Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers. */
+	if (!mce_flags.smca)
 		return;
 
-	/*
-	 * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
-	 * for a valid error.
-	 */
-	_log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
-			      MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
-}
+	/* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */
+	if (m->status & MCI_STATUS_DEFERRED)
+		goto out;
 
-/* APIC interrupt handler for deferred errors */
-static void amd_deferred_error_interrupt(void)
-{
-	unsigned int bank;
+	/* MCA_STATUS didn't have a deferred error, so check MCA_DESTAT for one. */
+	mca_destat = mce_rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
+
+	if (!(mca_destat & MCI_STATUS_VAL))
+		return;
+
+	/* Reuse the same data collected from machine_check_poll(). */
+	memcpy(&m_dfr, m, sizeof(m_dfr));
+
+	/* Save the MCA_DE{STAT,ADDR} values. */
+	m_dfr.status = mca_destat;
+	m_dfr.addr = mce_rdmsrl(MSR_AMD64_SMCA_MCx_DEADDR(m_dfr.bank));
 
-	for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank)
-		log_error_deferred(bank);
+	mce_log(&m_dfr);
+
+out:
+	wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
 }
 
 static void reset_block(struct threshold_block *block)
@@ -1035,9 +992,19 @@ static void amd_threshold_interrupt(void)
 	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
 }
 
+/*
+ * Deferred error interrupt handler will service DEFERRED_ERROR_VECTOR. The interrupt
+ * is triggered when a bank logs a deferred error.
+ */
+static void amd_deferred_error_interrupt(void)
+{
+	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_dfr_int_banks));
+}
+
 void amd_handle_error(struct mce *m)
 {
 	reset_thr_blocks(m->bank);
+	handle_smca_dfr_error(m);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 75297e7eb980..308766868f39 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -680,6 +680,14 @@ static void vendor_handle_error(struct mce *m)
 
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
+static bool smca_destat_is_valid(unsigned int bank)
+{
+	if (!mce_flags.smca)
+		return false;
+
+	return mce_rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank)) & MCI_STATUS_VAL;
+}
+
 /*
  * Poll for corrected events or events that happened before reset.
  * Those are just logged through /dev/mcelog.
@@ -731,8 +739,14 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 			mce_track_storm(&m);
 
 		/* If this entry is not valid, ignore it */
-		if (!(m.status & MCI_STATUS_VAL))
+		if (!(m.status & MCI_STATUS_VAL)) {
+			if (smca_destat_is_valid(i)) {
+				mce_read_aux(&m, i);
+				goto clear_it;
+			}
+
 			continue;
+		}
 
 		/*
 		 * If we are logging everything (at CPU online) or this
-- 
2.34.1


Thread overview: 53+ messages
2024-04-04 15:13 [PATCH v2 00/16] MCA Updates Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 01/16] x86/mce: Define mce_setup() helpers for common and per-CPU fields Yazen Ghannam
2024-04-16 10:02   ` Borislav Petkov
2024-04-17 13:50     ` Yazen Ghannam
2024-04-22  8:13       ` Borislav Petkov
2024-04-04 15:13 ` [PATCH v2 02/16] x86/mce: Use mce_setup() helpers for apei_smca_report_x86_error() Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 03/16] x86/mce/amd: Use fixed bank number for quirks Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 04/16] x86/mce/amd: Look up bank type by IPID Yazen Ghannam
2024-04-23 17:06   ` Borislav Petkov
2024-04-23 19:16     ` Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 05/16] x86/mce/amd: Clean up SMCA configuration Yazen Ghannam
2024-04-23 19:06   ` Borislav Petkov
2024-04-23 19:32     ` Yazen Ghannam
2024-04-24  2:29       ` Borislav Petkov
2024-04-24 13:44         ` Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 06/16] x86/mce/amd: Prep DFR handler before enabling banks Yazen Ghannam
2024-04-24 18:34   ` Borislav Petkov
2024-04-25 13:31     ` Yazen Ghannam
2024-04-29 12:38       ` Borislav Petkov
2024-04-29 13:22         ` Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 07/16] x86/mce/amd: Simplify DFR handler setup Yazen Ghannam
2024-04-24 19:06   ` Borislav Petkov
2024-04-25 14:12     ` Yazen Ghannam
2024-04-29 12:59       ` Borislav Petkov
2024-04-29 13:56         ` Yazen Ghannam
2024-04-29 14:12           ` Borislav Petkov
2024-04-29 14:25             ` Yazen Ghannam
2024-04-30 13:47               ` Borislav Petkov
2024-04-29 18:34       ` Robert Richter
2024-04-30 18:06         ` Borislav Petkov
2024-05-02 16:02           ` Yazen Ghannam
2024-05-02 18:48             ` Robert Richter
2024-05-04 14:37               ` Borislav Petkov
2024-04-04 15:13 ` [PATCH v2 08/16] x86/mce/amd: Clean up enable_deferred_error_interrupt() Yazen Ghannam
2024-04-29 13:12   ` Borislav Petkov
2024-04-29 14:18     ` Yazen Ghannam
2024-05-04 14:41       ` Borislav Petkov
2024-04-04 15:13 ` [PATCH v2 09/16] x86/mce: Unify AMD THR handler with MCA Polling Yazen Ghannam
2024-04-29 13:40   ` Borislav Petkov
2024-04-29 14:36     ` Yazen Ghannam
2024-05-04 14:52       ` Borislav Petkov
2024-05-07 16:25         ` Yazen Ghannam
2024-04-04 15:13 ` Yazen Ghannam [this message]
2024-04-04 15:13 ` [PATCH v2 11/16] x86/mce: Skip AMD threshold init if no threshold banks found Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 12/16] x86/mce/amd: Support SMCA Corrected Error Interrupt Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 13/16] x86/mce: Add wrapper for struct mce to export vendor specific info Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 14/16] x86/mce, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 15/16] x86/mce/apei: Handle variable register array size Yazen Ghannam
2024-04-04 15:13 ` [PATCH v2 16/16] EDAC/mce_amd: Add support for FRU Text in MCA Yazen Ghannam
2024-04-05 16:06   ` Luck, Tony
2024-04-07 13:19     ` Yazen Ghannam
2024-04-08 19:47     ` Naik, Avadhut
2024-04-08 19:57       ` Luck, Tony
