All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86/MCE/AMD: Clear DFR errors found in THR handler
@ 2022-06-21 15:59 Yazen Ghannam
  2022-10-27 11:46 ` Borislav Petkov
  2022-10-28  6:41 ` [tip: ras/core] " tip-bot2 for Yazen Ghannam
  0 siblings, 2 replies; 4+ messages in thread
From: Yazen Ghannam @ 2022-06-21 15:59 UTC (permalink / raw)
  To: linux-edac
  Cc: linux-kernel, tony.luck, x86, Smita.KoralahalliChannabasappa,
	Yazen Ghannam

AMD's MCA Thresholding feature counts errors of all severities, not just
correctable errors. If a deferred error causes the threshold limit to be
reached (it was the error that caused the overflow), then both a
deferred error interrupt and a thresholding interrupt will be triggered.

The order of the interrupts is not guaranteed. If the threshold
interrupt handler is executed first, then it will clear MCA_STATUS for
the error. It will not check or clear MCA_DESTAT which also holds a copy
of the deferred error. When the deferred error interrupt handler runs it
will not find an error in MCA_STATUS, but it will find the error in
MCA_DESTAT. This will cause two errors to be logged.

Check for deferred errors when handling a threshold interrupt. If a bank
contains a deferred error, then clear the bank's MCA_DESTAT register.

Define a new helper function to do the deferred error check and clearing
of MCA_DESTAT.

Fixes: 37d43acfd79f ("x86/mce/AMD: Redo error logging from APIC LVT interrupt handlers")
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Cc: stable@vger.kernel.org
---
 arch/x86/kernel/cpu/mce/amd.c | 37 +++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 1c87501e0fa3..ab1145cf8328 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -788,6 +788,28 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
 	return status & MCI_STATUS_DEFERRED;
 }
 
+static bool _log_error_deferred(unsigned int bank, u32 misc)
+{
+	bool defrd;
+
+	defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
+				mca_msr_reg(bank, MCA_ADDR), misc);
+
+	if (!defrd)
+		return false;
+
+	/*
+	 * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers.
+	 * Return true here to avoid accessing these registers.
+	 */
+	if (!mce_flags.smca)
+		return true;
+
+	/* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
+	wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+	return true;
+}
+
 /*
  * We have three scenarios for checking for Deferred errors:
  *
@@ -799,19 +821,8 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
  */
 static void log_error_deferred(unsigned int bank)
 {
-	bool defrd;
-
-	defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
-				mca_msr_reg(bank, MCA_ADDR), 0);
-
-	if (!mce_flags.smca)
-		return;
-
-	/* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
-	if (defrd) {
-		wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+	if (_log_error_deferred(bank, 0))
 		return;
-	}
 
 	/*
 	 * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
@@ -832,7 +843,7 @@ static void amd_deferred_error_interrupt(void)
 
 static void log_error_thresholding(unsigned int bank, u64 misc)
 {
-	_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc);
+	_log_error_deferred(bank, misc);
 }
 
 static void log_and_reset_block(struct threshold_block *block)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] x86/MCE/AMD: Clear DFR errors found in THR handler
  2022-06-21 15:59 [PATCH] x86/MCE/AMD: Clear DFR errors found in THR handler Yazen Ghannam
@ 2022-10-27 11:46 ` Borislav Petkov
  2022-10-27 15:01   ` Yazen Ghannam
  2022-10-28  6:41 ` [tip: ras/core] " tip-bot2 for Yazen Ghannam
  1 sibling, 1 reply; 4+ messages in thread
From: Borislav Petkov @ 2022-10-27 11:46 UTC (permalink / raw)
  To: Yazen Ghannam
  Cc: linux-edac, linux-kernel, tony.luck, x86, Smita.KoralahalliChannabasappa

On Tue, Jun 21, 2022 at 03:59:43PM +0000, Yazen Ghannam wrote:
> AMD's MCA Thresholding feature counts errors of all severities, not just
> correctable errors. If a deferred error causes the threshold limit to be
> reached (it was the error that caused the overflow), then both a
> deferred error interrupt and a thresholding interrupt will be triggered.
> 
> The order of the interrupts is not guaranteed. If the threshold
> interrupt handler is executed first, then it will clear MCA_STATUS for
> the error. It will not check or clear MCA_DESTAT which also holds a copy
> of the deferred error. When the deferred error interrupt handler runs it
> will not find an error in MCA_STATUS, but it will find the error in
> MCA_DESTAT. This will cause two errors to be logged.
> 
> Check for deferred errors when handling a threshold interrupt. If a bank
> contains a deferred error, then clear the bank's MCA_DESTAT register.
> 
> Define a new helper function to do the deferred error check and clearing
> of MCA_DESTAT.
> 
> Fixes: 37d43acfd79f ("x86/mce/AMD: Redo error logging from APIC LVT interrupt handlers")
> Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
> Cc: stable@vger.kernel.org
> ---
>  arch/x86/kernel/cpu/mce/amd.c | 37 +++++++++++++++++++++++------------
>  1 file changed, 24 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
> index 1c87501e0fa3..ab1145cf8328 100644
> --- a/arch/x86/kernel/cpu/mce/amd.c
> +++ b/arch/x86/kernel/cpu/mce/amd.c
> @@ -788,6 +788,28 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
>  	return status & MCI_STATUS_DEFERRED;
>  }
>  
> +static bool _log_error_deferred(unsigned int bank, u32 misc)
> +{
> +	bool defrd;
> +
> +	defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
> +				mca_msr_reg(bank, MCA_ADDR), misc);
> +
> +	if (!defrd)
> +		return false;

I've zapped that defrd variable:

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index ab1145cf8328..6ae7edea3270 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -790,12 +790,8 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
 
 static bool _log_error_deferred(unsigned int bank, u32 misc)
 {
-	bool defrd;
-
-	defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
-				mca_msr_reg(bank, MCA_ADDR), misc);
-
-	if (!defrd)
+	if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
+			     mca_msr_reg(bank, MCA_ADDR), misc))
 		return false;
 
 	/*

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] x86/MCE/AMD: Clear DFR errors found in THR handler
  2022-10-27 11:46 ` Borislav Petkov
@ 2022-10-27 15:01   ` Yazen Ghannam
  0 siblings, 0 replies; 4+ messages in thread
From: Yazen Ghannam @ 2022-10-27 15:01 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: linux-edac, linux-kernel, tony.luck, x86, Smita.KoralahalliChannabasappa

On Thu, Oct 27, 2022 at 01:46:24PM +0200, Borislav Petkov wrote:
> On Tue, Jun 21, 2022 at 03:59:43PM +0000, Yazen Ghannam wrote:
> > > AMD's MCA Thresholding feature counts errors of all severities, not just
> > correctable errors. If a deferred error causes the threshold limit to be
> > reached (it was the error that caused the overflow), then both a
> > deferred error interrupt and a thresholding interrupt will be triggered.
> > 
> > The order of the interrupts is not guaranteed. If the threshold
> > interrupt handler is executed first, then it will clear MCA_STATUS for
> > the error. It will not check or clear MCA_DESTAT which also holds a copy
> > of the deferred error. When the deferred error interrupt handler runs it
> > will not find an error in MCA_STATUS, but it will find the error in
> > MCA_DESTAT. This will cause two errors to be logged.
> > 
> > Check for deferred errors when handling a threshold interrupt. If a bank
> > contains a deferred error, then clear the bank's MCA_DESTAT register.
> > 
> > Define a new helper function to do the deferred error check and clearing
> > of MCA_DESTAT.
> > 
> > Fixes: 37d43acfd79f ("x86/mce/AMD: Redo error logging from APIC LVT interrupt handlers")
> > Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
> > Cc: stable@vger.kernel.org
> > ---
> >  arch/x86/kernel/cpu/mce/amd.c | 37 +++++++++++++++++++++++------------
> >  1 file changed, 24 insertions(+), 13 deletions(-)
> > 
> > diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
> > index 1c87501e0fa3..ab1145cf8328 100644
> > --- a/arch/x86/kernel/cpu/mce/amd.c
> > +++ b/arch/x86/kernel/cpu/mce/amd.c
> > @@ -788,6 +788,28 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
> >  	return status & MCI_STATUS_DEFERRED;
> >  }
> >  
> > +static bool _log_error_deferred(unsigned int bank, u32 misc)
> > +{
> > +	bool defrd;
> > +
> > +	defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
> > +				mca_msr_reg(bank, MCA_ADDR), misc);
> > +
> > +	if (!defrd)
> > +		return false;
> 
> I've zapped that defrd variable:
> 
> diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
> index ab1145cf8328..6ae7edea3270 100644
> --- a/arch/x86/kernel/cpu/mce/amd.c
> +++ b/arch/x86/kernel/cpu/mce/amd.c
> @@ -790,12 +790,8 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
>  
>  static bool _log_error_deferred(unsigned int bank, u32 misc)
>  {
> -	bool defrd;
> -
> -	defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
> -				mca_msr_reg(bank, MCA_ADDR), misc);
> -
> -	if (!defrd)
> +	if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
> +			     mca_msr_reg(bank, MCA_ADDR), misc))
>  		return false;
>  
>  	/*
> 
> -- 

Yep, looks good to me. Thanks!

-Yazen

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [tip: ras/core] x86/MCE/AMD: Clear DFR errors found in THR handler
  2022-06-21 15:59 [PATCH] x86/MCE/AMD: Clear DFR errors found in THR handler Yazen Ghannam
  2022-10-27 11:46 ` Borislav Petkov
@ 2022-10-28  6:41 ` tip-bot2 for Yazen Ghannam
  1 sibling, 0 replies; 4+ messages in thread
From: tip-bot2 for Yazen Ghannam @ 2022-10-28  6:41 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Yazen Ghannam, Borislav Petkov, stable, x86, linux-kernel

The following commit has been merged into the ras/core branch of tip:

Commit-ID:     bc1b705b0eee4c645ad8b3bbff3c8a66e9688362
Gitweb:        https://git.kernel.org/tip/bc1b705b0eee4c645ad8b3bbff3c8a66e9688362
Author:        Yazen Ghannam <yazen.ghannam@amd.com>
AuthorDate:    Tue, 21 Jun 2022 15:59:43 +00:00
Committer:     Borislav Petkov <bp@suse.de>
CommitterDate: Thu, 27 Oct 2022 17:01:25 +02:00

x86/MCE/AMD: Clear DFR errors found in THR handler

AMD's MCA Thresholding feature counts errors of all severity levels, not
just correctable errors. If a deferred error causes the threshold limit
to be reached (it was the error that caused the overflow), then both a
deferred error interrupt and a thresholding interrupt will be triggered.

The order of the interrupts is not guaranteed. If the threshold
interrupt handler is executed first, then it will clear MCA_STATUS for
the error. It will not check or clear MCA_DESTAT which also holds a copy
of the deferred error. When the deferred error interrupt handler runs it
will not find an error in MCA_STATUS, but it will find the error in
MCA_DESTAT. This will cause two errors to be logged.

Check for deferred errors when handling a threshold interrupt. If a bank
contains a deferred error, then clear the bank's MCA_DESTAT register.

Define a new helper function to do the deferred error check and clearing
of MCA_DESTAT.

  [ bp: Simplify, convert comment to passive voice. ]

Fixes: 37d43acfd79f ("x86/mce/AMD: Redo error logging from APIC LVT interrupt handlers")
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20220621155943.33623-1-yazen.ghannam@amd.com
---
 arch/x86/kernel/cpu/mce/amd.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 1c87501..10fb5b5 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -788,6 +788,24 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
 	return status & MCI_STATUS_DEFERRED;
 }
 
+static bool _log_error_deferred(unsigned int bank, u32 misc)
+{
+	if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
+			     mca_msr_reg(bank, MCA_ADDR), misc))
+		return false;
+
+	/*
+	 * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers.
+	 * Return true here to avoid accessing these registers.
+	 */
+	if (!mce_flags.smca)
+		return true;
+
+	/* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */
+	wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+	return true;
+}
+
 /*
  * We have three scenarios for checking for Deferred errors:
  *
@@ -799,19 +817,8 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
  */
 static void log_error_deferred(unsigned int bank)
 {
-	bool defrd;
-
-	defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
-				mca_msr_reg(bank, MCA_ADDR), 0);
-
-	if (!mce_flags.smca)
-		return;
-
-	/* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
-	if (defrd) {
-		wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+	if (_log_error_deferred(bank, 0))
 		return;
-	}
 
 	/*
 	 * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
@@ -832,7 +839,7 @@ static void amd_deferred_error_interrupt(void)
 
 static void log_error_thresholding(unsigned int bank, u64 misc)
 {
-	_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc);
+	_log_error_deferred(bank, misc);
 }
 
 static void log_and_reset_block(struct threshold_block *block)

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-10-28  6:42 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-21 15:59 [PATCH] x86/MCE/AMD: Clear DFR errors found in THR handler Yazen Ghannam
2022-10-27 11:46 ` Borislav Petkov
2022-10-27 15:01   ` Yazen Ghannam
2022-10-28  6:41 ` [tip: ras/core] " tip-bot2 for Yazen Ghannam

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.