[v2,2/2] x86/mce: Report only DRAM ECC as memory errors on AMD systems
diff mbox series

Message ID 20171207203955.118171-2-Yazen.Ghannam@amd.com
State New, archived
Headers show
Series
  • [v2,1/2] x86/mce/AMD: Define function to get SMCA bank type
Related show

Commit Message

Ghannam, Yazen Dec. 7, 2017, 8:39 p.m. UTC
From: Yazen Ghannam <yazen.ghannam@amd.com>

The MCA_STATUS[ErrorCodeExt] field is very bank type specific. We currently
check if the ErrorCodeExt value is 0x0 or 0x8 in mce_is_memory_error(), but
we don't check the bank. This means that we could flag non-memory errors as
memory errors.

We know that we want to flag DRAM ECC errors as memory errors, so let's do
those cases first. We can add more cases later when needed.

Check that bank type is UMC and xec is 0 on SMCA systems.

Check that bank is 4 (Northbridge) and xec is 8 on legacy systems.

Define a wrapper function in mce_amd.c so we can use SMCA enums.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
Link: https://lkml.kernel.org/r/20171201155034.39206-2-Yazen.Ghannam@amd.com

v1->v2:
* No changes.

 arch/x86/include/asm/mce.h           |  2 ++
 arch/x86/kernel/cpu/mcheck/mce.c     |  4 +---
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 11 +++++++++++
 3 files changed, 14 insertions(+), 3 deletions(-)

Comments

Borislav Petkov Dec. 10, 2017, 4:18 p.m. UTC | #1
On Thu, Dec 07, 2017 at 02:39:55PM -0600, Yazen Ghannam wrote:
> From: Yazen Ghannam <yazen.ghannam@amd.com>
> 
> The MCA_STATUS[ErrorCodeExt] field is very bank type specific. We currently
> check if the ErrorCodeExt value is 0x0 or 0x8 in mce_is_memory_error(), but
> we don't check the bank. This means that we could flag non-memory errors as
> memory errors.
> 
> We know that we want to flag DRAM ECC errors as memory errors, so let's do
> those cases first. We can add more cases later when needed.
> 
> Check that bank type is UMC and xec is 0 on SMCA systems.
> 
> Check that bank is 4 (Northbridge) and xec is 8 on legacy systems.
> 
> Define a wrapper function in mce_amd.c so we can use SMCA enums.
> 
> Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
> ---
> Link:
> https://lkml.kernel.org/r/20171201155034.39206-2-Yazen.Ghannam@amd.com
> 
> v1->v2:
> * No changes.
> 
>  arch/x86/include/asm/mce.h           |  2 ++
>  arch/x86/kernel/cpu/mcheck/mce.c     |  4 +---
>  arch/x86/kernel/cpu/mcheck/mce_amd.c | 11 +++++++++++
>  3 files changed, 14 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index b1e8d8db921f..96ea4b5ba658 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -376,6 +376,7 @@ struct smca_bank {
>  extern struct smca_bank smca_banks[MAX_NR_BANKS];
>  
>  extern const char *smca_get_long_name(enum smca_bank_types t);
> +extern bool amd_mce_is_memory_error(struct mce *m);
>  
>  extern int mce_threshold_create_device(unsigned int cpu);
>  extern int mce_threshold_remove_device(unsigned int cpu);
> @@ -384,6 +385,7 @@ extern int mce_threshold_remove_device(unsigned int cpu);
>  
>  static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
>  static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
> +static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
>  
>  #endif
>  
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index b1d616d08eee..321c7a80be66 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m)
>  bool mce_is_memory_error(struct mce *m)
>  {
>  	if (m->cpuvendor == X86_VENDOR_AMD) {
> -		/* ErrCodeExt[20:16] */
> -		u8 xec = (m->status >> 16) & 0x1f;
> +		return amd_mce_is_memory_error(m);
>  
> -		return (xec == 0x0 || xec == 0x8);
>  	} else if (m->cpuvendor == X86_VENDOR_INTEL) {
>  		/*
>  		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
> diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
> index 219d5115f4d4..2b7f7ce4bedf 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
> @@ -750,6 +750,17 @@ int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
>  }
>  EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
>  
> +bool amd_mce_is_memory_error(struct mce *m)
> +{
> +	/* ErrCodeExt[20:16] */
> +	u8 xec = (m->status >> 16) & 0x1f;
> +
> +	if (mce_flags.smca)
> +		return (smca_get_bank_type(m) == SMCA_UMC && xec == 0x0);
> +
> +	return (m->bank == 4 && xec == 0x8);

You don't need parentheses around the returned expressions.

Anyway, applied, thx.

Patch
diff mbox series

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index b1e8d8db921f..96ea4b5ba658 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -376,6 +376,7 @@  struct smca_bank {
 extern struct smca_bank smca_banks[MAX_NR_BANKS];
 
 extern const char *smca_get_long_name(enum smca_bank_types t);
+extern bool amd_mce_is_memory_error(struct mce *m);
 
 extern int mce_threshold_create_device(unsigned int cpu);
 extern int mce_threshold_remove_device(unsigned int cpu);
@@ -384,6 +385,7 @@  extern int mce_threshold_remove_device(unsigned int cpu);
 
 static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
 static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
+static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
 
 #endif
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b1d616d08eee..321c7a80be66 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -503,10 +503,8 @@  static int mce_usable_address(struct mce *m)
 bool mce_is_memory_error(struct mce *m)
 {
 	if (m->cpuvendor == X86_VENDOR_AMD) {
-		/* ErrCodeExt[20:16] */
-		u8 xec = (m->status >> 16) & 0x1f;
+		return amd_mce_is_memory_error(m);
 
-		return (xec == 0x0 || xec == 0x8);
 	} else if (m->cpuvendor == X86_VENDOR_INTEL) {
 		/*
 		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 219d5115f4d4..2b7f7ce4bedf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -750,6 +750,17 @@  int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
 }
 EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
 
+bool amd_mce_is_memory_error(struct mce *m)
+{
+	/* ErrCodeExt[20:16] */
+	u8 xec = (m->status >> 16) & 0x1f;
+
+	if (mce_flags.smca)
+		return (smca_get_bank_type(m) == SMCA_UMC && xec == 0x0);
+
+	return (m->bank == 4 && xec == 0x8);
+}
+
 static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
 {
 	struct mce m;