linux-edac.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types.
@ 2021-05-11 15:25 Naveen Krishna Chatradhi
  2021-05-11 15:25 ` [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2 Naveen Krishna Chatradhi
                   ` (3 more replies)
  0 siblings, 4 replies; 33+ messages in thread
From: Naveen Krishna Chatradhi @ 2021-05-11 15:25 UTC (permalink / raw)
  To: linux-edac, x86; +Cc: linux-kernel, bp, mingo, mchehab, Muralidhara M K

From: Muralidhara M K <muralimk@amd.com>

Add the (HWID, MCATYPE) tuples and names for new
SMCA bank types.

Also, add their respective error descriptions to the MCE
decoding module edac_mce_amd.

Signed-off-by: Muralidhara M K <muralimk@amd.com>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
 arch/x86/include/asm/mce.h    |  5 +++
 arch/x86/kernel/cpu/mce/amd.c | 16 ++++++++
 drivers/edac/mce_amd.c        | 70 +++++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index ddfb3cad8dff..cf7f35fdf2c8 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -317,6 +317,7 @@ enum smca_bank_types {
 	SMCA_CS_V2,	/* Coherent Slave */
 	SMCA_PIE,	/* Power, Interrupts, etc. */
 	SMCA_UMC,	/* Unified Memory Controller */
+	SMCA_UMC_V2,	/* Unified Memory Controller */
 	SMCA_PB,	/* Parameter Block */
 	SMCA_PSP,	/* Platform Security Processor */
 	SMCA_PSP_V2,	/* Platform Security Processor */
@@ -325,6 +326,10 @@ enum smca_bank_types {
 	SMCA_MP5,	/* Microprocessor 5 Unit */
 	SMCA_NBIO,	/* Northbridge IO Unit */
 	SMCA_PCIE,	/* PCI Express Unit */
+	SMCA_PCIE_V2,	/* PCI Express Unit */
+	SMCA_XGMI_PCS,	/* xGMI PCS Unit */
+	SMCA_XGMI_PHY,	/* xGMI PHY Unit */
+	SMCA_WAFL_PHY,	/* WAFL PHY Unit */
 	N_SMCA_BANK_TYPES
 };
 
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e486f96b3cb3..055f3a0acf5e 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -90,6 +90,7 @@ static struct smca_bank_name smca_names[] = {
 	[SMCA_CS_V2]	= { "coherent_slave",	"Coherent Slave" },
 	[SMCA_PIE]	= { "pie",		"Power, Interrupts, etc." },
 	[SMCA_UMC]	= { "umc",		"Unified Memory Controller" },
+	[SMCA_UMC_V2]	= { "umc_v2",		"Unified Memory Controller" },
 	[SMCA_PB]	= { "param_block",	"Parameter Block" },
 	[SMCA_PSP]	= { "psp",		"Platform Security Processor" },
 	[SMCA_PSP_V2]	= { "psp",		"Platform Security Processor" },
@@ -98,6 +99,10 @@ static struct smca_bank_name smca_names[] = {
 	[SMCA_MP5]	= { "mp5",		"Microprocessor 5 Unit" },
 	[SMCA_NBIO]	= { "nbio",		"Northbridge IO Unit" },
 	[SMCA_PCIE]	= { "pcie",		"PCI Express Unit" },
+	[SMCA_PCIE_V2]	= { "pcie",		"PCI Express Unit" },
+	[SMCA_XGMI_PCS]	= { "xgmi_pcs",		"Ext Global Memory Interconnect PCS Unit" },
+	[SMCA_XGMI_PHY]	= { "xgmi_phy",		"Ext Global Memory Interconnect PHY Unit" },
+	[SMCA_WAFL_PHY]	= { "wafl_phy",		"WAFL PHY Unit" },
 };
 
 static const char *smca_get_name(enum smca_bank_types t)
@@ -155,6 +160,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
 
 	/* Unified Memory Controller MCA type */
 	{ SMCA_UMC,	 HWID_MCATYPE(0x96, 0x0)	},
+	{ SMCA_UMC_V2,	 HWID_MCATYPE(0x96, 0x1)	},
 
 	/* Parameter Block MCA type */
 	{ SMCA_PB,	 HWID_MCATYPE(0x05, 0x0)	},
@@ -175,6 +181,16 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
 
 	/* PCI Express Unit MCA type */
 	{ SMCA_PCIE,	 HWID_MCATYPE(0x46, 0x0)	},
+	{ SMCA_PCIE_V2,	 HWID_MCATYPE(0x46, 0x1)	},
+
+	/* xGMI PCS MCA type */
+	{ SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0)	},
+
+	/* xGMI PHY MCA type */
+	{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0)	},
+
+	/* WAFL PHY MCA type */
+	{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0)	},
 };
 
 struct smca_bank smca_banks[MAX_NR_BANKS];
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 5dd905a3f30c..5515fd9336b1 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -323,6 +323,21 @@ static const char * const smca_umc_mce_desc[] = {
 	"AES SRAM ECC error",
 };
 
+static const char * const smca_umc2_mce_desc[] = {
+	"DRAM ECC error",
+	"Data poison error",
+	"SDP parity error",
+	"Reserved",
+	"Address/Command parity error",
+	"Write data parity error",
+	"DCQ SRAM ECC error",
+	"Reserved",
+	"Read data parity error",
+	"Rdb SRAM ECC error",
+	"RdRsp SRAM ECC error",
+	"LM32 MP errors",
+};
+
 static const char * const smca_pb_mce_desc[] = {
 	"An ECC error in the Parameter Block RAM array",
 };
@@ -400,6 +415,56 @@ static const char * const smca_pcie_mce_desc[] = {
 	"CCIX Non-okay write response with data error",
 };
 
+static const char * const smca_pcie2_mce_desc[] = {
+	"SDP Parity Error logging",
+};
+
+static const char * const smca_xgmipcs_mce_desc[] = {
+	"DataLossErr",
+	"TrainingErr",
+	"FlowCtrlAckErr",
+	"RxFifoUnderflowErr",
+	"RxFifoOverflowErr",
+	"CRCErr",
+	"BERExceededErr",
+	"TxVcidDataErr",
+	"ReplayBufParityErr",
+	"DataParityErr",
+	"ReplayFifoOverflowErr",
+	"ReplayFIfoUnderflowErr",
+	"ElasticFifoOverflowErr",
+	"DeskewErr",
+	"FlowCtrlCRCErr",
+	"DataStartupLimitErr",
+	"FCInitTimeoutErr",
+	"RecoveryTimeoutErr",
+	"ReadySerialTimeoutErr",
+	"ReadySerialAttemptErr",
+	"RecoveryAttemptErr",
+	"RecoveryRelockAttemptErr",
+	"ReplayAttemptErr",
+	"SyncHdrErr",
+	"TxReplayTimeoutErr",
+	"RxReplayTimeoutErr",
+	"LinkSubTxTimeoutErr",
+	"LinkSubRxTimeoutErr",
+	"RxCMDPktErr",
+};
+
+static const char * const smca_xgmiphy_mce_desc[] = {
+	"RAM ECC Error",
+	"ARC instruction buffer parity error",
+	"ARC data buffer parity error",
+	"PHY APB error",
+};
+
+static const char * const smca_waflphy_mce_desc[] = {
+	"RAM ECC Error",
+	"ARC instruction buffer parity error",
+	"ARC data buffer parity error",
+	"PHY APB error",
+};
+
 struct smca_mce_desc {
 	const char * const *descs;
 	unsigned int num_descs;
@@ -418,6 +483,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
 	[SMCA_CS_V2]	= { smca_cs2_mce_desc,	ARRAY_SIZE(smca_cs2_mce_desc)	},
 	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
 	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
+	[SMCA_UMC_V2]	= { smca_umc2_mce_desc,	ARRAY_SIZE(smca_umc2_mce_desc)	},
 	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
 	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
 	[SMCA_PSP_V2]	= { smca_psp2_mce_desc,	ARRAY_SIZE(smca_psp2_mce_desc)	},
@@ -426,6 +492,10 @@ static struct smca_mce_desc smca_mce_descs[] = {
 	[SMCA_MP5]	= { smca_mp5_mce_desc,	ARRAY_SIZE(smca_mp5_mce_desc)	},
 	[SMCA_NBIO]	= { smca_nbio_mce_desc,	ARRAY_SIZE(smca_nbio_mce_desc)	},
 	[SMCA_PCIE]	= { smca_pcie_mce_desc,	ARRAY_SIZE(smca_pcie_mce_desc)	},
+	[SMCA_PCIE_V2]	= { smca_pcie2_mce_desc,   ARRAY_SIZE(smca_pcie2_mce_desc)	},
+	[SMCA_XGMI_PCS]	= { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc)	},
+	[SMCA_XGMI_PHY]	= { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc)	},
+	[SMCA_WAFL_PHY]	= { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc)	},
 };
 
 static bool f12h_mc0_mce(u16 ec, u8 xec)
-- 
2.17.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2
  2021-05-11 15:25 [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Naveen Krishna Chatradhi
@ 2021-05-11 15:25 ` Naveen Krishna Chatradhi
  2021-05-11 17:34   ` Borislav Petkov
  2021-09-13  2:13   ` [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol Mukul Joshi
  2021-05-11 15:25 ` [PATCH 3/3] x86/mce: Add MCE priority for Accelerator devices Naveen Krishna Chatradhi
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 33+ messages in thread
From: Naveen Krishna Chatradhi @ 2021-05-11 15:25 UTC (permalink / raw)
  To: linux-edac, x86; +Cc: linux-kernel, bp, mingo, mchehab, Mukul Joshi

From: Mukul Joshi <mukul.joshi@amd.com>

Add a helper function to check if a given bank is
UMCv2 or not.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: John Clements <John.Clements@amd.com>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
 arch/x86/include/asm/mce.h    | 2 ++
 arch/x86/kernel/cpu/mce/amd.c | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index cf7f35fdf2c8..8cbe7221a253 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -357,6 +357,7 @@ extern int mce_threshold_remove_device(unsigned int cpu);
 
 void mce_amd_feature_init(struct cpuinfo_x86 *c);
 int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr);
+bool is_smca_umc_v2(int bank);
 
 #else
 
@@ -366,6 +367,7 @@ static inline bool amd_mce_is_memory_error(struct mce *m)		{ return false; };
 static inline void mce_amd_feature_init(struct cpuinfo_x86 *c)		{ }
 static inline int
 umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)	{ return -EINVAL; };
+static inline bool is_smca_umc_v2(int bank)				{ return false;	};
 #endif
 
 static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c)	{ return mce_amd_feature_init(c); }
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 055f3a0acf5e..41718e3111f2 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1542,3 +1542,9 @@ int mce_threshold_create_device(unsigned int cpu)
 	mce_threshold_remove_device(cpu);
 	return err;
 }
+
+bool is_smca_umc_v2(int bank)
+{
+	return (smca_get_bank_type(bank) == SMCA_UMC_V2);
+}
+EXPORT_SYMBOL_GPL(is_smca_umc_v2);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 3/3] x86/mce: Add MCE priority for Accelerator devices
  2021-05-11 15:25 [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Naveen Krishna Chatradhi
  2021-05-11 15:25 ` [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2 Naveen Krishna Chatradhi
@ 2021-05-11 15:25 ` Naveen Krishna Chatradhi
  2021-05-11 17:27 ` [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Borislav Petkov
  2021-05-26 16:46 ` [PATCH v2] " Naveen Krishna Chatradhi
  3 siblings, 0 replies; 33+ messages in thread
From: Naveen Krishna Chatradhi @ 2021-05-11 15:25 UTC (permalink / raw)
  To: linux-edac, x86; +Cc: linux-kernel, bp, mingo, mchehab, Mukul Joshi

From: Mukul Joshi <mukul.joshi@amd.com>

Create a new MCE priority for accelerator devices on the
notifier chain. On some future AMD platforms, GPU devices
will process MCE errors and work as error detection
devices.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: John Clements <John.Clements@amd.com>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
 arch/x86/include/asm/mce.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 8cbe7221a253..849f10a8602d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -173,6 +173,7 @@ enum mce_notifier_prios {
 	MCE_PRIO_LOWEST,
 	MCE_PRIO_MCELOG,
 	MCE_PRIO_EDAC,
+	MCE_PRIO_ACCEL,
 	MCE_PRIO_NFIT,
 	MCE_PRIO_EXTLOG,
 	MCE_PRIO_UC,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types.
  2021-05-11 15:25 [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Naveen Krishna Chatradhi
  2021-05-11 15:25 ` [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2 Naveen Krishna Chatradhi
  2021-05-11 15:25 ` [PATCH 3/3] x86/mce: Add MCE priority for Accelerator devices Naveen Krishna Chatradhi
@ 2021-05-11 17:27 ` Borislav Petkov
  2021-05-24 16:41   ` Chatradhi, Naveen Krishna
  2021-05-26 16:46 ` [PATCH v2] " Naveen Krishna Chatradhi
  3 siblings, 1 reply; 33+ messages in thread
From: Borislav Petkov @ 2021-05-11 17:27 UTC (permalink / raw)
  To: Naveen Krishna Chatradhi
  Cc: linux-edac, x86, linux-kernel, mingo, mchehab, Muralidhara M K

On Tue, May 11, 2021 at 08:55:36PM +0530, Naveen Krishna Chatradhi wrote:
> diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
> index e486f96b3cb3..055f3a0acf5e 100644
> --- a/arch/x86/kernel/cpu/mce/amd.c
> +++ b/arch/x86/kernel/cpu/mce/amd.c
> @@ -90,6 +90,7 @@ static struct smca_bank_name smca_names[] = {
>  	[SMCA_CS_V2]	= { "coherent_slave",	"Coherent Slave" },
>  	[SMCA_PIE]	= { "pie",		"Power, Interrupts, etc." },
>  	[SMCA_UMC]	= { "umc",		"Unified Memory Controller" },
> +	[SMCA_UMC_V2]	= { "umc_v2",		"Unified Memory Controller" },

So this is called "umc_v2" but the other V2 FUs's strings are the same.
Why?

Also, if you're going to repeat strings, you can just as well group all
those which are the same this way:

	[ SMCA_UMC ... SMCA_UMC_V2 ]    = { "umc",              "Unified Memory Controller" },

and do that for all which have V1 and V2.

I mean, gcc is smart enough to do that behind the scenes for identical
strings but you should do that in C too.

> diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
> index 5dd905a3f30c..5515fd9336b1 100644
> --- a/drivers/edac/mce_amd.c
> +++ b/drivers/edac/mce_amd.c
> @@ -323,6 +323,21 @@ static const char * const smca_umc_mce_desc[] = {
>  	"AES SRAM ECC error",
>  };
>  
> +static const char * const smca_umc2_mce_desc[] = {

Ok, gcc reuses the identical string pointers from smca_umc_mce_desc[] so
we should be ok wrt duplication.

> +	"DRAM ECC error",
> +	"Data poison error",
> +	"SDP parity error",
> +	"Reserved",
> +	"Address/Command parity error",
> +	"Write data parity error",
> +	"DCQ SRAM ECC error",
> +	"Reserved",
> +	"Read data parity error",
> +	"Rdb SRAM ECC error",
> +	"RdRsp SRAM ECC error",
> +	"LM32 MP errors",
> +};

...


> +static const char * const smca_xgmipcs_mce_desc[] = {
> +	"DataLossErr",
> +	"TrainingErr",
> +	"FlowCtrlAckErr",
> +	"RxFifoUnderflowErr",
> +	"RxFifoOverflowErr",
> +	"CRCErr",
> +	"BERExceededErr",
> +	"TxVcidDataErr",
> +	"ReplayBufParityErr",
> +	"DataParityErr",
> +	"ReplayFifoOverflowErr",
> +	"ReplayFIfoUnderflowErr",
> +	"ElasticFifoOverflowErr",
> +	"DeskewErr",
> +	"FlowCtrlCRCErr",
> +	"DataStartupLimitErr",
> +	"FCInitTimeoutErr",
> +	"RecoveryTimeoutErr",
> +	"ReadySerialTimeoutErr",
> +	"ReadySerialAttemptErr",
> +	"RecoveryAttemptErr",
> +	"RecoveryRelockAttemptErr",
> +	"ReplayAttemptErr",
> +	"SyncHdrErr",
> +	"TxReplayTimeoutErr",
> +	"RxReplayTimeoutErr",
> +	"LinkSubTxTimeoutErr",
> +	"LinkSubRxTimeoutErr",
> +	"RxCMDPktErr",

What happened to those and why aren't they proper words like the other
error descriptions?

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2
  2021-05-11 15:25 ` [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2 Naveen Krishna Chatradhi
@ 2021-05-11 17:34   ` Borislav Petkov
  2021-05-12  1:40     ` Joshi, Mukul
  2021-09-13  2:13   ` [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol Mukul Joshi
  1 sibling, 1 reply; 33+ messages in thread
From: Borislav Petkov @ 2021-05-11 17:34 UTC (permalink / raw)
  To: Naveen Krishna Chatradhi
  Cc: linux-edac, x86, linux-kernel, mingo, mchehab, Mukul Joshi

On Tue, May 11, 2021 at 08:55:37PM +0530, Naveen Krishna Chatradhi wrote:

> Subject: Re: [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2

The condensed patch description in the subject line should start with a
uppercase letter and should be written in imperative tone:

"x86/MCE/AMD: Add a helper function... "

> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> Reviewed-by: John Clements <John.Clements@amd.com>
> Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>

This is all fine and dandy but it needs your SOB too when you send the
patch.

Please read Documentation/process/submitting-patches.rst

sections

Sign your work - the Developer's Certificate of Origin
When to use Acked-by:, Cc:, and Co-developed-by:

> +bool is_smca_umc_v2(int bank)
> +{
> +	return (smca_get_bank_type(bank) == SMCA_UMC_V2);
> +}
> +EXPORT_SYMBOL_GPL(is_smca_umc_v2);

This addition looks useless when it doesn't have any users.

Also, I'm pretty sceptical this even makes sense to have it exported -
I'm guessing this is for mce_amd.c but I can't say without seeing it in
use.

The same remarks hold true for your patch 3.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2
  2021-05-11 17:34   ` Borislav Petkov
@ 2021-05-12  1:40     ` Joshi, Mukul
  2021-05-12  7:26       ` Borislav Petkov
  0 siblings, 1 reply; 33+ messages in thread
From: Joshi, Mukul @ 2021-05-12  1:40 UTC (permalink / raw)
  To: Borislav Petkov, Chatradhi, Naveen Krishna
  Cc: linux-edac, x86, linux-kernel, mingo, mchehab

[AMD Official Use Only - Internal Distribution Only]

Hi Boris,

Thanks for the review comments.
I will update the Subject line and send out an updated patch.

In the meantime, here is the link to where the changes in Patch 2 and 3 are being used:
https://lists.freedesktop.org/archives/amd-gfx/2021-May/063423.html

Thanks.

Regards,
Mukul

-----Original Message-----
From: Borislav Petkov <bp@alien8.de> 
Sent: Tuesday, May 11, 2021 1:35 PM
To: Chatradhi, Naveen Krishna <NaveenKrishna.Chatradhi@amd.com>
Cc: linux-edac@vger.kernel.org; x86@kernel.org; linux-kernel@vger.kernel.org; mingo@redhat.com; mchehab@kernel.org; Joshi, Mukul <Mukul.Joshi@amd.com>
Subject: Re: [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2

[CAUTION: External Email]

On Tue, May 11, 2021 at 08:55:37PM +0530, Naveen Krishna Chatradhi wrote:

> Subject: Re: [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2

The condensed patch description in the subject line should start with a uppercase letter and should be written in imperative tone:

"x86/MCE/AMD: Add a helper function... "

> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> Reviewed-by: John Clements <John.Clements@amd.com>
> Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>

This is all fine and dandy but it needs your SOB too when you send the patch.

Please read Documentation/process/submitting-patches.rst

sections

Sign your work - the Developer's Certificate of Origin When to use Acked-by:, Cc:, and Co-developed-by:

> +bool is_smca_umc_v2(int bank)
> +{
> +     return (smca_get_bank_type(bank) == SMCA_UMC_V2); } 
> +EXPORT_SYMBOL_GPL(is_smca_umc_v2);

This addition looks useless when it doesn't have any users.

Also, I'm pretty sceptical this even makes sense to have it exported - I'm guessing this is for mce_amd.c but I can't say without seeing it in use.

The same remarks hold true for your patch 3.

Thx.

--
Regards/Gruss,
    Boris.

https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeople.kernel.org%2Ftglx%2Fnotes-about-netiquette&amp;data=04%7C01%7Cmukul.joshi%40amd.com%7C0a7f1e0a3bf04c9ed89408d914a3141c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637563512928580623%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=z3xbuS8yX4Ea1kG9tWv0tGNQRPZP9N6QcBI2VNjn6W0%3D&amp;reserved=0

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2
  2021-05-12  1:40     ` Joshi, Mukul
@ 2021-05-12  7:26       ` Borislav Petkov
  0 siblings, 0 replies; 33+ messages in thread
From: Borislav Petkov @ 2021-05-12  7:26 UTC (permalink / raw)
  To: Joshi, Mukul
  Cc: Chatradhi, Naveen Krishna, linux-edac, x86, linux-kernel, mingo, mchehab

Hi Mukul,

On Wed, May 12, 2021 at 01:40:54AM +0000, Joshi, Mukul wrote:
> [AMD Official Use Only - Internal Distribution Only]

first of all, please do not top-post.

> Thanks for the review comments.
> I will update the Subject line and send out an updated patch.

Ok.

> In the meantime, here is the link to where the changes in Patch 2 and 3 are being used:
> https://lists.freedesktop.org/archives/amd-gfx/2021-May/063423.html

Yes, in the future, please Cc x86@kernel.org too when you're
using/exporting facilities to be used by other subsystems.

I'll try to reply to that patch through lore.kernel.org now.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types.
  2021-05-11 17:27 ` [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Borislav Petkov
@ 2021-05-24 16:41   ` Chatradhi, Naveen Krishna
  2021-05-25 18:02     ` Borislav Petkov
  0 siblings, 1 reply; 33+ messages in thread
From: Chatradhi, Naveen Krishna @ 2021-05-24 16:41 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: linux-edac, x86, linux-kernel, mingo, mchehab, M K, Muralidhara

[AMD Official Use Only]

Hi Boris

My apologies for delayed response. Thanks for your review comments, will submit a v2 shortly.

Regards,
Naveenk

-----Original Message-----
From: Borislav Petkov <bp@alien8.de> 
Sent: Tuesday, May 11, 2021 10:57 PM
To: Chatradhi, Naveen Krishna <NaveenKrishna.Chatradhi@amd.com>
Cc: linux-edac@vger.kernel.org; x86@kernel.org; linux-kernel@vger.kernel.org; mingo@redhat.com; mchehab@kernel.org; M K, Muralidhara <Muralidhara.MK@amd.com>
Subject: Re: [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types.

[CAUTION: External Email]

On Tue, May 11, 2021 at 08:55:36PM +0530, Naveen Krishna Chatradhi wrote:
> diff --git a/arch/x86/kernel/cpu/mce/amd.c 
> b/arch/x86/kernel/cpu/mce/amd.c index e486f96b3cb3..055f3a0acf5e 
> 100644
> --- a/arch/x86/kernel/cpu/mce/amd.c
> +++ b/arch/x86/kernel/cpu/mce/amd.c
> @@ -90,6 +90,7 @@ static struct smca_bank_name smca_names[] = {
>       [SMCA_CS_V2]    = { "coherent_slave",   "Coherent Slave" },
>       [SMCA_PIE]      = { "pie",              "Power, Interrupts, etc." },
>       [SMCA_UMC]      = { "umc",              "Unified Memory Controller" },
> +     [SMCA_UMC_V2]   = { "umc_v2",           "Unified Memory Controller" },

So this is called "umc_v2" but the other V2 FUs's strings are the same.
Why?
[naveenk:] There is a possibility for a heterogenous system with both the SMCA_UMC and SMCA_UMC_V2 variant of controllers to exist.
I will update the long name to describe accordingly.

Also, if you're going to repeat strings, you can just as well group all those which are the same this way:

        [ SMCA_UMC ... SMCA_UMC_V2 ]    = { "umc",              "Unified Memory Controller" },

and do that for all which have V1 and V2.

I mean, gcc is smart enough to do that behind the scenes for identical strings but you should do that in C too.
[naveenk:] thanks for the suggestion, I can do this for the other units.

> diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 
> 5dd905a3f30c..5515fd9336b1 100644
> --- a/drivers/edac/mce_amd.c
> +++ b/drivers/edac/mce_amd.c
> @@ -323,6 +323,21 @@ static const char * const smca_umc_mce_desc[] = {
>       "AES SRAM ECC error",
>  };
>
> +static const char * const smca_umc2_mce_desc[] = {

Ok, gcc reuses the identical string pointers from smca_umc_mce_desc[] so we should be ok wrt duplication.

> +     "DRAM ECC error",
> +     "Data poison error",
> +     "SDP parity error",
> +     "Reserved",
> +     "Address/Command parity error",
> +     "Write data parity error",
> +     "DCQ SRAM ECC error",
> +     "Reserved",
> +     "Read data parity error",
> +     "Rdb SRAM ECC error",
> +     "RdRsp SRAM ECC error",
> +     "LM32 MP errors",
> +};

...


> +static const char * const smca_xgmipcs_mce_desc[] = {
> +     "DataLossErr",
> +     "TrainingErr",
> +     "FlowCtrlAckErr",
> +     "RxFifoUnderflowErr",
> +     "RxFifoOverflowErr",
> +     "CRCErr",
> +     "BERExceededErr",
> +     "TxVcidDataErr",
> +     "ReplayBufParityErr",
> +     "DataParityErr",
> +     "ReplayFifoOverflowErr",
> +     "ReplayFIfoUnderflowErr",
> +     "ElasticFifoOverflowErr",
> +     "DeskewErr",
> +     "FlowCtrlCRCErr",
> +     "DataStartupLimitErr",
> +     "FCInitTimeoutErr",
> +     "RecoveryTimeoutErr",
> +     "ReadySerialTimeoutErr",
> +     "ReadySerialAttemptErr",
> +     "RecoveryAttemptErr",
> +     "RecoveryRelockAttemptErr",
> +     "ReplayAttemptErr",
> +     "SyncHdrErr",
> +     "TxReplayTimeoutErr",
> +     "RxReplayTimeoutErr",
> +     "LinkSubTxTimeoutErr",
> +     "LinkSubRxTimeoutErr",
> +     "RxCMDPktErr",

What happened to those and why aren't they proper words like the other error descriptions?
[naveenk:] Will change these into proper words.

Thx.

--
Regards/Gruss,
    Boris.

https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeople.kernel.org%2Ftglx%2Fnotes-about-netiquette&amp;data=04%7C01%7CNaveenKrishna.Chatradhi%40amd.com%7C9159e5c1aebd47969c2508d914a20789%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637563508427766424%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=%2FiHUZDkg99NnGdDrOCK%2FQWsui2yA1dADCfG%2F4xFr%2B7I%3D&amp;reserved=0

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types.
  2021-05-24 16:41   ` Chatradhi, Naveen Krishna
@ 2021-05-25 18:02     ` Borislav Petkov
  2021-05-25 20:03       ` Yazen Ghannam
  0 siblings, 1 reply; 33+ messages in thread
From: Borislav Petkov @ 2021-05-25 18:02 UTC (permalink / raw)
  To: Chatradhi, Naveen Krishna
  Cc: linux-edac, x86, linux-kernel, mingo, mchehab, M K, Muralidhara

On Mon, May 24, 2021 at 04:41:25PM +0000, Chatradhi, Naveen Krishna wrote:
> [naveenk:] There is a possibility for a heterogenous system with both
> the SMCA_UMC and SMCA_UMC_V2 variant of controllers to exist.

Wait, what? You can have systems with *both* UMCs in the same coherent
fabric and thus the OS can see UMCs of both types on the same system?

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types.
  2021-05-25 18:02     ` Borislav Petkov
@ 2021-05-25 20:03       ` Yazen Ghannam
  2021-05-25 20:12         ` Borislav Petkov
  0 siblings, 1 reply; 33+ messages in thread
From: Yazen Ghannam @ 2021-05-25 20:03 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Chatradhi, Naveen Krishna, linux-edac, x86, linux-kernel, mingo,
	mchehab, M K, Muralidhara

On Tue, May 25, 2021 at 08:02:44PM +0200, Borislav Petkov wrote:
> On Mon, May 24, 2021 at 04:41:25PM +0000, Chatradhi, Naveen Krishna wrote:
> > [naveenk:] There is a possibility for a heterogenous system with both
> > the SMCA_UMC and SMCA_UMC_V2 variant of controllers to exist.
> 
> Wait, what? You can have systems with *both* UMCs in the same coherent
> fabric and thus the OS can see UMCs of both types on the same system?
>

Yep, that's right. The UMCs are the only case of this so far and in the
foreseeable future. Though we may be moving towards more cases like
this.

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types.
  2021-05-25 20:03       ` Yazen Ghannam
@ 2021-05-25 20:12         ` Borislav Petkov
  0 siblings, 0 replies; 33+ messages in thread
From: Borislav Petkov @ 2021-05-25 20:12 UTC (permalink / raw)
  To: Yazen Ghannam
  Cc: Chatradhi, Naveen Krishna, linux-edac, x86, linux-kernel, mingo,
	mchehab, M K, Muralidhara

On Tue, May 25, 2021 at 04:03:27PM -0400, Yazen Ghannam wrote:
> Yep, that's right. The UMCs are the only case of this so far and in the
> foreseeable future. Though we may be moving towards more cases like
> this.

Ok, then you guys really need to make sure those strings are unique
because they're in sysfs:

/sys/devices/system/machinecheck/machinecheck...

AFAIR, so we can't have duplicates there.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v2] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types.
  2021-05-11 15:25 [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Naveen Krishna Chatradhi
                   ` (2 preceding siblings ...)
  2021-05-11 17:27 ` [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Borislav Petkov
@ 2021-05-26 16:46 ` Naveen Krishna Chatradhi
  3 siblings, 0 replies; 33+ messages in thread
From: Naveen Krishna Chatradhi @ 2021-05-26 16:46 UTC (permalink / raw)
  To: linux-edac, x86
  Cc: linux-kernel, bp, mingo, mchehab, Muralidhara M K, Yazen Ghannam,
	Naveen Krishna Chatradhi

From: Muralidhara M K <muralimk@amd.com>

Add the (HWID, MCATYPE) tuples and names for new
SMCA bank types.

Also, add their respective error descriptions to the MCE
decoding module edac_mce_amd.
Also while at it, optimize the string names for some SMCA banks.

Signed-off-by: Muralidhara M K <muralimk@amd.com>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Naveen Krishna Chatradhi  <nchatrad@amd.com>
---
 arch/x86/include/asm/mce.h    |  5 +++
 arch/x86/kernel/cpu/mce/amd.c | 55 ++++++++++++++++-----------
 drivers/edac/mce_amd.c        | 70 +++++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index ddfb3cad8dff..691a9985cb95 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -317,6 +317,7 @@ enum smca_bank_types {
 	SMCA_CS_V2,	/* Coherent Slave */
 	SMCA_PIE,	/* Power, Interrupts, etc. */
 	SMCA_UMC,	/* Unified Memory Controller */
+	SMCA_UMC_V2,	/* Unified Memory Controller V2 */
 	SMCA_PB,	/* Parameter Block */
 	SMCA_PSP,	/* Platform Security Processor */
 	SMCA_PSP_V2,	/* Platform Security Processor */
@@ -325,6 +326,10 @@ enum smca_bank_types {
 	SMCA_MP5,	/* Microprocessor 5 Unit */
 	SMCA_NBIO,	/* Northbridge IO Unit */
 	SMCA_PCIE,	/* PCI Express Unit */
+	SMCA_PCIE_V2,	/* PCI Express Unit */
+	SMCA_XGMI_PCS,	/* xGMI PCS Unit */
+	SMCA_XGMI_PHY,	/* xGMI PHY Unit */
+	SMCA_WAFL_PHY,	/* WAFL PHY Unit */
 	N_SMCA_BANK_TYPES
 };
 
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e486f96b3cb3..d47edf207391 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -77,27 +77,29 @@ struct smca_bank_name {
 };
 
 static struct smca_bank_name smca_names[] = {
-	[SMCA_LS]	= { "load_store",	"Load Store Unit" },
-	[SMCA_LS_V2]	= { "load_store",	"Load Store Unit" },
-	[SMCA_IF]	= { "insn_fetch",	"Instruction Fetch Unit" },
-	[SMCA_L2_CACHE]	= { "l2_cache",		"L2 Cache" },
-	[SMCA_DE]	= { "decode_unit",	"Decode Unit" },
-	[SMCA_RESERVED]	= { "reserved",		"Reserved" },
-	[SMCA_EX]	= { "execution_unit",	"Execution Unit" },
-	[SMCA_FP]	= { "floating_point",	"Floating Point Unit" },
-	[SMCA_L3_CACHE]	= { "l3_cache",		"L3 Cache" },
-	[SMCA_CS]	= { "coherent_slave",	"Coherent Slave" },
-	[SMCA_CS_V2]	= { "coherent_slave",	"Coherent Slave" },
-	[SMCA_PIE]	= { "pie",		"Power, Interrupts, etc." },
-	[SMCA_UMC]	= { "umc",		"Unified Memory Controller" },
-	[SMCA_PB]	= { "param_block",	"Parameter Block" },
-	[SMCA_PSP]	= { "psp",		"Platform Security Processor" },
-	[SMCA_PSP_V2]	= { "psp",		"Platform Security Processor" },
-	[SMCA_SMU]	= { "smu",		"System Management Unit" },
-	[SMCA_SMU_V2]	= { "smu",		"System Management Unit" },
-	[SMCA_MP5]	= { "mp5",		"Microprocessor 5 Unit" },
-	[SMCA_NBIO]	= { "nbio",		"Northbridge IO Unit" },
-	[SMCA_PCIE]	= { "pcie",		"PCI Express Unit" },
+	[SMCA_LS ... SMCA_LS_V2]	= { "load_store",	"Load Store Unit" },
+	[SMCA_IF]			= { "insn_fetch",	"Instruction Fetch Unit" },
+	[SMCA_L2_CACHE]			= { "l2_cache",		"L2 Cache" },
+	[SMCA_DE]			= { "decode_unit",	"Decode Unit" },
+	[SMCA_RESERVED]			= { "reserved",		"Reserved" },
+	[SMCA_EX]			= { "execution_unit",	"Execution Unit" },
+	[SMCA_FP]			= { "floating_point",	"Floating Point Unit" },
+	[SMCA_L3_CACHE]			= { "l3_cache",		"L3 Cache" },
+	[SMCA_CS ... SMCA_CS_V2]	= { "coherent_slave",	"Coherent Slave" },
+	[SMCA_PIE]			= { "pie",		"Power, Interrupts, etc." },
+	[SMCA_UMC]			= { "umc",		"Unified Memory Controller" },
+	[SMCA_UMC_V2]			= { "umc_v2",		"Unified Memory Controller V2" },
+	[SMCA_PB]			= { "param_block",	"Parameter Block" },
+	[SMCA_PSP ... SMCA_PSP_V2]	= { "psp",		"Platform Security Processor" },
+	[SMCA_SMU ... SMCA_SMU_V2]	= { "smu",		"System Management Unit" },
+	[SMCA_MP5]			= { "mp5",		"Microprocessor 5 Unit" },
+	[SMCA_NBIO]			= { "nbio",		"Northbridge IO Unit" },
+	[SMCA_PCIE ... SMCA_PCIE_V2]	= { "pcie",		"PCI Express Unit" },
+	[SMCA_XGMI_PCS]			= { "xgmi_pcs",
+					    "Ext Global Memory Interconnect PCS Unit" },
+	[SMCA_XGMI_PHY]			= { "xgmi_phy",
+					    "Ext Global Memory Interconnect PHY Unit" },
+	[SMCA_WAFL_PHY]			= { "wafl_phy",		"WAFL PHY Unit" },
 };
 
 static const char *smca_get_name(enum smca_bank_types t)
@@ -155,6 +157,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
 
 	/* Unified Memory Controller MCA type */
 	{ SMCA_UMC,	 HWID_MCATYPE(0x96, 0x0)	},
+	{ SMCA_UMC_V2,	 HWID_MCATYPE(0x96, 0x1)	},
 
 	/* Parameter Block MCA type */
 	{ SMCA_PB,	 HWID_MCATYPE(0x05, 0x0)	},
@@ -175,6 +178,16 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
 
 	/* PCI Express Unit MCA type */
 	{ SMCA_PCIE,	 HWID_MCATYPE(0x46, 0x0)	},
+	{ SMCA_PCIE_V2,	 HWID_MCATYPE(0x46, 0x1)	},
+
+	/* xGMI PCS MCA type */
+	{ SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0)	},
+
+	/* xGMI PHY MCA type */
+	{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0)	},
+
+	/* WAFL PHY MCA type */
+	{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0)	},
 };
 
 struct smca_bank smca_banks[MAX_NR_BANKS];
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 5dd905a3f30c..43ba0f931629 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -323,6 +323,21 @@ static const char * const smca_umc_mce_desc[] = {
 	"AES SRAM ECC error",
 };
 
+static const char * const smca_umc2_mce_desc[] = {
+	"DRAM ECC error",
+	"Data poison error",
+	"SDP parity error",
+	"Reserved",
+	"Address/Command parity error",
+	"Write data parity error",
+	"DCQ SRAM ECC error",
+	"Reserved",
+	"Read data parity error",
+	"Rdb SRAM ECC error",
+	"RdRsp SRAM ECC error",
+	"LM32 MP errors",
+};
+
 static const char * const smca_pb_mce_desc[] = {
 	"An ECC error in the Parameter Block RAM array",
 };
@@ -400,6 +415,56 @@ static const char * const smca_pcie_mce_desc[] = {
 	"CCIX Non-okay write response with data error",
 };
 
+static const char * const smca_pcie2_mce_desc[] = {
+	"SDP Parity Error logging",
+};
+
+static const char * const smca_xgmipcs_mce_desc[] = {
+	"Data Loss Error",
+	"Training Error",
+	"Flow Control Acknowledge Error",
+	"Rx Fifo Underflow Error",
+	"Rx Fifo Overflow Error",
+	"CRC Error",
+	"BER Exceeded Error",
+	"Tx Vcid Data Error",
+	"Replay Buffer Parity Error",
+	"Data Parity Error",
+	"Replay Fifo Overflow Error",
+	"Replay FIfo Underflow Error",
+	"Elastic Fifo Overflow Error",
+	"Deskew Error",
+	"Flow Control CRC Error",
+	"Data Startup Limit Error",
+	"FC Init Timeout Error",
+	"Recovery Timeout Error",
+	"Ready Serial Timeout Error",
+	"Ready Serial Attempt Error",
+	"Recovery Attempt Error",
+	"Recovery Relock Attempt Error",
+	"Replay Attempt Error",
+	"Sync Header Error",
+	"Tx Replay Timeout Error",
+	"Rx Replay Timeout Error",
+	"LinkSub Tx Timeout Error",
+	"LinkSub Rx Timeout Error",
+	"Rx CMD Pocket Error",
+};
+
+static const char * const smca_xgmiphy_mce_desc[] = {
+	"RAM ECC Error",
+	"ARC instruction buffer parity error",
+	"ARC data buffer parity error",
+	"PHY APB error",
+};
+
+static const char * const smca_waflphy_mce_desc[] = {
+	"RAM ECC Error",
+	"ARC instruction buffer parity error",
+	"ARC data buffer parity error",
+	"PHY APB error",
+};
+
 struct smca_mce_desc {
 	const char * const *descs;
 	unsigned int num_descs;
@@ -418,6 +483,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
 	[SMCA_CS_V2]	= { smca_cs2_mce_desc,	ARRAY_SIZE(smca_cs2_mce_desc)	},
 	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
 	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
+	[SMCA_UMC_V2]	= { smca_umc2_mce_desc,	ARRAY_SIZE(smca_umc2_mce_desc)	},
 	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
 	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
 	[SMCA_PSP_V2]	= { smca_psp2_mce_desc,	ARRAY_SIZE(smca_psp2_mce_desc)	},
@@ -426,6 +492,10 @@ static struct smca_mce_desc smca_mce_descs[] = {
 	[SMCA_MP5]	= { smca_mp5_mce_desc,	ARRAY_SIZE(smca_mp5_mce_desc)	},
 	[SMCA_NBIO]	= { smca_nbio_mce_desc,	ARRAY_SIZE(smca_nbio_mce_desc)	},
 	[SMCA_PCIE]	= { smca_pcie_mce_desc,	ARRAY_SIZE(smca_pcie_mce_desc)	},
+	[SMCA_PCIE_V2]	= { smca_pcie2_mce_desc,   ARRAY_SIZE(smca_pcie2_mce_desc)	},
+	[SMCA_XGMI_PCS]	= { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc)	},
+	[SMCA_XGMI_PHY]	= { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc)	},
+	[SMCA_WAFL_PHY]	= { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc)	},
 };
 
 static bool f12h_mc0_mce(u16 ec, u8 xec)
-- 
2.26.2


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol
  2021-05-11 15:25 ` [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2 Naveen Krishna Chatradhi
  2021-05-11 17:34   ` Borislav Petkov
@ 2021-09-13  2:13   ` Mukul Joshi
  2021-09-13  2:13     ` [PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS Mukul Joshi
  2021-09-22 11:33     ` [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol Borislav Petkov
  1 sibling, 2 replies; 33+ messages in thread
From: Mukul Joshi @ 2021-09-13  2:13 UTC (permalink / raw)
  To: linux-edac, x86
  Cc: linux-kernel, bp, mingo, mchehab, yazen.ghannam, amd-gfx, Mukul Joshi

Export smca_get_bank_type for use in the AMD GPU
driver to determine MCA bank while handling correctable
and uncorrectable errors in GPU UMC.

v1->v2:
- Drop the function is_smca_umc_v2().
- Drop the patch to introduce a new MCE priority (MCE_PRIO_ACEL)
  for GPU/accelarator cards.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
---
 arch/x86/include/asm/mce.h    | 2 +-
 arch/x86/kernel/cpu/mce/amd.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index fc3d36f1f9d0..d90d3ccb583a 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -358,7 +358,7 @@ extern int mce_threshold_remove_device(unsigned int cpu);
 
 void mce_amd_feature_init(struct cpuinfo_x86 *c);
 int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr);
-
+enum smca_bank_types smca_get_bank_type(unsigned int bank);
 #else
 
 static inline int mce_threshold_create_device(unsigned int cpu)		{ return 0; };
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 67a337672ee4..c9272c53026e 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -123,7 +123,7 @@ const char *smca_get_long_name(enum smca_bank_types t)
 }
 EXPORT_SYMBOL_GPL(smca_get_long_name);
 
-static enum smca_bank_types smca_get_bank_type(unsigned int bank)
+enum smca_bank_types smca_get_bank_type(unsigned int bank)
 {
 	struct smca_bank *b;
 
@@ -136,6 +136,7 @@ static enum smca_bank_types smca_get_bank_type(unsigned int bank)
 
 	return b->hwid->bank_type;
 }
+EXPORT_SYMBOL_GPL(smca_get_bank_type);
 
 static struct smca_hwid smca_hwid_mcatypes[] = {
 	/* { bank_type, hwid_mcatype } */
-- 
2.17.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-13  2:13   ` [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol Mukul Joshi
@ 2021-09-13  2:13     ` Mukul Joshi
  2021-09-22 11:40       ` Borislav Petkov
  2021-09-22 19:36       ` [PATCHv3 " Mukul Joshi
  2021-09-22 11:33     ` [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol Borislav Petkov
  1 sibling, 2 replies; 33+ messages in thread
From: Mukul Joshi @ 2021-09-13  2:13 UTC (permalink / raw)
  To: linux-edac, x86
  Cc: linux-kernel, bp, mingo, mchehab, yazen.ghannam, amd-gfx, Mukul Joshi

On Aldebaran, GPU driver will handle bad page retirement
even though UMC is host managed. As a result, register a
bad page retirement handler on the mce notifier chain to
retire bad pages on Aldebaran.

v1->v2:
- Use smca_get_bank_type() to determine MCA bank.
- Envelope the changes under #ifdef CONFIG_X86_MCE_AMD.
- Use MCE_PRIORITY_UC instead of MCE_PRIO_ACCEL as we are
  only handling uncorrectable errors.
- Use macros to determine UMC instance and channel instance
  where the uncorrectable error occured.
- Update the headline.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Link: https://lore.kernel.org/amd-gfx/20210512013058.6827-1-mukul.joshi@amd.com/
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 142 ++++++++++++++++++++++++
 1 file changed, 142 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b5332db4d287..35cfcc71ff94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -35,7 +35,11 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "atom.h"
+#ifdef CONFIG_X86_MCE_AMD
+#include <asm/mce.h>
 
+static bool notifier_registered;
+#endif
 static const char *RAS_FS_NAME = "ras";
 
 const char *ras_error_string[] = {
@@ -86,6 +90,9 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 				uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
+#ifdef CONFIG_X86_MCE_AMD
+static void amdgpu_register_bad_pages_mca_notifier(void);
+#endif
 
 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
 {
@@ -2018,6 +2025,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 			adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
 	}
 
+#ifdef CONFIG_X86_MCE_AMD
+	if ((adev->asic_type == CHIP_ALDEBARAN) &&
+	    (adev->gmc.xgmi.connected_to_cpu))
+		amdgpu_register_bad_pages_mca_notifier();
+#endif
 	return 0;
 
 free:
@@ -2511,3 +2523,133 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
 		kfree(con);
 	}
 }
+
+#ifdef CONFIG_X86_MCE_AMD
+static struct amdgpu_device *find_adev(uint32_t node_id)
+{
+	struct amdgpu_gpu_instance *gpu_instance;
+	int i;
+	struct amdgpu_device *adev = NULL;
+
+	mutex_lock(&mgpu_info.mutex);
+
+	for (i = 0; i < mgpu_info.num_gpu; i++) {
+		gpu_instance = &(mgpu_info.gpu_ins[i]);
+		adev = gpu_instance->adev;
+
+		if (adev->gmc.xgmi.connected_to_cpu &&
+		    adev->gmc.xgmi.physical_node_id == node_id)
+			break;
+		adev = NULL;
+	}
+
+	mutex_unlock(&mgpu_info.mutex);
+
+	return adev;
+}
+
+#define GET_MCA_IPID_GPUID(m)	(((m) >> 44) & 0xF)
+#define GET_UMC_INST(m)		(((m) >> 21) & 0x7)
+#define GET_CHAN_INDEX(m)	((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
+#define GPU_ID_OFFSET		8
+
+static int amdgpu_bad_page_notifier(struct notifier_block *nb,
+				    unsigned long val, void *data)
+{
+	struct mce *m = (struct mce *)data;
+	struct amdgpu_device *adev = NULL;
+	uint32_t gpu_id = 0;
+	uint32_t umc_inst = 0;
+	uint32_t ch_inst, channel_index = 0;
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct eeprom_table_record err_rec;
+	uint64_t retired_page;
+
+	/*
+	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
+	 * and error occurred in DramECC (Extended error code = 0) then only
+	 * process the error, else bail out.
+	 */
+	if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
+		    (XEC(m->status, 0x1f) == 0x0)))
+		return NOTIFY_DONE;
+
+	/*
+	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
+	 */
+	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
+
+	adev = find_adev(gpu_id);
+	if (!adev) {
+		dev_warn(adev->dev, "%s: Unable to find adev for gpu_id: %d\n",
+				     __func__, gpu_id);
+		return NOTIFY_DONE;
+	}
+
+	/*
+	 * If it is correctable error, return.
+	 */
+	if (mce_is_correctable(m)) {
+		return NOTIFY_OK;
+	}
+
+	/*
+	 * If it is uncorrectable error, then find out UMC instance and
+	 * channel index.
+	 */
+	umc_inst = GET_UMC_INST(m->ipid);
+	ch_inst = GET_CHAN_INDEX(m->ipid);
+
+	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
+			     umc_inst, ch_inst);
+
+	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	channel_index =
+		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
+					  + ch_inst];
+
+	retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
+			ADDR_OF_256B_BLOCK(channel_index) |
+			OFFSET_IN_256B_BLOCK(m->addr);
+
+	err_rec.address = m->addr;
+	err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+	err_rec.ts = (uint64_t)ktime_get_real_seconds();
+	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+	err_rec.cu = 0;
+	err_rec.mem_channel = channel_index;
+	err_rec.mcumc_id = umc_inst;
+
+	err_data.err_addr = &err_rec;
+	err_data.err_addr_cnt = 1;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block amdgpu_bad_page_nb = {
+	.notifier_call  = amdgpu_bad_page_notifier,
+	.priority       = MCE_PRIO_UC,
+};
+
+static void amdgpu_register_bad_pages_mca_notifier(void)
+{
+	/*
+	 * Register the x86 notifier only once
+	 * with MCE subsystem.
+	 */
+	if (notifier_registered == false) {
+		mce_register_decode_chain(&amdgpu_bad_page_nb);
+		notifier_registered = true;
+	}
+}
+#endif
-- 
2.17.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol
  2021-09-13  2:13   ` [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol Mukul Joshi
  2021-09-13  2:13     ` [PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS Mukul Joshi
@ 2021-09-22 11:33     ` Borislav Petkov
  2021-09-22 16:27       ` Deucher, Alexander
  1 sibling, 1 reply; 33+ messages in thread
From: Borislav Petkov @ 2021-09-22 11:33 UTC (permalink / raw)
  To: Mukul Joshi, Alex Deucher
  Cc: linux-edac, x86, linux-kernel, mingo, mchehab, yazen.ghannam, amd-gfx

On Sun, Sep 12, 2021 at 10:13:10PM -0400, Mukul Joshi wrote:
> Export smca_get_bank_type for use in the AMD GPU
> driver to determine MCA bank while handling correctable
> and uncorrectable errors in GPU UMC.
> 
> v1->v2:
> - Drop the function is_smca_umc_v2().
> - Drop the patch to introduce a new MCE priority (MCE_PRIO_ACEL)
>   for GPU/accelarator cards.

Patch changelog information goes...

> 
> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> ---

... under this line so that it gets automatically removed by git when
applying the patch.

Alex, how do you wanna handle this?

Want me to ACK this and you can carry it through your tree along with
the second patch?

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-13  2:13     ` [PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS Mukul Joshi
@ 2021-09-22 11:40       ` Borislav Petkov
  2021-09-22 19:43         ` Joshi, Mukul
  2021-09-22 19:36       ` [PATCHv3 " Mukul Joshi
  1 sibling, 1 reply; 33+ messages in thread
From: Borislav Petkov @ 2021-09-22 11:40 UTC (permalink / raw)
  To: Mukul Joshi
  Cc: linux-edac, x86, linux-kernel, mingo, mchehab, yazen.ghannam, amd-gfx

On Sun, Sep 12, 2021 at 10:13:11PM -0400, Mukul Joshi wrote:
> On Aldebaran, GPU driver will handle bad page retirement
> even though UMC is host managed. As a result, register a
> bad page retirement handler on the mce notifier chain to
> retire bad pages on Aldebaran.
> 
> v1->v2:
> - Use smca_get_bank_type() to determine MCA bank.
> - Envelope the changes under #ifdef CONFIG_X86_MCE_AMD.
> - Use MCE_PRIORITY_UC instead of MCE_PRIO_ACCEL as we are
>   only handling uncorrectable errors.
> - Use macros to determine UMC instance and channel instance
>   where the uncorrectable error occured.
> - Update the headline.

Same note as for the previous patch.

> +static int amdgpu_bad_page_notifier(struct notifier_block *nb,
> +				    unsigned long val, void *data)
> +{
> +	struct mce *m = (struct mce *)data;
> +	struct amdgpu_device *adev = NULL;
> +	uint32_t gpu_id = 0;
> +	uint32_t umc_inst = 0;
> +	uint32_t ch_inst, channel_index = 0;
> +	struct ras_err_data err_data = {0, 0, 0, NULL};
> +	struct eeprom_table_record err_rec;
> +	uint64_t retired_page;
> +
> +	/*
> +	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
> +	 * and error occurred in DramECC (Extended error code = 0) then only
> +	 * process the error, else bail out.
> +	 */
> +	if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
> +		    (XEC(m->status, 0x1f) == 0x0)))
> +		return NOTIFY_DONE;
> +
> +	/*
> +	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
> +	 */
> +	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
> +
> +	adev = find_adev(gpu_id);
> +	if (!adev) {
> +		dev_warn(adev->dev, "%s: Unable to find adev for gpu_id: %d\n",
> +				     __func__, gpu_id);
> +		return NOTIFY_DONE;
> +	}
> +
> +	/*
> +	 * If it is correctable error, return.
> +	 */
> +	if (mce_is_correctable(m)) {
> +		return NOTIFY_OK;
> +	}

This can run before you find_adev().

> +static void amdgpu_register_bad_pages_mca_notifier(void)
> +{
> +	/*
> +	 * Register the x86 notifier only once
> +	 * with MCE subsystem.
> +	 */
> +	if (notifier_registered == false) {
> +		mce_register_decode_chain(&amdgpu_bad_page_nb);
> +		notifier_registered = true;
> +	}

I have a patchset which will get rid of the need to do this silliness -
only if I had some time to actually prepare it for submission... :-\

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol
  2021-09-22 11:33     ` [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol Borislav Petkov
@ 2021-09-22 16:27       ` Deucher, Alexander
  2021-09-22 16:43         ` Borislav Petkov
  0 siblings, 1 reply; 33+ messages in thread
From: Deucher, Alexander @ 2021-09-22 16:27 UTC (permalink / raw)
  To: Borislav Petkov, Joshi, Mukul, Alex Deucher
  Cc: linux-edac, x86, linux-kernel, mingo, mchehab, Ghannam, Yazen, amd-gfx

[Public]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Borislav Petkov
> Sent: Wednesday, September 22, 2021 7:34 AM
> To: Joshi, Mukul <Mukul.Joshi@amd.com>; Alex Deucher
> <alexdeucher@gmail.com>
> Cc: linux-edac@vger.kernel.org; x86@kernel.org; linux-
> kernel@vger.kernel.org; mingo@redhat.com; mchehab@kernel.org;
> Ghannam, Yazen <Yazen.Ghannam@amd.com>; amd-
> gfx@lists.freedesktop.org
> Subject: Re: [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type
> symbol
> 
> On Sun, Sep 12, 2021 at 10:13:10PM -0400, Mukul Joshi wrote:
> > Export smca_get_bank_type for use in the AMD GPU driver to determine
> > MCA bank while handling correctable and uncorrectable errors in GPU
> > UMC.
> >
> > v1->v2:
> > - Drop the function is_smca_umc_v2().
> > - Drop the patch to introduce a new MCE priority (MCE_PRIO_ACEL)
> >   for GPU/accelarator cards.
> 
> Patch changelog information goes...
> 
> >
> > Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> > ---
> 
> ... under this line so that it gets automatically removed by git when applying
> the patch.
> 
> Alex, how do you wanna handle this?
> 
> Want me to ACK this and you can carry it through your tree along with the
> second patch?

That would be great.  Thanks!

Alex

> 
> --
> Regards/Gruss,
>     Boris.
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeo
> ple.kernel.org%2Ftglx%2Fnotes-about-
> netiquette&amp;data=04%7C01%7Calexander.deucher%40amd.com%7C12b
> cf4eeffad4e2533b508d97dca1cf4%7C3dd8961fe4884e608e11a82d994e183d%
> 7C0%7C0%7C637679129761221057%7CUnknown%7CTWFpbGZsb3d8eyJWIjoi
> MC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C200
> 0&amp;sdata=tK9aK%2FHf5RimF%2FenuTGeJSFFmRuk86Q%2BqY9Jt23gKMQ
> %3D&amp;reserved=0

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol
  2021-09-22 16:27       ` Deucher, Alexander
@ 2021-09-22 16:43         ` Borislav Petkov
  2021-09-22 16:47           ` Joshi, Mukul
  0 siblings, 1 reply; 33+ messages in thread
From: Borislav Petkov @ 2021-09-22 16:43 UTC (permalink / raw)
  To: Deucher, Alexander
  Cc: Joshi, Mukul, Alex Deucher, linux-edac, x86, linux-kernel, mingo,
	mchehab, Ghannam, Yazen, amd-gfx

On Wed, Sep 22, 2021 at 04:27:34PM +0000, Deucher, Alexander wrote:
> > On Sun, Sep 12, 2021 at 10:13:10PM -0400, Mukul Joshi wrote:
> > > Export smca_get_bank_type for use in the AMD GPU driver to determine
> > > MCA bank while handling correctable and uncorrectable errors in GPU
> > > UMC.
> > >
> > > v1->v2:
> > > - Drop the function is_smca_umc_v2().
> > > - Drop the patch to introduce a new MCE priority (MCE_PRIO_ACEL)
> > >   for GPU/accelarator cards.
> > 
> > Patch changelog information goes...
> > 
> > >
> > > Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> > > ---
> > 
> > ... under this line so that it gets automatically removed by git when applying
> > the patch.
> > 
> > Alex, how do you wanna handle this?
> > 
> > Want me to ACK this and you can carry it through your tree along with the
> > second patch?
> 
> That would be great.  Thanks!

Ok, with the above changelog removed:

Acked-by: Borislav Petkov <bp@suse.de>

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol
  2021-09-22 16:43         ` Borislav Petkov
@ 2021-09-22 16:47           ` Joshi, Mukul
  0 siblings, 0 replies; 33+ messages in thread
From: Joshi, Mukul @ 2021-09-22 16:47 UTC (permalink / raw)
  To: Borislav Petkov, Deucher, Alexander
  Cc: Alex Deucher, linux-edac, x86, linux-kernel, mingo, mchehab,
	Ghannam, Yazen, amd-gfx

[AMD Official Use Only]



> -----Original Message-----
> From: Borislav Petkov <bp@alien8.de>
> Sent: Wednesday, September 22, 2021 12:44 PM
> To: Deucher, Alexander <Alexander.Deucher@amd.com>
> Cc: Joshi, Mukul <Mukul.Joshi@amd.com>; Alex Deucher
> <alexdeucher@gmail.com>; linux-edac@vger.kernel.org; x86@kernel.org; linux-
> kernel@vger.kernel.org; mingo@redhat.com; mchehab@kernel.org; Ghannam,
> Yazen <Yazen.Ghannam@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol
> 
> [CAUTION: External Email]
> 
> On Wed, Sep 22, 2021 at 04:27:34PM +0000, Deucher, Alexander wrote:
> > > On Sun, Sep 12, 2021 at 10:13:10PM -0400, Mukul Joshi wrote:
> > > > Export smca_get_bank_type for use in the AMD GPU driver to
> > > > determine MCA bank while handling correctable and uncorrectable
> > > > errors in GPU UMC.
> > > >
> > > > v1->v2:
> > > > - Drop the function is_smca_umc_v2().
> > > > - Drop the patch to introduce a new MCE priority (MCE_PRIO_ACEL)
> > > >   for GPU/accelarator cards.
> > >
> > > Patch changelog information goes...
> > >
> > > >
> > > > Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> > > > ---
> > >
> > > ... under this line so that it gets automatically removed by git
> > > when applying the patch.
> > >
> > > Alex, how do you wanna handle this?
> > >
> > > Want me to ACK this and you can carry it through your tree along
> > > with the second patch?
> >
> > That would be great.  Thanks!
> 
> Ok, with the above changelog removed:
> 
> Acked-by: Borislav Petkov <bp@suse.de>
> 
> Thx.
> 
Thank you so much! 
I will make sure to remove the changelog.
And I will send the updated version for the second patch soon.

Regards,
Mukul

> --
> Regards/Gruss,
>     Boris.
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeople.
> kernel.org%2Ftglx%2Fnotes-about-
> netiquette&amp;data=04%7C01%7CMukul.Joshi%40amd.com%7C3dc61ec5018f
> 487ec06e08d97de83039%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0
> %7C637679258473597532%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
> MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata
> =4JqFDJpxM%2Bzl4%2BZoaC3tTwScEhRy2Aa7xJaNJn3rxbE%3D&amp;reserved=
> 0

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-13  2:13     ` [PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS Mukul Joshi
  2021-09-22 11:40       ` Borislav Petkov
@ 2021-09-22 19:36       ` Mukul Joshi
  2021-09-23 14:29         ` Yazen Ghannam
  2021-09-23 22:04         ` [PATCHv4 " Mukul Joshi
  1 sibling, 2 replies; 33+ messages in thread
From: Mukul Joshi @ 2021-09-22 19:36 UTC (permalink / raw)
  To: linux-edac, x86
  Cc: linux-kernel, bp, mingo, mchehab, yazen.ghannam, amd-gfx, Mukul Joshi

On Aldebaran, GPU driver will handle bad page retirement
even though UMC is host managed. As a result, register a
bad page retirement handler on the mce notifier chain to
retire bad pages on Aldebaran.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
---
v1->v2:
- Use smca_get_bank_type() to determine MCA bank.
- Envelope the changes under #ifdef CONFIG_X86_MCE_AMD.
- Use MCE_PRIORITY_UC instead of MCE_PRIO_ACCEL as we are
  only handling uncorrectable errors.
- Use macros to determine UMC instance and channel instance
  where the uncorrectable error occured.

v2->v3:
- Move the check for correctable error before find_adev().
- Fix a NULL pointer dereference if find_adev() returns NULL.

 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 141 ++++++++++++++++++++++++
 1 file changed, 141 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 912ea1f9fd04..c1e806762e41 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -35,7 +35,11 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "atom.h"
+#ifdef CONFIG_X86_MCE_AMD
+#include <asm/mce.h>
 
+static bool notifier_registered;
+#endif
 static const char *RAS_FS_NAME = "ras";
 
 const char *ras_error_string[] = {
@@ -107,6 +111,9 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 				uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
+#ifdef CONFIG_X86_MCE_AMD
+static void amdgpu_register_bad_pages_mca_notifier(void);
+#endif
 
 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
 {
@@ -2089,6 +2096,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 			adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
 	}
 
+#ifdef CONFIG_X86_MCE_AMD
+	if ((adev->asic_type == CHIP_ALDEBARAN) &&
+	    (adev->gmc.xgmi.connected_to_cpu))
+		amdgpu_register_bad_pages_mca_notifier();
+#endif
 	return 0;
 
 free:
@@ -2583,3 +2595,132 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
 		kfree(con);
 	}
 }
+
+#ifdef CONFIG_X86_MCE_AMD
+static struct amdgpu_device *find_adev(uint32_t node_id)
+{
+	struct amdgpu_gpu_instance *gpu_instance;
+	int i;
+	struct amdgpu_device *adev = NULL;
+
+	mutex_lock(&mgpu_info.mutex);
+
+	for (i = 0; i < mgpu_info.num_gpu; i++) {
+		gpu_instance = &(mgpu_info.gpu_ins[i]);
+		adev = gpu_instance->adev;
+
+		if (adev->gmc.xgmi.connected_to_cpu &&
+		    adev->gmc.xgmi.physical_node_id == node_id)
+			break;
+		adev = NULL;
+	}
+
+	mutex_unlock(&mgpu_info.mutex);
+
+	return adev;
+}
+
+#define GET_MCA_IPID_GPUID(m)	(((m) >> 44) & 0xF)
+#define GET_UMC_INST(m)		(((m) >> 21) & 0x7)
+#define GET_CHAN_INDEX(m)	((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
+#define GPU_ID_OFFSET		8
+
+static int amdgpu_bad_page_notifier(struct notifier_block *nb,
+				    unsigned long val, void *data)
+{
+	struct mce *m = (struct mce *)data;
+	struct amdgpu_device *adev = NULL;
+	uint32_t gpu_id = 0;
+	uint32_t umc_inst = 0;
+	uint32_t ch_inst, channel_index = 0;
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct eeprom_table_record err_rec;
+	uint64_t retired_page;
+
+	/*
+	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
+	 * and error occurred in DramECC (Extended error code = 0) then only
+	 * process the error, else bail out.
+	 */
+	if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
+		    (XEC(m->status, 0x1f) == 0x0)))
+		return NOTIFY_DONE;
+
+	/*
+	 * If it is correctable error, return.
+	 */
+	if (mce_is_correctable(m))
+		return NOTIFY_OK;
+
+	/*
+	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
+	 */
+	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
+
+	adev = find_adev(gpu_id);
+	if (!adev) {
+		DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
+								gpu_id);
+		return NOTIFY_DONE;
+	}
+
+	/*
+	 * If it is uncorrectable error, then find out UMC instance and
+	 * channel index.
+	 */
+	umc_inst = GET_UMC_INST(m->ipid);
+	ch_inst = GET_CHAN_INDEX(m->ipid);
+
+	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
+			     umc_inst, ch_inst);
+
+	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	channel_index =
+		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
+					  + ch_inst];
+
+	retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
+			ADDR_OF_256B_BLOCK(channel_index) |
+			OFFSET_IN_256B_BLOCK(m->addr);
+
+	err_rec.address = m->addr;
+	err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+	err_rec.ts = (uint64_t)ktime_get_real_seconds();
+	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+	err_rec.cu = 0;
+	err_rec.mem_channel = channel_index;
+	err_rec.mcumc_id = umc_inst;
+
+	err_data.err_addr = &err_rec;
+	err_data.err_addr_cnt = 1;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block amdgpu_bad_page_nb = {
+	.notifier_call  = amdgpu_bad_page_notifier,
+	.priority       = MCE_PRIO_UC,
+};
+
+static void amdgpu_register_bad_pages_mca_notifier(void)
+{
+	/*
+	 * Register the x86 notifier only once
+	 * with MCE subsystem.
+	 */
+	if (notifier_registered == false) {
+		mce_register_decode_chain(&amdgpu_bad_page_nb);
+		notifier_registered = true;
+	}
+}
+#endif
-- 
2.17.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-22 11:40       ` Borislav Petkov
@ 2021-09-22 19:43         ` Joshi, Mukul
  0 siblings, 0 replies; 33+ messages in thread
From: Joshi, Mukul @ 2021-09-22 19:43 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: linux-edac, x86, linux-kernel, mingo, mchehab, Ghannam, Yazen, amd-gfx

[AMD Official Use Only]



> -----Original Message-----
> From: Borislav Petkov <bp@alien8.de>
> Sent: Wednesday, September 22, 2021 7:41 AM
> To: Joshi, Mukul <Mukul.Joshi@amd.com>
> Cc: linux-edac@vger.kernel.org; x86@kernel.org; linux-kernel@vger.kernel.org;
> mingo@redhat.com; mchehab@kernel.org; Ghannam, Yazen
> <Yazen.Ghannam@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran
> RAS
> 
> [CAUTION: External Email]
> 
> On Sun, Sep 12, 2021 at 10:13:11PM -0400, Mukul Joshi wrote:
> > On Aldebaran, GPU driver will handle bad page retirement even though
> > UMC is host managed. As a result, register a bad page retirement
> > handler on the mce notifier chain to retire bad pages on Aldebaran.
> >
> > v1->v2:
> > - Use smca_get_bank_type() to determine MCA bank.
> > - Envelope the changes under #ifdef CONFIG_X86_MCE_AMD.
> > - Use MCE_PRIORITY_UC instead of MCE_PRIO_ACCEL as we are
> >   only handling uncorrectable errors.
> > - Use macros to determine UMC instance and channel instance
> >   where the uncorrectable error occured.
> > - Update the headline.
> 
> Same note as for the previous patch.
> 
Acked.

> > +static int amdgpu_bad_page_notifier(struct notifier_block *nb,
> > +                                 unsigned long val, void *data) {
> > +     struct mce *m = (struct mce *)data;
> > +     struct amdgpu_device *adev = NULL;
> > +     uint32_t gpu_id = 0;
> > +     uint32_t umc_inst = 0;
> > +     uint32_t ch_inst, channel_index = 0;
> > +     struct ras_err_data err_data = {0, 0, 0, NULL};
> > +     struct eeprom_table_record err_rec;
> > +     uint64_t retired_page;
> > +
> > +     /*
> > +      * If the error was generated in UMC_V2, which belongs to GPU UMCs,
> > +      * and error occurred in DramECC (Extended error code = 0) then only
> > +      * process the error, else bail out.
> > +      */
> > +     if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
> > +                 (XEC(m->status, 0x1f) == 0x0)))
> > +             return NOTIFY_DONE;
> > +
> > +     /*
> > +      * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
> > +      */
> > +     gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
> > +
> > +     adev = find_adev(gpu_id);
> > +     if (!adev) {
> > +             dev_warn(adev->dev, "%s: Unable to find adev for gpu_id: %d\n",
> > +                                  __func__, gpu_id);
> > +             return NOTIFY_DONE;
> > +     }
> > +
> > +     /*
> > +      * If it is correctable error, return.
> > +      */
> > +     if (mce_is_correctable(m)) {
> > +             return NOTIFY_OK;
> > +     }
> 
> This can run before you find_adev().
> 
Acked.

> > +static void amdgpu_register_bad_pages_mca_notifier(void)
> > +{
> > +     /*
> > +      * Register the x86 notifier only once
> > +      * with MCE subsystem.
> > +      */
> > +     if (notifier_registered == false) {
> > +             mce_register_decode_chain(&amdgpu_bad_page_nb);
> > +             notifier_registered = true;
> > +     }
> 
> I have a patchset which will get rid of the need to do this silliness - only if I had
> some time to actually prepare it for submission... :-\
> 
:-\

Thank you.

Regards,
Mukul
> --
> Regards/Gruss,
>     Boris.
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeople.
> kernel.org%2Ftglx%2Fnotes-about-
> netiquette&amp;data=04%7C01%7Cmukul.joshi%40amd.com%7C7ae9c87153f7
> 4572712908d97dbdcc7a%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0
> %7C637679076423976867%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
> MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata
> =M5UDrSea0lnEi5%2BhU4ck0zk1dZD9kX4DUoXt95J6dJ4%3D&amp;reserved=0

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-22 19:36       ` [PATCHv3 " Mukul Joshi
@ 2021-09-23 14:29         ` Yazen Ghannam
  2021-09-23 14:37           ` Borislav Petkov
  2021-09-23 15:30           ` Joshi, Mukul
  2021-09-23 22:04         ` [PATCHv4 " Mukul Joshi
  1 sibling, 2 replies; 33+ messages in thread
From: Yazen Ghannam @ 2021-09-23 14:29 UTC (permalink / raw)
  To: Mukul Joshi; +Cc: linux-edac, x86, linux-kernel, bp, mingo, mchehab, amd-gfx

On Wed, Sep 22, 2021 at 03:36:20PM -0400, Mukul Joshi wrote:
> On Aldebaran, GPU driver will handle bad page retirement
> even though UMC is host managed. As a result, register a
> bad page retirement handler on the mce notifier chain to
> retire bad pages on Aldebaran.
> 

I think this should state that the driver will do page retirement for
GPU-managed memory. As written, it implies that the driver do page retirement
in general for the system.

...

> +
> +static int amdgpu_bad_page_notifier(struct notifier_block *nb,
> +				    unsigned long val, void *data)
> +{
> +	struct mce *m = (struct mce *)data;
> +	struct amdgpu_device *adev = NULL;
> +	uint32_t gpu_id = 0;
> +	uint32_t umc_inst = 0;
> +	uint32_t ch_inst, channel_index = 0;
> +	struct ras_err_data err_data = {0, 0, 0, NULL};
> +	struct eeprom_table_record err_rec;
> +	uint64_t retired_page;
> +
> +	/*
> +	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
> +	 * and error occurred in DramECC (Extended error code = 0) then only
> +	 * process the error, else bail out.
> +	 */
> +	if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
> +		    (XEC(m->status, 0x1f) == 0x0)))

The MCA_STATUS[ErrorCodeExt] field is bits [21:16], so the mask should be
0x3f.

> +		return NOTIFY_DONE;
> +
> +	/*
> +	 * If it is correctable error, return.
> +	 */
> +	if (mce_is_correctable(m))
> +		return NOTIFY_OK;

Shouldn't this be "NOTIFY_DONE" if "don't care" about this error?

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-23 14:29         ` Yazen Ghannam
@ 2021-09-23 14:37           ` Borislav Petkov
  2021-09-23 15:31             ` Joshi, Mukul
  2021-09-23 15:30           ` Joshi, Mukul
  1 sibling, 1 reply; 33+ messages in thread
From: Borislav Petkov @ 2021-09-23 14:37 UTC (permalink / raw)
  To: Yazen Ghannam
  Cc: Mukul Joshi, linux-edac, x86, linux-kernel, mingo, mchehab, amd-gfx

On Thu, Sep 23, 2021 at 02:29:07PM +0000, Yazen Ghannam wrote:
> > +	/*
> > +	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
> > +	 * and error occurred in DramECC (Extended error code = 0) then only
> > +	 * process the error, else bail out.
> > +	 */
> > +	if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
> > +		    (XEC(m->status, 0x1f) == 0x0)))
> 
> The MCA_STATUS[ErrorCodeExt] field is bits [21:16], so the mask should be
> 0x3f.
> 
> > +		return NOTIFY_DONE;
> > +
> > +	/*
> > +	 * If it is correctable error, return.
> > +	 */
> > +	if (mce_is_correctable(m))
> > +		return NOTIFY_OK;
> 
> Shouldn't this be "NOTIFY_DONE" if "don't care" about this error?

I think the logic here is to stop calling any further consumers on the
notify chain because this is a GPU correctable error and they can't do
anything about it anyway, right?

Or am I misreading it?

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-23 14:29         ` Yazen Ghannam
  2021-09-23 14:37           ` Borislav Petkov
@ 2021-09-23 15:30           ` Joshi, Mukul
  2021-09-23 17:23             ` Yazen Ghannam
  1 sibling, 1 reply; 33+ messages in thread
From: Joshi, Mukul @ 2021-09-23 15:30 UTC (permalink / raw)
  To: Ghannam, Yazen; +Cc: linux-edac, x86, linux-kernel, bp, mingo, mchehab, amd-gfx

[AMD Official Use Only]



> -----Original Message-----
> From: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Sent: Thursday, September 23, 2021 10:29 AM
> To: Joshi, Mukul <Mukul.Joshi@amd.com>
> Cc: linux-edac@vger.kernel.org; x86@kernel.org; linux-kernel@vger.kernel.org;
> bp@alien8.de; mingo@redhat.com; mchehab@kernel.org; amd-
> gfx@lists.freedesktop.org
> Subject: Re: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran
> RAS
> 
> On Wed, Sep 22, 2021 at 03:36:20PM -0400, Mukul Joshi wrote:
> > On Aldebaran, GPU driver will handle bad page retirement even though
> > UMC is host managed. As a result, register a bad page retirement
> > handler on the mce notifier chain to retire bad pages on Aldebaran.
> >
> 
> I think this should state that the driver will do page retirement for GPU-managed
> memory. As written, it implies that the driver do page retirement in general for
> the system.
> 
ACK. I will update the description.
> ...
> 
> > +
> > +static int amdgpu_bad_page_notifier(struct notifier_block *nb,
> > +				    unsigned long val, void *data) {
> > +	struct mce *m = (struct mce *)data;
> > +	struct amdgpu_device *adev = NULL;
> > +	uint32_t gpu_id = 0;
> > +	uint32_t umc_inst = 0;
> > +	uint32_t ch_inst, channel_index = 0;
> > +	struct ras_err_data err_data = {0, 0, 0, NULL};
> > +	struct eeprom_table_record err_rec;
> > +	uint64_t retired_page;
> > +
> > +	/*
> > +	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
> > +	 * and error occurred in DramECC (Extended error code = 0) then only
> > +	 * process the error, else bail out.
> > +	 */
> > +	if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
> > +		    (XEC(m->status, 0x1f) == 0x0)))
> 
> The MCA_STATUS[ErrorCodeExt] field is bits [21:16], so the mask should be
> 0x3f.

Ack. Thanks for catching this.
> 
> > +		return NOTIFY_DONE;
> > +
> > +	/*
> > +	 * If it is correctable error, return.
> > +	 */
> > +	if (mce_is_correctable(m))
> > +		return NOTIFY_OK;
> 
> Shouldn't this be "NOTIFY_DONE" if "don't care" about this error?

The thinking is we want to stop calling further consumers since it's a correctable error in GPU UMC and we are not taking any action about the correctable errors.

Thanks,
Mukul

> 
> Thanks,
> Yazen

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-23 14:37           ` Borislav Petkov
@ 2021-09-23 15:31             ` Joshi, Mukul
  0 siblings, 0 replies; 33+ messages in thread
From: Joshi, Mukul @ 2021-09-23 15:31 UTC (permalink / raw)
  To: Borislav Petkov, Ghannam, Yazen
  Cc: linux-edac, x86, linux-kernel, mingo, mchehab, amd-gfx

[AMD Official Use Only]



> -----Original Message-----
> From: Borislav Petkov <bp@alien8.de>
> Sent: Thursday, September 23, 2021 10:37 AM
> To: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Cc: Joshi, Mukul <Mukul.Joshi@amd.com>; linux-edac@vger.kernel.org;
> x86@kernel.org; linux-kernel@vger.kernel.org; mingo@redhat.com;
> mchehab@kernel.org; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran
> RAS
> 
> [CAUTION: External Email]
> 
> On Thu, Sep 23, 2021 at 02:29:07PM +0000, Yazen Ghannam wrote:
> > > +   /*
> > > +    * If the error was generated in UMC_V2, which belongs to GPU UMCs,
> > > +    * and error occurred in DramECC (Extended error code = 0) then only
> > > +    * process the error, else bail out.
> > > +    */
> > > +   if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
> > > +               (XEC(m->status, 0x1f) == 0x0)))
> >
> > The MCA_STATUS[ErrorCodeExt] field is bits [21:16], so the mask should
> > be 0x3f.
> >
> > > +           return NOTIFY_DONE;
> > > +
> > > +   /*
> > > +    * If it is correctable error, return.
> > > +    */
> > > +   if (mce_is_correctable(m))
> > > +           return NOTIFY_OK;
> >
> > Shouldn't this be "NOTIFY_DONE" if "don't care" about this error?
> 
> I think the logic here is to stop calling any further consumers on the notify chain
> because this is a GPU correctable error and they can't do anything about it
> anyway, right?

That's correct.

Regards,
Mukul

> 
> Or am I misreading it?
> 
> --
> Regards/Gruss,
>     Boris.
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeople.
> kernel.org%2Ftglx%2Fnotes-about-
> netiquette&amp;data=04%7C01%7Cmukul.joshi%40amd.com%7C1493bcdae62
> 8466d5b8408d97e9fa6f0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0
> %7C637680046455034969%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
> MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata
> =uqbjJslUPJabZonJc6m9Ub3C5IkZDPdqwr6MI0oLPcc%3D&amp;reserved=0

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-23 15:30           ` Joshi, Mukul
@ 2021-09-23 17:23             ` Yazen Ghannam
  2021-09-23 18:14               ` Borislav Petkov
  2021-09-23 18:34               ` Joshi, Mukul
  0 siblings, 2 replies; 33+ messages in thread
From: Yazen Ghannam @ 2021-09-23 17:23 UTC (permalink / raw)
  To: Joshi, Mukul; +Cc: linux-edac, x86, linux-kernel, bp, mingo, mchehab, amd-gfx

On Thu, Sep 23, 2021 at 11:30:55AM -0400, Joshi, Mukul wrote:
...
> > > +		return NOTIFY_DONE;
> > > +
> > > +	/*
> > > +	 * If it is correctable error, return.
> > > +	 */
> > > +	if (mce_is_correctable(m))
> > > +		return NOTIFY_OK;
> > 
> > Shouldn't this be "NOTIFY_DONE" if "don't care" about this error?
> 
> The thinking is we want to stop calling further consumers since it's a correctable error in GPU UMC and we are not taking any action about the correctable errors.

Shouldn't the error still be reported to EDAC for decoding and counting? I
think users want this.

But it looks to me that either NOTIFY_OK or NOTIFY_DONE will allow this, so
it's not a big deal. Was this intended to be NOTIFY_STOP?

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-23 17:23             ` Yazen Ghannam
@ 2021-09-23 18:14               ` Borislav Petkov
  2021-09-24 19:46                 ` Yazen Ghannam
  2021-09-23 18:34               ` Joshi, Mukul
  1 sibling, 1 reply; 33+ messages in thread
From: Borislav Petkov @ 2021-09-23 18:14 UTC (permalink / raw)
  To: Yazen Ghannam
  Cc: Joshi, Mukul, linux-edac, x86, linux-kernel, mingo, mchehab, amd-gfx

On Thu, Sep 23, 2021 at 05:23:21PM +0000, Yazen Ghannam wrote:
> Shouldn't the error still be reported to EDAC for decoding and counting? I
> think users want this.

You know what happens with users getting ECCs reported, right? They
think immediately their hw is going bad and start wanting to replace
it...

So what does actually tell you if you were a simple user and you had 5
correctable errors in the GPU VRAM?

All you wanna do is play, I'd say.

:-)

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-23 17:23             ` Yazen Ghannam
  2021-09-23 18:14               ` Borislav Petkov
@ 2021-09-23 18:34               ` Joshi, Mukul
  1 sibling, 0 replies; 33+ messages in thread
From: Joshi, Mukul @ 2021-09-23 18:34 UTC (permalink / raw)
  To: Ghannam, Yazen; +Cc: linux-edac, x86, linux-kernel, bp, mingo, mchehab, amd-gfx

[AMD Official Use Only]



> -----Original Message-----
> From: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Sent: Thursday, September 23, 2021 1:23 PM
> To: Joshi, Mukul <Mukul.Joshi@amd.com>
> Cc: linux-edac@vger.kernel.org; x86@kernel.org; linux-kernel@vger.kernel.org;
> bp@alien8.de; mingo@redhat.com; mchehab@kernel.org; amd-
> gfx@lists.freedesktop.org
> Subject: Re: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran
> RAS
> 
> On Thu, Sep 23, 2021 at 11:30:55AM -0400, Joshi, Mukul wrote:
> ...
> > > > +		return NOTIFY_DONE;
> > > > +
> > > > +	/*
> > > > +	 * If it is correctable error, return.
> > > > +	 */
> > > > +	if (mce_is_correctable(m))
> > > > +		return NOTIFY_OK;
> > >
> > > Shouldn't this be "NOTIFY_DONE" if "don't care" about this error?
> >
> > The thinking is we want to stop calling further consumers since it's a
> correctable error in GPU UMC and we are not taking any action about the
> correctable errors.
> 
Sorry I have to retract this back a bit. I remembered I started with the intention
Of using NOTIFY_STOP but realized that we would not be doing any accounting in this function.

> Shouldn't the error still be reported to EDAC for decoding and counting? I think
> users want this.
> 
> But it looks to me that either NOTIFY_OK or NOTIFY_DONE will allow this, so it's
> not a big deal. Was this intended to be NOTIFY_STOP?
> 

Sorry I have to retract my previous comment about stopping further consumers a bit.
I remembered I started with the intention to use NOTIFY_STOP but realized we were not doing any accounting in this function.
Later I guess I went by the comments put against NOTIFY_OK in notifier.h:
#define NOTIFY_DONE		0x0000		/* Don't care */
#define NOTIFY_OK		0x0001		/* Suits me */

Because this was a correctable error on GPU UMC, NOTIFY_OK ("Suits me") was probably more suited to this condition,
even though we were not taking any action on the correctable errors. 

Thanks,
Mukul

> Thanks,
> Yazen

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCHv4 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-22 19:36       ` [PATCHv3 " Mukul Joshi
  2021-09-23 14:29         ` Yazen Ghannam
@ 2021-09-23 22:04         ` Mukul Joshi
  2021-09-24 19:53           ` Yazen Ghannam
  1 sibling, 1 reply; 33+ messages in thread
From: Mukul Joshi @ 2021-09-23 22:04 UTC (permalink / raw)
  To: linux-edac, x86
  Cc: linux-kernel, bp, mingo, mchehab, yazen.ghannam, amd-gfx, Mukul Joshi

On Aldebaran, GPU driver will handle bad page retirement
for GPU memory even though UMC is host managed. As a result,
register a bad page retirement handler on the mce notifier
chain to retire bad pages on Aldebaran.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
---
v1->v2:
- Use smca_get_bank_type() to determine MCA bank.
- Envelope the changes under #ifdef CONFIG_X86_MCE_AMD.
- Use MCE_PRIORITY_UC instead of MCE_PRIO_ACCEL as we are
  only handling uncorrectable errors.
- Use macros to determine UMC instance and channel instance
  where the uncorrectable error occured.

v2->v3:
- Move the check for correctable error before find_adev().
- Fix a NULL pointer dereference if find_adev() returns NULL.

v3->v4:
- Update the commit log to specify page retirement for GPU
  memory only.
- Fix the mask passed to XEC macro.

 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 141 ++++++++++++++++++++++++
 1 file changed, 141 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e1c34eef76b7..02841a0efbb9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -35,7 +35,11 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "atom.h"
+#ifdef CONFIG_X86_MCE_AMD
+#include <asm/mce.h>
 
+static bool notifier_registered;
+#endif
 static const char *RAS_FS_NAME = "ras";
 
 const char *ras_error_string[] = {
@@ -107,6 +111,9 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 				uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
+#ifdef CONFIG_X86_MCE_AMD
+static void amdgpu_register_bad_pages_mca_notifier(void);
+#endif
 
 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
 {
@@ -2089,6 +2096,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 			adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
 	}
 
+#ifdef CONFIG_X86_MCE_AMD
+	if ((adev->asic_type == CHIP_ALDEBARAN) &&
+	    (adev->gmc.xgmi.connected_to_cpu))
+		amdgpu_register_bad_pages_mca_notifier();
+#endif
 	return 0;
 
 free:
@@ -2552,3 +2564,132 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
 		kfree(con);
 	}
 }
+
+#ifdef CONFIG_X86_MCE_AMD
+static struct amdgpu_device *find_adev(uint32_t node_id)
+{
+	struct amdgpu_gpu_instance *gpu_instance;
+	int i;
+	struct amdgpu_device *adev = NULL;
+
+	mutex_lock(&mgpu_info.mutex);
+
+	for (i = 0; i < mgpu_info.num_gpu; i++) {
+		gpu_instance = &(mgpu_info.gpu_ins[i]);
+		adev = gpu_instance->adev;
+
+		if (adev->gmc.xgmi.connected_to_cpu &&
+		    adev->gmc.xgmi.physical_node_id == node_id)
+			break;
+		adev = NULL;
+	}
+
+	mutex_unlock(&mgpu_info.mutex);
+
+	return adev;
+}
+
+#define GET_MCA_IPID_GPUID(m)	(((m) >> 44) & 0xF)
+#define GET_UMC_INST(m)		(((m) >> 21) & 0x7)
+#define GET_CHAN_INDEX(m)	((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
+#define GPU_ID_OFFSET		8
+
+static int amdgpu_bad_page_notifier(struct notifier_block *nb,
+				    unsigned long val, void *data)
+{
+	struct mce *m = (struct mce *)data;
+	struct amdgpu_device *adev = NULL;
+	uint32_t gpu_id = 0;
+	uint32_t umc_inst = 0;
+	uint32_t ch_inst, channel_index = 0;
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct eeprom_table_record err_rec;
+	uint64_t retired_page;
+
+	/*
+	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
+	 * and error occurred in DramECC (Extended error code = 0) then only
+	 * process the error, else bail out.
+	 */
+	if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
+		    (XEC(m->status, 0x3f) == 0x0)))
+		return NOTIFY_DONE;
+
+	/*
+	 * If it is correctable error, return.
+	 */
+	if (mce_is_correctable(m))
+		return NOTIFY_OK;
+
+	/*
+	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
+	 */
+	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
+
+	adev = find_adev(gpu_id);
+	if (!adev) {
+		DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
+								gpu_id);
+		return NOTIFY_DONE;
+	}
+
+	/*
+	 * If it is uncorrectable error, then find out UMC instance and
+	 * channel index.
+	 */
+	umc_inst = GET_UMC_INST(m->ipid);
+	ch_inst = GET_CHAN_INDEX(m->ipid);
+
+	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
+			     umc_inst, ch_inst);
+
+	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	channel_index =
+		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
+					  + ch_inst];
+
+	retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
+			ADDR_OF_256B_BLOCK(channel_index) |
+			OFFSET_IN_256B_BLOCK(m->addr);
+
+	err_rec.address = m->addr;
+	err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+	err_rec.ts = (uint64_t)ktime_get_real_seconds();
+	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+	err_rec.cu = 0;
+	err_rec.mem_channel = channel_index;
+	err_rec.mcumc_id = umc_inst;
+
+	err_data.err_addr = &err_rec;
+	err_data.err_addr_cnt = 1;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block amdgpu_bad_page_nb = {
+	.notifier_call  = amdgpu_bad_page_notifier,
+	.priority       = MCE_PRIO_UC,
+};
+
+static void amdgpu_register_bad_pages_mca_notifier(void)
+{
+	/*
+	 * Register the x86 notifier only once
+	 * with MCE subsystem.
+	 */
+	if (notifier_registered == false) {
+		mce_register_decode_chain(&amdgpu_bad_page_nb);
+		notifier_registered = true;
+	}
+}
+#endif
-- 
2.17.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-23 18:14               ` Borislav Petkov
@ 2021-09-24 19:46                 ` Yazen Ghannam
  2021-09-25 11:20                   ` Borislav Petkov
  0 siblings, 1 reply; 33+ messages in thread
From: Yazen Ghannam @ 2021-09-24 19:46 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Joshi, Mukul, linux-edac, x86, linux-kernel, mingo, mchehab, amd-gfx

On Thu, Sep 23, 2021 at 08:14:15PM +0200, Borislav Petkov wrote:
> On Thu, Sep 23, 2021 at 05:23:21PM +0000, Yazen Ghannam wrote:
> > Shouldn't the error still be reported to EDAC for decoding and counting? I
> > think users want this.
> 
> You know what happens with users getting ECCs reported, right? They
> think immediately their hw is going bad and start wanting to replace
> it...
> 
> So what does actually tell you if you were a simple user and you had 5
> correctable errors in the GPU VRAM?
> 

I agree with you in general. But this device isn't really a GPU. And users of
this device seem to want to count *every* error, at least for now.

> All you wanna do is play, I'd say.
> 
> :-)
>

Definitely. :)

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCHv4 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-23 22:04         ` [PATCHv4 " Mukul Joshi
@ 2021-09-24 19:53           ` Yazen Ghannam
  0 siblings, 0 replies; 33+ messages in thread
From: Yazen Ghannam @ 2021-09-24 19:53 UTC (permalink / raw)
  To: Mukul Joshi; +Cc: linux-edac, x86, linux-kernel, bp, mingo, mchehab, amd-gfx

On Thu, Sep 23, 2021 at 06:04:34PM -0400, Mukul Joshi wrote:
> On Aldebaran, GPU driver will handle bad page retirement
> for GPU memory even though UMC is host managed. As a result,
> register a bad page retirement handler on the mce notifier
> chain to retire bad pages on Aldebaran.
> 
> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> ---

This patch looks good to me overall.

Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-24 19:46                 ` Yazen Ghannam
@ 2021-09-25 11:20                   ` Borislav Petkov
  2021-09-27 18:37                     ` Yazen Ghannam
  0 siblings, 1 reply; 33+ messages in thread
From: Borislav Petkov @ 2021-09-25 11:20 UTC (permalink / raw)
  To: Yazen Ghannam
  Cc: Joshi, Mukul, linux-edac, x86, linux-kernel, mingo, mchehab, amd-gfx

On Fri, Sep 24, 2021 at 07:46:10PM +0000, Yazen Ghannam wrote:
> I agree with you in general. But this device isn't really a GPU. And
> users of this device seem to want to count *every* error, at least for
> now.

Aha, so something accelerator-y where they do general purpose computation.

So what's the big picture here: they count all the errors and when they
reach a certain amount, they decide to replace the GPUs just in case?

Or wait until they become uncorrectable? But then it doesn't matter
because we will handle it properly by excluding the VRAM range from
further use.

Or do they wanna see *when* they had the correctable errors so that they
can restart the computation, just in case.

Dunno, it would be a lot helpful if we had some RAS strategy for those
things...

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCHv3 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
  2021-09-25 11:20                   ` Borislav Petkov
@ 2021-09-27 18:37                     ` Yazen Ghannam
  0 siblings, 0 replies; 33+ messages in thread
From: Yazen Ghannam @ 2021-09-27 18:37 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Joshi, Mukul, linux-edac, x86, linux-kernel, mingo, mchehab, amd-gfx

On Sat, Sep 25, 2021 at 01:20:57PM +0200, Borislav Petkov wrote:
> On Fri, Sep 24, 2021 at 07:46:10PM +0000, Yazen Ghannam wrote:
> > I agree with you in general. But this device isn't really a GPU. And
> > users of this device seem to want to count *every* error, at least for
> > now.
> 
> Aha, so something accelerator-y where they do general purpose computation.
> 
> So what's the big picture here: they count all the errors and when they
> reach a certain amount, they decide to replace the GPUs just in case?
> 
> Or wait until they become uncorrectable? But then it doesn't matter
> because we will handle it properly by excluding the VRAM range from
> further use.
> 
> Or do they wanna see *when* they had the correctable errors so that they
> can restart the computation, just in case.
> 
> Dunno, it would be a lot helpful if we had some RAS strategy for those
> things...
>

I completely agree. The system integrators have their own policies for error
tracking, part replacement, etc. I expect they'll propose kernel changes if
they want any. Though I think general strategies will become apparent once
these sort of devices are in wider use.

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2021-09-27 18:37 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-11 15:25 [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Naveen Krishna Chatradhi
2021-05-11 15:25 ` [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2 Naveen Krishna Chatradhi
2021-05-11 17:34   ` Borislav Petkov
2021-05-12  1:40     ` Joshi, Mukul
2021-05-12  7:26       ` Borislav Petkov
2021-09-13  2:13   ` [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol Mukul Joshi
2021-09-13  2:13     ` [PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS Mukul Joshi
2021-09-22 11:40       ` Borislav Petkov
2021-09-22 19:43         ` Joshi, Mukul
2021-09-22 19:36       ` [PATCHv3 " Mukul Joshi
2021-09-23 14:29         ` Yazen Ghannam
2021-09-23 14:37           ` Borislav Petkov
2021-09-23 15:31             ` Joshi, Mukul
2021-09-23 15:30           ` Joshi, Mukul
2021-09-23 17:23             ` Yazen Ghannam
2021-09-23 18:14               ` Borislav Petkov
2021-09-24 19:46                 ` Yazen Ghannam
2021-09-25 11:20                   ` Borislav Petkov
2021-09-27 18:37                     ` Yazen Ghannam
2021-09-23 18:34               ` Joshi, Mukul
2021-09-23 22:04         ` [PATCHv4 " Mukul Joshi
2021-09-24 19:53           ` Yazen Ghannam
2021-09-22 11:33     ` [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol Borislav Petkov
2021-09-22 16:27       ` Deucher, Alexander
2021-09-22 16:43         ` Borislav Petkov
2021-09-22 16:47           ` Joshi, Mukul
2021-05-11 15:25 ` [PATCH 3/3] x86/mce: Add MCE priority for Accelerator devices Naveen Krishna Chatradhi
2021-05-11 17:27 ` [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Borislav Petkov
2021-05-24 16:41   ` Chatradhi, Naveen Krishna
2021-05-25 18:02     ` Borislav Petkov
2021-05-25 20:03       ` Yazen Ghannam
2021-05-25 20:12         ` Borislav Petkov
2021-05-26 16:46 ` [PATCH v2] " Naveen Krishna Chatradhi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).