Linux-EDAC Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v3 4/4] x86/mce: Add Zhaoxin LMCE support
@ 2019-09-11 12:03 Tony W Wang-oc
  0 siblings, 0 replies; 5+ messages in thread
From: Tony W Wang-oc @ 2019-09-11 12:03 UTC (permalink / raw)
  To: tony.luck, Borislav Petkov (bp, tglx, mingo, hpa, x86,
	linux-edac, linux-kernel, yazen.ghannam, vishal.l.verma,
	qiuxu.zhuo
  Cc: David Wang, Cooper Yan(BJ-RD), Qiyuan Wang(BJ-RD), Herry Yang(BJ-RD)

Newer Zhaoxin CPUs support LMCE that is compatible with Intel's
"Machine-Check Architecture", so add support for Zhaoxin LMCE
in mce/core.c.

Signed-off-by: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
---
v2->v3:
 - Rework mce_zhaoxin_feature_clear() as static
 - Add comment and change coding style

v1->v2:
 - Fix redefinition of "mce_zhaoxin_feature_clear"

 arch/x86/kernel/cpu/mce/core.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 65c5a1f..acdd76b 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1132,6 +1132,27 @@ static bool __mc_check_crashing_cpu(int cpu)
 		u64 mcgstatus;
 
 		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+
+		if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
+			if (mcgstatus & MCG_STATUS_LMCES)
+				return false;
+
+			if (!(mcgstatus & MCG_STATUS_LMCES)) {
+				/*
+				 * Clear the MCG_STATUS_RIPV valid status
+				 * bit so that a second MCE won't cause a
+				 * shutdown.
+				 */
+				if (mcgstatus & MCG_STATUS_RIPV)
+					mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+				/*
+				 * On this CPU, skip synchronize regardless
+				 * of MCG_STATUS_RIPV status.
+				 */
+				return true;
+			}
+		}
+
 		if (mcgstatus & MCG_STATUS_RIPV) {
 			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 			return true;
@@ -1282,9 +1303,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	/*
 	 * Check if this MCE is signaled to only this logical processor,
-	 * on Intel only.
+	 * on Intel, Zhaoxin only.
 	 */
-	if (m.cpuvendor == X86_VENDOR_INTEL)
+	if (m.cpuvendor == X86_VENDOR_INTEL ||
+	    m.cpuvendor == X86_VENDOR_ZHAOXIN)
 		lmce = m.mcgstatus & MCG_STATUS_LMCES;
 
 	/*
@@ -1795,9 +1817,15 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
 	}
 
 	intel_init_cmci();
+	intel_init_lmce();
 	mce_adjust_timer = cmci_intel_adjust_timer;
 }
 
+static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
+{
+	intel_clear_lmce();
+}
+
 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 {
 	switch (c->x86_vendor) {
@@ -1834,6 +1862,9 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
 	case X86_VENDOR_INTEL:
 		mce_intel_feature_clear(c);
 		break;
+	case X86_VENDOR_ZHAOXIN:
+		mce_zhaoxin_feature_clear(c);
+		break;
 	default:
 		break;
 	}
-- 
2.7.4


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3 4/4] x86/mce: Add Zhaoxin LMCE support
  2019-09-17  6:54 Tony W Wang-oc
@ 2019-09-17 16:37 ` Luck, Tony
  0 siblings, 0 replies; 5+ messages in thread
From: Luck, Tony @ 2019-09-17 16:37 UTC (permalink / raw)
  To: Tony W Wang-oc
  Cc: Borislav Petkov (bp, tglx, mingo, hpa, x86, linux-edac,
	linux-kernel, yazen.ghannam, vishal.l.verma, qiuxu.zhuo,
	David Wang, Cooper Yan(BJ-RD), Qiyuan Wang(BJ-RD),
	Herry Yang(BJ-RD)

On Tue, Sep 17, 2019 at 06:54:05AM +0000, Tony W Wang-oc wrote:
> But have a question about below codes:
> 	if (mcgstatus & MCG_STATUS_RIPV) {
> 		mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> 		return true;
> 	}
> These seems require all #MC exception errors set MCG_STATUS_RIPV = 1
> in order to skip synchronize which "return true;" actually does for this.
> 
> As Intel SDM show, "Recoverable-not-continuable SRAR Type" errors may
> set MCG_STATUS_RIPV = 0, PCC = 0. When these #MC errors broadcast
> to offline CPU, may cause kernel panic with synchronize timeout (offline
> CPU can't skip synchronize in this case).
> 
> Could "return true;" outside the if-case?
> 	if (mcgstatus & MCG_STATUS_RIPV) {
> 		mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> 	} 
> 	return true; 

If RIPV bit is not set in mcgstatus, then where will the CPU return
to if you simply return from the #MC handler? RIPV=1 means that the
CPU pushed a valid return instruction pointer onto the stack.

E.g. in the not-continuable case you mention above? The CPU
will likely do something undefined if you try to continue a
not-continuable instruction.

-Tony

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3 4/4] x86/mce: Add Zhaoxin LMCE support
@ 2019-09-17  6:54 Tony W Wang-oc
  2019-09-17 16:37 ` Luck, Tony
  0 siblings, 1 reply; 5+ messages in thread
From: Tony W Wang-oc @ 2019-09-17  6:54 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov (bp, tglx, mingo, hpa, x86, linux-edac,
	linux-kernel, yazen.ghannam, vishal.l.verma, qiuxu.zhuo,
	David Wang, Cooper Yan(BJ-RD), Qiyuan Wang(BJ-RD),
	Herry Yang(BJ-RD)

On Mon, Sep 16, 2019, Luck, Tony wrote:
>On Mon, Sep 16, 2019 at 11:37:18AM +0000, Tony W Wang-oc wrote:
>> Zhaoxin newer CPUs support LMCE that compatible with Intel's
>> "Machine-Check Architecture", so add support for Zhaoxin LMCE
>> in mce/core.c.
>>
>> Signed-off-by: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
>> ---
>>  arch/x86/kernel/cpu/mce/core.c | 35
>+++++++++++++++++++++++++++++++++--
>>  1 file changed, 33 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
>> index 65c5a1f..acdd76b 100644
>> --- a/arch/x86/kernel/cpu/mce/core.c
>> +++ b/arch/x86/kernel/cpu/mce/core.c
>> @@ -1132,6 +1132,27 @@ static bool __mc_check_crashing_cpu(int cpu)
>>  		u64 mcgstatus;
>>
>>  		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
>> +
>> +		if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
>> +			if (mcgstatus & MCG_STATUS_LMCES)
>> +				return false;
>> +
>> +			if (!(mcgstatus & MCG_STATUS_LMCES)) {
>
>Don't really need this test ... you already did "return false" if
>the LMCES bit was set ... so this test is redundant (and you can avoid
>indenting the next dozen lines.

Got it, Thank you.

But I have a question about the code below:
	if (mcgstatus & MCG_STATUS_RIPV) {
		mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
		return true;
	}
This seems to require every #MC exception to set MCG_STATUS_RIPV = 1
in order to skip synchronization, which is what "return true;" actually does here.

As the Intel SDM shows, "Recoverable-not-continuable SRAR Type" errors may
set MCG_STATUS_RIPV = 0, PCC = 0. When such #MC errors are broadcast
to an offline CPU, they may cause a kernel panic due to a synchronization
timeout (the offline CPU can't skip synchronization in this case).

Could "return true;" be moved outside the if-block?
	if (mcgstatus & MCG_STATUS_RIPV) {
		mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
	} 
	return true; 

Sincerely
TonyWWang-oc

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3 4/4] x86/mce: Add Zhaoxin LMCE support
  2019-09-16 11:37 Tony W Wang-oc
@ 2019-09-16 17:40 ` Luck, Tony
  0 siblings, 0 replies; 5+ messages in thread
From: Luck, Tony @ 2019-09-16 17:40 UTC (permalink / raw)
  To: Tony W Wang-oc
  Cc: Borislav Petkov (bp, tglx, mingo, hpa, x86, linux-edac,
	linux-kernel, yazen.ghannam, vishal.l.verma, qiuxu.zhuo,
	David Wang, Cooper Yan(BJ-RD), Qiyuan Wang(BJ-RD),
	Herry Yang(BJ-RD)

On Mon, Sep 16, 2019 at 11:37:18AM +0000, Tony W Wang-oc wrote:
> Zhaoxin newer CPUs support LMCE that compatible with Intel's
> "Machine-Check Architecture", so add support for Zhaoxin LMCE
> in mce/core.c.
> 
> Signed-off-by: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
> ---
>  arch/x86/kernel/cpu/mce/core.c | 35 +++++++++++++++++++++++++++++++++--
>  1 file changed, 33 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 65c5a1f..acdd76b 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -1132,6 +1132,27 @@ static bool __mc_check_crashing_cpu(int cpu)
>  		u64 mcgstatus;
>  
>  		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
> +
> +		if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
> +			if (mcgstatus & MCG_STATUS_LMCES)
> +				return false;
> +
> +			if (!(mcgstatus & MCG_STATUS_LMCES)) {

Don't really need this test ... you already did "return false" if
the LMCES bit was set ... so this test is redundant (and you can avoid
indenting the next dozen lines).

> +				/*
> +				 * Clear the MCG_STATUS_RIPV valid status
> +				 * bit so that a second MCE won't cause a
> +				 * shutdown.
> +				 */
> +				if (mcgstatus & MCG_STATUS_RIPV)
> +					mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> +				/*
> +				 * On this CPU, skip synchronize regardless
> +				 * of MCG_STATUS_RIPV status.
> +				 */
> +				return true;
> +			}
> +		}
> +

Otherwise I'm OK with the series.  My earlier comment about
wanting to clean up all the vendor/family/model checks should
be seen as a longer term goal. I don't want to block this waiting
until the day we figure out how to make this prettier.

-Tony

[The "Content-Language: zh-CN" in the mail headers is still freaking out
my version of mutt (Mutt 1.11.3 (2019-02-01)) ... but I figured out a
simple script to download a raw copy of each patch from lore.kernel.org
to work around that]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v3 4/4] x86/mce: Add Zhaoxin LMCE support
@ 2019-09-16 11:37 Tony W Wang-oc
  2019-09-16 17:40 ` Luck, Tony
  0 siblings, 1 reply; 5+ messages in thread
From: Tony W Wang-oc @ 2019-09-16 11:37 UTC (permalink / raw)
  To: tony.luck, Borislav Petkov (bp, tglx, mingo, hpa, x86,
	linux-edac, linux-kernel, yazen.ghannam, vishal.l.verma,
	qiuxu.zhuo
  Cc: David Wang, Cooper Yan(BJ-RD), Qiyuan Wang(BJ-RD), Herry Yang(BJ-RD)

Newer Zhaoxin CPUs support LMCE that is compatible with Intel's
"Machine-Check Architecture", so add support for Zhaoxin LMCE
in mce/core.c.

Signed-off-by: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
---
 arch/x86/kernel/cpu/mce/core.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 65c5a1f..acdd76b 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1132,6 +1132,27 @@ static bool __mc_check_crashing_cpu(int cpu)
 		u64 mcgstatus;
 
 		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+
+		if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
+			if (mcgstatus & MCG_STATUS_LMCES)
+				return false;
+
+			if (!(mcgstatus & MCG_STATUS_LMCES)) {
+				/*
+				 * Clear the MCG_STATUS_RIPV valid status
+				 * bit so that a second MCE won't cause a
+				 * shutdown.
+				 */
+				if (mcgstatus & MCG_STATUS_RIPV)
+					mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+				/*
+				 * On this CPU, skip synchronize regardless
+				 * of MCG_STATUS_RIPV status.
+				 */
+				return true;
+			}
+		}
+
 		if (mcgstatus & MCG_STATUS_RIPV) {
 			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 			return true;
@@ -1282,9 +1303,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	/*
 	 * Check if this MCE is signaled to only this logical processor,
-	 * on Intel only.
+	 * on Intel, Zhaoxin only.
 	 */
-	if (m.cpuvendor == X86_VENDOR_INTEL)
+	if (m.cpuvendor == X86_VENDOR_INTEL ||
+	    m.cpuvendor == X86_VENDOR_ZHAOXIN)
 		lmce = m.mcgstatus & MCG_STATUS_LMCES;
 
 	/*
@@ -1795,9 +1817,15 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
 	}
 
 	intel_init_cmci();
+	intel_init_lmce();
 	mce_adjust_timer = cmci_intel_adjust_timer;
 }
 
+static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
+{
+	intel_clear_lmce();
+}
+
 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 {
 	switch (c->x86_vendor) {
@@ -1834,6 +1862,9 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
 	case X86_VENDOR_INTEL:
 		mce_intel_feature_clear(c);
 		break;
+	case X86_VENDOR_ZHAOXIN:
+		mce_zhaoxin_feature_clear(c);
+		break;
 	default:
 		break;
 	}
-- 
2.7.4

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, back to index

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-11 12:03 [PATCH v3 4/4] x86/mce: Add Zhaoxin LMCE support Tony W Wang-oc
2019-09-16 11:37 Tony W Wang-oc
2019-09-16 17:40 ` Luck, Tony
2019-09-17  6:54 Tony W Wang-oc
2019-09-17 16:37 ` Luck, Tony

Linux-EDAC Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-edac/0 linux-edac/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-edac linux-edac/ https://lore.kernel.org/linux-edac \
		linux-edac@vger.kernel.org linux-edac@archiver.kernel.org
	public-inbox-index linux-edac


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-edac


AGPL code for this site: git clone https://public-inbox.org/ public-inbox