All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86/MCE: allow overriding the CMCI threshold
@ 2015-01-12  8:44 Jan Beulich
  2015-01-12 10:18 ` Egger, Christoph
  0 siblings, 1 reply; 3+ messages in thread
From: Jan Beulich @ 2015-01-12  8:44 UTC (permalink / raw)
  To: xen-devel; +Cc: Jinsong Liu, Christoph Egger

[-- Attachment #1: Type: text/plain, Size: 2724 bytes --]

We've had reports of systems where CMCIs would surface at a relatively
high rate during certain periods of time, without them apparently
causing subsequent more severe problems (see Xeon E7-8800/4800/2800
specification clarification SC1). Give the admin a knob to lower the
impact on the system logs.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -242,6 +242,14 @@ the NMI watchdog is also enabled.
 
 If set, override Xen's default choice for the platform timer.
 
+### cmci-threshold
+> `= <integer>`
+
+> Default: `2`
+
+Specify the event count threshold for raising Corrected Machine Check
+Interrupts.  Specifying zero disables CMCI handling.
+
 ### cmos-rtc-probe
 > `= <boolean>`
 
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -492,6 +492,9 @@ static int do_cmci_discover(int i)
 {
     unsigned msr = MSR_IA32_MCx_CTL2(i);
     u64 val;
+    unsigned int threshold, max_threshold;
+    static unsigned int cmci_threshold = 2;
+    integer_param("cmci-threshold", cmci_threshold);
 
     rdmsrl(msr, val);
     /* Some other CPU already owns this bank. */
@@ -500,15 +503,28 @@ static int do_cmci_discover(int i)
         goto out;
     }
 
-    val &= ~CMCI_THRESHOLD_MASK;
-    wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
-    rdmsrl(msr, val);
+    if ( cmci_threshold )
+    {
+        wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD_MASK);
+        rdmsrl(msr, val);
+    }
 
     if (!(val & CMCI_EN)) {
         /* This bank does not support CMCI. Polling timer has to handle it. */
         mcabanks_set(i, __get_cpu_var(no_cmci_banks));
+        wrmsrl(msr, val & ~CMCI_THRESHOLD_MASK);
         return 0;
     }
+    max_threshold = MASK_EXTR(val, CMCI_THRESHOLD_MASK);
+    threshold = cmci_threshold;
+    if ( threshold > max_threshold )
+    {
+       mce_printk(MCE_QUIET,
+                  "CMCI: threshold %#x too large for CPU%u bank %u, using %#x\n",
+                  threshold, smp_processor_id(), i, max_threshold);
+       threshold = max_threshold;
+    }
+    wrmsrl(msr, (val & ~CMCI_THRESHOLD_MASK) | CMCI_EN | threshold);
     mcabanks_set(i, __get_cpu_var(mce_banks_owned));
 out:
     mcabanks_clear(i, __get_cpu_var(no_cmci_banks));
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -86,9 +86,6 @@
 /* Bitfield of MSR_K8_HWCR register */
 #define K8_HWCR_MCi_STATUS_WREN		(1ULL << 18)
 
-/*Intel Specific bitfield*/
-#define CMCI_THRESHOLD			0x2
-
 #define MCi_MISC_ADDRMOD_MASK (0x7UL << 6)
 #define MCi_MISC_PHYSMOD    (0x2UL << 6)
 




[-- Attachment #2: x86-CMCI-threshold.patch --]
[-- Type: text/plain, Size: 2766 bytes --]

x86/MCE: allow overriding the CMCI threshold

We've had reports of systems where CMCIs would surface at a relatively
high rate during certain periods of time, without them apparently
causing subsequent more severe problems (see Xeon E7-8800/4800/2800
specification clarification SC1). Give the admin a knob to lower the
impact on the system logs.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -242,6 +242,14 @@ the NMI watchdog is also enabled.
 
 If set, override Xen's default choice for the platform timer.
 
+### cmci-threshold
+> `= <integer>`
+
+> Default: `2`
+
+Specify the event count threshold for raising Corrected Machine Check
+Interrupts.  Specifying zero disables CMCI handling.
+
 ### cmos-rtc-probe
 > `= <boolean>`
 
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -492,6 +492,9 @@ static int do_cmci_discover(int i)
 {
     unsigned msr = MSR_IA32_MCx_CTL2(i);
     u64 val;
+    unsigned int threshold, max_threshold;
+    static unsigned int cmci_threshold = 2;
+    integer_param("cmci-threshold", cmci_threshold);
 
     rdmsrl(msr, val);
     /* Some other CPU already owns this bank. */
@@ -500,15 +503,28 @@ static int do_cmci_discover(int i)
         goto out;
     }
 
-    val &= ~CMCI_THRESHOLD_MASK;
-    wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
-    rdmsrl(msr, val);
+    if ( cmci_threshold )
+    {
+        wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD_MASK);
+        rdmsrl(msr, val);
+    }
 
     if (!(val & CMCI_EN)) {
         /* This bank does not support CMCI. Polling timer has to handle it. */
         mcabanks_set(i, __get_cpu_var(no_cmci_banks));
+        wrmsrl(msr, val & ~CMCI_THRESHOLD_MASK);
         return 0;
     }
+    max_threshold = MASK_EXTR(val, CMCI_THRESHOLD_MASK);
+    threshold = cmci_threshold;
+    if ( threshold > max_threshold )
+    {
+       mce_printk(MCE_QUIET,
+                  "CMCI: threshold %#x too large for CPU%u bank %u, using %#x\n",
+                  threshold, smp_processor_id(), i, max_threshold);
+       threshold = max_threshold;
+    }
+    wrmsrl(msr, (val & ~CMCI_THRESHOLD_MASK) | CMCI_EN | threshold);
     mcabanks_set(i, __get_cpu_var(mce_banks_owned));
 out:
     mcabanks_clear(i, __get_cpu_var(no_cmci_banks));
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -86,9 +86,6 @@
 /* Bitfield of MSR_K8_HWCR register */
 #define K8_HWCR_MCi_STATUS_WREN		(1ULL << 18)
 
-/*Intel Specific bitfield*/
-#define CMCI_THRESHOLD			0x2
-
 #define MCi_MISC_ADDRMOD_MASK (0x7UL << 6)
 #define MCi_MISC_PHYSMOD    (0x2UL << 6)
 

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] x86/MCE: allow overriding the CMCI threshold
  2015-01-12  8:44 [PATCH] x86/MCE: allow overriding the CMCI threshold Jan Beulich
@ 2015-01-12 10:18 ` Egger, Christoph
  2015-01-12 14:33   ` 答复: " 刘劲松(凯耳)
  0 siblings, 1 reply; 3+ messages in thread
From: Egger, Christoph @ 2015-01-12 10:18 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Jinsong Liu

On 2015/01/12 9:44, Jan Beulich wrote:
> We've had reports of systems where CMCIs would surface at a relatively
> high rate during certain periods of time, without them apparently
> causing subsequent more severe problems (see Xeon E7-8800/4800/2800
> specification clarification SC1). Give the admin a knob to lower the
> impact on the system logs.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

A small comment at the bottom; apart from that:

Acked-by: Christoph Egger <chegger@amazon.de>

> 
> --- a/docs/misc/xen-command-line.markdown
> +++ b/docs/misc/xen-command-line.markdown
> @@ -242,6 +242,14 @@ the NMI watchdog is also enabled.
>  
>  If set, override Xen's default choice for the platform timer.
>  
> +### cmci-threshold
> +> `= <integer>`
> +
> +> Default: `2`
> +
> +Specify the event count threshold for raising Corrected Machine Check
> +Interrupts.  Specifying zero disables CMCI handling.
> +
>  ### cmos-rtc-probe
>  > `= <boolean>`
>  
> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c
> +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
> @@ -492,6 +492,9 @@ static int do_cmci_discover(int i)
>  {
>      unsigned msr = MSR_IA32_MCx_CTL2(i);
>      u64 val;
> +    unsigned int threshold, max_threshold;
> +    static unsigned int cmci_threshold = 2;
> +    integer_param("cmci-threshold", cmci_threshold);
>  
>      rdmsrl(msr, val);
>      /* Some other CPU already owns this bank. */
> @@ -500,15 +503,28 @@ static int do_cmci_discover(int i)
>          goto out;
>      }
>  
> -    val &= ~CMCI_THRESHOLD_MASK;
> -    wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
> -    rdmsrl(msr, val);
> +    if ( cmci_threshold )
> +    {
> +        wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD_MASK);
> +        rdmsrl(msr, val);
> +    }
>  
>      if (!(val & CMCI_EN)) {
>          /* This bank does not support CMCI. Polling timer has to handle it. */
>          mcabanks_set(i, __get_cpu_var(no_cmci_banks));
> +        wrmsrl(msr, val & ~CMCI_THRESHOLD_MASK);
>          return 0;
>      }
> +    max_threshold = MASK_EXTR(val, CMCI_THRESHOLD_MASK);
> +    threshold = cmci_threshold;
> +    if ( threshold > max_threshold )
> +    {
> +       mce_printk(MCE_QUIET,
> +                  "CMCI: threshold %#x too large for CPU%u bank %u, using %#x\n",
> +                  threshold, smp_processor_id(), i, max_threshold);
> +       threshold = max_threshold;
> +    }
> +    wrmsrl(msr, (val & ~CMCI_THRESHOLD_MASK) | CMCI_EN | threshold);
>      mcabanks_set(i, __get_cpu_var(mce_banks_owned));
>  out:
>      mcabanks_clear(i, __get_cpu_var(no_cmci_banks));
> --- a/xen/arch/x86/cpu/mcheck/x86_mca.h
> +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
> @@ -86,9 +86,6 @@
>  /* Bitfield of MSR_K8_HWCR register */
>  #define K8_HWCR_MCi_STATUS_WREN		(1ULL << 18)
>  
> -/*Intel Specific bitfield*/
> -#define CMCI_THRESHOLD			0x2
> -
>  #define MCi_MISC_ADDRMOD_MASK (0x7UL << 6)
>  #define MCi_MISC_PHYSMOD    (0x2UL << 6)

I think these two are also Intel-specific bitfields.
Please keep the comment in place for those.

Christoph

^ permalink raw reply	[flat|nested] 3+ messages in thread

* 答复: [PATCH] x86/MCE: allow overriding the CMCI threshold
  2015-01-12 10:18 ` Egger, Christoph
@ 2015-01-12 14:33   ` 刘劲松(凯耳)
  0 siblings, 0 replies; 3+ messages in thread
From: 刘劲松(凯耳) @ 2015-01-12 14:33 UTC (permalink / raw)
  To: 'Egger, Christoph', 'Jan Beulich', 'xen-devel'

Same comments as Egger.

Acked-by: Liu Jinsong <jinsong.liu@alibaba-inc.com>

-----邮件原件-----
发件人: Egger, Christoph [mailto:chegger@amazon.de] 
发送时间: 2015年1月12日 18:18
收件人: Jan Beulich; xen-devel
抄送: 刘劲松(凯耳)
主题: Re: [PATCH] x86/MCE: allow overriding the CMCI threshold

On 2015/01/12 9:44, Jan Beulich wrote:
> We've had reports of systems where CMCIs would surface at a relatively 
> high rate during certain periods of time, without them apparently 
> causing subsequent more severe problems (see Xeon E7-8800/4800/2800 
> specification clarification SC1). Give the admin a knob to lower the 
> impact on the system logs.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

A small comment at the bottom; apart from that:

Acked-by: Christoph Egger <chegger@amazon.de>

> 
> --- a/docs/misc/xen-command-line.markdown
> +++ b/docs/misc/xen-command-line.markdown
> @@ -242,6 +242,14 @@ the NMI watchdog is also enabled.
>  
>  If set, override Xen's default choice for the platform timer.
>  
> +### cmci-threshold
> +> `= <integer>`
> +
> +> Default: `2`
> +
> +Specify the event count threshold for raising Corrected Machine Check 
> +Interrupts.  Specifying zero disables CMCI handling.
> +
>  ### cmos-rtc-probe
>  > `= <boolean>`
>  
> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c
> +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
> @@ -492,6 +492,9 @@ static int do_cmci_discover(int i)
>  {
>      unsigned msr = MSR_IA32_MCx_CTL2(i);
>      u64 val;
> +    unsigned int threshold, max_threshold;
> +    static unsigned int cmci_threshold = 2;
> +    integer_param("cmci-threshold", cmci_threshold);
>  
>      rdmsrl(msr, val);
>      /* Some other CPU already owns this bank. */
> @@ -500,15 +503,28 @@ static int do_cmci_discover(int i)
>          goto out;
>      }
>  
> -    val &= ~CMCI_THRESHOLD_MASK;
> -    wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
> -    rdmsrl(msr, val);
> +    if ( cmci_threshold )
> +    {
> +        wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD_MASK);
> +        rdmsrl(msr, val);
> +    }
>  
>      if (!(val & CMCI_EN)) {
>          /* This bank does not support CMCI. Polling timer has to handle it. */
>          mcabanks_set(i, __get_cpu_var(no_cmci_banks));
> +        wrmsrl(msr, val & ~CMCI_THRESHOLD_MASK);
>          return 0;
>      }
> +    max_threshold = MASK_EXTR(val, CMCI_THRESHOLD_MASK);
> +    threshold = cmci_threshold;
> +    if ( threshold > max_threshold )
> +    {
> +       mce_printk(MCE_QUIET,
> +                  "CMCI: threshold %#x too large for CPU%u bank %u, using %#x\n",
> +                  threshold, smp_processor_id(), i, max_threshold);
> +       threshold = max_threshold;
> +    }
> +    wrmsrl(msr, (val & ~CMCI_THRESHOLD_MASK) | CMCI_EN | threshold);
>      mcabanks_set(i, __get_cpu_var(mce_banks_owned));
>  out:
>      mcabanks_clear(i, __get_cpu_var(no_cmci_banks));
> --- a/xen/arch/x86/cpu/mcheck/x86_mca.h
> +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
> @@ -86,9 +86,6 @@
>  /* Bitfield of MSR_K8_HWCR register */
>  #define K8_HWCR_MCi_STATUS_WREN		(1ULL << 18)
>  
> -/*Intel Specific bitfield*/
> -#define CMCI_THRESHOLD			0x2
> -
>  #define MCi_MISC_ADDRMOD_MASK (0x7UL << 6)
>  #define MCi_MISC_PHYSMOD    (0x2UL << 6)

I think these two are also Intel-specific bitfields.
Please keep the comment in place for those.

Christoph


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2015-01-12 14:33 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-12  8:44 [PATCH] x86/MCE: allow overriding the CMCI threshold Jan Beulich
2015-01-12 10:18 ` Egger, Christoph
2015-01-12 14:33   ` 答复: " 刘劲松(凯耳)

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.