[PATCH] [0/4] x86: MCE: Machine check bug fix series for 2.6.30

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] [0/4] x86: MCE: Machine check bug fix series for 2.6.30
@ 2009-04-07 15:06 Andi Kleen
  2009-04-07 15:06 ` [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU Andi Kleen
                   ` (3 more replies)
  0 siblings, 4 replies; 35+ messages in thread
From: Andi Kleen @ 2009-04-07 15:06 UTC (permalink / raw)
  To: hpa, linux-kernel, mingo, tglx


Various bug fixes for x86 machine checks which are imho 2.6.30 candidates.

-Andi

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU
  2009-04-07 15:06 [PATCH] [0/4] x86: MCE: Machine check bug fix series for 2.6.30 Andi Kleen
@ 2009-04-07 15:06 ` Andi Kleen
  2009-04-08  3:43   ` Hidetoshi Seto
  2009-04-09 10:28   ` [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU v2 Andi Kleen
  2009-04-07 15:06 ` [PATCH] [2/4] x86: MCE: Fix boot logging logic Andi Kleen
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 35+ messages in thread
From: Andi Kleen @ 2009-04-07 15:06 UTC (permalink / raw)
  To: hpa, linux-kernel, mingo, tglx


Impact: bug fix

The polling timer while running per CPU still uses a global next_interval
variable, which lead to some CPUs either polling too fast or too slow.
This was not a serious problem because all errors get picked up eventually,
but it's still better to avoid it. Turn next_interval into a per cpu variable.

Signed-off-by: Andi Kleen <ak@linux.intel.com>

---
 arch/x86/kernel/cpu/mcheck/mce_64.c |   25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:09:57.000000000 +0200
+++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:43:16.000000000 +0200
@@ -452,13 +452,14 @@
  */
 
 static int check_interval = 5 * 60; /* 5 minutes */
-static int next_interval; /* in jiffies */
+static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
 static void mcheck_timer(unsigned long);
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
 static void mcheck_timer(unsigned long data)
 {
 	struct timer_list *t = &per_cpu(mce_timer, data);
+	int *n;
 
 	WARN_ON(smp_processor_id() != data);
 
@@ -470,14 +471,14 @@
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
 	 * polling interval, otherwise increase the polling interval.
 	 */
+	n = &__get_cpu_var(next_interval);
 	if (mce_notify_user()) {
-		next_interval = max(next_interval/2, HZ/100);
+		*n = max(*n/2, HZ/100);
 	} else {
-		next_interval = min(next_interval * 2,
-				(int)round_jiffies_relative(check_interval*HZ));
+		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
 	}
 
-	t->expires = jiffies + next_interval;
+	t->expires = jiffies + *n;
 	add_timer(t);
 }
 
@@ -632,14 +633,14 @@
 static void mce_init_timer(void)
 {
 	struct timer_list *t = &__get_cpu_var(mce_timer);
+	int *n = &__get_cpu_var(next_interval);
 
-	/* data race harmless because everyone sets to the same value */
-	if (!next_interval)
-		next_interval = check_interval * HZ;
-	if (!next_interval)
+	if (!*n)
+		*n = check_interval * HZ;
+	if (!*n)
 		return;
 	setup_timer(t, mcheck_timer, smp_processor_id());
-	t->expires = round_jiffies(jiffies + next_interval);
+	t->expires = round_jiffies(jiffies + *n);
 	add_timer(t);
 }
 
@@ -907,7 +908,6 @@
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-	next_interval = check_interval * HZ;
 	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
@@ -1110,7 +1110,8 @@
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
-		t->expires = round_jiffies(jiffies + next_interval);
+		t->expires = round_jiffies(jiffies +
+						__get_cpu_var(next_interval));
 		add_timer_on(t, cpu);
 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
 		break;

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH] [2/4] x86: MCE: Fix boot logging logic
  2009-04-07 15:06 [PATCH] [0/4] x86: MCE: Machine check bug fix series for 2.6.30 Andi Kleen
  2009-04-07 15:06 ` [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU Andi Kleen
@ 2009-04-07 15:06 ` Andi Kleen
  2009-04-07 15:06 ` [PATCH] [3/4] x86: MCE: Improve mce_get_rip Andi Kleen
  2009-04-07 15:06 ` [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC Andi Kleen
  3 siblings, 0 replies; 35+ messages in thread
From: Andi Kleen @ 2009-04-07 15:06 UTC (permalink / raw)
  To: hpa, linux-kernel, mingo, tglx


Impact: bugfix

The earlier patch to change the poller to a separate function subtly
broke the boot logging logic. This could lead to machine checks 
getting logged at boot even when disabled or defaulting to off
on some systems. Fix that.

Signed-off-by: Andi Kleen <ak@linux.intel.com>

---
 arch/x86/include/asm/mce.h          |    1 +
 arch/x86/kernel/cpu/mcheck/mce_64.c |    9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

Index: linux/arch/x86/include/asm/mce.h
===================================================================
--- linux.orig/arch/x86/include/asm/mce.h	2009-04-07 16:09:57.000000000 +0200
+++ linux/arch/x86/include/asm/mce.h	2009-04-07 16:43:15.000000000 +0200
@@ -137,6 +137,7 @@
 enum mcp_flags {
 	MCP_TIMESTAMP = (1 << 0),	/* log time stamp */
 	MCP_UC = (1 << 1),		/* log uncorrected errors */
+	MCP_DONTLOG = (1 << 2),		/* only clear, don't log */
 };
 extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:09:59.000000000 +0200
+++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:43:15.000000000 +0200
@@ -239,9 +239,10 @@
 		 * Don't get the IP here because it's unlikely to
 		 * have anything to do with the actual error location.
 		 */
-
-		mce_log(&m);
-		add_taint(TAINT_MACHINE_CHECK);
+		if (!(flags & MCP_DONTLOG)) {
+			mce_log(&m);
+			add_taint(TAINT_MACHINE_CHECK);
+		}
 
 		/*
 		 * Clear state for this bank.
@@ -585,7 +586,7 @@
 	 * Log the machine checks left over from the previous reset.
 	 */
 	bitmap_fill(all_banks, MAX_NR_BANKS);
-	machine_check_poll(MCP_UC, &all_banks);
+	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
 
 	set_in_cr4(X86_CR4_MCE);
 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-07 15:06 [PATCH] [0/4] x86: MCE: Machine check bug fix series for 2.6.30 Andi Kleen
  2009-04-07 15:06 ` [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU Andi Kleen
  2009-04-07 15:06 ` [PATCH] [2/4] x86: MCE: Fix boot logging logic Andi Kleen
@ 2009-04-07 15:06 ` Andi Kleen
  2009-04-08  8:15   ` Hidetoshi Seto
  2009-04-07 15:06 ` [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC Andi Kleen
  3 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2009-04-07 15:06 UTC (permalink / raw)
  To: ying.huang, hpa, linux-kernel, mingo, tglx


From: Huang Ying <ying.huang@intel.com>

Return rip/cs if MCG_STATUS_EIPV is set in mce_get_rip(). Remain m->cs
if RIP is read from rip_msr.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>

---
 arch/x86/kernel/cpu/mcheck/mce_64.c |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:09:59.000000000 +0200
+++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:43:15.000000000 +0200
@@ -175,7 +175,7 @@
 
 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 {
-	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
+	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV | MCG_STATUS_EIPV))) {
 		m->ip = regs->ip;
 		m->cs = regs->cs;
 	} else {
@@ -186,7 +186,6 @@
 		/* Assume the RIP in the MSR is exact. Is this true? */
 		m->mcgstatus |= MCG_STATUS_EIPV;
 		rdmsrl(rip_msr, m->ip);
-		m->cs = 0;
 	}
 }
 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC
  2009-04-07 15:06 [PATCH] [0/4] x86: MCE: Machine check bug fix series for 2.6.30 Andi Kleen
                   ` (2 preceding siblings ...)
  2009-04-07 15:06 ` [PATCH] [3/4] x86: MCE: Improve mce_get_rip Andi Kleen
@ 2009-04-07 15:06 ` Andi Kleen
  2009-04-23  9:43   ` Huang Ying
  3 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2009-04-07 15:06 UTC (permalink / raw)
  To: ying.huang, hpa, linux-kernel, mingo, tglx


From: Huang Ying <ying.huang@intel.com>

Impact: Spec compliance

When tolerant == 0, system should go panic instead of send SIGBUS even
for a MCE with EPIV && !PCC on user space.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>

---
 arch/x86/kernel/cpu/mcheck/mce_64.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:09:59.000000000 +0200
+++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:43:14.000000000 +0200
@@ -401,7 +401,7 @@
 		 * force_sig() takes an awful lot of locks and has a slight
 		 * risk of deadlocking.
 		 */
-		if (user_space) {
+		if (user_space && tolerant > 0) {
 			force_sig(SIGBUS, current);
 		} else if (panic_on_oops || tolerant < 2) {
 			mce_panic("Uncorrected machine check",

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU
  2009-04-07 15:06 ` [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU Andi Kleen
@ 2009-04-08  3:43   ` Hidetoshi Seto
  2009-04-08 10:43     ` Andi Kleen
  2009-04-09 10:28   ` [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU v2 Andi Kleen
  1 sibling, 1 reply; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-08  3:43 UTC (permalink / raw)
  To: Andi Kleen; +Cc: hpa, linux-kernel, mingo, tglx

Andi Kleen wrote:
> Impact: bug fix
> 
> The polling timer while running per CPU still uses a global next_interval
> variable, which lead to some CPUs either polling too fast or too slow.
> This was not a serious problem because all errors get picked up eventually,
> but it's still better to avoid it. Turn next_interval into a per cpu variable.
> 
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> 
> ---
>  arch/x86/kernel/cpu/mcheck/mce_64.c |   25 +++++++++++++------------
>  1 file changed, 13 insertions(+), 12 deletions(-)
> 
> Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:09:57.000000000 +0200
> +++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:43:16.000000000 +0200
> @@ -452,13 +452,14 @@
>   */
>  
>  static int check_interval = 5 * 60; /* 5 minutes */
> -static int next_interval; /* in jiffies */
> +static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
>  static void mcheck_timer(unsigned long);
>  static DEFINE_PER_CPU(struct timer_list, mce_timer);
>  
>  static void mcheck_timer(unsigned long data)
>  {
>  	struct timer_list *t = &per_cpu(mce_timer, data);
> +	int *n;
>  
>  	WARN_ON(smp_processor_id() != data);
>  
> @@ -470,14 +471,14 @@
>  	 * Alert userspace if needed.  If we logged an MCE, reduce the
>  	 * polling interval, otherwise increase the polling interval.
>  	 */
> +	n = &__get_cpu_var(next_interval);
>  	if (mce_notify_user()) {
> -		next_interval = max(next_interval/2, HZ/100);
> +		*n = max(*n/2, HZ/100);
>  	} else {
> -		next_interval = min(next_interval * 2,
> -				(int)round_jiffies_relative(check_interval*HZ));
> +		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
>  	}
>  
> -	t->expires = jiffies + next_interval;
> +	t->expires = jiffies + *n;
>  	add_timer(t);
>  }
>  
> @@ -632,14 +633,14 @@
>  static void mce_init_timer(void)
>  {
>  	struct timer_list *t = &__get_cpu_var(mce_timer);
> +	int *n = &__get_cpu_var(next_interval);
>  
> -	/* data race harmless because everyone sets to the same value */
> -	if (!next_interval)
> -		next_interval = check_interval * HZ;
> -	if (!next_interval)

[plan A]
Add
	if (!check_interval)
		return;

Or...

> +	if (!*n)
> +		*n = check_interval * HZ;
> +	if (!*n)
>  		return;
>  	setup_timer(t, mcheck_timer, smp_processor_id());
> -	t->expires = round_jiffies(jiffies + next_interval);
> +	t->expires = round_jiffies(jiffies + *n);
>  	add_timer(t);
>  }
>  
> @@ -907,7 +908,6 @@
>  /* Reinit MCEs after user configuration changes */
>  static void mce_restart(void)
>  {
> -	next_interval = check_interval * HZ;
>  	on_each_cpu(mce_cpu_restart, NULL, 1);
>  }
>  

[plan B]
Don't remove this line.

Take A or B, or we cannot stop polling timer by setting check_interval
to 0 via sysfs and then the timer will spin with 0 interval.


Thanks,
H.Seto

> @@ -1110,7 +1110,8 @@
>  		break;
>  	case CPU_DOWN_FAILED:
>  	case CPU_DOWN_FAILED_FROZEN:
> -		t->expires = round_jiffies(jiffies + next_interval);
> +		t->expires = round_jiffies(jiffies +
> +						__get_cpu_var(next_interval));
>  		add_timer_on(t, cpu);
>  		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
>  		break;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-07 15:06 ` [PATCH] [3/4] x86: MCE: Improve mce_get_rip Andi Kleen
@ 2009-04-08  8:15   ` Hidetoshi Seto
  2009-04-08 10:06     ` Andi Kleen
  2009-04-23  9:43     ` Huang Ying
  0 siblings, 2 replies; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-08  8:15 UTC (permalink / raw)
  To: Andi Kleen; +Cc: ying.huang, hpa, linux-kernel, mingo, tglx

Andi Kleen wrote:
> From: Huang Ying <ying.huang@intel.com>
> 
> Return rip/cs if MCG_STATUS_EIPV is set in mce_get_rip(). Remain m->cs
> if RIP is read from rip_msr.

It means we use "Error IP" as "Return IP" if RIPV=0 but EIPV=1 ...?
Sounds strange.

> 
> Signed-off-by: Huang Ying <ying.huang@intel.com>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> 
> ---
>  arch/x86/kernel/cpu/mcheck/mce_64.c |    3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:09:59.000000000 +0200
> +++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-07 16:43:15.000000000 +0200
> @@ -175,7 +175,7 @@
>  
>  static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
>  {
> -	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
> +	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV | MCG_STATUS_EIPV))) {
>  		m->ip = regs->ip;
>  		m->cs = regs->cs;
>  	} else {
> @@ -186,7 +186,6 @@
>  		/* Assume the RIP in the MSR is exact. Is this true? */
>  		m->mcgstatus |= MCG_STATUS_EIPV;

Why this "forcing EIPV=1" still required?
I think remaining this line will make something wrong.

>  		rdmsrl(rip_msr, m->ip);
> -		m->cs = 0;
>  	}
>  }

The mce_get_rip() is called from inside of a for-loop.
And assume that we start with RIPV=0 and EIPV=0:

Before applying this patch:

   if (rip_msr) { (m->ip, m->cs) = ((data from msr), 0); }
   else         { (m->ip, m->cs) = (0, 0); }

And After:

 1st call:
   if (rip_msr) { (m->ip, m->cs) = ((data from msr), 0); }
   else         { (m->ip, m->cs) = (0, 0); }
 2nd call and later:
   if (rip_msr) { (m->ip, m->cs) = ((data from msr), regs->cs); }
   else         { (m->ip, m->cs) = (0, 0); }

Plus, after applying [3/28] of your patchset for 2.6.31 (that
removes "m->mcgstatus |= MCG_STATUS_EIPV"), it will be again:

   if (rip_msr) { (m->ip, m->cs) = ((data from msr), 0); }
   else         { (m->ip, m->cs) = (0, 0); }

So I bet this patch does not work stand alone.


Given that:

  1) the ip retrieved by mce_get_rip() is now only used for input of
     mce_log().

  2) code in mce_log():

        if (m->ip) {
                printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                       m->cs, m->ip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->ip);
                printk("\n");
        }

  3) code in mce_cap_init():

        /* Use accurate RIP reporting if available. */
        if ((cap & MCG_EXT_P) && (MCG_NUM_EXT(cap) >= 9))
                rip_msr = MSR_IA32_MCG_EIP;

I guess it would make much sense if we stop mixing RIP and EIP and rename
the mce_get_rip() to mce_get_eip(), and the rip_msr to eip_msr too.

And then it would be acceptable if we print RIP with "!INEXACT!" annotation
instead of printing precise EIP in case of RIPV=0 but EIPV=1.


Thanks,
H.Seto

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-08  8:15   ` Hidetoshi Seto
@ 2009-04-08 10:06     ` Andi Kleen
  2009-04-09  4:59       ` Hidetoshi Seto
  2009-04-23  9:43     ` Huang Ying
  1 sibling, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2009-04-08 10:06 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: ying.huang, hpa, linux-kernel, mingo, tglx

Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> writes:

> Andi Kleen wrote:
>> From: Huang Ying <ying.huang@intel.com>
>> 
>> Return rip/cs if MCG_STATUS_EIPV is set in mce_get_rip(). Remain m->cs
>> if RIP is read from rip_msr.
>
> It means we use "Error IP" as "Return IP" if RIPV=0 but EIPV=1 ...?
> Sounds strange.

It's not return IP, but "reported IP" in this case.

> Why this "forcing EIPV=1" still required?
> I think remaining this line will make something wrong.

Yes it's wrong. I'm dropping this in a later patch.
BTW current CPUs don't support this MSR anyways, it was a P4 only feature.

>

> I guess it would make much sense if we stop mixing RIP and EIP and rename
> the mce_get_rip() to mce_get_eip(), and the rip_msr to eip_msr too.

Ok fair enough.  I admit the code was always a bit dubious.

> And then it would be acceptable if we print RIP with "!INEXACT!" annotation
> instead of printing precise EIP in case of RIPV=0 but EIPV=1.

Please send a patch to do all that.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU
  2009-04-08  3:43   ` Hidetoshi Seto
@ 2009-04-08 10:43     ` Andi Kleen
  2009-04-08 11:30       ` Hidetoshi Seto
  0 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2009-04-08 10:43 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: hpa, linux-kernel, mingo, tglx

Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> writes:
>>  
>> @@ -632,14 +633,14 @@
>>  static void mce_init_timer(void)
>>  {
>>  	struct timer_list *t = &__get_cpu_var(mce_timer);
>> +	int *n = &__get_cpu_var(next_interval);
>>  
>> -	/* data race harmless because everyone sets to the same value */
>> -	if (!next_interval)
>> -		next_interval = check_interval * HZ;
>> -	if (!next_interval)
>
> [plan A]
> Add
> 	if (!check_interval)
> 		return;
>
> Or...
>
>> +	if (!*n)
>> +		*n = check_interval * HZ;
>> +	if (!*n)
>>  		return;


The !*n will return for check_interval == 0 because 0*HZ is still 0 so it should be 
equivalent. Did I miss something?

-Andi


-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU
  2009-04-08 10:43     ` Andi Kleen
@ 2009-04-08 11:30       ` Hidetoshi Seto
  2009-04-08 11:40         ` Andi Kleen
  0 siblings, 1 reply; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-08 11:30 UTC (permalink / raw)
  To: Andi Kleen; +Cc: hpa, linux-kernel, mingo, tglx

Andi Kleen wrote:
>>> +	if (!*n)
>>> +		*n = check_interval * HZ;
>>> +	if (!*n)
>>>  		return;
> 
> The !*n will return for check_interval == 0 because 0*HZ is still 0 so it should be 
> equivalent. Did I miss something?

At First, *n is 0 on boot.
And then soon it will be initialized to non-zero (check_interval * HZ)
by the first if-statement.

After that if check_interval is changed via sysfs while *n is non-zero,
which if-statement runs?


Thanks,
H.Seto

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU
  2009-04-08 11:30       ` Hidetoshi Seto
@ 2009-04-08 11:40         ` Andi Kleen
  0 siblings, 0 replies; 35+ messages in thread
From: Andi Kleen @ 2009-04-08 11:40 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Andi Kleen, hpa, linux-kernel, mingo, tglx

On Wed, Apr 08, 2009 at 08:30:45PM +0900, Hidetoshi Seto wrote:
> Andi Kleen wrote:
> >>> +	if (!*n)
> >>> +		*n = check_interval * HZ;
> >>> +	if (!*n)
> >>>  		return;
> > 
> > The !*n will return for check_interval == 0 because 0*HZ is still 0 so it should be 
> > equivalent. Did I miss something?
> 
> At First, *n is 0 on boot.
> And then soon it will be initialized to non-zero (check_interval * HZ)
> by the first if-statement.
> 
> After that if check_interval is changed via sysfs while *n is non-zero,
> which if-statement runs?

Ok got your point now. The first if (!*n) needs to be dropped indeed.
Will respin that patch.

Thanks.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-08 10:06     ` Andi Kleen
@ 2009-04-09  4:59       ` Hidetoshi Seto
  2009-04-09  7:14         ` Andi Kleen
  0 siblings, 1 reply; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-09  4:59 UTC (permalink / raw)
  To: Andi Kleen; +Cc: ying.huang, hpa, linux-kernel, mingo, tglx

Andi Kleen wrote:
> Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> writes:
> 
>> Andi Kleen wrote:
>>> From: Huang Ying <ying.huang@intel.com>
>>>
>>> Return rip/cs if MCG_STATUS_EIPV is set in mce_get_rip(). Remain m->cs
>>> if RIP is read from rip_msr.
>> It means we use "Error IP" as "Return IP" if RIPV=0 but EIPV=1 ...?
>> Sounds strange.
> 
> It's not return IP, but "reported IP" in this case.

Wait, I'm ashamed to say, it seems we missed the name of instruction
pointer register: The 64bit one is RIP, and the 32bit one is EIP.

Anyway we have proved a major point - It is confusing expression.

>> I guess it would make much sense if we stop mixing RIP and EIP and rename
>> the mce_get_rip() to mce_get_eip(), and the rip_msr to eip_msr too.
> 
> Ok fair enough.  I admit the code was always a bit dubious.
> 
>> And then it would be acceptable if we print RIP with "!INEXACT!" annotation
>> instead of printing precise EIP in case of RIPV=0 but EIPV=1.
> 
> Please send a patch to do all that.

I will.

A trivial question is if you unified 32/64bit mce codes, would you
like to print only "IP %02x:<%016Lx>", or "RIP ..." and "EIP ..." ?


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-09  4:59       ` Hidetoshi Seto
@ 2009-04-09  7:14         ` Andi Kleen
  2009-04-09  9:59           ` Hidetoshi Seto
  0 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2009-04-09  7:14 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Andi Kleen, ying.huang, hpa, linux-kernel, mingo, tglx

On Thu, Apr 09, 2009 at 01:59:32PM +0900, Hidetoshi Seto wrote:
> >> I guess it would make much sense if we stop mixing RIP and EIP and rename
> >> the mce_get_rip() to mce_get_eip(), and the rip_msr to eip_msr too.
> > 
> > Ok fair enough.  I admit the code was always a bit dubious.
> > 
> >> And then it would be acceptable if we print RIP with "!INEXACT!" annotation
> >> instead of printing precise EIP in case of RIPV=0 but EIPV=1.
> > 
> > Please send a patch to do all that.
> 
> I will.
> 
> A trivial question is if you unified 32/64bit mce codes, would you
> like to print only "IP %02x:<%016Lx>", or "RIP ..." and "EIP ..." ?

I would prefer to pt in RIP in both cases, simply because i fear breaking
parsers. I think the 32bit users will survive if their instruction
pointer is reported as "RIP" too.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-09  7:14         ` Andi Kleen
@ 2009-04-09  9:59           ` Hidetoshi Seto
  2009-04-09 10:13             ` Andi Kleen
  0 siblings, 1 reply; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-09  9:59 UTC (permalink / raw)
  To: Andi Kleen; +Cc: ying.huang, hpa, linux-kernel, mingo, tglx

Andi Kleen wrote:
> On Thu, Apr 09, 2009 at 01:59:32PM +0900, Hidetoshi Seto wrote:
>>>> I guess it would make much sense if we stop mixing RIP and EIP and rename
>>>> the mce_get_rip() to mce_get_eip(), and the rip_msr to eip_msr too.
>>> Ok fair enough.  I admit the code was always a bit dubious.
>>>
>>>> And then it would be acceptable if we print RIP with "!INEXACT!" annotation
>>>> instead of printing precise EIP in case of RIPV=0 but EIPV=1.
>>> Please send a patch to do all that.
>> I will.
>>
>> A trivial question is if you unified 32/64bit mce codes, would you
>> like to print only "IP %02x:<%016Lx>", or "RIP ..." and "EIP ..." ?
> 
> I would prefer to pt in RIP in both cases, simply because i fear breaking
> parsers. I think the 32bit users will survive if their instruction
> pointer is reported as "RIP" too.

I see.  I also supposed it will be an issue on parsers.

In the end, since still this is 64bit code, I decided to keep using "RIP"
as the name of 64bit register.
I found there are two main factor in my trouble:

 1) There are no short description for mce_get_rip()
    I thought the purpose of the function is getting "Return/Restart IP"
    because it worked only if RIPV(Restart IP Valid) bit is set.

 2) The following initialization let me wrong:
      rip_msr = MSR_IA32_MCG_EIP;
    I imagined that the "r" is typo or there are special meaning in the "r".

Following is a proposal version. Maybe dividing it into 2 pieces, function
improvement and MSR definition, would be a good idea.

Comments are welcomed.


Thanks,
H.Seto


[PATCH] x86: MCE: Improve mce_get_rip v2

The mce_get_rip() is a function to get the address of instruction
at the time of the machine check.  Usually the address is stored
on the stack, but it may not always valid.
We can trust the value if MCG_STATUS_RIPV is set, because it means
we can restart the program from the address.

This patch adds new logics:

 - Return rip/cs on the stack if MCG_STATUS_EIPV is set.
   Even the RIPV is not set, EIPV means the address is directly
   associated with the error.
 - Remain m->cs even if accurate RIP is available in rip_msr.

Strictly speaking, in processor with Intel 64 support there are no
MSR named IA32_MCG_EIP, but an alias MSR named IA32_MCG_RIP.
Add definitions for MSRs in the 64bit processor, following the
specification.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 arch/x86/include/asm/msr-index.h    |   28 +++++++++++++++++++++++++++-
 arch/x86/kernel/cpu/mcheck/mce_64.c |   20 ++++++++++++--------
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ec41fc1..f5cf25f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -244,7 +244,7 @@
 #define MSR_P6_EVNTSEL0			0x00000186
 #define MSR_P6_EVNTSEL1			0x00000187
 
-/* P4/Xeon+ specific */
+/* Extended Machine Check State MSRs (P4/Xeon+ specific) */
 #define MSR_IA32_MCG_EAX		0x00000180
 #define MSR_IA32_MCG_EBX		0x00000181
 #define MSR_IA32_MCG_ECX		0x00000182
@@ -257,6 +257,32 @@
 #define MSR_IA32_MCG_EIP		0x00000189
 #define MSR_IA32_MCG_RESERVED		0x0000018a
 
+/* Extended Machine Check State MSRs (in processors with 64bit support) */
+#define MSR_IA32_MCG_RAX		0x00000180
+#define MSR_IA32_MCG_RBX		0x00000181
+#define MSR_IA32_MCG_RCX		0x00000182
+#define MSR_IA32_MCG_RDX		0x00000183
+#define MSR_IA32_MCG_RSI		0x00000184
+#define MSR_IA32_MCG_RDI		0x00000185
+#define MSR_IA32_MCG_RBP		0x00000186
+#define MSR_IA32_MCG_RSP		0x00000187
+#define MSR_IA32_MCG_RFLAGS		0x00000188
+#define MSR_IA32_MCG_RIP		0x00000189
+#define MSR_IA32_MCG_MISC		0x0000018a
+#define MSR_IA32_MCG_RESERVED1		0x0000018b
+#define MSR_IA32_MCG_RESERVED2		0x0000018c
+#define MSR_IA32_MCG_RESERVED3		0x0000018d
+#define MSR_IA32_MCG_RESERVED4		0x0000018e
+#define MSR_IA32_MCG_RESERVED5		0x0000018f
+#define MSR_IA32_MCG_R8			0x00000190
+#define MSR_IA32_MCG_R9			0x00000191
+#define MSR_IA32_MCG_R10		0x00000192
+#define MSR_IA32_MCG_R11		0x00000193
+#define MSR_IA32_MCG_R12		0x00000194
+#define MSR_IA32_MCG_R13		0x00000195
+#define MSR_IA32_MCG_R14		0x00000196
+#define MSR_IA32_MCG_R15		0x00000197
+
 /* Pentium IV performance counter MSRs */
 #define MSR_P4_BPU_PERFCTR0		0x00000300
 #define MSR_P4_BPU_PERFCTR1		0x00000301
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 95b81eb..374ef6d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -173,21 +173,22 @@ int mce_available(struct cpuinfo_x86 *c)
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
+/*
+ * Get the address of instruction at the time of the machine check error.
+ */
 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 {
-	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
+	/* Use value on the stack if it is meaningful. */
+	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV | MCG_STATUS_EIPV))) {
 		m->ip = regs->ip;
 		m->cs = regs->cs;
 	} else {
 		m->ip = 0;
 		m->cs = 0;
 	}
-	if (rip_msr) {
-		/* Assume the RIP in the MSR is exact. Is this true? */
-		m->mcgstatus |= MCG_STATUS_EIPV;
+	/* Use accurate value if available. */
+	if (rip_msr)
 		rdmsrl(rip_msr, m->ip);
-		m->cs = 0;
-	}
 }
 
 /*
@@ -569,9 +570,12 @@ static int mce_cap_init(void)
 		memset(bank, 0xff, banks * sizeof(u64));
 	}
 
-	/* Use accurate RIP reporting if available. */
+	/*
+	 * Use Extended Machine Check State Register to get accurate state of
+	 * the RIP register at the time of the machine check if available.
+	 */
 	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
-		rip_msr = MSR_IA32_MCG_EIP;
+		rip_msr = MSR_IA32_MCG_RIP;
 
 	return 0;
 }
-- 
1.6.2.2


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-09  9:59           ` Hidetoshi Seto
@ 2009-04-09 10:13             ` Andi Kleen
  2009-04-10  4:38               ` Hidetoshi Seto
  0 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2009-04-09 10:13 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Andi Kleen, ying.huang, hpa, linux-kernel, mingo, tglx

> Following is a proposal version. Maybe dividing it into 2 pieces, function
> improvement and MSR definition, would be a good idea.

I don't think we need the full MSR definitions right now, at least 
I don't have any plans to support them. After all current CPUs
don't.

The rest looks good.

-Andi

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU v2
  2009-04-07 15:06 ` [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU Andi Kleen
  2009-04-08  3:43   ` Hidetoshi Seto
@ 2009-04-09 10:28   ` Andi Kleen
  1 sibling, 0 replies; 35+ messages in thread
From: Andi Kleen @ 2009-04-09 10:28 UTC (permalink / raw)
  To: hpa; +Cc: linux-kernel, mingo, tglx, seto.hidetoshi


This v2 version fixes the check_interval == 0 case noticed by Seto-san.
Please apply.

-Andi

---

x86: MCE: Make polling timer interval per CPU v2

Impact: bug fix

The polling timer while running per CPU still uses a global next_interval
variable, which lead to some CPUs either polling too fast or too slow.
This was not a serious problem because all errors get picked up eventually,
but it's still better to avoid it. Turn next_interval into a per cpu variable.

v2: Fix check_interval == 0 case (Hidetoshi Seto)

Signed-off-by: Andi Kleen <ak@linux.intel.com>

---
 arch/x86/kernel/cpu/mcheck/mce_64.c |   24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-09 11:43:58.000000000 +0200
+++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c	2009-04-09 12:16:33.000000000 +0200
@@ -452,13 +452,14 @@
  */
 
 static int check_interval = 5 * 60; /* 5 minutes */
-static int next_interval; /* in jiffies */
+static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
 static void mcheck_timer(unsigned long);
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
 static void mcheck_timer(unsigned long data)
 {
 	struct timer_list *t = &per_cpu(mce_timer, data);
+	int *n;
 
 	WARN_ON(smp_processor_id() != data);
 
@@ -470,14 +471,14 @@
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
 	 * polling interval, otherwise increase the polling interval.
 	 */
+	n = &__get_cpu_var(next_interval);
 	if (mce_notify_user()) {
-		next_interval = max(next_interval/2, HZ/100);
+		*n = max(*n/2, HZ/100);
 	} else {
-		next_interval = min(next_interval * 2,
-				(int)round_jiffies_relative(check_interval*HZ));
+		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
 	}
 
-	t->expires = jiffies + next_interval;
+	t->expires = jiffies + *n;
 	add_timer(t);
 }
 
@@ -632,14 +633,13 @@
 static void mce_init_timer(void)
 {
 	struct timer_list *t = &__get_cpu_var(mce_timer);
+	int *n = &__get_cpu_var(next_interval);
 
-	/* data race harmless because everyone sets to the same value */
-	if (!next_interval)
-		next_interval = check_interval * HZ;
-	if (!next_interval)
+	*n = check_interval * HZ;
+	if (!*n)
 		return;
 	setup_timer(t, mcheck_timer, smp_processor_id());
-	t->expires = round_jiffies(jiffies + next_interval);
+	t->expires = round_jiffies(jiffies + *n);
 	add_timer(t);
 }
 
@@ -907,7 +907,6 @@
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-	next_interval = check_interval * HZ;
 	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
@@ -1110,7 +1109,8 @@
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
-		t->expires = round_jiffies(jiffies + next_interval);
+		t->expires = round_jiffies(jiffies +
+						__get_cpu_var(next_interval));
 		add_timer_on(t, cpu);
 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
 		break;

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-09 10:13             ` Andi Kleen
@ 2009-04-10  4:38               ` Hidetoshi Seto
  2009-04-10  8:25                 ` Andi Kleen
  0 siblings, 1 reply; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-10  4:38 UTC (permalink / raw)
  To: Andi Kleen; +Cc: ying.huang, hpa, linux-kernel, mingo, tglx

Andi Kleen wrote:
>> Following is a proposal version. Maybe dividing it into 2 pieces, function
>> improvement and MSR definition, would be a good idea.
> 
> I don't think we need the full MSR definitions right now, at least 
> I don't have any plans to support them. After all current CPUs
> don't.
> 
> The rest looks good.

Thanks.

I still believe that using MSR which only available on 32bit from 64bit
code is not right thing.  However this is not logical bug, and adding
definition is not suitable for 2.6.30.  I'd like to defer the MSR part
to the next time.

BTW, since this patch is "Improve", I think you need to clarify why you
bind it into the "bugfix" patch set for 2.6.30.  If there are known bug,
please describe about it.

Thanks,
H.Seto

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-10  4:38               ` Hidetoshi Seto
@ 2009-04-10  8:25                 ` Andi Kleen
  2009-04-10  9:49                   ` Hidetoshi Seto
  0 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2009-04-10  8:25 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Andi Kleen, ying.huang, hpa, linux-kernel, mingo, tglx

On Fri, Apr 10, 2009 at 01:38:07PM +0900, Hidetoshi Seto wrote:
> Andi Kleen wrote:
> >> Following is a proposal version. Maybe dividing it into 2 pieces, function
> >> improvement and MSR definition, would be a good idea.
> > 
> > I don't think we need the full MSR definitions right now, at least 
> > I don't have any plans to support them. After all current CPUs
> > don't.
> > 
> > The rest looks good.
> 
> Thanks.
> 
> I still believe that using MSR which only available on 32bit from 64bit

The MSR is available on 64bit too (there are 64bit capable P4s
like Prescott or Smithfield) 

> code is not right thing.  However this is not logical bug, and adding
> definition is not suitable for 2.6.30.  I'd like to defer the MSR part
> to the next time.
> 
> BTW, since this patch is "Improve", I think you need to clarify why you
> bind it into the "bugfix" patch set for 2.6.30.  If there are known bug,
> please describe about it.

It reports the incorrect RIP, fixing one of the test cases in the 
MCE regression test suite.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-10  8:25                 ` Andi Kleen
@ 2009-04-10  9:49                   ` Hidetoshi Seto
  0 siblings, 0 replies; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-10  9:49 UTC (permalink / raw)
  To: Andi Kleen; +Cc: ying.huang, hpa, linux-kernel, mingo, tglx

Andi Kleen wrote:
> On Fri, Apr 10, 2009 at 01:38:07PM +0900, Hidetoshi Seto wrote:
>> I still believe that using MSR which only available on 32bit from 64bit
> 
> The MSR is available on 64bit too (there are 64bit capable P4s
> like Prescott or Smithfield) 

The problem is the name which the MSR locating address 189H has.
Do you mind if I put this topic on the back-burner?

>> BTW, since this patch is "Improve", I think you need to clarify why you
>> bind it into the "bugfix" patch set for 2.6.30.  If there are known bug,
>> please describe about it.
> 
> It reports the incorrect RIP, fixing one of the test cases in the 
> MCE regression test suite.

Why is it incorrect?  In what case, what result is expected because why,
and how this patch fix it?


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-08  8:15   ` Hidetoshi Seto
  2009-04-08 10:06     ` Andi Kleen
@ 2009-04-23  9:43     ` Huang Ying
  2009-04-24  6:16       ` Hidetoshi Seto
  1 sibling, 1 reply; 35+ messages in thread
From: Huang Ying @ 2009-04-23  9:43 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Hidetoshi Seto, hpa, linux-kernel, mingo, tglx

[-- Attachment #1: Type: text/plain, Size: 2691 bytes --]

Add some description for the patch, hope that to be more clear.

Best Regards,
Huang Ying
--------------------------------------------->
mce_get_rip() is used to get IP when MCE is generated, usually from
the stack. But the IP on the stack is not always valid.
MCG_STATUS_RIPV indicates program can restart from the IP on the stack,
so if it is set, the IP is valid. MCG_STATUS_EIPV indicate IP on the
stack is directly associated with the error, so if it is set, the IP
is valid too.

In current implementation, no IP will be returned (and then reported)
if MCG_STATUS_RIPV is not set and MCG_STATUS_EIPV is set. This patch
fixes this issue by returning IP on the stack when MCG_STATUS_EIPV is
set.

In some CPU, a MSR (rip_msr) provides another way to get IP when MCE
is generated. This is used by mce_get_rip() too.

There is no MSR for CS, in current implementation, if rip_msr is used
to get IP, reported CS is set to 0. But in fact, the CS on the stack
can be trusted if MCG_STATUS_RIPV or MCG_STATUS_EIPV is set. This
patch fixes this issue by keeping reported CS when rip_msr is used.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>

---
 arch/x86/kernel/cpu/mcheck/mce_64.c |   18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -173,21 +173,22 @@ int mce_available(struct cpuinfo_x86 *c)
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
+/*
+ * Get the address of instruction at the time of the machine check error.
+ */
 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 {
-	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
+	/* Use value on the stack if it is meaningful. */
+	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV | MCG_STATUS_EIPV))) {
 		m->ip = regs->ip;
 		m->cs = regs->cs;
 	} else {
 		m->ip = 0;
 		m->cs = 0;
 	}
-	if (rip_msr) {
-		/* Assume the RIP in the MSR is exact. Is this true? */
-		m->mcgstatus |= MCG_STATUS_EIPV;
+	/* Use accurate value if available. */
+	if (rip_msr)
 		rdmsrl(rip_msr, m->ip);
-		m->cs = 0;
-	}
 }
 
 /*
@@ -569,7 +570,10 @@ static int mce_cap_init(void)
 		memset(bank, 0xff, banks * sizeof(u64));
 	}
 
-	/* Use accurate RIP reporting if available. */
+	/*
+	 * Use Extended Machine Check State Register to get accurate state of
+	 * the RIP register at the time of the machine check if available.
+	 */
 	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 		rip_msr = MSR_IA32_MCG_EIP;
 


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC
  2009-04-07 15:06 ` [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC Andi Kleen
@ 2009-04-23  9:43   ` Huang Ying
  2009-04-23 20:49     ` H. Peter Anvin
  2009-04-24  0:27     ` Hidetoshi Seto
  0 siblings, 2 replies; 35+ messages in thread
From: Huang Ying @ 2009-04-23  9:43 UTC (permalink / raw)
  To: Andi Kleen; +Cc: hpa, linux-kernel, mingo, tglx

[-- Attachment #1: Type: text/plain, Size: 1330 bytes --]

Add some description for the patch, hope that to be more clear.

Best Regards,
Huang Ying
-------------------------------------------------->
Impact: Spec compliance

Tolerant level 0 means: always panic on uncorrected errors, that is,
panic even for recoverable uncorrected errors. This is a useful option
for someone think panic is the better hardware error containment
mechanism than trying to recover.

Current implementation does not comply with the tolerant == 0 spec,
that is, it tries to recover (by killing related processes) for
recoverable uncorrected errors (errors triggered in userspace) when
tolerant == 0. This patch fixes this by going panic for that case.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>

---
 arch/x86/kernel/cpu/mcheck/mce_64.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -400,7 +400,7 @@ void do_machine_check(struct pt_regs * r
 		 * force_sig() takes an awful lot of locks and has a slight
 		 * risk of deadlocking.
 		 */
-		if (user_space) {
+		if (user_space && tolerant > 0) {
 			force_sig(SIGBUS, current);
 		} else if (panic_on_oops || tolerant < 2) {
 			mce_panic("Uncorrected machine check",


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC
  2009-04-23  9:43   ` Huang Ying
@ 2009-04-23 20:49     ` H. Peter Anvin
  2009-04-24  8:35       ` Andi Kleen
  2009-04-24  0:27     ` Hidetoshi Seto
  1 sibling, 1 reply; 35+ messages in thread
From: H. Peter Anvin @ 2009-04-23 20:49 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, linux-kernel, mingo, tglx

Huang Ying wrote:
> Add some description for the patch, hope that to be more clear.

The patch description makes sense.  I cannot, however, make head or tail 
out of the subject line or how it is related to the patch at all?

I have rewritten it to "always panic on tolerant == 0", ok with everyone?

	-hpa

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC
  2009-04-23  9:43   ` Huang Ying
  2009-04-23 20:49     ` H. Peter Anvin
@ 2009-04-24  0:27     ` Hidetoshi Seto
  2009-04-24  1:11       ` Huang Ying
  1 sibling, 1 reply; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-24  0:27 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, hpa, linux-kernel, mingo, tglx

Huang Ying wrote:
> Add some description for the patch, hope that to be more clear.
> 
> Best Regards,
> Huang Ying
> -------------------------------------------------->
> Impact: Spec compliance
> 
> Tolerant level 0 means: always panic on uncorrected errors, that is,
> panic even for recoverable uncorrected errors. This is a useful option
> for someone think panic is the better hardware error containment
> mechanism than trying to recover.
> 
> Current implementation does not comply with the tolerant == 0 spec,
> that is, it tries to recover (by killing related processes) for
> recoverable uncorrected errors (errors triggered in userspace) when
> tolerant == 0. This patch fixes this by going panic for that case.
> 
> Signed-off-by: Huang Ying <ying.huang@intel.com>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> 
> ---
>  arch/x86/kernel/cpu/mcheck/mce_64.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> --- a/arch/x86/kernel/cpu/mcheck/mce_64.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
> @@ -400,7 +400,7 @@ void do_machine_check(struct pt_regs * r
>  		 * force_sig() takes an awful lot of locks and has a slight
>  		 * risk of deadlocking.
>  		 */
> -		if (user_space) {
> +		if (user_space && tolerant > 0) {
>  			force_sig(SIGBUS, current);
>  		} else if (panic_on_oops || tolerant < 2) {
>  			mce_panic("Uncorrected machine check",
> 

Wait, I want confirmation.

Given:
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors

Let's walk do_machine_check():

    266 void do_machine_check(struct pt_regs * regs, long error_code)
    267 {
	:
    302         for (i = 0; i < banks; i++) {
	:
    311                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
    312                 if ((m.status & MCI_STATUS_VAL) == 0)
    313                         continue;
	:
    319                 if ((m.status & MCI_STATUS_UC) == 0)
    320                         continue;
	:
# Now we start checking status with VAL and UC
	:
    329                 if (m.status & MCI_STATUS_EN) {
    330                         /* if PCC was set, there's no way out */
    331                         no_way_out |= !!(m.status & MCI_STATUS_PCC);
    332                         /*
    333                          * If this error was uncorrectable and there was
    334                          * an overflow, we're in trouble.  If no overflow,
    335                          * we might get away with just killing a task.
    336                          */
    337                         if (m.status & MCI_STATUS_UC) {
    338                                 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
    339                                         no_way_out = 1;
    340                                 kill_it = 1;
    341                         }
    342                 } else {
    343                         /*
    344                          * Machine check event was not enabled. Clear, but
    345                          * ignore.
    346                          */
    347                         continue;
    348                 }
	:
# Humm, second UC check should be removed...
# Anyway, in case of tolerant == 0, no_way_out == 1 if the event is enabled.
# And kill_it == 1 unless there are no event enabled.
# Therefore, in case of tolerant == 0, always "no_way_out == kill_it".
	:
    364                 }
    365         }
	:
    376         if (no_way_out && tolerant < 3)
    377                 mce_panic("Machine check", &panicm, mcestart);
	:
# in case of tolerant == 0, we usually hit here.
	:
    385         if (kill_it && tolerant < 3) {
    386                 int user_space = 0;
    387
    388                 /*
    389                  * If the EIPV bit is set, it means the saved IP is the
    390                  * instruction which caused the MCE.
    391                  */
    392                 if (m.mcgstatus & MCG_STATUS_EIPV)
    393                         user_space = panicm.ip && (panicm.cs & 3);
    394
    395                 /*
    396                  * If we know that the error was in user space, send a
    397                  * SIGBUS.  Otherwise, panic if tolerance is low.
    398                  *
    399                  * force_sig() takes an awful lot of locks and has a slight
    400                  * risk of deadlocking.
    401                  */
    402                 if (user_space) {
    403                         force_sig(SIGBUS, current);
    404                 } else if (panic_on_oops || tolerant < 2) {
    405                         mce_panic("Uncorrected machine check",
    406                                 &panicm, mcestart);
    407                 }
    408         }
	:
# Then, when we enter here with tolerant == 0 ?
	:
    421 }

Or, should this patch be applied after committing some of Andi's patches?
It means this patch targets a bug in Andi's patch set and the bug is not
in 2.6.30-rc* yet.


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC
  2009-04-24  0:27     ` Hidetoshi Seto
@ 2009-04-24  1:11       ` Huang Ying
  2009-04-24  5:40         ` H. Peter Anvin
  0 siblings, 1 reply; 35+ messages in thread
From: Huang Ying @ 2009-04-24  1:11 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Andi Kleen, hpa, linux-kernel, mingo, tglx

[-- Attachment #1: Type: text/plain, Size: 353 bytes --]

On Fri, 2009-04-24 at 08:27 +0800, Hidetoshi Seto wrote:
> Huang Ying wrote:
>
> Or, should this patch be applied after committing some of Andi's patches?
> It means this patch targets a bug in Andi's patch set and the bug is not
> in 2.6.30-rc* yet.

Yes. I think you are right. I fix a "bug" that doesn't exist.

Best Regards,
Huang Ying


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC
  2009-04-24  1:11       ` Huang Ying
@ 2009-04-24  5:40         ` H. Peter Anvin
  2009-04-24  8:46           ` Andi Kleen
  0 siblings, 1 reply; 35+ messages in thread
From: H. Peter Anvin @ 2009-04-24  5:40 UTC (permalink / raw)
  To: Huang Ying; +Cc: Hidetoshi Seto, Andi Kleen, linux-kernel, mingo, tglx

Huang Ying wrote:
> On Fri, 2009-04-24 at 08:27 +0800, Hidetoshi Seto wrote:
>> Huang Ying wrote:
>>
>> Or, should this patch be applied after committing some of Andi's patches?
>> It means this patch targets a bug in Andi's patch set and the bug is not
>> in 2.6.30-rc* yet.
> 
> Yes. I think you are right. I fix a "bug" that doesn't exist.
> 

Thanks for catching that now :)

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-23  9:43     ` Huang Ying
@ 2009-04-24  6:16       ` Hidetoshi Seto
  2009-04-24  6:35         ` Huang Ying
  0 siblings, 1 reply; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-24  6:16 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, hpa, linux-kernel, mingo, tglx

Huang Ying wrote:
> Add some description for the patch, hope that to be more clear.
> 
> Best Regards,
> Huang Ying
> --------------------------------------------->
> mce_get_rip() is used to get IP when MCE is generated, usually from
> the stack. But the IP on the stack is not always valid.
> MCG_STATUS_RIPV indicates program can restart from the IP on the stack,
> so if it is set, the IP is valid. MCG_STATUS_EIPV indicate IP on the
> stack is directly associated with the error, so if it is set, the IP
> is valid too.
> 
> In current implementation, no IP will be returned (and then reported)
> if MCG_STATUS_RIPV is not set and MCG_STATUS_EIPV is set. This patch
> fixes this issue by returning IP on the stack when MCG_STATUS_EIPV is
> set.
> 
> In some CPU, a MSR (rip_msr) provides another way to get IP when MCE
> is generated. This is used by mce_get_rip() too.
> 
> There is no MSR for CS, in current implementation, if rip_msr is used
> to get IP, reported CS is set to 0. But in fact, the CS on the stack
> can be trusted if MCG_STATUS_RIPV or MCG_STATUS_EIPV is set. This
> patch fixes this issue by keeping reported CS when rip_msr is used.

So the bug is in short:
 In some cases no IP/CS reported even there were valid records.
Right?

Then in other words it will mean lost of error information, that is not
good for error investigation.

One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
really invalid value, or is it still point IP when MCE is generated?
I suppose it is not invalid.  If a processor encounters MCE and if it
is not sure what happened, then it will store the IP on the stack,
indicating neither of flags.

If this supposition is correct, the best way is pick the value on
the stack unconditionally, and record valid flags together.


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-24  6:16       ` Hidetoshi Seto
@ 2009-04-24  6:35         ` Huang Ying
  2009-04-24  7:28           ` Hidetoshi Seto
  0 siblings, 1 reply; 35+ messages in thread
From: Huang Ying @ 2009-04-24  6:35 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Andi Kleen, hpa, linux-kernel, mingo, tglx

[-- Attachment #1: Type: text/plain, Size: 2222 bytes --]

On Fri, 2009-04-24 at 14:16 +0800, Hidetoshi Seto wrote:
> Huang Ying wrote:
> > Add some description for the patch, hope that to be more clear.
> > 
> > Best Regards,
> > Huang Ying
> > --------------------------------------------->
> > mce_get_rip() is used to get IP when MCE is generated, usually from
> > the stack. But the IP on the stack is not always valid.
> > MCG_STATUS_RIPV indicates program can restart from the IP on the stack,
> > so if it is set, the IP is valid. MCG_STATUS_EIPV indicate IP on the
> > stack is directly associated with the error, so if it is set, the IP
> > is valid too.
> > 
> > In current implementation, no IP will be returned (and then reported)
> > if MCG_STATUS_RIPV is not set and MCG_STATUS_EIPV is set. This patch
> > fixes this issue by returning IP on the stack when MCG_STATUS_EIPV is
> > set.
> > 
> > In some CPU, a MSR (rip_msr) provides another way to get IP when MCE
> > is generated. This is used by mce_get_rip() too.
> > 
> > There is no MSR for CS, in current implementation, if rip_msr is used
> > to get IP, reported CS is set to 0. But in fact, the CS on the stack
> > can be trusted if MCG_STATUS_RIPV or MCG_STATUS_EIPV is set. This
> > patch fixes this issue by keeping reported CS when rip_msr is used.
> 
> So the bug is in short:
>  In some cases no IP/CS reported even there were valid records.
> Right?
> 
> Then in other words it will mean lost of error information, that is not
> good for error investigation.
> 
> One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
> really invalid value, or is it still point IP when MCE is generated?
> I suppose it is not invalid.  If a processor encounters MCE and if it
> is not sure what happened, then it will store the IP on the stack,
> indicating neither of flags.
> 
> If this supposition is correct, the best way is pick the value on
> the stack unconditionally, and record valid flags together.

According to spec, the IP on stack can be not related to MCE if
(RIPV,EIPV) = (0,0). So it is meaningless to report them. If you report
them unconditionally, you just push the logic to user space or
administrator.

Best Regards,
Huang Ying


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-24  6:35         ` Huang Ying
@ 2009-04-24  7:28           ` Hidetoshi Seto
  2009-04-24  8:50             ` Andi Kleen
  2009-04-24  8:52             ` Huang Ying
  0 siblings, 2 replies; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-24  7:28 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, hpa, linux-kernel, mingo, tglx

Huang Ying wrote:
> On Fri, 2009-04-24 at 14:16 +0800, Hidetoshi Seto wrote:
>> One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
>> really invalid value, or is it still point IP when MCE is generated?
>> I suppose it is not invalid.  If a processor encounters MCE and if it
>> is not sure what happened, then it will store the IP on the stack,
>> indicating neither of flags.
>>
>> If this supposition is correct, the best way is pick the value on
>> the stack unconditionally, and record valid flags together.
> 
> According to spec, the IP on stack can be not related to MCE if
> (RIPV,EIPV) = (0,0). So it is meaningless to report them. If you report
> them unconditionally, you just push the logic to user space or
> administrator.

Sorry, I could not find good page in the spec (Intel64 and IA-32 ASDM)...
Could you point one?

I believe that the IP with (RIPV,EIPV) = (1,0) is "not associated with the
error" too, so is it meaningless to report the IP?
If you think so then correct fix is replacing RIPV check by EIPV check.

>From another point of view, the reported IP will be one of followings:
  - IP that associated with error (= related to MCE)
  - IP that the interrupted program can restart from
  - IP that when MCE is generated
Are there no way to distinguish them in user space?
And which is the one that we must report to the user space?
Which is one that must not?

You stated in the description of this patch:
> mce_get_rip() is used to get IP when MCE is generated,
Is this right?

If we have no answer here, we should report the IP unconditionally,
not to lost any error information.


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC
  2009-04-23 20:49     ` H. Peter Anvin
@ 2009-04-24  8:35       ` Andi Kleen
  0 siblings, 0 replies; 35+ messages in thread
From: Andi Kleen @ 2009-04-24  8:35 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Huang Ying, Andi Kleen, linux-kernel, mingo, tglx

On Thu, Apr 23, 2009 at 01:49:33PM -0700, H. Peter Anvin wrote:
> Huang Ying wrote:
> >Add some description for the patch, hope that to be more clear.
> 
> The patch description makes sense.  I cannot, however, make head or tail 
> out of the subject line or how it is related to the patch at all?
> 
> I have rewritten it to "always panic on tolerant == 0", ok with everyone?

Ok for me.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC
  2009-04-24  5:40         ` H. Peter Anvin
@ 2009-04-24  8:46           ` Andi Kleen
  2009-04-24 10:30             ` Hidetoshi Seto
  0 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2009-04-24  8:46 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Huang Ying, Hidetoshi Seto, Andi Kleen, linux-kernel, mingo, tglx

On Thu, Apr 23, 2009 at 10:40:10PM -0700, H. Peter Anvin wrote:
> Huang Ying wrote:
> > On Fri, 2009-04-24 at 08:27 +0800, Hidetoshi Seto wrote:
> >> Huang Ying wrote:
> >>
> >> Or, should this patch be applied after committing some of Andi's patches?
> >> It means this patch targets a bug in Andi's patch set and the bug is not
> >> in 2.6.30-rc* yet.
> > 
> > Yes. I think you are right. I fix a "bug" that doesn't exist.
> > 
> 
> Thanks for catching that now :)

Sorry we still need it I think, see previous email.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-24  7:28           ` Hidetoshi Seto
@ 2009-04-24  8:50             ` Andi Kleen
  2009-04-24  8:52             ` Huang Ying
  1 sibling, 0 replies; 35+ messages in thread
From: Andi Kleen @ 2009-04-24  8:50 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Huang Ying, Andi Kleen, hpa, linux-kernel, mingo, tglx

On Fri, Apr 24, 2009 at 04:28:56PM +0900, Hidetoshi Seto wrote:
> Huang Ying wrote:
> > On Fri, 2009-04-24 at 14:16 +0800, Hidetoshi Seto wrote:
> >> One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
> >> really invalid value, or is it still point IP when MCE is generated?
> >> I suppose it is not invalid.  If a processor encounters MCE and if it
> >> is not sure what happened, then it will store the IP on the stack,
> >> indicating neither of flags.
> >>
> >> If this supposition is correct, the best way is pick the value on
> >> the stack unconditionally, and record valid flags together.
> > 
> > According to spec, the IP on stack can be not related to MCE if
> > (RIPV,EIPV) = (0,0). So it is meaningless to report them. If you report
> > them unconditionally, you just push the logic to user space or
> > administrator.
> 
> Sorry, I could not find good page in the spec (Intel64 and IA-32 ASDM)...
> Could you point one?
> 
> I believe that the IP with (RIPV,EIPV) = (1,0) is "not associated with the
> error" too, so is it meaningless to report the IP?

Historical background:

We used to not report RIP on EIPV=1 traditionally (back in 2004 or so
when I wrote that code). But because most x86s don't
set EIPVs and don't guarantee it's related the RIP was never reported.  

But a few people asked for reporting it anyways even with EIPV=0 because e.g. 
when you get a MCE on MMIO in a driver due to broken hardware the RIP tends to 
be still nearby or at the MMIO access. So you can see roughly what went wrong.
It just warns about this by adding the !INEXACT! marker.

> If you think so then correct fix is replacing RIPV check by EIPV check.

Nope.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-24  7:28           ` Hidetoshi Seto
  2009-04-24  8:50             ` Andi Kleen
@ 2009-04-24  8:52             ` Huang Ying
  2009-04-24 10:11               ` Hidetoshi Seto
  1 sibling, 1 reply; 35+ messages in thread
From: Huang Ying @ 2009-04-24  8:52 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Andi Kleen, hpa, linux-kernel, mingo, tglx

[-- Attachment #1: Type: text/plain, Size: 1713 bytes --]

On Fri, 2009-04-24 at 15:28 +0800, Hidetoshi Seto wrote:
> Huang Ying wrote:
> > On Fri, 2009-04-24 at 14:16 +0800, Hidetoshi Seto wrote:
> >> One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
> >> really invalid value, or is it still point IP when MCE is generated?
> >> I suppose it is not invalid.  If a processor encounters MCE and if it
> >> is not sure what happened, then it will store the IP on the stack,
> >> indicating neither of flags.
> >>
> >> If this supposition is correct, the best way is pick the value on
> >> the stack unconditionally, and record valid flags together.
> > 
> > According to spec, the IP on stack can be not related to MCE if
> > (RIPV,EIPV) = (0,0). So it is meaningless to report them. If you report
> > them unconditionally, you just push the logic to user space or
> > administrator.
> 
> Sorry, I could not find good page in the spec (Intel64 and IA-32 ASDM)...
> Could you point one?

14.3.1.2 IA32_MCG_STATUS MSR
* EIPV

> I believe that the IP with (RIPV,EIPV) = (1,0) is "not associated with the
> error" too, so is it meaningless to report the IP?
> If you think so then correct fix is replacing RIPV check by EIPV check.

In theory, that is possible (not associated), but I think in practical,
IP with (RIPV,EIPV) = (1,0) is still meaningful as Andi said.

> From another point of view, the reported IP will be one of followings:
>   - IP that associated with error (= related to MCE)
>   - IP that the interrupted program can restart from
>   - IP that when MCE is generated
> Are there no way to distinguish them in user space?

I think you just push same logic to user space.

Best Regards,
Huang Ying


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [3/4] x86: MCE: Improve mce_get_rip
  2009-04-24  8:52             ` Huang Ying
@ 2009-04-24 10:11               ` Hidetoshi Seto
  0 siblings, 0 replies; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-24 10:11 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, hpa, linux-kernel, mingo, tglx

Huang Ying wrote:
> On Fri, 2009-04-24 at 15:28 +0800, Hidetoshi Seto wrote:
>> Huang Ying wrote:
>>> On Fri, 2009-04-24 at 14:16 +0800, Hidetoshi Seto wrote:
>>>> One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
>>>> really invalid value, or is it still point IP when MCE is generated?
>>>> I suppose it is not invalid.  If a processor encounters MCE and if it
>>>> is not sure what happened, then it will store the IP on the stack,
>>>> indicating neither of flags.
>>>>
>>>> If this supposition is correct, the best way is pick the value on
>>>> the stack unconditionally, and record valid flags together.
>>> According to spec, the IP on stack can be not related to MCE if
>>> (RIPV,EIPV) = (0,0). So it is meaningless to report them. If you report
>>> them unconditionally, you just push the logic to user space or
>>> administrator.
>> Sorry, I could not find good page in the spec (Intel64 and IA-32 ASDM)...
>> Could you point one?
> 
> 14.3.1.2 IA32_MCG_STATUS MSR
> * EIPV

Quote:
 "EIPV (error IP valid) flag, bit 1 ― Indicates (when set) that the
  instruction pointed to by the instruction pointer pushed onto the
  stack when the machine-check exception is generated is directly
  associated with the error. When this flag is cleared, the instruction
  pointed to may not be associated with the error."

My understanding is:
 If EIPV is 1:
    IP value on the stack is one pushed when the MCE is generated,
    and the IP is associated with the error.
 If EIPV is 0:
    IP value on the stack is one pushed when the MCE is generated,
    but the IP is not associated with the error.

So I repeat my question again:
You stated in the description of this patch:
   "mce_get_rip() is used to get IP when MCE is generated, ..."
Is this right?

If right, I think EIPV is not matter.
If not, please rewrite the description.

>> I believe that the IP with (RIPV,EIPV) = (1,0) is "not associated with the
>> error" too, so is it meaningless to report the IP?
>> If you think so then correct fix is replacing RIPV check by EIPV check.
> 
> In theory, that is possible (not associated), but I think in practical,
> IP with (RIPV,EIPV) = (1,0) is still meaningful as Andi said.

Then, why IP with (0,0) is meaningless?
Why not use it with the !INEXACT! marker?

>> From another point of view, the reported IP will be one of followings:
>>   - IP that associated with error (= related to MCE)
>>   - IP that the interrupted program can restart from
>>   - IP that when MCE is generated
>> Are there no way to distinguish them in user space?
> 
> I think you just push same logic to user space.

No, I just want a logical explanation.

It seems we already can provide records with "inexact" value.
Why not expand such cases?


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC
  2009-04-24  8:46           ` Andi Kleen
@ 2009-04-24 10:30             ` Hidetoshi Seto
  2009-04-24 16:32               ` H. Peter Anvin
  0 siblings, 1 reply; 35+ messages in thread
From: Hidetoshi Seto @ 2009-04-24 10:30 UTC (permalink / raw)
  To: Andi Kleen; +Cc: H. Peter Anvin, Huang Ying, linux-kernel, mingo, tglx

Andi Kleen wrote:
> On Thu, Apr 23, 2009 at 10:40:10PM -0700, H. Peter Anvin wrote:
>> Huang Ying wrote:
>>> On Fri, 2009-04-24 at 08:27 +0800, Hidetoshi Seto wrote:
>>>> Huang Ying wrote:
>>>>
>>>> Or, should this patch be applied after committing some of Andi's patches?
>>>> It means this patch targets a bug in Andi's patch set and the bug is not
>>>> in 2.6.30-rc* yet.
>>> Yes. I think you are right. I fix a "bug" that doesn't exist.
>>>
>> Thanks for catching that now :)
> 
> Sorry we still need it I think, see previous email.

... Really?
Still I believe this patch is not needed by .30

And, sorry, which email?


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC
  2009-04-24 10:30             ` Hidetoshi Seto
@ 2009-04-24 16:32               ` H. Peter Anvin
  0 siblings, 0 replies; 35+ messages in thread
From: H. Peter Anvin @ 2009-04-24 16:32 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Andi Kleen, Huang Ying, linux-kernel, mingo, tglx

Hidetoshi Seto wrote:
> Andi Kleen wrote:
>> On Thu, Apr 23, 2009 at 10:40:10PM -0700, H. Peter Anvin wrote:
>>> Huang Ying wrote:
>>>> On Fri, 2009-04-24 at 08:27 +0800, Hidetoshi Seto wrote:
>>>>> Huang Ying wrote:
>>>>>
>>>>> Or, should this patch be applied after committing some of Andi's patches?
>>>>> It means this patch targets a bug in Andi's patch set and the bug is not
>>>>> in 2.6.30-rc* yet.
>>>> Yes. I think you are right. I fix a "bug" that doesn't exist.
>>>>
>>> Thanks for catching that now :)
>> Sorry we still need it I think, see previous email.
> 
> ... Really?
> Still I believe this patch is not needed by .30
> 
> And, sorry, which email?
> 

I presume he was referring to your email.
Andi, right?

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2009-04-24 16:33 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-07 15:06 [PATCH] [0/4] x86: MCE: Machine check bug fix series for 2.6.30 Andi Kleen
2009-04-07 15:06 ` [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU Andi Kleen
2009-04-08  3:43   ` Hidetoshi Seto
2009-04-08 10:43     ` Andi Kleen
2009-04-08 11:30       ` Hidetoshi Seto
2009-04-08 11:40         ` Andi Kleen
2009-04-09 10:28   ` [PATCH] [1/4] x86: MCE: Make polling timer interval per CPU v2 Andi Kleen
2009-04-07 15:06 ` [PATCH] [2/4] x86: MCE: Fix boot logging logic Andi Kleen
2009-04-07 15:06 ` [PATCH] [3/4] x86: MCE: Improve mce_get_rip Andi Kleen
2009-04-08  8:15   ` Hidetoshi Seto
2009-04-08 10:06     ` Andi Kleen
2009-04-09  4:59       ` Hidetoshi Seto
2009-04-09  7:14         ` Andi Kleen
2009-04-09  9:59           ` Hidetoshi Seto
2009-04-09 10:13             ` Andi Kleen
2009-04-10  4:38               ` Hidetoshi Seto
2009-04-10  8:25                 ` Andi Kleen
2009-04-10  9:49                   ` Hidetoshi Seto
2009-04-23  9:43     ` Huang Ying
2009-04-24  6:16       ` Hidetoshi Seto
2009-04-24  6:35         ` Huang Ying
2009-04-24  7:28           ` Hidetoshi Seto
2009-04-24  8:50             ` Andi Kleen
2009-04-24  8:52             ` Huang Ying
2009-04-24 10:11               ` Hidetoshi Seto
2009-04-07 15:06 ` [PATCH] [4/4] x86: MCE: Fix EIPV behaviour with !PCC Andi Kleen
2009-04-23  9:43   ` Huang Ying
2009-04-23 20:49     ` H. Peter Anvin
2009-04-24  8:35       ` Andi Kleen
2009-04-24  0:27     ` Hidetoshi Seto
2009-04-24  1:11       ` Huang Ying
2009-04-24  5:40         ` H. Peter Anvin
2009-04-24  8:46           ` Andi Kleen
2009-04-24 10:30             ` Hidetoshi Seto
2009-04-24 16:32               ` H. Peter Anvin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).