linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch 1/2] x86: mce Cleanup timer mess
  2012-05-24 17:54 [patch 0/2] x86: mce: Implement poll mode for CMCI Thomas Gleixner
@ 2012-05-24 17:54 ` Thomas Gleixner
  2012-05-25  6:16   ` Borislav Petkov
  2012-06-04  2:22   ` Chen Gong
  2012-05-24 17:54 ` [patch 2/2] x86: mce: Implement cmci poll mode for intel machines Thomas Gleixner
  1 sibling, 2 replies; 29+ messages in thread
From: Thomas Gleixner @ 2012-05-24 17:54 UTC (permalink / raw)
  To: LKML; +Cc: Chen Gong, tony.luck, bp, x86, Peter Zijlstra

[-- Attachment #1: x86-mce-cleanup-timer-mess.patch --]
[-- Type: text/plain, Size: 3007 bytes --]

Use unsigned long for dealing with jiffies not int. Rename the
callback to something sensible. Use __this_cpu_read/write for
accessing per cpu data.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/mcheck/mce.c |   31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1237,15 +1237,15 @@ void mce_log_therm_throt_event(__u64 sta
  * poller finds an MCE, poll 2x faster.  When the poller finds no more
  * errors, poll 2x slower (up to check_interval seconds).
  */
-static int check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = 5 * 60; /* 5 minutes */
 
-static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mce_start_timer(unsigned long data)
+static void mce_timer_fn(unsigned long data)
 {
-	struct timer_list *t = &per_cpu(mce_timer, data);
-	int *n;
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+	unsigned long iv;
 
 	WARN_ON(smp_processor_id() != data);
 
@@ -1258,13 +1258,14 @@ static void mce_start_timer(unsigned lon
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
 	 * polling interval, otherwise increase the polling interval.
 	 */
-	n = &__get_cpu_var(mce_next_interval);
+	iv = __this_cpu_read(mce_next_interval);
 	if (mce_notify_irq())
-		*n = max(*n/2, HZ/100);
+		iv = max(iv, (unsigned long) HZ/100);
 	else
-		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+	__this_cpu_write(mce_next_interval, iv);
 
-	t->expires = jiffies + *n;
+	t->expires = jiffies + iv;
 	add_timer_on(t, smp_processor_id());
 }
 
@@ -1542,17 +1543,17 @@ static void __mcheck_cpu_init_vendor(str
 static void __mcheck_cpu_init_timer(void)
 {
 	struct timer_list *t = &__get_cpu_var(mce_timer);
-	int *n = &__get_cpu_var(mce_next_interval);
+	unsigned long iv = __this_cpu_read(mce_next_interval);
 
-	setup_timer(t, mce_start_timer, smp_processor_id());
+	setup_timer(t, mce_timer_fn, smp_processor_id());
 
 	if (mce_ignore_ce)
 		return;
 
-	*n = check_interval * HZ;
-	if (!*n)
+	__this_cpu_write(mce_next_interval, iv);
+	if (!iv)
 		return;
-	t->expires = round_jiffies(jiffies + *n);
+	t->expires = round_jiffies(jiffies + iv);
 	add_timer_on(t, smp_processor_id());
 }
 
@@ -2262,7 +2263,7 @@ mce_cpu_callback(struct notifier_block *
 	case CPU_DOWN_FAILED_FROZEN:
 		if (!mce_ignore_ce && check_interval) {
 			t->expires = round_jiffies(jiffies +
-					   __get_cpu_var(mce_next_interval));
+					per_cpu(mce_next_interval, cpu));
 			add_timer_on(t, cpu);
 		}
 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);



^ permalink raw reply	[flat|nested] 29+ messages in thread

* [patch 0/2] x86: mce: Implement poll mode for CMCI
@ 2012-05-24 17:54 Thomas Gleixner
  2012-05-24 17:54 ` [patch 1/2] x86: mce Cleanup timer mess Thomas Gleixner
  2012-05-24 17:54 ` [patch 2/2] x86: mce: Implement cmci poll mode for intel machines Thomas Gleixner
  0 siblings, 2 replies; 29+ messages in thread
From: Thomas Gleixner @ 2012-05-24 17:54 UTC (permalink / raw)
  To: LKML; +Cc: Chen Gong, tony.luck, bp, x86, Peter Zijlstra

This implements poll mode for CMCI after cleaning up the timer mess in
mce.

Note this is completely untested, but at least it compiles.

Thanks,

	tglx




^ permalink raw reply	[flat|nested] 29+ messages in thread

* [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-05-24 17:54 [patch 0/2] x86: mce: Implement poll mode for CMCI Thomas Gleixner
  2012-05-24 17:54 ` [patch 1/2] x86: mce Cleanup timer mess Thomas Gleixner
@ 2012-05-24 17:54 ` Thomas Gleixner
  2012-05-25  6:24   ` Borislav Petkov
                     ` (2 more replies)
  1 sibling, 3 replies; 29+ messages in thread
From: Thomas Gleixner @ 2012-05-24 17:54 UTC (permalink / raw)
  To: LKML; +Cc: Chen Gong, tony.luck, bp, x86, Peter Zijlstra

[-- Attachment #1: mce-intel-implement-cmci-poll-mode.patch --]
[-- Type: text/plain, Size: 6575 bytes --]

Intentionally left blank to be filled out by someone who wants that and
can explain the reason for this better than me.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/mcheck/mce-internal.h |    8 ++
 arch/x86/kernel/cpu/mcheck/mce.c          |   41 +++++++++++++--
 arch/x86/kernel/cpu/mcheck/mce_intel.c    |   81 +++++++++++++++++++++++++++++-
 3 files changed, 125 insertions(+), 5 deletions(-)

Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,6 +28,14 @@ extern int mce_ser;
 
 extern struct mce_bank *mce_banks;
 
+#ifdef CONFIG_X86_MCE_INTEL
+unsigned long mce_intel_adjust_timer(unsigned long interval);
+#else
+# define mce_intel_adjust_timer mce_adjust_timer_default
+#endif
+
+void mce_timer_kick(unsigned long interval);
+
 #ifdef CONFIG_ACPI_APEI
 int apei_write_mce(struct mce *m);
 ssize_t apei_read_mce(struct mce *m, u64 *record_id);
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1242,6 +1242,14 @@ static unsigned long check_interval = 5 
 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+	return interval;
+}
+
+static unsigned long (*mce_adjust_timer)(unsigned long interval) =
+	mce_adjust_timer_default;
+
 static void mce_timer_fn(unsigned long data)
 {
 	struct timer_list *t = &__get_cpu_var(mce_timer);
@@ -1259,14 +1267,37 @@ static void mce_timer_fn(unsigned long d
 	 * polling interval, otherwise increase the polling interval.
 	 */
 	iv = __this_cpu_read(mce_next_interval);
-	if (mce_notify_irq())
+	if (mce_notify_irq()) {
 		iv = max(iv, (unsigned long) HZ/100);
-	else
+	} else {
 		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+		iv = mce_adjust_timer(iv);
+	}
 	__this_cpu_write(mce_next_interval, iv);
+	/* Might have become 0 after CMCI storm subsided */
+	if (iv) {
+		t->expires = jiffies + iv;
+		add_timer_on(t, smp_processor_id());
+	}
+}
 
-	t->expires = jiffies + iv;
-	add_timer_on(t, smp_processor_id());
+/*
+ * Ensure that the timer is firing in @interval from now.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+	unsigned long when = jiffies + interval;
+	unsigned long iv = __this_cpu_read(mce_next_interval);
+
+	if (time_before(when, t->expires) && timer_pending(t)) {
+		mod_timer(t, when);
+	} else if (!mce_next_interval) {
+		t->expires = round_jiffies(jiffies + iv);
+		add_timer_on(t, smp_processor_id());
+	}
+	if (interval < iv)
+		__this_cpu_write(mce_next_interval, iv);
 }
 
 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -1531,6 +1562,7 @@ static void __mcheck_cpu_init_vendor(str
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
 		mce_intel_feature_init(c);
+		mce_adjust_timer = mce_intel_adjust_timer;
 		break;
 	case X86_VENDOR_AMD:
 		mce_amd_feature_init(c);
@@ -1550,6 +1582,7 @@ static void __mcheck_cpu_init_timer(void
 	if (mce_ignore_ce)
 		return;
 
+	iv = mce_adjust_timer(check_interval * HZ);
 	__this_cpu_write(mce_next_interval, iv);
 	if (!iv)
 		return;
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -15,6 +15,8 @@
 #include <asm/msr.h>
 #include <asm/mce.h>
 
+#include "mce-internal.h"
+
 /*
  * Support for Intel Correct Machine Check Interrupts. This allows
  * the CPU to raise an interrupt when a corrected machine check happened.
@@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_b
  */
 static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
 
-#define CMCI_THRESHOLD 1
+#define CMCI_THRESHOLD		1
+#define CMCI_POLL_INTERVAL	(30 * HZ)
+#define CMCI_STORM_INTERVAL	(1 * HZ)
+#define CMCI_STORM_TRESHOLD	5
+
+static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
+
+enum {
+	CMCI_STORM_NONE,
+	CMCI_STORM_ACTIVE,
+	CMCI_STORM_SUBSIDED,
+};
+
+static atomic_t cmci_storm_on_cpus;
 
 static int cmci_supported(int *banks)
 {
@@ -53,6 +70,66 @@ static int cmci_supported(int *banks)
 	return !!(cap & MCG_CMCI_P);
 }
 
+unsigned long mce_intel_adjust_timer(unsigned long interval)
+{
+	if (interval < CMCI_POLL_INTERVAL)
+		return interval;
+
+	switch (__this_cpu_read(cmci_storm_state)) {
+	case CMCI_STORM_ACTIVE:
+		/*
+		 * We switch back to interrupt mode once the poll timer has
+		 * silenced itself. That means no events recorded and the
+		 * timer interval is back to our poll interval.
+		 */
+		__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
+		atomic_dec(&cmci_storm_on_cpus);
+
+	case CMCI_STORM_SUBSIDED:
+		/*
+		 * We wait for all cpus to go back to SUBSIDED
+		 * state. When that happens we switch back to
+		 * interrupt mode.
+		 */
+		if (!atomic_read(&cmci_storm_on_cpus)) {
+			__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
+			cmci_reenable();
+			cmci_recheck();
+		}
+		return CMCI_POLL_INTERVAL;
+	default:
+		/*
+		 * We have shiny wheather, let the poll do whatever it
+		 * thinks.
+		 */
+		return interval;
+	}
+}
+
+static bool cmci_storm_detect(void)
+{
+	unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
+	unsigned long ts = __this_cpu_read(cmci_time_stamp);
+	unsigned long now = jiffies;
+
+	if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
+		cnt++;
+	} else {
+		cnt = 1;
+		__this_cpu_write(cmci_time_stamp, now);
+	}
+	__this_cpu_write(cmci_storm_cnt, cnt);
+
+	if (cnt <= CMCI_STORM_TRESHOLD)
+		return false;
+
+	cmci_clear();
+	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
+	atomic_inc(&cmci_storm_on_cpus);
+	mce_timer_kick(CMCI_POLL_INTERVAL);
+	return true;
+}
+
 /*
  * The interrupt handler. This is called on every event.
  * Just call the poller directly to log any events.
@@ -61,6 +138,8 @@ static int cmci_supported(int *banks)
  */
 static void intel_threshold_interrupt(void)
 {
+	if (cmci_storm_detect())
+		return;
 	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
 	mce_notify_irq();
 }



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 1/2] x86: mce Cleanup timer mess
  2012-05-24 17:54 ` [patch 1/2] x86: mce Cleanup timer mess Thomas Gleixner
@ 2012-05-25  6:16   ` Borislav Petkov
  2012-06-04  2:22   ` Chen Gong
  1 sibling, 0 replies; 29+ messages in thread
From: Borislav Petkov @ 2012-05-25  6:16 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Chen Gong, tony.luck, bp, x86, Peter Zijlstra

On Thu, May 24, 2012 at 05:54:51PM +0000, Thomas Gleixner wrote:
> Use unsigned long for dealing with jiffies not int. Rename the
> callback to something sensible. Use __this_cpu_read/write for
> accessing per cpu data.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Looks good to me,

thanks.

Acked-by: Borislav Petkov <borislav.petkov@amd.com>

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-05-24 17:54 ` [patch 2/2] x86: mce: Implement cmci poll mode for intel machines Thomas Gleixner
@ 2012-05-25  6:24   ` Borislav Petkov
  2012-05-25  7:31     ` Chen Gong
  2012-05-28  9:47     ` Chen Gong
  2012-05-28  9:52   ` Chen Gong
  2012-06-04  2:37   ` Chen Gong
  2 siblings, 2 replies; 29+ messages in thread
From: Borislav Petkov @ 2012-05-25  6:24 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Chen Gong, tony.luck, x86, Peter Zijlstra

On Thu, May 24, 2012 at 05:54:52PM +0000, Thomas Gleixner wrote:
> Intentionally left blank to be filled out by someone who wants that and
> can explain the reason for this better than me.

That'll be Intel folk :)

> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Looks good to me too, thanks Thomas for doing this!

I'll run it next week just in case.

Acked-by: Borislav Petkov <borislav.petkov@amd.com>

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-05-25  6:24   ` Borislav Petkov
@ 2012-05-25  7:31     ` Chen Gong
  2012-05-25  9:20       ` Thomas Gleixner
  2012-05-28  9:47     ` Chen Gong
  1 sibling, 1 reply; 29+ messages in thread
From: Chen Gong @ 2012-05-25  7:31 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: Thomas Gleixner, LKML, tony.luck, x86, Peter Zijlstra

于 2012/5/25 14:24, Borislav Petkov 写道:
> On Thu, May 24, 2012 at 05:54:52PM +0000, Thomas Gleixner wrote:
>> Intentionally left blank to be filled out by someone who wants
>> that and can explain the reason for this better than me.
> 
> That'll be Intel folk :)
> 
>> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> 
> Looks good to me too, thanks Thomas for doing this!
> 
> I'll run it next week just in case.
> 
> Acked-by: Borislav Petkov <borislav.petkov@amd.com>
> 

Oh, Oh, wait. First I need to thank Thomas to improve it. I don't
reply you at the first time because I have some thoughts and now
I'm testing it. The basic test shows it hangs the system after
sb_edac is removed and when error count increases to the threshold
it hangs again, and when trying to reboot the system no hang happnes
(not reach the threshold) the system oops. I need to time to debug
and give the valid feedback. Please be patient :-). Of course,
I still need time to add some description for Thomas ;-)


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-05-25  7:31     ` Chen Gong
@ 2012-05-25  9:20       ` Thomas Gleixner
  2012-05-25 12:17         ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2012-05-25  9:20 UTC (permalink / raw)
  To: Chen Gong; +Cc: Borislav Petkov, LKML, tony.luck, x86, Peter Zijlstra

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1098 bytes --]

On Fri, 25 May 2012, Chen Gong wrote:

> 于 2012/5/25 14:24, Borislav Petkov 写道:
> > On Thu, May 24, 2012 at 05:54:52PM +0000, Thomas Gleixner wrote:
> >> Intentionally left blank to be filled out by someone who wants
> >> that and can explain the reason for this better than me.
> > 
> > That'll be Intel folk :)
> > 
> >> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > 
> > Looks good to me too, thanks Thomas for doing this!
> > 
> > I'll run it next week just in case.
> > 
> > Acked-by: Borislav Petkov <borislav.petkov@amd.com>
> > 
> 
> Oh, Oh, wait. First I need to thank Thomas to improve it. I don't
> reply you at the first time because I have some thoughts and now
> I'm testing it. The basic test shows it hangs the system after
> sb_edac is removed and when error count increases to the threshold
> it hangs again, and when trying to reboot the system no hang happnes
> (not reach the threshold) the system oops. I need to time to debug
> and give the valid feedback. Please be patient :-). Of course,

As I said it's completely untested. I just made sure it compiles :)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-05-25  9:20       ` Thomas Gleixner
@ 2012-05-25 12:17         ` Thomas Gleixner
  0 siblings, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2012-05-25 12:17 UTC (permalink / raw)
  To: Chen Gong; +Cc: Borislav Petkov, LKML, tony.luck, x86, Peter Zijlstra

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2988 bytes --]

On Fri, 25 May 2012, Thomas Gleixner wrote:
> On Fri, 25 May 2012, Chen Gong wrote:
> 
> > 于 2012/5/25 14:24, Borislav Petkov 写道:
> > > On Thu, May 24, 2012 at 05:54:52PM +0000, Thomas Gleixner wrote:
> > >> Intentionally left blank to be filled out by someone who wants
> > >> that and can explain the reason for this better than me.
> > > 
> > > That'll be Intel folk :)
> > > 
> > >> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > > 
> > > Looks good to me too, thanks Thomas for doing this!
> > > 
> > > I'll run it next week just in case.
> > > 
> > > Acked-by: Borislav Petkov <borislav.petkov@amd.com>
> > > 
> > 
> > Oh, Oh, wait. First I need to thank Thomas to improve it. I don't
> > reply you at the first time because I have some thoughts and now
> > I'm testing it. The basic test shows it hangs the system after
> > sb_edac is removed and when error count increases to the threshold
> > it hangs again, and when trying to reboot the system no hang happnes
> > (not reach the threshold) the system oops. I need to time to debug
> > and give the valid feedback. Please be patient :-). Of course,
> 
> As I said it's completely untested. I just made sure it compiles :)

I just had a quick look again and noticed that nothing is polling the
cmci part in case of the storm mode.

Delta fix below.

Thanks,

	tglx

Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -30,8 +30,10 @@ extern struct mce_bank *mce_banks;
 
 #ifdef CONFIG_X86_MCE_INTEL
 unsigned long mce_intel_adjust_timer(unsigned long interval);
+void mce_intel_cmci_poll(void);
 #else
 # define mce_intel_adjust_timer mce_adjust_timer_default
+static inline void mce_intel_cmci_poll(void) { }
 #endif
 
 void mce_timer_kick(unsigned long interval);
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1260,6 +1260,7 @@ static void mce_timer_fn(unsigned long d
 	if (mce_available(__this_cpu_ptr(&cpu_info))) {
 		machine_check_poll(MCP_TIMESTAMP,
 				&__get_cpu_var(mce_poll_banks));
+		mce_intel_cmci_poll();
 	}
 
 	/*
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -70,6 +70,13 @@ static int cmci_supported(int *banks)
 	return !!(cap & MCG_CMCI_P);
 }
 
+void mce_intel_cmci_poll(void)
+{
+	if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
+		return;
+	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+}
+
 unsigned long mce_intel_adjust_timer(unsigned long interval)
 {
 	if (interval < CMCI_POLL_INTERVAL)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-05-25  6:24   ` Borislav Petkov
  2012-05-25  7:31     ` Chen Gong
@ 2012-05-28  9:47     ` Chen Gong
  1 sibling, 0 replies; 29+ messages in thread
From: Chen Gong @ 2012-05-28  9:47 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: Thomas Gleixner, LKML, tony.luck, x86, Peter Zijlstra

于 2012/5/25 14:24, Borislav Petkov 写道:
> On Thu, May 24, 2012 at 05:54:52PM +0000, Thomas Gleixner wrote:
>> Intentionally left blank to be filled out by someone who wants
>> that and can explain the reason for this better than me.
> 
> That'll be Intel folk :)
> 
>> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> 
> Looks good to me too, thanks Thomas for doing this!
> 
> I'll run it next week just in case.
> 
> Acked-by: Borislav Petkov <borislav.petkov@amd.com>
> 
Hi, Boris

Have you tested Thomas' patch. During my test, it hangs in below 2
scenarios:

1) remove sb_edac driver, and then inject error, hang right now!
2) keep sb_edac drivr, inject error until storm happens (I changed
the condition from 1 HZ to 1 minute to make it easy to trigger
storm), after 5 errors are injected, codes enter *storm* mode,
but hang right now again!

At least, scenario 2 has direct connection with new patches. I
need time to investigate because no any output after hang. (lockdep?)



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-05-24 17:54 ` [patch 2/2] x86: mce: Implement cmci poll mode for intel machines Thomas Gleixner
  2012-05-25  6:24   ` Borislav Petkov
@ 2012-05-28  9:52   ` Chen Gong
  2012-06-04  2:37   ` Chen Gong
  2 siblings, 0 replies; 29+ messages in thread
From: Chen Gong @ 2012-05-28  9:52 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

于 2012/5/25 1:54, Thomas Gleixner 写道:
> Intentionally left blank to be filled out by someone who wants that
> and can explain the reason for this better than me.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- 
> arch/x86/kernel/cpu/mcheck/mce-internal.h |    8 ++ 
> arch/x86/kernel/cpu/mcheck/mce.c          |   41 +++++++++++++-- 
> arch/x86/kernel/cpu/mcheck/mce_intel.c    |   81
> +++++++++++++++++++++++++++++- 3 files changed, 125 insertions(+),
> 5 deletions(-)
> 
> Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h 
> ===================================================================
>
> 
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce-internal.h
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -28,6
> +28,14 @@ extern int mce_ser;
> 
> extern struct mce_bank *mce_banks;
> 
> +#ifdef CONFIG_X86_MCE_INTEL +unsigned long
> mce_intel_adjust_timer(unsigned long interval); +#else +# define
> mce_intel_adjust_timer mce_adjust_timer_default +#endif + +void
> mce_timer_kick(unsigned long interval); + #ifdef CONFIG_ACPI_APEI 
> int apei_write_mce(struct mce *m); ssize_t apei_read_mce(struct mce
> *m, u64 *record_id); Index:
> linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c 
> ===================================================================
>
> 
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c @@ -1242,6 +1242,14
> @@ static unsigned long check_interval = 5 static
> DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ 
> static DEFINE_PER_CPU(struct timer_list, mce_timer);
> 
> +static unsigned long mce_adjust_timer_default(unsigned long
> interval) +{ +	return interval; +} + +static unsigned long
> (*mce_adjust_timer)(unsigned long interval) = +
> mce_adjust_timer_default; + static void mce_timer_fn(unsigned long
> data) { struct timer_list *t = &__get_cpu_var(mce_timer); @@
> -1259,14 +1267,37 @@ static void mce_timer_fn(unsigned long d *
> polling interval, otherwise increase the polling interval. */ iv =
> __this_cpu_read(mce_next_interval); -	if (mce_notify_irq()) +	if
> (mce_notify_irq()) { iv = max(iv, (unsigned long) HZ/100);

You don't use old logic max(iv / 2, xxx), so it should not decrease any
more. Any defect for the old logic?

> -	else +	} else { iv = min(iv * 2,
> round_jiffies_relative(check_interval * HZ)); +		iv =
> mce_adjust_timer(iv); +	} __this_cpu_write(mce_next_interval, iv); 
> +	/* Might have become 0 after CMCI storm subsided */ +	if (iv) { +
> t->expires = jiffies + iv; +		add_timer_on(t, smp_processor_id()); 
> +	} +}
> 
> -	t->expires = jiffies + iv; -	add_timer_on(t,
> smp_processor_id()); +/* + * Ensure that the timer is firing in
> @interval from now. + */ +void mce_timer_kick(unsigned long
> interval) +{ +	struct timer_list *t = &__get_cpu_var(mce_timer); +
> unsigned long when = jiffies + interval; +	unsigned long iv =
> __this_cpu_read(mce_next_interval); + +	if (time_before(when,
> t->expires) && timer_pending(t)) { +		mod_timer(t, when); +	} else
> if (!mce_next_interval) {

Why using mce_next_interval, it is a per_cpu var, should be non-NULL
definitely, right? How about using *iv* here?

> +		t->expires = round_jiffies(jiffies + iv); +		add_timer_on(t,
> smp_processor_id()); +	} +	if (interval < iv) +
> __this_cpu_write(mce_next_interval, iv);

This code should be __this_cpu_write(mce_next_interval, interval); ?

> }
> 
> /* Must not be called in IRQ context where del_timer_sync() can
> deadlock */ @@ -1531,6 +1562,7 @@ static void
> __mcheck_cpu_init_vendor(str switch (c->x86_vendor) { case
> X86_VENDOR_INTEL: mce_intel_feature_init(c); +		mce_adjust_timer =
> mce_intel_adjust_timer; break; case X86_VENDOR_AMD: 
> mce_amd_feature_init(c); @@ -1550,6 +1582,7 @@ static void
> __mcheck_cpu_init_timer(void if (mce_ignore_ce) return;
> 
> +	iv = mce_adjust_timer(check_interval * HZ); 
> __this_cpu_write(mce_next_interval, iv); if (!iv) return; Index:
> linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c 
> ===================================================================
>
> 
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce_intel.c
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -15,6 +15,8
> @@ #include <asm/msr.h> #include <asm/mce.h>
> 
> +#include "mce-internal.h" + /* * Support for Intel Correct Machine
> Check Interrupts. This allows * the CPU to raise an interrupt when
> a corrected machine check happened. @@ -30,7 +32,22 @@ static
> DEFINE_PER_CPU(mce_banks_t, mce_b */ static
> DEFINE_RAW_SPINLOCK(cmci_discover_lock);
> 
> -#define CMCI_THRESHOLD 1 +#define CMCI_THRESHOLD		1 +#define
> CMCI_POLL_INTERVAL	(30 * HZ) +#define CMCI_STORM_INTERVAL	(1 * HZ) 
> +#define CMCI_STORM_TRESHOLD	5 + +static DEFINE_PER_CPU(unsigned
> long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int,
> cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int,
> cmci_storm_state); + +enum { +	CMCI_STORM_NONE, +
> CMCI_STORM_ACTIVE, +	CMCI_STORM_SUBSIDED, +}; + +static atomic_t
> cmci_storm_on_cpus;
> 
> static int cmci_supported(int *banks) { @@ -53,6 +70,66 @@ static
> int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); }
> 
> +unsigned long mce_intel_adjust_timer(unsigned long interval) +{ +
> if (interval < CMCI_POLL_INTERVAL) +		return interval; + +	switch
> (__this_cpu_read(cmci_storm_state)) { +	case CMCI_STORM_ACTIVE: +
> /* +		 * We switch back to interrupt mode once the poll timer has +
> * silenced itself. That means no events recorded and the +		 *
> timer interval is back to our poll interval. +		 */ +
> __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); +
> atomic_dec(&cmci_storm_on_cpus); + +	case CMCI_STORM_SUBSIDED: +
> /* +		 * We wait for all cpus to go back to SUBSIDED +		 * state.
> When that happens we switch back to +		 * interrupt mode. +		 */ +
> if (!atomic_read(&cmci_storm_on_cpus)) { +
> __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); +
> cmci_reenable(); +			cmci_recheck(); +		} +		return
> CMCI_POLL_INTERVAL; +	default: +		/* +		 * We have shiny wheather,
> let the poll do whatever it +		 * thinks. +		 */ +		return
> interval; +	} +} + +static bool cmci_storm_detect(void) +{ +
> unsigned int cnt = __this_cpu_read(cmci_storm_cnt); +	unsigned long
> ts = __this_cpu_read(cmci_time_stamp); +	unsigned long now =
> jiffies; + +	if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { +
> cnt++; +	} else { +		cnt = 1; +		__this_cpu_write(cmci_time_stamp,
> now); +	} +	__this_cpu_write(cmci_storm_cnt, cnt); + +	if (cnt <=
> CMCI_STORM_TRESHOLD) +		return false; + +	cmci_clear(); +
> __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); +
> atomic_inc(&cmci_storm_on_cpus); +
> mce_timer_kick(CMCI_POLL_INTERVAL); +	return true; +} + /* * The
> interrupt handler. This is called on every event. * Just call the
> poller directly to log any events. @@ -61,6 +138,8 @@ static int
> cmci_supported(int *banks) */ static void
> intel_threshold_interrupt(void) { +	if (cmci_storm_detect()) +
> return; machine_check_poll(MCP_TIMESTAMP,
> &__get_cpu_var(mce_banks_owned)); mce_notify_irq(); }
> 
> 
> -- To unsubscribe from this list: send the line "unsubscribe
> linux-kernel" in the body of a message to
> majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html Please read the FAQ at
> http://www.tux.org/lkml/
> 



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 1/2] x86: mce Cleanup timer mess
  2012-05-24 17:54 ` [patch 1/2] x86: mce Cleanup timer mess Thomas Gleixner
  2012-05-25  6:16   ` Borislav Petkov
@ 2012-06-04  2:22   ` Chen Gong
  2012-06-04 18:14     ` Luck, Tony
  1 sibling, 1 reply; 29+ messages in thread
From: Chen Gong @ 2012-06-04  2:22 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

Hi, Tony and Thomas

This patch has been merged, but It still have some confusion, please
see inline comment and give me some explanation.

于 2012/5/25 1:54, Thomas Gleixner 写道:
> Use unsigned long for dealing with jiffies not int. Rename the 
> callback to something sensible. Use __this_cpu_read/write for 
> accessing per cpu data.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- 
> arch/x86/kernel/cpu/mcheck/mce.c |   31 
> ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 
> 15 deletions(-)
> 
> Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c 
> ===================================================================
>
>
> 
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c @@ -1237,15
> +1237,15 @@ void mce_log_therm_throt_event(__u64 sta * poller finds
> an MCE, poll 2x faster.  When the poller finds no more * errors,
> poll 2x slower (up to check_interval seconds). */ -static int 
> check_interval = 5 * 60; /* 5 minutes */ +static unsigned long 
> check_interval = 5 * 60; /* 5 minutes */
> 
> -static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 
> +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in 
> jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer);
> 
> -static void mce_start_timer(unsigned long data) +static void 
> mce_timer_fn(unsigned long data) { -	struct timer_list *t = 
> &per_cpu(mce_timer, data); -	int *n; +	struct timer_list *t = 
> &__get_cpu_var(mce_timer); +	unsigned long iv;
> 
> WARN_ON(smp_processor_id() != data);
> 
> @@ -1258,13 +1258,14 @@ static void mce_start_timer(unsigned lon * 
> Alert userspace if needed.  If we logged an MCE, reduce the * 
> polling interval, otherwise increase the polling interval. */ -	n
> = &__get_cpu_var(mce_next_interval); +	iv = 
> __this_cpu_read(mce_next_interval); if (mce_notify_irq()) -		*n = 
> max(*n/2, HZ/100); +		iv = max(iv, (unsigned long) HZ/100);

Here Thomas changed original mode from "*n = max(*n/2, HZ/100);"
to "iv = max(iv, (unsigned long) HZ/100);", which means *iv* will not
be decremented but only incremented in _else_ branch. If so, eventually
the *iv will be equal to *check_interval*. I don't think it makes sense.
Even we use new logic, the comment before these codes should be updated.

So Thomas, would you please explain why you use this new logic?

> else -		*n = min(*n*2, 
> (int)round_jiffies_relative(check_interval*HZ)); +		iv = min(iv * 
> 2, round_jiffies_relative(check_interval * HZ)); + 
> __this_cpu_write(mce_next_interval, iv);
> 
> -	t->expires = jiffies + *n; +	t->expires = jiffies + iv; 
> add_timer_on(t, smp_processor_id()); }
> 
> @@ -1542,17 +1543,17 @@ static void __mcheck_cpu_init_vendor(str 
> static void __mcheck_cpu_init_timer(void) { struct timer_list *t = 
> &__get_cpu_var(mce_timer); -	int *n = 
> &__get_cpu_var(mce_next_interval); +	unsigned long iv = 
> __this_cpu_read(mce_next_interval);
> 
> -	setup_timer(t, mce_start_timer, smp_processor_id()); + 
> setup_timer(t, mce_timer_fn, smp_processor_id());
> 
> if (mce_ignore_ce) return;
> 
> -	*n = check_interval * HZ; -	if (!*n) + 
> __this_cpu_write(mce_next_interval, iv); +	if (!iv) return; - 
> t->expires = round_jiffies(jiffies + *n); +	t->expires = 
> round_jiffies(jiffies + iv); add_timer_on(t, smp_processor_id()); 
> }
> 
> @@ -2262,7 +2263,7 @@ mce_cpu_callback(struct notifier_block *
> case CPU_DOWN_FAILED_FROZEN: if (!mce_ignore_ce && check_interval)
> { t->expires = round_jiffies(jiffies + - 
> __get_cpu_var(mce_next_interval)); +
> per_cpu(mce_next_interval, cpu)); add_timer_on(t, cpu); }
> smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
> 
> 
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-05-24 17:54 ` [patch 2/2] x86: mce: Implement cmci poll mode for intel machines Thomas Gleixner
  2012-05-25  6:24   ` Borislav Petkov
  2012-05-28  9:52   ` Chen Gong
@ 2012-06-04  2:37   ` Chen Gong
  2012-06-04 20:01     ` Thomas Gleixner
  2 siblings, 1 reply; 29+ messages in thread
From: Chen Gong @ 2012-06-04  2:37 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

Hi, Thomas

I have some confusion in your patch please help to give me some updates.

于 2012/5/25 1:54, Thomas Gleixner 写道:
> Intentionally left blank to be filled out by someone who wants that and
> can explain the reason for this better than me.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  arch/x86/kernel/cpu/mcheck/mce-internal.h |    8 ++
>  arch/x86/kernel/cpu/mcheck/mce.c          |   41 +++++++++++++--
>  arch/x86/kernel/cpu/mcheck/mce_intel.c    |   81 +++++++++++++++++++++++++++++-
>  3 files changed, 125 insertions(+), 5 deletions(-)
> 
> Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce-internal.h
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
> @@ -28,6 +28,14 @@ extern int mce_ser;
>  
>  extern struct mce_bank *mce_banks;
>  
> +#ifdef CONFIG_X86_MCE_INTEL
> +unsigned long mce_intel_adjust_timer(unsigned long interval);
> +#else
> +# define mce_intel_adjust_timer mce_adjust_timer_default
> +#endif
> +
> +void mce_timer_kick(unsigned long interval);
> +
>  #ifdef CONFIG_ACPI_APEI
>  int apei_write_mce(struct mce *m);
>  ssize_t apei_read_mce(struct mce *m, u64 *record_id);
> Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -1242,6 +1242,14 @@ static unsigned long check_interval = 5 
>  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
>  static DEFINE_PER_CPU(struct timer_list, mce_timer);
>  
> +static unsigned long mce_adjust_timer_default(unsigned long interval)
> +{
> +	return interval;
> +}
> +
> +static unsigned long (*mce_adjust_timer)(unsigned long interval) =
> +	mce_adjust_timer_default;
> +
>  static void mce_timer_fn(unsigned long data)
>  {
>  	struct timer_list *t = &__get_cpu_var(mce_timer);
> @@ -1259,14 +1267,37 @@ static void mce_timer_fn(unsigned long d
>  	 * polling interval, otherwise increase the polling interval.
>  	 */
>  	iv = __this_cpu_read(mce_next_interval);
> -	if (mce_notify_irq())
> +	if (mce_notify_irq()) {
>  		iv = max(iv, (unsigned long) HZ/100);
> -	else
> +	} else {
>  		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
> +		iv = mce_adjust_timer(iv);
> +	}
>  	__this_cpu_write(mce_next_interval, iv);
> +	/* Might have become 0 after CMCI storm subsided */
> +	if (iv) {
> +		t->expires = jiffies + iv;
> +		add_timer_on(t, smp_processor_id());
> +	}
> +}
>  
> -	t->expires = jiffies + iv;
> -	add_timer_on(t, smp_processor_id());
> +/*
> + * Ensure that the timer is firing in @interval from now.
> + */
> +void mce_timer_kick(unsigned long interval)
> +{
> +	struct timer_list *t = &__get_cpu_var(mce_timer);
> +	unsigned long when = jiffies + interval;
> +	unsigned long iv = __this_cpu_read(mce_next_interval);
> +
> +	if (time_before(when, t->expires) && timer_pending(t)) {
> +		mod_timer(t, when);
> +	} else if (!mce_next_interval) {

Why using mce_next_interval, it is a per_cpu var, should be non-NULL
definitely, right? How about using *iv* here?

> +		t->expires = round_jiffies(jiffies + iv);
> +		add_timer_on(t, smp_processor_id());
> +	}
> +	if (interval < iv)
> +		__this_cpu_write(mce_next_interval, iv);
>  }

This code should be __this_cpu_write(mce_next_interval, interval);?

>  
>  /* Must not be called in IRQ context where del_timer_sync() can deadlock */
> @@ -1531,6 +1562,7 @@ static void __mcheck_cpu_init_vendor(str
>  	switch (c->x86_vendor) {
>  	case X86_VENDOR_INTEL:
>  		mce_intel_feature_init(c);
> +		mce_adjust_timer = mce_intel_adjust_timer;
>  		break;
>  	case X86_VENDOR_AMD:
>  		mce_amd_feature_init(c);
> @@ -1550,6 +1582,7 @@ static void __mcheck_cpu_init_timer(void
>  	if (mce_ignore_ce)
>  		return;
>  
> +	iv = mce_adjust_timer(check_interval * HZ);
>  	__this_cpu_write(mce_next_interval, iv);
>  	if (!iv)
>  		return;

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [patch 1/2] x86: mce Cleanup timer mess
  2012-06-04  2:22   ` Chen Gong
@ 2012-06-04 18:14     ` Luck, Tony
  2012-06-04 19:57       ` Thomas Gleixner
  2012-06-04 22:18       ` Borislav Petkov
  0 siblings, 2 replies; 29+ messages in thread
From: Luck, Tony @ 2012-06-04 18:14 UTC (permalink / raw)
  To: Chen Gong, Thomas Gleixner; +Cc: LKML, bp, x86, Peter Zijlstra

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 682 bytes --]

> Here Thomas changed original mode from "*n = max(*n/2, HZ/100);"
> to "iv = max(iv, (unsigned long) HZ/100);", which means *iv* will not
> be decremented but only incremented in _else_ branch. If so, eventually
> the *iv will be equal to *check_interval*. I don't think it makes sense.

It looks like Thomas just forgot the "/ 2" there while cleaning up.
I didn't see it either, nor did Boris when he acked it. Thank goodness
for your extra eyes looking at this.

Please send patch to fix it (so you get credit).

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [patch 1/2] x86: mce Cleanup timer mess
  2012-06-04 18:14     ` Luck, Tony
@ 2012-06-04 19:57       ` Thomas Gleixner
  2012-06-04 22:18       ` Borislav Petkov
  1 sibling, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2012-06-04 19:57 UTC (permalink / raw)
  To: Luck, Tony; +Cc: Chen Gong, LKML, bp, x86, Peter Zijlstra

On Mon, 4 Jun 2012, Luck, Tony wrote:

> > Here Thomas changed original mode from "*n = max(*n/2, HZ/100);"
> > to "iv = max(iv, (unsigned long) HZ/100);", which means *iv* will not
> > be decremented but only incremented in _else_ branch. If so, eventually
> > the *iv will be equal to *check_interval*. I don't think it makes sense.
> 
> It looks like Thomas just forgot the "/ 2" there while cleaning up.
> I didn't see it either, nor did Boris when he acked it. Thank goodness
> for your extra eyes looking at this.

Uurgh. Chen, thanks for spotting it!

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-04  2:37   ` Chen Gong
@ 2012-06-04 20:01     ` Thomas Gleixner
  2012-06-05 11:47       ` Chen Gong
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2012-06-04 20:01 UTC (permalink / raw)
  To: Chen Gong; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

On Mon, 4 Jun 2012, Chen Gong wrote:
> > +/*
> > + * Ensure that the timer is firing in @interval from now.
> > + */
> > +void mce_timer_kick(unsigned long interval)
> > +{
> > +	struct timer_list *t = &__get_cpu_var(mce_timer);
> > +	unsigned long when = jiffies + interval;
> > +	unsigned long iv = __this_cpu_read(mce_next_interval);
> > +
> > +	if (time_before(when, t->expires) && timer_pending(t)) {
> > +		mod_timer(t, when);
> > +	} else if (!mce_next_interval) {
> 
> Why using mce_next_interval, it is a per_cpu var, should be non-NULL
> definitely, right? How about using *iv* here?

iv is the thing to use. No idea why I typoed mce_next_interval into
that.
 
> > +		t->expires = round_jiffies(jiffies + iv);
> > +		add_timer_on(t, smp_processor_id());
> > +	}
> > +	if (interval < iv)
> > +		__this_cpu_write(mce_next_interval, iv);
> >  }
> 
> This code should be __this_cpu_write(mce_next_interval, interval);?

Duh, yes.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 1/2] x86: mce Cleanup timer mess
  2012-06-04 18:14     ` Luck, Tony
  2012-06-04 19:57       ` Thomas Gleixner
@ 2012-06-04 22:18       ` Borislav Petkov
  1 sibling, 0 replies; 29+ messages in thread
From: Borislav Petkov @ 2012-06-04 22:18 UTC (permalink / raw)
  To: Luck, Tony; +Cc: Chen Gong, Thomas Gleixner, LKML, bp, x86, Peter Zijlstra

On Mon, Jun 04, 2012 at 06:14:38PM +0000, Luck, Tony wrote:
> > Here Thomas changed original mode from "*n = max(*n/2, HZ/100);"
> > to "iv = max(iv, (unsigned long) HZ/100);", which means *iv* will not
> > be decremented but only incremented in _else_ branch. If so, eventually
> > the *iv will be equal to *check_interval*. I don't think it makes sense.
> 
> It looks like Thomas just forgot the "/ 2" there while cleaning up.
> I didn't see it either, nor did Boris when he acked it. Thank goodness
> for your extra eyes looking at this.

We should all pack our bags and go home. :<|

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-04 20:01     ` Thomas Gleixner
@ 2012-06-05 11:47       ` Chen Gong
  2012-06-05 12:57         ` Borislav Petkov
  2012-06-05 13:35         ` Thomas Gleixner
  0 siblings, 2 replies; 29+ messages in thread
From: Chen Gong @ 2012-06-05 11:47 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

于 2012/6/5 4:01, Thomas Gleixner 写道:
> On Mon, 4 Jun 2012, Chen Gong wrote:
>>> +/*
>>> + * Ensure that the timer is firing in @interval from now.
>>> + */
>>> +void mce_timer_kick(unsigned long interval)
>>> +{
>>> +	struct timer_list *t = &__get_cpu_var(mce_timer);
>>> +	unsigned long when = jiffies + interval;
>>> +	unsigned long iv = __this_cpu_read(mce_next_interval);
>>> +
>>> +	if (time_before(when, t->expires) && timer_pending(t)) {
>>> +		mod_timer(t, when);
>>> +	} else if (!mce_next_interval) {
>>
>> Why using mce_next_interval, it is a per_cpu var, should be non-NULL
>> definitely, right? How about using *iv* here?
> 
> iv is the thing to use. No idea why I typoed mce_next_interval into
> that.
>  
>>> +		t->expires = round_jiffies(jiffies + iv);
>>> +		add_timer_on(t, smp_processor_id());
>>> +	}
>>> +	if (interval < iv)
>>> +		__this_cpu_write(mce_next_interval, iv);
>>>  }
>>
>> This code should be __this_cpu_write(mce_next_interval, interval);?
> 
> Duh, yes.
> 
> Thanks,
> 
> 	tglx
> 
Hi, Thomas

Besides above issues, I still have some other questions as below:

>  static void mce_timer_fn(unsigned long data)
>  {
> ...
> +	/* Might have become 0 after CMCI storm subsided */
> +	if (iv) {
> +		t->expires = jiffies + iv;
> +		add_timer_on(t, smp_processor_id());
> +	}
> +}

I've found under some conditions, *t* is pending on the timer tree, so
add_timer_on will crash the whole system. Furthermore, if this timer
function triggers "WARN_ON(smp_processor_id() != data);", this timer
will be added on the other CPU, which means it loses the chance to
decrement *cmci_storm_on_cpus* to zero to reenable the CMCI. Maybe
this situation happens seldomly, but once it happens, CMCI will never
be actived again after it is disabled.


> +void mce_timer_kick(unsigned long interval)
> +{
> +	struct timer_list *t = &__get_cpu_var(mce_timer);
> +	unsigned long when = jiffies + interval;
> +	unsigned long iv = __this_cpu_read(mce_next_interval);
> +
> +	if (time_before(when, t->expires) && timer_pending(t)) {
> +		mod_timer(t, when);
> +	} else if (!mce_next_interval) {
> +		t->expires = round_jiffies(jiffies + iv);
> +		add_timer_on(t, smp_processor_id());

I've changed "else if (!mce_next_interval)" to "else if (iv)", but
I still think it is not right. Imaging *when* is after t->expires and
this timer is pending on the timer tree, so it will hit *else if*
if iv is not zero(common situations), again, add_timer_on will trigger
BUG_ON because this timer is pending.


>  static void intel_threshold_interrupt(void)
>  {
> +	if (cmci_storm_detect())
> +		return;
>  	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
>  	mce_notify_irq();
>  }

I think cmci_storm_detect should be placed in the machine_check_poll,
not out of it. Because machine_check_poll it the core execution logic
for CMCI handling, in the meanwhile, poll timer and mce-inject module
call machine_check_poll at any time. If poll timer or mce-inject run
too quickly, the CMCI handler has trouble. Whereas, if
cmci_storm_detect is in the machine_check_poll, this kind of possibility
can be avoid.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-05 11:47       ` Chen Gong
@ 2012-06-05 12:57         ` Borislav Petkov
  2012-06-06  1:36           ` Chen Gong
  2012-06-05 13:35         ` Thomas Gleixner
  1 sibling, 1 reply; 29+ messages in thread
From: Borislav Petkov @ 2012-06-05 12:57 UTC (permalink / raw)
  To: Chen Gong; +Cc: Thomas Gleixner, LKML, tony.luck, x86, Peter Zijlstra

On Tue, Jun 05, 2012 at 07:47:20PM +0800, Chen Gong wrote:
> >  static void intel_threshold_interrupt(void)
> >  {
> > +	if (cmci_storm_detect())
> > +		return;
> >  	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
> >  	mce_notify_irq();
> >  }
> 
> I think cmci_storm_detect should be placed in the machine_check_poll,
> not out of it. Because machine_check_poll it the core execution logic
> for CMCI handling, in the meanwhile, poll timer and mce-inject module
> call machine_check_poll at any time.

Are you saying you need CMCI throttling for when you inject MCEs?

> If poll timer or mce-inject run too quickly, the CMCI handler has
> trouble. Whereas, if cmci_storm_detect is in the machine_check_poll,
> this kind of possibility can be avoid.

In any case, cmci_storm_detect() cannot be in machine_check_poll because
last one is generic MCE code and not Intel-only. Unless you do something
like what Thomas proposed for mce_adjust_timer where the default
function does nothing and Intel only overwrites that pointer with the
needed functionality.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-05 11:47       ` Chen Gong
  2012-06-05 12:57         ` Borislav Petkov
@ 2012-06-05 13:35         ` Thomas Gleixner
  2012-06-06  7:21           ` Chen Gong
  1 sibling, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2012-06-05 13:35 UTC (permalink / raw)
  To: Chen Gong; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

[-- Attachment #1: Type: TEXT/PLAIN, Size: 4000 bytes --]

On Tue, 5 Jun 2012, Chen Gong wrote:
> 于 2012/6/5 4:01, Thomas Gleixner 写道:
> > On Mon, 4 Jun 2012, Chen Gong wrote:
> >  static void mce_timer_fn(unsigned long data)
> >  {
> > ...
> > +	/* Might have become 0 after CMCI storm subsided */
> > +	if (iv) {
> > +		t->expires = jiffies + iv;
> > +		add_timer_on(t, smp_processor_id());
> > +	}
> > +}
> 
> I've found under some conditions, *t* is pending on the timer tree, so
> add_timer_on will crash the whole system. Furthermore, if this timer

How should that happen? This is the timer callback function, which is
called from the timer code when the timer expired. It CANNOT be
pending at that point. The add_timer_on() in mce_timer_kick() might
cause that BUG to trigger, but definitely not the one here in the
timer callback.

> function triggers "WARN_ON(smp_processor_id() != data);", this timer

What causes the timer to be added on the wrong CPU in the first place?
The WARN_ON meriliy detects it after the fact.

> will be added on the other CPU, which means it loses the chance to
> decrement *cmci_storm_on_cpus* to zero to reenable the CMCI. Maybe
> this situation happens seldomly, but once it happens, CMCI will never
> be actived again after it is disabled.
> 
> > +void mce_timer_kick(unsigned long interval)
> > +{
> > +	struct timer_list *t = &__get_cpu_var(mce_timer);
> > +	unsigned long when = jiffies + interval;
> > +	unsigned long iv = __this_cpu_read(mce_next_interval);
> > +
> > +	if (time_before(when, t->expires) && timer_pending(t)) {
> > +		mod_timer(t, when);
> > +	} else if (!mce_next_interval) {
> > +		t->expires = round_jiffies(jiffies + iv);
> > +		add_timer_on(t, smp_processor_id());
> 
> I've changed "else if (!mce_next_interval)" to "else if (iv)", but

  else if (!iv) perhaps ?

> I still think it is not right. Imaging *when* is after t->expires and
> this timer is pending on the timer tree, so it will hit *else if*
> if iv is not zero(common situations), again, add_timer_on will trigger
> BUG_ON because this timer is pending.

See above. Aside of that I rewrote the code to be more clear. See
delta patch below.
 
> >  static void intel_threshold_interrupt(void)
> >  {
> > +	if (cmci_storm_detect())
> > +		return;
> >  	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
> >  	mce_notify_irq();
> >  }
> 
> I think cmci_storm_detect should be placed in the machine_check_poll,
> not out of it. Because machine_check_poll it the core execution logic
> for CMCI handling, in the meanwhile, poll timer and mce-inject module
> call machine_check_poll at any time. If poll timer or mce-inject run
> too quickly, the CMCI handler has trouble. Whereas, if
> cmci_storm_detect is in the machine_check_poll, this kind of possibility
> can be avoid.

No, it's wrong to put it into machine_check_poll(). 

machine_check_poll() is a generic functionality which has nothing to
do with CMCI storms. That CMCI storm/poll stuff is intel specific and
therefor the detection logic is in the intel specific interrupt
handler and nowhere else.

Thanks,

	tglx
---
 arch/x86/kernel/cpu/mcheck/mce.c |   11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1305,13 +1305,14 @@ void mce_timer_kick(unsigned long interv
 	unsigned long when = jiffies + interval;
 	unsigned long iv = __this_cpu_read(mce_next_interval);
 
-	if (time_before(when, t->expires) && timer_pending(t)) {
-		mod_timer(t, when);
-	} else if (!mce_next_interval) {
-		t->expires = round_jiffies(jiffies + iv);
+	if (timer_pending(t)) {
+		if (time_before(when, t->expires))
+			mod_timer(t, when);
+	} else {
+		t->expires = round_jiffies(jiffies + when);
 		add_timer_on(t, smp_processor_id());
 	}
-	if (interval < iv)
+	if (interval > iv)
 		__this_cpu_write(mce_next_interval, iv);
 }
 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-05 12:57         ` Borislav Petkov
@ 2012-06-06  1:36           ` Chen Gong
  2012-06-06  9:04             ` Borislav Petkov
  0 siblings, 1 reply; 29+ messages in thread
From: Chen Gong @ 2012-06-06  1:36 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: Thomas Gleixner, LKML, tony.luck, x86, Peter Zijlstra

于 2012/6/5 20:57, Borislav Petkov 写道:
> On Tue, Jun 05, 2012 at 07:47:20PM +0800, Chen Gong wrote:
>>>  static void intel_threshold_interrupt(void)
>>>  {
>>> +	if (cmci_storm_detect())
>>> +		return;
>>>  	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
>>>  	mce_notify_irq();
>>>  }
>> I think cmci_storm_detect should be placed in the machine_check_poll,
>> not out of it. Because machine_check_poll it the core execution logic
>> for CMCI handling, in the meanwhile, poll timer and mce-inject module
>> call machine_check_poll at any time.
> Are you saying you need CMCI throttling for when you inject MCEs?

Yes, I am just afraid similar situation happening when injecting MCEs or
poll timer
running too fast, which is like a pseduo CMCI storm. Maybe similar logic
behind
CMCI_STORM can be used there.



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-05 13:35         ` Thomas Gleixner
@ 2012-06-06  7:21           ` Chen Gong
  2012-06-06  9:18             ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Chen Gong @ 2012-06-06  7:21 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

于 2012/6/5 21:35, Thomas Gleixner 写道:
> On Tue, 5 Jun 2012, Chen Gong wrote:
>> 于 2012/6/5 4:01, Thomas Gleixner 写道:
>>> On Mon, 4 Jun 2012, Chen Gong wrote:
>>>  static void mce_timer_fn(unsigned long data)
>>>  {
>>> ...
>>> +	/* Might have become 0 after CMCI storm subsided */
>>> +	if (iv) {
>>> +		t->expires = jiffies + iv;
>>> +		add_timer_on(t, smp_processor_id());
>>> +	}
>>> +}
>> I've found under some conditions, *t* is pending on the timer tree, so
>> add_timer_on will crash the whole system. Furthermore, if this timer
> How should that happen? This is the timer callback function, which is
> called from the timer code when the timer expired. It CANNOT be
> pending at that point. The add_timer_on() in mce_timer_kick() might
> cause that BUG to trigger, but definitely not the one here in the
> timer callback.
>
>> function triggers "WARN_ON(smp_processor_id() != data);", this timer
> What causes the timer to be added on the wrong CPU in the first place?
> The WARN_ON meriliy detects it after the fact.

Yes, I've checked CPU status when these timers are registered. The CPU these
timers adhere and *data* they saved are equal. But the fact is the issu does
happen.  I have no idea of it:-(.

e.g.

------------[ cut here ]------------
kernel BUG at kernel/timer.c:919!
invalid opcode: 0000 [#1] SMP
CPU 0
Modules linked in: autofs4 sunrpc cpufreq_ondemand acpi_cpufreq
freq_table mperf ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state
nf_conntrack ip6table_filter ip6_tables ipv6 dm_mirror dm_region_hash
dm_log dm_mod coretemp kvm crc32c_intel ghash_clmulni_intel microcode
pcspkr sb_edac edac_core sg wmi lpc_ich mfd_core igb ioatdma dca
i2c_i801 i2c_core ext4 mbcache jbd2 sd_mod crc_t10dif sr_mod cdrom
aesni_intel cryptd aes_x86_64 aes_generic ahci libahci isci libsas
scsi_transport_sas [last unloaded: scsi_wait_scan]

Pid: 0, comm: swapper/0 Not tainted 3.5.0-rc1storm-thomas+ #39 Intel
Corporation S2600CP/S2600CP
RIP: 0010:[<ffffffff8105fbad>]  [<ffffffff8105fbad>] add_timer_on+0xcd/0x120
RSP: 0018:ffff88013ee03da0  EFLAGS: 00010286
RAX: 000000000000e208 RBX: ffff88013ee0d940 RCX: 0000000000001857
RDX: ffff88013ef40000 RSI: 0000000000000000 RDI: ffff88013ee0d940
RBP: ffff88013ee03de0 R08: 0000000000000000 R09: ffffffff8163ad80
R10: 0000000000000060 R11: 0000000000000001 R12: ffff880139700000
R13: 000000000000000a R14: 0000000100016638 R15: 000000000000000a
FS:  0000000000000000(0000) GS:ffff88013ee00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00000030eda73000 CR3: 0000000001a0b000 CR4: 00000000000407f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper/0 (pid: 0, threadinfo ffffffff81a00000, task
ffffffff81a13420)
Stack:
 ffff88013ee03db0 ffffffff8106b6aa 0000000000000000 ffff88013ee0d940
 000000000000000a 0000000000000000 0000000100016638 000000000000000a
 ffff88013ee03e10 ffffffff8102c2d4 0000000000000100 ffffffff8102c200
Call Trace:
 <IRQ>
 [<ffffffff8106b6aa>] ? __queue_work+0x10a/0x430
 [<ffffffff8102c2d4>] mce_timer_fn+0xd4/0x190
 [<ffffffff8102c200>] ? machine_check_poll+0x180/0x180
 [<ffffffff8105f9f9>] call_timer_fn+0x49/0x130
 [<ffffffff8102c200>] ? machine_check_poll+0x180/0x180
 [<ffffffff8105fe03>] run_timer_softirq+0x143/0x280
 [<ffffffff8109f148>] ? ktime_get+0x68/0xf0
 [<ffffffff81058017>] __do_softirq+0xb7/0x210
 [<ffffffff81077b72>] ? hrtimer_interrupt+0x152/0x240
 [<ffffffff8151ea5c>] call_softirq+0x1c/0x30
 [<ffffffff81015335>] do_softirq+0x65/0xa0
 [<ffffffff81057e1d>] irq_exit+0xbd/0xe0
 [<ffffffff8151f39e>] smp_apic_timer_interrupt+0x6e/0x99
 [<ffffffff8151e10a>] apic_timer_interrupt+0x6a/0x70
 <EOI>
 [<ffffffff812b2f9d>] ? intel_idle+0xdd/0x150
 [<ffffffff812b2f83>] ? intel_idle+0xc3/0x150
 [<ffffffff8140c939>] cpuidle_enter+0x19/0x20
 [<ffffffff8140d174>] cpuidle_idle_call+0xd4/0x1d0
 [<ffffffff8101bd7f>] cpu_idle+0xcf/0x120
 [<ffffffff814fa475>] rest_init+0x75/0x80
 [<ffffffff81b00f52>] start_kernel+0x3e2/0x3ef
 [<ffffffff81b0098e>] ? kernel_init+0x27b/0x27b
 [<ffffffff81b00356>] x86_64_start_reservations+0x131/0x136
 [<ffffffff81b0045e>] x86_64_start_kernel+0x103/0x112
Code: 4c 89 e7 e8 76 54 4b 00 48 8b 5d d8 4c 8b 65 e0 4c 8b 6d e8 4c 8b
75 f0 4c 8b 7d f8 c9 c3 f6 43 18 01 75 c5 4d 89 7c 24 18 eb be <0f> 0b
90 eb fd 48 8b 75 08 e8 b5 fc ff ff e9 6b ff ff ff 4c 8b
RIP  [<ffffffff8105fbad>] add_timer_on+0xcd/0x120
 RSP <ffff88013ee03da0>


I add some print in timer callback, it shows:

smp_processor_id() = 0, mce_timer_fn data(CPU id) = 10
timer->function = ffffffff8102c200, timer pending = 1, CPU = 0
(add_timer_on, BUG!!!)

In fact, CPU0 and CPU10 on different socket, and the error happens on
the socket including
CPU10. I'm afraid it is still related to CMCI broadcast, but CMCI
doesn't spread out
of the socket, so I have no idea about it :-(

>
>> will be added on the other CPU, which means it loses the chance to
>> decrement *cmci_storm_on_cpus* to zero to reenable the CMCI. Maybe
>> this situation happens seldomly, but once it happens, CMCI will never
>> be actived again after it is disabled.
>>
>>> +void mce_timer_kick(unsigned long interval)
>>> +{
>>> +	struct timer_list *t = &__get_cpu_var(mce_timer);
>>> +	unsigned long when = jiffies + interval;
>>> +	unsigned long iv = __this_cpu_read(mce_next_interval);
>>> +
>>> +	if (time_before(when, t->expires) && timer_pending(t)) {
>>> +		mod_timer(t, when);
>>> +	} else if (!mce_next_interval) {
>>> +		t->expires = round_jiffies(jiffies + iv);
>>> +		add_timer_on(t, smp_processor_id());
>> I've changed "else if (!mce_next_interval)" to "else if (iv)", but
>   else if (!iv) perhaps ?

If so, iv = 0, which means this timer will be executed right now after
add_timer_on.

>
>> I still think it is not right. Imaging *when* is after t->expires and
>> this timer is pending on the timer tree, so it will hit *else if*
>> if iv is not zero(common situations), again, add_timer_on will trigger
>> BUG_ON because this timer is pending.
> See above. Aside of that I rewrote the code to be more clear. See
> delta patch below.
>  
>>>  static void intel_threshold_interrupt(void)
>>>  {
>>> +	if (cmci_storm_detect())
>>> +		return;
>>>  	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
>>>  	mce_notify_irq();
>>>  }
>> I think cmci_storm_detect should be placed in the machine_check_poll,
>> not out of it. Because machine_check_poll it the core execution logic
>> for CMCI handling, in the meanwhile, poll timer and mce-inject module
>> call machine_check_poll at any time. If poll timer or mce-inject run
>> too quickly, the CMCI handler has trouble. Whereas, if
>> cmci_storm_detect is in the machine_check_poll, this kind of possibility
>> can be avoid.
> No, it's wrong to put it into machine_check_poll(). 
>
> machine_check_poll() is a generic functionality which has nothing to
> do with CMCI storms. That CMCI storm/poll stuff is intel specific and
> therefor the detection logic is in the intel specific interrupt
> handler and nowhere else.

Gotta it, I'm just afraid something else, please see my another mail replied
for Boris.

>
> Thanks,
>
> 	tglx
> ---
>  arch/x86/kernel/cpu/mcheck/mce.c |   11 ++++++-----
>  1 file changed, 6 insertions(+), 5 deletions(-)
>
> Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -1305,13 +1305,14 @@ void mce_timer_kick(unsigned long interv
>  	unsigned long when = jiffies + interval;
>  	unsigned long iv = __this_cpu_read(mce_next_interval);
>  
> -	if (time_before(when, t->expires) && timer_pending(t)) {
> -		mod_timer(t, when);
> -	} else if (!mce_next_interval) {
> -		t->expires = round_jiffies(jiffies + iv);
> +	if (timer_pending(t)) {
> +		if (time_before(when, t->expires))
> +			mod_timer(t, when);
> +	} else {
> +		t->expires = round_jiffies(jiffies + when);

should be "t->expires = round_jiffies(when);"
>  		add_timer_on(t, smp_processor_id());
>  	}
> -	if (interval < iv)
> +	if (interval > iv)
>  		__this_cpu_write(mce_next_interval, iv);
>  }
>  
Strange, anyway, mce_next_interval should be updated in proper way, but
if using above logic, mce_next_interval doesn't make change. I prefer
if (interval < iv)
    __this_cpu_write(mce_next_interval, interval);

In fact, during my test, I wrote the similar codes to do the test, but
it can't fix the bug. The logic here is not the key, no matter how you set
timer, you can get a poll timer. The issue happens in the timer function
itself. Once timer callback is entered, the issue happens every time.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-06  1:36           ` Chen Gong
@ 2012-06-06  9:04             ` Borislav Petkov
  0 siblings, 0 replies; 29+ messages in thread
From: Borislav Petkov @ 2012-06-06  9:04 UTC (permalink / raw)
  To: Chen Gong
  Cc: Borislav Petkov, Thomas Gleixner, LKML, tony.luck, x86, Peter Zijlstra

On Wed, Jun 06, 2012 at 09:36:52AM +0800, Chen Gong wrote:
> Yes, I am just afraid similar situation happening when injecting MCEs
> or poll timer running too fast, which is like a pseduo CMCI storm.
> Maybe similar logic behind CMCI_STORM can be used there.

Poll timer on !Intel fires once every 5 minutes and halves the interval
when it detects a valid MCE.

I still don't think this is an issue if you poll only on single CPUs -
I think you need the throttling because, as Tony explained earlier, the
CMCI interrupt runs on _all_ CPUs and I can see how that can be hurting
performance.

CMCI on AMD (or, rather, our version of it) runs only on the CPU which
sees the error so I don't think we need throttling there, unless real
life proves me otherwise.

So, the whole CMCI deal should be Intel-only (for now, at least) and it
shouldn't be touching generic MCE code.

Thanks.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-06  7:21           ` Chen Gong
@ 2012-06-06  9:18             ` Thomas Gleixner
  2012-06-06 10:23               ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2012-06-06  9:18 UTC (permalink / raw)
  To: Chen Gong; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1968 bytes --]

On Wed, 6 Jun 2012, Chen Gong wrote:
> 于 2012/6/5 21:35, Thomas Gleixner 写道:
> I add some print in timer callback, it shows:
> 
> smp_processor_id() = 0, mce_timer_fn data(CPU id) = 10
> timer->function = ffffffff8102c200, timer pending = 1, CPU = 0
> (add_timer_on, BUG!!!)

Sure. That's not a surprise. The timer function for cpu 10 is called
on cpu 0. And the timer function does:

   struct timer_list *t = &__get_cpu_var(mce_timer);

which gets a pointer to the timer of cpu0. And that timer is
pending. So yes, it's exploding for a good reason.

Though, this does not tell us how the timer of cpu10 gets on cpu0.

Did you do any cpu hotplug operations ?

> > @@ -1305,13 +1305,14 @@ void mce_timer_kick(unsigned long interv
> >  	unsigned long when = jiffies + interval;
> >  	unsigned long iv = __this_cpu_read(mce_next_interval);
> >  
> > -	if (time_before(when, t->expires) && timer_pending(t)) {
> > -		mod_timer(t, when);
> > -	} else if (!mce_next_interval) {
> > -		t->expires = round_jiffies(jiffies + iv);
> > +	if (timer_pending(t)) {
> > +		if (time_before(when, t->expires))
> > +			mod_timer(t, when);
> > +	} else {
> > +		t->expires = round_jiffies(jiffies + when);
> 
> should be "t->expires = round_jiffies(when);"

True.

> >  		add_timer_on(t, smp_processor_id());
> >  	}
> > -	if (interval < iv)
> > +	if (interval > iv)
> >  		__this_cpu_write(mce_next_interval, iv);
> >  }
> >  
> Strange, anyway, mce_next_interval should be updated in proper way, but
> if using above logic, mce_next_interval doesn't make change. I prefer
> if (interval < iv)
>     __this_cpu_write(mce_next_interval, interval);

Grr. you are right. Stupid me.
 
> In fact, during my test, I wrote the similar codes to do the test, but
> it can't fix the bug. The logic here is not the key, no matter how you set
> timer, you can get a poll timer. The issue happens in the timer function
> itself. Once timer callback is entered, the issue happens every time.
 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-06  9:18             ` Thomas Gleixner
@ 2012-06-06 10:23               ` Thomas Gleixner
  2012-06-06 12:24                 ` Chen Gong
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2012-06-06 10:23 UTC (permalink / raw)
  To: Chen Gong; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1693 bytes --]

On Wed, 6 Jun 2012, Thomas Gleixner wrote:

> On Wed, 6 Jun 2012, Chen Gong wrote:
> > 于 2012/6/5 21:35, Thomas Gleixner 写道:
> > I add some print in timer callback, it shows:
> > 
> > smp_processor_id() = 0, mce_timer_fn data(CPU id) = 10
> > timer->function = ffffffff8102c200, timer pending = 1, CPU = 0
> > (add_timer_on, BUG!!!)
> 
> Sure. That's not a surprise. The timer function for cpu 10 is called
> on cpu 0. And the timer function does:
> 
>    struct timer_list *t = &__get_cpu_var(mce_timer);
> 
> which gets a pointer to the timer of cpu0. And that timer is
> pending. So yes, it's exploding for a good reason.
> 
> Though, this does not tell us how the timer of cpu10 gets on cpu0.
> 
> Did you do any cpu hotplug operations ?

There's a problem in the hotplug code.

        case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
                del_timer_sync(t);
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
                break;

We delete the timer before we disable mce and cmci. So if the cmci
interrupt kicks the timer after del_timer_sync() and before
mce_disable_cpu() is called on the other core, then the timer is still
enqueued when the cpu goes down. After it's dead the timer is migrated
and then the above scenario happens.

Can you try the following just for a quick test ?

        case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
                del_timer_sync(t);
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+		del_timer_sync(t);
                break;

That's not a proper solution, for a proper solution the hotplug code
of mce needs an overhaul. It's patently ugly.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-06 10:23               ` Thomas Gleixner
@ 2012-06-06 12:24                 ` Chen Gong
  2012-06-06 12:27                   ` Peter Zijlstra
  2012-06-06 14:15                   ` Thomas Gleixner
  0 siblings, 2 replies; 29+ messages in thread
From: Chen Gong @ 2012-06-06 12:24 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

于 2012/6/6 18:23, Thomas Gleixner 写道:
> On Wed, 6 Jun 2012, Thomas Gleixner wrote:
>
>> On Wed, 6 Jun 2012, Chen Gong wrote:
>>> 于 2012/6/5 21:35, Thomas Gleixner 写道:
>>> I add some print in timer callback, it shows:
>>>
>>> smp_processor_id() = 0, mce_timer_fn data(CPU id) = 10
>>> timer->function = ffffffff8102c200, timer pending = 1, CPU = 0
>>> (add_timer_on, BUG!!!)
>> Sure. That's not a surprise. The timer function for cpu 10 is called
>> on cpu 0. And the timer function does:
>>
>>    struct timer_list *t = &__get_cpu_var(mce_timer);
>>
>> which gets a pointer to the timer of cpu0. And that timer is
>> pending. So yes, it's exploding for a good reason.
>>
>> Though, this does not tell us how the timer of cpu10 gets on cpu0.
>>
>> Did you do any cpu hotplug operations ?
> There's a problem in the hotplug code.
>
>         case CPU_DOWN_PREPARE:
> 	case CPU_DOWN_PREPARE_FROZEN:
>                 del_timer_sync(t);
>                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
>                 break;
>
> We delete the timer before we disable mce and cmci. So if the cmci
> interrupt kicks the timer after del_timer_sync() and before
> mce_disable_cpu() is called on the other core, then the timer is still
> enqueued when the cpu goes down. After it's dead the timer is migrated
> and then the above scenario happens.
>
> Can you try the following just for a quick test ?
>
>         case CPU_DOWN_PREPARE:
> 	case CPU_DOWN_PREPARE_FROZEN:
>                 del_timer_sync(t);
>                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
> +		del_timer_sync(t);
>                 break;
I think you mean

-               del_timer_sync(t);
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+		del_timer_sync(t);
                break;


I don't execute hotplug and the whole error injection shouldn't trigger
hotplug.
I tried your patch but the test result was as before. But your thought
give me
a new way to find out the reason. I will continue to do more tests on
tomorrow. :-)


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-06 12:24                 ` Chen Gong
@ 2012-06-06 12:27                   ` Peter Zijlstra
  2012-06-06 14:15                   ` Thomas Gleixner
  1 sibling, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2012-06-06 12:27 UTC (permalink / raw)
  To: Chen Gong; +Cc: Thomas Gleixner, LKML, tony.luck, bp, x86

On Wed, 2012-06-06 at 20:24 +0800, Chen Gong wrote:
> I will continue to do more tests on tomorrow. 

You've got a TARDIS? Neat!

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-06 12:24                 ` Chen Gong
  2012-06-06 12:27                   ` Peter Zijlstra
@ 2012-06-06 14:15                   ` Thomas Gleixner
  2012-06-06 14:46                     ` Thomas Gleixner
  1 sibling, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2012-06-06 14:15 UTC (permalink / raw)
  To: Chen Gong; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1233 bytes --]

On Wed, 6 Jun 2012, Chen Gong wrote:
> 于 2012/6/6 18:23, Thomas Gleixner 写道:
> I think you mean
> 
> -               del_timer_sync(t);
>                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
> +		del_timer_sync(t);
>                 break;

No I meant it the way I wrote, but as you don't execute hotplug it's
irrelevant.

So the obvious candidate is the mce-injection code, which was
obviously never tested with DEBUG_PREEMPT enabled.

raise_local() can be called with preemption enabled from
raise_mce(). Fix for that below.

Though I can't see how that would do anything with the timer.

Another thing in that mce injection code is that there is no
protection against concurrent writes, except when the device was
opened with O_EXCL, but what guarantees that ?

Thanks,

	tglx

Index: tip/arch/x86/kernel/cpu/mcheck/mce-inject.c
===================================================================
--- tip.orig/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ tip/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -194,7 +194,11 @@ static void raise_mce(struct mce *m)
 		put_online_cpus();
 	} else
 #endif
+	{
+		preempt_disable();
 		raise_local();
+		preempt_enable();
+	}
 }
 
 /* Error injection interface */

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-06 14:15                   ` Thomas Gleixner
@ 2012-06-06 14:46                     ` Thomas Gleixner
  2012-06-07  3:32                       ` Chen Gong
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2012-06-06 14:46 UTC (permalink / raw)
  To: Chen Gong; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1453 bytes --]

On Wed, 6 Jun 2012, Thomas Gleixner wrote:

> On Wed, 6 Jun 2012, Chen Gong wrote:
> > 于 2012/6/6 18:23, Thomas Gleixner 写道:
> > I think you mean
> > 
> > -               del_timer_sync(t);
> >                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
> > +		del_timer_sync(t);
> >                 break;
> 
> No I meant it the way I wrote, but as you don't execute hotplug it's
> irrelevant.
> 
> So the obvious candidate is the mce-injection code, which was
> obviously never tested with DEBUG_PREEMPT enabled.
> 
> raise_local() can be called with preemption enabled from
> raise_mce(). Fix for that below.
> 
> Though I can't see how that would do anything with the timer.

I think I found it. Do you have CONFIG_NO_HZ enabled? Then mod_timer()
will try to move the timer to a different cpu, when the cpu which is
running that code is idle. Bloody obvious :(

I'll send out a combo patch with all changes so far later.

Thanks,
	
	tglx

Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1307,7 +1307,7 @@ void mce_timer_kick(unsigned long interv
 
 	if (timer_pending(t)) {
 		if (time_before(when, t->expires))
-			mod_timer(t, when);
+			mod_timer_pinned(t, when);
 	} else {
 		t->expires = round_jiffies(when);
 		add_timer_on(t, smp_processor_id());

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2] x86: mce: Implement cmci poll mode for intel machines
  2012-06-06 14:46                     ` Thomas Gleixner
@ 2012-06-07  3:32                       ` Chen Gong
  0 siblings, 0 replies; 29+ messages in thread
From: Chen Gong @ 2012-06-07  3:32 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, tony.luck, bp, x86, Peter Zijlstra

于 2012/6/6 22:46, Thomas Gleixner 写道:
> On Wed, 6 Jun 2012, Thomas Gleixner wrote:
>
>> On Wed, 6 Jun 2012, Chen Gong wrote:
>>> 于 2012/6/6 18:23, Thomas Gleixner 写道:
>>> I think you mean
>>>
>>> -               del_timer_sync(t);
>>>                  smp_call_function_single(cpu, mce_disable_cpu,&action, 1);
>>> +		del_timer_sync(t);
>>>                  break;
>>
>> No I meant it the way I wrote, but as you don't execute hotplug it's
>> irrelevant.
>>
>> So the obvious candidate is the mce-injection code, which was
>> obviously never tested with DEBUG_PREEMPT enabled.
>>
>> raise_local() can be called with preemption enabled from
>> raise_mce(). Fix for that below.
>>
>> Though I can't see how that would do anything with the timer.
>
> I think I found it. Do you have CONFIG_NO_HZ enabled? Then mod_timer()
> will try to move the timer to a different cpu, when the cpu which is
> running that code is idle. Bloody obvious :(

Oh, yes, it works! mod_timer is really a naughty baby :-).
Now it passes the basic test, and then I will use your latest
patch series to test more scenarios.

>
> I'll send out a combo patch with all changes so far later.
>
> Thanks,
> 	
> 	tglx
>
> Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -1307,7 +1307,7 @@ void mce_timer_kick(unsigned long interv
>
>   	if (timer_pending(t)) {
>   		if (time_before(when, t->expires))
> -			mod_timer(t, when);
> +			mod_timer_pinned(t, when);
>   	} else {
>   		t->expires = round_jiffies(when);
>   		add_timer_on(t, smp_processor_id());


^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2012-06-07  3:32 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-24 17:54 [patch 0/2] x86: mce: Implement poll mode for CMCI Thomas Gleixner
2012-05-24 17:54 ` [patch 1/2] x86: mce Cleanup timer mess Thomas Gleixner
2012-05-25  6:16   ` Borislav Petkov
2012-06-04  2:22   ` Chen Gong
2012-06-04 18:14     ` Luck, Tony
2012-06-04 19:57       ` Thomas Gleixner
2012-06-04 22:18       ` Borislav Petkov
2012-05-24 17:54 ` [patch 2/2] x86: mce: Implement cmci poll mode for intel machines Thomas Gleixner
2012-05-25  6:24   ` Borislav Petkov
2012-05-25  7:31     ` Chen Gong
2012-05-25  9:20       ` Thomas Gleixner
2012-05-25 12:17         ` Thomas Gleixner
2012-05-28  9:47     ` Chen Gong
2012-05-28  9:52   ` Chen Gong
2012-06-04  2:37   ` Chen Gong
2012-06-04 20:01     ` Thomas Gleixner
2012-06-05 11:47       ` Chen Gong
2012-06-05 12:57         ` Borislav Petkov
2012-06-06  1:36           ` Chen Gong
2012-06-06  9:04             ` Borislav Petkov
2012-06-05 13:35         ` Thomas Gleixner
2012-06-06  7:21           ` Chen Gong
2012-06-06  9:18             ` Thomas Gleixner
2012-06-06 10:23               ` Thomas Gleixner
2012-06-06 12:24                 ` Chen Gong
2012-06-06 12:27                   ` Peter Zijlstra
2012-06-06 14:15                   ` Thomas Gleixner
2012-06-06 14:46                     ` Thomas Gleixner
2012-06-07  3:32                       ` Chen Gong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).