All of lore.kernel.org
 help / color / mirror / Atom feed
* [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer
@ 2009-09-18 10:20 Huang Ying
  2009-09-18 11:09 ` Ingo Molnar
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
  0 siblings, 2 replies; 23+ messages in thread
From: Huang Ying @ 2009-09-18 10:20 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin, Andi Kleen, Hidetoshi Seto; +Cc: linux-kernel

Current MCE log ring buffer has following bugs and issues:

- On larger systems the 32 size buffer easily overflow, losing events.

- We had some reports of events getting corrupted which were also
  blamed on the ring buffer.

- There's a known livelock, now hit by more people, under high error
  rate.

We fix these bugs and issues via making MCE log ring buffer as
lock-less per-CPU ring buffer.

- One ring buffer for each CPU

  + MCEs are added to corresponding local per-CPU buffer, instead of
    one big global buffer. Contention/unfairness between CPUs is
    eliminated.

  + MCE records are read out and removed from per-CPU buffers by mutex
    protected global reader function, because there are not many
    readers in the system to contend in most cases.

- Per-CPU ring buffer data structure

  + An array is used to hold MCE records. integer "head" indicates
    next writing position and integer "tail" indicates next reading
    position.

  + To distinguish buffer empty and full, head and tail wrap to 0 at
    MCE_LOG_LIMIT instead of MCE_LOG_LEN. Then the real next writing
    position is head % MCE_LOG_LEN, and real next reading position is
    tail % MCE_LOG_LEN. If buffer is empty, head == tail, if buffer is
    full, head % MCE_LOG_LEN == tail % MCE_LOG_LEN and head != tail.

- Lock-less for writer side

  + MCE log writer may come from NMI, so the writer side must be
    lock-less. For per-CPU buffer of one CPU, writers may come from
    process, IRQ or NMI context, so "head" is increased with
    cmpxchg_local() to allocate buffer space.

  + Reader side is protected with a mutex to guarantee only one reader
    is active in the whole system.


Performance tests show that the throughput of per-CPU mcelog buffer can
reach 430k records/s compared with 5.3k records/s for original
implementation on a 2-core 2.1GHz Core2 machine.


ChangeLog:

v7:

- Rebased on latest x86-tip.git/x86/mce

- Revise mce print code.

v6:

- Rebased on latest x86-tip.git/x86/mce + x86/tip

v5:

- Rebased on latest x86-tip.git/mce4.

- Fix parsing mce_log data structure in dumping kernel.

v4:

- Rebased on x86-tip.git x86/mce3 branch

- Fix a synchronization issue about mce.finished

- Fix comment style

v3:

- Use DEFINE_PER_CPU to allocate per_cpu mcelog buffer.

- Use cond_resched() to prevent possible system not responding issue
  for large user buffer.

v2:

- Use alloc_percpu() to allocate per_cpu mcelog buffer.

- Use ndelay to implement writer timeout.


Signed-off-by: Huang Ying <ying.huang@intel.com>
CC: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/include/asm/mce.h       |   18 +-
 arch/x86/kernel/cpu/mcheck/mce.c |  295 +++++++++++++++++++++++----------------
 2 files changed, 190 insertions(+), 123 deletions(-)

--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -76,20 +76,28 @@ struct mce {
  * is set.
  */
 
-#define MCE_LOG_LEN 32
+#define MCE_LOG_LEN		32
+#define MCE_LOG_LIMIT		(MCE_LOG_LEN * 2 - 1)
+
+static inline int mce_log_index(int n)
+{
+	return n >= MCE_LOG_LEN ? n - MCE_LOG_LEN : n;
+}
+
+struct mce_log_cpu;
 
 struct mce_log {
-	char signature[12]; /* "MACHINECHECK" */
+	char signature[12]; /* "MACHINECHEC2" */
 	unsigned len;	    /* = MCE_LOG_LEN */
-	unsigned next;
 	unsigned flags;
 	unsigned recordlen;	/* length of struct mce */
-	struct mce entry[MCE_LOG_LEN];
+	unsigned nr_mcelog_cpus;
+	struct mce_log_cpu **mcelog_cpus;
 };
 
 #define MCE_OVERFLOW 0		/* bit 0 in flags means overflow */
 
-#define MCE_LOG_SIGNATURE	"MACHINECHECK"
+#define MCE_LOG_SIGNATURE	"MACHINECHEC2"
 
 #define MCE_GET_RECORD_LEN   _IOR('M', 1, int)
 #define MCE_GET_LOG_LEN      _IOR('M', 2, int)
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -52,6 +52,9 @@ int mce_disabled __read_mostly;
 
 #define SPINUNIT 100	/* 100ns */
 
+/* Timeout log reader to wait writer to finish */
+#define WRITER_TIMEOUT_NS	NSEC_PER_MSEC
+
 atomic_t mce_entry;
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
@@ -125,42 +128,50 @@ static struct mce_log mcelog = {
 	.recordlen	= sizeof(struct mce),
 };
 
+struct mce_log_cpu {
+	int head;
+	int tail;
+	unsigned long flags;
+	struct mce entry[MCE_LOG_LEN];
+};
+
+DEFINE_PER_CPU(struct mce_log_cpu, mce_log_cpus);
+
 void mce_log(struct mce *mce)
 {
-	unsigned next, entry;
+	struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
+	int head, ihead, tail, next;
 
 	mce->finished = 0;
-	wmb();
-	for (;;) {
-		entry = rcu_dereference(mcelog.next);
-		for (;;) {
-			/*
-			 * When the buffer fills up discard new entries.
-			 * Assume that the earlier errors are the more
-			 * interesting ones:
-			 */
-			if (entry >= MCE_LOG_LEN) {
-				set_bit(MCE_OVERFLOW,
-					(unsigned long *)&mcelog.flags);
-				return;
-			}
-			/* Old left over entry. Skip: */
-			if (mcelog.entry[entry].finished) {
-				entry++;
-				continue;
-			}
-			break;
+	/*
+	 * mce->finished must be set to 0 before written to ring
+	 * buffer
+	 */
+	smp_wmb();
+	do {
+		head = mcelog_cpu->head;
+		tail = mcelog_cpu->tail;
+		ihead = mce_log_index(head);
+		/*
+		 * When the buffer fills up discard new entries.
+		 * Assume that the earlier errors are the more
+		 * interesting.
+		 */
+		if (ihead == mce_log_index(tail) && head != tail) {
+			set_bit(MCE_OVERFLOW, &mcelog_cpu->flags);
+			return;
 		}
-		smp_rmb();
-		next = entry + 1;
-		if (cmpxchg(&mcelog.next, entry, next) == entry)
-			break;
-	}
-	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
-	wmb();
-	mcelog.entry[entry].finished = 1;
-	wmb();
-
+		next = head == MCE_LOG_LIMIT ? 0 : head + 1;
+	} while (cmpxchg_local(&mcelog_cpu->head, head, next) != head);
+	memcpy(mcelog_cpu->entry + ihead, mce, sizeof(struct mce));
+	/*
+	 * ".finished" of MCE record in ring buffer must be set after
+	 * copy
+	 */
+	smp_wmb();
+	mcelog_cpu->entry[ihead].finished = 1;
+	/* bit 0 of notify_user should be set after finished be set */
+	smp_wmb();
 	mce->finished = 1;
 	set_bit(0, &mce_need_notify);
 }
@@ -200,6 +211,26 @@ static void print_mce_tail(void)
 	       KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
 }
 
+static void print_mce_cpu(int cpu, struct mce *final, u64 mask, u64 res)
+{
+	int i;
+	struct mce_log_cpu *mcelog_cpu;
+
+	mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+	for (i = 0; i < MCE_LOG_LEN; i++) {
+		struct mce *m = &mcelog_cpu->entry[i];
+		if (!m->finished)
+			continue;
+		if (!(m->status & MCI_STATUS_VAL))
+			continue;
+		if ((m->status & mask) != res)
+			continue;
+		if (final && !memcmp(m, final, sizeof(struct mce)))
+			continue;
+		print_mce(m);
+	}
+}
+
 #define PANIC_TIMEOUT 5 /* 5 seconds */
 
 static atomic_t mce_paniced;
@@ -222,7 +253,7 @@ static void wait_for_panic(void)
 
 static void mce_panic(char *msg, struct mce *final, char *exp)
 {
-	int i;
+	int cpu;
 
 	if (!fake_panic) {
 		/*
@@ -241,23 +272,12 @@ static void mce_panic(char *msg, struct 
 	}
 	print_mce_head();
 	/* First print corrected ones that are still unlogged */
-	for (i = 0; i < MCE_LOG_LEN; i++) {
-		struct mce *m = &mcelog.entry[i];
-		if (!(m->status & MCI_STATUS_VAL))
-			continue;
-		if (!(m->status & MCI_STATUS_UC))
-			print_mce(m);
-	}
-	/* Now print uncorrected but with the final one last */
-	for (i = 0; i < MCE_LOG_LEN; i++) {
-		struct mce *m = &mcelog.entry[i];
-		if (!(m->status & MCI_STATUS_VAL))
-			continue;
-		if (!(m->status & MCI_STATUS_UC))
-			continue;
-		if (!final || memcmp(m, final, sizeof(struct mce)))
-			print_mce(m);
-	}
+	for_each_online_cpu(cpu)
+		print_mce_cpu(cpu, final, MCI_STATUS_UC, 0);
+	/* Print uncorrected but without the final one */
+	for_each_online_cpu(cpu)
+		print_mce_cpu(cpu, final, MCI_STATUS_UC, MCI_STATUS_UC);
+	/* Finally print the final mce */
 	if (final)
 		print_mce(final);
 	if (cpu_missing)
@@ -1207,6 +1227,22 @@ static int __cpuinit mce_cap_init(void)
 	return 0;
 }
 
+/*
+ * Initialize MCE per-CPU log buffer
+ */
+static __cpuinit void mce_log_init(void)
+{
+	int cpu;
+
+	if (mcelog.mcelog_cpus)
+		return;
+	mcelog.nr_mcelog_cpus = num_possible_cpus();
+	mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
+				     GFP_KERNEL);
+	for_each_possible_cpu(cpu)
+		mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
+}
+
 static void mce_init(void)
 {
 	mce_banks_t all_banks;
@@ -1367,6 +1403,7 @@ void __cpuinit mcheck_init(struct cpuinf
 		return;
 	}
 	mce_cpu_quirks(c);
+	mce_log_init();
 
 	machine_check_vector = do_machine_check;
 
@@ -1415,94 +1452,116 @@ static int mce_release(struct inode *ino
 	return 0;
 }
 
-static void collect_tscs(void *data)
+static ssize_t mce_read_cpu(struct mce_log_cpu *mcelog_cpu,
+			    char __user *inubuf, size_t usize)
 {
-	unsigned long *cpu_tsc = (unsigned long *)data;
+	char __user *ubuf = inubuf;
+	int head, tail, pos, i, err = 0;
 
-	rdtscll(cpu_tsc[smp_processor_id()]);
-}
+	head = mcelog_cpu->head;
+	tail = mcelog_cpu->tail;
 
-static DEFINE_MUTEX(mce_read_mutex);
+	if (head == tail)
+		return 0;
 
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
-			loff_t *off)
-{
-	char __user *buf = ubuf;
-	unsigned long *cpu_tsc;
-	unsigned prev, next;
-	int i, err;
+	for (pos = tail; pos != head && usize >= sizeof(struct mce);
+	     pos = pos == MCE_LOG_LIMIT ? 0 : pos+1) {
+		i = mce_log_index(pos);
+		if (!mcelog_cpu->entry[i].finished) {
+			int timeout = WRITER_TIMEOUT_NS;
+			while (!mcelog_cpu->entry[i].finished) {
+				if (timeout-- <= 0) {
+					memset(mcelog_cpu->entry + i, 0,
+					       sizeof(struct mce));
+					head = mcelog_cpu->head;
+					printk(KERN_WARNING "mcelog: timeout "
+					       "waiting for writer to finish!\n");
+					goto timeout;
+				}
+				ndelay(1);
+			}
+		}
+		/*
+		 * finished field should be checked before
+		 * copy_to_user()
+		 */
+		smp_rmb();
+		err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
+				   sizeof(struct mce));
+		ubuf += sizeof(struct mce);
+		usize -= sizeof(struct mce);
+		mcelog_cpu->entry[i].finished = 0;
+timeout:
+		;
+	}
+	/*
+	 * mcelog_cpu->tail must be updated after ".finished" of
+	 * corresponding MCE records are clear.
+	 */
+	smp_wmb();
+	mcelog_cpu->tail = pos;
 
-	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
-	if (!cpu_tsc)
-		return -ENOMEM;
+	return err ? -EFAULT : ubuf - inubuf;
+}
 
-	mutex_lock(&mce_read_mutex);
-	next = rcu_dereference(mcelog.next);
+static int mce_empty_cpu(struct mce_log_cpu *mcelog_cpu)
+{
+	return mcelog_cpu->head == mcelog_cpu->tail;
+}
 
-	/* Only supports full reads right now */
-	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
-		mutex_unlock(&mce_read_mutex);
-		kfree(cpu_tsc);
+static int mce_empty(void)
+{
+	int cpu;
+	struct mce_log_cpu *mcelog_cpu;
 
-		return -EINVAL;
+	for_each_possible_cpu(cpu) {
+		mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+		if (!mce_empty_cpu(mcelog_cpu))
+			return 0;
 	}
+	return 1;
+}
 
-	err = 0;
-	prev = 0;
-	do {
-		for (i = prev; i < next; i++) {
-			unsigned long start = jiffies;
+static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
+			loff_t *off)
+{
+	char __user *ubuf = inubuf;
+	struct mce_log_cpu *mcelog_cpu;
+	int cpu, new_mce, err = 0;
+	static DEFINE_MUTEX(mce_read_mutex);
 
-			while (!mcelog.entry[i].finished) {
-				if (time_after_eq(jiffies, start + 2)) {
-					memset(mcelog.entry + i, 0,
-					       sizeof(struct mce));
-					goto timeout;
-				}
-				cpu_relax();
-			}
-			smp_rmb();
-			err |= copy_to_user(buf, mcelog.entry + i,
-					    sizeof(struct mce));
-			buf += sizeof(struct mce);
-timeout:
-			;
+	mutex_lock(&mce_read_mutex);
+	do {
+		new_mce = 0;
+		for_each_possible_cpu(cpu) {
+			if (usize < sizeof(struct mce))
+				goto out;
+			mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+			err = mce_read_cpu(mcelog_cpu, ubuf,
+					   sizeof(struct mce));
+			if (err > 0) {
+				ubuf += sizeof(struct mce);
+				usize -= sizeof(struct mce);
+				new_mce = 1;
+				err = 0;
+			} else if (err < 0)
+				goto out;
 		}
-
-		memset(mcelog.entry + prev, 0,
-		       (next - prev) * sizeof(struct mce));
-		prev = next;
-		next = cmpxchg(&mcelog.next, prev, 0);
-	} while (next != prev);
-
-	synchronize_sched();
-
-	/*
-	 * Collect entries that were still getting written before the
-	 * synchronize.
-	 */
-	on_each_cpu(collect_tscs, cpu_tsc, 1);
-
-	for (i = next; i < MCE_LOG_LEN; i++) {
-		if (mcelog.entry[i].finished &&
-		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
-			err |= copy_to_user(buf, mcelog.entry+i,
-					    sizeof(struct mce));
-			smp_rmb();
-			buf += sizeof(struct mce);
-			memset(&mcelog.entry[i], 0, sizeof(struct mce));
+		if (need_resched()) {
+			mutex_unlock(&mce_read_mutex);
+			cond_resched();
+			mutex_lock(&mce_read_mutex);
 		}
-	}
+	} while (new_mce || !mce_empty());
+out:
 	mutex_unlock(&mce_read_mutex);
-	kfree(cpu_tsc);
-
-	return err ? -EFAULT : buf - ubuf;
+	return err ? err : ubuf - inubuf;
 }
 
 static unsigned int mce_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &mce_wait, wait);
-	if (rcu_dereference(mcelog.next))
+	if (!mce_empty())
 		return POLLIN | POLLRDNORM;
 	return 0;
 }



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer
  2009-09-18 10:20 [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Huang Ying
@ 2009-09-18 11:09 ` Ingo Molnar
  2009-09-21  5:37   ` Huang Ying
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
  1 sibling, 1 reply; 23+ messages in thread
From: Ingo Molnar @ 2009-09-18 11:09 UTC (permalink / raw)
  To: Huang Ying, Borislav Petkov, Frédéric Weisbecker, Li Zefan,
	Steven Rostedt
  Cc: H. Peter Anvin, Andi Kleen, Hidetoshi Seto, linux-kernel


* Huang Ying <ying.huang@intel.com> wrote:

> Current MCE log ring buffer has following bugs and issues:
> 
> - On larger systems the 32 size buffer easily overflow, losing events.
> 
> - We had some reports of events getting corrupted which were also
>   blamed on the ring buffer.
> 
> - There's a known livelock, now hit by more people, under high error
>   rate.
> 
> We fix these bugs and issues via making MCE log ring buffer as 
> lock-less per-CPU ring buffer.

I like the direction of this (the current MCE ring-buffer code is a bad 
local hack that should never have been merged upstream in that form) - 
but i'd like to see a MUCH more ambitious (and much more useful!) 
approach instead of using an explicit ring-buffer.

Please define MCE generic tracepoints using TRACE_EVENT() and use 
perfcounters to access them.

This approach solves all the problems you listed and it also adds a 
large number of new features to MCE events:

 - Multiple user-space agents can access MCE events. You can have an
   mcelog daemon running but also a system-wide tracer capturing
   important events in flight-recorder mode.

 - Sampling support: the kernel and the user-space call-chain of MCE
   events can be stored and analyzed as well. This way actual patterns 
   of bad behavior can be matched to precisely what kind of activity 
   happened in the kernel (and/or in the app) around that moment in 
   time.

 - Coupling with other hardware and software events: the PMU can track a 
   number of other anomalies - monitoring software might chose to 
   monitor those plus the MCE events as well - in one coherent stream of 
   events.

 - Discovery of MCE sources - tracepoints are enumerated and tools can 
   act upon the existence (or non-existence) of various channels of MCE 
   information.

 - Filtering support: you just subscribe to and act upon the events you 
   are interested in. Then even on a per event source basis there's 
   in-kernel filter expressions available that can restrict the amount
   of data that hits the event channel.

 - Arbitrary deep per cpu buffering of events - you can buffer 32 
   entries or you can buffer as much as you want, as long as you have 
   the RAM.

 - An NMI-safe ring-buffer implementation - mappable to user-space.

 - Built-in support for timestamping of events, PID markers, CPU 
   markers, etc.

 - A rich ABI accessible over system call interface. Per cpu, per task 
   and per workload monitoring of MCE events can be done this way. The 
   ABI itself has a nice, meaningful structure.

 - Extensible ABI: new fields can be added without breaking tooling.
   New tracepoints can be added as the hardware side evolves. There's 
   various parsers that can be used.

 - Lots of scheduling/buffering/batching modes of operandi for MCE
   events. poll() support. mmap() support. read() support. You name it.

 - Rich tooling support: even without any MCE specific extensions added
   the 'perf' tool today offers various views of MCE data: perf report,
   perf stat, perf trace can all be used to view logged MCE events and
   perhaps correlate them to certain user-space usage patterns. But it
   can be used directly as well, for user-space agents and policy action
   in mcelog, etc.

 - Significant code reduction and cleanup in the MCE code: the whole 
   mcelog facility can be dropped in essence.

 - (these are the top of the list - there more advantages as well.)

Such a design would basically propel the MCE code into the twenty first 
century. Once we have these facilities we can phase out /dev/mcelog for 
good. It would turn Linux MCE events from a quirky hack that doesnt even 
work after years of hacking into a modern, extensible event logging 
facility that uses event sources and flexible transports to user-space.

It would actually be code that is not a problem child like today but one 
that we can take pride in and which is fun to work on :-)

Now, an approach like this shouldnt just be a blind export of mce_log() 
into a single artificial generic event [which is a pretty poor API to 
begin with] - it should be the definition of meaningful 
tracepoints/events that describe the hardware's structure.

I'd rather have a good enumeration of various sources of MCEs as 
separate tracepoints than some badly jumbled mess of all MCE sources in 
one inflexible ABI as /dev/mcelog does it today.

Note, if you need any perfcounter infrastructure extensions/help for 
this then we'll be glad to provide that. I'm sure there's a few things 
to enhance and a few things to fix - there always are with any 
non-trivial new user :-) But heck would i take _those_ forward looking 
problems over any of the current MCE design mess, any day of the week.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer
  2009-09-18 11:09 ` Ingo Molnar
@ 2009-09-21  5:37   ` Huang Ying
  2009-09-22 13:39     ` [PATCH] x86: mce: New MCE logging design Ingo Molnar
  0 siblings, 1 reply; 23+ messages in thread
From: Huang Ying @ 2009-09-21  5:37 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Frédéric Weisbecker, Li Zefan, Steven Rostedt,
	H. Peter Anvin, Andi Kleen, Hidetoshi Seto, linux-kernel

Hi, Ingo,

I think it is an interesting idea to replace the original MCE logging
part implementation (MCE log ring buffer + /dev/mcelog misc device) with
tracing infrastructure. The tracing infrastructure has definitely more
features than original method, but it is more complex than original
method too.

The tracing infrastructure is good at tracing. But I think it may be too
complex to be used as a (hardware) error processing mechanism. I think
the error processing should be simple and relatively independent with
other part of the kernel to avoid triggering more (hardware) error
during error processing. What do you think about this?

Even if we will reach consensus at some point to re-write MCE logging
part totally. We still need to fix the bugs in original implementation
before the new implementation being ready. Do you agree?

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] x86: mce: New MCE logging design
  2009-09-21  5:37   ` Huang Ying
@ 2009-09-22 13:39     ` Ingo Molnar
  0 siblings, 0 replies; 23+ messages in thread
From: Ingo Molnar @ 2009-09-22 13:39 UTC (permalink / raw)
  To: Huang Ying
  Cc: Borislav Petkov, Frédéric Weisbecker, Li Zefan, Steven Rostedt,
	H. Peter Anvin, Andi Kleen, Hidetoshi Seto, linux-kernel,
	Arjan van de Ven, Thomas Gleixner


* Huang Ying <ying.huang@intel.com> wrote:

> Hi, Ingo,
> 
> I think it is a interesting idea to replace the original MCE logging 
> part implementation (MCE log ring buffer + /dev/mcelog misc device) 
> with tracing infrastructure. The tracing infrastructure has definitely 
> more features than original method, but it is more complex than 
> original method too.

TRACE_EVENT() can be used to define the tracepoint itself - but most of 
the benefit comes from using the performance events subsystem for event 
reporting. It's a perfect match for MCE events - see below.

> The tracing infrastructure is good at tracing. But I think it may be 
> too complex to be used as a (hardware) error processing mechanism. I 
> think the error processing should be simple and relatively independent 
> with other part of the kernel to avoid triggering more (hardware) 
> error during error processing. What do you think about this?

The fact that event reporting is used for other things as well might 
even make it more fault-resilient than open-coded MCE code: just due to 
it possibly being hot in the cache already (and thus not needing extra 
DRAM cycles to a possibly faulty piece of hardware) while the MCE code, 
not used by anything else, has to be brought into the cache for error 
processing - triggering more faults.

But ... that argument is even largely irrelevant: the overriding core 
kernel principle was always that of reuse of facilities. We want to keep 
performance event reporting simple and robust too, so re-using 
facilities is a good thing and the needs of MCE logging will improve 
perf events - and vice versa.

Furthermore, to me as x86 maintainer it matters a lot that a piece of 
code i maintain is clean, well-designed and maintainable in the long 
run. The current /dev/mcelog code is not clean and not maintainable,
and that needs to be fixed before it can be extended.

That MCE logging also becomes more robust and more feature-rich as a 
result is a happy side-effect.

> Even if we will reach consensus at some point to re-write MCE logging 
> part totally. We still need to fix the bugs in original implementation 
> before the new implementation being ready. Do you agree?

Well, this is all 2.6.33 stuff and that's plenty of time to do the 
generic events + perf events approach.

Anyway, i think we can work on these MCE enhancements much better if we 
move it to the level of actual patches. Let me give you a (very small) 
demonstration of my requests.

The patch attached below adds two new events to the MCE code, to the 
thermal throttling code:

  mce_thermal_throttle_hot
  mce_thermal_throttle_cold

If you apply the patch below to latest -tip and boot that kernel and do:

	cd tools/perf/
	make -j install

And type 'perf list', those two new MCE events will show up in the list 
of available events:

  # perf list
  [...]
    mce:mce_thermal_throttle_hot               [Tracepoint event]
    mce:mce_thermal_throttle_cold              [Tracepoint event]
  [...]

Initially you can check whether those events are occurring:

  # perf stat -a -e mce:mce_thermal_throttle_hot sleep 1

   Performance counter stats for 'sleep 1':

              0  mce:mce_thermal_throttle_hot #      0.000 M/sec

    1.000672214  seconds time elapsed

None on a normal system. Now, i tried it on a system that gets frequent 
thermal events, and there it gives:

  # perf stat -a -e mce:mce_thermal_throttle_hot sleep 1

  Performance counter stats for 'sleep 1':

              73  mce:mce_thermal_throttle_hot #      0.000 M/sec

     1.000765260  seconds time elapsed

Now i can observe the frequency of those thermal throttling events:

  # perf stat --repeat 10 -a -e mce:mce_thermal_throttle_hot sleep 1

  Performance counter stats for 'sleep 1' (10 runs):

             184  mce:mce_thermal_throttle_hot #      0.000 M/sec   ( +-   1.971% )

     1.006488821  seconds time elapsed   ( +-   0.203% )

184/sec is pretty hefty, with a relatively stable rate.

Now i try to see exactly which code is affected by those thermal 
throttling events. For that i use 'perf record', and profile the whole 
system for 10 seconds:

  # perf record -f -e mce:mce_thermal_throttle_hot -c 1 -a sleep 10
  [ perf record: Woken up 4 times to write data ]
  [ perf record: Captured and wrote 0.802 MB perf.data (~35026 samples) ]

And i used 'perf report' to analyze where the MCE events occurred:

 # Samples: 2495
 #
 # Overhead         Command             Shared Object  Symbol
 # ........  ..............  ........................  ......
 #
    55.39%       hackbench  [vdso]                    [.] 0x000000f77e1430
    35.27%       hackbench                  f77c2430  [.] 0x000000f77c2430
     3.05%       hackbench  [kernel]                  [k] ia32_setup_sigcontext
     0.88%            bash  [vdso]                    [.] 0x000000f776f430
     0.64%            bash                  f7731430  [.] 0x000000f7731430
     0.52%       hackbench  ./hackbench               [.] sender
     0.48%            nice                  f7753430  [.] 0x000000f7753430
     0.36%       hackbench  ./hackbench               [.] receiver
     0.24%            perf  /lib64/libc-2.5.so        [.] __GI_read
     0.24%       hackbench  /lib/libc-2.5.so          [.] __libc_read

Most of the events concentrate themselves to around syscall entry 
points. That's not because the system is doing that most of the time - 
it's probably because the fast syscall entry point is a critical path of 
the CPU and overheating events likely concentrate themselves to around 
that place.

Now lets how we can do very detailed MCE logs, with per CPU data, 
timestamps and other events mixed. For that i use perf record again:

# perf record -R -f -e mce:mce_thermal_throttle_hot:r -e mce:mce_thermal_throttle_cold:r -e irq:irq_handler_entry:r -M -c 1 -a sleep 10
[ perf record: Woken up 12 times to write data ]
[ perf record: Captured and wrote 1.167 MB perf.data (~50990 samples) ]

And i use 'perf trace' to look at the result:

       hackbench-5070  [000]  1542.044184301: mce_thermal_throttle_hot: throttle_count=150852
       hackbench-5070  [000]  1542.044672817: mce_thermal_throttle_cold: throttle_count=150852
       hackbench-5056  [000]  1542.047029212: mce_thermal_throttle_hot: throttle_count=150853
       hackbench-5057  [000]  1542.047518733: mce_thermal_throttle_cold: throttle_count=150853
       hackbench-5058  [000]  1542.049822672: mce_thermal_throttle_hot: throttle_count=150854
       hackbench-5046  [000]  1542.050312612: mce_thermal_throttle_cold: throttle_count=150854
       hackbench-5072  [000]  1542.052720357: mce_thermal_throttle_hot: throttle_count=150855
       hackbench-5072  [000]  1542.053210612: mce_thermal_throttle_cold: throttle_count=150855
           :5074-5074  [000]  1542.055689468: mce_thermal_throttle_hot: throttle_count=150856
       hackbench-5011  [000]  1542.056179849: mce_thermal_throttle_cold: throttle_count=150856
           :5061-5061  [000]  1542.056937152: mce_thermal_throttle_hot: throttle_count=150857
       hackbench-5048  [000]  1542.057428970: mce_thermal_throttle_cold: throttle_count=150857
       hackbench-5053  [000]  1542.060372255: mce_thermal_throttle_hot: throttle_count=150858
       hackbench-5054  [000]  1542.060863006: mce_thermal_throttle_cold: throttle_count=150858

The first thing we can see it from the trace, only the first core 
reports thermal events (the box is a dual-core box). The other thing is 
the timing: the CPU is in 'hot' state for about ~490 microseconds, then 
in 'cold' state for ~250 microseconds.

So the CPU spends about 65% of its time in hot-throttled state, to cool 
down enough to switch back to cold mode again.

Also, most of the overheating events hit the 'hackbench' app. (which is 
no surprise on this box as it is only running that workload - but might 
be interesting on other systems with more mixed workloads.)

So even in its completely MCE-unaware form, and with this small patch 
that took 5 minutes to write, the performance events subsystem and 
tooling can already give far greater insight into characteristics of 
these events than the MCE subsystem did before.

Obviously the reported events can be used for different, MCE specific 
things as well:

 - Report an alert on the desktop (without having to scan the syslog all 
   the time)

 - Mail the sysadmin

 - Do some specific policy action such as soft-throttle the workload 
   until the situation and the log data can be examined.

As i outlined it in my previous mail, the perf events subsystem provides 
a wide range of facilities that can be used for that purpose.

If there are aspects of MCE reporting usecases that necessitate the 
extension of perf events, then we can do that - both sides will benefit 
from it. What we dont want to do is to prolongue the pain caused by the 
inferior, buggy and poorly designed /dev/mcelog ABI and code. It's not 
serving our users well and people are spending time on fixing something 
that has been misdesigned from the get go instead of concentrating on 
improving the right facilities.

So this is really how i see the future of the MCE logging subsystem: use 
meaningful generic events that empowers users, admins and tools to do 
something intelligent with the available stream of MCE data. The 
hardware tells us a _lot_ of information - lets use and expose that in a 
structured form.

	Ingo

---
 arch/x86/kernel/cpu/mcheck/therm_throt.c |    9 +++++
 include/trace/events/mce.h               |   47 +++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

Index: linux/arch/x86/kernel/cpu/mcheck/therm_throt.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ linux/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -31,6 +31,9 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
 /* How long to wait between reporting thermal events */
 #define CHECK_INTERVAL		(300 * HZ)
 
@@ -118,8 +121,12 @@ static int therm_throt_process(bool is_t
 	was_throttled = state->is_throttled;
 	state->is_throttled = is_throttled;
 
-	if (is_throttled)
+	if (is_throttled) {
 		state->throttle_count++;
+		trace_mce_thermal_throttle_hot(state->throttle_count);
+	} else {
+		trace_mce_thermal_throttle_cold(state->throttle_count);
+	}
 
 	if (time_before64(now, state->next_check) &&
 			state->throttle_count != state->last_throttle_count)
Index: linux/include/trace/events/mce.h
===================================================================
--- /dev/null
+++ linux/include/trace/events/mce.h
@@ -0,0 +1,47 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mce
+
+#if !defined(_TRACE_MCE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MCE_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(mce_thermal_throttle_hot,
+
+	TP_PROTO(unsigned int throttle_count),
+
+	TP_ARGS(throttle_count),
+
+	TP_STRUCT__entry(
+		__field(	u64,		throttle_count	)
+	),
+
+	TP_fast_assign(
+		__entry->throttle_count = throttle_count;
+	),
+
+	TP_printk("throttle_count=%Lu", __entry->throttle_count)
+);
+
+TRACE_EVENT(mce_thermal_throttle_cold,
+
+	TP_PROTO(unsigned int throttle_count),
+
+	TP_ARGS(throttle_count),
+
+	TP_STRUCT__entry(
+		__field(	u64,		throttle_count	)
+	),
+
+	TP_fast_assign(
+		__entry->throttle_count = throttle_count;
+	),
+
+	TP_printk("throttle_count=%Lu", __entry->throttle_count)
+);
+
+#endif /* _TRACE_MCE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer
  2009-09-18 10:20 [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Huang Ying
  2009-09-18 11:09 ` Ingo Molnar
@ 2009-10-05  6:23 ` Hidetoshi Seto
  2009-10-05  6:33   ` [PATCH 01/10] x86, mce: remove tsc handling from mce_read Hidetoshi Seto
                     ` (11 more replies)
  1 sibling, 12 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  6:23 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

Hi Huang,

Huang Ying wrote:
> Current MCE log ring buffer has following bugs and issues:
> 
> - On larger systems the 32 size buffer easily overflow, losing events.
> 
> - We had some reports of events getting corrupted which were also
>   blamed on the ring buffer.
> 
> - There's a known livelock, now hit by more people, under high error
>   rate.
> 
> We fix these bugs and issues via making MCE log ring buffer as
> lock-less per-CPU ring buffer.

Now I have a real problem with the small MCE log buffer on my new large
system with Nehalem, which has many cpus/banks in one socket...
So I'd like to solve the problem asap.  I think this problem might block
some distros from supporting the new processor.

Last week I reviewed your patch again and noticed that it is making a lot
of changes at once.  I suppose that this approach must be one of the
reasons why your patch seems to be so hard to review, and why it is
taking a long time to be accepted by the x86 maintainers.

Fortunately I had some spare time so I carefully broke your patch into
some purpose-designed pieces.  The most significant change is that there
are now 2 steps to convert the buffer structure - 1) to make it per-CPU
and 2) to make it a ring buffer.

Also I fixed some problems in your patch, found along the way while
making this patch set.  I'll explain my changes later using a diff
against your change.  Comments are welcome.

Thanks,
H.Seto


Hidetoshi Seto (10):
      x86, mce: remove tsc handling from mce_read
      x86, mce: mce_read can check args without mutex
      x86, mce: change writer timeout in mce_read
      x86, mce: use do-while in mce_log
      x86, mce: make mce_log buffer to per-CPU, prep
      x86, mce: make mce_log buffer to per-CPU
      x86, mce: remove for-loop in mce_log
      x86, mce: change barriers in mce_log
      x86, mce: make mce_log buffer to ring buffer
      x86, mce: move mce_log_init() into mce_cap_init()

 arch/x86/include/asm/mce.h       |   43 ++++--
 arch/x86/kernel/cpu/mcheck/mce.c |  299 +++++++++++++++++++++++---------------
 2 files changed, 211 insertions(+), 131 deletions(-)


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 01/10] x86, mce: remove tsc handling from mce_read
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
@ 2009-10-05  6:33   ` Hidetoshi Seto
  2009-10-05  6:34   ` [PATCH 02/10] x86, mce: mce_read can check args without mutex Hidetoshi Seto
                     ` (10 subsequent siblings)
  11 siblings, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  6:33 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

This handling (which looks too buggy) is no longer required.

(This piece originates from Huang's patch, titled:
 "x86, MCE: Fix bugs and issues of MCE log ring buffer")

Originally-From: Huang Ying <ying.huang@intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c |   30 ------------------------------
 1 files changed, 0 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a6d5d4a..28e1e14 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1449,34 +1449,21 @@ static int mce_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static void collect_tscs(void *data)
-{
-	unsigned long *cpu_tsc = (unsigned long *)data;
-
-	rdtscll(cpu_tsc[smp_processor_id()]);
-}
-
 static DEFINE_MUTEX(mce_read_mutex);
 
 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 			loff_t *off)
 {
 	char __user *buf = ubuf;
-	unsigned long *cpu_tsc;
 	unsigned prev, next;
 	int i, err;
 
-	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
-	if (!cpu_tsc)
-		return -ENOMEM;
-
 	mutex_lock(&mce_read_mutex);
 	next = rcu_dereference(mcelog.next);
 
 	/* Only supports full reads right now */
 	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
 		mutex_unlock(&mce_read_mutex);
-		kfree(cpu_tsc);
 
 		return -EINVAL;
 	}
@@ -1511,24 +1498,7 @@ timeout:
 
 	synchronize_sched();
 
-	/*
-	 * Collect entries that were still getting written before the
-	 * synchronize.
-	 */
-	on_each_cpu(collect_tscs, cpu_tsc, 1);
-
-	for (i = next; i < MCE_LOG_LEN; i++) {
-		if (mcelog.entry[i].finished &&
-		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
-			err |= copy_to_user(buf, mcelog.entry+i,
-					    sizeof(struct mce));
-			smp_rmb();
-			buf += sizeof(struct mce);
-			memset(&mcelog.entry[i], 0, sizeof(struct mce));
-		}
-	}
 	mutex_unlock(&mce_read_mutex);
-	kfree(cpu_tsc);
 
 	return err ? -EFAULT : buf - ubuf;
 }
-- 
1.6.4.3



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 02/10] x86, mce: mce_read can check args without mutex
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
  2009-10-05  6:33   ` [PATCH 01/10] x86, mce: remove tsc handling from mce_read Hidetoshi Seto
@ 2009-10-05  6:34   ` Hidetoshi Seto
  2009-10-05  6:35   ` [PATCH 03/10] x86, mce: change writer timeout in mce_read Hidetoshi Seto
                     ` (9 subsequent siblings)
  11 siblings, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  6:34 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

Move it before mutex_lock().

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c |   11 ++++-------
 1 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 28e1e14..2d0f9d8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1458,15 +1458,12 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	unsigned prev, next;
 	int i, err;
 
-	mutex_lock(&mce_read_mutex);
-	next = rcu_dereference(mcelog.next);
-
 	/* Only supports full reads right now */
-	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
-		mutex_unlock(&mce_read_mutex);
-
+	if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
 		return -EINVAL;
-	}
+
+	mutex_lock(&mce_read_mutex);
+	next = rcu_dereference(mcelog.next);
 
 	err = 0;
 	prev = 0;
-- 
1.6.4.3



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 03/10] x86, mce: change writer timeout in mce_read
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
  2009-10-05  6:33   ` [PATCH 01/10] x86, mce: remove tsc handling from mce_read Hidetoshi Seto
  2009-10-05  6:34   ` [PATCH 02/10] x86, mce: mce_read can check args without mutex Hidetoshi Seto
@ 2009-10-05  6:35   ` Hidetoshi Seto
  2009-10-05  6:36   ` [PATCH 04/10] x86, mce: use do-while in mce_log Hidetoshi Seto
                     ` (8 subsequent siblings)
  11 siblings, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  6:35 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

Changed to use ndelay().

(This piece originates from Huang's patch, titled:
 "x86, MCE: Fix bugs and issues of MCE log ring buffer")

Originally-From: Huang Ying <ying.huang@intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c |   11 ++++++++---
 1 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2d0f9d8..9e8eabd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -52,6 +52,9 @@ int mce_disabled __read_mostly;
 
 #define SPINUNIT 100	/* 100ns */
 
+/* Timeout log reader to wait writer to finish */
+#define WRITER_TIMEOUT_NS	NSEC_PER_MSEC
+
 atomic_t mce_entry;
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
@@ -1469,15 +1472,17 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	prev = 0;
 	do {
 		for (i = prev; i < next; i++) {
-			unsigned long start = jiffies;
+			int timeout = WRITER_TIMEOUT_NS;
 
 			while (!mcelog.entry[i].finished) {
-				if (time_after_eq(jiffies, start + 2)) {
+				if (timeout-- <= 0) {
 					memset(mcelog.entry + i, 0,
 					       sizeof(struct mce));
+					printk(KERN_WARNING "mcelog: timeout "
+					     "waiting for writer to finish!\n");
 					goto timeout;
 				}
-				cpu_relax();
+				ndelay(1);
 			}
 			smp_rmb();
 			err |= copy_to_user(buf, mcelog.entry + i,
-- 
1.6.4.3



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 04/10] x86, mce: use do-while in mce_log
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
                     ` (2 preceding siblings ...)
  2009-10-05  6:35   ` [PATCH 03/10] x86, mce: change writer timeout in mce_read Hidetoshi Seto
@ 2009-10-05  6:36   ` Hidetoshi Seto
  2009-10-05  6:37   ` [PATCH 05/10] x86, mce: make mce_log buffer to per-CPU, prep Hidetoshi Seto
                     ` (7 subsequent siblings)
  11 siblings, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  6:36 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

No changes in logic.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c |   10 +++++-----
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9e8eabd..5f88ada 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -134,7 +134,8 @@ void mce_log(struct mce *mce)
 
 	mce->finished = 0;
 	wmb();
-	for (;;) {
+
+	do {
 		entry = rcu_dereference(mcelog.next);
 		for (;;) {
 			/*
@@ -156,14 +157,13 @@ void mce_log(struct mce *mce)
 		}
 		smp_rmb();
 		next = entry + 1;
-		if (cmpxchg(&mcelog.next, entry, next) == entry)
-			break;
-	}
+	} while (cmpxchg(&mcelog.next, entry, next) != entry);
+
 	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+
 	wmb();
 	mcelog.entry[entry].finished = 1;
 	wmb();
-
 	mce->finished = 1;
 	set_bit(0, &mce_need_notify);
 }
-- 
1.6.4.3



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 05/10] x86, mce: make mce_log buffer to per-CPU, prep
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
                     ` (3 preceding siblings ...)
  2009-10-05  6:36   ` [PATCH 04/10] x86, mce: use do-while in mce_log Hidetoshi Seto
@ 2009-10-05  6:37   ` Hidetoshi Seto
  2009-10-05  6:38   ` [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU Hidetoshi Seto
                     ` (6 subsequent siblings)
  11 siblings, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  6:37 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

Move core part of mce_read() into mce_read_buf().  Add mce_empty().
These changes are required to improve readability of next patch.

(This piece originates from Huang's patch, titled:
 "x86, MCE: Fix bugs and issues of MCE log ring buffer")

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c |   48 ++++++++++++++++++++++++++-----------
 1 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5f88ada..684b42e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1452,20 +1452,12 @@ static int mce_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static DEFINE_MUTEX(mce_read_mutex);
-
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
-			loff_t *off)
+static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
 {
-	char __user *buf = ubuf;
+	char __user *ubuf = inubuf;
 	unsigned prev, next;
 	int i, err;
 
-	/* Only supports full reads right now */
-	if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
-		return -EINVAL;
-
-	mutex_lock(&mce_read_mutex);
 	next = rcu_dereference(mcelog.next);
 
 	err = 0;
@@ -1485,9 +1477,9 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 				ndelay(1);
 			}
 			smp_rmb();
-			err |= copy_to_user(buf, mcelog.entry + i,
+			err |= copy_to_user(ubuf, mcelog.entry + i,
 					    sizeof(struct mce));
-			buf += sizeof(struct mce);
+			ubuf += sizeof(struct mce);
 timeout:
 			;
 		}
@@ -1500,15 +1492,43 @@ timeout:
 
 	synchronize_sched();
 
+	return err ? -EFAULT : ubuf - inubuf;
+}
+
+static int mce_empty(void)
+{
+	return !rcu_dereference(mcelog.next);
+}
+
+static DEFINE_MUTEX(mce_read_mutex);
+
+static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
+			loff_t *off)
+{
+	char __user *ubuf = inubuf;
+	int err;
+
+	/* Only supports full reads right now */
+	if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
+		return -EINVAL;
+
+	mutex_lock(&mce_read_mutex);
+
+	err = mce_read_buf(ubuf, usize);
+	if (err > 0) {
+		ubuf += err;
+		err = 0;
+	}
+
 	mutex_unlock(&mce_read_mutex);
 
-	return err ? -EFAULT : buf - ubuf;
+	return err ? err : ubuf - inubuf;
 }
 
 static unsigned int mce_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &mce_wait, wait);
-	if (rcu_dereference(mcelog.next))
+	if (!mce_empty())
 		return POLLIN | POLLRDNORM;
 	return 0;
 }
-- 
1.6.4.3



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
                     ` (4 preceding siblings ...)
  2009-10-05  6:37   ` [PATCH 05/10] x86, mce: make mce_log buffer to per-CPU, prep Hidetoshi Seto
@ 2009-10-05  6:38   ` Hidetoshi Seto
  2009-10-05  7:06     ` Andi Kleen
  2009-10-05  6:40   ` [PATCH 07/10] x86, mce: remove for-loop in mce_log Hidetoshi Seto
                     ` (5 subsequent siblings)
  11 siblings, 1 reply; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  6:38 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

On larger systems the global 32-entry buffer for mcelog easily overflows,
losing events.  And there's a known livelock, now hit by more people,
under high error rates.

This patch fixes these issues by making MCE log buffer to per-CPU:

  + MCEs are added to corresponding local per-CPU buffer, instead of
    one big global buffer.  Contention/unfairness between CPUs is
    eliminated.

Reader/Writer convention is unchanged (= Lock-less for writer side):

  + MCE log writer may come from NMI, so the writer side must be
    lock-less.  For per-CPU buffer of one CPU, writers may come from
    process, IRQ or NMI context, so cmpxchg_local() is used to allocate
    buffer space.

  + MCE records are read out and removed from per-CPU buffers by a
    mutex-protected global reader function, because there are not many
    readers in the system to contend in most cases.  In other words,
    the reader side is protected with a mutex to guarantee only one
    reader is active in the whole system.

As the result now each CPU has its local 32 size buffer.

HS: Add a member header_len to struct mce_log to help debugger to know
    where the array of record is.

(This piece originates from Huang's patch, titled:
 "x86, MCE: Fix bugs and issues of MCE log ring buffer")

Originally-From: Huang Ying <ying.huang@intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 arch/x86/include/asm/mce.h       |   37 ++++++----
 arch/x86/kernel/cpu/mcheck/mce.c |  139 +++++++++++++++++++++++++++-----------
 2 files changed, 120 insertions(+), 56 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 2f1c0ef..c5d4144 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -52,7 +52,7 @@
 #define MCE_INJ_NMI_BROADCAST	(1 << 2)	/* do NMI broadcasting */
 #define MCE_INJ_EXCEPTION	(1 << 3)	/* raise as exception */
 
-/* Fields are zero when not available */
+/* MCE log entry. Fields are zero when not available. */
 struct mce {
 	__u64 status;
 	__u64 misc;
@@ -63,12 +63,12 @@ struct mce {
 	__u64 time;	/* wall time_t when error was detected */
 	__u8  cpuvendor;	/* cpu vendor as encoded in system.h */
 	__u8  inject_flags;	/* software inject flags */
-	__u16  pad;
+	__u16 pad;
 	__u32 cpuid;	/* CPUID 1 EAX */
-	__u8  cs;		/* code segment */
+	__u8  cs;	/* code segment */
 	__u8  bank;	/* machine check bank */
 	__u8  cpu;	/* cpu number; obsolete; use extcpu now */
-	__u8  finished;   /* entry is valid */
+	__u8  finished;	/* 1 if write to entry is finished & entry is valid */
 	__u32 extcpu;	/* linux cpu number that detected the error */
 	__u32 socketid;	/* CPU socket ID */
 	__u32 apicid;	/* CPU initial apic ID */
@@ -76,26 +76,33 @@ struct mce {
 };
 
 /*
- * This structure contains all data related to the MCE log.  Also
- * carries a signature to make it easier to find from external
- * debugging tools.  Each entry is only valid when its finished flag
- * is set.
+ * This structure contains all data related to the MCE log.  Also carries
+ * a signature to make it easier to find from external debugging tools.
+ * Each entry is only valid when its finished flag is set.
  */
 
-#define MCE_LOG_LEN 32
+#define MCE_LOG_LEN		32
+
+struct mce_log_cpu;
 
 struct mce_log {
-	char signature[12]; /* "MACHINECHECK" */
-	unsigned len;	    /* = MCE_LOG_LEN */
-	unsigned next;
+	char signature[12];		/* "MACHINECHEC2" */
+
+	/* points the table of per-CPU buffers */
+	struct mce_log_cpu **mcelog_cpus;
+	unsigned int nr_mcelog_cpus;	/* = num_possible_cpus() */
+
+	/* spec of per-CPU buffer */
+	unsigned int header_len; 	/* offset of array "entry" */
+	unsigned int nr_record;		/* array size (= MCE_LOG_LEN) */
+	unsigned int record_len;	/* length of struct mce */
+
 	unsigned flags;
-	unsigned recordlen;	/* length of struct mce */
-	struct mce entry[MCE_LOG_LEN];
 };
 
 #define MCE_OVERFLOW 0		/* bit 0 in flags means overflow */
 
-#define MCE_LOG_SIGNATURE	"MACHINECHECK"
+#define MCE_LOG_SIGNATURE	"MACHINECHEC2"
 
 #define MCE_GET_RECORD_LEN   _IOR('M', 1, int)
 #define MCE_GET_LOG_LEN      _IOR('M', 2, int)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 684b42e..ad2eb89 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -122,21 +122,30 @@ EXPORT_PER_CPU_SYMBOL_GPL(mce_fake_banks);
  * separate MCEs from kernel messages to avoid bogus bug reports.
  */
 
+struct mce_log_cpu {
+	unsigned next;
+	struct mce entry[MCE_LOG_LEN];
+};
+
+DEFINE_PER_CPU(struct mce_log_cpu, mce_log_cpus);
+
 static struct mce_log mcelog = {
 	.signature	= MCE_LOG_SIGNATURE,
-	.len		= MCE_LOG_LEN,
-	.recordlen	= sizeof(struct mce),
+	.header_len	= offsetof(struct mce_log_cpu, entry),
+	.nr_record	= MCE_LOG_LEN,
+	.record_len	= sizeof(struct mce),
 };
 
 void mce_log(struct mce *mce)
 {
+	struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
 	unsigned next, entry;
 
 	mce->finished = 0;
 	wmb();
 
 	do {
-		entry = rcu_dereference(mcelog.next);
+		entry = mcelog_cpu->next;
 		for (;;) {
 			/*
 			 * When the buffer fills up discard new entries.
@@ -149,7 +158,7 @@ void mce_log(struct mce *mce)
 				return;
 			}
 			/* Old left over entry. Skip: */
-			if (mcelog.entry[entry].finished) {
+			if (mcelog_cpu->entry[entry].finished) {
 				entry++;
 				continue;
 			}
@@ -157,12 +166,12 @@ void mce_log(struct mce *mce)
 		}
 		smp_rmb();
 		next = entry + 1;
-	} while (cmpxchg(&mcelog.next, entry, next) != entry);
+	} while (cmpxchg_local(&mcelog_cpu->next, entry, next) != entry);
 
-	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+	memcpy(mcelog_cpu->entry + entry, mce, sizeof(struct mce));
 
 	wmb();
-	mcelog.entry[entry].finished = 1;
+	mcelog_cpu->entry[entry].finished = 1;
 	wmb();
 	mce->finished = 1;
 	set_bit(0, &mce_need_notify);
@@ -210,6 +219,26 @@ static void print_mce_tail(void)
 	       "Run through mcelog --ascii to decode and contact your hardware vendor\n");
 }
 
+static void print_mce_cpu(int cpu, struct mce *final, u64 mask, u64 res)
+{
+	int i;
+	struct mce_log_cpu *mcelog_cpu;
+
+	mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+	for (i = 0; i < MCE_LOG_LEN; i++) {
+		struct mce *m = &mcelog_cpu->entry[i];
+		if (!m->finished)
+			continue;
+		if (!(m->status & MCI_STATUS_VAL))
+			continue;
+		if ((m->status & mask) != res)
+			continue;
+		if (final && !memcmp(m, final, sizeof(struct mce)))
+			continue;
+		print_mce(m);
+	}
+}
+
 #define PANIC_TIMEOUT 5 /* 5 seconds */
 
 static atomic_t mce_paniced;
@@ -232,7 +261,7 @@ static void wait_for_panic(void)
 
 static void mce_panic(char *msg, struct mce *final, char *exp)
 {
-	int i;
+	int cpu;
 
 	if (!fake_panic) {
 		/*
@@ -251,23 +280,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 	}
 	print_mce_head();
 	/* First print corrected ones that are still unlogged */
-	for (i = 0; i < MCE_LOG_LEN; i++) {
-		struct mce *m = &mcelog.entry[i];
-		if (!(m->status & MCI_STATUS_VAL))
-			continue;
-		if (!(m->status & MCI_STATUS_UC))
-			print_mce(m);
-	}
-	/* Now print uncorrected but with the final one last */
-	for (i = 0; i < MCE_LOG_LEN; i++) {
-		struct mce *m = &mcelog.entry[i];
-		if (!(m->status & MCI_STATUS_VAL))
-			continue;
-		if (!(m->status & MCI_STATUS_UC))
-			continue;
-		if (!final || memcmp(m, final, sizeof(struct mce)))
-			print_mce(m);
-	}
+	for_each_online_cpu(cpu)
+		print_mce_cpu(cpu, final, MCI_STATUS_UC, 0);
+	/* Print uncorrected but without the final one */
+	for_each_online_cpu(cpu)
+		print_mce_cpu(cpu, final, MCI_STATUS_UC, MCI_STATUS_UC);
+	/* Finally print the final mce */
 	if (final)
 		print_mce(final);
 	if (cpu_missing)
@@ -1234,6 +1252,22 @@ static int __cpuinit mce_cap_init(void)
 	return 0;
 }
 
+/*
+ * Initialize MCE per-CPU log buffer
+ */
+static __cpuinit void mce_log_init(void)
+{
+	int cpu;
+
+	if (mcelog.mcelog_cpus)
+		return;
+	mcelog.nr_mcelog_cpus = num_possible_cpus();
+	mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
+				     GFP_KERNEL);
+	for_each_possible_cpu(cpu)
+		mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
+}
+
 static void mce_init(void)
 {
 	mce_banks_t all_banks;
@@ -1404,6 +1438,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 		mce_disabled = 1;
 		return;
 	}
+	mce_log_init();
 
 	machine_check_vector = do_machine_check;
 
@@ -1452,13 +1487,16 @@ static int mce_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
+static ssize_t mce_read_cpu(int cpu, char __user *inubuf, size_t usize)
 {
+	struct mce_log_cpu *mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
 	char __user *ubuf = inubuf;
 	unsigned prev, next;
 	int i, err;
 
-	next = rcu_dereference(mcelog.next);
+	next = mcelog_cpu->next;
+	if (!next)
+		return 0;
 
 	err = 0;
 	prev = 0;
@@ -1466,9 +1504,9 @@ static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
 		for (i = prev; i < next; i++) {
 			int timeout = WRITER_TIMEOUT_NS;
 
-			while (!mcelog.entry[i].finished) {
+			while (!mcelog_cpu->entry[i].finished) {
 				if (timeout-- <= 0) {
-					memset(mcelog.entry + i, 0,
+					memset(mcelog_cpu->entry + i, 0,
 					       sizeof(struct mce));
 					printk(KERN_WARNING "mcelog: timeout "
 					     "waiting for writer to finish!\n");
@@ -1477,27 +1515,33 @@ static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
 				ndelay(1);
 			}
 			smp_rmb();
-			err |= copy_to_user(ubuf, mcelog.entry + i,
+			err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
 					    sizeof(struct mce));
 			ubuf += sizeof(struct mce);
 timeout:
 			;
 		}
 
-		memset(mcelog.entry + prev, 0,
+		memset(mcelog_cpu->entry + prev, 0,
 		       (next - prev) * sizeof(struct mce));
 		prev = next;
-		next = cmpxchg(&mcelog.next, prev, 0);
+		next = cmpxchg(&mcelog_cpu->next, prev, 0);
 	} while (next != prev);
 
-	synchronize_sched();
-
 	return err ? -EFAULT : ubuf - inubuf;
 }
 
 static int mce_empty(void)
 {
-	return !rcu_dereference(mcelog.next);
+	int cpu;
+	struct mce_log_cpu *mcelog_cpu;
+
+	for_each_possible_cpu(cpu) {
+		mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+		if (mcelog_cpu->next)
+			return 0;
+	}
+	return 1;
 }
 
 static DEFINE_MUTEX(mce_read_mutex);
@@ -1506,7 +1550,7 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
 			loff_t *off)
 {
 	char __user *ubuf = inubuf;
-	int err;
+	int cpu, err = 0;
 
 	/* Only supports full reads right now */
 	if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
@@ -1514,12 +1558,25 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
 
 	mutex_lock(&mce_read_mutex);
 
-	err = mce_read_buf(ubuf, usize);
-	if (err > 0) {
-		ubuf += err;
-		err = 0;
+	while (!mce_empty()) {
+		for_each_possible_cpu(cpu) {
+			if (usize < MCE_LOG_LEN * sizeof(struct mce))
+				goto out;
+			err = mce_read_cpu(cpu, ubuf, sizeof(struct mce));
+			if (err > 0) {
+				ubuf += sizeof(struct mce);
+				usize -= sizeof(struct mce);
+				err = 0;
+			} else if (err < 0)
+				goto out;
+		}
+		if (need_resched()) {
+			mutex_unlock(&mce_read_mutex);
+			cond_resched();
+			mutex_lock(&mce_read_mutex);
+		}
 	}
-
+out:
 	mutex_unlock(&mce_read_mutex);
 
 	return err ? err : ubuf - inubuf;
-- 
1.6.4.3



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 07/10] x86, mce: remove for-loop in mce_log
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
                     ` (5 preceding siblings ...)
  2009-10-05  6:38   ` [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU Hidetoshi Seto
@ 2009-10-05  6:40   ` Hidetoshi Seto
  2009-10-05  6:41   ` [PATCH 08/10] x86, mce: change barriers " Hidetoshi Seto
                     ` (4 subsequent siblings)
  11 siblings, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  6:40 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

The new per-CPU buffer allows us to drop this weird code.

Note: This reverts a part of following old commit:

  commit 673242c10d535bfe238d9d8e82ac93432d35b88e
  Author: Andi Kleen <ak@suse.de>
  Date:   Mon Sep 12 18:49:24 2005 +0200
  [PATCH] x86-64: Make lockless machine check record passing a bit more robust.

(This piece originates from Huang's patch, titled:
 "x86, MCE: Fix bugs and issues of MCE log ring buffer")

Originally-From: Huang Ying <ying.huang@intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c |   25 ++++++++-----------------
 1 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ad2eb89..87b2e29 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -146,23 +146,14 @@ void mce_log(struct mce *mce)
 
 	do {
 		entry = mcelog_cpu->next;
-		for (;;) {
-			/*
-			 * When the buffer fills up discard new entries.
-			 * Assume that the earlier errors are the more
-			 * interesting ones:
-			 */
-			if (entry >= MCE_LOG_LEN) {
-				set_bit(MCE_OVERFLOW,
-					(unsigned long *)&mcelog.flags);
-				return;
-			}
-			/* Old left over entry. Skip: */
-			if (mcelog_cpu->entry[entry].finished) {
-				entry++;
-				continue;
-			}
-			break;
+		/*
+		 * When the buffer fills up discard new entries.
+		 * Assume that the earlier errors are the more
+		 * interesting ones:
+		 */
+		if (entry >= MCE_LOG_LEN) {
+			set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
+			return;
 		}
 		smp_rmb();
 		next = entry + 1;
-- 
1.6.4.3



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 08/10] x86, mce: change barriers in mce_log
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
                     ` (6 preceding siblings ...)
  2009-10-05  6:40   ` [PATCH 07/10] x86, mce: remove for-loop in mce_log Hidetoshi Seto
@ 2009-10-05  6:41   ` Hidetoshi Seto
  2009-10-05  6:42   ` [PATCH 09/10] x86, mce: make mce_log buffer to ring buffer Hidetoshi Seto
                     ` (3 subsequent siblings)
  11 siblings, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  6:41 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

Long time ago, smp_wmb() was replaced to wmb() by following commit:
  commit 7644143cd6f7e029f3a8ea64f5fb0ab33ec39f72
  Author: Mike Waychison <mikew@google.com>
  Date:   Fri Sep 30 00:01:27 2005 +0200
  [PATCH] x86_64: Fix mce_log
  > AK: turned the smp_wmbs into true wmbs to make sure they are not
  > reordered by the compiler on UP.

Change them back to original form, and put comments.

(This piece originates from Huang's patch, titled:
 "x86, MCE: Fix bugs and issues of MCE log ring buffer")

Originally-From: Huang Ying <ying.huang@intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c |   10 +++++++---
 1 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 87b2e29..655915b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -141,8 +141,9 @@ void mce_log(struct mce *mce)
 	struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
 	unsigned next, entry;
 
+	/* mce->finished must be set to 0 before written to buffer */
 	mce->finished = 0;
-	wmb();
+	smp_wmb();
 
 	do {
 		entry = mcelog_cpu->next;
@@ -161,9 +162,12 @@ void mce_log(struct mce *mce)
 
 	memcpy(mcelog_cpu->entry + entry, mce, sizeof(struct mce));
 
-	wmb();
+	/* ".finished" of MCE record in buffer must be set after copy */
+	smp_wmb();
 	mcelog_cpu->entry[entry].finished = 1;
-	wmb();
+
+	/* bit 0 of notify_user should be set after finished be set */
+	smp_wmb();
 	mce->finished = 1;
 	set_bit(0, &mce_need_notify);
 }
-- 
1.6.4.3



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 09/10] x86, mce: make mce_log buffer to ring buffer
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
                     ` (7 preceding siblings ...)
  2009-10-05  6:41   ` [PATCH 08/10] x86, mce: change barriers " Hidetoshi Seto
@ 2009-10-05  6:42   ` Hidetoshi Seto
  2009-10-05  6:44   ` [PATCH 10/10] x86, mce: move mce_log_init() into mce_cap_init() Hidetoshi Seto
                     ` (2 subsequent siblings)
  11 siblings, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  6:42 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

This patch implements Per-CPU ring buffer data structure.

  + An array is used to hold MCE records. integer "head" indicates
    next writing position and integer "tail" indicates next reading
    position.

  + To distinguish buffer empty and full, head and tail wrap to 0 at
    MCE_LOG_LIMIT instead of MCE_LOG_LEN. Then the real next writing
    position is head % MCE_LOG_LEN, and real next reading position is
    tail % MCE_LOG_LEN. If buffer is empty, head == tail, if buffer is
    full, head % MCE_LOG_LEN == tail % MCE_LOG_LEN and head != tail.

(This piece originates from Huang's patch, titled:
 "x86, MCE: Fix bugs and issues of MCE log ring buffer")

Originally-From: Huang Ying <ying.huang@intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 arch/x86/include/asm/mce.h       |    6 +++
 arch/x86/kernel/cpu/mcheck/mce.c |   77 +++++++++++++++++++++----------------
 2 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c5d4144..4b5ef3c 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -82,6 +82,12 @@ struct mce {
  */
 
 #define MCE_LOG_LEN		32
+#define MCE_LOG_LIMIT		(MCE_LOG_LEN * 2 - 1)
+
+static inline int mce_log_index(int n)
+{
+	return n >= MCE_LOG_LEN ? n - MCE_LOG_LEN : n;
+}
 
 struct mce_log_cpu;
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 655915b..63a7820 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -123,7 +123,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(mce_fake_banks);
  */
 
 struct mce_log_cpu {
-	unsigned next;
+	int head;
+	int tail;
 	struct mce entry[MCE_LOG_LEN];
 };
 
@@ -139,32 +140,34 @@ static struct mce_log mcelog = {
 void mce_log(struct mce *mce)
 {
 	struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
-	unsigned next, entry;
+	int head, ihead, tail, next;
 
 	/* mce->finished must be set to 0 before written to buffer */
 	mce->finished = 0;
 	smp_wmb();
 
 	do {
-		entry = mcelog_cpu->next;
+		head = mcelog_cpu->head;
+		tail = mcelog_cpu->tail;
+		ihead = mce_log_index(head);
+
 		/*
 		 * When the buffer fills up discard new entries.
 		 * Assume that the earlier errors are the more
-		 * interesting ones:
+		 * interesting.
 		 */
-		if (entry >= MCE_LOG_LEN) {
+		if (ihead == mce_log_index(tail) && head != tail) {
 			set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
 			return;
 		}
-		smp_rmb();
-		next = entry + 1;
-	} while (cmpxchg_local(&mcelog_cpu->next, entry, next) != entry);
+		next = head == MCE_LOG_LIMIT ? 0 : head + 1;
+	} while (cmpxchg_local(&mcelog_cpu->head, head, next) != head);
 
-	memcpy(mcelog_cpu->entry + entry, mce, sizeof(struct mce));
+	memcpy(mcelog_cpu->entry + ihead, mce, sizeof(struct mce));
 
 	/* ".finished" of MCE record in buffer must be set after copy */
 	smp_wmb();
-	mcelog_cpu->entry[entry].finished = 1;
+	mcelog_cpu->entry[ihead].finished = 1;
 
 	/* bit 0 of notify_user should be set after finished be set */
 	smp_wmb();
@@ -1486,42 +1489,50 @@ static ssize_t mce_read_cpu(int cpu, char __user *inubuf, size_t usize)
 {
 	struct mce_log_cpu *mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
 	char __user *ubuf = inubuf;
-	unsigned prev, next;
-	int i, err;
+	int head, tail, pos, i, err = 0;
 
-	next = mcelog_cpu->next;
-	if (!next)
+	head = mcelog_cpu->head;
+	tail = mcelog_cpu->tail;
+	if (head == tail)
 		return 0;
 
-	err = 0;
-	prev = 0;
-	do {
-		for (i = prev; i < next; i++) {
+	for (pos = tail; pos != head && usize >= sizeof(struct mce);
+	     pos = pos == MCE_LOG_LIMIT ? 0 : pos+1) {
+		i = mce_log_index(pos);
+		if (!mcelog_cpu->entry[i].finished) {
 			int timeout = WRITER_TIMEOUT_NS;
 
 			while (!mcelog_cpu->entry[i].finished) {
 				if (timeout-- <= 0) {
 					memset(mcelog_cpu->entry + i, 0,
 					       sizeof(struct mce));
+					head = mcelog_cpu->head;
 					printk(KERN_WARNING "mcelog: timeout "
 					     "waiting for writer to finish!\n");
 					goto timeout;
 				}
 				ndelay(1);
 			}
-			smp_rmb();
-			err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
-					    sizeof(struct mce));
-			ubuf += sizeof(struct mce);
-timeout:
-			;
 		}
-
-		memset(mcelog_cpu->entry + prev, 0,
-		       (next - prev) * sizeof(struct mce));
-		prev = next;
-		next = cmpxchg(&mcelog_cpu->next, prev, 0);
-	} while (next != prev);
+		/*
+		 * finished field should be checked before
+		 * copy_to_user()
+		 */
+		smp_rmb();
+		err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
+				   sizeof(struct mce));
+		ubuf += sizeof(struct mce);
+		usize -= sizeof(struct mce);
+		mcelog_cpu->entry[i].finished = 0;
+timeout:
+		;
+	}
+	/*
+	 * mcelog_cpu->tail must be updated after ".finished" of
+	 * corresponding MCE records are clear.
+	 */
+	smp_wmb();
+	mcelog_cpu->tail = pos;
 
 	return err ? -EFAULT : ubuf - inubuf;
 }
@@ -1533,7 +1544,7 @@ static int mce_empty(void)
 
 	for_each_possible_cpu(cpu) {
 		mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
-		if (mcelog_cpu->next)
+		if (mcelog_cpu->head != mcelog_cpu->tail)
 			return 0;
 	}
 	return 1;
@@ -1548,14 +1559,14 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
 	int cpu, err = 0;
 
 	/* Only supports full reads right now */
-	if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
+	if (*off != 0 || usize < sizeof(struct mce))
 		return -EINVAL;
 
 	mutex_lock(&mce_read_mutex);
 
 	while (!mce_empty()) {
 		for_each_possible_cpu(cpu) {
-			if (usize < MCE_LOG_LEN * sizeof(struct mce))
+			if (usize < sizeof(struct mce))
 				goto out;
 			err = mce_read_cpu(cpu, ubuf, sizeof(struct mce));
 			if (err > 0) {
-- 
1.6.4.3



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 10/10] x86, mce: move mce_log_init() into mce_cap_init()
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
                     ` (8 preceding siblings ...)
  2009-10-05  6:42   ` [PATCH 09/10] x86, mce: make mce_log buffer to ring buffer Hidetoshi Seto
@ 2009-10-05  6:44   ` Hidetoshi Seto
  2009-10-05  7:07   ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
  2009-10-05  8:51   ` Frédéric Weisbecker
  11 siblings, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  6:44 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

The mce_log_init() should be able to return -ENOMEM, to abort all following
initialization in mce.c.  It can be done in same style with mce_banks_init()
so I moved mce_log_init() into mce_cap_init() too.

And put __cpuinit to mce_banks_init().

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c |   48 +++++++++++++++++++++++--------------
 1 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 63a7820..fad3daa 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1194,19 +1194,40 @@ int mce_notify_irq(void)
 }
 EXPORT_SYMBOL_GPL(mce_notify_irq);
 
-static int mce_banks_init(void)
+static __cpuinit int mce_banks_init(void)
 {
 	int i;
 
 	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
 	if (!mce_banks)
 		return -ENOMEM;
+
 	for (i = 0; i < banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		b->ctl = -1ULL;
 		b->init = 1;
 	}
+
+	return 0;
+}
+
+/*
+ * Initialize MCE per-CPU log buffer
+ */
+static __cpuinit int mce_log_init(void)
+{
+	int cpu;
+
+	mcelog.nr_mcelog_cpus = num_possible_cpus();
+	mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
+				     GFP_KERNEL);
+	if (!mcelog.mcelog_cpus)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu)
+		mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
+
 	return 0;
 }
 
@@ -1233,6 +1254,7 @@ static int __cpuinit mce_cap_init(void)
 	/* Don't support asymmetric configurations today */
 	WARN_ON(banks != 0 && b != banks);
 	banks = b;
+
 	if (!mce_banks) {
 		int err = mce_banks_init();
 
@@ -1240,6 +1262,13 @@ static int __cpuinit mce_cap_init(void)
 			return err;
 	}
 
+	if (!mcelog.mcelog_cpus) {
+		int err = mce_log_init();
+
+		if (err)
+			return err;
+	}
+
 	/* Use accurate RIP reporting if available. */
 	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
 		rip_msr = MSR_IA32_MCG_EIP;
@@ -1250,22 +1279,6 @@ static int __cpuinit mce_cap_init(void)
 	return 0;
 }
 
-/*
- * Initialize MCE per-CPU log buffer
- */
-static __cpuinit void mce_log_init(void)
-{
-	int cpu;
-
-	if (mcelog.mcelog_cpus)
-		return;
-	mcelog.nr_mcelog_cpus = num_possible_cpus();
-	mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
-				     GFP_KERNEL);
-	for_each_possible_cpu(cpu)
-		mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
-}
-
 static void mce_init(void)
 {
 	mce_banks_t all_banks;
@@ -1436,7 +1449,6 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 		mce_disabled = 1;
 		return;
 	}
-	mce_log_init();
 
 	machine_check_vector = do_machine_check;
 
-- 
1.6.4.3



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU
  2009-10-05  6:38   ` [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU Hidetoshi Seto
@ 2009-10-05  7:06     ` Andi Kleen
  2009-10-05  7:50       ` Hidetoshi Seto
  0 siblings, 1 reply; 23+ messages in thread
From: Andi Kleen @ 2009-10-05  7:06 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Huang Ying, Ingo Molnar, H. Peter Anvin, linux-kernel


> 
> (This piece originates from Huang's patch, titled:
>  "x86, MCE: Fix bugs and issues of MCE log ring buffer")

You should use proper From: headers then for correct attribution.

-Andi

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
                     ` (9 preceding siblings ...)
  2009-10-05  6:44   ` [PATCH 10/10] x86, mce: move mce_log_init() into mce_cap_init() Hidetoshi Seto
@ 2009-10-05  7:07   ` Hidetoshi Seto
  2009-10-05  8:51   ` Frédéric Weisbecker
  11 siblings, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  7:07 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

This is the diff between Huang's original change and the result of changes
by my patch set.

I'm going to explain what I changed from the original.

> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index 2d5c42a..4b5ef3c 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -52,7 +52,7 @@
>  #define MCE_INJ_NMI_BROADCAST	(1 << 2)	/* do NMI broadcasting */
>  #define MCE_INJ_EXCEPTION	(1 << 3)	/* raise as exception */
>  
> -/* Fields are zero when not available */
> +/* MCE log entry. Fields are zero when not available. */
>  struct mce {
>  	__u64 status;
>  	__u64 misc;
> @@ -63,12 +63,12 @@ struct mce {
>  	__u64 time;	/* wall time_t when error was detected */
>  	__u8  cpuvendor;	/* cpu vendor as encoded in system.h */
>  	__u8  inject_flags;	/* software inject flags */
> -	__u16  pad;
> +	__u16 pad;
>  	__u32 cpuid;	/* CPUID 1 EAX */
> -	__u8  cs;		/* code segment */
> +	__u8  cs;	/* code segment */
>  	__u8  bank;	/* machine check bank */
>  	__u8  cpu;	/* cpu number; obsolete; use extcpu now */
> -	__u8  finished;   /* entry is valid */
> +	__u8  finished;	/* 1 if write to entry is finished & entry is valid */
>  	__u32 extcpu;	/* linux cpu number that detected the error */
>  	__u32 socketid;	/* CPU socket ID */
>  	__u32 apicid;	/* CPU initial apic ID */
> @@ -76,10 +76,9 @@ struct mce {
>  };
>  
>  /*
> - * This structure contains all data related to the MCE log.  Also
> - * carries a signature to make it easier to find from external
> - * debugging tools.  Each entry is only valid when its finished flag
> - * is set.
> + * This structure contains all data related to the MCE log.  Also carries
> + * a signature to make it easier to find from external debugging tools.
> + * Each entry is only valid when its finished flag is set.
>   */

Small clean up.

>  
>  #define MCE_LOG_LEN		32
> @@ -93,12 +92,18 @@ static inline int mce_log_index(int n)
>  struct mce_log_cpu;
>  
>  struct mce_log {
> -	char signature[12]; /* "MACHINECHEC2" */
> -	unsigned len;	    /* = MCE_LOG_LEN */
> -	unsigned flags;
> -	unsigned recordlen;	/* length of struct mce */
> -	unsigned nr_mcelog_cpus;
> +	char signature[12];		/* "MACHINECHEC2" */
> +
> +	/* points the table of per-CPU buffers */
>  	struct mce_log_cpu **mcelog_cpus;
> +	unsigned int nr_mcelog_cpus;	/* = num_possible_cpus() */
> +
> +	/* spec of per-CPU buffer */
> +	unsigned int header_len; 	/* offset of array "entry" */
> +	unsigned int nr_record;		/* array size (= MCE_LOG_LEN) */
> +	unsigned int record_len;	/* length of struct mce */
> +
> +	unsigned flags;
>  };

Add a member header_len, and renamed "len" to "nr_record" which is easier
to understand.  Please refer the description of patch 6/10.

>  
>  #define MCE_OVERFLOW 0		/* bit 0 in flags means overflow */
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 5db3f5b..fad3daa 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -122,54 +122,53 @@ EXPORT_PER_CPU_SYMBOL_GPL(mce_fake_banks);
>   * separate MCEs from kernel messages to avoid bogus bug reports.
>   */
>  
> -static struct mce_log mcelog = {
> -	.signature	= MCE_LOG_SIGNATURE,
> -	.len		= MCE_LOG_LEN,
> -	.recordlen	= sizeof(struct mce),
> -};
> -
>  struct mce_log_cpu {
>  	int head;
>  	int tail;
> -	unsigned long flags;
>  	struct mce entry[MCE_LOG_LEN];
>  };

Removed "flags" from per-CPU structure since mce_ioctl only cares global flags
in struct mce_log.

>  
>  DEFINE_PER_CPU(struct mce_log_cpu, mce_log_cpus);
>  
> +static struct mce_log mcelog = {
> +	.signature	= MCE_LOG_SIGNATURE,
> +	.header_len	= offsetof(struct mce_log_cpu, entry),
> +	.nr_record	= MCE_LOG_LEN,
> +	.record_len	= sizeof(struct mce),
> +};
> +
>  void mce_log(struct mce *mce)
>  {
>  	struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
>  	int head, ihead, tail, next;
>  
> +	/* mce->finished must be set to 0 before written to buffer */
>  	mce->finished = 0;
> -	/*
> -	 * mce->finished must be set to 0 before written to ring
> -	 * buffer
> -	 */
>  	smp_wmb();
> +
>  	do {
>  		head = mcelog_cpu->head;
>  		tail = mcelog_cpu->tail;
>  		ihead = mce_log_index(head);
> +
>  		/*
>  		 * When the buffer fills up discard new entries.
>  		 * Assume that the earlier errors are the more
>  		 * interesting.
>  		 */
>  		if (ihead == mce_log_index(tail) && head != tail) {
> -			set_bit(MCE_OVERFLOW, &mcelog_cpu->flags);
> +			set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);

Use global flags which is cared by mce_ioctl.

>  			return;
>  		}
>  		next = head == MCE_LOG_LIMIT ? 0 : head + 1;
>  	} while (cmpxchg_local(&mcelog_cpu->head, head, next) != head);
> +
>  	memcpy(mcelog_cpu->entry + ihead, mce, sizeof(struct mce));
> -	/*
> -	 * ".finished" of MCE record in ring buffer must be set after
> -	 * copy
> -	 */
> +
> +	/* ".finished" of MCE record in buffer must be set after copy */
>  	smp_wmb();
>  	mcelog_cpu->entry[ihead].finished = 1;
> +
>  	/* bit 0 of notify_user should be set after finished be set */
>  	smp_wmb();
>  	mce->finished = 1;


Changes from here to ....

> @@ -1195,19 +1194,40 @@ int mce_notify_irq(void)
>  }
>  EXPORT_SYMBOL_GPL(mce_notify_irq);
>  
> -static int mce_banks_init(void)
> +static __cpuinit int mce_banks_init(void)
>  {
>  	int i;
>  
>  	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
>  	if (!mce_banks)
>  		return -ENOMEM;
> +
>  	for (i = 0; i < banks; i++) {
>  		struct mce_bank *b = &mce_banks[i];
>  
>  		b->ctl = -1ULL;
>  		b->init = 1;
>  	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Initialize MCE per-CPU log buffer
> + */
> +static __cpuinit int mce_log_init(void)
> +{
> +	int cpu;
> +
> +	mcelog.nr_mcelog_cpus = num_possible_cpus();
> +	mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
> +				     GFP_KERNEL);
> +	if (!mcelog.mcelog_cpus)
> +		return -ENOMEM;
> +
> +	for_each_possible_cpu(cpu)
> +		mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
> +
>  	return 0;
>  }
>  
> @@ -1234,6 +1254,7 @@ static int __cpuinit mce_cap_init(void)
>  	/* Don't support asymmetric configurations today */
>  	WARN_ON(banks != 0 && b != banks);
>  	banks = b;
> +
>  	if (!mce_banks) {
>  		int err = mce_banks_init();
>  
> @@ -1241,6 +1262,13 @@ static int __cpuinit mce_cap_init(void)
>  			return err;
>  	}
>  
> +	if (!mcelog.mcelog_cpus) {
> +		int err = mce_log_init();
> +
> +		if (err)
> +			return err;
> +	}
> +
>  	/* Use accurate RIP reporting if available. */
>  	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
>  		rip_msr = MSR_IA32_MCG_EIP;
> @@ -1251,22 +1279,6 @@ static int __cpuinit mce_cap_init(void)
>  	return 0;
>  }
>  
> -/*
> - * Initialize MCE per-CPU log buffer
> - */
> -static __cpuinit void mce_log_init(void)
> -{
> -	int cpu;
> -
> -	if (mcelog.mcelog_cpus)
> -		return;
> -	mcelog.nr_mcelog_cpus = num_possible_cpus();
> -	mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
> -				     GFP_KERNEL);
> -	for_each_possible_cpu(cpu)
> -		mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
> -}
> -
>  static void mce_init(void)
>  {
>  	mce_banks_t all_banks;
> @@ -1437,7 +1449,6 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
>  		mce_disabled = 1;
>  		return;
>  	}
> -	mce_log_init();
>  
>  	machine_check_vector = do_machine_check;
>  

... here are what done in patch 10/10.

> @@ -1486,15 +1497,14 @@ static int mce_release(struct inode *inode, struct file *file)
>  	return 0;
>  }
>  
> -static ssize_t mce_read_cpu(struct mce_log_cpu *mcelog_cpu,
> -			    char __user *inubuf, size_t usize)
> +static ssize_t mce_read_cpu(int cpu, char __user *inubuf, size_t usize)
>  {
> +	struct mce_log_cpu *mcelog_cpu = &per_cpu(mce_log_cpus, cpu);

Changed 1st arg to cpu number.

>  	char __user *ubuf = inubuf;
>  	int head, tail, pos, i, err = 0;
>  
>  	head = mcelog_cpu->head;
>  	tail = mcelog_cpu->tail;
> -
>  	if (head == tail)
>  		return 0;
>  
> @@ -1503,13 +1513,14 @@ static ssize_t mce_read_cpu(struct mce_log_cpu *mcelog_cpu,
>  		i = mce_log_index(pos);
>  		if (!mcelog_cpu->entry[i].finished) {
>  			int timeout = WRITER_TIMEOUT_NS;
> +
>  			while (!mcelog_cpu->entry[i].finished) {
>  				if (timeout-- <= 0) {
>  					memset(mcelog_cpu->entry + i, 0,
>  					       sizeof(struct mce));
>  					head = mcelog_cpu->head;
>  					printk(KERN_WARNING "mcelog: timeout "
> -					       "waiting for writer to finish!\n");
> +					     "waiting for writer to finish!\n");
>  					goto timeout;
>  				}
>  				ndelay(1);
> @@ -1538,11 +1549,6 @@ timeout:
>  	return err ? -EFAULT : ubuf - inubuf;
>  }
>  
> -static int mce_empty_cpu(struct mce_log_cpu *mcelog_cpu)
> -{
> -	return mcelog_cpu->head == mcelog_cpu->tail;
> -}
> -
>  static int mce_empty(void)
>  {
>  	int cpu;
> @@ -1550,33 +1556,34 @@ static int mce_empty(void)
>  
>  	for_each_possible_cpu(cpu) {
>  		mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
> -		if (!mce_empty_cpu(mcelog_cpu))
> +		if (mcelog_cpu->head != mcelog_cpu->tail)

Inlined.  Or it would be better to have static inlines in header file.

>  			return 0;
>  	}
>  	return 1;
>  }
>  
> +static DEFINE_MUTEX(mce_read_mutex);
> +
>  static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
>  			loff_t *off)
>  {
>  	char __user *ubuf = inubuf;
> -	struct mce_log_cpu *mcelog_cpu;
> -	int cpu, new_mce, err = 0;
> -	static DEFINE_MUTEX(mce_read_mutex);
> +	int cpu, err = 0;
> +
> +	/* Only supports full reads right now */
> +	if (*off != 0 || usize < sizeof(struct mce))
> +		return -EINVAL;

Picked up what implicitly dropped.

>  
>  	mutex_lock(&mce_read_mutex);
> -	do {
> -		new_mce = 0;
> +
> +	while (!mce_empty()) {
>  		for_each_possible_cpu(cpu) {
>  			if (usize < sizeof(struct mce))
>  				goto out;
> -			mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
> -			err = mce_read_cpu(mcelog_cpu, ubuf,
> -					   sizeof(struct mce));
> +			err = mce_read_cpu(cpu, ubuf, sizeof(struct mce));
>  			if (err > 0) {
>  				ubuf += sizeof(struct mce);
>  				usize -= sizeof(struct mce);
> -				new_mce = 1;
>  				err = 0;
>  			} else if (err < 0)
>  				goto out;
> @@ -1586,9 +1593,10 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
>  			cond_resched();
>  			mutex_lock(&mce_read_mutex);
>  		}
> -	} while (new_mce || !mce_empty());
> +	}

I could not catch the intent of "new_mce," so I replaced the do-while
with a simple while statement.

>  out:
>  	mutex_unlock(&mce_read_mutex);
> +
>  	return err ? err : ubuf - inubuf;
>  }
>  

That's all.

Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU
  2009-10-05  7:06     ` Andi Kleen
@ 2009-10-05  7:50       ` Hidetoshi Seto
  2009-10-09  1:45         ` Huang Ying
  0 siblings, 1 reply; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-05  7:50 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H. Peter Anvin, linux-kernel

Andi Kleen wrote:
>> (This piece originates from Huang's patch, titled:
>>  "x86, MCE: Fix bugs and issues of MCE log ring buffer")
> 
> You should use proper From: headers then for correct attribution.
> 
> -Andi

I just referred following commit we already have: 

> commit 77e26cca20013e9352a8df86a54640543304a23a
> Author: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
> Date:   Thu Jun 11 16:04:35 2009 +0900
> 
>     x86, mce: Fix mce printing
> 
>     This patch:
> 
>      - Adds print_mce_head() instead of first flag
>      - Makes the header to be printed always
>      - Stops double printing of corrected errors
> 
>     [ This portion originates from Huang Ying's patch ]
> 
>     Originally-From: Huang Ying <ying.huang@intel.com>
>     Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
>     LKML-Reference: <4A30AC83.5010708@jp.fujitsu.com>
>     Signed-off-by: Ingo Molnar <mingo@elte.hu>

You mean s/Originally-From/From/, right?

I'll update attributes if this set need to be revised or if maintainer
(who expected to be able to set proper attributes) request me to do so.

Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer
  2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
                     ` (10 preceding siblings ...)
  2009-10-05  7:07   ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
@ 2009-10-05  8:51   ` Frédéric Weisbecker
  2009-10-05 15:16     ` Andi Kleen
  2009-10-06  5:46     ` Hidetoshi Seto
  11 siblings, 2 replies; 23+ messages in thread
From: Frédéric Weisbecker @ 2009-10-05  8:51 UTC (permalink / raw)
  To: Hidetoshi Seto
  Cc: Huang Ying, Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

2009/10/5 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>:
> Hi Huang,
>
> Huang Ying wrote:
>> Current MCE log ring buffer has following bugs and issues:
>>
>> - On larger systems the 32 size buffer easily overflow, losing events.
>>
>> - We had some reports of events getting corrupted which were also
>>   blamed on the ring buffer.
>>
>> - There's a known livelock, now hit by more people, under high error
>>   rate.
>>
>> We fix these bugs and issues via making MCE log ring buffer as
>> lock-less per-CPU ring buffer.
>
> Now I have a real problem on the small MCE log buffer on my new large
> system with Nehalem which has many cpus/banks in one socket...
> So I'd like to solve the problem asap.  I think this problem might block
> some distros to support new processor.
>
> Last week I reviewed your patch again and noticed that it is doing a lot
> of changes at once.  I suppose that this method must be one of reasons
> why your patch seems to be so hard to review, and why it is taking long
> time to be accepted by x86 maintainers.
>
> Fortunately I had some spare time so I carefully broke your patch into
> some purpose-designed pieces.  It would be the most significant change
> that now there are 2 steps to convert the buffer structure - 1) to make
> it per-CPU and 2) to make it ring buffer.
>
> Also I fixed some problem in your patch, found on the way to make this
> patch set.  I'll explain about my changes later using diff from your
> change.  Comments are welcomed.
>
> Thanks,
> H.Seto
>


Looks like the conversion of MCE log into a TRACE_EVENT is still in
discussion whereas the
current issues are urgent.

So the need is to have a more stable ring buffer. But this one is an ad-hoc one.
We already have a general purpose per-cpu/lockless ring buffer implementation in
kernel/trace/ring_buffer.c
And it's not only used by tracing, it's generally available.

I think it would be nicer to use it to avoid a proliferation of
unstable ring buffers
inside the kernel.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer
  2009-10-05  8:51   ` Frédéric Weisbecker
@ 2009-10-05 15:16     ` Andi Kleen
  2009-10-06  5:46     ` Hidetoshi Seto
  1 sibling, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2009-10-05 15:16 UTC (permalink / raw)
  To: Frédéric Weisbecker
  Cc: Hidetoshi Seto, Huang Ying, H. Peter Anvin, Andi Kleen, linux-kernel

Frédéric Weisbecker <fweisbec@gmail.com> writes:

> So the need is to have a more stable ring buffer. But this one is an ad-hoc one.

It was designed to do one job and does it well with minimal overhead.
That seems all positive to me.

> We already have a general purpose per-cpu/lockless ring buffer implementation in
> kernel/trace/ring_buffer.c
> And it's not only used by tracing, it's generally available.

That trace ring buffer is significantly larger than the all rest of the MCE 
code together:

andi@basil:~/lsrc/git/obj> size kernel/trace/ring_buffer.o 
   text    data     bss     dec     hex filename
  14241      21      12   14274    37c2 kernel/trace/ring_buffer.o
andi@basil:~/lsrc/git/obj> size arch/x86/kernel/cpu/mcheck/mce.o 
   text    data     bss     dec     hex filename
  11098    4480     224   15802    3dba arch/x86/kernel/cpu/mcheck/mce.o

Basically you're proposing to more than double the code footprint of the 
MCE handler for a small kernel configuration, just to avoid a few lines
of code.

It might be that all of its features are needed for tracing, but they
are definitely not needed for MCE handling and it would need 
to be put on a significant diet before core code like the MCE handler
could even think to start relying on it. In addition the ring-buffer.c
currently doesn't even directly do what mce needs and would need
significant glue code to fit into the existing interfaces and likely
modifications to the ring buffer itself too (e.g. to stop
trace_stop from messing with error handling)

Error handling code like the MCE handler has to be always compiled in
unlike tracing and other debugging options and I don't think it can
really afford to use such oversized code, especially since it doesn't
really need any of its fancy features.  Linux has enough problems with
code size growth recently, no need to make it worse with proposals
like this. Also the requirements of error handling are quite different
from other tracing and debugging and the ring-buffer doesn't even fit
well recently.

If the code should be converted to use a generic buffer kernel/kfifo.o
would be the right target and size:

andi@basil:~/lsrc/git/obj> size kernel/kfifo.o 
   text    data     bss     dec     hex filename
    734       0       0     734     2de kernel/kfifo.o

Unfortunately this needs Stefanie's lockless kfifo changes first to make 
happen and they didn't make it into 2.6.32-rc*

If they make it into 2.6.33 and there's really a strong desire to use
some generalized buffer I think that would be a suitable solution.
But frankly just keeping Ying's buffer around would also be quite reasonable

Short term Ying's simple alternative is the only good solution.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer
  2009-10-05  8:51   ` Frédéric Weisbecker
  2009-10-05 15:16     ` Andi Kleen
@ 2009-10-06  5:46     ` Hidetoshi Seto
  1 sibling, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-06  5:46 UTC (permalink / raw)
  To: Frédéric Weisbecker
  Cc: Huang Ying, Ingo Molnar, H. Peter Anvin, Andi Kleen, linux-kernel

Frédéric Weisbecker wrote:
> Looks like the conversion of MCE log into a TRACE_EVENT is still in
> discussion whereas the current issues are urgent.
> 
> So the need is to have a more stable ring buffer. But this one is an ad-hoc one.
> We already have a general purpose per-cpu/lockless ring buffer implementation in
> kernel/trace/ring_buffer.c
> And it's not only used by tracing, it's generally available.

Thank you for this comment.

First of all, we have to make it clear that what is the urgent issue.
IMHO there are 2 issues, one is "small buffer" and the other is "non
ring buffer."

I think that the "small buffer" is definitely a scalability issue and
that this problem should be fixed asap to support large servers with
modern processors.
But I think that the "non ring buffer" is kind of a performance issue,
which is relatively minor problem on the MCE codes.

So frankly I approve of the per-CPU buffering, but be skeptical about
the ring buffering here.

In other words I don't mind dropping 9/10 of my patch set to postpone
converting the buffer to ring buffer, because the rest of my patch set
can still convert the buffer to per-CPU style, which will solve the
urgent issue I have now.  That is why I divided the original patch into
these pieces and made separate steps in the buffer conversion.

How about converting the MCE buffer to per-CPU as the 1st step, to solve
an urgent scalability issue, x86 maintainers? > Ingo, Peter?


> I think it would be nicer to use it to avoid a proliferation of
> unstable ring buffers inside the kernel.

I tend to agree with Andi.
I think the generic ring buffer implementation is too rich for current
use in MCE codes.

Maybe I could invent an alternative patch of my patch 9/10 to use generic
ring buffer from MCE codes.  But as I mentioned above I think that the
patch will solve the minor issue only.  Also I think that it might bring
unexpected impacts on traditional interfaces.
Therefore if there are still "use generic one if you want to use ring buffer"
comments then I'd like to drop 9/10 simply rather than reinvent it.

Any other suggestions?


BTW, I'd like to make it clear that what is the main complaint here.

Is it kind of "don't duplicate codes"?
The requirement in MCE codes is quite simple, so use of relatively complex
generic ring buffer doesn't fit for it.  I'm not sure but as Andi proposed
the next version of kernel/kfifo will be better choice here.  I can wait
the release of new kfifo either without making MCE log buffer to ring,
or with the temporary ad-hoc one.

Is it kind of "provide generic interface"?
I think it makes a lot of sense.  I can't say the /dev/mcelog is the best
interface for MCE logging.  Moreover there are many other hardware error
sources in a system such as PCIe and power unit etc., so it would be nice
for monitoring applications if kernel can provide a single channel (other
than /var/log/messages) to gather all kind of hardware errors.
The TRACE_EVENT would be one of possible options (or parts) to implement
such feature.

Oh, don't forget "please keep traditional interfaces."
To be honest, I think the current interfaces look quite ugly.  But I know
that there are a few real users such as mcelog (and mced?), and that at this
time we have no choice but to use these tools.  So we cannot cast away the
current interfaces immediately, until new interfaces are invented and new
user-land applications can provide enough functionality.  Or we could
improve the current interface gradually, in step with improvements to the
tools ... but I suppose that way there will never be drastic changes.


Well, then I think what we have to do is invent a competitive new interface
for MCE logging, which can live together with current interface for a while,
isn't it?


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU
  2009-10-05  7:50       ` Hidetoshi Seto
@ 2009-10-09  1:45         ` Huang Ying
  2009-10-09  5:34           ` Hidetoshi Seto
  0 siblings, 1 reply; 23+ messages in thread
From: Huang Ying @ 2009-10-09  1:45 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Andi Kleen, Ingo Molnar, H. Peter Anvin, linux-kernel

On Mon, 2009-10-05 at 15:50 +0800, Hidetoshi Seto wrote: 
> Andi Kleen wrote:
> >> (This piece originates from Huang's patch, titled:
> >>  "x86, MCE: Fix bugs and issues of MCE log ring buffer")
> > 
> > You should use proper From: headers then for correct attribution.
> > 
> > -Andi
> 
> I just referred following commit we already have: 
> 
> > commit 77e26cca20013e9352a8df86a54640543304a23a
> > Author: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
> > Date:   Thu Jun 11 16:04:35 2009 +0900
> > 
> >     x86, mce: Fix mce printing
> > 
> >     This patch:
> > 
> >      - Adds print_mce_head() instead of first flag
> >      - Makes the header to be printed always
> >      - Stops double printing of corrected errors
> > 
> >     [ This portion originates from Huang Ying's patch ]
> > 
> >     Originally-From: Huang Ying <ying.huang@intel.com>
> >     Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
> >     LKML-Reference: <4A30AC83.5010708@jp.fujitsu.com>
> >     Signed-off-by: Ingo Molnar <mingo@elte.hu>
> 
> You mean s/Originally-From/From/, right?
> 
> I'll update attributes if this set need to be revised or if maintainer
> (who expected to be able to set proper attributes) request me to do so.

I don't think it is a good collaboration style to copy others' patches,
do some modification and send it out, instead:

- comment on original patches
- communicate with the original author firstly
- provide another patch on top of original patches

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU
  2009-10-09  1:45         ` Huang Ying
@ 2009-10-09  5:34           ` Hidetoshi Seto
  0 siblings, 0 replies; 23+ messages in thread
From: Hidetoshi Seto @ 2009-10-09  5:34 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, Ingo Molnar, H. Peter Anvin, linux-kernel

Huang Ying wrote:
> I don't think it is a good collaboration style to copy others' patches,
> do some modification and send it out, instead:

Sorry, I could not stop myself, since I have been frustrated over a trivial
matter in this area lately.  And I could not wait for your next post any
longer, and did not want to see another abandoned "rebased" patch, so I took
this kind of aggressively forceful, bad-mannered action to push your change.
Sorry again.


> - comment on original patches

I think your patch contains various changes, so how about breaking this one
down into small pieces and making patches with proper, clearer descriptions?

I believe that the urgent problem can be solved by making the buffer to
per-CPU.  Or do you have any reason to rush it into ring buffer?


> - communicate with the original author firstly

Certainly I wrote following line at first:
 > Hi Huang,


> - provide another patch on top of original patches

Since what I want to demonstrate to you was another style of your patch,
I could not provide patches in incremental form as usual.


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2009-10-09  5:35 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-09-18 10:20 [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Huang Ying
2009-09-18 11:09 ` Ingo Molnar
2009-09-21  5:37   ` Huang Ying
2009-09-22 13:39     ` [PATCH] x86: mce: New MCE logging design Ingo Molnar
2009-10-05  6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
2009-10-05  6:33   ` [PATCH 01/10] x86, mce: remove tsc handling from mce_read Hidetoshi Seto
2009-10-05  6:34   ` [PATCH 02/10] x86, mce: mce_read can check args without mutex Hidetoshi Seto
2009-10-05  6:35   ` [PATCH 03/10] x86, mce: change writer timeout in mce_read Hidetoshi Seto
2009-10-05  6:36   ` [PATCH 04/10] x86, mce: use do-while in mce_log Hidetoshi Seto
2009-10-05  6:37   ` [PATCH 05/10] x86, mce: make mce_log buffer to per-CPU, prep Hidetoshi Seto
2009-10-05  6:38   ` [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU Hidetoshi Seto
2009-10-05  7:06     ` Andi Kleen
2009-10-05  7:50       ` Hidetoshi Seto
2009-10-09  1:45         ` Huang Ying
2009-10-09  5:34           ` Hidetoshi Seto
2009-10-05  6:40   ` [PATCH 07/10] x86, mce: remove for-loop in mce_log Hidetoshi Seto
2009-10-05  6:41   ` [PATCH 08/10] x86, mce: change barriers " Hidetoshi Seto
2009-10-05  6:42   ` [PATCH 09/10] x86, mce: make mce_log buffer to ring buffer Hidetoshi Seto
2009-10-05  6:44   ` [PATCH 10/10] x86, mce: move mce_log_init() into mce_cap_init() Hidetoshi Seto
2009-10-05  7:07   ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
2009-10-05  8:51   ` Frédéric Weisbecker
2009-10-05 15:16     ` Andi Kleen
2009-10-06  5:46     ` Hidetoshi Seto

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.