All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH -v2 0/9] ACPI, APEI patches for 2.6.37
@ 2010-10-25  7:43 Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 1/9] ACPI, APEI, Add ERST record ID cache Huang Ying
                   ` (8 more replies)
  0 siblings, 9 replies; 48+ messages in thread
From: Huang Ying @ 2010-10-25  7:43 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

v2:

- Some minor changes according to Andi's comments.


[PATCH -v2 1/9] ACPI, APEI, Add ERST record ID cache
[PATCH -v2 2/9] Add lock-less version of bitmap_set/clear
[PATCH -v2 3/9] lock-less NULL terminated single list implementation
[PATCH -v2 4/9] lock-less general memory allocator
[PATCH -v2 5/9] Hardware error device core
[PATCH -v2 6/9] Hardware error record persistent support
[PATCH -v2 7/9] ACPI, APEI, Use ERST for hardware error persisting before panic
[PATCH -v2 8/9] ACPI, APEI, Report GHES error record with hardware error device core
[PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH -v2 1/9] ACPI, APEI, Add ERST record ID cache
  2010-10-25  7:43 [PATCH -v2 0/9] ACPI, APEI patches for 2.6.37 Huang Ying
@ 2010-10-25  7:43 ` Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 2/9] Add lock-less version of bitmap_set/clear Huang Ying
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 48+ messages in thread
From: Huang Ying @ 2010-10-25  7:43 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

APEI ERST firmware interface and implementation has no multiple users
in mind. For example, there is four records in storage with ID: 1, 2,
3 and 4, if two ERST readers enumerate the records via
GET_NEXT_RECORD_ID as follow,

reader 1		reader 2
1
			2
3
			4
-1
			-1

where -1 signals there is no more record ID.

Reader 1 has no chance to check record 2 and 4, while reader 2 has no
chance to check record 1 and 3. And any other GET_NEXT_RECORD_ID will
return -1, that is, other readers will has no chance to check any
record even they are not cleared by anyone.

This makes raw GET_NEXT_RECORD_ID not suitable for usage of multiple
users.

To solve the issue, an in memory ERST record ID cache is designed and
implemented. When enumerating record ID, the ID returned by
GET_NEXT_RECORD_ID is added into cache in addition to be returned to
caller. So other readers can check the cache to get all record ID
available.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-apei.c |   40 +++--
 drivers/acpi/apei/erst-dbg.c          |   24 ++-
 drivers/acpi/apei/erst.c              |  241 +++++++++++++++++++++++++++-------
 include/acpi/apei.h                   |    5 
 4 files changed, 246 insertions(+), 64 deletions(-)

--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m)
 ssize_t apei_read_mce(struct mce *m, u64 *record_id)
 {
 	struct cper_mce_record rcd;
-	ssize_t len;
-
-	len = erst_read_next(&rcd.hdr, sizeof(rcd));
-	if (len <= 0)
-		return len;
-	/* Can not skip other records in storage via ERST unless clear them */
-	else if (len != sizeof(rcd) ||
-		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
-		if (printk_ratelimit())
-			pr_warning(
-			"MCE-APEI: Can not skip the unknown record in ERST");
-		return -EIO;
-	}
+	int rc, pos;
 
+	rc = erst_get_record_id_begin(&pos);
+	if (rc)
+		return rc;
+retry:
+	rc = erst_get_record_id_next(&pos, record_id);
+	if (rc)
+		goto out;
+	/* no more record */
+	if (*record_id == APEI_ERST_INVALID_RECORD_ID)
+		goto out;
+	rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
+	/* someone else has cleared the record, try next one */
+	if (rc == -ENOENT)
+		goto retry;
+	else if (rc < 0)
+		goto out;
+	/* try to skip other type records in storage */
+	else if (rc != sizeof(rcd) ||
+		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
+		goto retry;
 	memcpy(m, &rcd.mce, sizeof(*m));
-	*record_id = rcd.hdr.record_id;
+	rc = sizeof(*m);
+out:
+	erst_get_record_id_end();
 
-	return sizeof(*m);
+	return rc;
 }
 
 /* Check whether there is record in ERST */
--- a/drivers/acpi/apei/erst-dbg.c
+++ b/drivers/acpi/apei/erst-dbg.c
@@ -43,12 +43,27 @@ static DEFINE_MUTEX(erst_dbg_mutex);
 
 static int erst_dbg_open(struct inode *inode, struct file *file)
 {
+	int rc, *pos;
+
 	if (erst_disable)
 		return -ENODEV;
 
+	pos = (int *)&file->private_data;
+
+	rc = erst_get_record_id_begin(pos);
+	if (rc)
+		return rc;
+
 	return nonseekable_open(inode, file);
 }
 
+static int erst_dbg_release(struct inode *inode, struct file *file)
+{
+	erst_get_record_id_end();
+
+	return 0;
+}
+
 static long erst_dbg_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 {
 	int rc;
@@ -79,18 +94,20 @@ static long erst_dbg_ioctl(struct file *
 static ssize_t erst_dbg_read(struct file *filp, char __user *ubuf,
 			     size_t usize, loff_t *off)
 {
-	int rc;
+	int rc, *pos;
 	ssize_t len = 0;
 	u64 id;
 
-	if (*off != 0)
+	if (*off)
 		return -EINVAL;
 
 	if (mutex_lock_interruptible(&erst_dbg_mutex) != 0)
 		return -EINTR;
 
+	pos = (int *)&filp->private_data;
+
 retry_next:
-	rc = erst_get_next_record_id(&id);
+	rc = erst_get_record_id_next(pos, &id);
 	if (rc)
 		goto out;
 	/* no more record */
@@ -181,6 +198,7 @@ out:
 static const struct file_operations erst_dbg_ops = {
 	.owner		= THIS_MODULE,
 	.open		= erst_dbg_open,
+	.release	= erst_dbg_release,
 	.read		= erst_dbg_read,
 	.write		= erst_dbg_write,
 	.unlocked_ioctl	= erst_dbg_ioctl,
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -429,6 +429,27 @@ ssize_t erst_get_record_count(void)
 }
 EXPORT_SYMBOL_GPL(erst_get_record_count);
 
+#define ERST_RECORD_ID_CACHE_SIZE_MIN	16
+#define ERST_RECORD_ID_CACHE_SIZE_MAX	1024
+
+struct erst_record_id_entry {
+	u64 id;
+	unsigned int cleared:1;
+};
+
+struct erst_record_id_cache {
+	struct mutex lock;
+	struct erst_record_id_entry *entries;
+	int len;
+	int size;
+	int refcount;
+};
+
+static struct erst_record_id_cache erst_record_id_cache = {
+	.lock = __MUTEX_INITIALIZER(erst_record_id_cache.lock),
+	.refcount = 0,
+};
+
 static int __erst_get_next_record_id(u64 *record_id)
 {
 	struct apei_exec_context ctx;
@@ -443,26 +464,180 @@ static int __erst_get_next_record_id(u64
 	return 0;
 }
 
+int erst_get_record_id_begin(int *pos)
+{
+	int rc;
+
+	if (erst_disable)
+		return -ENODEV;
+
+	rc = mutex_lock_interruptible(&erst_record_id_cache.lock);
+	if (rc)
+		return rc;
+	erst_record_id_cache.refcount++;
+	mutex_unlock(&erst_record_id_cache.lock);
+
+	*pos = 0;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(erst_get_record_id_begin);
+
+/* erst_record_id_cache.lock must be held by caller */
+static int erst_record_id_cache_add_one(void)
+{
+	u64 id, prev_id, first_id;
+	int i, rc;
+	struct erst_record_id_entry *entries;
+	unsigned long flags;
+
+	id = prev_id = first_id = APEI_ERST_INVALID_RECORD_ID;
+retry:
+	spin_lock_irqsave(&erst_lock, flags);
+	rc = __erst_get_next_record_id(&id);
+	spin_unlock_irqrestore(&erst_lock, flags);
+	if (rc == -ENOENT)
+		return 0;
+	if (rc)
+		return rc;
+	if (id == APEI_ERST_INVALID_RECORD_ID)
+		return 0;
+	/* can not skip current ID, or look back to first ID */
+	if (id == prev_id || id == first_id)
+		return 0;
+	if (first_id == APEI_ERST_INVALID_RECORD_ID)
+		first_id = id;
+	prev_id = id;
+
+	entries = erst_record_id_cache.entries;
+	for (i = 0; i < erst_record_id_cache.len; i++) {
+		if (!entries[i].cleared && entries[i].id == id)
+			break;
+	}
+	/* record id already in cache, try next */
+	if (i < erst_record_id_cache.len)
+		goto retry;
+	if (erst_record_id_cache.len >= erst_record_id_cache.size) {
+		int new_size, alloc_size;
+		struct erst_record_id_entry *new_entries;
+
+		new_size = erst_record_id_cache.size * 2;
+		new_size = clamp_val(new_size, ERST_RECORD_ID_CACHE_SIZE_MIN,
+				     ERST_RECORD_ID_CACHE_SIZE_MAX);
+		if (new_size <= erst_record_id_cache.size) {
+			if (printk_ratelimit())
+				pr_warning(FW_WARN ERST_PFX
+					   "too many record ID!\n");
+			return 0;
+		}
+		alloc_size = new_size * sizeof(struct erst_record_id_entry);
+		if (alloc_size < PAGE_SIZE)
+			new_entries = kmalloc(alloc_size, GFP_KERNEL);
+		else
+			new_entries = vmalloc(alloc_size);
+		if (!new_entries)
+			return -ENOMEM;
+		memcpy(new_entries, entries,
+		       erst_record_id_cache.len * sizeof(entries[0]));
+		if (erst_record_id_cache.size < PAGE_SIZE)
+			kfree(entries);
+		else
+			vfree(entries);
+		erst_record_id_cache.entries = entries = new_entries;
+		erst_record_id_cache.size = new_size;
+	}
+	memset(&entries[i], 0, sizeof(entries[i]));
+	entries[i].id = id;
+	erst_record_id_cache.len++;
+
+	return 1;
+}
+
 /*
  * Get the record ID of an existing error record on the persistent
  * storage. If there is no error record on the persistent storage, the
  * returned record_id is APEI_ERST_INVALID_RECORD_ID.
  */
-int erst_get_next_record_id(u64 *record_id)
+int erst_get_record_id_next(int *pos, u64 *record_id)
 {
-	int rc;
-	unsigned long flags;
+	int rc = 0;
+	struct erst_record_id_entry *entries;
 
 	if (erst_disable)
 		return -ENODEV;
 
-	spin_lock_irqsave(&erst_lock, flags);
-	rc = __erst_get_next_record_id(record_id);
-	spin_unlock_irqrestore(&erst_lock, flags);
+	/* must be enclosed by erst_get_record_id_begin/end */
+	BUG_ON(!erst_record_id_cache.refcount);
+	BUG_ON(*pos < 0 || *pos > erst_record_id_cache.len);
+
+	mutex_lock(&erst_record_id_cache.lock);
+	entries = erst_record_id_cache.entries;
+	for (; *pos < erst_record_id_cache.len; (*pos)++)
+		if (!entries[*pos].cleared)
+			break;
+	/* found next record id in cache */
+	if (*pos < erst_record_id_cache.len) {
+		*record_id = entries[*pos].id;
+		(*pos)++;
+		goto out_unlock;
+	}
+
+	/* Try to add one more record ID to cache */
+	rc = erst_record_id_cache_add_one();
+	if (rc < 0)
+		goto out_unlock;
+	/* successfully add one new ID */
+	if (rc == 1) {
+		entries = erst_record_id_cache.entries;
+		*record_id = entries[*pos].id;
+		(*pos)++;
+		rc = 0;
+	} else {
+		*pos = -1;
+		*record_id = APEI_ERST_INVALID_RECORD_ID;
+	}
+out_unlock:
+	mutex_unlock(&erst_record_id_cache.lock);
 
 	return rc;
 }
-EXPORT_SYMBOL_GPL(erst_get_next_record_id);
+EXPORT_SYMBOL_GPL(erst_get_record_id_next);
+
+static void __erst_record_id_cache_compact(void)
+{
+	int i, wpos = 0;
+	struct erst_record_id_entry *entries;
+
+	if (!erst_record_id_cache.refcount) {
+		entries = erst_record_id_cache.entries;
+		for (i = 0; i < erst_record_id_cache.len; i++) {
+			if (entries[i].cleared)
+				continue;
+			if (wpos != i)
+				memcpy(&entries[wpos], &entries[i],
+				       sizeof(entries[i]));
+			wpos++;
+		}
+		erst_record_id_cache.len = wpos;
+	}
+}
+
+void erst_get_record_id_end(void)
+{
+	/*
+	 * erst_disable != 0 should be detected by invoker via the
+	 * return value of erst_get_record_id_begin/next, so this
+	 * function should not be called for erst_disable != 0.
+	 */
+	BUG_ON(erst_disable);
+
+	mutex_lock(&erst_record_id_cache.lock);
+	erst_record_id_cache.refcount--;
+	BUG_ON(erst_record_id_cache.refcount < 0);
+	__erst_record_id_cache_compact();
+	mutex_unlock(&erst_record_id_cache.lock);
+}
+EXPORT_SYMBOL_GPL(erst_get_record_id_end);
 
 static int __erst_write_to_storage(u64 offset)
 {
@@ -703,56 +878,34 @@ ssize_t erst_read(u64 record_id, struct
 }
 EXPORT_SYMBOL_GPL(erst_read);
 
-/*
- * If return value > buflen, the buffer size is not big enough,
- * else if return value = 0, there is no more record to read,
- * else if return value < 0, something goes wrong,
- * else everything is OK, and return value is record length
- */
-ssize_t erst_read_next(struct cper_record_header *record, size_t buflen)
-{
-	int rc;
-	ssize_t len;
-	unsigned long flags;
-	u64 record_id;
-
-	if (erst_disable)
-		return -ENODEV;
-
-	spin_lock_irqsave(&erst_lock, flags);
-	rc = __erst_get_next_record_id(&record_id);
-	if (rc) {
-		spin_unlock_irqrestore(&erst_lock, flags);
-		return rc;
-	}
-	/* no more record */
-	if (record_id == APEI_ERST_INVALID_RECORD_ID) {
-		spin_unlock_irqrestore(&erst_lock, flags);
-		return 0;
-	}
-
-	len = __erst_read(record_id, record, buflen);
-	spin_unlock_irqrestore(&erst_lock, flags);
-
-	return len;
-}
-EXPORT_SYMBOL_GPL(erst_read_next);
-
 int erst_clear(u64 record_id)
 {
-	int rc;
+	int rc, i;
 	unsigned long flags;
+	struct erst_record_id_entry *entries;
 
 	if (erst_disable)
 		return -ENODEV;
 
+	rc = mutex_lock_interruptible(&erst_record_id_cache.lock);
+	if (rc)
+		return rc;
 	spin_lock_irqsave(&erst_lock, flags);
 	if (erst_erange.attr & ERST_RANGE_NVRAM)
 		rc = __erst_clear_from_nvram(record_id);
 	else
 		rc = __erst_clear_from_storage(record_id);
 	spin_unlock_irqrestore(&erst_lock, flags);
-
+	if (rc)
+		goto out;
+	entries = erst_record_id_cache.entries;
+	for (i = 0; i < erst_record_id_cache.len; i++) {
+		if (!entries[i].cleared && entries[i].id == record_id)
+			entries[i].cleared = 1;
+	}
+	__erst_record_id_cache_compact();
+out:
+	mutex_unlock(&erst_record_id_cache.lock);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(erst_clear);
--- a/include/acpi/apei.h
+++ b/include/acpi/apei.h
@@ -24,10 +24,11 @@ int apei_hest_parse(apei_hest_func_t fun
 
 int erst_write(const struct cper_record_header *record);
 ssize_t erst_get_record_count(void);
-int erst_get_next_record_id(u64 *record_id);
+int erst_get_record_id_begin(int *pos);
+int erst_get_record_id_next(int *pos, u64 *record_id);
+void erst_get_record_id_end(void);
 ssize_t erst_read(u64 record_id, struct cper_record_header *record,
 		  size_t buflen);
-ssize_t erst_read_next(struct cper_record_header *record, size_t buflen);
 int erst_clear(u64 record_id);
 
 #endif

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH -v2 2/9] Add lock-less version of bitmap_set/clear
  2010-10-25  7:43 [PATCH -v2 0/9] ACPI, APEI patches for 2.6.37 Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 1/9] ACPI, APEI, Add ERST record ID cache Huang Ying
@ 2010-10-25  7:43 ` Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 3/9] lock-less NULL terminated single list implementation Huang Ying
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 48+ messages in thread
From: Huang Ying @ 2010-10-25  7:43 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

cmpxchg is used to change the bitmap instead of the ordinary unsigned
long assigning. Several users can set/clear the same bitmap
simultaneously without lock. If there is conflict between two user,
(set/clear same bit), one will return remain bits immediately.

This can be used to implement the lock-less resource allocator.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/bitmap.h |    4 +
 lib/bitmap.c           |  101 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+)

--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -44,6 +44,8 @@
  * bitmap_weight(src, nbits)			Hamming Weight: number set bits
  * bitmap_set(dst, pos, nbits)			Set specified bit area
  * bitmap_clear(dst, pos, nbits)		Clear specified bit area
+ * bitmap_set_ll(dst, pos, nbits)		Set specified bit area, lock-less version
+ * bitmap_clear_ll(dst, pos, nbits)		Clear specified bit area, lock-less version
  * bitmap_find_next_zero_area(buf, len, pos, n, mask)	Find bit free area
  * bitmap_shift_right(dst, src, n, nbits)	*dst = *src >> n
  * bitmap_shift_left(dst, src, n, nbits)	*dst = *src << n
@@ -113,6 +115,8 @@ extern int __bitmap_weight(const unsigne
 
 extern void bitmap_set(unsigned long *map, int i, int len);
 extern void bitmap_clear(unsigned long *map, int start, int nr);
+extern int bitmap_set_ll(unsigned long *map, int i, int len);
+extern int bitmap_clear_ll(unsigned long *map, int start, int nr);
 extern unsigned long bitmap_find_next_zero_area(unsigned long *map,
 					 unsigned long size,
 					 unsigned long start,
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -315,6 +315,107 @@ void bitmap_clear(unsigned long *map, in
 }
 EXPORT_SYMBOL(bitmap_clear);
 
+static inline int set_bits_ll(unsigned long *addr, unsigned long mask_to_set)
+{
+	unsigned long val, nval;
+
+	nval = *addr;
+	do {
+		val = nval;
+		if (val & mask_to_set)
+			return -EBUSY;
+	} while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val);
+
+	return 0;
+}
+
+static inline int clear_bits_ll(unsigned long *addr,
+				unsigned long mask_to_clear)
+{
+	unsigned long val, nval;
+
+	nval = *addr;
+	do {
+		val = nval;
+		if ((val & mask_to_clear) != mask_to_clear)
+			return -EBUSY;
+	} while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val);
+
+	return 0;
+}
+
+/**
+ * bitmap_set_ll - set the specified number of bits at the specified position
+ * @map: pointer to a bitmap
+ * @start: a bit position in @map
+ * @nr: number of bits to set
+ *
+ * Set @nr bits start from @start in @map lock-lessly. Several users
+ * can set/clear the same bitmap simultaneously without lock. If two
+ * users set the same bit, one user will return remain bits, otherwise
+ * return 0.
+ */
+int bitmap_set_ll(unsigned long *map, int start, int nr)
+{
+	unsigned long *p = map + BIT_WORD(start);
+	const int size = start + nr;
+	int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+	unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+	while (nr - bits_to_set >= 0) {
+		if (set_bits_ll(p, mask_to_set))
+			return nr;
+		nr -= bits_to_set;
+		bits_to_set = BITS_PER_LONG;
+		mask_to_set = ~0UL;
+		p++;
+	}
+	if (nr) {
+		mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+		if (set_bits_ll(p, mask_to_set))
+			return nr;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(bitmap_set_ll);
+
+/**
+ * bitmap_clear_ll - clear the specified number of bits at the specified position
+ * @map: pointer to a bitmap
+ * @start: a bit position in @map
+ * @nr: number of bits to set
+ *
+ * Clear @nr bits start from @start in @map lock-lessly. Several users
+ * can set/clear the same bitmap simultaneously without lock. If two
+ * users clear the same bit, one user will return remain bits,
+ * otherwise return 0.
+ */
+int bitmap_clear_ll(unsigned long *map, int start, int nr)
+{
+	unsigned long *p = map + BIT_WORD(start);
+	const int size = start + nr;
+	int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+	unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+	while (nr - bits_to_clear >= 0) {
+		if (clear_bits_ll(p, mask_to_clear))
+			return nr;
+		nr -= bits_to_clear;
+		bits_to_clear = BITS_PER_LONG;
+		mask_to_clear = ~0UL;
+		p++;
+	}
+	if (nr) {
+		mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+		if (clear_bits_ll(p, mask_to_clear))
+			return nr;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(bitmap_clear_ll);
+
 /*
  * bitmap_find_next_zero_area - find a contiguous aligned zero area
  * @map: The address to base the search on

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH -v2 3/9] lock-less NULL terminated single list implementation
  2010-10-25  7:43 [PATCH -v2 0/9] ACPI, APEI patches for 2.6.37 Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 1/9] ACPI, APEI, Add ERST record ID cache Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 2/9] Add lock-less version of bitmap_set/clear Huang Ying
@ 2010-10-25  7:43 ` Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 4/9] lock-less general memory allocator Huang Ying
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 48+ messages in thread
From: Huang Ying @ 2010-10-25  7:43 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Cmpxchg is used to implement adding new entry to list, deleting first
entry of the list and some other operations.

Because this is a single list, so the tail can not be accessed in O(1).

This can be used in NMI handler.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/llist.h |   64 ++++++++++++++++++++++++++++++++++++++
 lib/Kconfig           |    3 +
 lib/Makefile          |    2 +
 lib/llist.c           |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 153 insertions(+)
 create mode 100644 include/linux/llist.h
 create mode 100644 lib/llist.c

--- /dev/null
+++ b/include/linux/llist.h
@@ -0,0 +1,64 @@
+#ifndef LLIST_H
+#define LLIST_H
+
+/* lock-less NULL terminated single linked list */
+struct llist_head {
+	struct llist_head *next;
+};
+
+#define LLIST_HEAD_INIT(name) { NULL }
+
+#define LLIST_HEAD(name)				\
+	struct llist_head name = LLIST_HEAD_INIT(name)
+
+/**
+ * init_llist_head - initialize lock-less list head
+ * @head:	the head for your lock-less list
+ */
+static inline void init_llist_head(struct llist_head *list)
+{
+	list->next = NULL;
+}
+
+/**
+ * llist_entry - get the struct of this entry
+ * @ptr:	the &struct llist_head pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the llist_head within the struct.
+ */
+#define llist_entry(ptr, type, member)		\
+	container_of(ptr, type, member)
+
+/**
+ * llist_for_each - iterate over a lock-less list
+ * @pos:	the &struct llist_head to use as a loop cursor
+ * @head:	the head for your lock-less list
+ */
+#define llist_for_each(pos, head)					\
+	for (pos = (head)->next; pos; pos = pos->next)
+
+/**
+ * llist_for_each_entry - iterate over lock-less list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your lock-less list.
+ * @member:	the name of the llist_head with the struct.
+ */
+#define llist_for_each_entry(pos, head, member)				\
+	for (pos = llist_entry((head)->next, typeof(*pos), member);	\
+	     &pos->member != NULL;					\
+	     pos = llist_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * llist_empty - tests whether a lock-less list is empty
+ * @head:	the list to test
+ */
+static inline int llist_empty(const struct llist_head *head)
+{
+	return head->next == NULL;
+}
+
+void llist_add(struct llist_head *new, struct llist_head *head);
+struct llist_head *llist_del_first(struct llist_head *head);
+void llist_move_all(struct llist_head *list, struct llist_head *head);
+
+#endif /* LLIST_H */
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -210,4 +210,7 @@ config GENERIC_ATOMIC64
 config LRU_CACHE
 	tristate
 
+config LLIST
+	bool
+
 endmenu
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -106,6 +106,8 @@ obj-$(CONFIG_GENERIC_ATOMIC64) += atomic
 
 obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
 
+obj-$(CONFIG_LLIST) += llist.o
+
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
--- /dev/null
+++ b/lib/llist.c
@@ -0,0 +1,84 @@
+/*
+ * Simple lock-less NULL terminated single list implementation
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/llist.h>
+#include <linux/errno.h>
+
+#include <asm/system.h>
+
+/**
+ * llist_add - add a new entry
+ * @new:	new entry to be added
+ * @head:	the head for your lock-less list
+ */
+void llist_add(struct llist_head *new, struct llist_head *head)
+{
+	struct llist_head *entry;
+
+	do {
+		entry = head->next;
+		new->next = entry;
+	} while (cmpxchg(&head->next, entry, new) != entry);
+}
+EXPORT_SYMBOL_GPL(llist_add);
+
+/**
+ * llist_del_first - delete the first entry of lock-less list
+ * @head:	the head for your lock-less list
+ *
+ * If list is empty, return NULL, otherwise, return the first entry deleted.
+ *
+ * Only one llist_del_first user can be used simultaneously with
+ * multiple llist_add users without lock. Because otherwise
+ * llist_del_first, llist_add, llist_add sequence in another user may
+ * change @head->next->next, but keep @head->next. If multiple
+ * consumers are needed, please use llist_move_all.
+ */
+struct llist_head *llist_del_first(struct llist_head *head)
+{
+	struct llist_head *entry;
+
+	do {
+		entry = head->next;
+		if (entry == NULL)
+			return NULL;
+	} while (cmpxchg(&head->next, entry, entry->next) != entry);
+
+	return entry;
+}
+EXPORT_SYMBOL_GPL(llist_del_first);
+
+/**
+ * llist_move_all - delete all entries from one list and add them to another list
+ * @list:	the head of lock-less list to delete all entries
+ * @head:	the head of lock-less list to add the entries
+ *
+ * Remove all entries from @list lock-lessly, then add the entries to
+ * lock-less list @head.
+ */
+void llist_move_all(struct llist_head *list, struct llist_head *head)
+{
+	struct llist_head *entry;
+
+	entry = xchg(&list->next, NULL);
+	head->next = entry;
+}
+EXPORT_SYMBOL_GPL(llist_move_all);

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH -v2 4/9] lock-less general memory allocator
  2010-10-25  7:43 [PATCH -v2 0/9] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (2 preceding siblings ...)
  2010-10-25  7:43 ` [PATCH -v2 3/9] lock-less NULL terminated single list implementation Huang Ying
@ 2010-10-25  7:43 ` Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 5/9] Hardware error device core Huang Ying
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 48+ messages in thread
From: Huang Ying @ 2010-10-25  7:43 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Lock-less memory allocator, can be used to allocate/free memory in
IRQ/NMI handler. This is useful for hardware error handling, which
needs to collect hardware error information in IRQ/NMI handler.

The memory pages for lock-less memory allocator are pre-allocated
during initialization. Bitmap is used to record allocated/free memory
blocks. To support lock-less allocate/free operation, lock-less bitmap
set/clear operations are used.

The difference between this allocator and the gen_pool implementation
in lib/genalloc.c is that memory can be allocated/freed by multiple
users simultaneously without lock.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/llalloc.h |   27 +++++
 lib/Kconfig             |    3 
 lib/Makefile            |    1 
 lib/llalloc.c           |  259 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 290 insertions(+)
 create mode 100644 include/linux/llalloc.h
 create mode 100644 lib/llalloc.c

--- /dev/null
+++ b/include/linux/llalloc.h
@@ -0,0 +1,27 @@
+#ifndef LLALLOC_H
+#define LLALLOC_H
+
+struct ll_pool;
+
+struct ll_pool_chunk {
+	atomic_t avail;
+	void *data;
+	struct ll_pool *pool;
+	unsigned long bitmap[0];
+};
+
+struct ll_pool {
+	int min_alloc_order;
+	int chunk_order;
+	int chunk_num;
+	struct ll_pool_chunk *chunks[0];
+};
+
+struct ll_pool *ll_pool_create(int min_alloc_order, int chunk_order,
+			       int chunk_num, int nid);
+void ll_pool_destroy(struct ll_pool *pool);
+void *ll_pool_alloc(struct ll_pool *pool, size_t len);
+void ll_pool_free(const void *p, size_t len);
+size_t ll_pool_avail(struct ll_pool *pool);
+size_t ll_pool_size(struct ll_pool *pool);
+#endif /* LLALLOC_H */
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -213,4 +213,7 @@ config LRU_CACHE
 config LLIST
 	bool
 
+config LLALLOC
+	bool
+
 endmenu
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_GENERIC_ATOMIC64) += atomic
 obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
 
 obj-$(CONFIG_LLIST) += llist.o
+obj-$(CONFIG_LLALLOC) += llalloc.o
 
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
--- /dev/null
+++ b/lib/llalloc.c
@@ -0,0 +1,259 @@
+/*
+ * Lock-less memory allocator, This can be used to allocate/free
+ * memory in IRQ/NMI handler. This is useful for hardware error
+ * handling, which needs to collect hardware error information in
+ * IRQ/NMI handler.
+ *
+ * The memory pages for lock-less memory allocator are pre-allocated
+ * during initialization. Bitmap is used to record allocated/free
+ * memory blocks. To support lock-less allocate/free operation,
+ * lock-less bitmap set/clear operations are used.
+ *
+ * The difference between this allocator and the gen_pool
+ * implementation in lib/genalloc.c is that memory can be
+ * allocated/freed by multiple users simultaneously without lock.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/llalloc.h>
+
+static inline void ll_page_set_chunk(struct page *page,
+				     struct ll_pool_chunk *chunk)
+{
+	page->lru.prev = (struct list_head *)chunk;
+}
+
+static inline struct ll_pool_chunk *ll_virt_to_pool_chunk(const void *p)
+{
+	struct page *page = virt_to_head_page(p);
+	return (struct ll_pool_chunk *)page->lru.prev;
+}
+
+static void ll_pool_chunk_destroy(struct ll_pool_chunk *chunk)
+{
+	int chunk_size = PAGE_SIZE << chunk->pool->chunk_order;
+
+	BUG_ON(atomic_read(&chunk->avail) != chunk_size);
+	__free_pages(virt_to_page(chunk->data), chunk->pool->chunk_order);
+	kfree(chunk);
+}
+
+/**
+ * ll_pool_destroy - destroy a lock-less memory pool
+ * @ pool: pool to destroy
+ *
+ * Destroy the lock-less memory pool. Verify that there are no
+ * outstanding allocations. Memory pages backed the lock-less
+ * allocator are freed.
+ */
+void ll_pool_destroy(struct ll_pool *pool)
+{
+	int i;
+
+	for (i = 0; i < pool->chunk_num; i++) {
+		if (pool->chunks[i])
+			ll_pool_chunk_destroy(pool->chunks[i]);
+	}
+}
+EXPORT_SYMBOL_GPL(ll_pool_destroy);
+
+static struct ll_pool_chunk *ll_pool_chunk_create(struct ll_pool *pool, int nid)
+{
+	struct ll_pool_chunk *chunk;
+	int chunk_size = PAGE_SIZE << pool->chunk_order;
+	int nbits = chunk_size >> pool->min_alloc_order;
+	int nbytes = sizeof(struct ll_pool_chunk) + \
+		DIV_ROUND_UP(nbits, BITS_PER_LONG) * sizeof(unsigned long);
+	struct page *page;
+
+	chunk = kmalloc_node(nbytes, GFP_KERNEL | __GFP_ZERO, nid);
+	if (!chunk)
+		return NULL;
+	page = alloc_pages_node(nid, GFP_KERNEL, pool->chunk_order);
+	if (!page)
+		goto err_free_chunk;
+	chunk->data = page_address(page);
+	atomic_set(&chunk->avail, chunk_size);
+	ll_page_set_chunk(page, chunk);
+	chunk->pool = pool;
+
+	return chunk;
+err_free_chunk:
+	kfree(chunk);
+	return NULL;
+}
+
+/**
+ * ll_pool_create - create a new lock-less memory pool
+ * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
+ * @chunk_order: log base 2 of number of pages each chunk manages
+ * @chunk_num: number of chunks
+ * @nid: node id of the node the pool structure should be allocated on, or -1
+ *
+ * Create a new lock-less memory pool that can be used in IRQ/NMI
+ * context. (PAGE_SIZE << @chunk_order) * @chunk_num memory pages
+ * backed the lock-less allocator are allocated with alloc_pages_node.
+ */
+struct ll_pool *ll_pool_create(int min_alloc_order, int chunk_order,
+			       int chunk_num, int nid)
+{
+	struct ll_pool *pool;
+	int i;
+
+	pool = kmalloc_node(sizeof(*pool) + chunk_num * sizeof(void *),
+			    GFP_KERNEL | __GFP_ZERO, nid);
+	if (!pool)
+		return NULL;
+	pool->min_alloc_order = min_alloc_order;
+	pool->chunk_order = chunk_order;
+	pool->chunk_num = chunk_num;
+	for (i = 0; i < chunk_num; i++) {
+		pool->chunks[i] = ll_pool_chunk_create(pool, nid);
+		if (!pool->chunks[i])
+			goto err_pool_destroy;
+	}
+
+	return pool;
+err_pool_destroy:
+	ll_pool_destroy(pool);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(ll_pool_create);
+
+static void *ll_pool_chunk_alloc(struct ll_pool_chunk *chunk, size_t len)
+{
+	struct ll_pool *pool = chunk->pool;
+	int order = pool->min_alloc_order;
+	int chunk_bits = (PAGE_SIZE << pool->chunk_order) >> order;
+	int pos = 0, bits, remain;
+
+	if (len > atomic_read(&chunk->avail))
+		return NULL;
+
+	bits = (len + (1UL << order) - 1) >> order;
+	for (;;) {
+		pos = bitmap_find_next_zero_area(chunk->bitmap, chunk_bits,
+						 pos, bits, 0);
+		if (pos >= chunk_bits)
+			return NULL;
+		remain = bitmap_set_ll(chunk->bitmap, pos, bits);
+		if (!remain)
+			break;
+		remain = bitmap_clear_ll(chunk->bitmap, pos, bits - remain);
+		BUG_ON(remain);
+	}
+	len = bits << order;
+	atomic_sub(len, &chunk->avail);
+
+	return chunk->data + (pos << order);
+}
+
+/**
+ * ll_pool_alloc - allocate memory from the pool
+ * @pool: pool to allocate from
+ * @len: number of bytes to allocate from the pool
+ *
+ * Allocate the required number of bytes from the pool. Uses a
+ * first-fit algorithm.
+ */
+void *ll_pool_alloc(struct ll_pool *pool, size_t len)
+{
+	int i;
+	void *p;
+	struct ll_pool_chunk *chunk;
+
+	for (i = 0; i < pool->chunk_num; i++) {
+		chunk = pool->chunks[i];
+		p = ll_pool_chunk_alloc(chunk, len);
+		if (p)
+			return p;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(ll_pool_alloc);
+
+static void ll_pool_chunk_free(struct ll_pool_chunk *chunk,
+			       const void *p, size_t len)
+{
+	struct ll_pool *pool = chunk->pool;
+	int order = pool->min_alloc_order;
+	int mask = (1UL << order) - 1;
+	int chunk_size = PAGE_SIZE << pool->chunk_order;
+	int remain, pos, bits;
+
+	BUG_ON(p < chunk->data || p + len > chunk->data + chunk_size);
+	BUG_ON((p - chunk->data) & mask);
+	bits = (len + mask) >> order;
+	len = bits << order;
+	pos = (p - chunk->data) >> order;
+	remain = bitmap_clear_ll(chunk->bitmap, pos, bits);
+	BUG_ON(remain);
+	atomic_add(len, &chunk->avail);
+}
+
+/**
+ * ll_pool_free - free allocated memory back to the pool
+ * @p: starting address of memory to free back to pool
+ * @len: size in bytes of memory to free
+ *
+ * Free previously allocated memory back the the pool.
+ */
+void ll_pool_free(const void *p, size_t len)
+{
+	struct ll_pool_chunk *chunk;
+
+	chunk = ll_virt_to_pool_chunk(p);
+	ll_pool_chunk_free(chunk, p, len);
+}
+EXPORT_SYMBOL_GPL(ll_pool_free);
+
+/**
+ * ll_pool_avail - get available free space of the pool
+ * @pool: pool to get available free space
+ *
+ * Return available free space of the specified pool.
+ */
+size_t ll_pool_avail(struct ll_pool *pool)
+{
+	int i;
+	size_t avail = 0;
+
+	for (i = 0; i < pool->chunk_num; i++)
+		avail += atomic_read(&pool->chunks[i]->avail);
+
+	return avail;
+}
+EXPORT_SYMBOL_GPL(ll_pool_avail);
+
+/**
+ * ll_pool_size - get size in bytes of memory managed by the pool
+ * @pool: pool to get size
+ *
+ * Return size in bytes of memory managed by the pool.
+ */
+size_t ll_pool_size(struct ll_pool *pool)
+{
+	return (PAGE_SIZE << pool->chunk_order) * pool->chunk_num;
+}
+EXPORT_SYMBOL_GPL(ll_pool_size);

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH -v2 5/9] Hardware error device core
  2010-10-25  7:43 [PATCH -v2 0/9] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (3 preceding siblings ...)
  2010-10-25  7:43 ` [PATCH -v2 4/9] lock-less general memory allocator Huang Ying
@ 2010-10-25  7:43 ` Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 6/9] Hardware error record persistent support Huang Ying
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 48+ messages in thread
From: Huang Ying @ 2010-10-25  7:43 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Hardware error device is a kind of device which can report hardware
errors.  The examples of hardware error device include APEI GHES, PCIe
AER, etc.

Hardware error device core in this patch provides common services for
various hardware error devices.

Hardware error record data structure is defined to accommodate various
hardware error information.  Several error sections can be
incorporated into one error record to accumulate information for
multiple hardware components related to one error.  For example, for
an error on a device on the secondary side of a PCIe bridge, it is
useful to record error information from PCIe bridge and from the PCIe
device.

To manage various hardware error devices, struct herr_dev is defined
as a "sub-class" of "struct device".  It should be created as the
child device of the "real" hardware error device such as APEI GHES to
represent the hardware error reporting aspect of the device.  The
class of struct herr_dev instances is "error", so that all hardware
error devices can be enumerated via listing all links in
/sys/class/error.

The most important functionality that this patch provided is hardware
error reporting support.  Details is as follow.

A lock-less hardware error record allocator is provided.  So for
hardware error that can be ignored (such as corrected errors), it is
not needed to pre-allocate the error record or allocate the error
record on stack.

After filling in all necessary fields in hardware error record, the
error reporting is quite straightforward, just calling
herr_record_report, parameters are the error record itself and the
corresponding struct herr_dev.  All other special processing including
user space interface is handled by hardware error device core, this
make it much easier to write a hardware error handler.

Because the possibility for two or more hardware parts to go error
simultaneously is quite low, it is not necessary to create one device
file for each hardware error device.  One device file (/dev/error)
which mixed error records from all hardware error devices is created
as the user space interface.  Then, the user space hardware error
daemon can do some further processing (including logging into disk),
based on the hardware error records read from the device file.

Although the in-kernel and user space interface of hardware error
reporting is quite simple, there is some special consideration in
hardware error reporting implementation.

Hardware errors may burst, for example, same hardware errors may be
reported at high rate within a short interval, this will use up all
pre-allocated memory for error reporting, so that other hardware
errors come from same or different hardware device can not be logged.
To deal with this issue, a burst control algorithm is implemented.
The logging rate for errors come from one hardware error device is
throttled based on the available pre-allocated memory for error
reporting.  In this way we can log as many kinds of errors as possible
comes from as many error devices as possible.

This patch is designed by Andi Kleen and Huang Ying.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/char/Kconfig            |    2 
 drivers/char/Makefile           |    2 
 drivers/char/herror/Kconfig     |    4 
 drivers/char/herror/Makefile    |    1 
 drivers/char/herror/herr-core.c |  599 ++++++++++++++++++++++++++++++++++++++++
 include/linux/Kbuild            |    1 
 include/linux/herror.h          |   69 ++++
 include/linux/herror_record.h   |  100 ++++++
 8 files changed, 778 insertions(+)
 create mode 100644 drivers/char/herror/Kconfig
 create mode 100644 drivers/char/herror/Makefile
 create mode 100644 drivers/char/herror/herr-core.c
 create mode 100644 include/linux/herror.h
 create mode 100644 include/linux/herror_record.h

--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -1129,5 +1129,7 @@ config RAMOOPS
 	  This enables panic and oops messages to be logged to a circular
 	  buffer in RAM where it can be read back at some later point.
 
+source "drivers/char/herror/Kconfig"
+
 endmenu
 
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -115,6 +115,8 @@ obj-$(CONFIG_RAMOOPS)		+= ramoops.o
 obj-$(CONFIG_JS_RTC)		+= js-rtc.o
 js-rtc-y = rtc.o
 
+obj-$(CONFIG_HERR_DEV_CORE)	+= herror/
+
 # Files generated that shall be removed upon make clean
 clean-files := consolemap_deftbl.c defkeymap.c
 
--- /dev/null
+++ b/drivers/char/herror/Kconfig
@@ -0,0 +1,4 @@
+config HERR_DEV_CORE
+	bool
+	select LLIST
+	select LLALLOC
--- /dev/null
+++ b/drivers/char/herror/Makefile
@@ -0,0 +1 @@
+obj-y				+= herr-core.o
--- /dev/null
+++ b/drivers/char/herror/herr-core.c
@@ -0,0 +1,599 @@
+/*
+ * Hardware error device core
+ *
+ * Hardware error device is a kind of device which can report hardware
+ * errors.  The examples of hardware error device include APEI GHES,
+ * PCIe AER, etc.
+ *
+ * Hardware error device core provides common services for various
+ * hardware error devices, including hardware error record lock-less
+ * allocator, error reporting mechanism, hardware error device
+ * management, etc.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rculist.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <linux/uaccess.h>
+#include <linux/poll.h>
+#include <linux/ratelimit.h>
+#include <linux/nmi.h>
+#include <linux/llist.h>
+#include <linux/llalloc.h>
+#include <linux/herror.h>
+
+#define HERR_NOTIFY_BIT			0
+
+static unsigned long herr_flags;
+
+/*
+ * Record list management and error reporting
+ */
+
+struct herr_node {
+	struct llist_head llist;
+	struct herr_record ercd __attribute__((aligned(HERR_MIN_ALIGN)));
+};
+
+#define HERR_NODE_LEN(rcd_len)					\
+	((rcd_len) + sizeof(struct herr_node) - sizeof(struct herr_record))
+
+#define HERR_MIN_ALLOC_ORDER	HERR_MIN_ALIGN_ORDER
+#define HERR_CHUNK_ORDER	0
+#define HERR_CHUNKS_PER_CPU	2
+#define HERR_RCD_LIST_NUM	2
+
+struct herr_rcd_lists {
+	struct llist_head *write;
+	struct llist_head *read;
+	struct llist_head heads[HERR_RCD_LIST_NUM];
+};
+
+static DEFINE_PER_CPU(struct herr_rcd_lists, herr_rcd_lists);
+
+static DEFINE_PER_CPU(struct ll_pool *, herr_ll_pool);
+
+static void herr_rcd_lists_init(void)
+{
+	int cpu, i;
+	struct herr_rcd_lists *lists;
+
+	for_each_possible_cpu(cpu) {
+		lists = per_cpu_ptr(&herr_rcd_lists, cpu);
+		for (i = 0; i < HERR_RCD_LIST_NUM; i++)
+			init_llist_head(&lists->heads[i]);
+		lists->write = &lists->heads[0];
+		lists->read = &lists->heads[1];
+	}
+}
+
+static void herr_pool_fini(void)
+{
+	struct ll_pool *pool;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		pool = per_cpu(herr_ll_pool, cpu);
+		ll_pool_destroy(pool);
+	}
+}
+
+static int herr_pool_init(void)
+{
+	struct ll_pool **pool;
+	int cpu, rc;
+
+	for_each_possible_cpu(cpu) {
+		pool = per_cpu_ptr(&herr_ll_pool, cpu);
+		rc = -ENOMEM;
+		*pool = ll_pool_create(HERR_MIN_ALLOC_ORDER, HERR_CHUNK_ORDER,
+				       HERR_CHUNKS_PER_CPU, cpu_to_node(cpu));
+		if (!*pool)
+			goto err_pool_fini;
+	}
+
+	return 0;
+err_pool_fini:
+	herr_pool_fini();
+	return rc;
+}
+
+/* Max interval: about 2 second */
+#define HERR_BURST_BASE_INTVL	NSEC_PER_USEC
+#define HERR_BURST_MAX_RATIO	21
+#define HERR_BURST_MAX_INTVL						\
+	((1ULL << HERR_BURST_MAX_RATIO) * HERR_BURST_BASE_INTVL)
+/*
+ * Pool size/used ratio considered spare, before this, interval
+ * between error reporting is ignored. After this, minimal interval
+ * needed is increased exponentially to max interval.
+ */
+#define HERR_BURST_SPARE_RATIO	3
+
+static int herr_burst_control(struct herr_dev *edev)
+{
+	struct ll_pool *pool;
+	unsigned long long last, now, min_intvl;
+	unsigned int size, used, ratio;
+
+	pool = __get_cpu_var(herr_ll_pool);
+	size = ll_pool_size(pool);
+	used = size - ll_pool_avail(pool);
+	if (HERR_BURST_SPARE_RATIO * used < size)
+		goto pass;
+	now = trace_clock_local();
+	last = atomic64_read(&edev->timestamp);
+	ratio = (used * HERR_BURST_SPARE_RATIO - size) * HERR_BURST_MAX_RATIO;
+	ratio = ratio / (size * HERR_BURST_SPARE_RATIO - size) + 1;
+	min_intvl = (1ULL << ratio) * HERR_BURST_BASE_INTVL;
+	if ((long long)(now - last) > min_intvl)
+		goto pass;
+	atomic_inc(&edev->bursts);
+	return 0;
+pass:
+	return 1;
+}
+
+static u64 herr_record_next_id(void)
+{
+	static atomic64_t seq = ATOMIC64_INIT(0);
+
+	if (!atomic64_read(&seq))
+		atomic64_set(&seq, (u64)get_seconds() << 32);
+
+	return atomic64_inc_return(&seq);
+}
+
+void herr_record_init(struct herr_record *ercd)
+{
+	ercd->flags = 0;
+	ercd->rev = HERR_RCD_REV1_0;
+	ercd->id = herr_record_next_id();
+	ercd->timestamp = trace_clock_local();
+}
+EXPORT_SYMBOL_GPL(herr_record_init);
+
+struct herr_record *herr_record_alloc(unsigned int len, struct herr_dev *edev,
+				      unsigned int flags)
+{
+	struct ll_pool *pool;
+	struct herr_node *enode;
+	struct herr_record *ercd = NULL;
+
+	preempt_disable();
+	if (!(flags & HERR_ALLOC_NO_BURST_CONTROL)) {
+		if (!herr_burst_control(edev)) {
+			preempt_enable_no_resched();
+			return NULL;
+		}
+	}
+
+	pool = __get_cpu_var(herr_ll_pool);
+	enode = ll_pool_alloc(pool, HERR_NODE_LEN(len));
+	if (enode) {
+		ercd = &enode->ercd;
+		herr_record_init(ercd);
+		ercd->length = len;
+
+		atomic64_set(&edev->timestamp, trace_clock_local());
+		atomic_inc(&edev->logs);
+	} else
+		atomic_inc(&edev->overflows);
+	preempt_enable_no_resched();
+
+	return ercd;
+}
+EXPORT_SYMBOL_GPL(herr_record_alloc);
+
+int herr_record_report(struct herr_record *ercd, struct herr_dev *edev)
+{
+	struct herr_rcd_lists *lists;
+	struct herr_node *enode;
+
+	preempt_disable();
+	lists = this_cpu_ptr(&herr_rcd_lists);
+	enode = container_of(ercd, struct herr_node, ercd);
+	llist_add(&enode->llist, lists->write);
+	preempt_enable_no_resched();
+
+	set_bit(HERR_NOTIFY_BIT, &herr_flags);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(herr_record_report);
+
+void herr_record_free(struct herr_record *ercd)
+{
+	struct herr_node *enode;
+
+	enode = container_of(ercd, struct herr_node, ercd);
+	ll_pool_free(enode, HERR_NODE_LEN(enode->ercd.length));
+}
+EXPORT_SYMBOL_GPL(herr_record_free);
+
+/*
+ * The low 16 bit is freeze count, high 16 bit is thaw count. If they
+ * are not equal, someone is freezing the reader
+ */
+static u32 herr_freeze_thaw;
+
+/*
+ * Stop the reader to consume error records, so that the error records
+ * can be checked in kernel space safely.
+ */
+static void herr_freeze_reader(void)
+{
+	u32 old, new;
+
+	do {
+		new = old = herr_freeze_thaw;
+		new = ((new + 1) & 0xffff) | (old & 0xffff0000);
+	} while (cmpxchg(&herr_freeze_thaw, old, new) != old);
+}
+
+static void herr_thaw_reader(void)
+{
+	u32 old, new;
+
+	do {
+		old = herr_freeze_thaw;
+		new = old + 0x10000;
+	} while (cmpxchg(&herr_freeze_thaw, old, new) != old);
+}
+
+static int herr_reader_is_frozen(void)
+{
+	u32 freeze_thaw = herr_freeze_thaw;
+	return (freeze_thaw & 0xffff) != (freeze_thaw >> 16);
+}
+
+int herr_for_each_record(herr_traverse_func_t func, void *data)
+{
+	int i, cpu, rc = 0;
+	struct herr_rcd_lists *lists;
+	struct herr_node *enode;
+
+	preempt_disable();
+	herr_freeze_reader();
+	for_each_possible_cpu(cpu) {
+		lists = per_cpu_ptr(&herr_rcd_lists, cpu);
+		for (i = 0; i < HERR_RCD_LIST_NUM; i++) {
+			llist_for_each_entry(enode, &lists->heads[i], llist) {
+				rc = func(&enode->ercd, data);
+				if (rc)
+					goto out;
+			}
+		}
+	}
+out:
+	herr_thaw_reader();
+	preempt_enable_no_resched();
+	return rc;
+}
+EXPORT_SYMBOL_GPL(herr_for_each_record);
+
+static ssize_t herr_rcd_lists_read(char __user *ubuf, size_t usize,
+				   struct mutex *read_mutex)
+{
+	int cpu, rc = 0, read;
+	struct herr_rcd_lists *lists;
+	ssize_t len, rsize = 0;
+	struct herr_node *enode;
+	struct llist_head *old_read, *to_read;
+
+	do {
+		read = 0;
+		for_each_possible_cpu(cpu) {
+			lists = per_cpu_ptr(&herr_rcd_lists, cpu);
+			if (llist_empty(lists->read)) {
+				if (llist_empty(lists->write))
+					continue;
+				/*
+				 * Error records are output in batch, so old
+				 * error records can be output before new ones.
+				 */
+				old_read = lists->read;
+				lists->read = lists->write;
+				lists->write = old_read;
+			}
+			rc = rsize ? 0 : -EBUSY;
+			if (herr_reader_is_frozen())
+				goto out;
+			to_read = llist_del_first(lists->read);
+			if (herr_reader_is_frozen())
+				goto out_readd;
+			enode = llist_entry(to_read, struct herr_node, llist);
+			len = enode->ercd.length;
+			rc = rsize ? 0 : -EINVAL;
+			if (len > usize - rsize)
+				goto out_readd;
+			rc = -EFAULT;
+			if (copy_to_user(ubuf + rsize, &enode->ercd, len))
+				goto out_readd;
+			ll_pool_free(enode, HERR_NODE_LEN(len));
+			rsize += len;
+			read = 1;
+		}
+		if (need_resched()) {
+			mutex_unlock(read_mutex);
+			cond_resched();
+			mutex_lock(read_mutex);
+		}
+	} while (read);
+	rc = 0;
+out:
+	return rc ? rc : rsize;
+out_readd:
+	llist_add(to_read, lists->read);
+	goto out;
+}
+
+static int herr_rcd_lists_is_empty(void)
+{
+	int cpu, i;
+	struct herr_rcd_lists *lists;
+
+	for_each_possible_cpu(cpu) {
+		lists = per_cpu_ptr(&herr_rcd_lists, cpu);
+		for (i = 0; i < HERR_RCD_LIST_NUM; i++) {
+			if (!llist_empty(&lists->heads[i]))
+				return 0;
+		}
+	}
+	return 1;
+}
+
+
+/*
+ * Hardware Error Reporting Device Management
+ */
+
+static ssize_t herr_dev_name_show(struct device *device,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	struct herr_dev *dev = to_herr_dev(device);
+	return sprintf(buf, "%s\n", dev->name);
+}
+
+static struct device_attribute herr_dev_attr_name =
+	__ATTR(name, 0400, herr_dev_name_show, NULL);
+
+#define HERR_DEV_COUNTER_ATTR(_name)					\
+	static ssize_t herr_dev_##_name##_show(struct device *device,	\
+					       struct device_attribute *attr, \
+					       char *buf)		\
+	{								\
+		struct herr_dev *dev = to_herr_dev(device);		\
+		int counter;						\
+									\
+		counter = atomic_read(&dev->_name);			\
+		return sprintf(buf, "%d\n", counter);			\
+	}								\
+	static ssize_t herr_dev_##_name##_store(struct device *device,	\
+						struct device_attribute *attr, \
+						const char *buf,	\
+						size_t count)		\
+	{								\
+		struct herr_dev *dev = to_herr_dev(device);		\
+									\
+		atomic_set(&dev->_name, 0);				\
+		return count;						\
+	}								\
+	static struct device_attribute herr_dev_attr_##_name =		\
+		__ATTR(_name, 0600, herr_dev_##_name##_show,		\
+		       herr_dev_##_name##_store)
+
+HERR_DEV_COUNTER_ATTR(logs);
+HERR_DEV_COUNTER_ATTR(overflows);
+HERR_DEV_COUNTER_ATTR(bursts);
+
+static struct attribute *herr_dev_attrs[] = {
+	&herr_dev_attr_name.attr,
+	&herr_dev_attr_logs.attr,
+	&herr_dev_attr_overflows.attr,
+	&herr_dev_attr_bursts.attr,
+	NULL,
+};
+
+static struct attribute_group herr_dev_attr_group = {
+	.attrs	= herr_dev_attrs,
+};
+
+static const struct attribute_group *herr_dev_attr_groups[] = {
+	&herr_dev_attr_group,
+	NULL,
+};
+
+static void herr_dev_release(struct device *device)
+{
+	struct herr_dev *dev = to_herr_dev(device);
+
+	kfree(dev);
+}
+
+static struct device_type herr_dev_type = {
+	.groups		= herr_dev_attr_groups,
+	.release	= herr_dev_release,
+};
+
+static char *herr_devnode(struct device *dev, mode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "error/%s", dev_name(dev));
+}
+
+struct class herr_class = {
+	.name		= "error",
+	.devnode	= herr_devnode,
+};
+EXPORT_SYMBOL_GPL(herr_class);
+
+struct herr_dev *herr_dev_alloc(void)
+{
+	struct herr_dev *dev;
+
+	dev = kzalloc(sizeof(struct herr_dev), GFP_KERNEL);
+	if (!dev)
+		return NULL;
+	dev->dev.type = &herr_dev_type;
+	dev->dev.class = &herr_class;
+	device_initialize(&dev->dev);
+	atomic_set(&dev->logs, 0);
+	atomic_set(&dev->overflows, 0);
+	atomic_set(&dev->bursts, 0);
+	atomic64_set(&dev->timestamp, 0);
+
+	return dev;
+}
+EXPORT_SYMBOL_GPL(herr_dev_alloc);
+
+void herr_dev_free(struct herr_dev *dev)
+{
+	if (dev)
+		herr_dev_put(dev);
+}
+EXPORT_SYMBOL_GPL(herr_dev_free);
+
+int herr_dev_register(struct herr_dev *dev)
+{
+	static atomic_t herr_no = ATOMIC_INIT(0);
+	const char *path;
+	int rc;
+
+	dev_set_name(&dev->dev, "error%d", atomic_inc_return(&herr_no) - 1);
+
+	rc = device_add(&dev->dev);
+	if (rc)
+		goto err;
+
+	path = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
+	pr_info("error: %s as %s\n", dev->name ? dev->name : "Unspecified device",
+		path ? path : "N/A");
+	kfree(path);
+
+	return 0;
+err:
+	return rc;
+}
+EXPORT_SYMBOL_GPL(herr_dev_register);
+
+void herr_dev_unregister(struct herr_dev *dev)
+{
+	device_unregister(&dev->dev);
+}
+EXPORT_SYMBOL_GPL(herr_dev_unregister);
+
+
+/*
+ * Hardware Error Mix Reporting Device
+ */
+
+static int herr_major;
+static DECLARE_WAIT_QUEUE_HEAD(herr_mix_wait);
+
+void herr_notify(void)
+{
+	if (test_and_clear_bit(HERR_NOTIFY_BIT, &herr_flags))
+		wake_up_interruptible(&herr_mix_wait);
+}
+EXPORT_SYMBOL_GPL(herr_notify);
+
+static ssize_t herr_mix_read(struct file *filp, char __user *ubuf,
+			     size_t usize, loff_t *off)
+{
+	int rc;
+	static DEFINE_MUTEX(read_mutex);
+
+	if (*off != 0)
+		return -EINVAL;
+
+	rc = mutex_lock_interruptible(&read_mutex);
+	if (rc)
+		return rc;
+	rc = herr_rcd_lists_read(ubuf, usize, &read_mutex);
+	mutex_unlock(&read_mutex);
+
+	return rc;
+}
+
+static unsigned int herr_mix_poll(struct file *file, poll_table *wait)
+{
+	poll_wait(file, &herr_mix_wait, wait);
+	if (!herr_rcd_lists_is_empty())
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
+static const struct file_operations herr_mix_dev_fops = {
+	.owner		= THIS_MODULE,
+	.read		= herr_mix_read,
+	.poll		= herr_mix_poll,
+};
+
+static int __init herr_mix_dev_init(void)
+{
+	struct device *dev;
+	dev_t devt;
+
+	devt = MKDEV(herr_major, 0);
+	dev = device_create(&herr_class, NULL, devt, NULL, "error");
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	return 0;
+}
+device_initcall(herr_mix_dev_init);
+
+static int __init herr_core_init(void)
+{
+	int rc;
+
+	BUILD_BUG_ON(sizeof(struct herr_node) % HERR_MIN_ALIGN);
+	BUILD_BUG_ON(sizeof(struct herr_record) % HERR_MIN_ALIGN);
+	BUILD_BUG_ON(sizeof(struct herr_section) % HERR_MIN_ALIGN);
+
+	herr_rcd_lists_init();
+
+	rc = herr_pool_init();
+	if (rc)
+		goto err;
+
+	rc = class_register(&herr_class);
+	if (rc)
+		goto err_free_pool;
+
+	rc = herr_major = register_chrdev(0, "error", &herr_mix_dev_fops);
+	if (rc < 0)
+		goto err_free_class;
+
+	return 0;
+err_free_class:
+	class_unregister(&herr_class);
+err_free_pool:
+	herr_pool_fini();
+err:
+	return rc;
+}
+/* Initialize data structure used by device driver, so subsys_initcall */
+subsys_initcall(herr_core_init);
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -140,6 +140,7 @@ header-y += gigaset_dev.h
 header-y += hdlc.h
 header-y += hdlcdrv.h
 header-y += hdreg.h
+header-y += herror_record.h
 header-y += hid.h
 header-y += hiddev.h
 header-y += hidraw.h
--- /dev/null
+++ b/include/linux/herror.h
@@ -0,0 +1,69 @@
+#ifndef LINUX_HERROR_H
+#define LINUX_HERROR_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/device.h>
+#include <linux/herror_record.h>
+
+/*
+ * Hardware error reporting
+ */
+
+#define HERR_ALLOC_NO_BURST_CONTROL	0x0001
+
+struct herr_dev;
+
+/* allocate a herr_record lock-lessly */
+struct herr_record *herr_record_alloc(unsigned int len,
+				      struct herr_dev *edev,
+				      unsigned int flags);
+void herr_record_init(struct herr_record *ercd);
+/* report error via error record */
+int herr_record_report(struct herr_record *ercd, struct herr_dev *edev);
+/* free the herr_record allocated before */
+void herr_record_free(struct herr_record *ercd);
+/*
+ * Notify waited user space hardware error daemon for the new error
+ * record, can not be used in NMI context
+ */
+void herr_notify(void);
+
+/* Traverse all error records not consumed by user space */
+typedef int (*herr_traverse_func_t)(struct herr_record *ercd, void *data);
+int herr_for_each_record(herr_traverse_func_t func, void *data);
+
+
+/*
+ * Hardware Error Reporting Device Management
+ */
+extern struct class herr_class;
+
+struct herr_dev {
+	const char *name;
+	atomic_t overflows;
+	atomic_t logs;
+	atomic_t bursts;
+	struct device dev;
+	struct list_head list;
+	atomic64_t timestamp;
+};
+#define to_herr_dev(d)	container_of(d, struct herr_dev, dev)
+
+struct herr_dev *herr_dev_alloc(void);
+void herr_dev_free(struct herr_dev *dev);
+
+static inline struct herr_dev *herr_dev_get(struct herr_dev *dev)
+{
+	return dev ? to_herr_dev(get_device(&dev->dev)) : NULL;
+}
+
+static inline void herr_dev_put(struct herr_dev *dev)
+{
+	if (dev)
+		put_device(&dev->dev);
+}
+
+int herr_dev_register(struct herr_dev *dev);
+void herr_dev_unregister(struct herr_dev *dev);
+#endif
--- /dev/null
+++ b/include/linux/herror_record.h
@@ -0,0 +1,100 @@
+#ifndef LINUX_HERROR_RECORD_H
+#define LINUX_HERROR_RECORD_H
+
+#include <linux/types.h>
+
+/*
+ * Hardware Error Record Definition
+ */
+enum herr_severity {
+	HERR_SEV_NONE,
+	HERR_SEV_CORRECTED,
+	HERR_SEV_RECOVERABLE,
+	HERR_SEV_FATAL,
+};
+
+#define HERR_RCD_REV1_0		0x0100
+#define HERR_MIN_ALIGN_ORDER	3
+#define HERR_MIN_ALIGN		(1 << HERR_MIN_ALIGN_ORDER)
+
+enum herr_record_flags {
+	HERR_RCD_PREV		= 0x0001, /* record is for previous boot */
+	HERR_RCD_PERSIST	= 0x0002, /* record is from flash, need to be
+					   * cleared after writing to disk */
+};
+
+/*
+ * sizeof(struct herr_record) and sizeof(struct herr_section) should
+ * be multiple of HERR_MIN_ALIGN to make error record packing easier.
+ */
+struct herr_record {
+	__u16	length;
+	__u16	flags;
+	__u16	rev;
+	__u8	severity;
+	__u8	pad1;
+	__u64	id;
+	__u64	timestamp;
+	__u8	data[0];
+};
+
+/* Section type ID are allocated here */
+enum herr_section_type_id {
+	/* 0x0 - 0xff are reserved by core */
+	/* 0x100 - 0x1ff are allocated to CPER */
+	HERR_TYPE_CPER		= 0x0100,
+	HERR_TYPE_GESR		= 0x0110, /* acpi_hest_generic_status */
+	/* 0x200 - 0x2ff are allocated to PCI/PCIe subsystem */
+	HERR_TYPE_PCIE_AER	= 0x0200,
+};
+
+struct herr_section {
+	__u16	length;
+	__u16	flags;
+	__u32	type;
+	__u8	data[0];
+};
+
+#define herr_record_for_each_section(ercd, esec)		\
+	for ((esec) = (struct herr_section *)(ercd)->data;	\
+	     (void *)(esec) - (void *)(ercd) < (ercd)->length;	\
+	     (esec) = (void *)(esec) + (esec)->length)
+
+#define HERR_SEC_LEN_ROUND(len)						\
+	(((len) + HERR_MIN_ALIGN - 1) & ~(HERR_MIN_ALIGN - 1))
+#define HERR_SEC_LEN(type)						\
+	(sizeof(struct herr_section) + HERR_SEC_LEN_ROUND(sizeof(type)))
+
+#define HERR_RECORD_LEN_ROUND1(sec_len1)				\
+	(sizeof(struct herr_record) + HERR_SEC_LEN_ROUND(sec_len1))
+#define HERR_RECORD_LEN_ROUND2(sec_len1, sec_len2)			\
+	(sizeof(struct herr_record) + HERR_SEC_LEN_ROUND(sec_len1) +	\
+	 HERR_SEC_LEN_ROUND(sec_len2))
+#define HERR_RECORD_LEN_ROUND3(sec_len1, sec_len2, sec_len3)		\
+	(sizeof(struct herr_record) + HERR_SEC_LEN_ROUND(sec_len1) +	\
+	 HERR_SEC_LEN_ROUND(sec_len2) + HERR_SEC_LEN_ROUND(sec_len3))
+
+#define HERR_RECORD_LEN1(sec_type1)				\
+	(sizeof(struct herr_record) + HERR_SEC_LEN(sec_type1))
+#define HERR_RECORD_LEN2(sec_type1, sec_type2)			\
+	(sizeof(struct herr_record) + HERR_SEC_LEN(sec_type1) + \
+	 HERR_SEC_LEN(sec_type2))
+#define HERR_RECORD_LEN3(sec_type1, sec_type2, sec_type3)	\
+	(sizeof(struct herr_record) + HERR_SEC_LEN(sec_type1) + \
+	 HERR_SEC_LEN(sec_type2) + HERR_SEC_LEN(sec_type3))
+
+static inline struct herr_section *herr_first_sec(struct herr_record *ercd)
+{
+	return (struct herr_section *)(ercd + 1);
+}
+
+static inline struct herr_section *herr_next_sec(struct herr_section *esrc)
+{
+	return (void *)esrc + esrc->length;
+}
+
+static inline void *herr_sec_data(struct herr_section *esec)
+{
+	return (void *)(esec + 1);
+}
+#endif

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH -v2 6/9] Hardware error record persistent support
  2010-10-25  7:43 [PATCH -v2 0/9] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (4 preceding siblings ...)
  2010-10-25  7:43 ` [PATCH -v2 5/9] Hardware error device core Huang Ying
@ 2010-10-25  7:43 ` Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 7/9] ACPI, APEI, Use ERST for hardware error persisting before panic Huang Ying
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 48+ messages in thread
From: Huang Ying @ 2010-10-25  7:43 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Normally, corrected hardware error records will go through the kernel
processing and be logged to disk or network finally.  But for
uncorrected errors, system may go panic directly for better error
containment, disk or network is not usable in this half-working
system.  To avoid losing these valuable hardware error records, the
error records are saved into some kind of simple persistent storage
such as flash before panic, so that they can be read out after system
reboot successfully.

Different kind of simple persistent storage implementation mechanisms
are provided on different platforms, so an abstract interface for
persistent storage is defined.  Different implementations of the
interface can be registered.

This patch is designed by Andi Kleen and Huang Ying.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/char/herror/Makefile        |    2 
 drivers/char/herror/herr-core.c     |   39 +++++++-
 drivers/char/herror/herr-internal.h |   12 ++
 drivers/char/herror/herr-persist.c  |  174 ++++++++++++++++++++++++++++++++++++
 include/linux/Kbuild                |    1 
 include/linux/herror.h              |   48 +++++++++
 6 files changed, 271 insertions(+), 5 deletions(-)
 create mode 100644 drivers/char/herror/herr-internal.h
 create mode 100644 drivers/char/herror/herr-persist.c

--- a/drivers/char/herror/Makefile
+++ b/drivers/char/herror/Makefile
@@ -1 +1 @@
-obj-y				+= herr-core.o
+obj-y				+= herr-core.o herr-persist.o
--- a/drivers/char/herror/herr-core.c
+++ b/drivers/char/herror/herr-core.c
@@ -43,9 +43,9 @@
 #include <linux/llalloc.h>
 #include <linux/herror.h>
 
-#define HERR_NOTIFY_BIT			0
+#include "herr-internal.h"
 
-static unsigned long herr_flags;
+unsigned long herr_flags;
 
 /*
  * Record list management and error reporting
@@ -524,6 +524,7 @@ static ssize_t herr_mix_read(struct file
 {
 	int rc;
 	static DEFINE_MUTEX(read_mutex);
+	u64 record_id;
 
 	if (*off != 0)
 		return -EINVAL;
@@ -531,7 +532,14 @@ static ssize_t herr_mix_read(struct file
 	rc = mutex_lock_interruptible(&read_mutex);
 	if (rc)
 		return rc;
+	rc = herr_persist_peek_user(&record_id, ubuf, usize);
+	if (rc > 0) {
+		herr_persist_clear(record_id);
+		goto out;
+	}
+
 	rc = herr_rcd_lists_read(ubuf, usize, &read_mutex);
+out:
 	mutex_unlock(&read_mutex);
 
 	return rc;
@@ -540,15 +548,40 @@ static ssize_t herr_mix_read(struct file
 static unsigned int herr_mix_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &herr_mix_wait, wait);
-	if (!herr_rcd_lists_is_empty())
+	if (!herr_rcd_lists_is_empty() || !herr_persist_read_done())
 		return POLLIN | POLLRDNORM;
 	return 0;
 }
 
+static long herr_mix_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+	void __user *p = (void __user *)arg;
+	int rc;
+	u64 record_id;
+	struct herr_persist_buffer buf;
+
+	switch (cmd) {
+	case HERR_PERSIST_PEEK:
+		rc = copy_from_user(&buf, p, sizeof(buf));
+		if (rc)
+			return -EFAULT;
+		return herr_persist_peek_user(&record_id, buf.buf,
+					      buf.buf_size);
+	case HERR_PERSIST_CLEAR:
+		rc = copy_from_user(&record_id, p, sizeof(record_id));
+		if (rc)
+			return -EFAULT;
+		return herr_persist_clear(record_id);
+	default:
+		return -ENOTTY;
+	}
+}
+
 static const struct file_operations herr_mix_dev_fops = {
 	.owner		= THIS_MODULE,
 	.read		= herr_mix_read,
 	.poll		= herr_mix_poll,
+	.unlocked_ioctl	= herr_mix_ioctl,
 };
 
 static int __init herr_mix_dev_init(void)
--- /dev/null
+++ b/drivers/char/herror/herr-internal.h
@@ -0,0 +1,12 @@
+#ifndef HERR_INTERNAL_H
+#define HERR_INTERNAL_H
+
+#define HERR_NOTIFY_BIT			0
+
+extern unsigned long herr_flags;
+
+int herr_persist_read_done(void);
+ssize_t herr_persist_peek_user(u64 *record_id, char __user *ercd,
+			       size_t bufsiz);
+int herr_persist_clear(u64 record_id);
+#endif /* HERR_INTERNAL_H */
--- /dev/null
+++ b/drivers/char/herror/herr-persist.c
@@ -0,0 +1,174 @@
+/*
+ * Hardware error record persistent support
+ *
+ * Normally, corrected hardware error records will go through the
+ * kernel processing and be logged to disk or network finally.  But
+ * for uncorrected errors, system may go panic directly for better
+ * error containment, disk or network is not usable in this
+ * half-working system.  To avoid losing these valuable hardware error
+ * records, the error records are saved into some kind of simple
+ * persistent storage such as flash before panic, so that they can be
+ * read out after system reboot successfully.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rculist.h>
+#include <linux/mutex.h>
+
+#include <linux/herror.h>
+
+#include "herr-internal.h"
+
+/*
+ * Simple persistent storage provider list, herr_persists_mutex is
+ * used for writer side mutual exclusion, RCU is used to implement
+ * lock-less reader side.
+ */
+static LIST_HEAD(herr_persists);
+static DEFINE_MUTEX(herr_persists_mutex);
+
+int herr_persist_register(struct herr_persist *persist)
+{
+	if (!persist->peek_user)
+		return -EINVAL;
+	persist->read_done = 0;
+	if (mutex_lock_interruptible(&herr_persists_mutex))
+		return -EINTR;
+	list_add_rcu(&persist->list, &herr_persists);
+	mutex_unlock(&herr_persists_mutex);
+	/*
+	 * There may be hardware error records of previous boot in
+	 * persistent storage, notify the user space error daemon to
+	 * check.
+	 */
+	set_bit(HERR_NOTIFY_BIT, &herr_flags);
+	herr_notify();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(herr_persist_register);
+
+void herr_persist_unregister(struct herr_persist *persist)
+{
+	mutex_lock(&herr_persists_mutex);
+	list_del_rcu(&persist->list);
+	mutex_unlock(&herr_persists_mutex);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(herr_persist_unregister);
+
+/* Can be used in atomic context including NMI */
+int herr_persist_in(const struct herr_record *ercd)
+{
+	struct herr_persist *persist;
+	int rc = -ENODEV;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(persist, &herr_persists, list) {
+		if (!persist->in)
+			continue;
+		rc = persist->in(ercd);
+		if (!rc)
+			break;
+	}
+	rcu_read_unlock();
+	return rc;
+}
+EXPORT_SYMBOL_GPL(herr_persist_in);
+
+int herr_persist_read_done(void)
+{
+	struct herr_persist *persist;
+	int rc = 1;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(persist, &herr_persists, list) {
+		if (!persist->read_done) {
+			rc = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return rc;
+}
+
+/* Read next error record from persist storage, don't remove it */
+ssize_t herr_persist_peek_user(u64 *record_id, char __user *ercd,
+			       size_t bufsiz)
+{
+	struct herr_persist *persist;
+	ssize_t rc = 0;
+
+	if (mutex_lock_interruptible(&herr_persists_mutex))
+		return -EINTR;
+	list_for_each_entry(persist, &herr_persists, list) {
+		if (persist->read_done)
+			continue;
+		rc = persist->peek_user(record_id, ercd, bufsiz);
+		if (rc > 0)
+			break;
+		else if (rc != -EINTR && rc != -EAGAIN && rc != -EINVAL)
+			persist->read_done = 1;
+	}
+	mutex_unlock(&herr_persists_mutex);
+	return rc;
+}
+
+/* Clear specified error record from persist storage */
+int herr_persist_clear(u64 record_id)
+{
+	struct herr_persist *persist;
+	int rc = -ENOENT;
+
+	if (mutex_lock_interruptible(&herr_persists_mutex))
+		return -EINTR;
+	list_for_each_entry(persist, &herr_persists, list) {
+		if (!persist->clear)
+			continue;
+		rc = persist->clear(record_id);
+		if (!rc)
+			break;
+		/*
+		 * Failed to clear, mark as read_done, because we can
+		 * not skip this one
+		 */
+		else if (rc != -EINTR && rc != -EAGAIN && rc != -ENOENT)
+			persist->read_done = 1;
+	}
+	mutex_unlock(&herr_persists_mutex);
+	return rc;
+}
+
+static int herr_persist_record(struct herr_record *ercd, void *data)
+{
+	int *severity = data;
+
+	if (ercd->severity == *severity)
+		return herr_persist_in(ercd);
+	return 0;
+}
+
+void herr_persist_all_records(void)
+{
+	int severity;
+
+	for (severity = HERR_SEV_FATAL; severity >= HERR_SEV_NONE; severity--)
+		herr_for_each_record(herr_persist_record, &severity);
+}
+EXPORT_SYMBOL_GPL(herr_persist_all_records);
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -141,6 +141,7 @@ header-y += hdlc.h
 header-y += hdlcdrv.h
 header-y += hdreg.h
 header-y += herror_record.h
+header-y += herror.h
 header-y += hid.h
 header-y += hiddev.h
 header-y += hidraw.h
--- a/include/linux/herror.h
+++ b/include/linux/herror.h
@@ -1,10 +1,22 @@
 #ifndef LINUX_HERROR_H
 #define LINUX_HERROR_H
 
+#include <linux/ioctl.h>
+#include <linux/herror_record.h>
+
+struct herr_persist_buffer {
+	void __user *buf;
+	unsigned int buf_size;
+};
+
+#define HERR_PERSIST_PEEK	_IOW('H', 1, struct herr_persist_buffer)
+#define HERR_PERSIST_CLEAR	_IOW('H', 2, u64)
+
+#ifdef __KERNEL__
+
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/device.h>
-#include <linux/herror_record.h>
 
 /*
  * Hardware error reporting
@@ -66,4 +78,38 @@ static inline void herr_dev_put(struct h
 
 int herr_dev_register(struct herr_dev *dev);
 void herr_dev_unregister(struct herr_dev *dev);
+
+
+/*
+ * Simple Persistent Storage
+ */
+
+struct herr_persist;
+/* Put an error record into simple persistent storage */
+int herr_persist_in(const struct herr_record *ercd);
+/* Save all error records not yet consumed in persistent storage */
+void herr_persist_all_records(void);
+
+/*
+ * Simple Persistent Storage Provider Management
+ */
+struct herr_persist {
+	struct list_head list;
+	char *name;
+	unsigned int read_done:1;
+	/* Put an error record into storage, must be NMI-safe */
+	int (*in)(const struct herr_record *ercd);
+	/*
+	 * Read out an error record from storage to user space, don't
+	 * remove it, the HERR_RCD_PERSIST must be set in record flags
+	 */
+	ssize_t (*peek_user)(u64 *record_id, char __user *ubuf, size_t usize);
+	/* Clear an error record */
+	int (*clear)(u64 record_id);
+};
+
+/* Register (un-register) simple persistent storage provider */
+int herr_persist_register(struct herr_persist *persist);
+void herr_persist_unregister(struct herr_persist *persist);
+#endif
 #endif

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH -v2 7/9] ACPI, APEI, Use ERST for hardware error persisting before panic
  2010-10-25  7:43 [PATCH -v2 0/9] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (5 preceding siblings ...)
  2010-10-25  7:43 ` [PATCH -v2 6/9] Hardware error record persistent support Huang Ying
@ 2010-10-25  7:43 ` Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 8/9] ACPI, APEI, Report GHES error record with hardware error device core Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support Huang Ying
  8 siblings, 0 replies; 48+ messages in thread
From: Huang Ying @ 2010-10-25  7:43 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Corrected/Recoverable hardware errors can usually be saved into disk
and/or sent to network for logging.  But if uncorrected/fatal hardware
errors occur, system may not reliable enough for desk and/or network
accessing and needs to go panic as soon as possible for error
containment.  In this situation, ERST can be used to save the valuable
error records into some persistent storage such as flash.

This patch implements herr_persist interface using ERST.  So that
hardware error device core can use ERST to save hardware error record
before panic.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/acpi/apei/Kconfig |    1 
 drivers/acpi/apei/cper.c  |   19 +++++
 drivers/acpi/apei/erst.c  |  169 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/cper.h      |    1 
 4 files changed, 190 insertions(+)

--- a/drivers/acpi/apei/cper.c
+++ b/drivers/acpi/apei/cper.c
@@ -29,6 +29,25 @@
 #include <linux/time.h>
 #include <linux/cper.h>
 #include <linux/acpi.h>
+#include <linux/herror_record.h>
+
+int herr_severity_to_cper(int herr_severity)
+{
+	switch (herr_severity) {
+	case HERR_SEV_NONE:
+		return CPER_SEV_INFORMATIONAL;
+	case HERR_SEV_CORRECTED:
+		return CPER_SEV_CORRECTED;
+	case HERR_SEV_RECOVERABLE:
+		return CPER_SEV_RECOVERABLE;
+	case HERR_SEV_FATAL:
+		return CPER_SEV_FATAL;
+	default:
+		BUG();
+		return CPER_SEV_FATAL;
+	}
+}
+EXPORT_SYMBOL_GPL(herr_severity_to_cper);
 
 /*
  * CPER record ID need to be unique even after reboot, because record
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -33,6 +33,7 @@
 #include <linux/uaccess.h>
 #include <linux/cper.h>
 #include <linux/nmi.h>
+#include <linux/herror.h>
 #include <linux/hardirq.h>
 #include <acpi/apei.h>
 
@@ -88,6 +89,12 @@ static struct erst_erange {
  */
 static DEFINE_SPINLOCK(erst_lock);
 
+static void *erst_buf;
+static unsigned int erst_buf_len;
+
+/* Prevent erst_buf from being accessed simultaneously */
+static DEFINE_MUTEX(erst_buf_mutex);
+
 static inline int erst_errno(int command_status)
 {
 	switch (command_status) {
@@ -774,6 +781,12 @@ static int __erst_write_to_nvram(const s
 	return -ENOSYS;
 }
 
+static int __erst_write_herr_record_to_nvram(const struct herr_record *ercd)
+{
+	/* do not print message, because printk is not safe for NMI */
+	return -ENOSYS;
+}
+
 static int __erst_read_to_erange_from_nvram(u64 record_id, u64 *offset)
 {
 	pr_unimpl_nvram();
@@ -910,6 +923,156 @@ out:
 }
 EXPORT_SYMBOL_GPL(erst_clear);
 
+#define CPER_CREATOR_ERST						\
+	UUID_LE(0xEACBBA0C, 0x803A, 0x4096, 0xB1, 0x1D, 0xC3, 0xC7,	\
+		0x6E, 0xE7, 0x94, 0xF9)
+
+#define CPER_SEC_HERR_RECORD						\
+	UUID_LE(0x633AB656, 0x6703, 0x11DF, 0x87, 0xCF, 0x00, 0x19,	\
+		0xD1, 0x2A, 0x29, 0xEF)
+
+static ssize_t erst_herr_record_to_cper(struct cper_record_header *crcd,
+					size_t buf_size,
+					const struct herr_record *ercd)
+{
+	struct cper_section_descriptor *csec;
+	unsigned int crcd_len;
+	void *csec_data;
+
+	crcd_len = sizeof(*crcd) + sizeof(*csec) + ercd->length;
+	if (crcd_len > buf_size)
+		return crcd_len;
+
+	memset(crcd, 0, crcd_len);
+	memcpy(crcd->signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+	crcd->revision = CPER_RECORD_REV;
+	crcd->signature_end = CPER_SIG_END;
+	crcd->error_severity = herr_severity_to_cper(ercd->severity);
+	/* timestamp, platform_id, partition_id is invalid */
+	crcd->validation_bits = 0;
+	crcd->creator_id = CPER_CREATOR_ERST;
+	crcd->section_count = 1;
+	crcd->record_length = crcd_len;
+	crcd->record_id = ercd->id;
+
+	csec = (struct cper_section_descriptor *)(crcd + 1);
+	csec_data = csec + 1;
+
+	csec->section_length = ercd->length;
+	csec->revision = CPER_SEC_REV;
+	csec->section_type = CPER_SEC_HERR_RECORD;
+	csec->section_severity = crcd->error_severity;
+	csec->section_offset = (void *)csec_data - (void *)crcd;
+
+	memcpy(csec_data, ercd, ercd->length);
+
+	return crcd_len;
+}
+
+static int erst_write_herr_record(const struct herr_record *ercd)
+{
+	struct cper_record_header *crcd;
+	ssize_t crcd_len;
+	unsigned long flags;
+	int rc;
+
+	if (!spin_trylock_irqsave(&erst_lock, flags))
+		return -EBUSY;
+
+	if (erst_erange.attr & ERST_RANGE_NVRAM) {
+		rc = __erst_write_herr_record_to_nvram(ercd);
+		goto out;
+	}
+
+	rc = -EINVAL;
+	crcd_len = erst_herr_record_to_cper(erst_erange.vaddr,
+					    erst_erange.size, ercd);
+	if (crcd_len > erst_erange.size)
+		goto out;
+	crcd = erst_erange.vaddr;
+	/* signature for serialization system */
+	memcpy(&crcd->persistence_information, "ER", 2);
+	rc = __erst_write_to_storage(0);
+out:
+	spin_unlock_irqrestore(&erst_lock, flags);
+
+	return rc;
+}
+
+static ssize_t erst_persist_peek_user(u64 *record_id, char __user *ubuf,
+				      size_t usize)
+{
+	int rc, pos;
+	ssize_t len, clen;
+	u64 id;
+	struct cper_record_header *crcd;
+	struct cper_section_descriptor *csec;
+	struct herr_record *ercd;
+
+	if (mutex_lock_interruptible(&erst_buf_mutex) != 0)
+		return -EINTR;
+	erst_get_record_id_begin(&pos);
+retry_next:
+	len = 0;
+	rc = erst_get_record_id_next(&pos, &id);
+	if (rc)
+		goto out;
+	/* no more record */
+	if (id == APEI_ERST_INVALID_RECORD_ID)
+		goto out;
+retry:
+	rc = clen = erst_read(id, erst_buf, erst_buf_len);
+	/* someone else has cleared the record, try next one */
+	if (rc == -ENOENT)
+		goto retry_next;
+	else if (rc < 0)
+		goto out;
+	else if (clen > erst_buf_len) {
+		void *p;
+		rc = -ENOMEM;
+		p = kmalloc(clen, GFP_KERNEL);
+		if (!p)
+			goto out;
+		kfree(erst_buf);
+		erst_buf = p;
+		erst_buf_len = clen;
+		goto retry;
+	}
+
+	crcd = erst_buf;
+	csec = (struct cper_section_descriptor *)(crcd + 1);
+	if (crcd->section_count != 1 ||
+	    uuid_le_cmp(crcd->creator_id, CPER_CREATOR_ERST) ||
+	    uuid_le_cmp(csec->section_type, CPER_SEC_HERR_RECORD))
+		goto retry_next;
+
+	ercd = (struct herr_record *)(csec + 1);
+	len = ercd->length;
+
+	rc = -EINVAL;
+	if (len > usize)
+		goto out;
+
+	ercd->flags |= HERR_RCD_PREV | HERR_RCD_PERSIST;
+
+	rc = -EFAULT;
+	if (copy_to_user(ubuf, ercd, len))
+		goto out;
+	*record_id = id;
+	rc = 0;
+out:
+	erst_get_record_id_end();
+	mutex_unlock(&erst_buf_mutex);
+	return rc ? rc : len;
+}
+
+static struct herr_persist erst_persist = {
+	.name		= "ERST",
+	.in		= erst_write_herr_record,
+	.peek_user	= erst_persist_peek_user,
+	.clear		= erst_clear,
+};
+
 static int __init setup_erst_disable(char *str)
 {
 	erst_disable = 1;
@@ -1007,11 +1170,17 @@ static int __init erst_init(void)
 	if (!erst_erange.vaddr)
 		goto err_release_erange;
 
+	rc = herr_persist_register(&erst_persist);
+	if (rc)
+		goto err_unmap_erange;
+
 	pr_info(ERST_PFX
 	"Error Record Serialization Table (ERST) support is initialized.\n");
 
 	return 0;
 
+err_unmap_erange:
+	iounmap(erst_erange.vaddr);
 err_release_erange:
 	release_mem_region(erst_erange.base, erst_erange.size);
 err_unmap_reg:
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -309,6 +309,7 @@ struct cper_sec_mem_err {
 /* Reset to default packing */
 #pragma pack()
 
+int herr_severity_to_cper(int herr_severity);
 u64 cper_next_record_id(void);
 
 #endif
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -1,6 +1,7 @@
 config ACPI_APEI
 	bool "ACPI Platform Error Interface (APEI)"
 	depends on X86
+	select HERR_DEV_CORE
 	help
 	  APEI allows to report errors (for example from the chipset)
 	  to the operating system. This improves NMI handling

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH -v2 8/9] ACPI, APEI, Report GHES error record with hardware error device core
  2010-10-25  7:43 [PATCH -v2 0/9] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (6 preceding siblings ...)
  2010-10-25  7:43 ` [PATCH -v2 7/9] ACPI, APEI, Use ERST for hardware error persisting before panic Huang Ying
@ 2010-10-25  7:43 ` Huang Ying
  2010-10-25  7:43 ` [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support Huang Ying
  8 siblings, 0 replies; 48+ messages in thread
From: Huang Ying @ 2010-10-25  7:43 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

One hardware error device (struct herr_dev) is created for each GHES
in GHES platform device "probe" function.  Then when GHES hardware
error handler is notified by firmware, the hardware error records will
be reported on the struct herr_dev.

In the previous GHES support, only corrected memory error can be
reported to user space via /dev/mcelog, now all kinds of hardware
errors notified with SCI can be reported.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/acpi/apei/cper.c |   18 +++++++
 drivers/acpi/apei/ghes.c |  119 +++++++++++++++++++++++++++++++----------------
 include/linux/cper.h     |    2 
 3 files changed, 99 insertions(+), 40 deletions(-)

--- a/drivers/acpi/apei/cper.c
+++ b/drivers/acpi/apei/cper.c
@@ -49,6 +49,24 @@ int herr_severity_to_cper(int herr_sever
 }
 EXPORT_SYMBOL_GPL(herr_severity_to_cper);
 
+int cper_severity_to_herr(int cper_severity)
+{
+	switch (cper_severity) {
+	case CPER_SEV_INFORMATIONAL:
+		return HERR_SEV_NONE;
+	case CPER_SEV_CORRECTED:
+		return HERR_SEV_CORRECTED;
+	case CPER_SEV_RECOVERABLE:
+		return HERR_SEV_RECOVERABLE;
+	case CPER_SEV_FATAL:
+		return HERR_SEV_FATAL;
+	default:
+		/* Unknown, default to fatal */
+		return HERR_SEV_FATAL;
+	}
+}
+EXPORT_SYMBOL_GPL(cper_severity_to_herr);
+
 /*
  * CPER record ID need to be unique even after reboot, because record
  * ID is used as index for ERST storage, while CPER records from
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -43,6 +43,7 @@
 #include <linux/kdebug.h>
 #include <linux/platform_device.h>
 #include <linux/mutex.h>
+#include <linux/herror.h>
 #include <acpi/apei.h>
 #include <acpi/atomicio.h>
 #include <acpi/hed.h>
@@ -74,6 +75,7 @@ struct ghes {
 	struct list_head list;
 	u64 buffer_paddr;
 	unsigned long flags;
+	struct herr_dev *herr_dev;
 };
 
 /*
@@ -238,9 +240,38 @@ static void ghes_clear_estatus(struct gh
 	ghes->flags &= ~GHES_TO_CLEAR;
 }
 
+static void ghes_report(struct ghes *ghes)
+{
+	struct herr_record *ercd;
+	struct herr_section *esec;
+	struct acpi_hest_generic_status *estatus;
+	unsigned int estatus_len, ercd_alloc_flags = 0;
+	int ghes_sev;
+
+	ghes_sev = ghes_severity(ghes->estatus->error_severity);
+	if (ghes_sev >= GHES_SEV_PANIC)
+		ercd_alloc_flags |= HERR_ALLOC_NO_BURST_CONTROL;
+	estatus_len = apei_estatus_len(ghes->estatus);
+	ercd = herr_record_alloc(HERR_RECORD_LEN_ROUND1(estatus_len),
+				 ghes->herr_dev, ercd_alloc_flags);
+	if (!ercd)
+		return;
+
+	ercd->severity = cper_severity_to_herr(ghes->estatus->error_severity);
+
+	esec = herr_first_sec(ercd);
+	esec->length = HERR_SEC_LEN_ROUND(estatus_len);
+	esec->flags = 0;
+	esec->type = HERR_TYPE_GESR;
+
+	estatus = herr_sec_data(esec);
+	memcpy(estatus, ghes->estatus, estatus_len);
+	herr_record_report(ercd, ghes->herr_dev);
+}
+
 static void ghes_do_proc(struct ghes *ghes)
 {
-	int sev, processed = 0;
+	int sev;
 	struct acpi_hest_generic_data *gdata;
 
 	sev = ghes_severity(ghes->estatus->error_severity);
@@ -251,15 +282,9 @@ static void ghes_do_proc(struct ghes *gh
 			apei_mce_report_mem_error(
 				sev == GHES_SEV_CORRECTED,
 				(struct cper_sec_mem_err *)(gdata+1));
-			processed = 1;
 		}
 #endif
 	}
-
-	if (!processed && printk_ratelimit())
-		pr_warning(GHES_PFX
-		"Unknown error record from generic hardware error source: %d\n",
-			   ghes->generic->header.source_id);
 }
 
 static int ghes_proc(struct ghes *ghes)
@@ -269,7 +294,9 @@ static int ghes_proc(struct ghes *ghes)
 	rc = ghes_read_estatus(ghes, 0);
 	if (rc)
 		goto out;
+	ghes_report(ghes);
 	ghes_do_proc(ghes);
+	herr_notify();
 
 out:
 	ghes_clear_estatus(ghes);
@@ -300,41 +327,15 @@ static int __devinit ghes_probe(struct p
 {
 	struct acpi_hest_generic *generic;
 	struct ghes *ghes = NULL;
-	int rc = -EINVAL;
+	int rc;
 
+	rc = -ENODEV;
 	generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data;
 	if (!generic->enabled)
-		return -ENODEV;
-
-	if (generic->error_block_length <
-	    sizeof(struct acpi_hest_generic_status)) {
-		pr_warning(FW_BUG GHES_PFX
-"Invalid error block length: %u for generic hardware error source: %d\n",
-			   generic->error_block_length,
-			   generic->header.source_id);
 		goto err;
-	}
-	if (generic->records_to_preallocate == 0) {
-		pr_warning(FW_BUG GHES_PFX
-"Invalid records to preallocate: %u for generic hardware error source: %d\n",
-			   generic->records_to_preallocate,
-			   generic->header.source_id);
-		goto err;
-	}
-	ghes = ghes_new(generic);
-	if (IS_ERR(ghes)) {
-		rc = PTR_ERR(ghes);
-		ghes = NULL;
-		goto err;
-	}
-	if (generic->notify.type == ACPI_HEST_NOTIFY_SCI) {
-		mutex_lock(&ghes_list_mutex);
-		if (list_empty(&ghes_sci))
-			register_acpi_hed_notifier(&ghes_notifier_sci);
-		list_add_rcu(&ghes->list, &ghes_sci);
-		mutex_unlock(&ghes_list_mutex);
-	} else {
-		unsigned char *notify = NULL;
+
+	if (generic->notify.type != ACPI_HEST_NOTIFY_SCI) {
+		char *notify = NULL;
 
 		switch (generic->notify.type) {
 		case ACPI_HEST_NOTIFY_POLLED:
@@ -357,9 +358,46 @@ static int __devinit ghes_probe(struct p
 "Unknown notification type: %u for generic hardware error source: %d\n",
 			generic->notify.type, generic->header.source_id);
 		}
-		rc = -ENODEV;
 		goto err;
 	}
+
+	rc = -EIO;
+	if (generic->error_block_length <
+	    sizeof(struct acpi_hest_generic_status)) {
+		pr_warning(FW_BUG GHES_PFX
+"Invalid error block length: %u for generic hardware error source: %d\n",
+			   generic->error_block_length,
+			   generic->header.source_id);
+		goto err;
+	}
+	ghes = ghes_new(generic);
+	if (IS_ERR(ghes)) {
+		rc = PTR_ERR(ghes);
+		ghes = NULL;
+		goto err;
+	}
+	rc = -ENOMEM;
+	ghes->herr_dev = herr_dev_alloc();
+	if (!ghes->herr_dev)
+		goto err;
+	ghes->herr_dev->name = dev_name(&ghes_dev->dev);
+	ghes->herr_dev->dev.parent = &ghes_dev->dev;
+	rc = herr_dev_register(ghes->herr_dev);
+	if (rc) {
+		herr_dev_free(ghes->herr_dev);
+		goto err;
+	}
+	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_SCI:
+		mutex_lock(&ghes_list_mutex);
+		if (list_empty(&ghes_sci))
+			register_acpi_hed_notifier(&ghes_notifier_sci);
+		list_add_rcu(&ghes->list, &ghes_sci);
+		mutex_unlock(&ghes_list_mutex);
+		break;
+	default:
+		BUG();
+	}
 	platform_set_drvdata(ghes_dev, ghes);
 
 	return 0;
@@ -386,13 +424,14 @@ static int __devexit ghes_remove(struct
 		if (list_empty(&ghes_sci))
 			unregister_acpi_hed_notifier(&ghes_notifier_sci);
 		mutex_unlock(&ghes_list_mutex);
+		synchronize_rcu();
 		break;
 	default:
 		BUG();
 		break;
 	}
 
-	synchronize_rcu();
+	herr_dev_unregister(ghes->herr_dev);
 	ghes_fini(ghes);
 	kfree(ghes);
 
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -22,6 +22,7 @@
 #define LINUX_CPER_H
 
 #include <linux/uuid.h>
+#include <linux/herror_record.h>
 
 /* CPER record signature and the size */
 #define CPER_SIG_RECORD				"CPER"
@@ -310,6 +311,7 @@ struct cper_sec_mem_err {
 #pragma pack()
 
 int herr_severity_to_cper(int herr_severity);
+int cper_severity_to_herr(int cper_severity);
 u64 cper_next_record_id(void);
 
 #endif

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25  7:43 [PATCH -v2 0/9] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (7 preceding siblings ...)
  2010-10-25  7:43 ` [PATCH -v2 8/9] ACPI, APEI, Report GHES error record with hardware error device core Huang Ying
@ 2010-10-25  7:43 ` Huang Ying
  2010-10-25  8:45   ` [NAK] " Ingo Molnar
  8 siblings, 1 reply; 48+ messages in thread
From: Huang Ying @ 2010-10-25  7:43 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Generic Hardware Error Source provides a way to report platform
hardware errors (such as that from chipset). It works in so called
"Firmware First" mode, that is, hardware errors are reported to
firmware firstly, then reported to Linux by firmware. This way, some
non-standard hardware error registers or non-standard hardware link
can be checked by firmware to produce more valuable hardware error
information for Linux.

This patch adds POLL/IRQ/NMI notification types support.

Because the memory area used to transfer hardware error information
from BIOS to Linux can be determined only in NMI, IRQ or timer
handler, but general ioremap can not be used in atomic context, so a
special version of atomic ioremap is implemented for that.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/acpi/boot.c |    1 
 arch/x86/kernel/dumpstack.c |    1 
 drivers/acpi/apei/ghes.c    |  397 ++++++++++++++++++++++++++++++++++++--------
 kernel/panic.c              |    1 
 lib/ioremap.c               |    2 
 mm/vmalloc.c                |    1 
 6 files changed, 333 insertions(+), 70 deletions(-)

--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -512,6 +512,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq
 	*gsi = irq_to_gsi(isa_irq);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
 
 /*
  * success: return IRQ number (>=0)
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -240,6 +240,7 @@ unsigned __kprobes long oops_begin(void)
 	bust_spinlocks(1);
 	return flags;
 }
+EXPORT_SYMBOL_GPL(oops_begin);
 
 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 {
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -12,10 +12,6 @@
  * For more information about Generic Hardware Error Source, please
  * refer to ACPI Specification version 4.0, section 17.3.2.6
  *
- * Now, only SCI notification type and memory errors are
- * supported. More notification type and hardware error type will be
- * added later.
- *
  * Copyright 2010 Intel Corp.
  *   Author: Huang Ying <ying.huang@intel.com>
  *
@@ -39,15 +35,18 @@
 #include <linux/acpi.h>
 #include <linux/io.h>
 #include <linux/interrupt.h>
+#include <linux/timer.h>
 #include <linux/cper.h>
 #include <linux/kdebug.h>
 #include <linux/platform_device.h>
 #include <linux/mutex.h>
+#include <linux/vmalloc.h>
 #include <linux/herror.h>
 #include <acpi/apei.h>
 #include <acpi/atomicio.h>
 #include <acpi/hed.h>
 #include <asm/mce.h>
+#include <asm/tlbflush.h>
 
 #include "apei-internal.h"
 
@@ -56,43 +55,133 @@
 #define GHES_ESTATUS_MAX_SIZE		65536
 
 /*
- * One struct ghes is created for each generic hardware error
- * source.
- *
+ * One struct ghes is created for each generic hardware error source.
  * It provides the context for APEI hardware error timer/IRQ/SCI/NMI
- * handler. Handler for one generic hardware error source is only
- * triggered after the previous one is done. So handler can uses
- * struct ghes without locking.
+ * handler.
  *
  * estatus: memory buffer for error status block, allocated during
  * HEST parsing.
  */
 #define GHES_TO_CLEAR		0x0001
+#define GHES_EXITING		0x0002
 
 struct ghes {
 	struct acpi_hest_generic *generic;
 	struct acpi_hest_generic_status *estatus;
-	struct list_head list;
 	u64 buffer_paddr;
 	unsigned long flags;
 	struct herr_dev *herr_dev;
+	union {
+		struct list_head list;
+		struct timer_list timer;
+		unsigned int irq;
+	};
 };
 
+static int ghes_panic_timeout	__read_mostly = 30;
+
 /*
- * Error source lists, one list for each notification method. The
- * members in lists are struct ghes.
+ * All error sources notified with SCI shares one notifier function,
+ * so they need to be linked and checked one by one.  This is applied
+ * to NMI too.
  *
- * The list members are only added in HEST parsing and deleted during
- * module_exit, that is, single-threaded. So no lock is needed for
- * that.
- *
- * But the mutual exclusion is needed between members adding/deleting
- * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is
- * used for that.
+ * RCU is used for these lists, so ghes_list_mutex is only used for
+ * list changing, not for traversing.
  */
 static LIST_HEAD(ghes_sci);
+static LIST_HEAD(ghes_nmi);
 static DEFINE_MUTEX(ghes_list_mutex);
 
+/*
+ * NMI may be triggered on any CPU, so ghes_nmi_lock is used for
+ * mutual exclusion.
+ */
+static DEFINE_RAW_SPINLOCK(ghes_nmi_lock);
+
+/*
+ * Because the memory area used to transfer hardware error information
+ * from BIOS to Linux can be determined only in NMI, IRQ or timer
+ * handler, but general ioremap can not be used in atomic context, so
+ * a special version of atomic ioremap is implemented for that.
+ */
+
+/*
+ * Two virtual pages are used, one for NMI context, the other for
+ * IRQ/PROCESS context
+ */
+#define GHES_IOREMAP_PAGES		2
+#define GHES_IOREMAP_NMI_PAGE(base)	(base)
+#define GHES_IOREMAP_IRQ_PAGE(base)	((base) + PAGE_SIZE)
+
+/* virtual memory area for atomic ioremap */
+static struct vm_struct *ghes_ioremap_area;
+/*
+ * These 2 spinlock is used to prevent atomic ioremap virtual memory
+ * area from being mapped simultaneously.
+ */
+static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
+static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);
+
+static int ghes_ioremap_init(void)
+{
+	ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
+		VM_IOREMAP, VMALLOC_START, VMALLOC_END);
+	if (!ghes_ioremap_area) {
+		pr_err(GHES_PFX
+		"Failed to allocate virtual memory area for atomic ioremap.\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void ghes_ioremap_exit(void)
+{
+	free_vm_area(ghes_ioremap_area);
+}
+
+static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
+{
+	unsigned long vaddr;
+
+	vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr);
+	ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
+			   pfn << PAGE_SHIFT, PAGE_KERNEL);
+
+	return (void __iomem *)vaddr;
+}
+
+static void __iomem *ghes_ioremap_pfn_irq(u64 pfn)
+{
+	unsigned long vaddr;
+
+	vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr);
+	ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
+			   pfn << PAGE_SHIFT, PAGE_KERNEL);
+
+	return (void __iomem *)vaddr;
+}
+
+static void ghes_iounmap_nmi(void __iomem *vaddr_ptr)
+{
+	unsigned long vaddr = (unsigned long __force)vaddr_ptr;
+	void *base = ghes_ioremap_area->addr;
+
+	BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base));
+	unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
+	__flush_tlb_one(vaddr);
+}
+
+static void ghes_iounmap_irq(void __iomem *vaddr_ptr)
+{
+	unsigned long vaddr = (unsigned long __force)vaddr_ptr;
+	void *base = ghes_ioremap_area->addr;
+
+	BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base));
+	unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
+	__flush_tlb_one(vaddr);
+}
+
 static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 {
 	struct ghes *ghes;
@@ -103,7 +192,6 @@ static struct ghes *ghes_new(struct acpi
 	if (!ghes)
 		return ERR_PTR(-ENOMEM);
 	ghes->generic = generic;
-	INIT_LIST_HEAD(&ghes->list);
 	rc = acpi_pre_map_gar(&generic->error_status_address);
 	if (rc)
 		goto err_free;
@@ -160,22 +248,41 @@ static inline int ghes_severity(int seve
 	}
 }
 
-/* SCI handler run in work queue, so ioremap can be used here */
-static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
-				 int from_phys)
+static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
+				  int from_phys)
 {
-	void *vaddr;
-
-	vaddr = ioremap_cache(paddr, len);
-	if (!vaddr)
-		return -ENOMEM;
-	if (from_phys)
-		memcpy(buffer, vaddr, len);
-	else
-		memcpy(vaddr, buffer, len);
-	iounmap(vaddr);
-
-	return 0;
+	void __iomem *vaddr;
+	unsigned long flags = 0;
+	int in_nmi = in_nmi();
+	u64 offset;
+	u32 trunk;
+
+	while (len > 0) {
+		offset = paddr - (paddr & PAGE_MASK);
+		if (in_nmi) {
+			raw_spin_lock(&ghes_ioremap_lock_nmi);
+			vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT);
+		} else {
+			spin_lock_irqsave(&ghes_ioremap_lock_irq, flags);
+			vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT);
+		}
+		trunk = PAGE_SIZE - offset;
+		trunk = min(trunk, len);
+		if (from_phys)
+			memcpy_fromio(buffer, vaddr + offset, trunk);
+		else
+			memcpy_toio(vaddr + offset, buffer, trunk);
+		len -= trunk;
+		paddr += trunk;
+		buffer += trunk;
+		if (in_nmi) {
+			ghes_iounmap_nmi(vaddr);
+			raw_spin_unlock(&ghes_ioremap_lock_nmi);
+		} else {
+			ghes_iounmap_irq(vaddr);
+			spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags);
+		}
+	}
 }
 
 static int ghes_read_estatus(struct ghes *ghes, int silent)
@@ -196,10 +303,8 @@ static int ghes_read_estatus(struct ghes
 	if (!buf_paddr)
 		return -ENOENT;
 
-	rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
-				   sizeof(*ghes->estatus), 1);
-	if (rc)
-		return rc;
+	ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
+			      sizeof(*ghes->estatus), 1);
 	if (!ghes->estatus->block_status)
 		return -ENOENT;
 
@@ -214,17 +319,15 @@ static int ghes_read_estatus(struct ghes
 		goto err_read_block;
 	if (apei_estatus_check_header(ghes->estatus))
 		goto err_read_block;
-	rc = ghes_copy_tofrom_phys(ghes->estatus + 1,
-				   buf_paddr + sizeof(*ghes->estatus),
-				   len - sizeof(*ghes->estatus), 1);
-	if (rc)
-		return rc;
+	ghes_copy_tofrom_phys(ghes->estatus + 1,
+			      buf_paddr + sizeof(*ghes->estatus),
+			      len - sizeof(*ghes->estatus), 1);
 	if (apei_estatus_check(ghes->estatus))
 		goto err_read_block;
 	rc = 0;
 
 err_read_block:
-	if (rc && !silent)
+	if (rc && !silent && printk_ratelimit())
 		pr_warning(FW_WARN GHES_PFX
 			   "Failed to read error status block!\n");
 	return rc;
@@ -303,6 +406,43 @@ out:
 	return 0;
 }
 
+static void ghes_add_timer(struct ghes *ghes)
+{
+	struct acpi_hest_generic *g = ghes->generic;
+	unsigned long expire;
+
+	if (!g->notify.poll_interval) {
+		pr_warning(FW_WARN GHES_PFX "Poll interval is 0 for "
+			   "generic hardware error source: %d, disabled.",
+			   g->header.source_id);
+		return;
+	}
+	expire = jiffies + msecs_to_jiffies(g->notify.poll_interval);
+	ghes->timer.expires = round_jiffies_relative(expire);
+	add_timer(&ghes->timer);
+}
+
+static void ghes_poll_func(unsigned long data)
+{
+	struct ghes *ghes = (void *)data;
+
+	ghes_proc(ghes);
+	if (!(ghes->flags & GHES_EXITING))
+		ghes_add_timer(ghes);
+}
+
+static irqreturn_t ghes_irq_func(int irq, void *data)
+{
+	struct ghes *ghes = data;
+	int rc;
+
+	rc = ghes_proc(ghes);
+	if (rc)
+		return IRQ_NONE;
+
+	return IRQ_HANDLED;
+}
+
 static int ghes_notify_sci(struct notifier_block *this,
 				  unsigned long event, void *data)
 {
@@ -319,10 +459,70 @@ static int ghes_notify_sci(struct notifi
 	return ret;
 }
 
+static int ghes_notify_nmi(struct notifier_block *this,
+				  unsigned long cmd, void *data)
+{
+	struct ghes *ghes, *ghes_global = NULL;
+	int sev, sev_global = -1;
+	int ret = NOTIFY_DONE;
+
+	if (cmd != DIE_NMI && cmd != DIE_NMI_IPI)
+		return ret;
+
+	raw_spin_lock(&ghes_nmi_lock);
+	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
+		if (ghes_read_estatus(ghes, 1)) {
+			ghes_clear_estatus(ghes);
+			continue;
+		}
+		ghes_report(ghes);
+		sev = ghes_severity(ghes->estatus->error_severity);
+		if (sev > sev_global) {
+			sev_global = sev;
+			ghes_global = ghes;
+		}
+		ret = NOTIFY_STOP;
+	}
+
+	if (ret == NOTIFY_DONE)
+		goto out;
+
+	if (sev_global >= GHES_SEV_PANIC) {
+		herr_persist_all_records();
+		oops_begin();
+		/* reboot to log the error! */
+		if (panic_timeout == 0)
+			panic_timeout = ghes_panic_timeout;
+		pr_emerg(HW_ERR
+"Fatal hardware error from Generic Hardware Error Source: %d\n",
+			 ghes_global->generic->header.source_id);
+		pr_emerg(HW_ERR "Block status: 0x%08x\n",
+			 ghes_global->estatus->block_status);
+		pr_emerg(HW_ERR "Severity: %d\n",
+			 ghes_global->estatus->error_severity);
+		panic("Fatal hardware error!");
+	}
+
+	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
+		if (!(ghes->flags & GHES_TO_CLEAR))
+			continue;
+		ghes_do_proc(ghes);
+		ghes_clear_estatus(ghes);
+	}
+
+out:
+	raw_spin_unlock(&ghes_nmi_lock);
+	return ret;
+}
+
 static struct notifier_block ghes_notifier_sci = {
 	.notifier_call = ghes_notify_sci,
 };
 
+static struct notifier_block ghes_notifier_nmi = {
+	.notifier_call = ghes_notify_nmi,
+};
+
 static int __devinit ghes_probe(struct platform_device *ghes_dev)
 {
 	struct acpi_hest_generic *generic;
@@ -334,30 +534,21 @@ static int __devinit ghes_probe(struct p
 	if (!generic->enabled)
 		goto err;
 
-	if (generic->notify.type != ACPI_HEST_NOTIFY_SCI) {
-		char *notify = NULL;
-
-		switch (generic->notify.type) {
-		case ACPI_HEST_NOTIFY_POLLED:
-			notify = "POLL";
-			break;
-		case ACPI_HEST_NOTIFY_EXTERNAL:
-		case ACPI_HEST_NOTIFY_LOCAL:
-			notify = "IRQ";
-			break;
-		case ACPI_HEST_NOTIFY_NMI:
-			notify = "NMI";
-			break;
-		}
-		if (notify) {
-			pr_warning(GHES_PFX
-"Generic hardware error source: %d notified via %s is not supported!\n",
-				   generic->header.source_id, notify);
-		} else {
-			pr_warning(FW_WARN GHES_PFX
+	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_POLLED:
+	case ACPI_HEST_NOTIFY_EXTERNAL:
+	case ACPI_HEST_NOTIFY_SCI:
+	case ACPI_HEST_NOTIFY_NMI:
+		break;
+	case ACPI_HEST_NOTIFY_LOCAL:
+		pr_warning(GHES_PFX
+"Generic hardware error source: %d notified via local interrupt is not supported!\n",
+			   generic->header.source_id);
+		goto err;
+	default:
+		pr_warning(FW_WARN GHES_PFX
 "Unknown notification type: %u for generic hardware error source: %d\n",
-			generic->notify.type, generic->header.source_id);
-		}
+			   generic->notify.type, generic->header.source_id);
 		goto err;
 	}
 
@@ -388,6 +579,28 @@ static int __devinit ghes_probe(struct p
 		goto err;
 	}
 	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_POLLED:
+		ghes->timer.function = ghes_poll_func;
+		ghes->timer.data = (unsigned long)ghes;
+		init_timer_deferrable(&ghes->timer);
+		ghes_add_timer(ghes);
+		break;
+	case ACPI_HEST_NOTIFY_EXTERNAL:
+		/* External interrupt vector is GSI */
+		if (acpi_gsi_to_irq(generic->notify.vector, &ghes->irq)) {
+			pr_err(GHES_PFX
+	"Failed to map GSI to IRQ for generic hardware error source: %d\n",
+			       generic->header.source_id);
+			goto err_unreg;
+		}
+		if (request_irq(ghes->irq, ghes_irq_func,
+				0, "GHES IRQ", ghes)) {
+			pr_err(GHES_PFX
+	"Failed to register IRQ for generic hardware error source: %d\n",
+			       generic->header.source_id);
+			goto err_unreg;
+		}
+		break;
 	case ACPI_HEST_NOTIFY_SCI:
 		mutex_lock(&ghes_list_mutex);
 		if (list_empty(&ghes_sci))
@@ -395,12 +608,21 @@ static int __devinit ghes_probe(struct p
 		list_add_rcu(&ghes->list, &ghes_sci);
 		mutex_unlock(&ghes_list_mutex);
 		break;
+	case ACPI_HEST_NOTIFY_NMI:
+		mutex_lock(&ghes_list_mutex);
+		if (list_empty(&ghes_nmi))
+			register_die_notifier(&ghes_notifier_nmi);
+		list_add_rcu(&ghes->list, &ghes_nmi);
+		mutex_unlock(&ghes_list_mutex);
+		break;
 	default:
 		BUG();
 	}
 	platform_set_drvdata(ghes_dev, ghes);
 
 	return 0;
+err_unreg:
+	herr_dev_unregister(ghes->herr_dev);
 err:
 	if (ghes) {
 		ghes_fini(ghes);
@@ -417,7 +639,15 @@ static int __devexit ghes_remove(struct
 	ghes = platform_get_drvdata(ghes_dev);
 	generic = ghes->generic;
 
+	ghes->flags |= GHES_EXITING;
+
 	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_POLLED:
+		del_timer_sync(&ghes->timer);
+		break;
+	case ACPI_HEST_NOTIFY_EXTERNAL:
+		free_irq(ghes->irq, ghes);
+		break;
 	case ACPI_HEST_NOTIFY_SCI:
 		mutex_lock(&ghes_list_mutex);
 		list_del_rcu(&ghes->list);
@@ -426,6 +656,18 @@ static int __devexit ghes_remove(struct
 		mutex_unlock(&ghes_list_mutex);
 		synchronize_rcu();
 		break;
+	case ACPI_HEST_NOTIFY_NMI:
+		mutex_lock(&ghes_list_mutex);
+		list_del_rcu(&ghes->list);
+		if (list_empty(&ghes_nmi))
+			unregister_die_notifier(&ghes_notifier_nmi);
+		mutex_unlock(&ghes_list_mutex);
+		/*
+		 * To synchronize with NMI handler, ghes can only be
+		 * freed after NMI handler finishes.
+		 */
+		synchronize_rcu();
+		break;
 	default:
 		BUG();
 		break;
@@ -451,6 +693,8 @@ static struct platform_driver ghes_platf
 
 static int __init ghes_init(void)
 {
+	int rc;
+
 	if (acpi_disabled)
 		return -ENODEV;
 
@@ -459,12 +703,25 @@ static int __init ghes_init(void)
 		return -EINVAL;
 	}
 
-	return platform_driver_register(&ghes_platform_driver);
+	rc = ghes_ioremap_init();
+	if (rc)
+		goto err;
+
+	rc = platform_driver_register(&ghes_platform_driver);
+	if (rc)
+		goto err_ioremap_exit;
+
+	return 0;
+err_ioremap_exit:
+	ghes_ioremap_exit();
+err:
+	return rc;
 }
 
 static void __exit ghes_exit(void)
 {
 	platform_driver_unregister(&ghes_platform_driver);
+	ghes_ioremap_exit();
 }
 
 module_init(ghes_init);
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
 
 int panic_timeout;
+EXPORT_SYMBOL_GPL(panic_timeout);
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/io.h>
+#include <linux/module.h>
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
 
@@ -90,3 +91,4 @@ int ioremap_page_range(unsigned long add
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(ioremap_page_range);
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1160,6 +1160,7 @@ void unmap_kernel_range_noflush(unsigned
 {
 	vunmap_page_range(addr, addr + size);
 }
+EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
 
 /**
  * unmap_kernel_range - unmap kernel VM area and flush cache and TLB

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25  7:43 ` [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support Huang Ying
@ 2010-10-25  8:45   ` Ingo Molnar
  2010-10-25  8:58     ` Huang Ying
  2010-10-26  1:06     ` Len Brown
  0 siblings, 2 replies; 48+ messages in thread
From: Ingo Molnar @ 2010-10-25  8:45 UTC (permalink / raw)
  To: Huang Ying
  Cc: Len Brown, linux-kernel, Andi Kleen, linux-acpi, Borislav Petkov,
	Thomas Gleixner, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab


* Huang Ying <ying.huang@intel.com> wrote:

> Generic Hardware Error Source provides a way to report platform
> hardware errors (such as that from chipset). It works in so called
> "Firmware First" mode, that is, hardware errors are reported to
> firmware firstly, then reported to Linux by firmware. This way, some
> non-standard hardware error registers or non-standard hardware link
> can be checked by firmware to produce more valuable hardware error
> information for Linux.
> 
> This patch adds POLL/IRQ/NMI notification types support.
> 
> Because the memory area used to transfer hardware error information
> from BIOS to Linux can be determined only in NMI, IRQ or timer
> handler, but general ioremap can not be used in atomic context, so a
> special version of atomic ioremap is implemented for that.
> 
> Signed-off-by: Huang Ying <ying.huang@intel.com>
> Reviewed-by: Andi Kleen <ak@linux.intel.com>
> ---
>  arch/x86/kernel/acpi/boot.c |    1 
>  arch/x86/kernel/dumpstack.c |    1 
>  drivers/acpi/apei/ghes.c    |  397 ++++++++++++++++++++++++++++++++++++--------
>  kernel/panic.c              |    1 
>  lib/ioremap.c               |    2 
>  mm/vmalloc.c                |    1 
>  6 files changed, 333 insertions(+), 70 deletions(-)

WTF?

Sigh, please integrate all this into EDAC (drivers/edac/) properly, instead of 
turning it into YET ANOTHER hardware vendor special hw-errors thing. We can do 
better than this. EDAC is almost there: it has support for Nehalem, AMD, a couple
of older chips.

Guys, instead of carving out a special driver area where you can produce crap 
without anyone looking too much, and pretending that the EDAC code does not exist, 
please try to work with others who are aiming higher and who are using saner 
interfaces.

Just look at the higher level structure in drivers/acpi/apei/:

  apei-base.c  apei-internal.h  cper.c  einj.c  erst.c  erst-dbg.c  ghes.c  hest.c  Kconfig  Makefile

ghes, einj, cper, erst? Someone's been abbreviating too much.

einj.c: it's about the 3rd separate 'error injection' concept that got introduced 
...

Please _THINK_ for goodness's sake and try to get some usable, coherent, generic 
interface to users - instead of just directly implementing whatever crap hw 
designers came up with.

So unless Linus or Andrew overrules me i'll NAK this before the insanity spreads too 
much:

 NAKed-by: Ingo Molnar <mingo@elte.hu>

(Someone should have NAK-ed it when the first iteration was merged.)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25  8:45   ` [NAK] " Ingo Molnar
@ 2010-10-25  8:58     ` Huang Ying
  2010-10-25  9:19       ` Andi Kleen
  2010-10-25  9:25       ` Ingo Molnar
  2010-10-26  1:06     ` Len Brown
  1 sibling, 2 replies; 48+ messages in thread
From: Huang Ying @ 2010-10-25  8:58 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Len Brown, linux-kernel, Andi Kleen, linux-acpi, Borislav Petkov,
	Thomas Gleixner, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab

On Mon, 2010-10-25 at 16:45 +0800, Ingo Molnar wrote:
> * Huang Ying <ying.huang@intel.com> wrote:
> 
> > Generic Hardware Error Source provides a way to report platform
> > hardware errors (such as that from chipset). It works in so called
> > "Firmware First" mode, that is, hardware errors are reported to
> > firmware firstly, then reported to Linux by firmware. This way, some
> > non-standard hardware error registers or non-standard hardware link
> > can be checked by firmware to produce more valuable hardware error
> > information for Linux.
> > 
> > This patch adds POLL/IRQ/NMI notification types support.
> > 
> > Because the memory area used to transfer hardware error information
> > from BIOS to Linux can be determined only in NMI, IRQ or timer
> > handler, but general ioremap can not be used in atomic context, so a
> > special version of atomic ioremap is implemented for that.
> > 
> > Signed-off-by: Huang Ying <ying.huang@intel.com>
> > Reviewed-by: Andi Kleen <ak@linux.intel.com>
> > ---
> >  arch/x86/kernel/acpi/boot.c |    1 
> >  arch/x86/kernel/dumpstack.c |    1 
> >  drivers/acpi/apei/ghes.c    |  397 ++++++++++++++++++++++++++++++++++++--------
> >  kernel/panic.c              |    1 
> >  lib/ioremap.c               |    2 
> >  mm/vmalloc.c                |    1 
> >  6 files changed, 333 insertions(+), 70 deletions(-)
> 
> WTF?
> 
> Sigh, please integrate all this into EDAC (drivers/edac/) properly, instead of 
> turning it into YET ANOTHER hardware vendor special hw-errors thing. We can do 
> better than this. EDAC is almost there: it has support for Nehalem, AMD, a couple
> of older chips.

I think APEI (ACPI Platform Error Interface) is another driver. Why
integrate two drivers?

> Guys, instead of carving out a special driver area where you can produce crap 
> without anyone looking too much, and pretending that the EDAC code does not exist, 
> please try to work with others who are aiming higher and who are using saner 
> interfaces.
> 
> Just look at the higher level structure in drivers/acpi/apei/:
> 
>   apei-base.c  apei-internal.h  cper.c  einj.c  erst.c  erst-dbg.c  ghes.c  hest.c  Kconfig  Makefile
> 
> ghes, einj, cper, erst? Someone's been abbreviating too much.

Maybe they are not good name. But they are defined in ACPI
specification. Using the same name makes it easier for people to link
the specification to corresponding implementation.

> einj.c: it's about the 3rd separate 'error injection' concept that got introduced 
> ...

EINJ is a true platform feature, not just software feature. We need to
support it to debug various hardware error features.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25  8:58     ` Huang Ying
@ 2010-10-25  9:19       ` Andi Kleen
  2010-10-25 11:15         ` Ingo Molnar
  2010-10-25 16:38         ` Thomas Gleixner
  2010-10-25  9:25       ` Ingo Molnar
  1 sibling, 2 replies; 48+ messages in thread
From: Andi Kleen @ 2010-10-25  9:19 UTC (permalink / raw)
  To: Huang Ying
  Cc: Ingo Molnar, Len Brown, linux-kernel, Andi Kleen, linux-acpi,
	Borislav Petkov, Thomas Gleixner, H. Peter Anvin, Don Zickus,
	Linus Torvalds, Andrew Morton, Mauro Carvalho Chehab

> > Sigh, please integrate all this into EDAC (drivers/edac/) properly, instead of 
> > turning it into YET ANOTHER hardware vendor special hw-errors thing. We can do 
> > better than this. EDAC is almost there: it has support for Nehalem, AMD, a couple
> > of older chips.
> 
> I think APEI (ACPI Platform Error Interface) is another driver. Why
> integrate two drivers?

Yes they're solving quite different problems from EDAC with different
interfaces and for different devices in the ACPI space.

The earlier nack seems to be based on a lot of confusion on what the code 
does.

Besides it nacks code in areas Ingo doesn't even maintain.
(if he's allowed to nack random other code he doesn't like do I get this 
right too? :-)

> > einj.c: it's about the 3rd separate 'error injection' concept that got introduced 
> > ...
> 
> EINJ is a true platform feature, not just software feature. We need to
> support it to debug various hardware error features.

Also having multiple error injecting interfaces is a good thing.

Error injection is hard and one size definitely doesn't fit all. You need
quite different ones depending on what you want to test, in which
context etc.

For hwpoison we currently have three different injectors at least and I expect
that to even grow more in the future as different features get added.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25  8:58     ` Huang Ying
  2010-10-25  9:19       ` Andi Kleen
@ 2010-10-25  9:25       ` Ingo Molnar
  2010-10-25 17:14         ` Tony Luck
  1 sibling, 1 reply; 48+ messages in thread
From: Ingo Molnar @ 2010-10-25  9:25 UTC (permalink / raw)
  To: Huang Ying
  Cc: Len Brown, linux-kernel, Andi Kleen, linux-acpi, Borislav Petkov,
	Thomas Gleixner, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab


* Huang Ying <ying.huang@intel.com> wrote:

> > Sigh, please integrate all this into EDAC (drivers/edac/) properly, instead of 
> > turning it into YET ANOTHER hardware vendor special hw-errors thing. We can do 
> > better than this. EDAC is almost there: it has support for Nehalem, AMD, a 
> > couple of older chips.
> 
> I think APEI (ACPI Platform Error Interface) is another driver. Why integrate two 
> drivers?

Sigh. I did not say integrate the drivers - integrate the _error event facilities_. 

You can have drivers/edac/apei/ghes* bits just fine (in fact it would be desirable, 
to keep things modular).

Really, just read the two Kconfig entries:

        bool "EDAC (Error Detection And Correction) reporting"

          EDAC is designed to report errors in the core system.
          These are low-level errors that are reported in the CPU or
          supporting chipset or other subsystems:
          memory errors, cache errors, PCI errors, thermal throttling, etc..

...

        tristate "APEI Generic Hardware Error Source"

          Generic Hardware Error Source provides a way to report
          platform hardware errors (such as that from chipset).

drivers/acpi/apei/ overlaps and duplicates drivers/edac/. We dont want two 
facilities, two ABIs, two sets of behavior. erst-dbg even defines a /dev node with 
two ioctls, and a debugfs file to read/write records ...

I have NAK-ed various attempts to extend /dev/mcelog and asked for it to be done 
properly, and work has begun on that - but the debugfs interface here just tries to 
work around those objections by stealth.

I'd like you and Andi to listen not just to the letter of NAKs but to the spirit as 
well. If you get a NAK in one subsystem you should not just try to route around the 
NAK, go to some other subsystem, figure out a slightly different scheme and try to 
sneak crap upstream ...

If you disagree with the mcelog NAK, if you disagree with EDAC directions then at 
least do it openly and honestly and Cc: the parties that sent you the NAK and work 
with the EDAC guys to migrate to the facility you are advancing ...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25  9:19       ` Andi Kleen
@ 2010-10-25 11:15         ` Ingo Molnar
  2010-10-25 12:04           ` Mauro Carvalho Chehab
  2010-10-25 12:37           ` Andi Kleen
  2010-10-25 16:38         ` Thomas Gleixner
  1 sibling, 2 replies; 48+ messages in thread
From: Ingo Molnar @ 2010-10-25 11:15 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Len Brown, linux-kernel, linux-acpi, Borislav Petkov,
	Thomas Gleixner, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Arjan van de Ven


* Andi Kleen <andi@firstfloor.org> wrote:

> > > Sigh, please integrate all this into EDAC (drivers/edac/) properly, instead of 
> > > turning it into YET ANOTHER hardware vendor special hw-errors thing. We can do 
> > > better than this. EDAC is almost there: it has support for Nehalem, AMD, a 
> > > couple of older chips.
> > 
> > I think APEI (ACPI Platform Error Interface) is another driver. Why integrate 
> > two drivers?
> 
> Yes they're solving quite different problems from EDAC with different interfaces 
> and for different devices in the ACPI space.

That's my whole point, _why_ do they have different interfaces?

EDAC is the upstream mechanism to organize hardware error reporting and to get 
hardware errors to user-space. It is already successful in handling a wide range of 
hardware in a similar fashion.

Furthermore, there is work ongoing to do the reporting via perf event channels, some 
of that work is upstream already. Boris is working on persistent events, on RAS 
tooling (tools/ras/) and on event injection. Here's a past submission of his work:

  http://lwn.net/Articles/394522/

You are now doing a completely separate thing here, detaching a big CPU vendor from 
the main body of Linux code that deals with this stuff.

IMHO that's not helpful _at all_.

> > > einj.c: it's about the 3rd separate 'error injection' concept that got 
> > > introduced ...
> > 
> > EINJ is a true platform feature, not just software feature. We need to support 
> > it to debug various hardware error features.
> 
> Also having multiple error injecting interfaces is a good thing.

It's never a good thing to have separate, vendor dependent interfaces for what to 
the user is basically the same conceptual thing!

> Error injection is hard and one size definitely doesn't fit all. You need quite 
> different ones depending on what you want to test, in which context etc.

And that kind of variance is in your opinion a good reason to introduce separate 
user ABIs for it?

( And i dont care that there might be no 'end user' for hardware error injection per 
  se right now. There is certainly an 'end user' for hardware error events and even 
  _there_ you are introducing and pushing for separate, incompatible interfaces. )

We have really good historic data here: we got the _biggest_ practical advantage 
from event enumeration (/debug/tracing/events/) when we extended it in a generic, 
unified way to the rich topology that the hardware and the kernel gives us.

That way we got new, useful tools like powertop, timechart or pytimechart or the 
edac tool, which can concentrate on a single, well-defined event topology and event 
ABI.

Why do these tools like this kind of unified event enumeration and reporting 
facilities, which you are fighting against so hard? Because of the big technological 
advantage of having to deal with one enumeration and reporting facility alone. They 
can get power events, scheduling events, timer events, kmalloc events all from the 
same source - even though these subsystems have barely anything in common! Tools can 
then combine these seemingly unrelated events into something new and useful.

It's a very extensible model, and with every new event type added, the tool space 
gets richer _together_.

Error event injection to simulate/trigger various error conditions in those events 
is a natural extension to the whole events framework - not something that should be 
in a randomly different way.

What you are doing here is to fragment the whole landscape into small, incompatible, 
vendor specific bits. Some of it is in /dev, some of it is in debugfs, some things 
report via signals, etc. etc.

It's inconsistent, messy and doesnt integrate well with the events framework we are 
building.

That was the main basis of my prior NAK, and you have said _nothing_ in the past 
that invalidates the fundamental points of that NAK.

Instead you started, by stealth and by duplicity, looking for ways to get around 
that conceptual NAK.

> For hwpoison we currently have three different injectors at least and I expect 
> that to even grow more in the future as different features get added.

That's insane!

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 11:15         ` Ingo Molnar
@ 2010-10-25 12:04           ` Mauro Carvalho Chehab
  2010-10-25 17:07             ` Tony Luck
  2010-10-25 12:37           ` Andi Kleen
  1 sibling, 1 reply; 48+ messages in thread
From: Mauro Carvalho Chehab @ 2010-10-25 12:04 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Huang Ying, Len Brown, linux-kernel, linux-acpi,
	Borislav Petkov, Thomas Gleixner, H. Peter Anvin, Don Zickus,
	Linus Torvalds, Andrew Morton, Arjan van de Ven, Tony Luck,
	Borislav Petkov

Em 25-10-2010 09:15, Ingo Molnar escreveu:
> 
> * Andi Kleen <andi@firstfloor.org> wrote:
> 
>>>> Sigh, please integrate all this into EDAC (drivers/edac/) properly, instead of 
>>>> turning it into YET ANOTHER hardware vendor special hw-errors thing. We can do 
>>>> better than this. EDAC is almost there: it has support for Nehalem, AMD, a 
>>>> couple of older chips.
>>>
>>> I think APEI (ACPI Platform Error Interface) is another driver. Why integrate 
>>> two drivers?
>>
>> Yes they're solving quite different problems from EDAC with different interfaces 
>> and for different devices in the ACPI space.
> 
> That's my whole point, _why_ do they have different interfaces?
> 
> EDAC is the upstream mechanism to organize hardware error reporting and to get 
> hardware errors to user-space. It is already successful in handling a wide range of 
> hardware in a similar fashion.
> 
> Furthermore, there is work ongoing to do the reporting via perf event channels, some 
> of that work is upstream already. Boris is working on persistent events, on RAS 
> tooling (tools/ras/) and on event injection. Here's a past submission of his work:
> 
>   http://lwn.net/Articles/394522/
> 
> You are now doing a completely separate thing here, detaching a big CPU vendor from 
> the main body of Linux code that deals with this stuff.
> 
> IMHO that's not helpful _at all_.

I agree with Ingo NAK. 

Having vendor-dependent API's for errors is not the way to solve those issues. We should 
focus on an unique hardware error report facility that will provide a clean, consistent, 
vendor-independent interface to userspace. EDAC successfully achieved this target for the
current designs, and it is evolving to cover newer hardware needs.

We had some discussions with Intel during the Collaboration summit about that, trying to 
integrate Nehalem EX on such environment, with, unfortunately, didn't happen, as Intel 
didn't release any (public) datasheet describing Nehalem EX memory controller, nor wrote 
such driver.

As most of us will be in Boston next week, I've reserved a BoF at Plumbers for us to discuss
about this subject:

http://www.linuxplumbersconf.org/2010/ocw/proposals/921


> 
>>>> einj.c: it's about the 3rd separate 'error injection' concept that got 
>>>> introduced ...
>>>
>>> EINJ is a true platform feature, not just software feature. We need to support 
>>> it to debug various hardware error features.
>>
>> Also having multiple error injecting interfaces is a good thing.
> 
> It's never a good thing to have separate, vendor dependent interfaces for what to 
> the user is basically the same conceptual thing!
> 
>> Error injection is hard and one size definitely doesn't fit all. You need quite 
>> different ones depending on what you want to test, in which context etc.
> 
> And that kind of variance is in your opinion a good reason to introduce separate 
> user ABIs for it?
> 
> ( And i dont care that there might be no 'end user' for hardware error injection per 
>   se right now. There is certainly an 'end user' for hardware error events and even 
>   _there_ you are introducing and pushing for separate, incompatible interfaces. )
> 
> We have really good historic data here: we got the _biggest_ practical advantage 
> from event enumeration (/debug/tracing/events/) when we extended it in a generic, 
> unified way to the rich topology that the hardware and the kernel gives us.
> 
> That way we got new, useful tools like powertop, timechart or pytimechart or the 
> edac tool, which can concentrate on a single, well-defined event topology and event 
> ABI.
> 
> Why do these tools like this kind of unified event enumeration and reporting 
> facilities, which you are fighting against so hard? Because of the big technological 
> advantage of having to deal with one enumeration and reporting facility alone. They 
> can get power events, scheduling events, timer events, kmalloc events all from the 
> same source - even though these subsystems have barely anything in common! Tools can 
> then combine these seemingly unrelated events into something new and useful.
> 
> It's a very extensible model, and with every new event type added, the tool space 
> gets richer _together_.
> 
> Error event injection to simulate/trigger various error conditions in those events 
> is a natural extension to the whole events framework - not something that should be 
> in a randomly different way.
> 
> What you are doing here is to fragment the whole landscape into small, incompatible, 
> vendor specific bits. Some of it is in /dev, some of it is in debugfs, some things 
> report via signals, etc. etc.
> 
> It's inconsistent, messy and doesnt integrate well with the events framework we are 
> building.
> 
> That was the main basis of my prior NAK, and you have said _nothing_ in the past 
> that invalidates the fundamental points of that NAK.
> 
> Instead you started, by stealth and by duplicity, looking for ways to get around 
> that conceptual NAK.
> 
>> For hwpoison we currently have three different injectors at least and I expect 
>> that to even grow more in the future as different features get added.
> 
> That's insane!
> 
> 	Ingo


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 11:15         ` Ingo Molnar
  2010-10-25 12:04           ` Mauro Carvalho Chehab
@ 2010-10-25 12:37           ` Andi Kleen
  2010-10-25 12:55             ` Ingo Molnar
  1 sibling, 1 reply; 48+ messages in thread
From: Andi Kleen @ 2010-10-25 12:37 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Huang Ying, Len Brown, linux-kernel, linux-acpi,
	Borislav Petkov, Thomas Gleixner, H. Peter Anvin, Don Zickus,
	Linus Torvalds, Andrew Morton, Mauro Carvalho Chehab,
	Arjan van de Ven

On Mon, Oct 25, 2010 at 01:15:30PM +0200, Ingo Molnar wrote:

> > > > einj.c: it's about the 3rd separate 'error injection' concept that got 
> > > > introduced ...
> > > 
> > > EINJ is a true platform feature, not just software feature. We need to support 
> > > it to debug various hardware error features.
> > 
> > Also having multiple error injecting interfaces is a good thing.
> 
> It's never a good thing to have separate, vendor dependent interfaces for what to 
> the user is basically the same conceptual thing!

Perhaps a simple example (simplified, in practice there are more
complications) makes it more clear:

The memory error handler does different actions depending on what the 
state the page the error is happening on is in.

To get reasonable coverage of the recovery code you need to present it with 
pages in different states (like locked, clean, dirty, IO etc. etc. )

Now it turns out this is very hard to do if you just inject the error
at the hardware level, because there are lots of races and problems
ensuring the page is still in the expected state etc.etc.

So one of the solution hwpoison did for this was to have another injector
that works on the process level. At the process level you can get
pages into different stages and reasonably cleanly inject the right 
error with the right context. This is essentially error
injection at the VM level.

Again this is simplified, for coverage we actually needed multiple
injectors that work at different entry points, e.g. for example
to make sure buffered file system pages are correctly handled too.

Now that's great, but we still need other injectors that work 
on other level, otherwise the part that talks to the hardware
are not covered at all

But you cannot test all the code paths of that code either using
a single hardware injector.  So there's another one that can fake different
contexts at the software level and provides reasonable
coverage of this code.

But then you still didn't test the whole hardware to software
error path. Now yes you could use a EDAC like ECC bits injector
(which BTW doesn't really need a kernel driver, we did it just
using shell scripts fine before). But that also only tests
one path and not the others possibilities, and also only
works on specific hardware in specific modes with very careful
setup.

But that's just one type of error for one system.  So you need other 
interfaces for other hardware and for other errors etc.etc. 

In some cases you also need to talk to the BIOS to do this injection
for various reasons, that is where ACPI comes in (and all these 
acronyms you seem to object to)

Also it's not enough to do single error injection once
on some system. You need a repeatable regression test that
ensure the error handling stays operable for kernels as
they evolve. This requires that the error injection is reasonably 
portable. 

For this I tried to have a "software only" injector for
nearly everything just to make sure the code can be always
tested. Unfortunately the software injectors, especially
the ones aiming at larger coverage, also have quite different
interface requirements than hardware interfaces.

But again you still need to test the full hardware too,
otherwise we don't know if the error handling is really 
working on a real system in practice.

Error injection is just messy. There is no single general
solution that works for everything and solves all problems, but you
really need a pragmatic approach for every subsystem. 

In the end it means you end up with lots of different injectors,
all tied to some specific problem.

Would it be nice if there was a single great injector that covers
everything? Yes
Is it realistic? No.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 12:37           ` Andi Kleen
@ 2010-10-25 12:55             ` Ingo Molnar
  2010-10-25 13:02               ` Ingo Molnar
  2010-10-25 13:11               ` Andi Kleen
  0 siblings, 2 replies; 48+ messages in thread
From: Ingo Molnar @ 2010-10-25 12:55 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Len Brown, linux-kernel, linux-acpi, Borislav Petkov,
	Thomas Gleixner, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Arjan van de Ven


* Andi Kleen <andi@firstfloor.org> wrote:

> On Mon, Oct 25, 2010 at 01:15:30PM +0200, Ingo Molnar wrote:
> 
> > > > > einj.c: it's about the 3rd separate 'error injection' concept that got 
> > > > > introduced ...
> > > > 
> > > > EINJ is a true platform feature, not just software feature. We need to support 
> > > > it to debug various hardware error features.
> > > 
> > > Also having multiple error injecting interfaces is a good thing.
> > 
> > It's never a good thing to have separate, vendor dependent interfaces for what 
> > to the user is basically the same conceptual thing!
> 
> Perhaps a simple example (simplified, in practice there are more complications) 
> makes it more clear:
> 
> The memory error handler does different actions depending on what the state the 
> page the error is happening on is in.

What you appear to be arguing for is the ability to inject different types of 
events.

_OF COURSE_ we want that.

Just like we want to be able to _receive_ multiple types of events from wildly 
different hardware and wildly different kernel subsystems ...

Duh.

That desire does not necessiate 'three different injectors' at all. It does not 
necessiate multiple incompatible facilities with random ABIs.

What we want is a single injector facility visible to RAS/hw-testing/etc. apps, and 
a way to pass in attributes that specify the kind of event that we want to trigger.

Also note that you completely ignored the other basis of my objection and NAK: that 
the whole ad-hoc event log export that this code does via the /dev/erst-dbg ABI is 
actively harmful.

> Would it be nice if there was a single great injector that covers everything? Yes 
> Is it realistic? No.

Everyone else working on this area thinks it's realistic, and is in fact working on 
such a facility.

The main thing that is causing confusion here is not the technical viability of such 
a project (it's evidently doable and desirable), but your unwillingness to 
cooperate. If Intel goes into random directions and essentially obstructs upstream 
projects then we wont have this implemented on Intel CPUs sanely and cleanly - 
despite Mauro's best efforts on the Nehalem code.

But you should really not bring that up as some kind of positive argument ...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 12:55             ` Ingo Molnar
@ 2010-10-25 13:02               ` Ingo Molnar
  2010-10-25 13:11               ` Andi Kleen
  1 sibling, 0 replies; 48+ messages in thread
From: Ingo Molnar @ 2010-10-25 13:02 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Len Brown, linux-kernel, linux-acpi, Borislav Petkov,
	Thomas Gleixner, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Arjan van de Ven


* Ingo Molnar <mingo@elte.hu> wrote:

> > The memory error handler does different actions depending on what the state the 
> > page the error is happening on is in.
> 
> What you appear to be arguing for is the ability to inject different types of 
> events.
> 
> _OF COURSE_ we want that.
> 
> Just like we want to be able to _receive_ multiple types of events from wildly 
> different hardware and wildly different kernel subsystems ...
> 
> Duh.
> 
> That desire does not necessiate 'three different injectors' at all. It does not 
> necessiate multiple incompatible facilities with random ABIs.

And note that once there's a generic facility that allows event injection, the 
actual low level implementation might of course be hardware specific. There's no 
reduction in actual feature richness: if the hw can do fancy things, it can be 
expressed via a generic facility as well.

What i object to is the narrow hardware specificity (and ad-hocness) of the high 
level interface and its non-integration into existing facilities.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 12:55             ` Ingo Molnar
  2010-10-25 13:02               ` Ingo Molnar
@ 2010-10-25 13:11               ` Andi Kleen
  2010-10-25 13:47                 ` Ingo Molnar
  1 sibling, 1 reply; 48+ messages in thread
From: Andi Kleen @ 2010-10-25 13:11 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Huang Ying, Len Brown, linux-kernel, linux-acpi,
	Borislav Petkov, Thomas Gleixner, H. Peter Anvin, Don Zickus,
	Linus Torvalds, Andrew Morton, Mauro Carvalho Chehab,
	Arjan van de Ven

On Mon, Oct 25, 2010 at 02:55:31PM +0200, Ingo Molnar wrote:
> 
> * Andi Kleen <andi@firstfloor.org> wrote:
> 
> > On Mon, Oct 25, 2010 at 01:15:30PM +0200, Ingo Molnar wrote:
> > 
> > > > > > einj.c: it's about the 3rd separate 'error injection' concept that got 
> > > > > > introduced ...
> > > > > 
> > > > > EINJ is a true platform feature, not just software feature. We need to support 
> > > > > it to debug various hardware error features.
> > > > 
> > > > Also having multiple error injecting interfaces is a good thing.
> > > 
> > > It's never a good thing to have separate, vendor dependent interfaces for what 
> > > to the user is basically the same conceptual thing!
> > 
> > Perhaps a simple example (simplified, in practice there are more complications) 
> > makes it more clear:
> > 
> > The memory error handler does different actions depending on what the state the 
> > page the error is happening on is in.
> 
> What you appear to be arguing for is the ability to inject different types of 
> events.

Different events in different contexts with different drivers 
with different parameters using different tools.

Commonality: about 0% exept there's "error" somewhere in the description.

-Andi

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 13:11               ` Andi Kleen
@ 2010-10-25 13:47                 ` Ingo Molnar
  2010-10-25 15:14                   ` Andi Kleen
  0 siblings, 1 reply; 48+ messages in thread
From: Ingo Molnar @ 2010-10-25 13:47 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Len Brown, linux-kernel, linux-acpi, Borislav Petkov,
	Thomas Gleixner, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Arjan van de Ven


* Andi Kleen <andi@firstfloor.org> wrote:

> On Mon, Oct 25, 2010 at 02:55:31PM +0200, Ingo Molnar wrote:
> > 
> > * Andi Kleen <andi@firstfloor.org> wrote:
> > 
> > > On Mon, Oct 25, 2010 at 01:15:30PM +0200, Ingo Molnar wrote:
> > > 
> > > > > > > einj.c: it's about the 3rd separate 'error injection' concept that got 
> > > > > > > introduced ...
> > > > > > 
> > > > > > EINJ is a true platform feature, not just software feature. We need to support 
> > > > > > it to debug various hardware error features.
> > > > > 
> > > > > Also having multiple error injecting interfaces is a good thing.
> > > > 
> > > > It's never a good thing to have separate, vendor dependent interfaces for what 
> > > > to the user is basically the same conceptual thing!
> > > 
> > > Perhaps a simple example (simplified, in practice there are more complications) 
> > > makes it more clear:
> > > 
> > > The memory error handler does different actions depending on what the state the 
> > > page the error is happening on is in.
> > 
> > What you appear to be arguing for is the ability to inject different types of 
> > events.
> 
> Different events in different contexts with different drivers with different 
> parameters [...]

Correct.

> [...] using different tools.

That's possible, but i'd expect tools/ras/ to be populated with uniformly working 
tools. There's little sense in fragmenting the hw-testing field...

> Commonality: about 0% exept there's "error" somewhere in the description.

Wrong. Their main purpose is common: they are events attached to existing hardware 
topologies, which events can be configured, which events can be received and which 
can be injected with attributes for rare-event simulation purposes.

The tool people have spoken to us clear and loud that they want to _receive_ events 
in a unified and structured way - not via lots of separate ABIs from facilities that 
have mismatching capabilities.

We want to be able to inject _other_ events as well, not just hw-error ones - 
especially rare ones.

I.e. there's clear, demonstrated, patches-pending demand for uniformity and there's 
similar demand for a broader concept.

You are now making the point that somehow the receipt and sending/injecting of 'hw 
errors on Intel hardware' should be a separate, fragmented, disoriented, messy piece 
of interface design, closely matching some ACPI spec detail, which should be 
disassociated from the preferred mechanism of error reporting?

Your argument makes absolutely no sense to me.

The kernel is an abstraction machine: common hw aspects should be generalized to the 
extent it makes sense, with reasonable extensions for anything we dont want (or 
cannot) generalize.

There's _tons_ of interesting structure here to be taken advantage of: just look at 
what Boris is trying to achieve with his EDAC tooling patches. See what Lin Ming is 
trying to do by moving event descriptors to /sys, so that events can come with 
elements of our hw and sw topology in a natural way.

There is absolutely no justification whatsoever for the new /dev/erst-dbg ABI ...

Furthermore, you have ignored my other argument for the second time now: why does 
this code not do the event _reporting_ via the facilities we use and prefer? As far 
as users are concerned, the ability to receive hardware error events in a unified 
way is an even more important aspect than the matter of event injection.

Once you do that i think you will see how naturally error injection fits into the 
picture as well. It is an aspect of pretty much any event (not just hw-error events) 
that we want to be able to 'inject/simulate' them, to test tools.

Your refusal to even consider this possibility and to look at the EDAC/RAS patches 
that deal with this is puzzling to me.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 13:47                 ` Ingo Molnar
@ 2010-10-25 15:14                   ` Andi Kleen
  2010-10-25 17:10                     ` Ingo Molnar
  0 siblings, 1 reply; 48+ messages in thread
From: Andi Kleen @ 2010-10-25 15:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Huang Ying, Len Brown, linux-kernel, linux-acpi,
	Borislav Petkov, Thomas Gleixner, H. Peter Anvin, Don Zickus,
	Linus Torvalds, Andrew Morton, Mauro Carvalho Chehab,
	Arjan van de Ven

> > Different events in different contexts with different drivers with different 
> > parameters [...]
> 
> Correct.
> 
> > [...] using different tools.
> 
> That's possible, but i'd expect tools/ras/ to be populated with uniformly working 
> tools. There's little sense in fragmenting the hw-testing field...

First if you want to avoid fragmentation please contribute to mce-test

git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git

That's the standard area for tests in this area for several years
with many contributors. If anyone has anything new they want
to inject and it roughly fits hardware errors it can be placed there.
These days it already tests more than just mces.

Then the tools are actually more like test suites that do all kind
of different things. For example the testers for hwpoison high level
is a set of programs that get continuously extended. There's no
straight forward way to do all this from the command line, because
you need to write quite a lot of code just to get the basic 
context needed for the specific code path.

As an example of this see the cases in tinjpage:

http://git.kernel.org/?p=utils/cpu/mce/mce-test.git;a=blob;f=tsrc/tinjpage.c;h=1042c132a3235c6bc0fbbe4ee8f68f0c6f96804f;hb=HEAD

Compare it to random_offline which tests global soft offline coverage

http://git.kernel.org/?p=utils/cpu/mce/mce-test.git;a=blob;f=tsrc/random_offline;h=c380a86075511de4fedb0cff9bf99a53d9215cf0;hb=HEAD

Compare it to the file system stress suite which tests file system buffer recovery:

http://git.kernel.org/?p=utils/cpu/mce/mce-test.git;a=tree;f=stress;h=e0c7c281ea9a2753d326d6ab9e0dbe566f762cf1;hb=HEAD

And compare it to the mce coverage test suite which tests low level machine check coverage:

http://git.kernel.org/?p=utils/cpu/mce/mce-test.git;a=tree;f=cases/soft-inj;h=b0eac7aad53709c0ca7899d56698445646a51c3a;hb=HEAD

with mce-inject (software) as base:

http://git.kernel.org/?p=utils/cpu/mce/mce-inject.git;a=blob;f=mce.y;h=9a8bb0385e4c3f35f8115b42e0c859623ff9cde7;hb=HEAD

and with APEI as base

http://git.kernel.org/?p=utils/cpu/mce/mce-test.git;a=tree;f=cases/apei-inj;h=bbdffe6528f763b89b9ee1f954869a0f90d4deda;hb=HEAD

They are really all quite different. 

On merging them:

Well ok in theory one could have ras test1|test2|test3

But you would end up like git and perf where you have lots of different tools
just linked into the same executable. Not really an advantage, would you agree
on that? Ok i understand in the git/perf case there's also a common library,
but at least in the error injection case there isn't really.

Anyways if the request is to link everything into a single binary
it could be looked at, but I must admit I personally don't see any
advantage from that.

Or you could have a single kernel inject interface with a bazillion modi and 
different options that needs to be extended all the time to cover some new case.
Aka the ioctl multiplexer from hell. I don't think that's really 
an appealing alternative. It would work if all the injections were
very similar, but they are really not. They are all different.

Current way is to have own files in debugfs for each
or put in other places where it fits (e.g. madvise for in process injection)

Is the problem that there is no cleanly defined place in debugfs for it?
Maybe could simply define a standard directory structure in debugfs? 

I don't have a problem with that. Right now the approach was to put
it into a directory per subsystem, but perhaps there could be something
better. 

> Your refusal to even consider this possibility and to look at the EDAC/RAS patches 
> that deal with this is puzzling to me.

What I find puzzling is that you really continue to ignore all the practical details,
and still nack other people's work without even trying to understand it.

That's not really the old Ingo I used to know.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25  9:19       ` Andi Kleen
  2010-10-25 11:15         ` Ingo Molnar
@ 2010-10-25 16:38         ` Thomas Gleixner
  1 sibling, 0 replies; 48+ messages in thread
From: Thomas Gleixner @ 2010-10-25 16:38 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Ingo Molnar, Len Brown, linux-kernel, linux-acpi,
	Borislav Petkov, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Tony Luck

On Mon, 25 Oct 2010, Andi Kleen wrote:
> > > Sigh, please integrate all this into EDAC (drivers/edac/) properly, instead of 
> > > turning it into YET ANOTHER hardware vendor special hw-errors thing. We can do 
> > > better than this. EDAC is almost there: it has support for Nehalem, AMD, a couple
> > > of older chips.
> > 
> > I think APEI (ACPI Platform Error Interface) is another driver. Why
> > integrate two drivers?
> 
> Yes they're solving quite different problems from EDAC with different
> interfaces and for different devices in the ACPI space.
> 
> The earlier nack seems to be based on a lot of confusion on what the code 
> does.

Errm. That patch series carries a lot of other weird stuff including a
new "memory allocator", a new ioremap implementation private to the
acpi code and new character device driver for hardware error
reporting.

> Subject: [PATCH -v2 5/9] Hardware error device core
> 
> Hardware error device is a kind of device which can report hardware
> errors.  The examples of hardware error device include APEI GHES, PCIe
> AER, etc.
> 
> Hardware error device core in this patch provides common services for
> various hardware error devices.

But it does not even make an attempt to explain why this error
reporting cannot be done via the existing interfaces and why they
can't be extended to fit your needs. What's so special about APEI GHES
and PCIe AER that we need another incompatible "just fits your needs"
ABI which makes tooling folks deal with another completely different
interface ?

The only explanation I have is that you are simply not willing to work
with others and this is just another proof of a repeating problem.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 12:04           ` Mauro Carvalho Chehab
@ 2010-10-25 17:07             ` Tony Luck
  2010-10-25 17:19               ` Mauro Carvalho Chehab
  0 siblings, 1 reply; 48+ messages in thread
From: Tony Luck @ 2010-10-25 17:07 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: Ingo Molnar, Andi Kleen, Huang Ying, Len Brown, linux-kernel,
	linux-acpi, Borislav Petkov, Thomas Gleixner, H. Peter Anvin,
	Don Zickus, Linus Torvalds, Andrew Morton, Arjan van de Ven,
	Borislav Petkov

On Mon, Oct 25, 2010 at 5:04 AM, Mauro Carvalho Chehab
<mchehab@redhat.com> wrote:
> We had some discussions with Intel during the Collaboration summit about that, trying to
> integrate Nehalem EX on such environment, with, unfortunately, didn't happen, as Intel
> didn't release any (public) datasheet describing Nehalem EX memory controller, nor wrote
> such driver.

The chipset registers that are needed to write an EDAC driver for
Nehalem-EX (a.k.a Xeon
7500 series) are not accessible to OS (ring 0) code. Documenting them
wouldn't change this.

> As most of us will be in Boston next week, I've reserved a BoF at Plumbers for us to discuss
> about this subject:
>
> http://www.linuxplumbersconf.org/2010/ocw/proposals/921

An excellent idea. See you there.

-Tony

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 15:14                   ` Andi Kleen
@ 2010-10-25 17:10                     ` Ingo Molnar
  2010-10-27  8:25                       ` Ingo Molnar
  0 siblings, 1 reply; 48+ messages in thread
From: Ingo Molnar @ 2010-10-25 17:10 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Len Brown, linux-kernel, linux-acpi, Borislav Petkov,
	Thomas Gleixner, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Arjan van de Ven


* Andi Kleen <andi@firstfloor.org> wrote:

> [...]
>
> They are really all quite different.

The differences in those tools do not seem natural. Those tools are mostly just 
different for difference's sake - which is sad.

They could also admittedly benefit from a proper build mechanism and other 
integration goodies. For example when building mce-inject i get:

  $ make
  Makefile:42: .depend: No such file or directory

When running the mce-inject tool it apparently hangs:

  $ ./mce-inject 

(it's waiting for stdin)
  
it does not recognize any of the standard arguments:

  $ ./mce-inject -h
  -h: No such file or directory
  $ ./mce-inject --help
  --help: No such file or directory
  $ 

That's rather user-unfriendly.

Also, that's all besides the point, exactly _why_ should bad tooling examples be 
justification for us to continue that bad example in kernel-space and to ignore good 
user-space tooling examples?

Also, my NAK is not really against your user-space tooling - my NAK is against the 
kernel bits.

> On merging them:
> 
> Well ok in theory one could have ras test1|test2|test3

It's a good first step to realize that.

> But you would end up like git and perf where you have lots of different tools just 
> linked into the same executable. [...]

... or LTP, which too is a suite of many tests.

For testing it's especially good to think in 'suites', as there are many repetitive 
components and people quite often tend to want to use all the tests when they are 
validating, certifying, debugging a particular piece of hardware.

> [...] Not really an advantage, would you agree on that?

If you only look at Git, perf and LTP as examples then them offering a consistent, 
integrated command, configuration and handling interface is _wildly_ advantageous.

There's a common help system, common documentation, common parameters, common 
workflow and a common philosophy behind those tools - and more. There's development 
process advantages as well: beyond sharing code via common libraries the developers 
cross much easier and generally advantages of economy of scale apply.

Note, i'm not telling you to follow those examples. I only point out that they exist 
and that we try to shape the kernel interfaces based on sane examples not based on 
insane examples.

> Ok i understand in the git/perf case there's also a common library, but at least 
> in the error injection case there isn't really.

There isnt really mostly because you are not making use of common facilities. That's 
my whole point. It's a bug, not a feature. You make this area flow apart into 
inconsistent utilities, there's no coherency.

Which is fine to me as long as you dont contaminate the kernel with that kind of 
incoherency ...

> Anyways if the request is to link everything into a single binary it could be 
> looked at, but I must admit I personally don't see any advantage from that.

Why do you limit the advantages of Git et al to 'linking into a single binary' 
artificially? It isnt the only or even main advantage.

( That claim isnt even true technically, perf for example supports and uses script
  extensions that are separate from the main binary. Git has similar helpers as well
  although it tends to integrate all core functionality. )

> Or you could have a single kernel inject interface with a bazillion modi and 
> different options that needs to be extended all the time to cover some new case. 
> Aka the ioctl multiplexer from hell. [...]

Why use an ioctl multiplexer from hell?

A proper system call or sysfs based approach should be used - that way the hierarchy 
and structure is achieved via a VFS namespace.

         CPU events would be under /sys/devices/system/cpu/cpu0/events/
     IO-APIC events would be under /sys/devices/system/ioapic/ioapic0/events/
   NUMA node events would be under /sys/devices/system/node/node0/events/
 PCI chipset events would be under /sys/devices/pci0000:00/events/

etc.

There would be a /sys/class/events/ iterator to find all events, etc.

There is no messy ioctl demux - just a clean hiearchy of hw structure, and event and 
hardware specific implementations of them (with a good deal of sane default 
behavior, as many events dont need special treatment).

The main 'global' property that is absolutely needed is event enumeration.

Regular event reporting is event 'output' - while injection is really event 'input' 
with custom handlers to make it trigger under the right circumstances, but it's 
essentially the same abstract model that stands behind them.

> [...] I don't think that's really an appealing alternative. It would work if all 
> the injections were very similar, but they are really not. They are all different.

If you are artificially limiting options to an insane solution then of course it 
looks all unappealing to you ...

How about not doing that and considering the model we outlined?

> Current way is to have own files in debugfs for each or put in other places where 
> it fits (e.g. madvise for in process injection)

As i mentioned it several times (and even in the previous mail) we are moving it to 
sysfs - several versions of patches have been posted.

So why are you pretending that debugfs is our main goal? It isnt.

> Is the problem that there is no cleanly defined place in debugfs for it? Maybe 
> could simply define a standard directory structure in debugfs?
> 
> I don't have a problem with that. Right now the approach was to put it into a 
> directory per subsystem, but perhaps there could be something better.

Right. A first good step would be to log all those ERST events via TRACE_EVENT() and 
map out any missing bits you may need to get them logged.

Another useful area would be to help out Ling Ming finish up the sysfs bits, and to 
have events attach to existing topologies that Linux enumerates already.

Another good place to help would be Boris and the RAS tool, which involves the 
definition of new events that cover all the relevant event sources, and the 
population of tools/edac/ with tooling. That too could grow APEI and ERST logging 
(and later, injection) awareness.

Yet another useful area would be to define injection capabilities to TRACE_EVENT() 
type of events.

> > Your refusal to even consider this possibility and to look at the EDAC/RAS 
> > patches that deal with this is puzzling to me.
> 
> What I find puzzling is that you really continue to ignore all the practical 
> details, and still nack other people's work without even trying to understand it.

If only you listed practical details then we could address them ...

But as long as you come up with vague objections there's little i can do but to 
point out crap when it happens later: when you ignore what i suggest and then post a 
messy solution a few months down the line.

> That's not really the old Ingo I used to know.

That comes with me being a maintainer i'm afraid, i'm more directly affected by bad 
patches so i object to them more frequently. I'm also kind of expected to do that.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25  9:25       ` Ingo Molnar
@ 2010-10-25 17:14         ` Tony Luck
  2010-10-25 20:23           ` Borislav Petkov
  0 siblings, 1 reply; 48+ messages in thread
From: Tony Luck @ 2010-10-25 17:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Huang Ying, Len Brown, linux-kernel, Andi Kleen, linux-acpi,
	Borislav Petkov, Thomas Gleixner, H. Peter Anvin, Don Zickus,
	Linus Torvalds, Andrew Morton, Mauro Carvalho Chehab

On Mon, Oct 25, 2010 at 2:25 AM, Ingo Molnar <mingo@elte.hu> wrote:

> drivers/acpi/apei/ overlaps and duplicates drivers/edac/. We dont want two
> facilities, two ABIs, two sets of behavior. erst-dbg even defines a /dev node with
> two ioctls, and a debugfs file to read/write records ...

As mentioned above these 4-letter names from from the ACPI specification. ERST
is perhaps the dumbest name of them all - "Error Record Serialization Table" is
ACPI-speak for platform level non-volatile memory.  This code simply provides
a mechanism for Linux to stash some information in nvram before the system is
reset, and to retrieve it after the reboot.

The naming could be better - but I don't see any overlap with EDAC here.

-Tony

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 17:07             ` Tony Luck
@ 2010-10-25 17:19               ` Mauro Carvalho Chehab
  0 siblings, 0 replies; 48+ messages in thread
From: Mauro Carvalho Chehab @ 2010-10-25 17:19 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Andi Kleen, Huang Ying, Len Brown, linux-kernel,
	linux-acpi, Borislav Petkov, Thomas Gleixner, H. Peter Anvin,
	Don Zickus, Linus Torvalds, Andrew Morton, Arjan van de Ven,
	Borislav Petkov

Em 25-10-2010 15:07, Tony Luck escreveu:
> On Mon, Oct 25, 2010 at 5:04 AM, Mauro Carvalho Chehab
> <mchehab@redhat.com> wrote:
>> We had some discussions with Intel during the Collaboration summit about that, trying to
>> integrate Nehalem EX on such environment, with, unfortunately, didn't happen, as Intel
>> didn't release any (public) datasheet describing Nehalem EX memory controller, nor wrote
>> such driver.
> 
> The chipset registers that are needed to write an EDAC driver for
> Nehalem-EX (a.k.a Xeon
> 7500 series) are not accessible to OS (ring 0) code. Documenting them
> wouldn't change this.

Yeah, I know.

The error reporting mechanism could do something similar to i7core_edac
driver to parse the MCE NMI errors to userspace via EDAC interface, but
without any way to get the memory topology, this wouldn't work. With some 
documentation, maybe we could find a way for a kernel code to retrieve 
the memory topology, maybe via BIOS, and make it work.


>> As most of us will be in Boston next week, I've reserved a BoF at Plumbers for us to discuss
>> about this subject:
>>
>> http://www.linuxplumbersconf.org/2010/ocw/proposals/921
> 
> An excellent idea. See you there.

OK,

See you there.

Thanks,
Mauro

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 17:14         ` Tony Luck
@ 2010-10-25 20:23           ` Borislav Petkov
  2010-10-25 21:23               ` Tony Luck
  0 siblings, 1 reply; 48+ messages in thread
From: Borislav Petkov @ 2010-10-25 20:23 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Huang Ying, Len Brown, linux-kernel, Andi Kleen,
	linux-acpi, Borislav Petkov, Thomas Gleixner, H. Peter Anvin,
	Don Zickus, Linus Torvalds, Andrew Morton, Mauro Carvalho Chehab

On Mon, Oct 25, 2010 at 10:14:52AM -0700, Tony Luck wrote:
> On Mon, Oct 25, 2010 at 2:25 AM, Ingo Molnar <mingo@elte.hu> wrote:
> 
> > drivers/acpi/apei/ overlaps and duplicates drivers/edac/. We dont want two
> > facilities, two ABIs, two sets of behavior. erst-dbg even defines a /dev node with
> > two ioctls, and a debugfs file to read/write records ...
> 
> As mentioned above these 4-letter names from from the ACPI specification. ERST
> is perhaps the dumbest name of them all - "Error Record Serialization Table" is
> ACPI-speak for platform level non-volatile memory.  This code simply provides
> a mechanism for Linux to stash some information in nvram before the system is
> reset, and to retrieve it after the reboot.
> 
> The naming could be better - but I don't see any overlap with EDAC here.

You may be right but what we actually want is a consistent RAS
infrastructure. Didn't you point out at the last edac meeting in Boston
that concerning RAS Linux were in the stone ages? (at least this is what
I remember reading).

What we should do is put all that post-system-reset error info, ECC
errors mapping to DRAM devices, L3 cache index manipulation based on
excessive errors - you name it - together and stick it in ras/ or
drivers/ras or whatever. And all with a nice and easy to use userspace
tool on top.

Now it looks like a wart on arch/x86/ which truly doesn't belong there.
And I don't buy all that crap that it can't be done right.

Thanks.

-- 
Regards/Gruss,
    Boris.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 20:23           ` Borislav Petkov
@ 2010-10-25 21:23               ` Tony Luck
  0 siblings, 0 replies; 48+ messages in thread
From: Tony Luck @ 2010-10-25 21:23 UTC (permalink / raw)
  To: Borislav Petkov, Tony Luck, Ingo Molnar, Huang Ying, Len Brown,
	linux-kernel

On Mon, Oct 25, 2010 at 1:23 PM, Borislav Petkov <bp@alien8.de> wrote:

> You may be right but what we actually want is a consistent RAS
> infrastructure. Didn't you point out at the last edac meeting in Boston
> that concerning RAS Linux were in the stone ages? (at least this is what
> I remember reading).

That meeting was in San Francisco - but your recollection is correct.
Right now we have ways to count errors, and to attribute them to
specific hardware components (if we are lucky). This is only the
beginning of the feature set that is needed to be "advanced RAS".

> What we should do is put all that post-system-reset error info, ECC
> errors mapping to DRAM devices, L3 cache index manipulation based on
> excessive errors - you name it - together and stick it in ras/ or
> drivers/ras or whatever. And all with a nice and easy to use userspace
> tool on top.

This is what we should be working towards.  I don't think we have
a clear picture of what that high level infrastructure looks like. It
needs to be very flexible to take input from all sorts of platform
specific "driver" code that collects data.  The "perf events"
mechanism looks plausible as a transport mechanism for
reporting corrected (or otherwise non-fatal) events. But the
errors that didn't kill the system are only part of the RAS picture.

> Now it looks like a wart on arch/x86/ which truly doesn't belong there.
> And I don't buy all that crap that it can't be done right.

Of course it is a wart ... look up ACPI in any dictionary and you'll
find a picture of a stereotypical Halloween witch :-) I don't see other
architectures lining up to support ACPI ... but we shouldn't just
ignore it in x86.  The APEI pieces that were added to ACPI 4.0
have some interesting and useful features.  Most of them are
already implemented on shipping platforms because the APEI
bits were simply documenting WHEA (Windows Hardware Error
Architecture) features.  Look for this stuff in dmesg:

ACPI: HEST 000000007fb1c000 000A8 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
ACPI: BERT 000000007fb1b000 00030 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
ACPI: ERST 000000007fb1a000 00230 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
ACPI: EINJ 000000007fb19000 00130 (v01 INTEL    SFC4UR 00000001 INTL 00000001)

-Tony

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
@ 2010-10-25 21:23               ` Tony Luck
  0 siblings, 0 replies; 48+ messages in thread
From: Tony Luck @ 2010-10-25 21:23 UTC (permalink / raw)
  To: Borislav Petkov, Tony Luck, Ingo Molnar, Huang Ying, Len Brown,
	linux-kernel, Andi Kleen, linux-acpi, Borislav Petkov,
	Thomas Gleixner, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab

On Mon, Oct 25, 2010 at 1:23 PM, Borislav Petkov <bp@alien8.de> wrote:

> You may be right but what we actually want is a consistent RAS
> infrastructure. Didn't you point out at the last edac meeting in Boston
> that concerning RAS Linux were in the stone ages? (at least this is what
> I remember reading).

That meeting was in San Francisco - but your recollection is correct.
Right now we have ways to count errors, and to attribute them to
specific hardware components (if we are lucky). This is only the
beginning of the feature set that is needed to be "advanced RAS".

> What we should do is put all that post-system-reset error info, ECC
> errors mapping to DRAM devices, L3 cache index manipulation based on
> excessive errors - you name it - together and stick it in ras/ or
> drivers/ras or whatever. And all with a nice and easy to use userspace
> tool on top.

This is what we should be working towards.  I don't think we have
a clear picture of what that high level infrastructure looks like. It
needs to be very flexible to take input from all sorts of platform
specific "driver" code that collects data.  The "perf events"
mechanism looks plausible as a transport mechanism for
reporting corrected (or otherwise non-fatal) events. But the
errors that didn't kill the system are only part of the RAS picture.

> Now it looks like a wart on arch/x86/ which truly doesn't belong there.
> And I don't buy all that crap that it can't be done right.

Of course it is a wart ... look up ACPI in any dictionary and you'll
find a picture of a stereotypical Halloween witch :-) I don't see other
architectures lining up to support ACPI ... but we shouldn't just
ignore it in x86.  The APEI pieces that were added to ACPI 4.0
have some interesting and useful features.  Most of them are
already implemented on shipping platforms because the APEI
bits were simply documenting WHEA (Windows Hardware Error
Architecture) features.  Look for this stuff in dmesg:

ACPI: HEST 000000007fb1c000 000A8 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
ACPI: BERT 000000007fb1b000 00030 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
ACPI: ERST 000000007fb1a000 00230 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
ACPI: EINJ 000000007fb19000 00130 (v01 INTEL    SFC4UR 00000001 INTL 00000001)

-Tony

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 21:23               ` Tony Luck
@ 2010-10-25 21:51                 ` Borislav Petkov
  -1 siblings, 0 replies; 48+ messages in thread
From: Borislav Petkov @ 2010-10-25 21:51 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Huang Ying, Len Brown, linux-kernel, Andi Kleen,
	linux-acpi, Borislav Petkov, Thomas Gleixner, H. Peter Anvin,
	Don Zickus, Linus Torvalds, Andrew Morton, Mauro Carvalho Chehab

On Mon, Oct 25, 2010 at 02:23:12PM -0700, Tony Luck wrote:
> On Mon, Oct 25, 2010 at 1:23 PM, Borislav Petkov <bp@alien8.de> wrote:
> 
> > You may be right but what we actually want is a consistent RAS
> > infrastructure. Didn't you point out at the last edac meeting in Boston
> > that concerning RAS Linux were in the stone ages? (at least this is what
> > I remember reading).
> 
> That meeting was in San Francisco - but your recollection is correct.
> Right now we have ways to count errors, and to attribute them to
> specific hardware components (if we are lucky). This is only the
> beginning of the feature set that is needed to be "advanced RAS".

Yep.

> > What we should do is put all that post-system-reset error info, ECC
> > errors mapping to DRAM devices, L3 cache index manipulation based on
> > excessive errors - you name it - together and stick it in ras/ or
> > drivers/ras or whatever. And all with a nice and easy to use userspace
> > tool on top.
> 
> This is what we should be working towards.  I don't think we have
> a clear picture of what that high level infrastructure looks like. It
> needs to be very flexible to take input from all sorts of platform
> specific "driver" code that collects data.  The "perf events"
> mechanism looks plausible as a transport mechanism for
> reporting corrected (or otherwise non-fatal) events.

Also agreed.

> But the errors that didn't kill the system are only part of the RAS
> picture.

Concerning fatal errors, take a look at drivers/edac/mce_amd.(c|h)¹ -
this is not in arch/x86/ and still decodes MCEs in the kernel. And it
works fine - it even helped in several cases where people simply read
their serial console/dmesg and didn't have to collect it first and run
it through some tool to understand which functional unit in the CPU is
mchecking.

And I have an error injection module which can inject MCEs using 2
/sysfs files only. It is software injection only for now but still, you
can even inject in the shell.

So I think having RAS decoupled from x86 proper could work but it has to
be done smart. In the example above, I've hooked into the machine check
notifier chain. And yes, this is just an example but you get the idea
and where we want to go.

> > Now it looks like a wart on arch/x86/ which truly doesn't belong there.
> > And I don't buy all that crap that it can't be done right.
> 
> Of course it is a wart ... look up ACPI in any dictionary and you'll
> find a picture of a stereotypical Halloween witch :-)

LOL. Can I have the ACPI witch on a t-shirt please?

> I don't see other architectures lining up to support ACPI ... but
> we shouldn't just ignore it in x86. The APEI pieces that were added
> to ACPI 4.0 have some interesting and useful features.

Yeah, I like the error info after system reset thing using nvram.
This might make a lot of sense with certain critical errors where we
syncflood the ht links to prevent corrupted data propagation.

> Most of them are already implemented on shipping platforms because
> the APEI bits were simply documenting WHEA (Windows Hardware Error
> Architecture) features. Look for this stuff in dmesg:
>
> ACPI: HEST 000000007fb1c000 000A8 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
> ACPI: BERT 000000007fb1b000 00030 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
> ACPI: ERST 000000007fb1a000 00230 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
> ACPI: EINJ 000000007fb19000 00130 (v01 INTEL    SFC4UR 00000001 INTL 00000001)

Yeah, I'm not saying we shouldn't have those - I'm just saying that we
should think up a good infrastructure design first and hook them into it
so that users can have the highest benefit.

And then have a single tool which talks to the high level interface and
all different arches implement whatever makes sense for them. Basically
what you said above about the flexible high level infrastructure.

Thanks.

¹in current git, in 2.6.36 the files were called edac_mce_amd(c|h).

-- 
Regards/Gruss,
    Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
@ 2010-10-25 21:51                 ` Borislav Petkov
  0 siblings, 0 replies; 48+ messages in thread
From: Borislav Petkov @ 2010-10-25 21:51 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Huang Ying, Len Brown, linux-kernel, Andi Kleen,
	linux-acpi, Borislav Petkov, Thomas Gleixner, H. Peter Anvin,
	Don Zickus, Linus Torvalds, Andrew Morton, Mauro Carvalho Chehab

On Mon, Oct 25, 2010 at 02:23:12PM -0700, Tony Luck wrote:
> On Mon, Oct 25, 2010 at 1:23 PM, Borislav Petkov <bp@alien8.de> wrote:
> 
> > You may be right but what we actually want is a consistent RAS
> > infrastructure. Didn't you point out at the last edac meeting in Boston
> > that concerning RAS Linux were in the stone ages? (at least this is what
> > I remember reading).
> 
> That meeting was in San Francisco - but your recollection is correct.
> Right now we have ways to count errors, and to attribute them to
> specific hardware components (if we are lucky). This is only the
> beginning of the feature set that is needed to be "advanced RAS".

Yep.

> > What we should do is put all that post-system-reset error info, ECC
> > errors mapping to DRAM devices, L3 cache index manipulation based on
> > excessive errors - you name it - together and stick it in ras/ or
> > drivers/ras or whatever. And all with a nice and easy to use userspace
> > tool on top.
> 
> This is what we should be working towards.  I don't think we have
> a clear picture of what that high level infrastructure looks like. It
> needs to be very flexible to take input from all sorts of platform
> specific "driver" code that collects data.  The "perf events"
> mechanism looks plausible as a transport mechanism for
> reporting corrected (or otherwise non-fatal) events.

Also agreed.

> But the errors that didn't kill the system are only part of the RAS
> picture.

Concerning fatal errors, take a look at drivers/edac/mce_amd.(c|h)¹ -
this is not in arch/x86/ and still decodes MCEs in the kernel. And it
works fine - it even helped in several cases where people simply read
their serial console/dmesg and didn't have to collect it first and run
it through some tool to understand which functional unit in the CPU is
mchecking.

And I have an error injection module which can inject MCEs using 2
/sysfs files only. It is software injection only for now but still, you
can even inject in the shell.

So I think having RAS decoupled from x86 proper could work but it has to
be done smart. In the example above, I've hooked into the machine check
notifier chain. And yes, this is just an example but you get the idea
and where we want to go.

> > Now it looks like a wart on arch/x86/ which truly doesn't belong there.
> > And I don't buy all that crap that it can't be done right.
> 
> Of course it is a wart ... look up ACPI in any dictionary and you'll
> find a picture of a stereotypical Halloween witch :-)

LOL. Can I have the ACPI witch on a t-shirt please?

> I don't see other architectures lining up to support ACPI ... but
> we shouldn't just ignore it in x86. The APEI pieces that were added
> to ACPI 4.0 have some interesting and useful features.

Yeah, I like the error info after system reset thing using nvram.
This might make a lot of sense with certain critical errors where we
syncflood the ht links to prevent corrupted data propagation.

> Most of them are already implemented on shipping platforms because
> the APEI bits were simply documenting WHEA (Windows Hardware Error
> Architecture) features. Look for this stuff in dmesg:
>
> ACPI: HEST 000000007fb1c000 000A8 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
> ACPI: BERT 000000007fb1b000 00030 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
> ACPI: ERST 000000007fb1a000 00230 (v01 INTEL    SFC4UR 00000001 INTL 00000001)
> ACPI: EINJ 000000007fb19000 00130 (v01 INTEL    SFC4UR 00000001 INTL 00000001)

Yeah, I'm not saying we shouldn't have those - I'm just saying that we
should think up a good infrastructure design first and hook them into it
so that users can have the highest benefit.

And then have a single tool which talks to the high level interface and
all different arches implement whatever makes sense for them. Basically
what you said above about the flexible high level infrastructure.

Thanks.

¹in current git, in 2.6.36 the files were called edac_mce_amd(c|h).

-- 
Regards/Gruss,
    Boris.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 21:51                 ` Borislav Petkov
  (?)
  (?)
@ 2010-10-25 23:35                 ` Tony Luck
  -1 siblings, 0 replies; 48+ messages in thread
From: Tony Luck @ 2010-10-25 23:35 UTC (permalink / raw)
  To: Borislav Petkov, Tony Luck, Ingo Molnar, Huang Ying, Len Brown,
	linux-kernel

On Mon, Oct 25, 2010 at 2:51 PM, Borislav Petkov <bp@alien8.de> wrote:
> Concerning fatal errors, take a look at drivers/edac/mce_amd.(c|h)¹ -
> this is not in arch/x86/ and still decodes MCEs in the kernel. And it
> works fine - it even helped in several cases where people simply read
> their serial console/dmesg and didn't have to collect it first and run
> it through some tool to understand which functional unit in the CPU is
> mchecking.

That looks neat ... but end-users seem to have some conflicting requirements
here. Your uses seem to like it but the LLNL folks at the S.F. meeting said
that solutions that involved looking at console logs from thousands
of machines in a cluster were not acceptable.

I doubt very much if any end-user cares which unit *within* a cpu
failed (their replaceable unit is the whole of the cpu). So much of
your driver could be replaced with: printk("CPU%d is bad\n", cpu);

-Tony
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 21:51                 ` Borislav Petkov
  (?)
@ 2010-10-25 23:35                 ` Tony Luck
  2010-10-26  6:26                     ` Borislav Petkov
  -1 siblings, 1 reply; 48+ messages in thread
From: Tony Luck @ 2010-10-25 23:35 UTC (permalink / raw)
  To: Borislav Petkov, Tony Luck, Ingo Molnar, Huang Ying, Len Brown,
	linux-kernel, Andi Kleen, linux-acpi, Borislav Petkov,
	Thomas Gleixner, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab

On Mon, Oct 25, 2010 at 2:51 PM, Borislav Petkov <bp@alien8.de> wrote:
> Concerning fatal errors, take a look at drivers/edac/mce_amd.(c|h)¹ -
> this is not in arch/x86/ and still decodes MCEs in the kernel. And it
> works fine - it even helped in several cases where people simply read
> their serial console/dmesg and didn't have to collect it first and run
> it through some tool to understand which functional unit in the CPU is
> mchecking.

That looks neat ... but end-users seem to have some conflicting requirements
here. Your uses seem to like it but the LLNL folks at the S.F. meeting said
that solutions that involved looking at console logs from thousands
of machines in a cluster were not acceptable.

I doubt very much if any end-user cares which unit *within* a cpu
failed (their replaceable unit is the whole of the cpu). So much of
your driver could be replaced with: printk("CPU%d is bad\n", cpu);

-Tony

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25  8:45   ` [NAK] " Ingo Molnar
  2010-10-25  8:58     ` Huang Ying
@ 2010-10-26  1:06     ` Len Brown
  2010-10-26  4:53       ` Thomas Gleixner
  1 sibling, 1 reply; 48+ messages in thread
From: Len Brown @ 2010-10-26  1:06 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Huang Ying, linux-kernel, Andi Kleen, linux-acpi,
	Borislav Petkov, Thomas Gleixner, H. Peter Anvin, Don Zickus,
	Linus Torvalds, Andrew Morton, Mauro Carvalho Chehab

>  NAKed-by: Ingo Molnar <mingo@elte.hu>

Everybody knows that Linux has a lot to learn about RAS.

I think to catch up, we need to play to Linux's strengths
of continuous improvement.  If we halt patches in this area
then we could wait forever for the "perfect design".

If Linus Torvalds disagrees and wants to NAK this series,
I can live with that.  But it isn't my recommendation.

thanks,
-Len Brown, Intel Open Source Technology Center



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-26  1:06     ` Len Brown
@ 2010-10-26  4:53       ` Thomas Gleixner
  2010-10-26  7:22         ` Ingo Molnar
                           ` (2 more replies)
  0 siblings, 3 replies; 48+ messages in thread
From: Thomas Gleixner @ 2010-10-26  4:53 UTC (permalink / raw)
  To: Len Brown
  Cc: Ingo Molnar, Huang Ying, LKML, Andi Kleen, linux-acpi,
	Borislav Petkov, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Tony Luck

B1;2401;0cLen,

On Mon, 25 Oct 2010, Len Brown wrote:

> >  NAKed-by: Ingo Molnar <mingo@elte.hu>
> 
> Everybody knows that Linux has a lot to learn about RAS.
> 
> I think to catch up, we need to play to Linux's strengths
> of continuous improvement.  If we halt patches in this area
> then we could wait forever for the "perfect design".

it's not about perfect design. It's about creating new user space
ABIs. The patches introduce another error reporting user space ABI
with an ad hoc "fits the needs" design.

This is my major point of objection. 

I agree that Linux needs improvement on the RAS side, but does this
lack of features justify a new user space ABI which is totally
disconnected to existing RAS facilities ?

No, it does not. It's not our problem that Intel wasted time on
creating another character device driver to report errors to user
space. The time spent to do so would have been sufficient to do a
proper integration into the existing infrastructure.

I would not care at all if these patches would just introduce some
weird in kernel interfaces as we can clean that up at will. But
introducing a new user space ABI is setting the disconnect of RAS
related facilities into stone.

>From Kconfig:

  EDAC is designed to report errors in the core system.
  These are low-level errors that are reported in the CPU or
  supporting chipset or other subsystems:
  memory errors, cache errors, PCI errors, thermal throttling, etc..
  If unsure, select 'Y'.

So please explain why your error reporting is so different from the
above that it justifies a separate facility. And you better come up
with a real good explanation other than we looked at EDAC and it did
not fit our needs.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 23:35                 ` Tony Luck
@ 2010-10-26  6:26                     ` Borislav Petkov
  0 siblings, 0 replies; 48+ messages in thread
From: Borislav Petkov @ 2010-10-26  6:26 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Huang Ying, Len Brown, linux-kernel, Andi Kleen,
	linux-acpi, Borislav Petkov, Thomas Gleixner, H. Peter Anvin,
	Don Zickus, Linus Torvalds, Andrew Morton, Mauro Carvalho Chehab

On Mon, Oct 25, 2010 at 04:35:43PM -0700, Tony Luck wrote:
> On Mon, Oct 25, 2010 at 2:51 PM, Borislav Petkov <bp@alien8.de> wrote:
> > Concerning fatal errors, take a look at drivers/edac/mce_amd.(c|h)¹ -
> > this is not in arch/x86/ and still decodes MCEs in the kernel. And it
> > works fine - it even helped in several cases where people simply read
> > their serial console/dmesg and didn't have to collect it first and run
> > it through some tool to understand which functional unit in the CPU is
> > mchecking.
> 
> That looks neat ... but end-users seem to have some conflicting requirements
> here. Your uses seem to like it but the LLNL folks at the S.F. meeting said
> that solutions that involved looking at console logs from thousands
> of machines in a cluster were not acceptable.
> 
> I doubt very much if any end-user cares which unit *within* a cpu
> failed (their replaceable unit is the whole of the cpu). So much of
> your driver could be replaced with: printk("CPU%d is bad\n", cpu);

Yeah, nobody said this is finished. The next step is using perf
infrastructure to convey those decoded errors to userspace, say, to a
ras daemon or similar which can do all sorts of reporting, statistics,
policy decisions, injection, paint graphs, whatever...

I sent out two patchsets as an rfc already and am working
on the 3rd one so we're getting there. Here's the last one:
http://kerneltrap.org/mailarchive/linux-kernel/2010/8/6/4603847

Also, I'm open to all suggestions on how to make it more usable and
user-friendly.

Thanks.

-- 
Regards/Gruss,
    Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
@ 2010-10-26  6:26                     ` Borislav Petkov
  0 siblings, 0 replies; 48+ messages in thread
From: Borislav Petkov @ 2010-10-26  6:26 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Huang Ying, Len Brown, linux-kernel, Andi Kleen,
	linux-acpi, Borislav Petkov, Thomas Gleixner, H. Peter Anvin,
	Don Zickus, Linus Torvalds, Andrew Morton, Mauro Carvalho Chehab

On Mon, Oct 25, 2010 at 04:35:43PM -0700, Tony Luck wrote:
> On Mon, Oct 25, 2010 at 2:51 PM, Borislav Petkov <bp@alien8.de> wrote:
> > Concerning fatal errors, take a look at drivers/edac/mce_amd.(c|h)¹ -
> > this is not in arch/x86/ and still decodes MCEs in the kernel. And it
> > works fine - it even helped in several cases where people simply read
> > their serial console/dmesg and didn't have to collect it first and run
> > it through some tool to understand which functional unit in the CPU is
> > mchecking.
> 
> That looks neat ... but end-users seem to have some conflicting requirements
> here. Your uses seem to like it but the LLNL folks at the S.F. meeting said
> that solutions that involved looking at console logs from thousands
> of machines in a cluster were not acceptable.
> 
> I doubt very much if any end-user cares which unit *within* a cpu
> failed (their replaceable unit is the whole of the cpu). So much of
> your driver could be replaced with: printk("CPU%d is bad\n", cpu);

Yeah, nobody said this is finished. The next step is using perf
infrastructure to convey those decoded errors to userspace, say, to a
ras daemon or similar which can do all sorts of reporting, statistics,
policy decisions, injection, paint graphs, whatever...

I sent out two patchsets as an rfc already and am working
on the 3rd one so we're getting there. Here's the last one:
http://kerneltrap.org/mailarchive/linux-kernel/2010/8/6/4603847

Also, I'm open to all suggestions on how to make it more usable and
user-friendly.

Thanks.

-- 
Regards/Gruss,
    Boris.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-26  4:53       ` Thomas Gleixner
@ 2010-10-26  7:22         ` Ingo Molnar
  2010-10-26  7:30           ` Huang Ying
  2010-10-26  8:38         ` Andi Kleen
  2010-10-26  8:52         ` Huang Ying
  2 siblings, 1 reply; 48+ messages in thread
From: Ingo Molnar @ 2010-10-26  7:22 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Len Brown, Huang Ying, LKML, Andi Kleen, linux-acpi,
	Borislav Petkov, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Tony Luck


* Thomas Gleixner <tglx@linutronix.de> wrote:

> >From Kconfig:
> 
>   EDAC is designed to report errors in the core system.
>   These are low-level errors that are reported in the CPU or
>   supporting chipset or other subsystems:
>   memory errors, cache errors, PCI errors, thermal throttling, etc..
>   If unsure, select 'Y'.
> 
> So please explain why your error reporting is so different from the above that it 
> justifies a separate facility. And you better come up with a real good explanation 
> other than we looked at EDAC and it did not fit our needs.

Btw., it's not just about EDAC - the firmware can store Linux events persistently 
(beyond allowing the firmware to insert its own RAS events), that is obviously 
_hugely_ useful for kernel debugging in general. We could inject debugging events 
there and recover them after a crash, etc.

As long as it's all integrated into the standard event logging facilities it could 
be very useful (and i dont generally complain about things that are insignificant).

As /dev/erst-dbg it's not useful at all - in fact it might close the door to sane 
future usage of this hw facility if some crappy user-space learns to rely on the 
/dev/erst-dbg ABI.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-26  7:22         ` Ingo Molnar
@ 2010-10-26  7:30           ` Huang Ying
  2010-10-26  7:55             ` Ingo Molnar
  0 siblings, 1 reply; 48+ messages in thread
From: Huang Ying @ 2010-10-26  7:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, Len Brown, LKML, Andi Kleen, linux-acpi,
	Borislav Petkov, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Luck, Tony

On Tue, 2010-10-26 at 15:22 +0800, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> > >From Kconfig:
> > 
> >   EDAC is designed to report errors in the core system.
> >   These are low-level errors that are reported in the CPU or
> >   supporting chipset or other subsystems:
> >   memory errors, cache errors, PCI errors, thermal throttling, etc..
> >   If unsure, select 'Y'.
> > 
> > So please explain why your error reporting is so different from the above that it 
> > justifies a separate facility. And you better come up with a real good explanation 
> > other than we looked at EDAC and it did not fit our needs.
> 
> Btw., it's not just about EDAC - the firmware can store Linux events persistently 
> (beyond allowing the firmware to insert its own RAS events), that is obviously 
> _hugely_ useful for kernel debugging in general. We could inject debugging events 
> there and recover them after a crash, etc.

Yes. It can be used by other kernel subsystems other than RAS. A kernel
API is provided already. The design of the kernel API makes it easy to
be used by various kernel subsystems. As the first step, we plan to
support saving kernel log before panic and reading it back after reboot.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-26  7:30           ` Huang Ying
@ 2010-10-26  7:55             ` Ingo Molnar
  2010-10-26  8:32               ` Huang Ying
  0 siblings, 1 reply; 48+ messages in thread
From: Ingo Molnar @ 2010-10-26  7:55 UTC (permalink / raw)
  To: Huang Ying
  Cc: Thomas Gleixner, Len Brown, LKML, Andi Kleen, linux-acpi,
	Borislav Petkov, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Luck, Tony


* Huang Ying <ying.huang@intel.com> wrote:

> On Tue, 2010-10-26 at 15:22 +0800, Ingo Molnar wrote:
> > * Thomas Gleixner <tglx@linutronix.de> wrote:
> > 
> > > >From Kconfig:
> > > 
> > >   EDAC is designed to report errors in the core system.
> > >   These are low-level errors that are reported in the CPU or
> > >   supporting chipset or other subsystems:
> > >   memory errors, cache errors, PCI errors, thermal throttling, etc..
> > >   If unsure, select 'Y'.
> > > 
> > > So please explain why your error reporting is so different from the above that it 
> > > justifies a separate facility. And you better come up with a real good explanation 
> > > other than we looked at EDAC and it did not fit our needs.
> > 
> > Btw., it's not just about EDAC - the firmware can store Linux events 
> > persistently (beyond allowing the firmware to insert its own RAS events), that 
> > is obviously _hugely_ useful for kernel debugging in general. We could inject 
> > debugging events there and recover them after a crash, etc.
> 
> Yes. It can be used by other kernel subsystems other than RAS. A kernel API is 
> provided already. The design of the kernel API makes it easy to be used by various 
> kernel subsystems. As the first step, we plan to support saving kernel log before 
> panic and reading it back after reboot.

And that's the problem: we have good facilities already that deal with similar 
things. We have NMI-safe event logging, event enumeration, dump-on-panic code and 
all sorts of goodies there.

But what did Andi's guidance/design lead you to do instead?

You stuck a useful hw feature into a vendor specific area of the kernel and exported 
it to /dev/erst-dbg via a crappy ABI. You also did it in the worst possible 
imaginable way: you avoided talking to the people who maintain and know the 
RAS/EDAC/debugging/instrumentation code, and you tried to create an ABI to export it 
in the most raw form possible - limiting our future options.

All that done so that dealing with those pesky RAS/EDAC, instrumentation and core 
kernel people can be avoided? ;-)

Sucks IMHO.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-26  7:55             ` Ingo Molnar
@ 2010-10-26  8:32               ` Huang Ying
  2010-10-26 10:03                 ` Ingo Molnar
  0 siblings, 1 reply; 48+ messages in thread
From: Huang Ying @ 2010-10-26  8:32 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, Len Brown, LKML, Andi Kleen, linux-acpi,
	Borislav Petkov, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Luck, Tony

On Tue, 2010-10-26 at 15:55 +0800, Ingo Molnar wrote:
> * Huang Ying <ying.huang@intel.com> wrote:
> 
> > On Tue, 2010-10-26 at 15:22 +0800, Ingo Molnar wrote:
> > > * Thomas Gleixner <tglx@linutronix.de> wrote:
> > > 
> > > > >From Kconfig:
> > > > 
> > > >   EDAC is designed to report errors in the core system.
> > > >   These are low-level errors that are reported in the CPU or
> > > >   supporting chipset or other subsystems:
> > > >   memory errors, cache errors, PCI errors, thermal throttling, etc..
> > > >   If unsure, select 'Y'.
> > > > 
> > > > So please explain why your error reporting is so different from the above that it 
> > > > justifies a separate facility. And you better come up with a real good explanation 
> > > > other than we looked at EDAC and it did not fit our needs.
> > > 
> > > Btw., it's not just about EDAC - the firmware can store Linux events 
> > > persistently (beyond allowing the firmware to insert its own RAS events), that 
> > > is obviously _hugely_ useful for kernel debugging in general. We could inject 
> > > debugging events there and recover them after a crash, etc.
> > 
> > Yes. It can be used by other kernel subsystems other than RAS. A kernel API is 
> > provided already. The design of the kernel API makes it easy to be used by various 
> > kernel subsystems. As the first step, we plan to support saving kernel log before 
> > panic and reading it back after reboot.
> 
> And that's the problem: we have good facilities already that deal with similar 
> things. We have NMI-safe event logging, event enumeration, dump-on-panic code and 
> all sorts of goodies there.

We have provided an in-kernel API for ERST now. And we plan to implement
a kmsg_dumper with ERST. And maybe implement some output support (maybe
via some /dev/kmsg extension) for kmsg_dumper if necessary.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-26  4:53       ` Thomas Gleixner
  2010-10-26  7:22         ` Ingo Molnar
@ 2010-10-26  8:38         ` Andi Kleen
  2010-10-26 10:00           ` Thomas Gleixner
  2010-10-26  8:52         ` Huang Ying
  2 siblings, 1 reply; 48+ messages in thread
From: Andi Kleen @ 2010-10-26  8:38 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Len Brown, Ingo Molnar, Huang Ying, LKML, Andi Kleen, linux-acpi,
	Borislav Petkov, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Tony Luck

> So please explain why your error reporting is so different from the
> above that it justifies a separate facility. And you better come up
> with a real good explanation other than we looked at EDAC and it did
> not fit our needs.

Well it didn't fit the needs at all.  Is that not enough, Mr Inquisitor?

Really if you want to nack and maintain and design things you need
to do a bit more than just arguing from two lines of
high level description.

EDAC enumerates hardware and exports some hardware
registers and decodes a few errors in a format into
the kernel log that is hard to impossible to post-process.

APEI does nothing of that, so it doesn't fit into EDAC.

perf does small parts of it, but a lot of things that are needed
it doesn't do either. And for the things it does it's incredible 
overkill in terms of code size, overhead etc. for the task
at hand.

Basically the only facility in perf that could 
be reused is a simple message passing layer, which 
is only a few tens lines of code anyways, but only
at the cost of doing a lot of changes to get the right
semantics out of it.

The main interface in APEI right now is to manage fatal
errors after reboot. Short of making it a file system,
which didn't really fit either, there's no existing interface
that handles that in a half way sensible way.
 
There's some other stuff but it's reasonably small too.

-Andi

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-26  4:53       ` Thomas Gleixner
  2010-10-26  7:22         ` Ingo Molnar
  2010-10-26  8:38         ` Andi Kleen
@ 2010-10-26  8:52         ` Huang Ying
  2010-10-26 10:15           ` Ingo Molnar
  2 siblings, 1 reply; 48+ messages in thread
From: Huang Ying @ 2010-10-26  8:52 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Len Brown, Ingo Molnar, LKML, Andi Kleen, linux-acpi,
	Borislav Petkov, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Luck, Tony

Hi, Thomas,

On Tue, 2010-10-26 at 12:53 +0800, Thomas Gleixner wrote:
> B1;2401;0cLen,
> 
> On Mon, 25 Oct 2010, Len Brown wrote:
> 
> > >  NAKed-by: Ingo Molnar <mingo@elte.hu>
> > 
> > Everybody knows that Linux has a lot to learn about RAS.
> > 
> > I think to catch up, we need to play to Linux's strengths
> > of continuous improvement.  If we halt patches in this area
> > then we could wait forever for the "perfect design".
> 
> it's not about perfect design. It's about creating new user space
> ABIs. The patches introduce another error reporting user space ABI
> with an ad hoc "fits the needs" design.
> 
> This is my major point of objection. 
> 
> I agree that Linux needs improvement on the RAS side, but does this
> lack of features justify a new user space ABI which is totally
> disconnected to existing RAS facilities ?
> 
> No, it does not. It's not our problem that Intel wasted time on
> creating another character device driver to report errors to user
> space. The time spent to do so would have been sufficient to do a
> proper integration into the existing infrastructure.
> 
> I would not care at all if these patches would just introduce some
> weird in kernel interfaces as we can clean that up at will. But
> introducing a new user space ABI is setting the disconnect of RAS
> related facilities into stone.
> 
> From Kconfig:
> 
>   EDAC is designed to report errors in the core system.
>   These are low-level errors that are reported in the CPU or
>   supporting chipset or other subsystems:
>   memory errors, cache errors, PCI errors, thermal throttling, etc..
>   If unsure, select 'Y'.
> 
> So please explain why your error reporting is so different from the
> above that it justifies a separate facility. And you better come up
> with a real good explanation other than we looked at EDAC and it did
> not fit our needs.

As far as I know, EDAC guys plan to use some other "perfect interface"
in the future. So I think the current state is really waiting for the
"perfect design".

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-26  8:38         ` Andi Kleen
@ 2010-10-26 10:00           ` Thomas Gleixner
  0 siblings, 0 replies; 48+ messages in thread
From: Thomas Gleixner @ 2010-10-26 10:00 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Len Brown, Ingo Molnar, Huang Ying, LKML, linux-acpi,
	Borislav Petkov, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Tony Luck

On Tue, 26 Oct 2010, Andi Kleen wrote:
> > So please explain why your error reporting is so different from the
> > above that it justifies a separate facility. And you better come up
> > with a real good explanation other than we looked at EDAC and it did
> > not fit our needs.
> 
> Well it didn't fit the needs at all.  Is that not enough, Mr Inquisitor?

No, it's not enough without a reasonable explanation.

> Really if you want to nack and maintain and design things you need
> to do a bit more than just arguing from two lines of
> high level description.

If you want to shove a new facility into the kernel you need to do a
bit more than sending that stuff to LKML without explaining why you
can't use existing facilities and why it's impossible to extend those
existing facilities.

So the questions arise on the high level because you failed to provide
a reasonable explanation in the first place.

> EDAC enumerates hardware and exports some hardware
> registers and decodes a few errors in a format into
> the kernel log that is hard to impossible to post-process.

You are well aware that EDAC folks are working on consolidating the
interfaces and providing better error reporting, but it's out of your
interest so you just ignore that effort instead of working together?

> APEI does nothing of that, so it doesn't fit into EDAC.

Thanks for this overly detailed explanation.

APEI is about error detection and error reporting, nothing else. So
it fits into EDAC, which is an existing Error Detection and Correction
reporting facility by definition.

If you see EDAC has shortcomings, the proper answer to that is to go
there and work with those folks to make EDAC a fully integrated
facility. It's not like there's some disagreement with them.

But you did not even try to talk to them about this and went straight
ahead and implemented your own EDAC facility.

> The main interface in APEI right now is to manage fatal
> errors after reboot. ...

That's completely irrelevant.

It does not matter at all when and wherefrom error data come and
whether they were generated before or after reboot. That is the
hardware/firmware dependend side of things. Nobody is saying that this
isn't necessary.

But it matters very much how we report those errors. And it's not a
completely unreasonable request to avoid separate interfaces for the
very same problem, especially separate user space ABIs. 

If an existing facility has shortcomings, then the usual way is to
extend it for the sake of all existing users of that facility. We try
to consolidate stuff all over the place and avoid stuff which is
artificially separate, and that applies to that area as well.

Of course that might be more work and might change the design a bit,
but in the end it's a benefit for everyone.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-26  8:32               ` Huang Ying
@ 2010-10-26 10:03                 ` Ingo Molnar
  0 siblings, 0 replies; 48+ messages in thread
From: Ingo Molnar @ 2010-10-26 10:03 UTC (permalink / raw)
  To: Huang Ying
  Cc: Thomas Gleixner, Len Brown, LKML, Andi Kleen, linux-acpi,
	Borislav Petkov, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Luck, Tony


* Huang Ying <ying.huang@intel.com> wrote:

> On Tue, 2010-10-26 at 15:55 +0800, Ingo Molnar wrote:
> > * Huang Ying <ying.huang@intel.com> wrote:
> > 
> > > On Tue, 2010-10-26 at 15:22 +0800, Ingo Molnar wrote:
> > > > * Thomas Gleixner <tglx@linutronix.de> wrote:
> > > > 
> > > > > >From Kconfig:
> > > > > 
> > > > >   EDAC is designed to report errors in the core system.
> > > > >   These are low-level errors that are reported in the CPU or
> > > > >   supporting chipset or other subsystems:
> > > > >   memory errors, cache errors, PCI errors, thermal throttling, etc..
> > > > >   If unsure, select 'Y'.
> > > > > 
> > > > > So please explain why your error reporting is so different from the above that it 
> > > > > justifies a separate facility. And you better come up with a real good explanation 
> > > > > other than we looked at EDAC and it did not fit our needs.
> > > > 
> > > > Btw., it's not just about EDAC - the firmware can store Linux events 
> > > > persistently (beyond allowing the firmware to insert its own RAS events), that 
> > > > is obviously _hugely_ useful for kernel debugging in general. We could inject 
> > > > debugging events there and recover them after a crash, etc.
> > > 
> > > Yes. It can be used by other kernel subsystems other than RAS. A kernel API is 
> > > provided already. The design of the kernel API makes it easy to be used by various 
> > > kernel subsystems. As the first step, we plan to support saving kernel log before 
> > > panic and reading it back after reboot.
> > 
> > And that's the problem: we have good facilities already that deal with similar 
> > things. We have NMI-safe event logging, event enumeration, dump-on-panic code 
> > and all sorts of goodies there.
> 
> We have provided an in-kernel API for ERST now. And we plan to implement a 
> kmsg_dumper with ERST. And maybe implement some output support (maybe via some 
> /dev/kmsg extension) for kmsg_dumper if necessary.

So my argument/objection was:

 " You are introducing a bad ABI here, amongst other problems. Please work with the
   people who are maintaining sane RAS/event/error reporting ABIs and facilities.
   You should have done that to begin with. "

and your answer to that is:

 " Hey, we plan to introduce another ad-hoc ABI as well! "

... do you really not see the glaring disconnect?

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-26  8:52         ` Huang Ying
@ 2010-10-26 10:15           ` Ingo Molnar
  0 siblings, 0 replies; 48+ messages in thread
From: Ingo Molnar @ 2010-10-26 10:15 UTC (permalink / raw)
  To: Huang Ying
  Cc: Thomas Gleixner, Len Brown, LKML, Andi Kleen, linux-acpi,
	Borislav Petkov, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Luck, Tony


* Huang Ying <ying.huang@intel.com> wrote:

> Hi, Thomas,
> 
> On Tue, 2010-10-26 at 12:53 +0800, Thomas Gleixner wrote:
> > B1;2401;0cLen,
> > 
> > On Mon, 25 Oct 2010, Len Brown wrote:
> > 
> > > >  NAKed-by: Ingo Molnar <mingo@elte.hu>
> > > 
> > > Everybody knows that Linux has a lot to learn about RAS.
> > > 
> > > I think to catch up, we need to play to Linux's strengths
> > > of continuous improvement.  If we halt patches in this area
> > > then we could wait forever for the "perfect design".
> > 
> > it's not about perfect design. It's about creating new user space
> > ABIs. The patches introduce another error reporting user space ABI
> > with an ad hoc "fits the needs" design.
> > 
> > This is my major point of objection. 
> > 
> > I agree that Linux needs improvement on the RAS side, but does this
> > lack of features justify a new user space ABI which is totally
> > disconnected to existing RAS facilities ?
> > 
> > No, it does not. It's not our problem that Intel wasted time on
> > creating another character device driver to report errors to user
> > space. The time spent to do so would have been sufficient to do a
> > proper integration into the existing infrastructure.
> > 
> > I would not care at all if these patches would just introduce some
> > weird in kernel interfaces as we can clean that up at will. But
> > introducing a new user space ABI is setting the disconnect of RAS
> > related facilities into stone.
> > 
> > From Kconfig:
> > 
> >   EDAC is designed to report errors in the core system.
> >   These are low-level errors that are reported in the CPU or
> >   supporting chipset or other subsystems:
> >   memory errors, cache errors, PCI errors, thermal throttling, etc..
> >   If unsure, select 'Y'.
> > 
> > So please explain why your error reporting is so different from the
> > above that it justifies a separate facility. And you better come up
> > with a real good explanation other than we looked at EDAC and it did
> > not fit our needs.
> 
> As far as I know, EDAC guys plan to use some other "perfect interface" in the 
> future. So I think the current state is really waiting for the "perfect design".

Not sure what you mean by this, but Boris has posted links to his latest patch-set 
in this thread, see:

  http://kerneltrap.org/mailarchive/linux-kernel/2010/8/6/4603847

The Git coordinates are:

  git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace.git, branch tip/perf/parse-events

The 'persistent events' facility he has prototyped there appears to be a good 
potential match for the ERST store.

It would be very useful to have another feature there: to mark persistent events as 
'dump into syslog on bootup', so that for example the contents of the ERST log could 
be dumped right on bootup. [but ERST would not be the only persistent event that 
could be marked like that.]

Note that we dont need/want other ABI accesses to the ERST log (i.e. we dont want 
/dev/erst-dbg), because we want the benefits of the generalization: tooling (RAS and 
other tooling) should learn how to deal with persistent events - not learn how to 
deal with ERST logs ... or with warm bootup RAM-embedded logs ... or to deal with 
kcrash embedded kernel logs ... etc.

There are many obvious advantages from implementing it like that: there's no need to 
special-code ERST to printk or ERST to whatever other facility cross links - it 
would be part of a generic/uniform event logging facility to begin with. ERST would 
only implement its own, narrow, hardware-specific event accessor methods - nothing 
else. Basically a small 'event driver'. This would be the most optimal, smallest, 
easiest to maintain approach - with no facility duplication and no fragmentation.

It's certainly more work as well _for the first such example_ - but from that point 
on any new hardware facility can be added with ease, and those too will fit into 
existing tooling in a very natural way.

So please help out with the persistent events work. If you need any pointers we'd be 
glad to help.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [NAK] Re: [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-25 17:10                     ` Ingo Molnar
@ 2010-10-27  8:25                       ` Ingo Molnar
  0 siblings, 0 replies; 48+ messages in thread
From: Ingo Molnar @ 2010-10-27  8:25 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Len Brown, linux-kernel, linux-acpi, Borislav Petkov,
	Thomas Gleixner, H. Peter Anvin, Don Zickus, Linus Torvalds,
	Andrew Morton, Mauro Carvalho Chehab, Arjan van de Ven


Btw., just for the record. Two days ago you started getting personal:

* Andi Kleen <andi@firstfloor.org> wrote:

> That's not really the old Ingo I used to know.

But when i (again) outlayed all the technical reasons for the NAK, you did not 
answer. You also did not answer Thomas's technical mail either.

I've seen this behavior many times from you: when the technical argument goes 
against you, when many people are telling you that you are mostly wrong, you do not 
concede, you do not argue in support of your position, you simply ignore the 
discussion from that point on and you pretend that the discussion does not exist 
anymore.

Then a few weeks/months down the line this same pattern repeats: there's another 
similar incident somewhere else, and i'm (or others) are forced to NAK the hard way. 
You then claim it's unreasonable and you act as if the prior discussions did not 
happen at all. Rinse and repeat. Most people dont remember so it even looks 
plausible to the casual observer.

The thing is, upstream NAKs need to be addressed in good faith.

Again: i'm NAK-ing the use of the x86 NMI notifier chain in this fashion and i'm 
NAK-ing the exports that this patch is adding to arch/x86/. It's not the proper 
approach for the many reasons we outlined - debug facilities want to be organized in 
the core, not spread out to vendor-specific fringes.

I wouldnt mind as much if this was a driver for hw few people care about - but this 
looks like a useful piece of Intel hardware whose Linux support you are messing up 
thoroughly by exporting an ABI in the worst possible way. We can and should do 
better than this.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2010-10-27  8:26 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-10-25  7:43 [PATCH -v2 0/9] ACPI, APEI patches for 2.6.37 Huang Ying
2010-10-25  7:43 ` [PATCH -v2 1/9] ACPI, APEI, Add ERST record ID cache Huang Ying
2010-10-25  7:43 ` [PATCH -v2 2/9] Add lock-less version of bitmap_set/clear Huang Ying
2010-10-25  7:43 ` [PATCH -v2 3/9] lock-less NULL terminated single list implementation Huang Ying
2010-10-25  7:43 ` [PATCH -v2 4/9] lock-less general memory allocator Huang Ying
2010-10-25  7:43 ` [PATCH -v2 5/9] Hardware error device core Huang Ying
2010-10-25  7:43 ` [PATCH -v2 6/9] Hardware error record persistent support Huang Ying
2010-10-25  7:43 ` [PATCH -v2 7/9] ACPI, APEI, Use ERST for hardware error persisting before panic Huang Ying
2010-10-25  7:43 ` [PATCH -v2 8/9] ACPI, APEI, Report GHES error record with hardware error device core Huang Ying
2010-10-25  7:43 ` [PATCH -v2 9/9] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support Huang Ying
2010-10-25  8:45   ` [NAK] " Ingo Molnar
2010-10-25  8:58     ` Huang Ying
2010-10-25  9:19       ` Andi Kleen
2010-10-25 11:15         ` Ingo Molnar
2010-10-25 12:04           ` Mauro Carvalho Chehab
2010-10-25 17:07             ` Tony Luck
2010-10-25 17:19               ` Mauro Carvalho Chehab
2010-10-25 12:37           ` Andi Kleen
2010-10-25 12:55             ` Ingo Molnar
2010-10-25 13:02               ` Ingo Molnar
2010-10-25 13:11               ` Andi Kleen
2010-10-25 13:47                 ` Ingo Molnar
2010-10-25 15:14                   ` Andi Kleen
2010-10-25 17:10                     ` Ingo Molnar
2010-10-27  8:25                       ` Ingo Molnar
2010-10-25 16:38         ` Thomas Gleixner
2010-10-25  9:25       ` Ingo Molnar
2010-10-25 17:14         ` Tony Luck
2010-10-25 20:23           ` Borislav Petkov
2010-10-25 21:23             ` Tony Luck
2010-10-25 21:23               ` Tony Luck
2010-10-25 21:51               ` Borislav Petkov
2010-10-25 21:51                 ` Borislav Petkov
2010-10-25 23:35                 ` Tony Luck
2010-10-26  6:26                   ` Borislav Petkov
2010-10-26  6:26                     ` Borislav Petkov
2010-10-25 23:35                 ` Tony Luck
2010-10-26  1:06     ` Len Brown
2010-10-26  4:53       ` Thomas Gleixner
2010-10-26  7:22         ` Ingo Molnar
2010-10-26  7:30           ` Huang Ying
2010-10-26  7:55             ` Ingo Molnar
2010-10-26  8:32               ` Huang Ying
2010-10-26 10:03                 ` Ingo Molnar
2010-10-26  8:38         ` Andi Kleen
2010-10-26 10:00           ` Thomas Gleixner
2010-10-26  8:52         ` Huang Ying
2010-10-26 10:15           ` Ingo Molnar

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.