All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37
@ 2010-10-27  5:28 Huang Ying
  2010-10-27  5:28 ` [PATCH -v3 1/8] ACPI, APEI, Add ERST record ID cache Huang Ying
                   ` (8 more replies)
  0 siblings, 9 replies; 17+ messages in thread
From: Huang Ying @ 2010-10-27  5:28 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

v3:

- Rework lock-less memory allocator and lock-less list.

v2:

- Some minor changes according to Andi's comments.

[PATCH -v3 1/8] ACPI, APEI, Add ERST record ID cache
[PATCH -v3 2/8] lib, Make gen_pool memory allocator lock-less
[PATCH -v3 3/8] lib, Add lock-less NULL terminated single list
[PATCH -v3 4/8] Hardware error device core
[PATCH -v3 5/8] Hardware error record persistent support
[PATCH -v3 6/8] ACPI, APEI, Use ERST for hardware error persisting before panic
[PATCH -v3 7/8] ACPI, APEI, Report GHES error record with hardware error device core
[PATCH -v3 8/8] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH -v3 1/8] ACPI, APEI, Add ERST record ID cache
  2010-10-27  5:28 [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Huang Ying
@ 2010-10-27  5:28 ` Huang Ying
  2010-10-27  5:28 ` [PATCH -v3 2/8] lib, Make gen_pool memory allocator lock-less Huang Ying
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Huang Ying @ 2010-10-27  5:28 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

The APEI ERST firmware interface and implementation was not designed
with multiple users in mind. For example, if there are four records in
storage with IDs 1, 2, 3 and 4, and two ERST readers enumerate the
records via GET_NEXT_RECORD_ID as follows,

reader 1		reader 2
1
			2
3
			4
-1
			-1

where -1 signals there is no more record ID.

Reader 1 has no chance to check records 2 and 4, while reader 2 has no
chance to check records 1 and 3. Any further GET_NEXT_RECORD_ID will
return -1, that is, other readers will have no chance to check any
record even though none of them have been cleared by anyone.

This makes the raw GET_NEXT_RECORD_ID unsuitable for use by multiple
users.

To solve the issue, an in-memory ERST record ID cache is designed and
implemented. When enumerating record IDs, the ID returned by
GET_NEXT_RECORD_ID is added to the cache in addition to being returned
to the caller, so other readers can check the cache to get all
available record IDs.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-apei.c |   40 +++--
 drivers/acpi/apei/erst-dbg.c          |   24 ++-
 drivers/acpi/apei/erst.c              |  241 +++++++++++++++++++++++++++-------
 include/acpi/apei.h                   |    5 
 4 files changed, 246 insertions(+), 64 deletions(-)

--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m)
 ssize_t apei_read_mce(struct mce *m, u64 *record_id)
 {
 	struct cper_mce_record rcd;
-	ssize_t len;
-
-	len = erst_read_next(&rcd.hdr, sizeof(rcd));
-	if (len <= 0)
-		return len;
-	/* Can not skip other records in storage via ERST unless clear them */
-	else if (len != sizeof(rcd) ||
-		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
-		if (printk_ratelimit())
-			pr_warning(
-			"MCE-APEI: Can not skip the unknown record in ERST");
-		return -EIO;
-	}
+	int rc, pos;
 
+	rc = erst_get_record_id_begin(&pos);
+	if (rc)
+		return rc;
+retry:
+	rc = erst_get_record_id_next(&pos, record_id);
+	if (rc)
+		goto out;
+	/* no more record */
+	if (*record_id == APEI_ERST_INVALID_RECORD_ID)
+		goto out;
+	rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
+	/* someone else has cleared the record, try next one */
+	if (rc == -ENOENT)
+		goto retry;
+	else if (rc < 0)
+		goto out;
+	/* try to skip other type records in storage */
+	else if (rc != sizeof(rcd) ||
+		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
+		goto retry;
 	memcpy(m, &rcd.mce, sizeof(*m));
-	*record_id = rcd.hdr.record_id;
+	rc = sizeof(*m);
+out:
+	erst_get_record_id_end();
 
-	return sizeof(*m);
+	return rc;
 }
 
 /* Check whether there is record in ERST */
--- a/drivers/acpi/apei/erst-dbg.c
+++ b/drivers/acpi/apei/erst-dbg.c
@@ -43,12 +43,27 @@ static DEFINE_MUTEX(erst_dbg_mutex);
 
 static int erst_dbg_open(struct inode *inode, struct file *file)
 {
+	int rc, *pos;
+
 	if (erst_disable)
 		return -ENODEV;
 
+	pos = (int *)&file->private_data;
+
+	rc = erst_get_record_id_begin(pos);
+	if (rc)
+		return rc;
+
 	return nonseekable_open(inode, file);
 }
 
+static int erst_dbg_release(struct inode *inode, struct file *file)
+{
+	erst_get_record_id_end();
+
+	return 0;
+}
+
 static long erst_dbg_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 {
 	int rc;
@@ -79,18 +94,20 @@ static long erst_dbg_ioctl(struct file *
 static ssize_t erst_dbg_read(struct file *filp, char __user *ubuf,
 			     size_t usize, loff_t *off)
 {
-	int rc;
+	int rc, *pos;
 	ssize_t len = 0;
 	u64 id;
 
-	if (*off != 0)
+	if (*off)
 		return -EINVAL;
 
 	if (mutex_lock_interruptible(&erst_dbg_mutex) != 0)
 		return -EINTR;
 
+	pos = (int *)&filp->private_data;
+
 retry_next:
-	rc = erst_get_next_record_id(&id);
+	rc = erst_get_record_id_next(pos, &id);
 	if (rc)
 		goto out;
 	/* no more record */
@@ -181,6 +198,7 @@ out:
 static const struct file_operations erst_dbg_ops = {
 	.owner		= THIS_MODULE,
 	.open		= erst_dbg_open,
+	.release	= erst_dbg_release,
 	.read		= erst_dbg_read,
 	.write		= erst_dbg_write,
 	.unlocked_ioctl	= erst_dbg_ioctl,
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -429,6 +429,27 @@ ssize_t erst_get_record_count(void)
 }
 EXPORT_SYMBOL_GPL(erst_get_record_count);
 
+#define ERST_RECORD_ID_CACHE_SIZE_MIN	16
+#define ERST_RECORD_ID_CACHE_SIZE_MAX	1024
+
+struct erst_record_id_entry {
+	u64 id;
+	unsigned int cleared:1;
+};
+
+struct erst_record_id_cache {
+	struct mutex lock;
+	struct erst_record_id_entry *entries;
+	int len;
+	int size;
+	int refcount;
+};
+
+static struct erst_record_id_cache erst_record_id_cache = {
+	.lock = __MUTEX_INITIALIZER(erst_record_id_cache.lock),
+	.refcount = 0,
+};
+
 static int __erst_get_next_record_id(u64 *record_id)
 {
 	struct apei_exec_context ctx;
@@ -443,26 +464,180 @@ static int __erst_get_next_record_id(u64
 	return 0;
 }
 
+int erst_get_record_id_begin(int *pos)
+{
+	int rc;
+
+	if (erst_disable)
+		return -ENODEV;
+
+	rc = mutex_lock_interruptible(&erst_record_id_cache.lock);
+	if (rc)
+		return rc;
+	erst_record_id_cache.refcount++;
+	mutex_unlock(&erst_record_id_cache.lock);
+
+	*pos = 0;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(erst_get_record_id_begin);
+
+/* erst_record_id_cache.lock must be held by caller */
+static int erst_record_id_cache_add_one(void)
+{
+	u64 id, prev_id, first_id;
+	int i, rc;
+	struct erst_record_id_entry *entries;
+	unsigned long flags;
+
+	id = prev_id = first_id = APEI_ERST_INVALID_RECORD_ID;
+retry:
+	spin_lock_irqsave(&erst_lock, flags);
+	rc = __erst_get_next_record_id(&id);
+	spin_unlock_irqrestore(&erst_lock, flags);
+	if (rc == -ENOENT)
+		return 0;
+	if (rc)
+		return rc;
+	if (id == APEI_ERST_INVALID_RECORD_ID)
+		return 0;
+	/* can not skip current ID, or look back to first ID */
+	if (id == prev_id || id == first_id)
+		return 0;
+	if (first_id == APEI_ERST_INVALID_RECORD_ID)
+		first_id = id;
+	prev_id = id;
+
+	entries = erst_record_id_cache.entries;
+	for (i = 0; i < erst_record_id_cache.len; i++) {
+		if (!entries[i].cleared && entries[i].id == id)
+			break;
+	}
+	/* record id already in cache, try next */
+	if (i < erst_record_id_cache.len)
+		goto retry;
+	if (erst_record_id_cache.len >= erst_record_id_cache.size) {
+		int new_size, alloc_size;
+		struct erst_record_id_entry *new_entries;
+
+		new_size = erst_record_id_cache.size * 2;
+		new_size = clamp_val(new_size, ERST_RECORD_ID_CACHE_SIZE_MIN,
+				     ERST_RECORD_ID_CACHE_SIZE_MAX);
+		if (new_size <= erst_record_id_cache.size) {
+			if (printk_ratelimit())
+				pr_warning(FW_WARN ERST_PFX
+					   "too many record ID!\n");
+			return 0;
+		}
+		alloc_size = new_size * sizeof(struct erst_record_id_entry);
+		if (alloc_size < PAGE_SIZE)
+			new_entries = kmalloc(alloc_size, GFP_KERNEL);
+		else
+			new_entries = vmalloc(alloc_size);
+		if (!new_entries)
+			return -ENOMEM;
+		memcpy(new_entries, entries,
+		       erst_record_id_cache.len * sizeof(entries[0]));
+		if (erst_record_id_cache.size < PAGE_SIZE)
+			kfree(entries);
+		else
+			vfree(entries);
+		erst_record_id_cache.entries = entries = new_entries;
+		erst_record_id_cache.size = new_size;
+	}
+	memset(&entries[i], 0, sizeof(entries[i]));
+	entries[i].id = id;
+	erst_record_id_cache.len++;
+
+	return 1;
+}
+
 /*
  * Get the record ID of an existing error record on the persistent
  * storage. If there is no error record on the persistent storage, the
  * returned record_id is APEI_ERST_INVALID_RECORD_ID.
  */
-int erst_get_next_record_id(u64 *record_id)
+int erst_get_record_id_next(int *pos, u64 *record_id)
 {
-	int rc;
-	unsigned long flags;
+	int rc = 0;
+	struct erst_record_id_entry *entries;
 
 	if (erst_disable)
 		return -ENODEV;
 
-	spin_lock_irqsave(&erst_lock, flags);
-	rc = __erst_get_next_record_id(record_id);
-	spin_unlock_irqrestore(&erst_lock, flags);
+	/* must be enclosed by erst_get_record_id_begin/end */
+	BUG_ON(!erst_record_id_cache.refcount);
+	BUG_ON(*pos < 0 || *pos > erst_record_id_cache.len);
+
+	mutex_lock(&erst_record_id_cache.lock);
+	entries = erst_record_id_cache.entries;
+	for (; *pos < erst_record_id_cache.len; (*pos)++)
+		if (!entries[*pos].cleared)
+			break;
+	/* found next record id in cache */
+	if (*pos < erst_record_id_cache.len) {
+		*record_id = entries[*pos].id;
+		(*pos)++;
+		goto out_unlock;
+	}
+
+	/* Try to add one more record ID to cache */
+	rc = erst_record_id_cache_add_one();
+	if (rc < 0)
+		goto out_unlock;
+	/* successfully add one new ID */
+	if (rc == 1) {
+		entries = erst_record_id_cache.entries;
+		*record_id = entries[*pos].id;
+		(*pos)++;
+		rc = 0;
+	} else {
+		*pos = -1;
+		*record_id = APEI_ERST_INVALID_RECORD_ID;
+	}
+out_unlock:
+	mutex_unlock(&erst_record_id_cache.lock);
 
 	return rc;
 }
-EXPORT_SYMBOL_GPL(erst_get_next_record_id);
+EXPORT_SYMBOL_GPL(erst_get_record_id_next);
+
+static void __erst_record_id_cache_compact(void)
+{
+	int i, wpos = 0;
+	struct erst_record_id_entry *entries;
+
+	if (!erst_record_id_cache.refcount) {
+		entries = erst_record_id_cache.entries;
+		for (i = 0; i < erst_record_id_cache.len; i++) {
+			if (entries[i].cleared)
+				continue;
+			if (wpos != i)
+				memcpy(&entries[wpos], &entries[i],
+				       sizeof(entries[i]));
+			wpos++;
+		}
+		erst_record_id_cache.len = wpos;
+	}
+}
+
+void erst_get_record_id_end(void)
+{
+	/*
+	 * erst_disable != 0 should be detected by invoker via the
+	 * return value of erst_get_record_id_begin/next, so this
+	 * function should not be called for erst_disable != 0.
+	 */
+	BUG_ON(erst_disable);
+
+	mutex_lock(&erst_record_id_cache.lock);
+	erst_record_id_cache.refcount--;
+	BUG_ON(erst_record_id_cache.refcount < 0);
+	__erst_record_id_cache_compact();
+	mutex_unlock(&erst_record_id_cache.lock);
+}
+EXPORT_SYMBOL_GPL(erst_get_record_id_end);
 
 static int __erst_write_to_storage(u64 offset)
 {
@@ -703,56 +878,34 @@ ssize_t erst_read(u64 record_id, struct
 }
 EXPORT_SYMBOL_GPL(erst_read);
 
-/*
- * If return value > buflen, the buffer size is not big enough,
- * else if return value = 0, there is no more record to read,
- * else if return value < 0, something goes wrong,
- * else everything is OK, and return value is record length
- */
-ssize_t erst_read_next(struct cper_record_header *record, size_t buflen)
-{
-	int rc;
-	ssize_t len;
-	unsigned long flags;
-	u64 record_id;
-
-	if (erst_disable)
-		return -ENODEV;
-
-	spin_lock_irqsave(&erst_lock, flags);
-	rc = __erst_get_next_record_id(&record_id);
-	if (rc) {
-		spin_unlock_irqrestore(&erst_lock, flags);
-		return rc;
-	}
-	/* no more record */
-	if (record_id == APEI_ERST_INVALID_RECORD_ID) {
-		spin_unlock_irqrestore(&erst_lock, flags);
-		return 0;
-	}
-
-	len = __erst_read(record_id, record, buflen);
-	spin_unlock_irqrestore(&erst_lock, flags);
-
-	return len;
-}
-EXPORT_SYMBOL_GPL(erst_read_next);
-
 int erst_clear(u64 record_id)
 {
-	int rc;
+	int rc, i;
 	unsigned long flags;
+	struct erst_record_id_entry *entries;
 
 	if (erst_disable)
 		return -ENODEV;
 
+	rc = mutex_lock_interruptible(&erst_record_id_cache.lock);
+	if (rc)
+		return rc;
 	spin_lock_irqsave(&erst_lock, flags);
 	if (erst_erange.attr & ERST_RANGE_NVRAM)
 		rc = __erst_clear_from_nvram(record_id);
 	else
 		rc = __erst_clear_from_storage(record_id);
 	spin_unlock_irqrestore(&erst_lock, flags);
-
+	if (rc)
+		goto out;
+	entries = erst_record_id_cache.entries;
+	for (i = 0; i < erst_record_id_cache.len; i++) {
+		if (!entries[i].cleared && entries[i].id == record_id)
+			entries[i].cleared = 1;
+	}
+	__erst_record_id_cache_compact();
+out:
+	mutex_unlock(&erst_record_id_cache.lock);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(erst_clear);
--- a/include/acpi/apei.h
+++ b/include/acpi/apei.h
@@ -24,10 +24,11 @@ int apei_hest_parse(apei_hest_func_t fun
 
 int erst_write(const struct cper_record_header *record);
 ssize_t erst_get_record_count(void);
-int erst_get_next_record_id(u64 *record_id);
+int erst_get_record_id_begin(int *pos);
+int erst_get_record_id_next(int *pos, u64 *record_id);
+void erst_get_record_id_end(void);
 ssize_t erst_read(u64 record_id, struct cper_record_header *record,
 		  size_t buflen);
-ssize_t erst_read_next(struct cper_record_header *record, size_t buflen);
 int erst_clear(u64 record_id);
 
 #endif

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH -v3 2/8] lib, Make gen_pool memory allocator lock-less
  2010-10-27  5:28 [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Huang Ying
  2010-10-27  5:28 ` [PATCH -v3 1/8] ACPI, APEI, Add ERST record ID cache Huang Ying
@ 2010-10-27  5:28 ` Huang Ying
  2010-10-27  9:17   ` Andi Kleen
  2010-10-27  5:28 ` [PATCH -v3 3/8] lib, Add lock-less NULL terminated single list Huang Ying
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 17+ messages in thread
From: Huang Ying @ 2010-10-27  5:28 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Multiple users can allocate/free memory in the pool simultaneously
without a lock.  This also makes the gen_pool memory allocator usable
for allocating/freeing memory in IRQ/NMI handlers. This is useful for
hardware error handling, which needs to collect hardware error
information in IRQ/NMI handlers.

To support lock-less allocate/free operation, lock-less bitmap
set/clear operations are implemented.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/bitmap.h   |    1 
 include/linux/genalloc.h |   32 +++++--
 lib/bitmap.c             |    2 
 lib/genalloc.c           |  211 +++++++++++++++++++++++++++++++++++++----------
 4 files changed, 195 insertions(+), 51 deletions(-)
 create mode 100644 include/linux/llalloc.h
 create mode 100644 lib/llalloc.c

--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -1,8 +1,8 @@
 /*
- * Basic general purpose allocator for managing special purpose memory
- * not managed by the regular kmalloc/kfree interface.
- * Uses for this includes on-device special memory, uncached memory
- * etc.
+ * Basic general purpose allocator for managing special purpose
+ * memory, for example, memory that is not managed by the regular
+ * kmalloc/kfree interface.  Uses for this includes on-device special
+ * memory, uncached memory etc.
  *
  * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org>
  *
@@ -13,8 +13,107 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/bitmap.h>
+#include <linux/rculist.h>
 #include <linux/genalloc.h>
 
+static inline int set_bits_ll(unsigned long *addr, unsigned long mask_to_set)
+{
+	unsigned long val, nval;
+
+	nval = *addr;
+	do {
+		val = nval;
+		if (val & mask_to_set)
+			return -EBUSY;
+	} while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val);
+
+	return 0;
+}
+
+static inline int clear_bits_ll(unsigned long *addr,
+				unsigned long mask_to_clear)
+{
+	unsigned long val, nval;
+
+	nval = *addr;
+	do {
+		val = nval;
+		if ((val & mask_to_clear) != mask_to_clear)
+			return -EBUSY;
+	} while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val);
+
+	return 0;
+}
+
+/*
+ * bitmap_set_ll - set the specified number of bits at the specified position
+ * @map: pointer to a bitmap
+ * @start: a bit position in @map
+ * @nr: number of bits to set
+ *
+ * Set @nr bits start from @start in @map lock-lessly. Several users
+ * can set/clear the same bitmap simultaneously without lock. If two
+ * users set the same bit, one user will return remain bits, otherwise
+ * return 0.
+ */
+static int bitmap_set_ll(unsigned long *map, int start, int nr)
+{
+	unsigned long *p = map + BIT_WORD(start);
+	const int size = start + nr;
+	int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+	unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+	while (nr - bits_to_set >= 0) {
+		if (set_bits_ll(p, mask_to_set))
+			return nr;
+		nr -= bits_to_set;
+		bits_to_set = BITS_PER_LONG;
+		mask_to_set = ~0UL;
+		p++;
+	}
+	if (nr) {
+		mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+		if (set_bits_ll(p, mask_to_set))
+			return nr;
+	}
+
+	return 0;
+}
+
+/*
+ * bitmap_clear_ll - clear the specified number of bits at the specified position
+ * @map: pointer to a bitmap
+ * @start: a bit position in @map
+ * @nr: number of bits to set
+ *
+ * Clear @nr bits start from @start in @map lock-lessly. Several users
+ * can set/clear the same bitmap simultaneously without lock. If two
+ * users clear the same bit, one user will return remain bits,
+ * otherwise return 0.
+ */
+static int bitmap_clear_ll(unsigned long *map, int start, int nr)
+{
+	unsigned long *p = map + BIT_WORD(start);
+	const int size = start + nr;
+	int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+	unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+	while (nr - bits_to_clear >= 0) {
+		if (clear_bits_ll(p, mask_to_clear))
+			return nr;
+		nr -= bits_to_clear;
+		bits_to_clear = BITS_PER_LONG;
+		mask_to_clear = ~0UL;
+		p++;
+	}
+	if (nr) {
+		mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+		if (clear_bits_ll(p, mask_to_clear))
+			return nr;
+	}
+
+	return 0;
+}
 
 /**
  * gen_pool_create - create a new special memory pool
@@ -30,7 +129,7 @@ struct gen_pool *gen_pool_create(int min
 
 	pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid);
 	if (pool != NULL) {
-		rwlock_init(&pool->lock);
+		spin_lock_init(&pool->lock);
 		INIT_LIST_HEAD(&pool->chunks);
 		pool->min_alloc_order = min_alloc_order;
 	}
@@ -58,15 +157,15 @@ int gen_pool_add(struct gen_pool *pool,
 
 	chunk = kmalloc_node(nbytes, GFP_KERNEL | __GFP_ZERO, nid);
 	if (unlikely(chunk == NULL))
-		return -1;
+		return -ENOMEM;
 
-	spin_lock_init(&chunk->lock);
 	chunk->start_addr = addr;
 	chunk->end_addr = addr + size;
+	atomic_set(&chunk->avail, size);
 
-	write_lock(&pool->lock);
-	list_add(&chunk->next_chunk, &pool->chunks);
-	write_unlock(&pool->lock);
+	spin_lock(&pool->lock);
+	list_add_rcu(&chunk->next_chunk, &pool->chunks);
+	spin_unlock(&pool->lock);
 
 	return 0;
 }
@@ -112,39 +211,39 @@ EXPORT_SYMBOL(gen_pool_destroy);
  */
 unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
 {
-	struct list_head *_chunk;
 	struct gen_pool_chunk *chunk;
-	unsigned long addr, flags;
+	unsigned long addr;
 	int order = pool->min_alloc_order;
-	int nbits, start_bit, end_bit;
+	int nbits, start_bit = 0, end_bit, remain;
 
 	if (size == 0)
 		return 0;
 
 	nbits = (size + (1UL << order) - 1) >> order;
 
-	read_lock(&pool->lock);
-	list_for_each(_chunk, &pool->chunks) {
-		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
+	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
+		if (size > atomic_read(&chunk->avail))
+			continue;
 
 		end_bit = (chunk->end_addr - chunk->start_addr) >> order;
-
-		spin_lock_irqsave(&chunk->lock, flags);
-		start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, 0,
-						nbits, 0);
-		if (start_bit >= end_bit) {
-			spin_unlock_irqrestore(&chunk->lock, flags);
+retry:
+		start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit,
+						       start_bit, nbits, 0);
+		if (start_bit >= end_bit)
 			continue;
+		remain = bitmap_set_ll(chunk->bits, start_bit, nbits);
+		if (remain) {
+			remain = bitmap_clear_ll(chunk->bits, start_bit,
+						 nbits - remain);
+			BUG_ON(remain);
+			goto retry;
 		}
 
 		addr = chunk->start_addr + ((unsigned long)start_bit << order);
-
-		bitmap_set(chunk->bits, start_bit, nbits);
-		spin_unlock_irqrestore(&chunk->lock, flags);
-		read_unlock(&pool->lock);
+		size = nbits << order;
+		atomic_sub(size, &chunk->avail);
 		return addr;
 	}
-	read_unlock(&pool->lock);
 	return 0;
 }
 EXPORT_SYMBOL(gen_pool_alloc);
@@ -159,29 +258,57 @@ EXPORT_SYMBOL(gen_pool_alloc);
  */
 void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
 {
-	struct list_head *_chunk;
 	struct gen_pool_chunk *chunk;
-	unsigned long flags;
 	int order = pool->min_alloc_order;
-	int bit, nbits;
+	int start_bit, nbits, remain;
 
 	nbits = (size + (1UL << order) - 1) >> order;
 
-	read_lock(&pool->lock);
-	list_for_each(_chunk, &pool->chunks) {
-		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
-
+	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
 		if (addr >= chunk->start_addr && addr < chunk->end_addr) {
 			BUG_ON(addr + size > chunk->end_addr);
-			spin_lock_irqsave(&chunk->lock, flags);
-			bit = (addr - chunk->start_addr) >> order;
-			while (nbits--)
-				__clear_bit(bit++, chunk->bits);
-			spin_unlock_irqrestore(&chunk->lock, flags);
-			break;
+			start_bit = (addr - chunk->start_addr) >> order;
+			remain = bitmap_clear_ll(chunk->bits, start_bit, nbits);
+			BUG_ON(remain);
+			size = nbits << order;
+			atomic_add(size, &chunk->avail);
+			return;
 		}
 	}
-	BUG_ON(nbits > 0);
-	read_unlock(&pool->lock);
+	BUG();
 }
 EXPORT_SYMBOL(gen_pool_free);
+
+/**
+ * gen_pool_avail - get available free space of the pool
+ * @pool: pool to get available free space
+ *
+ * Return available free space of the specified pool.
+ */
+size_t gen_pool_avail(struct gen_pool *pool)
+{
+	struct gen_pool_chunk *chunk;
+	size_t avail = 0;
+
+	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
+		avail += atomic_read(&chunk->avail);
+	return avail;
+}
+EXPORT_SYMBOL_GPL(gen_pool_avail);
+
+/**
+ * gen_pool_size - get size in bytes of memory managed by the pool
+ * @pool: pool to get size
+ *
+ * Return size in bytes of memory managed by the pool.
+ */
+size_t gen_pool_size(struct gen_pool *pool)
+{
+	struct gen_pool_chunk *chunk;
+	size_t size = 0;
+
+	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
+		size += chunk->end_addr - chunk->start_addr;
+	return size;
+}
+EXPORT_SYMBOL_GPL(gen_pool_size);
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -1,19 +1,26 @@
+#ifndef GENALLOC_H
+#define GENALLOC_H
 /*
- * Basic general purpose allocator for managing special purpose memory
- * not managed by the regular kmalloc/kfree interface.
- * Uses for this includes on-device special memory, uncached memory
- * etc.
+ * Basic general purpose allocator for managing special purpose
+ * memory, for example, memory not managed by the regular
+ * kmalloc/kfree interface.  Uses for this includes on-device special
+ * memory, uncached memory etc.
+ *
+ * The gen_pool_alloc, gen_pool_free, gen_pool_avail and gen_pool_size
+ * implementation is lock-less, that is, multiple users can
+ * allocate/free memory in the pool simultaneously without lock.  This
+ * also makes the gen_pool memory allocator can be used to
+ * allocate/free memory in IRQ/NMI handler.
  *
  * This source code is licensed under the GNU General Public License,
  * Version 2.  See the file COPYING for more details.
  */
 
-
 /*
  *  General purpose special memory pool descriptor.
  */
 struct gen_pool {
-	rwlock_t lock;
+	spinlock_t lock;
 	struct list_head chunks;	/* list of chunks in this pool */
 	int min_alloc_order;		/* minimum allocation order */
 };
@@ -22,15 +29,26 @@ struct gen_pool {
  *  General purpose special memory pool chunk descriptor.
  */
 struct gen_pool_chunk {
-	spinlock_t lock;
 	struct list_head next_chunk;	/* next chunk in pool */
+	atomic_t avail;
 	unsigned long start_addr;	/* starting address of memory chunk */
 	unsigned long end_addr;		/* ending address of memory chunk */
 	unsigned long bits[0];		/* bitmap for allocating memory chunk */
 };
 
+/**
+ * gen_pool_for_each_chunk - iterate over chunks of generic memory pool
+ * @chunk:	the struct gen_pool_chunk * to use as a loop cursor
+ * @pool:	the generic memory pool
+ */
+#define gen_pool_for_each_chunk(chunk, pool)			\
+	list_for_each_entry(chunk, &pool->chunks, next_chunk)
+
 extern struct gen_pool *gen_pool_create(int, int);
 extern int gen_pool_add(struct gen_pool *, unsigned long, size_t, int);
 extern void gen_pool_destroy(struct gen_pool *);
 extern unsigned long gen_pool_alloc(struct gen_pool *, size_t);
 extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
+extern size_t gen_pool_avail(struct gen_pool *);
+extern size_t gen_pool_size(struct gen_pool *);
+#endif /* GENALLOC_H */
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -271,8 +271,6 @@ int __bitmap_weight(const unsigned long
 }
 EXPORT_SYMBOL(__bitmap_weight);
 
-#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
-
 void bitmap_set(unsigned long *map, int start, int nr)
 {
 	unsigned long *p = map + BIT_WORD(start);
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -142,6 +142,7 @@ extern void bitmap_release_region(unsign
 extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
 extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
 
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
 #define BITMAP_LAST_WORD_MASK(nbits)					\
 (									\
 	((nbits) % BITS_PER_LONG) ?					\

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH -v3 3/8] lib, Add lock-less NULL terminated single list
  2010-10-27  5:28 [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Huang Ying
  2010-10-27  5:28 ` [PATCH -v3 1/8] ACPI, APEI, Add ERST record ID cache Huang Ying
  2010-10-27  5:28 ` [PATCH -v3 2/8] lib, Make gen_pool memory allocator lock-less Huang Ying
@ 2010-10-27  5:28 ` Huang Ying
  2010-10-27  5:28 ` [PATCH -v3 4/8] Hardware error device core Huang Ying
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Huang Ying @ 2010-10-27  5:28 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Cmpxchg is used to implement adding a new entry to the list, deleting
all entries from the list, deleting the first entry of the list, and
some other operations.

Because this is a singly linked list, the tail can not be accessed in O(1).

This can be used in NMI handler.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/llist.h |   92 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig           |    3 +
 lib/Makefile          |    2 +
 lib/llist.c           |   80 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 177 insertions(+)
 create mode 100644 include/linux/llist.h
 create mode 100644 lib/llist.c

--- /dev/null
+++ b/include/linux/llist.h
@@ -0,0 +1,92 @@
+#ifndef LLIST_H
+#define LLIST_H
+/*
+ * Lock-less NULL terminated single linked list
+ *
+ * If there are multiple producers and multiple consumers, llist_add
+ * can be used in producers and llist_del_all can be used in
+ * consumers.  They can work simultaneously without lock.  But
+ * llist_del_first can not be used here.  Because llist_del_first
+ * depends on list->first->next does not changed if list->first is not
+ * changed during its operation, but llist_del_first, llist_add,
+ * llist_add sequence in another consumer may violate that.
+ *
+ * If there are multiple producers and one consumer, llist_add can be
+ * used in producers and llist_del_all or llist_del_first can be used
+ * in the consumer.
+ *
+ * The list entries deleted via llist_del_all can be traversed with
+ * traversing function such as llist_for_each etc.  But the list
+ * entries can not be traversed safely before deleted from the list
+ * without proper synchronization with the list consumers.
+ */
+
+struct llist_head {
+	struct llist_node *first;
+};
+
+struct llist_node {
+	struct llist_node *next;
+};
+
+#define LLIST_HEAD_INIT(name)	{ NULL }
+#define LLIST_HEAD(name)	struct llist_head name = LLIST_HEAD_INIT(name)
+
+/**
+ * init_llist_head - initialize lock-less list head
+ * @head:	the head for your lock-less list
+ */
+static inline void init_llist_head(struct llist_head *list)
+{
+	list->first = NULL;
+}
+
+/**
+ * llist_entry - get the struct of this entry
+ * @ptr:	the &struct llist_node pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the llist_node within the struct.
+ */
+#define llist_entry(ptr, type, member)		\
+	container_of(ptr, type, member)
+
+/**
+ * llist_for_each - iterate over some entries of a lock-less list
+ * @pos:	the &struct llist_node to use as a loop cursor
+ * @node:	the first entry of deleted list entries
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being deleted from list, so start with an entry
+ * instead of list head.
+ */
+#define llist_for_each(pos, node)			\
+	for (pos = (node); pos; pos = pos->next)
+
+/**
+ * llist_for_each_entry - iterate over some entries of lock-less list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @node:	the fist entry of deleted list entries.
+ * @member:	the name of the llist_node with the struct.
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being removed from list, so start with an entry
+ * instead of list head.
+ */
+#define llist_for_each_entry(pos, node, member)				\
+	for (pos = llist_entry((node), typeof(*pos), member);		\
+	     &pos->member != NULL;					\
+	     pos = llist_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * llist_empty - tests whether a lock-less list is empty
+ * @head:	the list to test
+ */
+static inline int llist_empty(const struct llist_head *head)
+{
+	return head->first == NULL;
+}
+
+void llist_add(struct llist_node *new, struct llist_head *head);
+struct llist_node *llist_del_first(struct llist_head *head);
+struct llist_node *llist_del_all(struct llist_head *head);
+#endif /* LLIST_H */
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -210,4 +210,7 @@ config GENERIC_ATOMIC64
 config LRU_CACHE
 	tristate
 
+config LLIST
+	bool
+
 endmenu
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -106,6 +106,8 @@ obj-$(CONFIG_GENERIC_ATOMIC64) += atomic
 
 obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
 
+obj-$(CONFIG_LLIST) += llist.o
+
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
--- /dev/null
+++ b/lib/llist.c
@@ -0,0 +1,80 @@
+/*
+ * Lock-less NULL terminated single linked list
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/llist.h>
+#include <linux/errno.h>
+
+#include <asm/system.h>
+
+/**
+ * llist_add - add a new entry
+ * @new:	new entry to be added
+ * @head:	the head for your lock-less list
+ */
+void llist_add(struct llist_node *new, struct llist_head *head)
+{
+	struct llist_node *entry;
+
+	do {
+		entry = head->first;
+		new->next = entry;
+	} while (cmpxchg(&head->first, entry, new) != entry);
+}
+EXPORT_SYMBOL_GPL(llist_add);
+
+/**
+ * llist_del_first - delete the first entry of lock-less list
+ * @head:	the head for your lock-less list
+ *
+ * If list is empty, return NULL, otherwise, return the first entry deleted.
+ *
+ * Only one llist_del_first user can be used simultaneously with
+ * multiple llist_add users without lock. Because otherwise
+ * llist_del_first, llist_add, llist_add sequence in another user may
+ * change @head->first->next, but keep @head->first. If multiple
+ * consumers are needed, please use llist_del_all.
+ */
+struct llist_node *llist_del_first(struct llist_head *head)
+{
+	struct llist_node *entry;
+
+	do {
+		entry = head->first;
+		if (entry == NULL)
+			return NULL;
+	} while (cmpxchg(&head->first, entry, entry->next) != entry);
+
+	return entry;
+}
+EXPORT_SYMBOL_GPL(llist_del_first);
+
+/**
+ * llist_del_all - delete all entries from lock-less list
+ * @head:	the head of lock-less list to delete all entries
+ *
+ * If list is empty, return NULL, otherwise, delete all entries and
+ * return the pointer to the first entry.
+ */
+struct llist_node *llist_del_all(struct llist_head *head)
+{
+	return xchg(&head->first, NULL);
+}
+EXPORT_SYMBOL_GPL(llist_del_all);

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH -v3 4/8] Hardware error device core
  2010-10-27  5:28 [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (2 preceding siblings ...)
  2010-10-27  5:28 ` [PATCH -v3 3/8] lib, Add lock-less NULL terminated single list Huang Ying
@ 2010-10-27  5:28 ` Huang Ying
  2010-10-27  5:28 ` [PATCH -v3 5/8] Hardware error record persistent support Huang Ying
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Huang Ying @ 2010-10-27  5:28 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Hardware error device is a kind of device which can report hardware
errors.  The examples of hardware error device include APEI GHES, PCIe
AER, etc.

Hardware error device core in this patch provides common services for
various hardware error devices.

Hardware error record data structure is defined to accommodate various
hardware error information.  Several error sections can be
incorporated into one error record to accumulate information for
multiple hardware components related to one error.  For example, for
an error on a device on the secondary side of a PCIe bridge, it is
useful to record error information from PCIe bridge and from the PCIe
device.

To manage various hardware error devices, struct herr_dev is defined
as a "sub-class" of "struct device".  It should be created as the
child device of the "real" hardware error device such as APEI GHES to
represent the hardware error reporting aspect of the device.  The
class of struct herr_dev instances is "error", so that all hardware
error devices can be enumerated via listing all links in
/sys/class/error.

The most important functionality that this patch provides is hardware
error reporting support.  Details are as follows.

A lock-less hardware error record allocator is provided.  So for
hardware error that can be ignored (such as corrected errors), it is
not needed to pre-allocate the error record or allocate the error
record on stack.

After filling in all necessary fields in hardware error record, the
error reporting is quite straightforward, just calling
herr_record_report, parameters are the error record itself and the
corresponding struct herr_dev.  All other special processing including
user space interface is handled by the hardware error device core, which
makes it much easier to write a hardware error handler.

Because the possibility for two or more hardware parts to go error
simultaneously is quite low, it is not necessary to create one device
file for each hardware error device.  One device file (/dev/error)
which mixed error records from all hardware error devices is created
as the user space interface.  Then, the user space hardware error
daemon can do some further processing (including logging into disk),
based on the hardware error records read from the device file.

Although the in-kernel and user space interface of hardware error
reporting is quite simple, there are some special considerations in
the hardware error reporting implementation.

Hardware errors may burst; for example, the same hardware error may be
reported at a high rate within a short interval, which will use up all
pre-allocated memory for error reporting, so that other hardware
errors coming from the same or a different hardware device cannot be logged.
To deal with this issue, a burst control algorithm is implemented.
The logging rate for errors coming from one hardware error device is
throttled based on the available pre-allocated memory for error
reporting.  In this way we can log as many kinds of errors as possible
coming from as many error devices as possible.

This patch is designed by Andi Kleen and Huang Ying.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/char/Kconfig            |    2 
 drivers/char/Makefile           |    2 
 drivers/char/herror/Kconfig     |    4 
 drivers/char/herror/Makefile    |    1 
 drivers/char/herror/herr-core.c |  620 ++++++++++++++++++++++++++++++++++++++++
 include/linux/Kbuild            |    1 
 include/linux/herror.h          |   69 ++++
 include/linux/herror_record.h   |  100 ++++++
 8 files changed, 799 insertions(+)
 create mode 100644 drivers/char/herror/Kconfig
 create mode 100644 drivers/char/herror/Makefile
 create mode 100644 drivers/char/herror/herr-core.c
 create mode 100644 include/linux/herror.h
 create mode 100644 include/linux/herror_record.h

--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -1129,5 +1129,7 @@ config RAMOOPS
 	  This enables panic and oops messages to be logged to a circular
 	  buffer in RAM where it can be read back at some later point.
 
+source "drivers/char/herror/Kconfig"
+
 endmenu
 
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -115,6 +115,8 @@ obj-$(CONFIG_RAMOOPS)		+= ramoops.o
 obj-$(CONFIG_JS_RTC)		+= js-rtc.o
 js-rtc-y = rtc.o
 
+obj-$(CONFIG_HERR_DEV_CORE)	+= herror/
+
 # Files generated that shall be removed upon make clean
 clean-files := consolemap_deftbl.c defkeymap.c
 
--- /dev/null
+++ b/drivers/char/herror/Kconfig
@@ -0,0 +1,4 @@
+config HERR_DEV_CORE
+	bool
+	select LLIST
+	select GENERIC_ALLOCATOR
--- /dev/null
+++ b/drivers/char/herror/Makefile
@@ -0,0 +1 @@
+obj-y				+= herr-core.o
--- /dev/null
+++ b/drivers/char/herror/herr-core.c
@@ -0,0 +1,620 @@
+/*
+ * Hardware error device core
+ *
+ * Hardware error device is a kind of device which can report hardware
+ * errors.  The examples of hardware error device include APEI GHES,
+ * PCIe AER, etc.
+ *
+ * Hardware error device core provides common services for various
+ * hardware error devices, including hardware error record lock-less
+ * allocator, error reporting mechanism, hardware error device
+ * management, etc.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rculist.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <linux/uaccess.h>
+#include <linux/poll.h>
+#include <linux/ratelimit.h>
+#include <linux/nmi.h>
+#include <linux/llist.h>
+#include <linux/genalloc.h>
+#include <linux/herror.h>
+
+#define HERR_NOTIFY_BIT			0
+
+static unsigned long herr_flags;
+
+/*
+ * Record list management and error reporting
+ */
+
+struct herr_node {
+	struct llist_node llist;
+	struct herr_record ercd __attribute__((aligned(HERR_MIN_ALIGN)));
+};
+
+#define HERR_NODE_LEN(rcd_len)					\
+	((rcd_len) + sizeof(struct herr_node) - sizeof(struct herr_record))
+
+#define HERR_MIN_ALLOC_ORDER	HERR_MIN_ALIGN_ORDER
+#define HERR_CHUNKS_PER_CPU	2
+#define HERR_RCD_LIST_NUM	2
+
+struct herr_rcd_lists {
+	struct llist_head *write;
+	struct llist_head *read;
+	struct llist_head heads[HERR_RCD_LIST_NUM];
+};
+
+static DEFINE_PER_CPU(struct herr_rcd_lists, herr_rcd_lists);
+
+static DEFINE_PER_CPU(struct gen_pool *, herr_gen_pool);
+
+static void herr_rcd_lists_init(void)
+{
+	int cpu, i;
+	struct herr_rcd_lists *lists;
+
+	for_each_possible_cpu(cpu) {
+		lists = per_cpu_ptr(&herr_rcd_lists, cpu);
+		for (i = 0; i < HERR_RCD_LIST_NUM; i++)
+			init_llist_head(&lists->heads[i]);
+		lists->write = &lists->heads[0];
+		lists->read = &lists->heads[1];
+	}
+}
+
+static void herr_pool_fini(void)
+{
+	struct gen_pool *pool;
+	struct gen_pool_chunk *chunk;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		pool = per_cpu(herr_gen_pool, cpu);
+		gen_pool_for_each_chunk(chunk, pool)
+			free_page(chunk->start_addr);
+		gen_pool_destroy(pool);
+	}
+}
+
+static int herr_pool_init(void)
+{
+	struct gen_pool **pool;
+	int cpu, rc, nid, i;
+	unsigned long addr;
+
+	for_each_possible_cpu(cpu) {
+		pool = per_cpu_ptr(&herr_gen_pool, cpu);
+		rc = -ENOMEM;
+		nid = cpu_to_node(cpu);
+		*pool = gen_pool_create(HERR_MIN_ALLOC_ORDER, nid);
+		if (!*pool)
+			goto err_pool_fini;
+		for (i = 0; i < HERR_CHUNKS_PER_CPU; i++) {
+			rc = -ENOMEM;
+			addr = __get_free_page(GFP_KERNEL);
+			if (!addr)
+				goto err_pool_fini;
+			rc = gen_pool_add(*pool, addr, PAGE_SIZE, nid);
+			if (rc)
+				goto err_pool_fini;
+		}
+	}
+
+	return 0;
+err_pool_fini:
+	herr_pool_fini();
+	return rc;
+}
+
+/* Max interval: about 2 second */
+#define HERR_BURST_BASE_INTVL	NSEC_PER_USEC
+#define HERR_BURST_MAX_RATIO	21
+#define HERR_BURST_MAX_INTVL						\
+	((1ULL << HERR_BURST_MAX_RATIO) * HERR_BURST_BASE_INTVL)
+/*
+ * Pool size/used ratio considered spare, before this, interval
+ * between error reporting is ignored. After this, minimal interval
+ * needed is increased exponentially to max interval.
+ */
+#define HERR_BURST_SPARE_RATIO	3
+
+static int herr_burst_control(struct herr_dev *edev)
+{
+	struct gen_pool *pool;
+	unsigned long long last, now, min_intvl;
+	unsigned int size, used, ratio;
+
+	pool = __get_cpu_var(herr_gen_pool);
+	size = gen_pool_size(pool);
+	used = size - gen_pool_avail(pool);
+	if (HERR_BURST_SPARE_RATIO * used < size)
+		goto pass;
+	now = trace_clock_local();
+	last = atomic64_read(&edev->timestamp);
+	ratio = (used * HERR_BURST_SPARE_RATIO - size) * HERR_BURST_MAX_RATIO;
+	ratio = ratio / (size * HERR_BURST_SPARE_RATIO - size) + 1;
+	min_intvl = (1ULL << ratio) * HERR_BURST_BASE_INTVL;
+	if ((long long)(now - last) > min_intvl)
+		goto pass;
+	atomic_inc(&edev->bursts);
+	return 0;
+pass:
+	return 1;
+}
+
+static u64 herr_record_next_id(void)
+{
+	static atomic64_t seq = ATOMIC64_INIT(0);
+
+	if (!atomic64_read(&seq))
+		atomic64_set(&seq, (u64)get_seconds() << 32);
+
+	return atomic64_inc_return(&seq);
+}
+
+void herr_record_init(struct herr_record *ercd)
+{
+	ercd->flags = 0;
+	ercd->rev = HERR_RCD_REV1_0;
+	ercd->id = herr_record_next_id();
+	ercd->timestamp = trace_clock_local();
+}
+EXPORT_SYMBOL_GPL(herr_record_init);
+
+struct herr_record *herr_record_alloc(unsigned int len, struct herr_dev *edev,
+				      unsigned int flags)
+{
+	struct gen_pool *pool;
+	struct herr_node *enode;
+	struct herr_record *ercd = NULL;
+
+	preempt_disable();
+	if (!(flags & HERR_ALLOC_NO_BURST_CONTROL)) {
+		if (!herr_burst_control(edev)) {
+			preempt_enable_no_resched();
+			return NULL;
+		}
+	}
+
+	pool = __get_cpu_var(herr_gen_pool);
+	enode = (struct herr_node *)gen_pool_alloc(pool, HERR_NODE_LEN(len));
+	if (enode) {
+		ercd = &enode->ercd;
+		herr_record_init(ercd);
+		ercd->length = len;
+
+		atomic64_set(&edev->timestamp, trace_clock_local());
+		atomic_inc(&edev->logs);
+	} else
+		atomic_inc(&edev->overflows);
+	preempt_enable_no_resched();
+
+	return ercd;
+}
+EXPORT_SYMBOL_GPL(herr_record_alloc);
+
+int herr_record_report(struct herr_record *ercd, struct herr_dev *edev)
+{
+	struct herr_rcd_lists *lists;
+	struct herr_node *enode;
+
+	preempt_disable();
+	lists = this_cpu_ptr(&herr_rcd_lists);
+	enode = container_of(ercd, struct herr_node, ercd);
+	llist_add(&enode->llist, lists->write);
+	preempt_enable_no_resched();
+
+	set_bit(HERR_NOTIFY_BIT, &herr_flags);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(herr_record_report);
+
+void herr_record_free(struct herr_record *ercd)
+{
+	struct herr_node *enode;
+	struct gen_pool *pool;
+
+	enode = container_of(ercd, struct herr_node, ercd);
+	pool = get_cpu_var(herr_gen_pool);
+	gen_pool_free(pool, (unsigned long)enode,
+		      HERR_NODE_LEN(enode->ercd.length));
+	put_cpu_var(pool);
+}
+EXPORT_SYMBOL_GPL(herr_record_free);
+
+/*
+ * The low 16 bit is freeze count, high 16 bit is thaw count. If they
+ * are not equal, someone is freezing the reader
+ */
+static u32 herr_freeze_thaw;
+
+/*
+ * Stop the reader to consume error records, so that the error records
+ * can be checked in kernel space safely.
+ */
+static void herr_freeze_reader(void)
+{
+	u32 old, new;
+
+	do {
+		new = old = herr_freeze_thaw;
+		new = ((new + 1) & 0xffff) | (old & 0xffff0000);
+	} while (cmpxchg(&herr_freeze_thaw, old, new) != old);
+}
+
+static void herr_thaw_reader(void)
+{
+	u32 old, new;
+
+	do {
+		old = herr_freeze_thaw;
+		new = old + 0x10000;
+	} while (cmpxchg(&herr_freeze_thaw, old, new) != old);
+}
+
+static int herr_reader_is_frozen(void)
+{
+	u32 freeze_thaw = herr_freeze_thaw;
+	return (freeze_thaw & 0xffff) != (freeze_thaw >> 16);
+}
+
+int herr_for_each_record(herr_traverse_func_t func, void *data)
+{
+	int i, cpu, rc = 0;
+	struct herr_rcd_lists *lists;
+	struct herr_node *enode;
+
+	preempt_disable();
+	herr_freeze_reader();
+	for_each_possible_cpu(cpu) {
+		lists = per_cpu_ptr(&herr_rcd_lists, cpu);
+		for (i = 0; i < HERR_RCD_LIST_NUM; i++) {
+			struct llist_head *head = &lists->heads[i];
+			llist_for_each_entry(enode, head->first, llist) {
+				rc = func(&enode->ercd, data);
+				if (rc)
+					goto out;
+			}
+		}
+	}
+out:
+	herr_thaw_reader();
+	preempt_enable_no_resched();
+	return rc;
+}
+EXPORT_SYMBOL_GPL(herr_for_each_record);
+
+static ssize_t herr_rcd_lists_read(char __user *ubuf, size_t usize,
+				   struct mutex *read_mutex)
+{
+	int cpu, rc = 0, read;
+	struct herr_rcd_lists *lists;
+	struct gen_pool *pool;
+	ssize_t len, rsize = 0;
+	struct herr_node *enode;
+	struct llist_head *old_read;
+	struct llist_node *to_read;
+
+	do {
+		read = 0;
+		for_each_possible_cpu(cpu) {
+			lists = per_cpu_ptr(&herr_rcd_lists, cpu);
+			pool = per_cpu(herr_gen_pool, cpu);
+			if (llist_empty(lists->read)) {
+				if (llist_empty(lists->write))
+					continue;
+				/*
+				 * Error records are output in batch, so old
+				 * error records can be output before new ones.
+				 */
+				old_read = lists->read;
+				lists->read = lists->write;
+				lists->write = old_read;
+			}
+			rc = rsize ? 0 : -EBUSY;
+			if (herr_reader_is_frozen())
+				goto out;
+			to_read = llist_del_first(lists->read);
+			if (herr_reader_is_frozen())
+				goto out_readd;
+			enode = llist_entry(to_read, struct herr_node, llist);
+			len = enode->ercd.length;
+			rc = rsize ? 0 : -EINVAL;
+			if (len > usize - rsize)
+				goto out_readd;
+			rc = -EFAULT;
+			if (copy_to_user(ubuf + rsize, &enode->ercd, len))
+				goto out_readd;
+			gen_pool_free(pool, (unsigned long)enode,
+				      HERR_NODE_LEN(len));
+			rsize += len;
+			read = 1;
+		}
+		if (need_resched()) {
+			mutex_unlock(read_mutex);
+			cond_resched();
+			mutex_lock(read_mutex);
+		}
+	} while (read);
+	rc = 0;
+out:
+	return rc ? rc : rsize;
+out_readd:
+	llist_add(to_read, lists->read);
+	goto out;
+}
+
+static int herr_rcd_lists_is_empty(void)
+{
+	int cpu, i;
+	struct herr_rcd_lists *lists;
+
+	for_each_possible_cpu(cpu) {
+		lists = per_cpu_ptr(&herr_rcd_lists, cpu);
+		for (i = 0; i < HERR_RCD_LIST_NUM; i++) {
+			if (!llist_empty(&lists->heads[i]))
+				return 0;
+		}
+	}
+	return 1;
+}
+
+
+/*
+ * Hardware Error Reporting Device Management
+ */
+
+static ssize_t herr_dev_name_show(struct device *device,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	struct herr_dev *dev = to_herr_dev(device);
+	return sprintf(buf, "%s\n", dev->name);
+}
+
+static struct device_attribute herr_dev_attr_name =
+	__ATTR(name, 0400, herr_dev_name_show, NULL);
+
+#define HERR_DEV_COUNTER_ATTR(_name)					\
+	static ssize_t herr_dev_##_name##_show(struct device *device,	\
+					       struct device_attribute *attr, \
+					       char *buf)		\
+	{								\
+		struct herr_dev *dev = to_herr_dev(device);		\
+		int counter;						\
+									\
+		counter = atomic_read(&dev->_name);			\
+		return sprintf(buf, "%d\n", counter);			\
+	}								\
+	static ssize_t herr_dev_##_name##_store(struct device *device,	\
+						struct device_attribute *attr, \
+						const char *buf,	\
+						size_t count)		\
+	{								\
+		struct herr_dev *dev = to_herr_dev(device);		\
+									\
+		atomic_set(&dev->_name, 0);				\
+		return count;						\
+	}								\
+	static struct device_attribute herr_dev_attr_##_name =		\
+		__ATTR(_name, 0600, herr_dev_##_name##_show,		\
+		       herr_dev_##_name##_store)
+
+HERR_DEV_COUNTER_ATTR(logs);
+HERR_DEV_COUNTER_ATTR(overflows);
+HERR_DEV_COUNTER_ATTR(bursts);
+
+static struct attribute *herr_dev_attrs[] = {
+	&herr_dev_attr_name.attr,
+	&herr_dev_attr_logs.attr,
+	&herr_dev_attr_overflows.attr,
+	&herr_dev_attr_bursts.attr,
+	NULL,
+};
+
+static struct attribute_group herr_dev_attr_group = {
+	.attrs	= herr_dev_attrs,
+};
+
+static const struct attribute_group *herr_dev_attr_groups[] = {
+	&herr_dev_attr_group,
+	NULL,
+};
+
+static void herr_dev_release(struct device *device)
+{
+	struct herr_dev *dev = to_herr_dev(device);
+
+	kfree(dev);
+}
+
+static struct device_type herr_dev_type = {
+	.groups		= herr_dev_attr_groups,
+	.release	= herr_dev_release,
+};
+
+static char *herr_devnode(struct device *dev, mode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "error/%s", dev_name(dev));
+}
+
+struct class herr_class = {
+	.name		= "error",
+	.devnode	= herr_devnode,
+};
+EXPORT_SYMBOL_GPL(herr_class);
+
+struct herr_dev *herr_dev_alloc(void)
+{
+	struct herr_dev *dev;
+
+	dev = kzalloc(sizeof(struct herr_dev), GFP_KERNEL);
+	if (!dev)
+		return NULL;
+	dev->dev.type = &herr_dev_type;
+	dev->dev.class = &herr_class;
+	device_initialize(&dev->dev);
+	atomic_set(&dev->logs, 0);
+	atomic_set(&dev->overflows, 0);
+	atomic_set(&dev->bursts, 0);
+	atomic64_set(&dev->timestamp, 0);
+
+	return dev;
+}
+EXPORT_SYMBOL_GPL(herr_dev_alloc);
+
+void herr_dev_free(struct herr_dev *dev)
+{
+	if (dev)
+		herr_dev_put(dev);
+}
+EXPORT_SYMBOL_GPL(herr_dev_free);
+
+int herr_dev_register(struct herr_dev *dev)
+{
+	static atomic_t herr_no = ATOMIC_INIT(0);
+	const char *path;
+	int rc;
+
+	dev_set_name(&dev->dev, "error%d", atomic_inc_return(&herr_no) - 1);
+
+	rc = device_add(&dev->dev);
+	if (rc)
+		goto err;
+
+	path = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
+	pr_info("error: %s as %s\n", dev->name ? dev->name : "Unspecified device",
+		path ? path : "N/A");
+	kfree(path);
+
+	return 0;
+err:
+	return rc;
+}
+EXPORT_SYMBOL_GPL(herr_dev_register);
+
+void herr_dev_unregister(struct herr_dev *dev)
+{
+	device_unregister(&dev->dev);
+}
+EXPORT_SYMBOL_GPL(herr_dev_unregister);
+
+
+/*
+ * Hardware Error Mix Reporting Device
+ */
+
+static int herr_major;
+static DECLARE_WAIT_QUEUE_HEAD(herr_mix_wait);
+
+void herr_notify(void)
+{
+	if (test_and_clear_bit(HERR_NOTIFY_BIT, &herr_flags))
+		wake_up_interruptible(&herr_mix_wait);
+}
+EXPORT_SYMBOL_GPL(herr_notify);
+
+static ssize_t herr_mix_read(struct file *filp, char __user *ubuf,
+			     size_t usize, loff_t *off)
+{
+	int rc;
+	static DEFINE_MUTEX(read_mutex);
+
+	if (*off != 0)
+		return -EINVAL;
+
+	rc = mutex_lock_interruptible(&read_mutex);
+	if (rc)
+		return rc;
+	rc = herr_rcd_lists_read(ubuf, usize, &read_mutex);
+	mutex_unlock(&read_mutex);
+
+	return rc;
+}
+
+static unsigned int herr_mix_poll(struct file *file, poll_table *wait)
+{
+	poll_wait(file, &herr_mix_wait, wait);
+	if (!herr_rcd_lists_is_empty())
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
+static const struct file_operations herr_mix_dev_fops = {
+	.owner		= THIS_MODULE,
+	.read		= herr_mix_read,
+	.poll		= herr_mix_poll,
+};
+
+static int __init herr_mix_dev_init(void)
+{
+	struct device *dev;
+	dev_t devt;
+
+	devt = MKDEV(herr_major, 0);
+	dev = device_create(&herr_class, NULL, devt, NULL, "error");
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	return 0;
+}
+device_initcall(herr_mix_dev_init);
+
+static int __init herr_core_init(void)
+{
+	int rc;
+
+	BUILD_BUG_ON(sizeof(struct herr_node) % HERR_MIN_ALIGN);
+	BUILD_BUG_ON(sizeof(struct herr_record) % HERR_MIN_ALIGN);
+	BUILD_BUG_ON(sizeof(struct herr_section) % HERR_MIN_ALIGN);
+
+	herr_rcd_lists_init();
+
+	rc = herr_pool_init();
+	if (rc)
+		goto err;
+
+	rc = class_register(&herr_class);
+	if (rc)
+		goto err_free_pool;
+
+	rc = herr_major = register_chrdev(0, "error", &herr_mix_dev_fops);
+	if (rc < 0)
+		goto err_free_class;
+
+	return 0;
+err_free_class:
+	class_unregister(&herr_class);
+err_free_pool:
+	herr_pool_fini();
+err:
+	return rc;
+}
+/* Initialize data structure used by device driver, so subsys_initcall */
+subsys_initcall(herr_core_init);
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -140,6 +140,7 @@ header-y += gigaset_dev.h
 header-y += hdlc.h
 header-y += hdlcdrv.h
 header-y += hdreg.h
+header-y += herror_record.h
 header-y += hid.h
 header-y += hiddev.h
 header-y += hidraw.h
--- /dev/null
+++ b/include/linux/herror.h
@@ -0,0 +1,69 @@
+#ifndef LINUX_HERROR_H
+#define LINUX_HERROR_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/device.h>
+#include <linux/herror_record.h>
+
+/*
+ * Hardware error reporting
+ */
+
+#define HERR_ALLOC_NO_BURST_CONTROL	0x0001
+
+struct herr_dev;
+
+/* allocate a herr_record lock-lessly */
+struct herr_record *herr_record_alloc(unsigned int len,
+				      struct herr_dev *edev,
+				      unsigned int flags);
+void herr_record_init(struct herr_record *ercd);
+/* report error via error record */
+int herr_record_report(struct herr_record *ercd, struct herr_dev *edev);
+/* free a previously allocated herr_record; must be called on the CPU that allocated it */
+void herr_record_free(struct herr_record *ercd);
+/*
+ * Notify the waiting user space hardware error daemon of the new error
+ * record; cannot be used in NMI context
+ */
+void herr_notify(void);
+
+/* Traverse all error records not consumed by user space */
+typedef int (*herr_traverse_func_t)(struct herr_record *ercd, void *data);
+int herr_for_each_record(herr_traverse_func_t func, void *data);
+
+
+/*
+ * Hardware Error Reporting Device Management
+ */
+extern struct class herr_class;
+
+struct herr_dev {
+	const char *name;
+	atomic_t overflows;
+	atomic_t logs;
+	atomic_t bursts;
+	struct device dev;
+	struct list_head list;
+	atomic64_t timestamp;
+};
+#define to_herr_dev(d)	container_of(d, struct herr_dev, dev)
+
+struct herr_dev *herr_dev_alloc(void);
+void herr_dev_free(struct herr_dev *dev);
+
+static inline struct herr_dev *herr_dev_get(struct herr_dev *dev)
+{
+	return dev ? to_herr_dev(get_device(&dev->dev)) : NULL;
+}
+
+static inline void herr_dev_put(struct herr_dev *dev)
+{
+	if (dev)
+		put_device(&dev->dev);
+}
+
+int herr_dev_register(struct herr_dev *dev);
+void herr_dev_unregister(struct herr_dev *dev);
+#endif
--- /dev/null
+++ b/include/linux/herror_record.h
@@ -0,0 +1,100 @@
+#ifndef LINUX_HERROR_RECORD_H
+#define LINUX_HERROR_RECORD_H
+
+#include <linux/types.h>
+
+/*
+ * Hardware Error Record Definition
+ */
+enum herr_severity {
+	HERR_SEV_NONE,
+	HERR_SEV_CORRECTED,
+	HERR_SEV_RECOVERABLE,
+	HERR_SEV_FATAL,
+};
+
+#define HERR_RCD_REV1_0		0x0100
+#define HERR_MIN_ALIGN_ORDER	3
+#define HERR_MIN_ALIGN		(1 << HERR_MIN_ALIGN_ORDER)
+
+enum herr_record_flags {
+	HERR_RCD_PREV		= 0x0001, /* record is for previous boot */
+	HERR_RCD_PERSIST	= 0x0002, /* record is from flash, need to be
+					   * cleared after writing to disk */
+};
+
+/*
+ * sizeof(struct herr_record) and sizeof(struct herr_section) should
+ * be multiple of HERR_MIN_ALIGN to make error record packing easier.
+ */
+struct herr_record {
+	__u16	length;
+	__u16	flags;
+	__u16	rev;
+	__u8	severity;
+	__u8	pad1;
+	__u64	id;
+	__u64	timestamp;
+	__u8	data[0];
+};
+
+/* Section type ID are allocated here */
+enum herr_section_type_id {
+	/* 0x0 - 0xff are reserved by core */
+	/* 0x100 - 0x1ff are allocated to CPER */
+	HERR_TYPE_CPER		= 0x0100,
+	HERR_TYPE_GESR		= 0x0110, /* acpi_hest_generic_status */
+	/* 0x200 - 0x2ff are allocated to PCI/PCIe subsystem */
+	HERR_TYPE_PCIE_AER	= 0x0200,
+};
+
+struct herr_section {
+	__u16	length;
+	__u16	flags;
+	__u32	type;
+	__u8	data[0];
+};
+
+#define herr_record_for_each_section(ercd, esec)		\
+	for ((esec) = (struct herr_section *)(ercd)->data;	\
+	     (void *)(esec) - (void *)(ercd) < (ercd)->length;	\
+	     (esec) = (void *)(esec) + (esec)->length)
+
+#define HERR_SEC_LEN_ROUND(len)						\
+	(((len) + HERR_MIN_ALIGN - 1) & ~(HERR_MIN_ALIGN - 1))
+#define HERR_SEC_LEN(type)						\
+	(sizeof(struct herr_section) + HERR_SEC_LEN_ROUND(sizeof(type)))
+
+#define HERR_RECORD_LEN_ROUND1(sec_len1)				\
+	(sizeof(struct herr_record) + HERR_SEC_LEN_ROUND(sec_len1))
+#define HERR_RECORD_LEN_ROUND2(sec_len1, sec_len2)			\
+	(sizeof(struct herr_record) + HERR_SEC_LEN_ROUND(sec_len1) +	\
+	 HERR_SEC_LEN_ROUND(sec_len2))
+#define HERR_RECORD_LEN_ROUND3(sec_len1, sec_len2, sec_len3)		\
+	(sizeof(struct herr_record) + HERR_SEC_LEN_ROUND(sec_len1) +	\
+	 HERR_SEC_LEN_ROUND(sec_len2) + HERR_SEC_LEN_ROUND(sec_len3))
+
+#define HERR_RECORD_LEN1(sec_type1)				\
+	(sizeof(struct herr_record) + HERR_SEC_LEN(sec_type1))
+#define HERR_RECORD_LEN2(sec_type1, sec_type2)			\
+	(sizeof(struct herr_record) + HERR_SEC_LEN(sec_type1) + \
+	 HERR_SEC_LEN(sec_type2))
+#define HERR_RECORD_LEN3(sec_type1, sec_type2, sec_type3)	\
+	(sizeof(struct herr_record) + HERR_SEC_LEN(sec_type1) + \
+	 HERR_SEC_LEN(sec_type2) + HERR_SEC_LEN(sec_type3))
+
+static inline struct herr_section *herr_first_sec(struct herr_record *ercd)
+{
+	return (struct herr_section *)(ercd + 1);
+}
+
+static inline struct herr_section *herr_next_sec(struct herr_section *esrc)
+{
+	return (void *)esrc + esrc->length;
+}
+
+static inline void *herr_sec_data(struct herr_section *esec)
+{
+	return (void *)(esec + 1);
+}
+#endif

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH -v3 5/8] Hardware error record persistent support
  2010-10-27  5:28 [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (3 preceding siblings ...)
  2010-10-27  5:28 ` [PATCH -v3 4/8] Hardware error device core Huang Ying
@ 2010-10-27  5:28 ` Huang Ying
  2010-10-27  5:28 ` [PATCH -v3 6/8] ACPI, APEI, Use ERST for hardware error persisting before panic Huang Ying
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Huang Ying @ 2010-10-27  5:28 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Normally, corrected hardware error records will go through the kernel
processing and be logged to disk or network finally.  But for
uncorrected errors, the system may panic directly for better error
containment, and the disk or network is not usable in this half-working
system.  To avoid losing these valuable hardware error records, the
error records are saved into some kind of simple persistent storage
such as flash before panic, so that they can be read out after system
reboot successfully.

Different kinds of simple persistent storage implementation mechanisms
are provided on different platforms, so an abstract interface for
persistent storage is defined.  Different implementations of the
interface can be registered.

This patch is designed by Andi Kleen and Huang Ying.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/char/herror/Makefile        |    2 
 drivers/char/herror/herr-core.c     |   39 +++++++-
 drivers/char/herror/herr-internal.h |   12 ++
 drivers/char/herror/herr-persist.c  |  174 ++++++++++++++++++++++++++++++++++++
 include/linux/Kbuild                |    1 
 include/linux/herror.h              |   48 +++++++++
 6 files changed, 271 insertions(+), 5 deletions(-)
 create mode 100644 drivers/char/herror/herr-internal.h
 create mode 100644 drivers/char/herror/herr-persist.c

--- a/drivers/char/herror/Makefile
+++ b/drivers/char/herror/Makefile
@@ -1 +1 @@
-obj-y				+= herr-core.o
+obj-y				+= herr-core.o herr-persist.o
--- a/drivers/char/herror/herr-core.c
+++ b/drivers/char/herror/herr-core.c
@@ -43,9 +43,9 @@
 #include <linux/genalloc.h>
 #include <linux/herror.h>
 
-#define HERR_NOTIFY_BIT			0
+#include "herr-internal.h"
 
-static unsigned long herr_flags;
+unsigned long herr_flags;
 
 /*
  * Record list management and error reporting
@@ -545,6 +545,7 @@ static ssize_t herr_mix_read(struct file
 {
 	int rc;
 	static DEFINE_MUTEX(read_mutex);
+	u64 record_id;
 
 	if (*off != 0)
 		return -EINVAL;
@@ -552,7 +553,14 @@ static ssize_t herr_mix_read(struct file
 	rc = mutex_lock_interruptible(&read_mutex);
 	if (rc)
 		return rc;
+	rc = herr_persist_peek_user(&record_id, ubuf, usize);
+	if (rc > 0) {
+		herr_persist_clear(record_id);
+		goto out;
+	}
+
 	rc = herr_rcd_lists_read(ubuf, usize, &read_mutex);
+out:
 	mutex_unlock(&read_mutex);
 
 	return rc;
@@ -561,15 +569,40 @@ static ssize_t herr_mix_read(struct file
 static unsigned int herr_mix_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &herr_mix_wait, wait);
-	if (!herr_rcd_lists_is_empty())
+	if (!herr_rcd_lists_is_empty() || !herr_persist_read_done())
 		return POLLIN | POLLRDNORM;
 	return 0;
 }
 
+static long herr_mix_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+	void __user *p = (void __user *)arg;
+	int rc;
+	u64 record_id;
+	struct herr_persist_buffer buf;
+
+	switch (cmd) {
+	case HERR_PERSIST_PEEK:
+		rc = copy_from_user(&buf, p, sizeof(buf));
+		if (rc)
+			return -EFAULT;
+		return herr_persist_peek_user(&record_id, buf.buf,
+					      buf.buf_size);
+	case HERR_PERSIST_CLEAR:
+		rc = copy_from_user(&record_id, p, sizeof(record_id));
+		if (rc)
+			return -EFAULT;
+		return herr_persist_clear(record_id);
+	default:
+		return -ENOTTY;
+	}
+}
+
 static const struct file_operations herr_mix_dev_fops = {
 	.owner		= THIS_MODULE,
 	.read		= herr_mix_read,
 	.poll		= herr_mix_poll,
+	.unlocked_ioctl	= herr_mix_ioctl,
 };
 
 static int __init herr_mix_dev_init(void)
--- /dev/null
+++ b/drivers/char/herror/herr-internal.h
@@ -0,0 +1,12 @@
+#ifndef HERR_INTERNAL_H
+#define HERR_INTERNAL_H
+
+#define HERR_NOTIFY_BIT			0
+
+extern unsigned long herr_flags;
+
+int herr_persist_read_done(void);
+ssize_t herr_persist_peek_user(u64 *record_id, char __user *ercd,
+			       size_t bufsiz);
+int herr_persist_clear(u64 record_id);
+#endif /* HERR_INTERNAL_H */
--- /dev/null
+++ b/drivers/char/herror/herr-persist.c
@@ -0,0 +1,174 @@
+/*
+ * Hardware error record persistent support
+ *
+ * Normally, corrected hardware error records will go through the
+ * kernel processing and be logged to disk or network finally.  But
+ * for uncorrected errors, system may go panic directly for better
+ * error containment, disk or network is not usable in this
+ * half-working system.  To avoid losing these valuable hardware error
+ * records, the error records are saved into some kind of simple
+ * persistent storage such as flash before panic, so that they can be
+ * read out after system reboot successfully.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rculist.h>
+#include <linux/mutex.h>
+
+#include <linux/herror.h>
+
+#include "herr-internal.h"
+
+/*
+ * Simple persistent storage provider list, herr_persists_mutex is
+ * used for writer side mutual exclusion, RCU is used to implement
+ * lock-less reader side.
+ */
+static LIST_HEAD(herr_persists);
+static DEFINE_MUTEX(herr_persists_mutex);
+
+int herr_persist_register(struct herr_persist *persist)
+{
+	if (!persist->peek_user)
+		return -EINVAL;
+	persist->read_done = 0;
+	if (mutex_lock_interruptible(&herr_persists_mutex))
+		return -EINTR;
+	list_add_rcu(&persist->list, &herr_persists);
+	mutex_unlock(&herr_persists_mutex);
+	/*
+	 * There may be hardware error records of previous boot in
+	 * persistent storage, notify the user space error daemon to
+	 * check.
+	 */
+	set_bit(HERR_NOTIFY_BIT, &herr_flags);
+	herr_notify();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(herr_persist_register);
+
+void herr_persist_unregister(struct herr_persist *persist)
+{
+	mutex_lock(&herr_persists_mutex);
+	list_del_rcu(&persist->list);
+	mutex_unlock(&herr_persists_mutex);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(herr_persist_unregister);
+
+/* Can be used in atomic context including NMI */
+int herr_persist_in(const struct herr_record *ercd)
+{
+	struct herr_persist *persist;
+	int rc = -ENODEV;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(persist, &herr_persists, list) {
+		if (!persist->in)
+			continue;
+		rc = persist->in(ercd);
+		if (!rc)
+			break;
+	}
+	rcu_read_unlock();
+	return rc;
+}
+EXPORT_SYMBOL_GPL(herr_persist_in);
+
+int herr_persist_read_done(void)
+{
+	struct herr_persist *persist;
+	int rc = 1;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(persist, &herr_persists, list) {
+		if (!persist->read_done) {
+			rc = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return rc;
+}
+
+/* Read next error record from persist storage, don't remove it */
+ssize_t herr_persist_peek_user(u64 *record_id, char __user *ercd,
+			       size_t bufsiz)
+{
+	struct herr_persist *persist;
+	ssize_t rc = 0;
+
+	if (mutex_lock_interruptible(&herr_persists_mutex))
+		return -EINTR;
+	list_for_each_entry(persist, &herr_persists, list) {
+		if (persist->read_done)
+			continue;
+		rc = persist->peek_user(record_id, ercd, bufsiz);
+		if (rc > 0)
+			break;
+		else if (rc != -EINTR && rc != -EAGAIN && rc != -EINVAL)
+			persist->read_done = 1;
+	}
+	mutex_unlock(&herr_persists_mutex);
+	return rc;
+}
+
+/* Clear specified error record from persist storage */
+int herr_persist_clear(u64 record_id)
+{
+	struct herr_persist *persist;
+	int rc = -ENOENT;
+
+	if (mutex_lock_interruptible(&herr_persists_mutex))
+		return -EINTR;
+	list_for_each_entry(persist, &herr_persists, list) {
+		if (!persist->clear)
+			continue;
+		rc = persist->clear(record_id);
+		if (!rc)
+			break;
+		/*
+		 * Failed to clear, mark as read_done, because we can
+		 * not skip this one
+		 */
+		else if (rc != -EINTR && rc != -EAGAIN && rc != -ENOENT)
+			persist->read_done = 1;
+	}
+	mutex_unlock(&herr_persists_mutex);
+	return rc;
+}
+
+static int herr_persist_record(struct herr_record *ercd, void *data)
+{
+	int *severity = data;
+
+	if (ercd->severity == *severity)
+		return herr_persist_in(ercd);
+	return 0;
+}
+
+void herr_persist_all_records(void)
+{
+	int severity;
+
+	for (severity = HERR_SEV_FATAL; severity >= HERR_SEV_NONE; severity--)
+		herr_for_each_record(herr_persist_record, &severity);
+}
+EXPORT_SYMBOL_GPL(herr_persist_all_records);
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -141,6 +141,7 @@ header-y += hdlc.h
 header-y += hdlcdrv.h
 header-y += hdreg.h
 header-y += herror_record.h
+header-y += herror.h
 header-y += hid.h
 header-y += hiddev.h
 header-y += hidraw.h
--- a/include/linux/herror.h
+++ b/include/linux/herror.h
@@ -1,10 +1,22 @@
 #ifndef LINUX_HERROR_H
 #define LINUX_HERROR_H
 
+#include <linux/ioctl.h>
+#include <linux/herror_record.h>
+
+struct herr_persist_buffer {
+	void __user *buf;
+	unsigned int buf_size;
+};
+
+#define HERR_PERSIST_PEEK	_IOW('H', 1, struct herr_persist_buffer)
+#define HERR_PERSIST_CLEAR	_IOW('H', 2, u64)
+
+#ifdef __KERNEL__
+
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/device.h>
-#include <linux/herror_record.h>
 
 /*
  * Hardware error reporting
@@ -66,4 +78,38 @@ static inline void herr_dev_put(struct h
 
 int herr_dev_register(struct herr_dev *dev);
 void herr_dev_unregister(struct herr_dev *dev);
+
+
+/*
+ * Simple Persistent Storage
+ */
+
+struct herr_persist;
+/* Put an error record into simple persistent storage */
+int herr_persist_in(const struct herr_record *ercd);
+/* Save all error records not yet consumed in persistent storage */
+void herr_persist_all_records(void);
+
+/*
+ * Simple Persistent Storage Provider Management
+ */
+struct herr_persist {
+	struct list_head list;
+	char *name;
+	unsigned int read_done:1;
+	/* Put an error record into storage, must be NMI-safe */
+	int (*in)(const struct herr_record *ercd);
+	/*
+	 * Read out an error record from storage to user space, don't
+	 * remove it, the HERR_RCD_PERSIST must be set in record flags
+	 */
+	ssize_t (*peek_user)(u64 *record_id, char __user *ubuf, size_t usize);
+	/* Clear an error record */
+	int (*clear)(u64 record_id);
+};
+
+/* Register (un-register) simple persistent storage provider */
+int herr_persist_register(struct herr_persist *persist);
+void herr_persist_unregister(struct herr_persist *persist);
+#endif
 #endif

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH -v3 6/8] ACPI, APEI, Use ERST for hardware error persisting before panic
  2010-10-27  5:28 [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (4 preceding siblings ...)
  2010-10-27  5:28 ` [PATCH -v3 5/8] Hardware error record persistent support Huang Ying
@ 2010-10-27  5:28 ` Huang Ying
  2010-10-27  5:28 ` [PATCH -v3 7/8] ACPI, APEI, Report GHES error record with hardware error device core Huang Ying
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Huang Ying @ 2010-10-27  5:28 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Corrected/Recoverable hardware errors can usually be saved to disk
and/or sent to the network for logging.  But if uncorrected/fatal
hardware errors occur, the system may not be reliable enough for disk
and/or network access and needs to panic as soon as possible for error
containment.  In this situation, ERST can be used to save the valuable
error records into some persistent storage such as flash.

This patch implements herr_persist interface using ERST.  So that
hardware error device core can use ERST to save hardware error record
before panic.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/acpi/apei/Kconfig |    1 
 drivers/acpi/apei/cper.c  |   19 +++++
 drivers/acpi/apei/erst.c  |  169 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/cper.h      |    1 
 4 files changed, 190 insertions(+)

--- a/drivers/acpi/apei/cper.c
+++ b/drivers/acpi/apei/cper.c
@@ -29,6 +29,25 @@
 #include <linux/time.h>
 #include <linux/cper.h>
 #include <linux/acpi.h>
+#include <linux/herror_record.h>
+
+int herr_severity_to_cper(int herr_severity)
+{
+	switch (herr_severity) {
+	case HERR_SEV_NONE:
+		return CPER_SEV_INFORMATIONAL;
+	case HERR_SEV_CORRECTED:
+		return CPER_SEV_CORRECTED;
+	case HERR_SEV_RECOVERABLE:
+		return CPER_SEV_RECOVERABLE;
+	case HERR_SEV_FATAL:
+		return CPER_SEV_FATAL;
+	default:
+		BUG();
+		return CPER_SEV_FATAL;
+	}
+}
+EXPORT_SYMBOL_GPL(herr_severity_to_cper);
 
 /*
  * CPER record ID need to be unique even after reboot, because record
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -33,6 +33,7 @@
 #include <linux/uaccess.h>
 #include <linux/cper.h>
 #include <linux/nmi.h>
+#include <linux/herror.h>
 #include <linux/hardirq.h>
 #include <acpi/apei.h>
 
@@ -88,6 +89,12 @@ static struct erst_erange {
  */
 static DEFINE_SPINLOCK(erst_lock);
 
+static void *erst_buf;
+static unsigned int erst_buf_len;
+
+/* Prevent erst_buf from being accessed simultaneously */
+static DEFINE_MUTEX(erst_buf_mutex);
+
 static inline int erst_errno(int command_status)
 {
 	switch (command_status) {
@@ -774,6 +781,12 @@ static int __erst_write_to_nvram(const s
 	return -ENOSYS;
 }
 
+static int __erst_write_herr_record_to_nvram(const struct herr_record *ercd)
+{
+	/* do not print message, because printk is not safe for NMI */
+	return -ENOSYS;
+}
+
 static int __erst_read_to_erange_from_nvram(u64 record_id, u64 *offset)
 {
 	pr_unimpl_nvram();
@@ -910,6 +923,156 @@ out:
 }
 EXPORT_SYMBOL_GPL(erst_clear);
 
+#define CPER_CREATOR_ERST						\
+	UUID_LE(0xEACBBA0C, 0x803A, 0x4096, 0xB1, 0x1D, 0xC3, 0xC7,	\
+		0x6E, 0xE7, 0x94, 0xF9)
+
+#define CPER_SEC_HERR_RECORD						\
+	UUID_LE(0x633AB656, 0x6703, 0x11DF, 0x87, 0xCF, 0x00, 0x19,	\
+		0xD1, 0x2A, 0x29, 0xEF)
+
+static ssize_t erst_herr_record_to_cper(struct cper_record_header *crcd,
+					size_t buf_size,
+					const struct herr_record *ercd)
+{
+	struct cper_section_descriptor *csec;
+	unsigned int crcd_len;
+	void *csec_data;
+
+	crcd_len = sizeof(*crcd) + sizeof(*csec) + ercd->length;
+	if (crcd_len > buf_size)
+		return crcd_len;
+
+	memset(crcd, 0, crcd_len);
+	memcpy(crcd->signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+	crcd->revision = CPER_RECORD_REV;
+	crcd->signature_end = CPER_SIG_END;
+	crcd->error_severity = herr_severity_to_cper(ercd->severity);
+	/* timestamp, platform_id, partition_id is invalid */
+	crcd->validation_bits = 0;
+	crcd->creator_id = CPER_CREATOR_ERST;
+	crcd->section_count = 1;
+	crcd->record_length = crcd_len;
+	crcd->record_id = ercd->id;
+
+	csec = (struct cper_section_descriptor *)(crcd + 1);
+	csec_data = csec + 1;
+
+	csec->section_length = ercd->length;
+	csec->revision = CPER_SEC_REV;
+	csec->section_type = CPER_SEC_HERR_RECORD;
+	csec->section_severity = crcd->error_severity;
+	csec->section_offset = (void *)csec_data - (void *)crcd;
+
+	memcpy(csec_data, ercd, ercd->length);
+
+	return crcd_len;
+}
+
+static int erst_write_herr_record(const struct herr_record *ercd)
+{
+	struct cper_record_header *crcd;
+	ssize_t crcd_len;
+	unsigned long flags;
+	int rc;
+
+	if (!spin_trylock_irqsave(&erst_lock, flags))
+		return -EBUSY;
+
+	if (erst_erange.attr & ERST_RANGE_NVRAM) {
+		rc = __erst_write_herr_record_to_nvram(ercd);
+		goto out;
+	}
+
+	rc = -EINVAL;
+	crcd_len = erst_herr_record_to_cper(erst_erange.vaddr,
+					    erst_erange.size, ercd);
+	if (crcd_len > erst_erange.size)
+		goto out;
+	crcd = erst_erange.vaddr;
+	/* signature for serialization system */
+	memcpy(&crcd->persistence_information, "ER", 2);
+	rc = __erst_write_to_storage(0);
+out:
+	spin_unlock_irqrestore(&erst_lock, flags);
+
+	return rc;
+}
+
+static ssize_t erst_persist_peek_user(u64 *record_id, char __user *ubuf,
+				      size_t usize)
+{
+	int rc, pos;
+	ssize_t len, clen;
+	u64 id;
+	struct cper_record_header *crcd;
+	struct cper_section_descriptor *csec;
+	struct herr_record *ercd;
+
+	if (mutex_lock_interruptible(&erst_buf_mutex) != 0)
+		return -EINTR;
+	erst_get_record_id_begin(&pos);
+retry_next:
+	len = 0;
+	rc = erst_get_record_id_next(&pos, &id);
+	if (rc)
+		goto out;
+	/* no more record */
+	if (id == APEI_ERST_INVALID_RECORD_ID)
+		goto out;
+retry:
+	rc = clen = erst_read(id, erst_buf, erst_buf_len);
+	/* someone else has cleared the record, try next one */
+	if (rc == -ENOENT)
+		goto retry_next;
+	else if (rc < 0)
+		goto out;
+	else if (clen > erst_buf_len) {
+		void *p;
+		rc = -ENOMEM;
+		p = kmalloc(clen, GFP_KERNEL);
+		if (!p)
+			goto out;
+		kfree(erst_buf);
+		erst_buf = p;
+		erst_buf_len = clen;
+		goto retry;
+	}
+
+	crcd = erst_buf;
+	csec = (struct cper_section_descriptor *)(crcd + 1);
+	if (crcd->section_count != 1 ||
+	    uuid_le_cmp(crcd->creator_id, CPER_CREATOR_ERST) ||
+	    uuid_le_cmp(csec->section_type, CPER_SEC_HERR_RECORD))
+		goto retry_next;
+
+	ercd = (struct herr_record *)(csec + 1);
+	len = ercd->length;
+
+	rc = -EINVAL;
+	if (len > usize)
+		goto out;
+
+	ercd->flags |= HERR_RCD_PREV | HERR_RCD_PERSIST;
+
+	rc = -EFAULT;
+	if (copy_to_user(ubuf, ercd, len))
+		goto out;
+	*record_id = id;
+	rc = 0;
+out:
+	erst_get_record_id_end();
+	mutex_unlock(&erst_buf_mutex);
+	return rc ? rc : len;
+}
+
+static struct herr_persist erst_persist = {
+	.name		= "ERST",
+	.in		= erst_write_herr_record,
+	.peek_user	= erst_persist_peek_user,
+	.clear		= erst_clear,
+};
+
 static int __init setup_erst_disable(char *str)
 {
 	erst_disable = 1;
@@ -1007,11 +1170,17 @@ static int __init erst_init(void)
 	if (!erst_erange.vaddr)
 		goto err_release_erange;
 
+	rc = herr_persist_register(&erst_persist);
+	if (rc)
+		goto err_unmap_erange;
+
 	pr_info(ERST_PFX
 	"Error Record Serialization Table (ERST) support is initialized.\n");
 
 	return 0;
 
+err_unmap_erange:
+	iounmap(erst_erange.vaddr);
 err_release_erange:
 	release_mem_region(erst_erange.base, erst_erange.size);
 err_unmap_reg:
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -309,6 +309,7 @@ struct cper_sec_mem_err {
 /* Reset to default packing */
 #pragma pack()
 
+int herr_severity_to_cper(int herr_severity);
 u64 cper_next_record_id(void);
 
 #endif
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -1,6 +1,7 @@
 config ACPI_APEI
 	bool "ACPI Platform Error Interface (APEI)"
 	depends on X86
+	select HERR_DEV_CORE
 	help
 	  APEI allows to report errors (for example from the chipset)
 	  to the operating system. This improves NMI handling

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH -v3 7/8] ACPI, APEI, Report GHES error record with hardware error device core
  2010-10-27  5:28 [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (5 preceding siblings ...)
  2010-10-27  5:28 ` [PATCH -v3 6/8] ACPI, APEI, Use ERST for hardware error persisting before panic Huang Ying
@ 2010-10-27  5:28 ` Huang Ying
  2010-10-27  5:28 ` [PATCH -v3 8/8] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support Huang Ying
  2010-10-27 10:07 ` [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Peter Zijlstra
  8 siblings, 0 replies; 17+ messages in thread
From: Huang Ying @ 2010-10-27  5:28 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

One hardware error device (struct herr_dev) is created for each GHES
in GHES platform device "probe" function.  Then when GHES hardware
error handler is notified by firmware, the hardware error records will
be reported on the struct herr_dev.

In the previous GHES support, only corrected memory errors could be
reported to user space via /dev/mcelog; now all kinds of hardware
errors notified via SCI can be reported.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/acpi/apei/cper.c |   18 +++++++
 drivers/acpi/apei/ghes.c |  119 +++++++++++++++++++++++++++++++----------------
 include/linux/cper.h     |    2 
 3 files changed, 99 insertions(+), 40 deletions(-)

--- a/drivers/acpi/apei/cper.c
+++ b/drivers/acpi/apei/cper.c
@@ -49,6 +49,24 @@ int herr_severity_to_cper(int herr_sever
 }
 EXPORT_SYMBOL_GPL(herr_severity_to_cper);
 
+int cper_severity_to_herr(int cper_severity)
+{
+	switch (cper_severity) {
+	case CPER_SEV_INFORMATIONAL:
+		return HERR_SEV_NONE;
+	case CPER_SEV_CORRECTED:
+		return HERR_SEV_CORRECTED;
+	case CPER_SEV_RECOVERABLE:
+		return HERR_SEV_RECOVERABLE;
+	case CPER_SEV_FATAL:
+		return HERR_SEV_FATAL;
+	default:
+		/* Unknown, default to fatal */
+		return HERR_SEV_FATAL;
+	}
+}
+EXPORT_SYMBOL_GPL(cper_severity_to_herr);
+
 /*
  * CPER record ID need to be unique even after reboot, because record
  * ID is used as index for ERST storage, while CPER records from
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -43,6 +43,7 @@
 #include <linux/kdebug.h>
 #include <linux/platform_device.h>
 #include <linux/mutex.h>
+#include <linux/herror.h>
 #include <acpi/apei.h>
 #include <acpi/atomicio.h>
 #include <acpi/hed.h>
@@ -74,6 +75,7 @@ struct ghes {
 	struct list_head list;
 	u64 buffer_paddr;
 	unsigned long flags;
+	struct herr_dev *herr_dev;
 };
 
 /*
@@ -238,9 +240,38 @@ static void ghes_clear_estatus(struct gh
 	ghes->flags &= ~GHES_TO_CLEAR;
 }
 
+static void ghes_report(struct ghes *ghes)
+{
+	struct herr_record *ercd;
+	struct herr_section *esec;
+	struct acpi_hest_generic_status *estatus;
+	unsigned int estatus_len, ercd_alloc_flags = 0;
+	int ghes_sev;
+
+	ghes_sev = ghes_severity(ghes->estatus->error_severity);
+	if (ghes_sev >= GHES_SEV_PANIC)
+		ercd_alloc_flags |= HERR_ALLOC_NO_BURST_CONTROL;
+	estatus_len = apei_estatus_len(ghes->estatus);
+	ercd = herr_record_alloc(HERR_RECORD_LEN_ROUND1(estatus_len),
+				 ghes->herr_dev, ercd_alloc_flags);
+	if (!ercd)
+		return;
+
+	ercd->severity = cper_severity_to_herr(ghes->estatus->error_severity);
+
+	esec = herr_first_sec(ercd);
+	esec->length = HERR_SEC_LEN_ROUND(estatus_len);
+	esec->flags = 0;
+	esec->type = HERR_TYPE_GESR;
+
+	estatus = herr_sec_data(esec);
+	memcpy(estatus, ghes->estatus, estatus_len);
+	herr_record_report(ercd, ghes->herr_dev);
+}
+
 static void ghes_do_proc(struct ghes *ghes)
 {
-	int sev, processed = 0;
+	int sev;
 	struct acpi_hest_generic_data *gdata;
 
 	sev = ghes_severity(ghes->estatus->error_severity);
@@ -251,15 +282,9 @@ static void ghes_do_proc(struct ghes *gh
 			apei_mce_report_mem_error(
 				sev == GHES_SEV_CORRECTED,
 				(struct cper_sec_mem_err *)(gdata+1));
-			processed = 1;
 		}
 #endif
 	}
-
-	if (!processed && printk_ratelimit())
-		pr_warning(GHES_PFX
-		"Unknown error record from generic hardware error source: %d\n",
-			   ghes->generic->header.source_id);
 }
 
 static int ghes_proc(struct ghes *ghes)
@@ -269,7 +294,9 @@ static int ghes_proc(struct ghes *ghes)
 	rc = ghes_read_estatus(ghes, 0);
 	if (rc)
 		goto out;
+	ghes_report(ghes);
 	ghes_do_proc(ghes);
+	herr_notify();
 
 out:
 	ghes_clear_estatus(ghes);
@@ -300,41 +327,15 @@ static int __devinit ghes_probe(struct p
 {
 	struct acpi_hest_generic *generic;
 	struct ghes *ghes = NULL;
-	int rc = -EINVAL;
+	int rc;
 
+	rc = -ENODEV;
 	generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data;
 	if (!generic->enabled)
-		return -ENODEV;
-
-	if (generic->error_block_length <
-	    sizeof(struct acpi_hest_generic_status)) {
-		pr_warning(FW_BUG GHES_PFX
-"Invalid error block length: %u for generic hardware error source: %d\n",
-			   generic->error_block_length,
-			   generic->header.source_id);
 		goto err;
-	}
-	if (generic->records_to_preallocate == 0) {
-		pr_warning(FW_BUG GHES_PFX
-"Invalid records to preallocate: %u for generic hardware error source: %d\n",
-			   generic->records_to_preallocate,
-			   generic->header.source_id);
-		goto err;
-	}
-	ghes = ghes_new(generic);
-	if (IS_ERR(ghes)) {
-		rc = PTR_ERR(ghes);
-		ghes = NULL;
-		goto err;
-	}
-	if (generic->notify.type == ACPI_HEST_NOTIFY_SCI) {
-		mutex_lock(&ghes_list_mutex);
-		if (list_empty(&ghes_sci))
-			register_acpi_hed_notifier(&ghes_notifier_sci);
-		list_add_rcu(&ghes->list, &ghes_sci);
-		mutex_unlock(&ghes_list_mutex);
-	} else {
-		unsigned char *notify = NULL;
+
+	if (generic->notify.type != ACPI_HEST_NOTIFY_SCI) {
+		char *notify = NULL;
 
 		switch (generic->notify.type) {
 		case ACPI_HEST_NOTIFY_POLLED:
@@ -357,9 +358,46 @@ static int __devinit ghes_probe(struct p
 "Unknown notification type: %u for generic hardware error source: %d\n",
 			generic->notify.type, generic->header.source_id);
 		}
-		rc = -ENODEV;
 		goto err;
 	}
+
+	rc = -EIO;
+	if (generic->error_block_length <
+	    sizeof(struct acpi_hest_generic_status)) {
+		pr_warning(FW_BUG GHES_PFX
+"Invalid error block length: %u for generic hardware error source: %d\n",
+			   generic->error_block_length,
+			   generic->header.source_id);
+		goto err;
+	}
+	ghes = ghes_new(generic);
+	if (IS_ERR(ghes)) {
+		rc = PTR_ERR(ghes);
+		ghes = NULL;
+		goto err;
+	}
+	rc = -ENOMEM;
+	ghes->herr_dev = herr_dev_alloc();
+	if (!ghes->herr_dev)
+		goto err;
+	ghes->herr_dev->name = dev_name(&ghes_dev->dev);
+	ghes->herr_dev->dev.parent = &ghes_dev->dev;
+	rc = herr_dev_register(ghes->herr_dev);
+	if (rc) {
+		herr_dev_free(ghes->herr_dev);
+		goto err;
+	}
+	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_SCI:
+		mutex_lock(&ghes_list_mutex);
+		if (list_empty(&ghes_sci))
+			register_acpi_hed_notifier(&ghes_notifier_sci);
+		list_add_rcu(&ghes->list, &ghes_sci);
+		mutex_unlock(&ghes_list_mutex);
+		break;
+	default:
+		BUG();
+	}
 	platform_set_drvdata(ghes_dev, ghes);
 
 	return 0;
@@ -386,13 +424,14 @@ static int __devexit ghes_remove(struct
 		if (list_empty(&ghes_sci))
 			unregister_acpi_hed_notifier(&ghes_notifier_sci);
 		mutex_unlock(&ghes_list_mutex);
+		synchronize_rcu();
 		break;
 	default:
 		BUG();
 		break;
 	}
 
-	synchronize_rcu();
+	herr_dev_unregister(ghes->herr_dev);
 	ghes_fini(ghes);
 	kfree(ghes);
 
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -22,6 +22,7 @@
 #define LINUX_CPER_H
 
 #include <linux/uuid.h>
+#include <linux/herror_record.h>
 
 /* CPER record signature and the size */
 #define CPER_SIG_RECORD				"CPER"
@@ -310,6 +311,7 @@ struct cper_sec_mem_err {
 #pragma pack()
 
 int herr_severity_to_cper(int herr_severity);
+int cper_severity_to_herr(int cper_severity);
 u64 cper_next_record_id(void);
 
 #endif

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH -v3 8/8] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
  2010-10-27  5:28 [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (6 preceding siblings ...)
  2010-10-27  5:28 ` [PATCH -v3 7/8] ACPI, APEI, Report GHES error record with hardware error device core Huang Ying
@ 2010-10-27  5:28 ` Huang Ying
  2010-10-27 10:07 ` [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Peter Zijlstra
  8 siblings, 0 replies; 17+ messages in thread
From: Huang Ying @ 2010-10-27  5:28 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-kernel, Andi Kleen, ying.huang, linux-acpi

Generic Hardware Error Source provides a way to report platform
hardware errors (such as those from the chipset). It works in
so-called "Firmware First" mode, that is, hardware errors are reported
to firmware first, then reported to Linux by the firmware. This way,
some non-standard hardware error registers or non-standard hardware
links can be checked by the firmware to produce more valuable hardware
error information for Linux.

This patch adds POLL/IRQ/NMI notification types support.

The memory area used to transfer hardware error information from the
BIOS to Linux can be determined only in the NMI, IRQ or timer handler,
but the general ioremap cannot be used in atomic context, so a special
atomic version of ioremap is implemented for that purpose.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/acpi/boot.c |    1 
 arch/x86/kernel/dumpstack.c |    1 
 drivers/acpi/apei/ghes.c    |  397 ++++++++++++++++++++++++++++++++++++--------
 kernel/panic.c              |    1 
 lib/ioremap.c               |    2 
 mm/vmalloc.c                |    1 
 6 files changed, 333 insertions(+), 70 deletions(-)

--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -512,6 +512,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq
 	*gsi = irq_to_gsi(isa_irq);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
 
 /*
  * success: return IRQ number (>=0)
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -240,6 +240,7 @@ unsigned __kprobes long oops_begin(void)
 	bust_spinlocks(1);
 	return flags;
 }
+EXPORT_SYMBOL_GPL(oops_begin);
 
 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 {
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -12,10 +12,6 @@
  * For more information about Generic Hardware Error Source, please
  * refer to ACPI Specification version 4.0, section 17.3.2.6
  *
- * Now, only SCI notification type and memory errors are
- * supported. More notification type and hardware error type will be
- * added later.
- *
  * Copyright 2010 Intel Corp.
  *   Author: Huang Ying <ying.huang@intel.com>
  *
@@ -39,15 +35,18 @@
 #include <linux/acpi.h>
 #include <linux/io.h>
 #include <linux/interrupt.h>
+#include <linux/timer.h>
 #include <linux/cper.h>
 #include <linux/kdebug.h>
 #include <linux/platform_device.h>
 #include <linux/mutex.h>
+#include <linux/vmalloc.h>
 #include <linux/herror.h>
 #include <acpi/apei.h>
 #include <acpi/atomicio.h>
 #include <acpi/hed.h>
 #include <asm/mce.h>
+#include <asm/tlbflush.h>
 
 #include "apei-internal.h"
 
@@ -56,43 +55,133 @@
 #define GHES_ESTATUS_MAX_SIZE		65536
 
 /*
- * One struct ghes is created for each generic hardware error
- * source.
- *
+ * One struct ghes is created for each generic hardware error source.
  * It provides the context for APEI hardware error timer/IRQ/SCI/NMI
- * handler. Handler for one generic hardware error source is only
- * triggered after the previous one is done. So handler can uses
- * struct ghes without locking.
+ * handler.
  *
  * estatus: memory buffer for error status block, allocated during
  * HEST parsing.
  */
 #define GHES_TO_CLEAR		0x0001
+#define GHES_EXITING		0x0002
 
 struct ghes {
 	struct acpi_hest_generic *generic;
 	struct acpi_hest_generic_status *estatus;
-	struct list_head list;
 	u64 buffer_paddr;
 	unsigned long flags;
 	struct herr_dev *herr_dev;
+	union {
+		struct list_head list;
+		struct timer_list timer;
+		unsigned int irq;
+	};
 };
 
+static int ghes_panic_timeout	__read_mostly = 30;
+
 /*
- * Error source lists, one list for each notification method. The
- * members in lists are struct ghes.
+ * All error sources notified with SCI shares one notifier function,
+ * so they need to be linked and checked one by one.  This is applied
+ * to NMI too.
  *
- * The list members are only added in HEST parsing and deleted during
- * module_exit, that is, single-threaded. So no lock is needed for
- * that.
- *
- * But the mutual exclusion is needed between members adding/deleting
- * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is
- * used for that.
+ * RCU is used for these lists, so ghes_list_mutex is only used for
+ * list changing, not for traversing.
  */
 static LIST_HEAD(ghes_sci);
+static LIST_HEAD(ghes_nmi);
 static DEFINE_MUTEX(ghes_list_mutex);
 
+/*
+ * NMI may be triggered on any CPU, so ghes_nmi_lock is used for
+ * mutual exclusion.
+ */
+static DEFINE_RAW_SPINLOCK(ghes_nmi_lock);
+
+/*
+ * Because the memory area used to transfer hardware error information
+ * from BIOS to Linux can be determined only in NMI, IRQ or timer
+ * handler, but general ioremap can not be used in atomic context, so
+ * a special version of atomic ioremap is implemented for that.
+ */
+
+/*
+ * Two virtual pages are used, one for NMI context, the other for
+ * IRQ/PROCESS context
+ */
+#define GHES_IOREMAP_PAGES		2
+#define GHES_IOREMAP_NMI_PAGE(base)	(base)
+#define GHES_IOREMAP_IRQ_PAGE(base)	((base) + PAGE_SIZE)
+
+/* virtual memory area for atomic ioremap */
+static struct vm_struct *ghes_ioremap_area;
+/*
+ * These 2 spinlock is used to prevent atomic ioremap virtual memory
+ * area from being mapped simultaneously.
+ */
+static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
+static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);
+
+static int ghes_ioremap_init(void)
+{
+	ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
+		VM_IOREMAP, VMALLOC_START, VMALLOC_END);
+	if (!ghes_ioremap_area) {
+		pr_err(GHES_PFX
+		"Failed to allocate virtual memory area for atomic ioremap.\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void ghes_ioremap_exit(void)
+{
+	free_vm_area(ghes_ioremap_area);
+}
+
+static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
+{
+	unsigned long vaddr;
+
+	vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr);
+	ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
+			   pfn << PAGE_SHIFT, PAGE_KERNEL);
+
+	return (void __iomem *)vaddr;
+}
+
+static void __iomem *ghes_ioremap_pfn_irq(u64 pfn)
+{
+	unsigned long vaddr;
+
+	vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr);
+	ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
+			   pfn << PAGE_SHIFT, PAGE_KERNEL);
+
+	return (void __iomem *)vaddr;
+}
+
+static void ghes_iounmap_nmi(void __iomem *vaddr_ptr)
+{
+	unsigned long vaddr = (unsigned long __force)vaddr_ptr;
+	void *base = ghes_ioremap_area->addr;
+
+	BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base));
+	unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
+	__flush_tlb_one(vaddr);
+}
+
+static void ghes_iounmap_irq(void __iomem *vaddr_ptr)
+{
+	unsigned long vaddr = (unsigned long __force)vaddr_ptr;
+	void *base = ghes_ioremap_area->addr;
+
+	BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base));
+	unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
+	__flush_tlb_one(vaddr);
+}
+
 static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 {
 	struct ghes *ghes;
@@ -103,7 +192,6 @@ static struct ghes *ghes_new(struct acpi
 	if (!ghes)
 		return ERR_PTR(-ENOMEM);
 	ghes->generic = generic;
-	INIT_LIST_HEAD(&ghes->list);
 	rc = acpi_pre_map_gar(&generic->error_status_address);
 	if (rc)
 		goto err_free;
@@ -160,22 +248,41 @@ static inline int ghes_severity(int seve
 	}
 }
 
-/* SCI handler run in work queue, so ioremap can be used here */
-static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
-				 int from_phys)
+/*
+ * Copy @len bytes between @buffer and physical address @paddr, one
+ * page-bounded chunk at a time, through the pre-reserved atomic
+ * ioremap slots -- usable from NMI and IRQ context, unlike the old
+ * ioremap_cache() based version.  @from_phys non-zero reads from
+ * @paddr into @buffer; zero writes @buffer out to @paddr.
+ */
+static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
+				  int from_phys)
 {
-	void *vaddr;
-
-	vaddr = ioremap_cache(paddr, len);
-	if (!vaddr)
-		return -ENOMEM;
-	if (from_phys)
-		memcpy(buffer, vaddr, len);
-	else
-		memcpy(vaddr, buffer, len);
-	iounmap(vaddr);
-
-	return 0;
+	void __iomem *vaddr;
+	unsigned long flags = 0;
+	/* in_nmi() is presumably a macro, so this local does not break
+	 * its own initializer -- TODO confirm */
+	int in_nmi = in_nmi();
+	u64 offset;
+	u32 trunk;
+
+	while (len > 0) {
+		/* byte offset of paddr within its page */
+		offset = paddr - (paddr & PAGE_MASK);
+		/*
+		 * NMI context uses its own slot and raw lock so it
+		 * cannot deadlock against an interrupted IRQ-context
+		 * copy on the same CPU.
+		 */
+		if (in_nmi) {
+			raw_spin_lock(&ghes_ioremap_lock_nmi);
+			vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT);
+		} else {
+			spin_lock_irqsave(&ghes_ioremap_lock_irq, flags);
+			vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT);
+		}
+		/* copy at most up to the end of the current page */
+		trunk = PAGE_SIZE - offset;
+		trunk = min(trunk, len);
+		if (from_phys)
+			memcpy_fromio(buffer, vaddr + offset, trunk);
+		else
+			memcpy_toio(vaddr + offset, buffer, trunk);
+		len -= trunk;
+		paddr += trunk;
+		buffer += trunk;
+		if (in_nmi) {
+			ghes_iounmap_nmi(vaddr);
+			raw_spin_unlock(&ghes_ioremap_lock_nmi);
+		} else {
+			ghes_iounmap_irq(vaddr);
+			spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags);
+		}
+	}
 }
 
 static int ghes_read_estatus(struct ghes *ghes, int silent)
@@ -196,10 +303,8 @@ static int ghes_read_estatus(struct ghes
 	if (!buf_paddr)
 		return -ENOENT;
 
-	rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
-				   sizeof(*ghes->estatus), 1);
-	if (rc)
-		return rc;
+	ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
+			      sizeof(*ghes->estatus), 1);
 	if (!ghes->estatus->block_status)
 		return -ENOENT;
 
@@ -214,17 +319,15 @@ static int ghes_read_estatus(struct ghes
 		goto err_read_block;
 	if (apei_estatus_check_header(ghes->estatus))
 		goto err_read_block;
-	rc = ghes_copy_tofrom_phys(ghes->estatus + 1,
-				   buf_paddr + sizeof(*ghes->estatus),
-				   len - sizeof(*ghes->estatus), 1);
-	if (rc)
-		return rc;
+	ghes_copy_tofrom_phys(ghes->estatus + 1,
+			      buf_paddr + sizeof(*ghes->estatus),
+			      len - sizeof(*ghes->estatus), 1);
 	if (apei_estatus_check(ghes->estatus))
 		goto err_read_block;
 	rc = 0;
 
 err_read_block:
-	if (rc && !silent)
+	if (rc && !silent && printk_ratelimit())
 		pr_warning(FW_WARN GHES_PFX
 			   "Failed to read error status block!\n");
 	return rc;
@@ -303,6 +406,43 @@ out:
 	return 0;
 }
 
+/*
+ * (Re-)arm the polling timer for a POLLED-notification error source,
+ * using the firmware-provided interval (treated as milliseconds via
+ * msecs_to_jiffies()).  A zero interval would poll continuously, so
+ * the source is left disabled with a firmware warning instead.
+ *
+ * NOTE(review): the warning string lacks a trailing '\n'.
+ */
+static void ghes_add_timer(struct ghes *ghes)
+{
+	struct acpi_hest_generic *g = ghes->generic;
+	unsigned long expire;
+
+	if (!g->notify.poll_interval) {
+		pr_warning(FW_WARN GHES_PFX "Poll interval is 0 for "
+			   "generic hardware error source: %d, disabled.",
+			   g->header.source_id);
+		return;
+	}
+	expire = jiffies + msecs_to_jiffies(g->notify.poll_interval);
+	ghes->timer.expires = round_jiffies_relative(expire);
+	add_timer(&ghes->timer);
+}
+
+/*
+ * Timer callback for POLLED sources: process any pending error status,
+ * then re-arm the timer unless ghes_remove() has set GHES_EXITING.
+ */
+static void ghes_poll_func(unsigned long data)
+{
+	struct ghes *ghes = (void *)data;
+
+	ghes_proc(ghes);
+	if (!(ghes->flags & GHES_EXITING))
+		ghes_add_timer(ghes);
+}
+
+/*
+ * IRQ handler for EXTERNAL-notification sources.  Returns IRQ_HANDLED
+ * when ghes_proc() processed an error record, IRQ_NONE when it
+ * reported an error (e.g. no record was pending).
+ */
+static irqreturn_t ghes_irq_func(int irq, void *data)
+{
+	struct ghes *ghes = data;
+	int rc;
+
+	rc = ghes_proc(ghes);
+	if (rc)
+		return IRQ_NONE;
+
+	return IRQ_HANDLED;
+}
+
 static int ghes_notify_sci(struct notifier_block *this,
 				  unsigned long event, void *data)
 {
@@ -319,10 +459,70 @@ static int ghes_notify_sci(struct notifi
 	return ret;
 }
 
+/*
+ * NMI die-notifier: scan every NMI-notified error source, report each
+ * record found and track the highest severity seen.  If any record is
+ * of panic severity, persist all records and panic (forcing a reboot
+ * via ghes_panic_timeout when panic_timeout is unset, so the error
+ * gets logged on the way down).  Otherwise the flagged sources are
+ * processed and cleared in a second pass.
+ *
+ * Returns NOTIFY_STOP when at least one record was consumed,
+ * NOTIFY_DONE otherwise.
+ */
+static int ghes_notify_nmi(struct notifier_block *this,
+				  unsigned long cmd, void *data)
+{
+	/* ghes_global: the source holding the most severe record */
+	struct ghes *ghes, *ghes_global = NULL;
+	int sev, sev_global = -1;
+	int ret = NOTIFY_DONE;
+
+	if (cmd != DIE_NMI && cmd != DIE_NMI_IPI)
+		return ret;
+
+	/* serialize NMI handling across CPUs */
+	raw_spin_lock(&ghes_nmi_lock);
+	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
+		/* silent read: no warnings from NMI context */
+		if (ghes_read_estatus(ghes, 1)) {
+			ghes_clear_estatus(ghes);
+			continue;
+		}
+		ghes_report(ghes);
+		sev = ghes_severity(ghes->estatus->error_severity);
+		if (sev > sev_global) {
+			sev_global = sev;
+			ghes_global = ghes;
+		}
+		ret = NOTIFY_STOP;
+	}
+
+	if (ret == NOTIFY_DONE)
+		goto out;
+
+	if (sev_global >= GHES_SEV_PANIC) {
+		herr_persist_all_records();
+		oops_begin();
+		/* reboot to log the error! */
+		if (panic_timeout == 0)
+			panic_timeout = ghes_panic_timeout;
+		pr_emerg(HW_ERR
+"Fatal hardware error from Generic Hardware Error Source: %d\n",
+			 ghes_global->generic->header.source_id);
+		pr_emerg(HW_ERR "Block status: 0x%08x\n",
+			 ghes_global->estatus->block_status);
+		pr_emerg(HW_ERR "Severity: %d\n",
+			 ghes_global->estatus->error_severity);
+		panic("Fatal hardware error!");
+	}
+
+	/* non-fatal path: process and clear the flagged sources */
+	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
+		if (!(ghes->flags & GHES_TO_CLEAR))
+			continue;
+		ghes_do_proc(ghes);
+		ghes_clear_estatus(ghes);
+	}
+
+out:
+	raw_spin_unlock(&ghes_nmi_lock);
+	return ret;
+}
+
 static struct notifier_block ghes_notifier_sci = {
 	.notifier_call = ghes_notify_sci,
 };
 
+/*
+ * Die-notifier registered while at least one NMI-notified source is
+ * present (see ghes_probe()/ghes_remove()).
+ */
+static struct notifier_block ghes_notifier_nmi = {
+	.notifier_call = ghes_notify_nmi,
+};
+
 static int __devinit ghes_probe(struct platform_device *ghes_dev)
 {
 	struct acpi_hest_generic *generic;
@@ -334,30 +534,21 @@ static int __devinit ghes_probe(struct p
 	if (!generic->enabled)
 		goto err;
 
-	if (generic->notify.type != ACPI_HEST_NOTIFY_SCI) {
-		char *notify = NULL;
-
-		switch (generic->notify.type) {
-		case ACPI_HEST_NOTIFY_POLLED:
-			notify = "POLL";
-			break;
-		case ACPI_HEST_NOTIFY_EXTERNAL:
-		case ACPI_HEST_NOTIFY_LOCAL:
-			notify = "IRQ";
-			break;
-		case ACPI_HEST_NOTIFY_NMI:
-			notify = "NMI";
-			break;
-		}
-		if (notify) {
-			pr_warning(GHES_PFX
-"Generic hardware error source: %d notified via %s is not supported!\n",
-				   generic->header.source_id, notify);
-		} else {
-			pr_warning(FW_WARN GHES_PFX
+	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_POLLED:
+	case ACPI_HEST_NOTIFY_EXTERNAL:
+	case ACPI_HEST_NOTIFY_SCI:
+	case ACPI_HEST_NOTIFY_NMI:
+		break;
+	case ACPI_HEST_NOTIFY_LOCAL:
+		pr_warning(GHES_PFX
+"Generic hardware error source: %d notified via local interrupt is not supported!\n",
+			   generic->header.source_id);
+		goto err;
+	default:
+		pr_warning(FW_WARN GHES_PFX
 "Unknown notification type: %u for generic hardware error source: %d\n",
-			generic->notify.type, generic->header.source_id);
-		}
+			   generic->notify.type, generic->header.source_id);
 		goto err;
 	}
 
@@ -388,6 +579,28 @@ static int __devinit ghes_probe(struct p
 		goto err;
 	}
 	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_POLLED:
+		ghes->timer.function = ghes_poll_func;
+		ghes->timer.data = (unsigned long)ghes;
+		init_timer_deferrable(&ghes->timer);
+		ghes_add_timer(ghes);
+		break;
+	case ACPI_HEST_NOTIFY_EXTERNAL:
+		/* External interrupt vector is GSI */
+		if (acpi_gsi_to_irq(generic->notify.vector, &ghes->irq)) {
+			pr_err(GHES_PFX
+	"Failed to map GSI to IRQ for generic hardware error source: %d\n",
+			       generic->header.source_id);
+			goto err_unreg;
+		}
+		if (request_irq(ghes->irq, ghes_irq_func,
+				0, "GHES IRQ", ghes)) {
+			pr_err(GHES_PFX
+	"Failed to register IRQ for generic hardware error source: %d\n",
+			       generic->header.source_id);
+			goto err_unreg;
+		}
+		break;
 	case ACPI_HEST_NOTIFY_SCI:
 		mutex_lock(&ghes_list_mutex);
 		if (list_empty(&ghes_sci))
@@ -395,12 +608,21 @@ static int __devinit ghes_probe(struct p
 		list_add_rcu(&ghes->list, &ghes_sci);
 		mutex_unlock(&ghes_list_mutex);
 		break;
+	case ACPI_HEST_NOTIFY_NMI:
+		mutex_lock(&ghes_list_mutex);
+		if (list_empty(&ghes_nmi))
+			register_die_notifier(&ghes_notifier_nmi);
+		list_add_rcu(&ghes->list, &ghes_nmi);
+		mutex_unlock(&ghes_list_mutex);
+		break;
 	default:
 		BUG();
 	}
 	platform_set_drvdata(ghes_dev, ghes);
 
 	return 0;
+err_unreg:
+	herr_dev_unregister(ghes->herr_dev);
 err:
 	if (ghes) {
 		ghes_fini(ghes);
@@ -417,7 +639,15 @@ static int __devexit ghes_remove(struct
 	ghes = platform_get_drvdata(ghes_dev);
 	generic = ghes->generic;
 
+	ghes->flags |= GHES_EXITING;
+
 	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_POLLED:
+		del_timer_sync(&ghes->timer);
+		break;
+	case ACPI_HEST_NOTIFY_EXTERNAL:
+		free_irq(ghes->irq, ghes);
+		break;
 	case ACPI_HEST_NOTIFY_SCI:
 		mutex_lock(&ghes_list_mutex);
 		list_del_rcu(&ghes->list);
@@ -426,6 +656,18 @@ static int __devexit ghes_remove(struct
 		mutex_unlock(&ghes_list_mutex);
 		synchronize_rcu();
 		break;
+	case ACPI_HEST_NOTIFY_NMI:
+		mutex_lock(&ghes_list_mutex);
+		list_del_rcu(&ghes->list);
+		if (list_empty(&ghes_nmi))
+			unregister_die_notifier(&ghes_notifier_nmi);
+		mutex_unlock(&ghes_list_mutex);
+		/*
+		 * To synchronize with NMI handler, ghes can only be
+		 * freed after NMI handler finishes.
+		 */
+		synchronize_rcu();
+		break;
 	default:
 		BUG();
 		break;
@@ -451,6 +693,8 @@ static struct platform_driver ghes_platf
 
 static int __init ghes_init(void)
 {
+	int rc;
+
 	if (acpi_disabled)
 		return -ENODEV;
 
@@ -459,12 +703,25 @@ static int __init ghes_init(void)
 		return -EINVAL;
 	}
 
-	return platform_driver_register(&ghes_platform_driver);
+	rc = ghes_ioremap_init();
+	if (rc)
+		goto err;
+
+	rc = platform_driver_register(&ghes_platform_driver);
+	if (rc)
+		goto err_ioremap_exit;
+
+	return 0;
+err_ioremap_exit:
+	ghes_ioremap_exit();
+err:
+	return rc;
 }
 
 static void __exit ghes_exit(void)
 {
 	platform_driver_unregister(&ghes_platform_driver);
+	/* release the atomic ioremap area after the driver is gone */
+	ghes_ioremap_exit();
 }
 
 module_init(ghes_init);
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
 
 int panic_timeout;
+EXPORT_SYMBOL_GPL(panic_timeout);
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/io.h>
+#include <linux/module.h>
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
 
@@ -90,3 +91,4 @@ int ioremap_page_range(unsigned long add
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(ioremap_page_range);
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1160,6 +1160,7 @@ void unmap_kernel_range_noflush(unsigned
 {
 	vunmap_page_range(addr, addr + size);
 }
+EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
 
 /**
  * unmap_kernel_range - unmap kernel VM area and flush cache and TLB

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -v3 2/8] lib, Make gen_pool memory allocator lock-less
  2010-10-27  5:28 ` [PATCH -v3 2/8] lib, Make gen_pool memory allocator lock-less Huang Ying
@ 2010-10-27  9:17   ` Andi Kleen
  2010-10-31 14:30     ` Huang Ying
  0 siblings, 1 reply; 17+ messages in thread
From: Andi Kleen @ 2010-10-27  9:17 UTC (permalink / raw)
  To: Huang Ying; +Cc: Len Brown, linux-kernel, Andi Kleen, linux-acpi

> --- a/lib/genalloc.c
> +++ b/lib/genalloc.c
> @@ -1,8 +1,8 @@
>  /*
> - * Basic general purpose allocator for managing special purpose memory
> - * not managed by the regular kmalloc/kfree interface.
> - * Uses for this includes on-device special memory, uncached memory
> - * etc.
> + * Basic general purpose allocator for managing special purpose
> + * memory, for example, memory that is not managed by the regular
> + * kmalloc/kfree interface.  Uses for this includes on-device special
> + * memory, uncached memory etc.

I think we need some more description here about the locklessness:

How about adding to the comment:

This version of the allocator supports lockless operation.

This makes it safe to use in NMI handlers and other special unblockable
contexts that could otherwise deadlock on locks.  This is implemented by 
using atomic operations and retries on any conflicts. 
The disadvantage is that there may be livelocks in extreme cases.  

The lockless operation only works if there is enough memory
available. If new memory is added to the pool a lock has to 
be still taken. So any user relying on locklessness has to ensure
that sufficient memory is preallocated.

The basic atomic operation of this allocator is cmpxchg on long. 
On architectures that don't support cmpxchg natively a fallback
is used. If the fallback uses locks it may not be safe to use
it in NMI contexts on these architectures.

> +/**
> + * gen_pool_for_each_chunk - iterate over chunks of generic memory pool
> + * @chunk:	the struct gen_pool_chunk * to use as a loop cursor
> + * @pool:	the generic memory pool
> + */

I believe that's not safe in a lockless context right?
Should note that.

> +#define gen_pool_for_each_chunk(chunk, pool)			\
> +	list_for_each_entry(chunk, &pool->chunks, next_chunk)
> +

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37
  2010-10-27  5:28 [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Huang Ying
                   ` (7 preceding siblings ...)
  2010-10-27  5:28 ` [PATCH -v3 8/8] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support Huang Ying
@ 2010-10-27 10:07 ` Peter Zijlstra
  2010-10-27 15:27   ` Randy Dunlap
  2010-10-27 17:23   ` Andi Kleen
  8 siblings, 2 replies; 17+ messages in thread
From: Peter Zijlstra @ 2010-10-27 10:07 UTC (permalink / raw)
  To: Huang Ying
  Cc: Mauro Carvalho Chehab, Len Brown, linux-kernel, Andi Kleen,
	linux-acpi, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Luck,
	Tony, BorislavPetkov, Andrew Morton, Don Zickus, Linus Torvalds

On Wed, 2010-10-27 at 13:28 +0800, Huang Ying wrote:
> v3:
> 
> - Rework lock-less memory allocator and lock-less list.
> 
> v2:
> 
> - Some minor changes according to Andi's comments.
> 
> [PATCH -v3 1/8] ACPI, APEI, Add ERST record ID cache
> [PATCH -v3 2/8] lib, Make gen_pool memory allocator lock-less
> [PATCH -v3 3/8] lib, Add lock-less NULL terminated single list
> [PATCH -v3 4/8] Hardware error device core
> [PATCH -v3 5/8] Hardware error record persistent support
> [PATCH -v3 6/8] ACPI, APEI, Use ERST for hardware error persisting before panic
> [PATCH -v3 7/8] ACPI, APEI, Report GHES error record with hardware error device core
> [PATCH -v3 8/8] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support

You forgot to CC all people who participated in the previous discussion.

You seem to have forgotten to address the high-level feedback given by
the x86 maintainers.

All you've done is decreased the arch/x86/ footprint of the patch
series, but neither you nor Andi have addressed the technical arguments
against adding this ABI.

Nor have you engaged in conversation with the other EDAC people on how
to extend the existing interface, or even work towards creating
something new that would cater to all interested parties.

Not charmed at all by your attitude.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37
  2010-10-27 10:07 ` [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Peter Zijlstra
@ 2010-10-27 15:27   ` Randy Dunlap
  2010-10-31 11:36       ` huang ying
  2010-10-27 17:23   ` Andi Kleen
  1 sibling, 1 reply; 17+ messages in thread
From: Randy Dunlap @ 2010-10-27 15:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Huang Ying, Mauro Carvalho Chehab, Len Brown, linux-kernel,
	Andi Kleen, linux-acpi, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Luck, Tony, BorislavPetkov, Andrew Morton,
	Don Zickus, Linus Torvalds

On Wed, 27 Oct 2010 12:07:30 +0200 Peter Zijlstra wrote:

> On Wed, 2010-10-27 at 13:28 +0800, Huang Ying wrote:
> > v3:
> > 
> > - Rework lock-less memory allocator and lock-less list.
> > 
> > v2:
> > 
> > - Some minor changes according to Andi's comments.
> > 
> > [PATCH -v3 1/8] ACPI, APEI, Add ERST record ID cache
> > [PATCH -v3 2/8] lib, Make gen_pool memory allocator lock-less
> > [PATCH -v3 3/8] lib, Add lock-less NULL terminated single list
> > [PATCH -v3 4/8] Hardware error device core
> > [PATCH -v3 5/8] Hardware error record persistent support
> > [PATCH -v3 6/8] ACPI, APEI, Use ERST for hardware error persisting before panic
> > [PATCH -v3 7/8] ACPI, APEI, Report GHES error record with hardware error device core
> > [PATCH -v3 8/8] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support
> 
> You forgot to CC all people who participated in the previous discussion.
> 
> You seem to have forgotten to address the high-level feedback given by
> the x86 maintainers.
> 
> All you've done is decreased the arch/x86/ footprint of the patch
> series, but neither you nor Andi have addressed the technical arguments
> against adding this ABI.
> 
> Nor have you engaged in conversation with the other EDAC people on how
> to extend the existing interface, or even work towards creating
> something new that would cater to all interested parties.
> 
> Not charmed at all by your attitude.
> --

Looks like you also have not addressed this reported build error:
  http://lkml.org/lkml/2010/10/25/440

---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37
  2010-10-27 10:07 ` [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Peter Zijlstra
  2010-10-27 15:27   ` Randy Dunlap
@ 2010-10-27 17:23   ` Andi Kleen
  2010-10-28 16:01     ` Ingo Molnar
  1 sibling, 1 reply; 17+ messages in thread
From: Andi Kleen @ 2010-10-27 17:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Huang Ying, Mauro Carvalho Chehab, Len Brown, linux-kernel,
	Andi Kleen, linux-acpi, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Luck, Tony, BorislavPetkov, Andrew Morton,
	Don Zickus, Linus Torvalds

Peter,

> All you've done is decreased the arch/x86/ footprint of the patch
> series, but neither you nor Andi have addressed the technical arguments
> against adding this ABI.

What were the technical arguments? I don't remember any.

I remember just reading some rants from people who clearly
never even looked at this code and made no attempt to review
it. It was very untechnical.

> Nor have you engaged in conversation with the other EDAC people on how

There was a long thread on it last time, patchkit really is based
on at least some of the sane feedback from that.

But in a nutshell:

This code is quite orthogonal to EDAC: EDAC does enumeration
(or some weird variant of it at least), exporting of counter registers
and some dumping of the registers into the kernel log. 

APEI doesn't do enumeration or counting or registers, it's really just
an interface to retrieve some events from firmware, stop the machine on 
fatal events and pass them on and support managing of the events in a 
backing store.

It's also a standardized interface supported by multiple vendors,
it's not proprietary at all as some people claimed.

> to extend the existing interface, or even work towards creating
> something new that would cater to all interested parties.

Open to serious input. Do you have anything concrete?
Please be concrete and technical ideally with references to code.

Some common comments I heard:

If you want enumeration it could be probably done in some 
EDAC2 variant which fixes the worst problems in the current EDAC interface.
But it won't be talking to APEI directly anyways -- APEI
doesn't do enumeration -- it's really a different problem
and won't be solved by this code.

If you want to move the event channel to perf: we looked at that
and it's not practical with the current infrastructure. And
not a very good fit anyways because the requirements
are quite different from the ones perf was designed for.
For more details see the big thread on the previous posting.

If you want to move the backing store to a file system
or MTD: we actually looked at that too but there were too
many problems and it's really far too much code for the simple 
use case.

Short term this code just attempts to handle fatal chipset errors
(which are a long standing problem in Linux) in a sane way.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37
  2010-10-27 17:23   ` Andi Kleen
@ 2010-10-28 16:01     ` Ingo Molnar
  0 siblings, 0 replies; 17+ messages in thread
From: Ingo Molnar @ 2010-10-28 16:01 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Peter Zijlstra, Huang Ying, Mauro Carvalho Chehab, Len Brown,
	linux-kernel, linux-acpi, Thomas Gleixner, H. Peter Anvin, Luck,
	Tony, BorislavPetkov, Andrew Morton, Don Zickus, Linus Torvalds


* Andi Kleen <andi@firstfloor.org> wrote:

> Peter,
> 
> > All you've done is decreased the arch/x86/ footprint of the patch series, but 
> > neither you nor Andi have addressed the technical arguments against adding this 
> > ABI.
> 
> What were the technical arguments? I don't remember any.

You did not reply to my technical arguments:

    http://lkml.org/lkml/2010/10/25/271

You did not reply to Thomas's technical arguments:

    http://lkml.org/lkml/2010/10/26/71

Are you really claiming that you "don't remember any"?

	Ingo

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37
  2010-10-27 15:27   ` Randy Dunlap
@ 2010-10-31 11:36       ` huang ying
  0 siblings, 0 replies; 17+ messages in thread
From: huang ying @ 2010-10-31 11:36 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: Peter Zijlstra, Huang Ying, Mauro Carvalho Chehab, Len Brown,
	linux-kernel, Andi Kleen, linux-acpi, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, Luck, Tony, BorislavPetkov,
	Andrew Morton, Don Zickus, Linus Torvalds

Hi, Randy,

On Wed, Oct 27, 2010 at 11:27 PM, Randy Dunlap <randy.dunlap@oracle.com> wrote:
> Looks like you also have not addressed this reported build error:
>  http://lkml.org/lkml/2010/10/25/440

Sorry for the late reply; I have been busy traveling recently. I will fix the issue in
the next version.

Best Regards,
Huang Ying
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37
@ 2010-10-31 11:36       ` huang ying
  0 siblings, 0 replies; 17+ messages in thread
From: huang ying @ 2010-10-31 11:36 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: Peter Zijlstra, Huang Ying, Mauro Carvalho Chehab, Len Brown,
	linux-kernel, Andi Kleen, linux-acpi, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, Luck, Tony, BorislavPetkov,
	Andrew Morton, Don Zickus, Linus Torvalds

Hi, Randy,

On Wed, Oct 27, 2010 at 11:27 PM, Randy Dunlap <randy.dunlap@oracle.com> wrote:
> Looks like you also have not addressed this reported build error:
>  http://lkml.org/lkml/2010/10/25/440

Sorry for the late reply; I have been busy traveling recently. I will fix the issue in
the next version.

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -v3 2/8] lib, Make gen_pool memory allocator lock-less
  2010-10-27  9:17   ` Andi Kleen
@ 2010-10-31 14:30     ` Huang Ying
  0 siblings, 0 replies; 17+ messages in thread
From: Huang Ying @ 2010-10-31 14:30 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Len Brown, linux-kernel, linux-acpi

Hi, Andi,

Sorry for the late reply; I have been traveling recently.

On Wed, 2010-10-27 at 02:17 -0700, Andi Kleen wrote:
> > --- a/lib/genalloc.c
> > +++ b/lib/genalloc.c
> > @@ -1,8 +1,8 @@
> >  /*
> > - * Basic general purpose allocator for managing special purpose memory
> > - * not managed by the regular kmalloc/kfree interface.
> > - * Uses for this includes on-device special memory, uncached memory
> > - * etc.
> > + * Basic general purpose allocator for managing special purpose
> > + * memory, for example, memory that is not managed by the regular
> > + * kmalloc/kfree interface.  Uses for this includes on-device special
> > + * memory, uncached memory etc.
> 
> I think we need some more description here about the locklessness:
> 
> How about adding to the comment:
> 
> This version of the allocator supports lockless operation.
> 
> This makes it safe to use in NMI handlers and other special unblockable
> contexts that could otherwise deadlock on locks.  This is implemented by 
> using atomic operations and retries on any conflicts. 
> The disadvantage is that there may be livelocks in extreme cases.

Or add something like:

To get better scalability, one allocator can be used for each CPU.

> The lockless operation only works if there is enough memory
> available. If new memory is added to the pool a lock has to 
> be still taken. So any user relying on locklessness has to ensure
> that sufficient memory is preallocated.

In fact, because RCU is used between gen_pool_add and
gen_pool_alloc/gen_pool_free, the memory can be added to pool at any
time. This makes the memory allocator more flexible. Maybe we can add
gen_pool_remove in the future to shrink the preallocated memory at any
time.

> The basic atomic operation of this allocator is cmpxchg on long. 
> On architectures that don't support cmpxchg natively a fallback
> is used. If the fallback uses locks it may not be safe to use
> it in NMI contexts on these architectures.
> 
> > +/**
> > + * gen_pool_for_each_chunk - iterate over chunks of generic memory pool
> > + * @chunk:	the struct gen_pool_chunk * to use as a loop cursor
> > + * @pool:	the generic memory pool
> > + */
> 
> I believe that's not safe in a lockless context right?
> Should note that.

Yes. Will add that.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2010-10-31 14:30 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-10-27  5:28 [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Huang Ying
2010-10-27  5:28 ` [PATCH -v3 1/8] ACPI, APEI, Add ERST record ID cache Huang Ying
2010-10-27  5:28 ` [PATCH -v3 2/8] lib, Make gen_pool memory allocator lock-less Huang Ying
2010-10-27  9:17   ` Andi Kleen
2010-10-31 14:30     ` Huang Ying
2010-10-27  5:28 ` [PATCH -v3 3/8] lib, Add lock-less NULL terminated single list Huang Ying
2010-10-27  5:28 ` [PATCH -v3 4/8] Hardware error device core Huang Ying
2010-10-27  5:28 ` [PATCH -v3 5/8] Hardware error record persistent support Huang Ying
2010-10-27  5:28 ` [PATCH -v3 6/8] ACPI, APEI, Use ERST for hardware error persisting before panic Huang Ying
2010-10-27  5:28 ` [PATCH -v3 7/8] ACPI, APEI, Report GHES error record with hardware error device core Huang Ying
2010-10-27  5:28 ` [PATCH -v3 8/8] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support Huang Ying
2010-10-27 10:07 ` [PATCH -v3 0/8] ACPI, APEI patches for 2.6.37 Peter Zijlstra
2010-10-27 15:27   ` Randy Dunlap
2010-10-31 11:36     ` huang ying
2010-10-31 11:36       ` huang ying
2010-10-27 17:23   ` Andi Kleen
2010-10-28 16:01     ` Ingo Molnar

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.