Linux-EDAC Archive on lore.kernel.org
 help / color / Atom feed
From: Shiju Jose <shiju.jose@huawei.com>
To: <linux-edac@vger.kernel.org>, <linux-acpi@vger.kernel.org>,
	<linux-kernel@vger.kernel.org>, <bp@alien8.de>,
	<tony.luck@intel.com>, <rjw@rjwysocki.net>, <james.morse@arm.com>,
	<lenb@kernel.org>
Cc: <linuxarm@huawei.com>, <shiju.jose@huawei.com>
Subject: [RFC PATCH 6/7] RAS/CEC: Add CPU Correctable Error Collector to isolate an erroneous CPU core
Date: Fri, 2 Oct 2020 13:22:34 +0100
Message-ID: <20201002122235.1280-7-shiju.jose@huawei.com> (raw)
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>

When CPU correctable errors, for example L1/L2 cache errors, are
reported on an ARM64 CPU core too often, the core should be isolated.
Add the CPU correctable error collector to store the CPU correctable
error count.

When the correctable error count for a CPU exceeds the threshold
value in a short time period, it will try to isolate the CPU core.

If disabling the entire CPU core is not acceptable, please suggest
a method to disable the L1 and L2 caches on an ARM64 core.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 arch/arm64/ras/Kconfig |  17 +++
 drivers/ras/Kconfig    |   1 +
 drivers/ras/cec.c      | 231 +++++++++++++++++++++++++++++++++++++++--
 include/linux/ras.h    |   9 ++
 4 files changed, 247 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm64/ras/Kconfig

diff --git a/arch/arm64/ras/Kconfig b/arch/arm64/ras/Kconfig
new file mode 100644
index 000000000000..bfa14157cd2e
--- /dev/null
+++ b/arch/arm64/ras/Kconfig
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+config RAS_CEC
+        bool "Correctable Errors Collector"
+        depends on ARM64 && HOTPLUG_CPU && DEBUG_FS
+        help
+          This is a small cache which collects correctable CPU errors and
+          counts their repeated occurrence. Once the counter for a CPU
+          overflows in a short time period, we try to offline that CPU
+          as we take it to mean that it has reached a relatively high error
+          count and would probably be best if we don't use it anymore.
+
+          Presently CPU error correction is enabled for the ARM64 platform only.
+
+config RAS_CEC_DEBUG
+        bool "CEC debugging machinery"
+        default n
+        depends on RAS_CEC
diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig
index c2a236f2e846..d2f877e5f7ad 100644
--- a/drivers/ras/Kconfig
+++ b/drivers/ras/Kconfig
@@ -32,5 +32,6 @@ menuconfig RAS
 if RAS
 
 source "arch/x86/ras/Kconfig"
+source "arch/arm64/ras/Kconfig"
 
 endif
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index ca52917d514c..408bf2ac2461 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -7,6 +7,8 @@
 #include <linux/ras.h>
 #include <linux/kernel.h>
 #include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
 
 #if defined(CONFIG_X86_MCE)
 #include <asm/mce.h>
@@ -143,7 +145,7 @@ static struct ce_array {
 		};
 		__u32 flags;
 	};
-} ce_arr;
+} ce_arr, cpu_ce_arr;
 
 static u64 dfs_pfn;
 
@@ -156,6 +158,8 @@ static u64 action_threshold = COUNT_MASK;
 #define CEC_DECAY_MAX_INTERVAL	   30 *	24 * 60 * 60	/* one month */
 static u64 decay_interval = CEC_DECAY_DEFAULT_INTERVAL;
 
+static const char * const bins[] = { "00", "01", "10", "11" };
+
 /* Definitions for elements (for example CPU) for which
  * error count on shrot time period is checked with threshold.
  *
@@ -484,6 +488,172 @@ static int cec_add_elem(u64 pfn)
 	return ret;
 }
 
+struct cec_elem_offline {
+	struct work_struct work;	/* work item that runs the offline handler */
+	struct ce_array *ca;		/* collector array holding the element */
+	int array_index;		/* element's index in ca->array when queued */
+	int elem_id;			/* element id; the CPU number for CPU elements */
+};
+
+/*
+ * Work function to offline a CPU, because the offlining has to be done
+ * in process context. Owns the work item and frees it on every path.
+ */
+static void cec_cpu_offline_work_fn(struct work_struct *work)
+{
+	int rc, cpu;
+	struct cec_elem_offline *elem;
+
+	elem = container_of(work, struct cec_elem_offline, work);
+
+	cpu = elem->elem_id;
+	if (!cpu_online(cpu))
+		goto out;	/* already offline: still release the work item */
+
+	rc = remove_cpu(cpu);
+	if (rc) {
+		pr_warn("Failed to offline CPU%d, error %d\n", cpu, rc);
+	} else {
+		/* NOTE(review): unlocked; array_index may be stale by now. */
+		elem->ca->array[elem->array_index] |= ELEM_STATUS_BIT;
+	}
+
+out:
+	kfree(elem);
+}
+
+int cec_cpu_add_elem(int cpu, u64 ce_count)
+{
+	struct ce_array *ca = &cpu_ce_arr;
+	unsigned int to = 0;
+	int count, ret = 0;
+	unsigned long flags;
+	struct cec_elem_offline *elem;
+
+	/*
+	 * We can be called before cec_init() has allocated the array, or for
+	 * a CPU that is not online. We ignore the error for simplicity.
+	 */
+	if (!ca->array || ca->disabled || !cpu_online(cpu))
+		return -ENODEV;
+
+	spin_lock_irqsave(&ca->spin_lock, flags);
+
+	ca->ces_entered++;
+
+	ret = find_elem(ca, cpu, &to);
+	if (ret < 0) {
+		/* Shift range [to-end] to make room for one more element. */
+		memmove((void *)&ca->array[to + 1],
+			(void *)&ca->array[to],
+			(ca->n - to) * sizeof(u64));
+
+		/* Widen to u64 before shifting so the id is not computed in int. */
+		ca->array[to] = (u64)cpu << ca->id_shift;
+		ca->n++;
+	}
+
+	/* Error received for a CPU previously offlined by CEC, which was later
+	 * brought online elsewhere: reset its status bit and count.
+	 */
+	if (ca->array[to] & ELEM_STATUS_BIT) {
+		ca->array[to] &= ~(ELEM_STATUS_BIT);
+		ca->array[to] &= ~(COUNT_MASK);
+	}
+
+	/* Add/refresh element generation and increment count */
+	ca->array[to] |= DECAY_MASK << COUNT_BITS;
+	/* Saturate at COUNT_MASK so the add cannot overflow the count field. */
+	ca->array[to] += min_t(u64, ce_count, COUNT_MASK - COUNT(ca->array[to]));
+
+	/* Check action threshold and offline, if reached. */
+	count = COUNT(ca->array[to]);
+	if (count >= action_threshold) {
+		if (!cpu_online(cpu)) {
+			pr_warn("CEC: Invalid cpu: %d\n", cpu);
+		} else {
+			/* We have reached max count for this cpu, offline it. */
+			ca->elems_poisoned++;
+			/* schedule work function to offline the cpu */
+			elem = kmalloc(sizeof(*elem), GFP_NOWAIT);
+			if (elem) {
+				pr_info("CEC: offlining cpu: %d\n", cpu);
+				elem->ca = ca;
+				elem->array_index = to;
+				elem->elem_id = cpu;
+				INIT_WORK(&elem->work, cec_cpu_offline_work_fn);
+				schedule_work(&elem->work);
+			} else
+				pr_warn("CEC: offlining cpu: out of memory %d\n", cpu);
+		}
+
+		/*
+		 * Return a >0 value to callers, to denote that we've reached
+		 * the offlining threshold.
+		 */
+		ret = 1;
+
+		goto unlock;
+	}
+
+	ca->decay_count++;
+
+	/* Do we need to call spring cleaning for the modules(eg CPU) with
+	 * small number of elements?
+	 */
+	if (ca->decay_count >= (num_present_cpus() >> DECAY_BITS))
+		do_spring_cleaning(ca);
+
+	WARN_ON_ONCE(sanity_check(ca));
+
+unlock:
+	spin_unlock_irqrestore(&ca->spin_lock, flags);
+
+	return ret;
+}
+
+/* debugfs "cpu_stats" show handler: dumps the CPU CE array under its lock. */
+static int cec_cpu_stats_show(struct seq_file *seq, void *v)
+{
+	struct ce_array *ca = &cpu_ce_arr;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&ca->spin_lock, flags);
+	seq_puts(seq, "CEC CPU Stats:\n");
+
+	seq_printf(seq, "{ n: %d\n", ca->n);
+	for (i = 0; i < ca->n; i++) {
+		int cpu = ELEM_NO(ca->array[i], ca->id_shift);
+
+		seq_printf(seq, "cpu=%d: %03llx\n",
+			   cpu, ca->array[i]);
+
+		seq_printf(seq, " %3d: [%d|%s|%03lld|%s]\n",
+			   i, cpu, bins[DECAY(ca->array[i])],
+			   COUNT(ca->array[i]),
+			   cpu_online(cpu) ? "online" :
+			   (ca->array[i] & ELEM_STATUS_BIT) ?
+			   "offlined-by-cec" : "offline");
+	}
+
+	seq_puts(seq, "}\n");
+
+	seq_printf(seq, "Stats:\nCEs: %llu\nofflined CPUs: %llu\n",
+		   ca->ces_entered, ca->elems_poisoned);
+
+	seq_printf(seq, "Flags: 0x%x\n", ca->flags);
+	seq_printf(seq, "Decay interval: %lld seconds\n", decay_interval);
+	seq_printf(seq, "Decays: %lld\n", ca->decays_done);
+
+	seq_printf(seq, "Action threshold: %lld\n", action_threshold);
+
+	spin_unlock_irqrestore(&ca->spin_lock, flags);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(cec_cpu_stats);
+
 static int u64_get(void *data, u64 *val)
 {
 	*val = *(u64 *)data;
@@ -514,6 +684,7 @@ static int decay_interval_set(void *data, u64 val)
 	decay_interval = val;
 
 	cec_mod_work(&ce_arr.work, decay_interval);
+	cec_mod_work(&cpu_ce_arr.work, decay_interval/RAS_CEC_NUM_TIME_SLOTS);
 
 	return 0;
 }
@@ -532,8 +703,6 @@ static int action_threshold_set(void *data, u64 val)
 }
 DEFINE_DEBUGFS_ATTRIBUTE(action_threshold_ops, u64_get, action_threshold_set, "%lld\n");
 
-static const char * const bins[] = { "00", "01", "10", "11" };
-
 static int array_dump(struct seq_file *m, void *v)
 {
 	struct ce_array *ca = &ce_arr;
@@ -620,6 +789,14 @@ static int __init create_debugfs_nodes(void)
 	}
 #endif
 
+#if defined(CONFIG_ARM64)
+	array = debugfs_create_file("cpu_stats", 0400, d, NULL, &cec_cpu_stats_fops);
+	if (!array) {
+		pr_warn("Error creating cpu_stats debugfs node!\n");
+		goto err;
+	}
+#endif
+
 	return 0;
 
 err:
@@ -658,21 +835,26 @@ static struct notifier_block cec_nb = {
 
 static void __init cec_init(void)
 {
-	if (ce_arr.disabled)
+	if (ce_arr.disabled && cpu_ce_arr.disabled)
 		return;
 
 #if defined(CONFIG_X86_MCE)
 	ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!ce_arr.array) {
 		pr_err("Error allocating CE array page!\n");
-		return;
+		goto error;
 	}
 #endif
 
-	if (create_debugfs_nodes()) {
-		free_page((unsigned long)ce_arr.array);
-		return;
-	}
+#if defined(CONFIG_ARM64)
+	cpu_ce_arr.array = kcalloc(num_present_cpus(), sizeof(*(cpu_ce_arr.array)),
+				   GFP_KERNEL);
+	if (!cpu_ce_arr.array)
+		goto error;
+#endif
+
+	if (create_debugfs_nodes())
+		goto error;
 
 #if defined(CONFIG_X86_MCE)
 	ce_arr.id_shift = PAGE_SHIFT;
@@ -682,22 +864,49 @@ static void __init cec_init(void)
 	mce_register_decode_chain(&cec_nb);
 #endif
 
+#if defined(CONFIG_ARM64)
+	cpu_ce_arr.short_period = true;
+	cpu_ce_arr.id_shift = ELEM_ID_SHIFT;
+	spin_lock_init(&cpu_ce_arr.spin_lock);
+	INIT_DELAYED_WORK(&cpu_ce_arr.work, cec_work_fn);
+	schedule_delayed_work(&cpu_ce_arr.work, CEC_DECAY_DEFAULT_INTERVAL/RAS_CEC_NUM_TIME_SLOTS);
+#endif
+
 	pr_info("Correctable Errors collector initialized.\n");
+	return;
+error:
+#if defined(CONFIG_ARM64)
+	kfree(cpu_ce_arr.array);
+#endif
+	if (ce_arr.array)
+		free_page((unsigned long)ce_arr.array);
+
 }
 late_initcall(cec_init);
 
 int __init parse_cec_param(char *str)
 {
+	bool match = false;
+
 	if (!str)
 		return 0;
 
 	if (*str == '=')
 		str++;
 
-	if (!strcmp(str, "cec_disable"))
+	if (!strcmp(str, "cec_disable")) {
 		ce_arr.disabled = 1;
+		match = true;
+	}
+
+	if (!strcmp(str, "cec_cpu_disable")) {
+		cpu_ce_arr.disabled = 1;
+		match = true;
+	}
+
+	if (match)
+		return 1;
 	else
 		return 0;
 
-	return 1;
 }
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 1f4048bf2674..43d91298f1e3 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -18,6 +18,15 @@ static inline int ras_add_daemon_trace(void) { return 0; }
 
 #ifdef CONFIG_RAS_CEC
 int __init parse_cec_param(char *str);
+/**
+ * cec_cpu_add_elem - add the count of CPU correctable errors to the
+ * CEC(correctable errors collector).
+ * @cpu: CPU index.
+ * @ce_count: CPU correctable errors count.
+ */
+int cec_cpu_add_elem(int cpu, u64 ce_count);
+#else
+static inline int cec_cpu_add_elem(int cpu, u64 ce_count) { return -ENODEV; }
 #endif
 
 #ifdef CONFIG_RAS
-- 
2.17.1



  parent reply index

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-10-02 12:22 [RFC PATCH 0/7] RAS/CEC: Extend CEC for errors count check on short time period Shiju Jose
2020-10-02 12:22 ` [RFC PATCH 1/7] RAS/CEC: Replace the macro PFN with ELEM_NO Shiju Jose
2020-10-02 12:22 ` [RFC PATCH 2/7] RAS/CEC: Replace pfns_poisoned with elems_poisoned Shiju Jose
2020-10-02 12:22 ` [RFC PATCH 3/7] RAS/CEC: Move X86 MCE specific code under CONFIG_X86_MCE Shiju Jose
2020-10-02 12:22 ` [RFC PATCH 4/7] RAS/CEC: Modify cec_mod_work() for common use Shiju Jose
2020-10-02 12:22 ` [RFC PATCH 5/7] RAS/CEC: Add support for errors count check on short time period Shiju Jose
2020-10-02 12:22 ` Shiju Jose [this message]
2020-10-02 12:22 ` [RFC PATCH 7/7] ACPI / APEI: Add reporting ARM64 CPU correctable errors to the CEC Shiju Jose
2020-10-02 12:43 ` [RFC PATCH 0/7] RAS/CEC: Extend CEC for errors count check on short time period Borislav Petkov
2020-10-02 15:38   ` Shiju Jose
2020-10-02 17:33     ` James Morse
2020-10-02 18:02       ` Borislav Petkov
2020-10-06 16:13       ` Shiju Jose
2020-10-07 16:45         ` James Morse
2020-10-02 16:04   ` Luck, Tony

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201002122235.1280-7-shiju.jose@huawei.com \
    --to=shiju.jose@huawei.com \
    --cc=bp@alien8.de \
    --cc=james.morse@arm.com \
    --cc=lenb@kernel.org \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linuxarm@huawei.com \
    --cc=rjw@rjwysocki.net \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-EDAC Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-edac/0 linux-edac/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-edac linux-edac/ https://lore.kernel.org/linux-edac \
		linux-edac@vger.kernel.org
	public-inbox-index linux-edac

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-edac


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git