All of lore.kernel.org
 help / color / mirror / Atom feed
From: Vishal Verma <vishal.l.verma@intel.com>
To: linux-nvdimm@lists.01.org
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>,
	linux-acpi@vger.kernel.org
Subject: [PATCH 1/3] nfit: don't start a full scrub by default for an MCE
Date: Fri, 30 Sep 2016 17:19:29 -0600	[thread overview]
Message-ID: <1475277571-10152-2-git-send-email-vishal.l.verma@intel.com> (raw)
In-Reply-To: <1475277571-10152-1-git-send-email-vishal.l.verma@intel.com>

Starting a full Address Range Scrub (ARS) on hitting a memory error
machine check exception may not always be desirable. Provide a way
through sysfs to toggle the behavior between just adding the address
(cache line) where the MCE happened to the poison list and doing a full
scrub. The former (selective insertion of the address) is done
unconditionally.

Cc: linux-acpi@vger.kernel.org
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Linda Knippers <linda.knippers@hpe.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 drivers/acpi/nfit/core.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/acpi/nfit/mce.c  | 24 +++++++++++++++++-----
 drivers/acpi/nfit/nfit.h |  6 ++++++
 3 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 80cc7c0..ed78cc7 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -878,6 +878,58 @@ static ssize_t revision_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(revision);
 
+static ssize_t hw_error_scrub_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+	struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+	return sprintf(buf, "%d\n", acpi_desc->scrub_mode);
+}
+
+/*
+ * The 'hw_error_scrub' attribute can have the following values written to it:
+ * '0': Switch to the default mode where an exception will only insert
+ *      the address of the memory error into the poison and badblocks lists.
+ * '1': Enable a full scrub to happen if an exception for a memory error is
+ *      received.
+ */
+static ssize_t hw_error_scrub_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct nvdimm_bus_descriptor *nd_desc;
+	ssize_t rc;
+	long val;
+
+	rc = kstrtol(buf, 0, &val);
+	if (rc)
+		return rc;
+
+	device_lock(dev);
+	nd_desc = dev_get_drvdata(dev);
+	if (nd_desc) {
+		struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+		switch (val) {
+		case HW_ERROR_SCRUB_ON:
+			acpi_desc->scrub_mode = HW_ERROR_SCRUB_ON;
+			break;
+		case HW_ERROR_SCRUB_OFF:
+			acpi_desc->scrub_mode = HW_ERROR_SCRUB_OFF;
+			break;
+		default:
+			rc = -EINVAL;
+			break;
+		}
+	}
+	device_unlock(dev);
+	if (rc)
+		return rc;
+	return size;
+}
+static DEVICE_ATTR_RW(hw_error_scrub);
+
 /*
  * This shows the number of full Address Range Scrubs that have been
  * completed since driver load time. Userspace can wait on this using
@@ -950,6 +1002,7 @@ static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n)
 static struct attribute *acpi_nfit_attributes[] = {
 	&dev_attr_revision.attr,
 	&dev_attr_scrub.attr,
+	&dev_attr_hw_error_scrub.attr,
 	NULL,
 };
 
diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
index 161f915..e5ce81c 100644
--- a/drivers/acpi/nfit/mce.c
+++ b/drivers/acpi/nfit/mce.c
@@ -14,6 +14,7 @@
  */
 #include <linux/notifier.h>
 #include <linux/acpi.h>
+#include <linux/nd.h>
 #include <asm/mce.h>
 #include "nfit.h"
 
@@ -62,12 +63,25 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
 		}
 		mutex_unlock(&acpi_desc->init_mutex);
 
-		/*
-		 * We can ignore an -EBUSY here because if an ARS is already
-		 * in progress, just let that be the last authoritative one
-		 */
-		if (found_match)
+		if (!found_match)
+			continue;
+
+		/* If this fails due to an -ENOMEM, there is little we can do */
+		nvdimm_bus_add_poison(acpi_desc->nvdimm_bus,
+				ALIGN(mce->addr, L1_CACHE_BYTES),
+				L1_CACHE_BYTES);
+		nvdimm_region_notify(nfit_spa->nd_region,
+				NVDIMM_REVALIDATE_POISON);
+
+		if (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) {
+			/*
+			 * We can ignore an -EBUSY here because if an ARS is
+			 * already in progress, just let that be the last
+			 * authoritative one
+			 */
 			acpi_nfit_ars_rescan(acpi_desc);
+		}
+		break;
 	}
 
 	mutex_unlock(&acpi_desc_lock);
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index e894ded..c147e11 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -152,6 +152,7 @@ struct acpi_nfit_desc {
 	struct list_head list;
 	struct kernfs_node *scrub_count_state;
 	unsigned int scrub_count;
+	unsigned int scrub_mode;
 	unsigned int cancel:1;
 	unsigned long dimm_cmd_force_en;
 	unsigned long bus_cmd_force_en;
@@ -159,6 +160,11 @@ struct acpi_nfit_desc {
 			void *iobuf, u64 len, int rw);
 };
 
+enum scrub_mode {
+	HW_ERROR_SCRUB_OFF,
+	HW_ERROR_SCRUB_ON,
+};
+
 enum nd_blk_mmio_selector {
 	BDW,
 	DCR,
-- 
2.7.4

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

WARNING: multiple messages have this Message-ID (diff)
From: Vishal Verma <vishal.l.verma@intel.com>
To: linux-nvdimm@lists.01.org
Cc: Dan Williams <dan.j.williams@intel.com>,
	Ross Zwisler <ross.zwisler@linux.intel.com>,
	Linda Knippers <linda.knippers@hpe.com>,
	linux-acpi@vger.kernel.org,
	"Rafael J. Wysocki" <rafael.j.wysocki@intel.com>,
	Vishal Verma <vishal.l.verma@intel.com>
Subject: [PATCH 1/3] nfit: don't start a full scrub by default for an MCE
Date: Fri, 30 Sep 2016 17:19:29 -0600	[thread overview]
Message-ID: <1475277571-10152-2-git-send-email-vishal.l.verma@intel.com> (raw)
In-Reply-To: <1475277571-10152-1-git-send-email-vishal.l.verma@intel.com>

Starting a full Address Range Scrub (ARS) on hitting a memory error
machine check exception may not always be desirable. Provide a way
through sysfs to toggle the behavior between just adding the address
(cache line) where the MCE happened to the poison list and doing a full
scrub. The former (selective insertion of the address) is done
unconditionally.

Cc: linux-acpi@vger.kernel.org
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Linda Knippers <linda.knippers@hpe.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 drivers/acpi/nfit/core.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/acpi/nfit/mce.c  | 24 +++++++++++++++++-----
 drivers/acpi/nfit/nfit.h |  6 ++++++
 3 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 80cc7c0..ed78cc7 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -878,6 +878,58 @@ static ssize_t revision_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(revision);
 
+static ssize_t hw_error_scrub_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+	struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+	return sprintf(buf, "%d\n", acpi_desc->scrub_mode);
+}
+
+/*
+ * The 'hw_error_scrub' attribute can have the following values written to it:
+ * '0': Switch to the default mode where an exception will only insert
+ *      the address of the memory error into the poison and badblocks lists.
+ * '1': Enable a full scrub to happen if an exception for a memory error is
+ *      received.
+ */
+static ssize_t hw_error_scrub_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct nvdimm_bus_descriptor *nd_desc;
+	ssize_t rc;
+	long val;
+
+	rc = kstrtol(buf, 0, &val);
+	if (rc)
+		return rc;
+
+	device_lock(dev);
+	nd_desc = dev_get_drvdata(dev);
+	if (nd_desc) {
+		struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+		switch (val) {
+		case HW_ERROR_SCRUB_ON:
+			acpi_desc->scrub_mode = HW_ERROR_SCRUB_ON;
+			break;
+		case HW_ERROR_SCRUB_OFF:
+			acpi_desc->scrub_mode = HW_ERROR_SCRUB_OFF;
+			break;
+		default:
+			rc = -EINVAL;
+			break;
+		}
+	}
+	device_unlock(dev);
+	if (rc)
+		return rc;
+	return size;
+}
+static DEVICE_ATTR_RW(hw_error_scrub);
+
 /*
  * This shows the number of full Address Range Scrubs that have been
  * completed since driver load time. Userspace can wait on this using
@@ -950,6 +1002,7 @@ static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n)
 static struct attribute *acpi_nfit_attributes[] = {
 	&dev_attr_revision.attr,
 	&dev_attr_scrub.attr,
+	&dev_attr_hw_error_scrub.attr,
 	NULL,
 };
 
diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
index 161f915..e5ce81c 100644
--- a/drivers/acpi/nfit/mce.c
+++ b/drivers/acpi/nfit/mce.c
@@ -14,6 +14,7 @@
  */
 #include <linux/notifier.h>
 #include <linux/acpi.h>
+#include <linux/nd.h>
 #include <asm/mce.h>
 #include "nfit.h"
 
@@ -62,12 +63,25 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
 		}
 		mutex_unlock(&acpi_desc->init_mutex);
 
-		/*
-		 * We can ignore an -EBUSY here because if an ARS is already
-		 * in progress, just let that be the last authoritative one
-		 */
-		if (found_match)
+		if (!found_match)
+			continue;
+
+		/* If this fails due to an -ENOMEM, there is little we can do */
+		nvdimm_bus_add_poison(acpi_desc->nvdimm_bus,
+				ALIGN(mce->addr, L1_CACHE_BYTES),
+				L1_CACHE_BYTES);
+		nvdimm_region_notify(nfit_spa->nd_region,
+				NVDIMM_REVALIDATE_POISON);
+
+		if (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) {
+			/*
+			 * We can ignore an -EBUSY here because if an ARS is
+			 * already in progress, just let that be the last
+			 * authoritative one
+			 */
 			acpi_nfit_ars_rescan(acpi_desc);
+		}
+		break;
 	}
 
 	mutex_unlock(&acpi_desc_lock);
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index e894ded..c147e11 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -152,6 +152,7 @@ struct acpi_nfit_desc {
 	struct list_head list;
 	struct kernfs_node *scrub_count_state;
 	unsigned int scrub_count;
+	unsigned int scrub_mode;
 	unsigned int cancel:1;
 	unsigned long dimm_cmd_force_en;
 	unsigned long bus_cmd_force_en;
@@ -159,6 +160,11 @@ struct acpi_nfit_desc {
 			void *iobuf, u64 len, int rw);
 };
 
+enum scrub_mode {
+	HW_ERROR_SCRUB_OFF,
+	HW_ERROR_SCRUB_ON,
+};
+
 enum nd_blk_mmio_selector {
 	BDW,
 	DCR,
-- 
2.7.4


  reply	other threads:[~2016-09-30 23:20 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-09-30 23:19 [PATCH 0/3] misc updates for Address Range Scrub Vishal Verma
2016-09-30 23:19 ` Vishal Verma
2016-09-30 23:19 ` Vishal Verma [this message]
2016-09-30 23:19   ` [PATCH 1/3] nfit: don't start a full scrub by default for an MCE Vishal Verma
2016-09-30 23:19 ` [PATCH 2/3] pmem: reduce kmap_atomic sections to the memcpys only Vishal Verma
2016-09-30 23:19   ` Vishal Verma
2016-09-30 23:19 ` [PATCH 3/3] libnvdimm: clear the internal poison_list when clearing badblocks Vishal Verma
2016-09-30 23:19   ` Vishal Verma
  -- strict thread matches above, loose matches on Subject: below --
2016-09-29  0:10 [PATCH 0/3] misc updates for Address Range Scrub Vishal Verma
2016-09-29  0:10 ` [PATCH 1/3] nfit: don't start a full scrub by default for an MCE Vishal Verma
2016-09-29  0:10   ` Vishal Verma
2016-09-29  0:53   ` Dan Williams
2016-09-29  0:53     ` Dan Williams
2016-09-29  2:26     ` Dan Williams
2016-09-29  2:26       ` Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1475277571-10152-2-git-send-email-vishal.l.verma@intel.com \
    --to=vishal.l.verma@intel.com \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-nvdimm@lists.01.org \
    --cc=rafael.j.wysocki@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.