nvdimm.lists.linux.dev archive mirror
 help / color / mirror / Atom feed
* [ndctl PATCH v2 0/3] nfit, address-range-scrub: rework and fixes
@ 2018-04-06  4:18 Dan Williams
  2018-04-06  4:18 ` [ndctl PATCH v2 1/3] nfit, address-range-scrub: determine one platform max_ars value Dan Williams
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Dan Williams @ 2018-04-06  4:18 UTC (permalink / raw)
  To: linux-nvdimm

Changes since v1:
* Merge the cleanups and fixes with Dave's reviewed-by
* Rework acpi_nfit_query_poison() to be independent of the spa range
* Rework all acpi_nfit_query_poison() usage to call it once and then
  iterate the result buffer over all SPA ranges
* Make sure the implementation never attempts to scrub / register
  unknown SPA types
* Change no_init_ars to a bool type so it can be specified without an
  '=' character.

---

Given the fact that ARS can take 10s to 100s of seconds it is not
feasible to wait for ARS completion before publishing persistent memory
namespaces. Instead convert the ARS implementation to perform a short
ARS for critical errors, ones that caused a previous system reset,
before registering namespaces. Finally, arrange for all long ARS
operations to run in the background and populate the badblock lists at
run time.

---

Dan Williams (3):
      nfit, address-range-scrub: determine one platform max_ars value
      nfit, address-range-scrub: rework and simplify ARS state machine
      nfit, address-range-scrub: add module option to skip initial ars


 drivers/acpi/nfit/core.c |  542 ++++++++++++++++++++--------------------------
 drivers/acpi/nfit/nfit.h |    6 -
 2 files changed, 243 insertions(+), 305 deletions(-)
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [ndctl PATCH v2 1/3] nfit, address-range-scrub: determine one platform max_ars value
  2018-04-06  4:18 [ndctl PATCH v2 0/3] nfit, address-range-scrub: rework and fixes Dan Williams
@ 2018-04-06  4:18 ` Dan Williams
  2018-04-06  4:19 ` [ndctl PATCH v2 2/3] nfit, address-range-scrub: rework and simplify ARS state machine Dan Williams
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2018-04-06  4:18 UTC (permalink / raw)
  To: linux-nvdimm

acpi_nfit_query_poison() is awkward in that it requires an nfit_spa
argument in order to determine what max_ars value to use. Instead probe
for the minimum max_ars across all scrub-capable ranges in the system
and drop the nfit_spa argument.

This enables a larger rework / simplification of the ARS state machine
whereby the status can be retrieved once and then iterated over all
address ranges to reap completions.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c |   78 ++++++++++++++++++++++++----------------------
 drivers/acpi/nfit/nfit.h |    2 +
 2 files changed, 41 insertions(+), 39 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 9c56022c216f..866853abebea 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2494,16 +2494,16 @@ static int ars_get_status(struct acpi_nfit_desc *acpi_desc)
 	int rc, cmd_rc;
 
 	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, ars_status,
-			acpi_desc->ars_status_size, &cmd_rc);
+			acpi_desc->max_ars, &cmd_rc);
 	if (rc < 0)
 		return rc;
 	return cmd_rc;
 }
 
-static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc,
-		struct nd_cmd_ars_status *ars_status)
+static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc)
 {
 	struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus;
+	struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status;
 	int rc;
 	u32 i;
 
@@ -2739,60 +2739,35 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 	return rc;
 }
 
-static int ars_status_alloc(struct acpi_nfit_desc *acpi_desc,
-		u32 max_ars)
+static int ars_status_alloc(struct acpi_nfit_desc *acpi_desc)
 {
 	struct device *dev = acpi_desc->dev;
 	struct nd_cmd_ars_status *ars_status;
 
-	if (acpi_desc->ars_status && acpi_desc->ars_status_size >= max_ars) {
-		memset(acpi_desc->ars_status, 0, acpi_desc->ars_status_size);
+	if (acpi_desc->ars_status) {
+		memset(acpi_desc->ars_status, 0, acpi_desc->max_ars);
 		return 0;
 	}
 
-	if (acpi_desc->ars_status)
-		devm_kfree(dev, acpi_desc->ars_status);
-	acpi_desc->ars_status = NULL;
-	ars_status = devm_kzalloc(dev, max_ars, GFP_KERNEL);
+	ars_status = devm_kzalloc(dev, acpi_desc->max_ars, GFP_KERNEL);
 	if (!ars_status)
 		return -ENOMEM;
 	acpi_desc->ars_status = ars_status;
-	acpi_desc->ars_status_size = max_ars;
 	return 0;
 }
 
-static int acpi_nfit_query_poison(struct acpi_nfit_desc *acpi_desc,
-		struct nfit_spa *nfit_spa)
+static int acpi_nfit_query_poison(struct acpi_nfit_desc *acpi_desc)
 {
-	struct acpi_nfit_system_address *spa = nfit_spa->spa;
 	int rc;
 
-	if (!nfit_spa->max_ars) {
-		struct nd_cmd_ars_cap ars_cap;
-
-		memset(&ars_cap, 0, sizeof(ars_cap));
-		rc = ars_get_cap(acpi_desc, &ars_cap, nfit_spa);
-		if (rc < 0)
-			return rc;
-		nfit_spa->max_ars = ars_cap.max_ars_out;
-		nfit_spa->clear_err_unit = ars_cap.clear_err_unit;
-		/* check that the supported scrub types match the spa type */
-		if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE &&
-				((ars_cap.status >> 16) & ND_ARS_VOLATILE) == 0)
-			return -ENOTTY;
-		else if (nfit_spa_type(spa) == NFIT_SPA_PM &&
-				((ars_cap.status >> 16) & ND_ARS_PERSISTENT) == 0)
-			return -ENOTTY;
-	}
-
-	if (ars_status_alloc(acpi_desc, nfit_spa->max_ars))
+	if (ars_status_alloc(acpi_desc))
 		return -ENOMEM;
 
 	rc = ars_get_status(acpi_desc);
 	if (rc < 0 && rc != -ENOSPC)
 		return rc;
 
-	if (ars_status_process_records(acpi_desc, acpi_desc->ars_status))
+	if (ars_status_process_records(acpi_desc))
 		return -ENOMEM;
 
 	return 0;
@@ -2824,7 +2799,7 @@ static void acpi_nfit_async_scrub(struct acpi_nfit_desc *acpi_desc,
 
 		if (acpi_desc->cancel)
 			break;
-		rc = acpi_nfit_query_poison(acpi_desc, nfit_spa);
+		rc = acpi_nfit_query_poison(acpi_desc);
 		if (rc == -ENOTTY)
 			break;
 		if (rc == -EBUSY && !tmo) {
@@ -2925,7 +2900,7 @@ static void acpi_nfit_scrub(struct work_struct *work)
 			 */
 			rc = 0;
 		} else
-			rc = acpi_nfit_query_poison(acpi_desc, nfit_spa);
+			rc = acpi_nfit_query_poison(acpi_desc);
 
 		if (rc == -ENOTTY) {
 			/* no ars capability, just register spa and move on */
@@ -3018,6 +2993,31 @@ static void acpi_nfit_scrub(struct work_struct *work)
 	mutex_unlock(&acpi_desc->init_mutex);
 }
 
+static void acpi_nfit_init_ars(struct acpi_nfit_desc *acpi_desc,
+		struct nfit_spa *nfit_spa)
+{
+	int type = nfit_spa_type(nfit_spa->spa);
+	struct nd_cmd_ars_cap ars_cap;
+	int rc;
+
+	memset(&ars_cap, 0, sizeof(ars_cap));
+	rc = ars_get_cap(acpi_desc, &ars_cap, nfit_spa);
+	if (rc < 0)
+		return;
+	/* check that the supported scrub types match the spa type */
+	if (type == NFIT_SPA_VOLATILE && ((ars_cap.status >> 16)
+				& ND_ARS_VOLATILE) == 0)
+		return;
+	if (type == NFIT_SPA_PM && ((ars_cap.status >> 16)
+				& ND_ARS_PERSISTENT) == 0)
+		return;
+
+	nfit_spa->max_ars = ars_cap.max_ars_out;
+	nfit_spa->clear_err_unit = ars_cap.clear_err_unit;
+	acpi_desc->max_ars = max(nfit_spa->max_ars, acpi_desc->max_ars);
+}
+
+
 static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
 {
 	struct nfit_spa *nfit_spa;
@@ -3026,8 +3026,10 @@ static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
 		int rc, type = nfit_spa_type(nfit_spa->spa);
 
 		/* PMEM and VMEM will be registered by the ARS workqueue */
-		if (type == NFIT_SPA_PM || type == NFIT_SPA_VOLATILE)
+		if (type == NFIT_SPA_PM || type == NFIT_SPA_VOLATILE) {
+			acpi_nfit_init_ars(acpi_desc, nfit_spa);
 			continue;
+		}
 		/* BLK apertures belong to BLK region registration below */
 		if (type == NFIT_SPA_BDW)
 			continue;
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index 2b97e5f76bdf..45e7949986a8 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -197,10 +197,10 @@ struct acpi_nfit_desc {
 	struct device *dev;
 	u8 ars_start_flags;
 	struct nd_cmd_ars_status *ars_status;
-	size_t ars_status_size;
 	struct work_struct work;
 	struct list_head list;
 	struct kernfs_node *scrub_count_state;
+	unsigned int max_ars;
 	unsigned int scrub_count;
 	unsigned int scrub_mode;
 	unsigned int cancel:1;

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [ndctl PATCH v2 2/3] nfit, address-range-scrub: rework and simplify ARS state machine
  2018-04-06  4:18 [ndctl PATCH v2 0/3] nfit, address-range-scrub: rework and fixes Dan Williams
  2018-04-06  4:18 ` [ndctl PATCH v2 1/3] nfit, address-range-scrub: determine one platform max_ars value Dan Williams
@ 2018-04-06  4:19 ` Dan Williams
  2018-04-06 22:06   ` Kani, Toshi
  2018-04-06  4:19 ` [ndctl PATCH v2 3/3] nfit, address-range-scrub: add module option to skip initial ars Dan Williams
  2018-04-06  4:31 ` [ndctl PATCH v2 0/3] nfit, address-range-scrub: rework and fixes Dan Williams
  3 siblings, 1 reply; 9+ messages in thread
From: Dan Williams @ 2018-04-06  4:19 UTC (permalink / raw)
  To: linux-nvdimm

ARS is an operation that can take 10s to 100s of seconds to find media
errors that should rarely be present. If the platform crashes due to
media errors in persistent memory, the expectation is that the BIOS will
report those known errors in a 'short' ARS request.

A 'short' ARS request asks platform firmware to return an ARS payload
with all known errors, but without issuing a 'long' scrub. At driver
init a short request is issued to all PMEM ranges before registering
regions. Then, in the background, a long ARS is scheduled for each
region.

The ARS implementation is simplified to centralize ARS completion work
in the ars_complete() helper called from ars_status_process_records().
The timeout is removed since there is no facility to cancel ARS, and
system init is never blocked waiting for a 'long' ARS. The ars_state
flags are used to coordinate ARS requests from driver init, ARS requests
from userspace, and ARS requests in response to media error
notifications.

Given that there is no notification of ARS completion the implementation
still needs to poll, but now it backs off exponentially to a maximum
poll period of 30 minutes.

Suggested-by: Toshi Kani <toshi.kani@hpe.com>
Co-developed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c |  469 ++++++++++++++++++++--------------------------
 drivers/acpi/nfit/nfit.h |    4 
 2 files changed, 201 insertions(+), 272 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 866853abebea..77c6a065fdb3 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -35,16 +35,6 @@ static bool force_enable_dimms;
 module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status");
 
-static unsigned int scrub_timeout = NFIT_ARS_TIMEOUT;
-module_param(scrub_timeout, uint, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(scrub_timeout, "Initial scrub timeout in seconds");
-
-/* after three payloads of overflow, it's dead jim */
-static unsigned int scrub_overflow_abort = 3;
-module_param(scrub_overflow_abort, uint, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(scrub_overflow_abort,
-		"Number of times we overflow ARS results before abort");
-
 static bool disable_vendor_specific;
 module_param(disable_vendor_specific, bool, S_IRUGO);
 MODULE_PARM_DESC(disable_vendor_specific,
@@ -1251,7 +1241,7 @@ static ssize_t scrub_show(struct device *dev,
 
 		mutex_lock(&acpi_desc->init_mutex);
 		rc = sprintf(buf, "%d%s", acpi_desc->scrub_count,
-				work_busy(&acpi_desc->work)
+				work_busy(&acpi_desc->dwork.work)
 				&& !acpi_desc->cancel ? "+\n" : "\n");
 		mutex_unlock(&acpi_desc->init_mutex);
 	}
@@ -2452,7 +2442,8 @@ static int ars_start(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa
 	memset(&ars_start, 0, sizeof(ars_start));
 	ars_start.address = spa->address;
 	ars_start.length = spa->length;
-	ars_start.flags = acpi_desc->ars_start_flags;
+	if (test_bit(ARS_SHORT, &nfit_spa->ars_state))
+		ars_start.flags = ND_ARS_RETURN_PREV_DATA;
 	if (nfit_spa_type(spa) == NFIT_SPA_PM)
 		ars_start.type = ND_ARS_PERSISTENT;
 	else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE)
@@ -2500,6 +2491,52 @@ static int ars_get_status(struct acpi_nfit_desc *acpi_desc)
 	return cmd_rc;
 }
 
+static void ars_complete(struct acpi_nfit_desc *acpi_desc,
+		struct nfit_spa *nfit_spa)
+{
+	struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status;
+	struct acpi_nfit_system_address *spa = nfit_spa->spa;
+	struct nd_region *nd_region = nfit_spa->nd_region;
+	struct device *dev;
+
+	if ((ars_status->address >= spa->address && ars_status->address
+				< spa->address + spa->length)
+			|| (ars_status->address < spa->address)) {
+		/*
+		 * Assume that if a scrub starts at an offset from the
+		 * start of nfit_spa that we are in the continuation
+		 * case.
+		 *
+		 * Otherwise, if the scrub covers the spa range, mark
+		 * any pending request complete.
+		 */
+		if (ars_status->address + ars_status->length
+				>= spa->address + spa->length)
+				/* complete */;
+		else
+			return;
+	} else
+		return;
+
+	if (test_bit(ARS_DONE, &nfit_spa->ars_state))
+		return;
+
+	if (!test_and_clear_bit(ARS_REQ, &nfit_spa->ars_state))
+		return;
+
+	if (nd_region) {
+		dev = nd_region_dev(nd_region);
+		nvdimm_region_notify(nd_region, NVDIMM_REVALIDATE_POISON);
+	} else
+		dev = acpi_desc->dev;
+
+	dev_dbg(dev, "ARS: range %d %s complete\n", spa->range_index,
+			test_bit(ARS_SHORT, &nfit_spa->ars_state)
+			? "short" : "long");
+	clear_bit(ARS_SHORT, &nfit_spa->ars_state);
+	set_bit(ARS_DONE, &nfit_spa->ars_state);
+}
+
 static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc)
 {
 	struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus;
@@ -2773,223 +2810,108 @@ static int acpi_nfit_query_poison(struct acpi_nfit_desc *acpi_desc)
 	return 0;
 }
 
-static void acpi_nfit_async_scrub(struct acpi_nfit_desc *acpi_desc,
-		struct nfit_spa *nfit_spa)
+static int ars_register(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa,
+		int *query_rc)
 {
-	struct acpi_nfit_system_address *spa = nfit_spa->spa;
-	unsigned int overflow_retry = scrub_overflow_abort;
-	u64 init_ars_start = 0, init_ars_len = 0;
-	struct device *dev = acpi_desc->dev;
-	unsigned int tmo = scrub_timeout;
-	int rc;
-
-	if (!test_bit(ARS_REQ, &nfit_spa->ars_state) || !nfit_spa->nd_region)
-		return;
-
-	rc = ars_start(acpi_desc, nfit_spa);
-	/*
-	 * If we timed out the initial scan we'll still be busy here,
-	 * and will wait another timeout before giving up permanently.
-	 */
-	if (rc < 0 && rc != -EBUSY)
-		return;
+	int rc = *query_rc;
 
-	do {
-		u64 ars_start, ars_len;
-
-		if (acpi_desc->cancel)
-			break;
-		rc = acpi_nfit_query_poison(acpi_desc);
-		if (rc == -ENOTTY)
-			break;
-		if (rc == -EBUSY && !tmo) {
-			dev_warn(dev, "range %d ars timeout, aborting\n",
-					spa->range_index);
-			break;
-		}
+	set_bit(ARS_REQ, &nfit_spa->ars_state);
+	set_bit(ARS_SHORT, &nfit_spa->ars_state);
 
-		if (rc == -EBUSY) {
-			/*
-			 * Note, entries may be appended to the list
-			 * while the lock is dropped, but the workqueue
-			 * being active prevents entries being deleted /
-			 * freed.
-			 */
-			mutex_unlock(&acpi_desc->init_mutex);
-			ssleep(1);
-			tmo--;
-			mutex_lock(&acpi_desc->init_mutex);
-			continue;
-		}
+	if (rc == 0) {
+		rc = ars_start(acpi_desc, nfit_spa);
+		/*
+		 * if short ars does not complete immediately, or
+		 * overflows, save the rest of the ars handling for the
+		 * workqueue
+		 */
+		*query_rc = rc;
+	}
 
-		/* we got some results, but there are more pending... */
-		if (rc == -ENOSPC && overflow_retry--) {
-			if (!init_ars_len) {
-				init_ars_len = acpi_desc->ars_status->length;
-				init_ars_start = acpi_desc->ars_status->address;
-			}
-			rc = ars_continue(acpi_desc);
-		}
+	if (rc == 0)
+		ars_complete(acpi_desc, nfit_spa);
+	else if (rc != -EBUSY && rc != -ENOSPC)
+		set_bit(ARS_FAILED, &nfit_spa->ars_state);
 
-		if (rc < 0) {
-			dev_warn(dev, "range %d ars continuation failed\n",
-					spa->range_index);
-			break;
-		}
+	if (test_and_clear_bit(ARS_DONE, &nfit_spa->ars_state))
+		set_bit(ARS_REQ, &nfit_spa->ars_state);
 
-		if (init_ars_len) {
-			ars_start = init_ars_start;
-			ars_len = init_ars_len;
-		} else {
-			ars_start = acpi_desc->ars_status->address;
-			ars_len = acpi_desc->ars_status->length;
-		}
-		dev_dbg(dev, "spa range: %d ars from %#llx + %#llx complete\n",
-				spa->range_index, ars_start, ars_len);
-		/* notify the region about new poison entries */
-		nvdimm_region_notify(nfit_spa->nd_region,
-				NVDIMM_REVALIDATE_POISON);
-		break;
-	} while (1);
+	return acpi_nfit_register_region(acpi_desc, nfit_spa);
 }
 
-static void acpi_nfit_scrub(struct work_struct *work)
+static unsigned int __acpi_nfit_scrub(struct acpi_nfit_desc *acpi_desc,
+		int query_rc)
 {
-	struct device *dev;
-	u64 init_scrub_length = 0;
+	unsigned int tmo = acpi_desc->scrub_tmo;
+	struct device *dev = acpi_desc->dev;
 	struct nfit_spa *nfit_spa;
-	u64 init_scrub_address = 0;
-	bool init_ars_done = false;
-	struct acpi_nfit_desc *acpi_desc;
-	unsigned int tmo = scrub_timeout;
-	unsigned int overflow_retry = scrub_overflow_abort;
-
-	acpi_desc = container_of(work, typeof(*acpi_desc), work);
-	dev = acpi_desc->dev;
 
-	/*
-	 * We scrub in 2 phases.  The first phase waits for any platform
-	 * firmware initiated scrubs to complete and then we go search for the
-	 * affected spa regions to mark them scanned.  In the second phase we
-	 * initiate a directed scrub for every range that was not scrubbed in
-	 * phase 1. If we're called for a 'rescan', we harmlessly pass through
-	 * the first phase, but really only care about running phase 2, where
-	 * regions can be notified of new poison.
-	 */
-
-	/* process platform firmware initiated scrubs */
- retry:
-	mutex_lock(&acpi_desc->init_mutex);
-	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
-		struct nd_cmd_ars_status *ars_status;
-		struct acpi_nfit_system_address *spa;
-		u64 ars_start, ars_len;
-		int rc;
-
-		if (acpi_desc->cancel)
-			break;
+	if (acpi_desc->cancel)
+		return 0;
 
-		if (nfit_spa->nd_region)
-			continue;
+	if (query_rc == -EBUSY) {
+		dev_dbg(dev, "ARS: ARS busy\n");
+		return min(30U * 60U, tmo * 2);
+	}
+	if (query_rc == -ENOSPC) {
+		dev_dbg(dev, "ARS: ARS continue\n");
+		ars_continue(acpi_desc);
+		return 1;
+	}
+	if (query_rc) {
+		unsigned long long addr, end;
 
-		if (init_ars_done) {
-			/*
-			 * No need to re-query, we're now just
-			 * reconciling all the ranges covered by the
-			 * initial scrub
-			 */
-			rc = 0;
-		} else
-			rc = acpi_nfit_query_poison(acpi_desc);
+		addr = acpi_desc->ars_status->address;
+		end = addr + acpi_desc->ars_status->length;
+		dev_dbg(dev, "ARS: %llx-%llx failed (%d)\n", addr, end,
+				query_rc);
+	}
 
-		if (rc == -ENOTTY) {
-			/* no ars capability, just register spa and move on */
-			acpi_nfit_register_region(acpi_desc, nfit_spa);
+	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+		if (test_bit(ARS_FAILED, &nfit_spa->ars_state))
 			continue;
-		}
-
-		if (rc == -EBUSY && !tmo) {
-			/* fallthrough to directed scrub in phase 2 */
-			dev_warn(dev, "timeout awaiting ars results, continuing...\n");
-			break;
-		} else if (rc == -EBUSY) {
-			mutex_unlock(&acpi_desc->init_mutex);
-			ssleep(1);
-			tmo--;
-			goto retry;
-		}
-
-		/* we got some results, but there are more pending... */
-		if (rc == -ENOSPC && overflow_retry--) {
-			ars_status = acpi_desc->ars_status;
-			/*
-			 * Record the original scrub range, so that we
-			 * can recall all the ranges impacted by the
-			 * initial scrub.
-			 */
-			if (!init_scrub_length) {
-				init_scrub_length = ars_status->length;
-				init_scrub_address = ars_status->address;
-			}
-			rc = ars_continue(acpi_desc);
-			if (rc == 0) {
-				mutex_unlock(&acpi_desc->init_mutex);
-				goto retry;
-			}
-		}
-
-		if (rc < 0) {
-			/*
-			 * Initial scrub failed, we'll give it one more
-			 * try below...
-			 */
-			break;
-		}
-
-		/* We got some final results, record completed ranges */
-		ars_status = acpi_desc->ars_status;
-		if (init_scrub_length) {
-			ars_start = init_scrub_address;
-			ars_len = ars_start + init_scrub_length;
-		} else {
-			ars_start = ars_status->address;
-			ars_len = ars_status->length;
-		}
-		spa = nfit_spa->spa;
-
-		if (!init_ars_done) {
-			init_ars_done = true;
-			dev_dbg(dev, "init scrub %#llx + %#llx complete\n",
-					ars_start, ars_len);
-		}
-		if (ars_start <= spa->address && ars_start + ars_len
-				>= spa->address + spa->length)
-			acpi_nfit_register_region(acpi_desc, nfit_spa);
+		ars_complete(acpi_desc, nfit_spa);
 	}
 
-	/*
-	 * For all the ranges not covered by an initial scrub we still
-	 * want to see if there are errors, but it's ok to discover them
-	 * asynchronously.
-	 */
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
-		/*
-		 * Flag all the ranges that still need scrubbing, but
-		 * register them now to make data available.
-		 */
-		if (!nfit_spa->nd_region) {
-			set_bit(ARS_REQ, &nfit_spa->ars_state);
-			acpi_nfit_register_region(acpi_desc, nfit_spa);
+		if (test_bit(ARS_FAILED, &nfit_spa->ars_state))
+			continue;
+		if (test_bit(ARS_REQ, &nfit_spa->ars_state)) {
+			int rc = ars_start(acpi_desc, nfit_spa);
+
+			clear_bit(ARS_DONE, &nfit_spa->ars_state);
+			dev = nd_region_dev(nfit_spa->nd_region);
+			dev_dbg(dev, "ARS: range %d ARS start (%d)\n",
+					nfit_spa->spa->range_index, rc);
+			if (rc == 0 || rc == -EBUSY)
+				return 1;
+			dev_err(dev, "ARS: range %d ARS failed (%d)\n",
+					nfit_spa->spa->range_index, rc);
+			set_bit(ARS_FAILED, &nfit_spa->ars_state);
 		}
 	}
-	acpi_desc->init_complete = 1;
+	return 0;
+}
 
-	list_for_each_entry(nfit_spa, &acpi_desc->spas, list)
-		acpi_nfit_async_scrub(acpi_desc, nfit_spa);
-	acpi_desc->scrub_count++;
-	acpi_desc->ars_start_flags = 0;
-	if (acpi_desc->scrub_count_state)
-		sysfs_notify_dirent(acpi_desc->scrub_count_state);
+static void acpi_nfit_scrub(struct work_struct *work)
+{
+	struct acpi_nfit_desc *acpi_desc;
+	unsigned int tmo;
+	int query_rc;
+
+	acpi_desc = container_of(work, typeof(*acpi_desc), dwork.work);
+	mutex_lock(&acpi_desc->init_mutex);
+	query_rc = acpi_nfit_query_poison(acpi_desc);
+	tmo = __acpi_nfit_scrub(acpi_desc, query_rc);
+	if (tmo) {
+		queue_delayed_work(nfit_wq, &acpi_desc->dwork, tmo * HZ);
+		acpi_desc->scrub_tmo = tmo;
+	} else {
+		acpi_desc->scrub_count++;
+		if (acpi_desc->scrub_count_state)
+			sysfs_notify_dirent(acpi_desc->scrub_count_state);
+	}
+	memset(acpi_desc->ars_status, 0, acpi_desc->max_ars);
 	mutex_unlock(&acpi_desc->init_mutex);
 }
 
@@ -3015,33 +2937,58 @@ static void acpi_nfit_init_ars(struct acpi_nfit_desc *acpi_desc,
 	nfit_spa->max_ars = ars_cap.max_ars_out;
 	nfit_spa->clear_err_unit = ars_cap.clear_err_unit;
 	acpi_desc->max_ars = max(nfit_spa->max_ars, acpi_desc->max_ars);
+	clear_bit(ARS_FAILED, &nfit_spa->ars_state);
 }
 
-
 static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
 {
 	struct nfit_spa *nfit_spa;
+	int rc, query_rc;
 
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
-		int rc, type = nfit_spa_type(nfit_spa->spa);
-
-		/* PMEM and VMEM will be registered by the ARS workqueue */
-		if (type == NFIT_SPA_PM || type == NFIT_SPA_VOLATILE) {
+		set_bit(ARS_FAILED, &nfit_spa->ars_state);
+		switch (nfit_spa_type(nfit_spa->spa)) {
+		case NFIT_SPA_VOLATILE:
+		case NFIT_SPA_PM:
 			acpi_nfit_init_ars(acpi_desc, nfit_spa);
-			continue;
+			break;
 		}
-		/* BLK apertures belong to BLK region registration below */
-		if (type == NFIT_SPA_BDW)
-			continue;
-		/* BLK regions don't need to wait for ARS results */
-		rc = acpi_nfit_register_region(acpi_desc, nfit_spa);
-		if (rc)
-			return rc;
 	}
 
-	acpi_desc->ars_start_flags = 0;
-	if (!acpi_desc->cancel)
-		queue_work(nfit_wq, &acpi_desc->work);
+	/*
+	 * Reap any results that might be pending before starting new
+	 * short requests.
+	 */
+	query_rc = acpi_nfit_query_poison(acpi_desc);
+
+	list_for_each_entry(nfit_spa, &acpi_desc->spas, list)
+		switch (nfit_spa_type(nfit_spa->spa)) {
+		case NFIT_SPA_VOLATILE:
+		case NFIT_SPA_PM:
+			/* register regions and kick off initial ARS run */
+			rc = ars_register(acpi_desc, nfit_spa, &query_rc);
+			if (rc)
+				return rc;
+			break;
+		case NFIT_SPA_BDW:
+			/* nothing to register */
+			break;
+		case NFIT_SPA_DCR:
+		case NFIT_SPA_VDISK:
+		case NFIT_SPA_VCD:
+		case NFIT_SPA_PDISK:
+		case NFIT_SPA_PCD:
+			/* register known regions that don't support ARS */
+			rc = acpi_nfit_register_region(acpi_desc, nfit_spa);
+			if (rc)
+				return rc;
+			break;
+		default:
+			/* don't register unknown regions */
+			break;
+		}
+
+	queue_delayed_work(nfit_wq, &acpi_desc->dwork, 0);
 	return 0;
 }
 
@@ -3176,49 +3123,20 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
 }
 EXPORT_SYMBOL_GPL(acpi_nfit_init);
 
-struct acpi_nfit_flush_work {
-	struct work_struct work;
-	struct completion cmp;
-};
-
-static void flush_probe(struct work_struct *work)
-{
-	struct acpi_nfit_flush_work *flush;
-
-	flush = container_of(work, typeof(*flush), work);
-	complete(&flush->cmp);
-}
-
 static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
 {
 	struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
 	struct device *dev = acpi_desc->dev;
-	struct acpi_nfit_flush_work flush;
-	int rc;
 
-	/* bounce the device lock to flush acpi_nfit_add / acpi_nfit_notify */
+	/* Bounce the device lock to flush acpi_nfit_add / acpi_nfit_notify */
 	device_lock(dev);
 	device_unlock(dev);
 
-	/* bounce the init_mutex to make init_complete valid */
+	/* Bounce the init_mutex to complete initial registration */
 	mutex_lock(&acpi_desc->init_mutex);
-	if (acpi_desc->cancel || acpi_desc->init_complete) {
-		mutex_unlock(&acpi_desc->init_mutex);
-		return 0;
-	}
-
-	/*
-	 * Scrub work could take 10s of seconds, userspace may give up so we
-	 * need to be interruptible while waiting.
-	 */
-	INIT_WORK_ONSTACK(&flush.work, flush_probe);
-	init_completion(&flush.cmp);
-	queue_work(nfit_wq, &flush.work);
 	mutex_unlock(&acpi_desc->init_mutex);
 
-	rc = wait_for_completion_interruptible(&flush.cmp);
-	cancel_work_sync(&flush.work);
-	return rc;
+	return 0;
 }
 
 static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc,
@@ -3237,7 +3155,7 @@ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc,
 	 * just needs guarantees that any ars it initiates are not
 	 * interrupted by any intervening start requests from userspace.
 	 */
-	if (work_busy(&acpi_desc->work))
+	if (work_busy(&acpi_desc->dwork.work))
 		return -EBUSY;
 
 	return 0;
@@ -3246,11 +3164,9 @@ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc,
 int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc, unsigned long flags)
 {
 	struct device *dev = acpi_desc->dev;
+	int scheduled = 0, busy = 0;
 	struct nfit_spa *nfit_spa;
 
-	if (work_busy(&acpi_desc->work))
-		return -EBUSY;
-
 	mutex_lock(&acpi_desc->init_mutex);
 	if (acpi_desc->cancel) {
 		mutex_unlock(&acpi_desc->init_mutex);
@@ -3258,21 +3174,32 @@ int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc, unsigned long flags)
 	}
 
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
-		struct acpi_nfit_system_address *spa = nfit_spa->spa;
+		int type = nfit_spa_type(nfit_spa->spa);
 
-		if (nfit_spa_type(spa) != NFIT_SPA_PM)
+		if (type != NFIT_SPA_PM && type != NFIT_SPA_VOLATILE)
+			continue;
+		if (test_bit(ARS_FAILED, &nfit_spa->ars_state))
 			continue;
 
-		set_bit(ARS_REQ, &nfit_spa->ars_state);
+		if (test_and_set_bit(ARS_REQ, &nfit_spa->ars_state))
+			busy++;
+		else {
+			if (test_bit(ARS_SHORT, &flags))
+				set_bit(ARS_SHORT, &nfit_spa->ars_state);
+			scheduled++;
+		}
+	}
+	if (scheduled) {
+		queue_delayed_work(nfit_wq, &acpi_desc->dwork, 0);
+		dev_dbg(dev, "ars_scan triggered\n");
 	}
-	acpi_desc->ars_start_flags = 0;
-	if (test_bit(ARS_SHORT, &flags))
-		acpi_desc->ars_start_flags |= ND_ARS_RETURN_PREV_DATA;
-	queue_work(nfit_wq, &acpi_desc->work);
-	dev_dbg(dev, "ars_scan triggered\n");
 	mutex_unlock(&acpi_desc->init_mutex);
 
-	return 0;
+	if (scheduled)
+		return 0;
+	if (busy)
+		return -EBUSY;
+	return -ENOTTY;
 }
 
 void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
@@ -3299,7 +3226,8 @@ void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
 	INIT_LIST_HEAD(&acpi_desc->dimms);
 	INIT_LIST_HEAD(&acpi_desc->list);
 	mutex_init(&acpi_desc->init_mutex);
-	INIT_WORK(&acpi_desc->work, acpi_nfit_scrub);
+	acpi_desc->scrub_tmo = 1;
+	INIT_DELAYED_WORK(&acpi_desc->dwork, acpi_nfit_scrub);
 }
 EXPORT_SYMBOL_GPL(acpi_nfit_desc_init);
 
@@ -3323,6 +3251,7 @@ void acpi_nfit_shutdown(void *data)
 
 	mutex_lock(&acpi_desc->init_mutex);
 	acpi_desc->cancel = 1;
+	cancel_delayed_work_sync(&acpi_desc->dwork);
 	mutex_unlock(&acpi_desc->init_mutex);
 
 	/*
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index 45e7949986a8..7d15856a739f 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -197,18 +197,18 @@ struct acpi_nfit_desc {
 	struct device *dev;
 	u8 ars_start_flags;
 	struct nd_cmd_ars_status *ars_status;
-	struct work_struct work;
+	struct delayed_work dwork;
 	struct list_head list;
 	struct kernfs_node *scrub_count_state;
 	unsigned int max_ars;
 	unsigned int scrub_count;
 	unsigned int scrub_mode;
 	unsigned int cancel:1;
-	unsigned int init_complete:1;
 	unsigned long dimm_cmd_force_en;
 	unsigned long bus_cmd_force_en;
 	unsigned long bus_nfit_cmd_force_en;
 	unsigned int platform_cap;
+	unsigned int scrub_tmo;
 	int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
 			void *iobuf, u64 len, int rw);
 };

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [ndctl PATCH v2 3/3] nfit, address-range-scrub: add module option to skip initial ars
  2018-04-06  4:18 [ndctl PATCH v2 0/3] nfit, address-range-scrub: rework and fixes Dan Williams
  2018-04-06  4:18 ` [ndctl PATCH v2 1/3] nfit, address-range-scrub: determine one platform max_ars value Dan Williams
  2018-04-06  4:19 ` [ndctl PATCH v2 2/3] nfit, address-range-scrub: rework and simplify ARS state machine Dan Williams
@ 2018-04-06  4:19 ` Dan Williams
  2018-04-06  4:31 ` [ndctl PATCH v2 0/3] nfit, address-range-scrub: rework and fixes Dan Williams
  3 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2018-04-06  4:19 UTC (permalink / raw)
  To: linux-nvdimm

After attempting to quickly retrieve known errors the kernel proceeds to
kick off a long running ARS. Add a module option to disable this
behavior at initialization time, or at new region discovery time.
Otherwise, ARS can be started manually regardless of the state of this
setting.

Co-developed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c |    7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 77c6a065fdb3..6c333f28d7da 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -49,6 +49,10 @@ module_param(default_dsm_family, int, S_IRUGO);
 MODULE_PARM_DESC(default_dsm_family,
 		"Try this DSM type first when identifying NVDIMM family");
 
+static bool no_init_ars;
+module_param(no_init_ars, bool, 0644);
+MODULE_PARM_DESC(no_init_ars, "Skip ARS run at nfit init time");
+
 LIST_HEAD(acpi_descs);
 DEFINE_MUTEX(acpi_desc_lock);
 
@@ -2815,6 +2819,9 @@ static int ars_register(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_
 {
 	int rc = *query_rc;
 
+	if (no_init_ars)
+		return acpi_nfit_register_region(acpi_desc, nfit_spa);
+
 	set_bit(ARS_REQ, &nfit_spa->ars_state);
 	set_bit(ARS_SHORT, &nfit_spa->ars_state);
 

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [ndctl PATCH v2 0/3] nfit, address-range-scrub: rework and fixes
  2018-04-06  4:18 [ndctl PATCH v2 0/3] nfit, address-range-scrub: rework and fixes Dan Williams
                   ` (2 preceding siblings ...)
  2018-04-06  4:19 ` [ndctl PATCH v2 3/3] nfit, address-range-scrub: add module option to skip initial ars Dan Williams
@ 2018-04-06  4:31 ` Dan Williams
  3 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2018-04-06  4:31 UTC (permalink / raw)
  To: linux-nvdimm

Whoops, these patches should not be prefixed "ndctl".

On Thu, Apr 5, 2018 at 9:18 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> Changes since v1:
> * Merge the cleanups and fixes with Dave's reviewed-by
> * Rework acpi_nfit_query_poison() to be independent of the spa range
> * Rework all acpi_nfit_query_poison() usage to call it once and then
>   iterate the result buffer over all SPA ranges
> * Make sure the implementation never attempts to scrub / register
>   unknown SPA types
> * Change no_init_ars to a bool type so it can be specified without an
>   '=' character.
>
> ---
>
> Given the fact that ARS can take 10s to 100s of seconds it is not
> feasible to wait for ARS completion before publishing persistent memory
> namespaces. Instead convert the ARS implementation to perform a short
> ARS for critical errors, ones that caused a previous system reset,
> before registering namespaces. Finally, arrange for all long ARS
> operations to run in the background and populate the badblock lists at
> run time.
>
> ---
>
> Dan Williams (3):
>       nfit, address-range-scrub: determine one platform max_ars value
>       nfit, address-range-scrub: rework and simplify ARS state machine
>       nfit, address-range-scrub: add module option to skip initial ars
>
>
>  drivers/acpi/nfit/core.c |  542 ++++++++++++++++++++--------------------------
>  drivers/acpi/nfit/nfit.h |    6 -
>  2 files changed, 243 insertions(+), 305 deletions(-)
> _______________________________________________
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ndctl PATCH v2 2/3] nfit, address-range-scrub: rework and simplify ARS state machine
  2018-04-06  4:19 ` [ndctl PATCH v2 2/3] nfit, address-range-scrub: rework and simplify ARS state machine Dan Williams
@ 2018-04-06 22:06   ` Kani, Toshi
  2018-04-06 22:13     ` Dan Williams
  0 siblings, 1 reply; 9+ messages in thread
From: Kani, Toshi @ 2018-04-06 22:06 UTC (permalink / raw)
  To: dan.j.williams, linux-nvdimm

On Thu, 2018-04-05 at 21:19 -0700, Dan Williams wrote:
> ARS is an operation that can take 10s to 100s of seconds to find media
> errors that should rarely be present. If the platform crashes due to
> media errors in persistent memory, the expectation is that the BIOS will
> report those known errors in a 'short' ARS request.
> 
> A 'short' ARS request asks platform firmware to return an ARS payload
> with all known errors, but without issuing a 'long' scrub. At driver
> init a short request is issued to all PMEM ranges before registering
> regions. Then, in the background, a long ARS is scheduled for each
> region.

I confirmed that this version addressed the WARN_ONCE issue.

> The ARS implementation is simplified to centralize ARS completion work
> in the ars_complete() helper called from ars_status_process_records().
> The timeout is removed since there is no facility to cancel ARS, and
> system init is never blocked waiting for a 'long' ARS. The ars_state
> flags are used to coordinate ARS requests from driver init, ARS requests
> from userspace, and ARS requests in response to media error
> notifications.

While I like the simplification of the code, I learned that we need to
handle both cases below:
 
 1) No FW ARS Scan: ARS short scan and enable pmem devices without delay
(new behavior by this patch)
 2) FW ARS Scan: Wait for FW ARS scan to complete, and then enable pmem
devices

Case 2) is still necessary because:

 - After a system crash in certain error scenario, FW may not be able to
obtain all error records and need ARS long scan to retrieve them.
 - Other OSes do not initiate an ARS long scan, and assume FW to start
it at POST when necessary.

Thanks,
-Toshi
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ndctl PATCH v2 2/3] nfit, address-range-scrub: rework and simplify ARS state machine
  2018-04-06 22:06   ` Kani, Toshi
@ 2018-04-06 22:13     ` Dan Williams
  2018-04-06 22:36       ` Kani, Toshi
  0 siblings, 1 reply; 9+ messages in thread
From: Dan Williams @ 2018-04-06 22:13 UTC (permalink / raw)
  To: Kani, Toshi; +Cc: linux-nvdimm

On Fri, Apr 6, 2018 at 3:06 PM, Kani, Toshi <toshi.kani@hpe.com> wrote:
> On Thu, 2018-04-05 at 21:19 -0700, Dan Williams wrote:
>> ARS is an operation that can take 10s to 100s of seconds to find media
>> errors that should rarely be present. If the platform crashes due to
>> media errors in persistent memory, the expectation is that the BIOS will
>> report those known errors in a 'short' ARS request.
>>
>> A 'short' ARS request asks platform firmware to return an ARS payload
>> with all known errors, but without issuing a 'long' scrub. At driver
>> init a short request is issued to all PMEM ranges before registering
>> regions. Then, in the background, a long ARS is scheduled for each
>> region.
>
> I confirmed that this version addressed the WARN_ONCE issue.
>
>> The ARS implementation is simplified to centralize ARS completion work
>> in the ars_complete() helper called from ars_status_process_records().
>> The timeout is removed since there is no facility to cancel ARS, and
>> system init is never blocked waiting for a 'long' ARS. The ars_state
>> flags are used to coordinate ARS requests from driver init, ARS requests
>> from userspace, and ARS requests in response to media error
>> notifications.
>
> While I like the simplification of the code, I leaned that we need to
> handle both cases below:
>
>  1) No FW ARS Scan: ARS short scan and enable pmem devices without delay
> (new behavior by this patch)
>  2) FW ARS Scan: Wait for FW ARS scan to complete, and then enable pmem
> devices
>
> Case 2) is still necessary because:
>
>  - After a system crash in certain error scenario, FW may not be able to
> obtain all error records and need ARS long scan to retrieve them.
>  - Other OSes do not initiate an ARS long scan, and assume FW to start
> it at POST when necessary.

Given that there is no specification for how long an ARS can take it
is not acceptable for system boot to be blocked indefinitely. In the
case where firmware can't populate enough errors into the short scan,
*and* machine check error recovery can't handle the errors we're well
into "this system needs manual remediation" territory. In that case an
administrator can do the following:

1/ boot with "modprobe.blacklist=nd_pmem" to stop pmem namespaces from
starting automatically
2/ call "ndctl wait-scrub" to wait for the OS or BIOS initiated ARS to complete
3/ manually start up namespaces with the up to date error list
"modprobe nd_pmem"
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ndctl PATCH v2 2/3] nfit, address-range-scrub: rework and simplify ARS state machine
  2018-04-06 22:13     ` Dan Williams
@ 2018-04-06 22:36       ` Kani, Toshi
  2018-04-06 22:51         ` Dan Williams
  0 siblings, 1 reply; 9+ messages in thread
From: Kani, Toshi @ 2018-04-06 22:36 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-nvdimm

On Fri, 2018-04-06 at 15:13 -0700, Dan Williams wrote:
> On Fri, Apr 6, 2018 at 3:06 PM, Kani, Toshi <toshi.kani@hpe.com> wrote:
> > On Thu, 2018-04-05 at 21:19 -0700, Dan Williams wrote:
> > > ARS is an operation that can take 10s to 100s of seconds to find media
> > > errors that should rarely be present. If the platform crashes due to
> > > media errors in persistent memory, the expectation is that the BIOS will
> > > report those known errors in a 'short' ARS request.
> > > 
> > > A 'short' ARS request asks platform firmware to return an ARS payload
> > > with all known errors, but without issuing a 'long' scrub. At driver
> > > init a short request is issued to all PMEM ranges before registering
> > > regions. Then, in the background, a long ARS is scheduled for each
> > > region.
> > 
> > I confirmed that this version addressed the WARN_ONCE issue.
> > 
> > > The ARS implementation is simplified to centralize ARS completion work
> > > in the ars_complete() helper called from ars_status_process_records().
> > > The timeout is removed since there is no facility to cancel ARS, and
> > > system init is never blocked waiting for a 'long' ARS. The ars_state
> > > flags are used to coordinate ARS requests from driver init, ARS requests
> > > from userspace, and ARS requests in response to media error
> > > notifications.
> > 
> > While I like the simplification of the code, I leaned that we need to
> > handle both cases below:
> > 
> >  1) No FW ARS Scan: ARS short scan and enable pmem devices without delay
> > (new behavior by this patch)
> >  2) FW ARS Scan: Wait for FW ARS scan to complete, and then enable pmem
> > devices
> > 
> > Case 2) is still necessary because:
> > 
> >  - After a system crash in certain error scenario, FW may not be able to
> > obtain all error records and need ARS long scan to retrieve them.
> >  - Other OSes do not initiate an ARS long scan, and assume FW to start
> > it at POST when necessary.
> 
> Given that there is no specification for how long an ARS can take it
> is not acceptable for system boot to be blocked indefinitely. In the
> case where firmware can't populate enough errors into the short scan,

I am less concerned if we get not-enough errors from the short scan in
case 2).  A background ARS long scan can then fill the gap.  In this
case, however, it does not get any error from ARS, including previously
populated ones, since the short scan is not called before enabling pmem
devices.

> *and* machine check error recovery can't handle the errors we're well
> into "this system needs manual remediation" territory. In that case an
> administrator can do the following:
> 
> 1/ boot with "modprobe.blacklist=nd_pmem" to stop pmem namespaces from
> starting automatically
> 2/ call "ndctl wait-scrub" to wait for the OS or BIOS initiated ARS to complete
> 3/ manually start up namespaces with the up to date error list
> "modprobe nd_pmem"

It's good to know that we have a remedy, but exposing all previously
populated errors as a result does not sound right to me.

Thanks,
-Toshi
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ndctl PATCH v2 2/3] nfit, address-range-scrub: rework and simplify ARS state machine
  2018-04-06 22:36       ` Kani, Toshi
@ 2018-04-06 22:51         ` Dan Williams
  0 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2018-04-06 22:51 UTC (permalink / raw)
  To: Kani, Toshi; +Cc: linux-nvdimm

On Fri, Apr 6, 2018 at 3:36 PM, Kani, Toshi <toshi.kani@hpe.com> wrote:
> On Fri, 2018-04-06 at 15:13 -0700, Dan Williams wrote:
>> On Fri, Apr 6, 2018 at 3:06 PM, Kani, Toshi <toshi.kani@hpe.com> wrote:
>> > On Thu, 2018-04-05 at 21:19 -0700, Dan Williams wrote:
>> > > ARS is an operation that can take 10s to 100s of seconds to find media
>> > > errors that should rarely be present. If the platform crashes due to
>> > > media errors in persistent memory, the expectation is that the BIOS will
>> > > report those known errors in a 'short' ARS request.
>> > >
>> > > A 'short' ARS request asks platform firmware to return an ARS payload
>> > > with all known errors, but without issuing a 'long' scrub. At driver
>> > > init a short request is issued to all PMEM ranges before registering
>> > > regions. Then, in the background, a long ARS is scheduled for each
>> > > region.
>> >
>> > I confirmed that this version addressed the WARN_ONCE issue.
>> >
>> > > The ARS implementation is simplified to centralize ARS completion work
>> > > in the ars_complete() helper called from ars_status_process_records().
>> > > The timeout is removed since there is no facility to cancel ARS, and
>> > > system init is never blocked waiting for a 'long' ARS. The ars_state
>> > > flags are used to coordinate ARS requests from driver init, ARS requests
>> > > from userspace, and ARS requests in response to media error
>> > > notifications.
>> >
>> > While I like the simplification of the code, I leaned that we need to
>> > handle both cases below:
>> >
>> >  1) No FW ARS Scan: ARS short scan and enable pmem devices without delay
>> > (new behavior by this patch)
>> >  2) FW ARS Scan: Wait for FW ARS scan to complete, and then enable pmem
>> > devices
>> >
>> > Case 2) is still necessary because:
>> >
>> >  - After a system crash in certain error scenario, FW may not be able to
>> > obtain all error records and need ARS long scan to retrieve them.
>> >  - Other OSes do not initiate an ARS long scan, and assume FW to start
>> > it at POST when necessary.
>>
>> Given that there is no specification for how long an ARS can take it
>> is not acceptable for system boot to be blocked indefinitely. In the
>> case where firmware can't populate enough errors into the short scan,
>
> I am less concerned if we get not-enough errors from the short scan in
> case 2).  A background ARS long scan can then fill the gap.  In this
> case, however, it does not get any error from ARS, including previously
> populated ones, since the short scan is not called before enabling pmem
> devices.

Right, if the BIOS kicks off ARS it is getting in the way of the OS
doing the short scan. My position is that the BIOS is being too
paranoid in that case, and should not be doing ARS at boot. We have
machine check recovery and the low expected rate of errors as
mitigations that allow namespaces to come up as early as possible. We
certainly do not do this type of paranoid waiting for disk devices,
why would NVDIMMs be any different?

So yes, the new proposed Linux ARS policy is deliberately ignoring
BIOSen that want to leave ARS running as the OS is booting. If a
platform BIOS implementation really wants all known errors to be found
before starting namespaces it should wait for ARS completion before
finishing POST. Hopefully, no BIOS does that because the OS has better
ways to make forward progress and mitigate errors.

>> *and* machine check error recovery can't handle the errors we're well
>> into "this system needs manual remediation" territory. In that case an
>> administrator can do the following:
>>
>> 1/ boot with "modprobe.blacklist=nd_pmem" to stop pmem namespaces from
>> starting automatically
>> 2/ call "ndctl wait-scrub" to wait for the OS or BIOS initiated ARS to complete
>> 3/ manually start up namespaces with the up to date error list
>> "modprobe nd_pmem"
>
> It's good to know that we have a remedy, but exposing all previously
> populated errors as a result does not sound right to me.

They're not exposed if they are not being actively consumed.
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2018-04-06 22:51 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-06  4:18 [ndctl PATCH v2 0/3] nfit, address-range-scrub: rework and fixes Dan Williams
2018-04-06  4:18 ` [ndctl PATCH v2 1/3] nfit, address-range-scrub: determine one platform max_ars value Dan Williams
2018-04-06  4:19 ` [ndctl PATCH v2 2/3] nfit, address-range-scrub: rework and simplify ARS state machine Dan Williams
2018-04-06 22:06   ` Kani, Toshi
2018-04-06 22:13     ` Dan Williams
2018-04-06 22:36       ` Kani, Toshi
2018-04-06 22:51         ` Dan Williams
2018-04-06  4:19 ` [ndctl PATCH v2 3/3] nfit, address-range-scrub: add module option to skip initial ars Dan Williams
2018-04-06  4:31 ` [ndctl PATCH v2 0/3] nfit, address-range-scrub: rework and fixes Dan Williams

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).