linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH] dax: add badblocks check to Device DAX
@ 2017-05-03 15:31 Toshi Kani
  2017-05-03 15:52 ` Dan Williams
  0 siblings, 1 reply; 14+ messages in thread
From: Toshi Kani @ 2017-05-03 15:31 UTC (permalink / raw)
  To: dan.j.williams; +Cc: dave.jiang, linux-nvdimm, linux-kernel, Toshi Kani

This is a RFC patch for seeking suggestions.  It adds support of
badblocks check in Device DAX by using region-level badblocks list.
This patch is only briefly tested.

device_dax is a well-isolated self-contained module as it calls
alloc_dax() with dev_dax, which is private to device_dax.  For
checking badblocks, it needs to call dax_pmem to check with
region-level badblocks.

This patch attempts to keep device_dax self-contained.  It adds
check_error() to dax_operations, and dax_check_error() as a stub
with *dev_dax and *dev pointers to convey it to dax_pmem.  I am
wondering if this is the right direction, or we should change the
modularity to let dax_pmem call alloc_dax() with its dax_pmem (or
I completely missed something).

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
---
 drivers/dax/device-dax.h |    4 +++-
 drivers/dax/device.c     |   22 ++++++++++++----------
 drivers/dax/pmem.c       |   28 +++++++++++++++++++++++++++-
 drivers/dax/super.c      |   19 +++++++++++++++++++
 include/linux/dax.h      |    7 +++++++
 5 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h
index fdcd976..ae0277c 100644
--- a/drivers/dax/device-dax.h
+++ b/drivers/dax/device-dax.h
@@ -12,6 +12,7 @@
  */
 #ifndef __DEVICE_DAX_H__
 #define __DEVICE_DAX_H__
+#include <linux/dax.h>
 struct device;
 struct dev_dax;
 struct resource;
@@ -21,5 +22,6 @@ struct dax_region *alloc_dax_region(struct device *parent,
 		int region_id, struct resource *res, unsigned int align,
 		void *addr, unsigned long flags);
 struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
-		struct resource *res, int count);
+		struct resource *res, int count,
+		const struct dax_operations *ops);
 #endif /* __DEVICE_DAX_H__ */
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 1a3e08e..7eb1395 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -249,13 +249,14 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
 		pgoff -= PHYS_PFN(resource_size(res));
 	}
 
-	if (i < dev_dax->num_resources) {
-		res = &dev_dax->res[i];
-		if (phys + size - 1 <= res->end)
-			return phys;
-	}
+	if ((i >= dev_dax->num_resources) ||
+	    (phys + size - 1 > res->end))
+		return -1;
 
-	return -1;
+	if (dax_check_error(dev_dax->dax_dev, dev_dax->dev.parent, phys, size))
+		return -1;
+
+	return phys;
 }
 
 static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
@@ -340,7 +341,7 @@ static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
 	if (phys == -1) {
 		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
 				pgoff);
-		return VM_FAULT_SIGBUS;
+		return VM_FAULT_FALLBACK;
 	}
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
@@ -392,7 +393,7 @@ static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
 	if (phys == -1) {
 		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
 				pgoff);
-		return VM_FAULT_SIGBUS;
+		return VM_FAULT_FALLBACK;
 	}
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
@@ -574,7 +575,8 @@ static void unregister_dev_dax(void *dev)
 }
 
 struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
-		struct resource *res, int count)
+		struct resource *res, int count,
+		const struct dax_operations *ops)
 {
 	struct device *parent = dax_region->dev;
 	struct dax_device *dax_dev;
@@ -612,7 +614,7 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
 	 * No 'host' or dax_operations since there is no access to this
 	 * device outside of mmap of the resulting character device.
 	 */
-	dax_dev = alloc_dax(dev_dax, NULL, NULL);
+	dax_dev = alloc_dax(dev_dax, NULL, ops);
 	if (!dax_dev)
 		goto err_dax;
 
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index d4ca19b..c8db9bc 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -56,6 +56,32 @@ static void dax_pmem_percpu_kill(void *data)
 	wait_for_completion(&dax_pmem->cmp);
 }
 
+static int dax_pmem_check_error(struct device *dev, phys_addr_t phys,
+		unsigned long len)
+{
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+	sector_t sector;
+
+	if (phys < nd_region->ndr_start) {
+		len = phys + len - nd_region->ndr_start;
+		phys = nd_region->ndr_start;
+	}
+
+	if (phys + len > nd_region->ndr_start + nd_region->ndr_size)
+		len = nd_region->ndr_start + nd_region->ndr_size - phys;
+
+	sector = (phys - nd_region->ndr_start) / 512;
+
+	if (unlikely(is_bad_pmem(&nd_region->bb, sector, len)))
+		return -EIO;
+
+	return 0;
+}
+
+static const struct dax_operations dax_pmem_ops = {
+	.check_error = dax_pmem_check_error,
+};
+
 static int dax_pmem_probe(struct device *dev)
 {
 	int rc;
@@ -130,7 +156,7 @@ static int dax_pmem_probe(struct device *dev)
 		return -ENOMEM;
 
 	/* TODO: support for subdividing a dax region... */
-	dev_dax = devm_create_dev_dax(dax_region, &res, 1);
+	dev_dax = devm_create_dev_dax(dax_region, &res, 1, &dax_pmem_ops);
 
 	/* child dev_dax instances now own the lifetime of the dax_region */
 	dax_region_put(dax_region);
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 465dcd7..6a83174 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -104,6 +104,25 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
 }
 EXPORT_SYMBOL_GPL(dax_direct_access);
 
+/**
+ * dax_check_error() - check if a physical address range has any error
+ * @dax_dev: a dax_device instance representing the logical memory range
+ * @dev: a device instance representing the driver's device
+ * @phys: physical base address to check for error
+ * @len: length of physical address range to check for error
+ *
+ * Return: 0 if no error, negative errno if any error is found.
+ */
+int dax_check_error(struct dax_device *dax_dev, struct device *dev,
+		phys_addr_t phys, unsigned long len)
+{
+	if (!dax_dev->ops || !dax_dev->ops->check_error)
+		return 0;
+
+	return dax_dev->ops->check_error(dev, phys, len);
+}
+EXPORT_SYMBOL_GPL(dax_check_error);
+
 bool dax_alive(struct dax_device *dax_dev)
 {
 	lockdep_assert_held(&dax_srcu);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index d3158e7..7b575c4 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -16,6 +16,11 @@ struct dax_operations {
 	 */
 	long (*direct_access)(struct dax_device *, pgoff_t, long,
 			void **, pfn_t *);
+
+	/*
+	 * check_error: check if a physical address range has any error.
+	 */
+	int (*check_error)(struct device *, phys_addr_t, unsigned long);
 };
 
 int dax_read_lock(void);
@@ -29,6 +34,8 @@ void kill_dax(struct dax_device *dax_dev);
 void *dax_get_private(struct dax_device *dax_dev);
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
 		void **kaddr, pfn_t *pfn);
+int dax_check_error(struct dax_device *dax_dev, struct device *dev,
+		phys_addr_t phys, unsigned long len);
 
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 15:31 [RFC PATCH] dax: add badblocks check to Device DAX Toshi Kani
@ 2017-05-03 15:52 ` Dan Williams
  2017-05-03 16:09   ` Kani, Toshimitsu
  0 siblings, 1 reply; 14+ messages in thread
From: Dan Williams @ 2017-05-03 15:52 UTC (permalink / raw)
  To: Toshi Kani; +Cc: Dave Jiang, linux-nvdimm@lists.01.org, linux-kernel

On Wed, May 3, 2017 at 8:31 AM, Toshi Kani <toshi.kani@hpe.com> wrote:
> This is a RFC patch for seeking suggestions.  It adds support of
> badblocks check in Device DAX by using region-level badblocks list.
> This patch is only briefly tested.
>
> device_dax is a well-isolated self-contained module as it calls
> alloc_dax() with dev_dax, which is private to device_dax.  For
> checking badblocks, it needs to call dax_pmem to check with
> region-level badblocks.
>
> This patch attempts to keep device_dax self-contained.  It adds
> check_error() to dax_operations, and dax_check_error() as a stub
> with *dev_dax and *dev pointers to convey it to dax_pmem.  I am
> wondering if this is the right direction, or we should change the
> modularity to let dax_pmem call alloc_dax() with its dax_pmem (or
> I completely missed something).

The problem is that device-dax guarantees a given fault granularity.
To make that guarantee we can't fallback from 1G or 2M mappings due to
an error. We also can't reasonably go the other way and fail mappings
that contain a badblock because that would change the blast radius of
a media error to the fault size.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 15:52 ` Dan Williams
@ 2017-05-03 16:09   ` Kani, Toshimitsu
  2017-05-03 16:30     ` Dan Williams
  0 siblings, 1 reply; 14+ messages in thread
From: Kani, Toshimitsu @ 2017-05-03 16:09 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-kernel, dave.jiang, linux-nvdimm@lists.01.org

On Wed, 2017-05-03 at 08:52 -0700, Dan Williams wrote:
> On Wed, May 3, 2017 at 8:31 AM, Toshi Kani <toshi.kani@hpe.com>
> wrote:
> > This is a RFC patch for seeking suggestions.  It adds support of
> > badblocks check in Device DAX by using region-level badblocks list.
> > This patch is only briefly tested.
> > 
> > device_dax is a well-isolated self-contained module as it calls
> > alloc_dax() with dev_dax, which is private to device_dax.  For
> > checking badblocks, it needs to call dax_pmem to check with
> > region-level badblocks.
> > 
> > This patch attempts to keep device_dax self-contained.  It adds
> > check_error() to dax_operations, and dax_check_error() as a stub
> > with *dev_dax and *dev pointers to convey it to dax_pmem.  I am
> > wondering if this is the right direction, or we should change the
> > modularity to let dax_pmem call alloc_dax() with its dax_pmem (or
> > I completely missed something).
> 
> The problem is that device-dax guarantees a given fault granularity.
> To make that guarantee we can't fallback from 1G or 2M mappings due
> to an error. We also can't reasonably go the other way and fail
> mappings that contain a badblock because that would change the blast
> radius of a media error to the fault size.

Does it mean we expect users to have CPUs with MCE recovery for Device
DAX?  Can we add an attributes like allow error-check & fall-back?

Thanks,
-Toshi

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 16:09   ` Kani, Toshimitsu
@ 2017-05-03 16:30     ` Dan Williams
  2017-05-03 18:46       ` Kani, Toshimitsu
  0 siblings, 1 reply; 14+ messages in thread
From: Dan Williams @ 2017-05-03 16:30 UTC (permalink / raw)
  To: Kani, Toshimitsu; +Cc: linux-kernel, Jiang, Dave, linux-nvdimm@lists.01.org

On Wed, May 3, 2017 at 9:09 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> On Wed, 2017-05-03 at 08:52 -0700, Dan Williams wrote:
>> On Wed, May 3, 2017 at 8:31 AM, Toshi Kani <toshi.kani@hpe.com>
>> wrote:
>> > This is a RFC patch for seeking suggestions.  It adds support of
>> > badblocks check in Device DAX by using region-level badblocks list.
>> > This patch is only briefly tested.
>> >
>> > device_dax is a well-isolated self-contained module as it calls
>> > alloc_dax() with dev_dax, which is private to device_dax.  For
>> > checking badblocks, it needs to call dax_pmem to check with
>> > region-level badblocks.
>> >
>> > This patch attempts to keep device_dax self-contained.  It adds
>> > check_error() to dax_operations, and dax_check_error() as a stub
>> > with *dev_dax and *dev pointers to convey it to dax_pmem.  I am
>> > wondering if this is the right direction, or we should change the
>> > modularity to let dax_pmem call alloc_dax() with its dax_pmem (or
>> > I completely missed something).
>>
>> The problem is that device-dax guarantees a given fault granularity.
>> To make that guarantee we can't fallback from 1G or 2M mappings due
>> to an error. We also can't reasonably go the other way and fail
>> mappings that contain a badblock because that would change the blast
>> radius of a media error to the fault size.
>
> Does it mean we expect users to have CPUs with MCE recovery for Device
> DAX?  Can we add an attributes like allow error-check & fall-back?

Yes, without MCE recovery device-dax mappings that consume errors will
reboot. If an application needs the kernel protection it should be
using filesystem-dax.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 16:30     ` Dan Williams
@ 2017-05-03 18:46       ` Kani, Toshimitsu
  2017-05-03 21:48         ` Dan Williams
  0 siblings, 1 reply; 14+ messages in thread
From: Kani, Toshimitsu @ 2017-05-03 18:46 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-kernel, dave.jiang, linux-nvdimm@lists.01.org

On Wed, 2017-05-03 at 09:30 -0700, Dan Williams wrote:
> On Wed, May 3, 2017 at 9:09 AM, Kani, Toshimitsu <toshi.kani@hpe.com>
> wrote:
> > On Wed, 2017-05-03 at 08:52 -0700, Dan Williams wrote:
> > > On Wed, May 3, 2017 at 8:31 AM, Toshi Kani <toshi.kani@hpe.com>
> > > wrote:
> > > > This is a RFC patch for seeking suggestions.  It adds support
> > > > of badblocks check in Device DAX by using region-level
> > > > badblocks list.  This patch is only briefly tested.
> > > > 
> > > > device_dax is a well-isolated self-contained module as it calls
> > > > alloc_dax() with dev_dax, which is private to device_dax.  For
> > > > checking badblocks, it needs to call dax_pmem to check with
> > > > region-level badblocks.
> > > > 
> > > > This patch attempts to keep device_dax self-contained.  It adds
> > > > check_error() to dax_operations, and dax_check_error() as a
> > > > stub with *dev_dax and *dev pointers to convey it to
> > > > dax_pmem.  I am wondering if this is the right direction, or we
> > > > should change the modularity to let dax_pmem call alloc_dax()
> > > > with its dax_pmem (or I completely missed something).
> > > 
> > > The problem is that device-dax guarantees a given fault
> > > granularity. To make that guarantee we can't fallback from 1G or
> > > 2M mappings due to an error. We also can't reasonably go the
> > > other way and fail mappings that contain a badblock because that
> > > would change the blast radius of a media error to the fault size.
> > 
> > Does it mean we expect users to have CPUs with MCE recovery for
> > Device DAX?  Can we add an attributes like allow error-check &
> > fall-back?
> 
> Yes, without MCE recovery device-dax mappings that consume errors
> will reboot. If an application needs the kernel protection it should
> be using filesystem-dax.

Understood.  Are we going to provide sysfs "badblocks" for Device DAX
as it is also needed for ndctl clear-error?

Thanks,
-Toshi
 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 18:46       ` Kani, Toshimitsu
@ 2017-05-03 21:48         ` Dan Williams
  2017-05-03 21:56           ` Dave Jiang
  2017-05-03 22:41           ` Kani, Toshimitsu
  0 siblings, 2 replies; 14+ messages in thread
From: Dan Williams @ 2017-05-03 21:48 UTC (permalink / raw)
  To: Kani, Toshimitsu; +Cc: linux-kernel, Jiang, Dave, linux-nvdimm@lists.01.org

On Wed, May 3, 2017 at 11:46 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> On Wed, 2017-05-03 at 09:30 -0700, Dan Williams wrote:
>> On Wed, May 3, 2017 at 9:09 AM, Kani, Toshimitsu <toshi.kani@hpe.com>
>> wrote:
>> > On Wed, 2017-05-03 at 08:52 -0700, Dan Williams wrote:
>> > > On Wed, May 3, 2017 at 8:31 AM, Toshi Kani <toshi.kani@hpe.com>
>> > > wrote:
>> > > > This is a RFC patch for seeking suggestions.  It adds support
>> > > > of badblocks check in Device DAX by using region-level
>> > > > badblocks list.  This patch is only briefly tested.
>> > > >
>> > > > device_dax is a well-isolated self-contained module as it calls
>> > > > alloc_dax() with dev_dax, which is private to device_dax.  For
>> > > > checking badblocks, it needs to call dax_pmem to check with
>> > > > region-level badblocks.
>> > > >
>> > > > This patch attempts to keep device_dax self-contained.  It adds
>> > > > check_error() to dax_operations, and dax_check_error() as a
>> > > > stub with *dev_dax and *dev pointers to convey it to
>> > > > dax_pmem.  I am wondering if this is the right direction, or we
>> > > > should change the modularity to let dax_pmem call alloc_dax()
>> > > > with its dax_pmem (or I completely missed something).
>> > >
>> > > The problem is that device-dax guarantees a given fault
>> > > granularity. To make that guarantee we can't fallback from 1G or
>> > > 2M mappings due to an error. We also can't reasonably go the
>> > > other way and fail mappings that contain a badblock because that
>> > > would change the blast radius of a media error to the fault size.
>> >
>> > Does it mean we expect users to have CPUs with MCE recovery for
>> > Device DAX?  Can we add an attributes like allow error-check &
>> > fall-back?
>>
>> Yes, without MCE recovery device-dax mappings that consume errors
>> will reboot. If an application needs the kernel protection it should
>> be using filesystem-dax.
>
> Understood.  Are we going to provide sysfs "badblocks" for Device DAX
> as it is also needed for ndctl clear-error?

No, I had started that way, but badblocks really needs write(2) or
fallocate(PUNCH_HOLE) support for clearing errors. Since we don't want
to support write(2) and were NAKd from supporting fallocate() the only
interface that was left was sending clear-error-DSM ioctls directly to
 the nvdimm bus. Since that is a very libnvdimm specific interface it
made sense to then add badblocks at the libnvdimm-region level. The
"ndctl clear-error" command is there to do the translation of error
offsets in user space and supersedes the need for the kernel to carry
a badblocks file for device-dax.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 21:48         ` Dan Williams
@ 2017-05-03 21:56           ` Dave Jiang
  2017-05-03 22:41           ` Kani, Toshimitsu
  1 sibling, 0 replies; 14+ messages in thread
From: Dave Jiang @ 2017-05-03 21:56 UTC (permalink / raw)
  To: Dan Williams, Kani, Toshimitsu; +Cc: linux-kernel, linux-nvdimm@lists.01.org

On 05/03/2017 02:48 PM, Dan Williams wrote:
> On Wed, May 3, 2017 at 11:46 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
>> On Wed, 2017-05-03 at 09:30 -0700, Dan Williams wrote:
>>> On Wed, May 3, 2017 at 9:09 AM, Kani, Toshimitsu <toshi.kani@hpe.com>
>>> wrote:
>>>> On Wed, 2017-05-03 at 08:52 -0700, Dan Williams wrote:
>>>>> On Wed, May 3, 2017 at 8:31 AM, Toshi Kani <toshi.kani@hpe.com>
>>>>> wrote:
>>>>>> This is a RFC patch for seeking suggestions.  It adds support
>>>>>> of badblocks check in Device DAX by using region-level
>>>>>> badblocks list.  This patch is only briefly tested.
>>>>>>
>>>>>> device_dax is a well-isolated self-contained module as it calls
>>>>>> alloc_dax() with dev_dax, which is private to device_dax.  For
>>>>>> checking badblocks, it needs to call dax_pmem to check with
>>>>>> region-level badblocks.
>>>>>>
>>>>>> This patch attempts to keep device_dax self-contained.  It adds
>>>>>> check_error() to dax_operations, and dax_check_error() as a
>>>>>> stub with *dev_dax and *dev pointers to convey it to
>>>>>> dax_pmem.  I am wondering if this is the right direction, or we
>>>>>> should change the modularity to let dax_pmem call alloc_dax()
>>>>>> with its dax_pmem (or I completely missed something).
>>>>>
>>>>> The problem is that device-dax guarantees a given fault
>>>>> granularity. To make that guarantee we can't fallback from 1G or
>>>>> 2M mappings due to an error. We also can't reasonably go the
>>>>> other way and fail mappings that contain a badblock because that
>>>>> would change the blast radius of a media error to the fault size.
>>>>
>>>> Does it mean we expect users to have CPUs with MCE recovery for
>>>> Device DAX?  Can we add an attributes like allow error-check &
>>>> fall-back?
>>>
>>> Yes, without MCE recovery device-dax mappings that consume errors
>>> will reboot. If an application needs the kernel protection it should
>>> be using filesystem-dax.
>>
>> Understood.  Are we going to provide sysfs "badblocks" for Device DAX
>> as it is also needed for ndctl clear-error?
> 
> No, I had started that way, but badblocks really needs write(2) or
> fallocate(PUNCH_HOLE) support for clearing errors. Since we don't want
> to support write(2) and were NAKd from supporting fallocate() the only
> interface that was left was sending clear-error-DSM ioctls directly to
>  the nvdimm bus. Since that is a very libnvdimm specific interface it
> made sense to then add badblocks at the libnvdimm-region level. The
> "ndctl clear-error" command is there to do the translation of error
> offsets in user space and supersedes the need for the kernel to carry
> a badblocks file for device-dax.
> 

Toshi, I'm also working on ndctl list-errors in relations to dev dax so
that you get a list of badblocks that are fixed up for dev dax.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 21:48         ` Dan Williams
  2017-05-03 21:56           ` Dave Jiang
@ 2017-05-03 22:41           ` Kani, Toshimitsu
  2017-05-03 22:51             ` Dan Williams
  1 sibling, 1 reply; 14+ messages in thread
From: Kani, Toshimitsu @ 2017-05-03 22:41 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-kernel, dave.jiang, linux-nvdimm@lists.01.org

On Wed, 2017-05-03 at 14:48 -0700, Dan Williams wrote:
> On Wed, May 3, 2017 at 11:46 AM, Kani, Toshimitsu <toshi.kani@hpe.com
> > wrote:
> > On Wed, 2017-05-03 at 09:30 -0700, Dan Williams wrote:
> > > On Wed, May 3, 2017 at 9:09 AM, Kani, Toshimitsu <toshi.kani@hpe.
> > > com>
> > > wrote:
> > > > On Wed, 2017-05-03 at 08:52 -0700, Dan Williams wrote:
> > > > > On Wed, May 3, 2017 at 8:31 AM, Toshi Kani <toshi.kani@hpe.co
> > > > > m>
> > > > > wrote:
> > > > > > This is a RFC patch for seeking suggestions.  It adds
> > > > > > support of badblocks check in Device DAX by using region-
> > > > > > level badblocks list.  This patch is only briefly tested.
> > > > > > 
> > > > > > device_dax is a well-isolated self-contained module as it
> > > > > > calls alloc_dax() with dev_dax, which is private to
> > > > > > device_dax.  For checking badblocks, it needs to call
> > > > > > dax_pmem to check with region-level badblocks.
> > > > > > 
> > > > > > This patch attempts to keep device_dax self-contained.  It
> > > > > > adds check_error() to dax_operations, and dax_check_error()
> > > > > > as a stub with *dev_dax and *dev pointers to convey it to
> > > > > > dax_pmem.  I am wondering if this is the right direction,
> > > > > > or we should change the modularity to let dax_pmem call
> > > > > > alloc_dax() with its dax_pmem (or I completely missed
> > > > > > something).
> > > > > 
> > > > > The problem is that device-dax guarantees a given fault
> > > > > granularity. To make that guarantee we can't fallback from 1G
> > > > > or 2M mappings due to an error. We also can't reasonably go
> > > > > the other way and fail mappings that contain a badblock
> > > > > because that would change the blast radius of a media error
> > > > > to the fault size.
> > > > 
> > > > Does it mean we expect users to have CPUs with MCE recovery for
> > > > Device DAX?  Can we add an attributes like allow error-check &
> > > > fall-back?
> > > 
> > > Yes, without MCE recovery device-dax mappings that consume errors
> > > will reboot. If an application needs the kernel protection it
> > > should be using filesystem-dax.
> > 
> > Understood.  Are we going to provide sysfs "badblocks" for Device
> > DAX as it is also needed for ndctl clear-error?
> 
> No, I had started that way, but badblocks really needs write(2) or
> fallocate(PUNCH_HOLE) support for clearing errors. Since we don't
> want to support write(2) and were NAKd from supporting fallocate()
> the only interface that was left was sending clear-error-DSM ioctls
> directly to  the nvdimm bus. Since that is a very libnvdimm specific
> interface it made sense to then add badblocks at the libnvdimm-region 
> level. The "ndctl clear-error" command is there to do the translation
> of error offsets in user space and supersedes the need for the kernel
> to carry a badblocks file for device-dax.

I am fine with using ndctl to clear errors.  What I need is to allow an
application to avoid accessing to bad blocks by reading a sysfs file
and managing the bad blocks list by itself since the kernel does not
protect it at page faults.  At least, data offset of Device DAX should
be provided for such application to do the translation by itself. 

Thanks,
-Toshi

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 22:41           ` Kani, Toshimitsu
@ 2017-05-03 22:51             ` Dan Williams
  2017-05-03 23:08               ` Dan Williams
  0 siblings, 1 reply; 14+ messages in thread
From: Dan Williams @ 2017-05-03 22:51 UTC (permalink / raw)
  To: Kani, Toshimitsu; +Cc: linux-kernel, Jiang, Dave, linux-nvdimm@lists.01.org

On Wed, May 3, 2017 at 3:41 PM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> On Wed, 2017-05-03 at 14:48 -0700, Dan Williams wrote:
>> On Wed, May 3, 2017 at 11:46 AM, Kani, Toshimitsu <toshi.kani@hpe.com
>> > wrote:
>> > On Wed, 2017-05-03 at 09:30 -0700, Dan Williams wrote:
>> > > On Wed, May 3, 2017 at 9:09 AM, Kani, Toshimitsu <toshi.kani@hpe.
>> > > com>
>> > > wrote:
>> > > > On Wed, 2017-05-03 at 08:52 -0700, Dan Williams wrote:
>> > > > > On Wed, May 3, 2017 at 8:31 AM, Toshi Kani <toshi.kani@hpe.co
>> > > > > m>
>> > > > > wrote:
>> > > > > > This is a RFC patch for seeking suggestions.  It adds
>> > > > > > support of badblocks check in Device DAX by using region-
>> > > > > > level badblocks list.  This patch is only briefly tested.
>> > > > > >
>> > > > > > device_dax is a well-isolated self-contained module as it
>> > > > > > calls alloc_dax() with dev_dax, which is private to
>> > > > > > device_dax.  For checking badblocks, it needs to call
>> > > > > > dax_pmem to check with region-level badblocks.
>> > > > > >
>> > > > > > This patch attempts to keep device_dax self-contained.  It
>> > > > > > adds check_error() to dax_operations, and dax_check_error()
>> > > > > > as a stub with *dev_dax and *dev pointers to convey it to
>> > > > > > dax_pmem.  I am wondering if this is the right direction,
>> > > > > > or we should change the modularity to let dax_pmem call
>> > > > > > alloc_dax() with its dax_pmem (or I completely missed
>> > > > > > something).
>> > > > >
>> > > > > The problem is that device-dax guarantees a given fault
>> > > > > granularity. To make that guarantee we can't fallback from 1G
>> > > > > or 2M mappings due to an error. We also can't reasonably go
>> > > > > the other way and fail mappings that contain a badblock
>> > > > > because that would change the blast radius of a media error
>> > > > > to the fault size.
>> > > >
>> > > > Does it mean we expect users to have CPUs with MCE recovery for
>> > > > Device DAX?  Can we add an attributes like allow error-check &
>> > > > fall-back?
>> > >
>> > > Yes, without MCE recovery device-dax mappings that consume errors
>> > > will reboot. If an application needs the kernel protection it
>> > > should be using filesystem-dax.
>> >
>> > Understood.  Are we going to provide sysfs "badblocks" for Device
>> > DAX as it is also needed for ndctl clear-error?
>>
>> No, I had started that way, but badblocks really needs write(2) or
>> fallocate(PUNCH_HOLE) support for clearing errors. Since we don't
>> want to support write(2) and were NAKd from supporting fallocate()
>> the only interface that was left was sending clear-error-DSM ioctls
>> directly to  the nvdimm bus. Since that is a very libnvdimm specific
>> interface it made sense to then add badblocks at the libnvdimm-region
>> level. The "ndctl clear-error" command is there to do the translation
>> of error offsets in user space and supersedes the need for the kernel
>> to carry a badblocks file for device-dax.
>
> I am fine with using ndctl to clear errors.  What I need is to allow an
> application to avoid accessing to bad blocks by reading a sysfs file
> and managing the bad blocks list by itself since the kernel does not
> protect it at page faults.  At least, data offset of Device DAX should
> be provided for such application to do the translation by itself.

I believe we already have all the data needed to calculate the data
offset. Given the following sysfs path:

    /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region1/dax1.1/dax/dax1.0

...we can find the associated namespace device from that dax1.1. From
there we have the base address of the namespace and the size
device-dax instance.

    device_dax_data_offset == namespace_base + namespace_size - device_dax_size

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 22:51             ` Dan Williams
@ 2017-05-03 23:08               ` Dan Williams
  2017-05-03 23:25                 ` Kani, Toshimitsu
  0 siblings, 1 reply; 14+ messages in thread
From: Dan Williams @ 2017-05-03 23:08 UTC (permalink / raw)
  To: Kani, Toshimitsu; +Cc: linux-kernel, Jiang, Dave, linux-nvdimm@lists.01.org

On Wed, May 3, 2017 at 3:51 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Wed, May 3, 2017 at 3:41 PM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
>> On Wed, 2017-05-03 at 14:48 -0700, Dan Williams wrote:
>>> On Wed, May 3, 2017 at 11:46 AM, Kani, Toshimitsu <toshi.kani@hpe.com
>>> > wrote:
>>> > On Wed, 2017-05-03 at 09:30 -0700, Dan Williams wrote:
>>> > > On Wed, May 3, 2017 at 9:09 AM, Kani, Toshimitsu <toshi.kani@hpe.
>>> > > com>
>>> > > wrote:
>>> > > > On Wed, 2017-05-03 at 08:52 -0700, Dan Williams wrote:
>>> > > > > On Wed, May 3, 2017 at 8:31 AM, Toshi Kani <toshi.kani@hpe.co
>>> > > > > m>
>>> > > > > wrote:
>>> > > > > > This is a RFC patch for seeking suggestions.  It adds
>>> > > > > > support of badblocks check in Device DAX by using region-
>>> > > > > > level badblocks list.  This patch is only briefly tested.
>>> > > > > >
>>> > > > > > device_dax is a well-isolated self-contained module as it
>>> > > > > > calls alloc_dax() with dev_dax, which is private to
>>> > > > > > device_dax.  For checking badblocks, it needs to call
>>> > > > > > dax_pmem to check with region-level badblocks.
>>> > > > > >
>>> > > > > > This patch attempts to keep device_dax self-contained.  It
>>> > > > > > adds check_error() to dax_operations, and dax_check_error()
>>> > > > > > as a stub with *dev_dax and *dev pointers to convey it to
>>> > > > > > dax_pmem.  I am wondering if this is the right direction,
>>> > > > > > or we should change the modularity to let dax_pmem call
>>> > > > > > alloc_dax() with its dax_pmem (or I completely missed
>>> > > > > > something).
>>> > > > >
>>> > > > > The problem is that device-dax guarantees a given fault
>>> > > > > granularity. To make that guarantee we can't fallback from 1G
>>> > > > > or 2M mappings due to an error. We also can't reasonably go
>>> > > > > the other way and fail mappings that contain a badblock
>>> > > > > because that would change the blast radius of a media error
>>> > > > > to the fault size.
>>> > > >
>>> > > > Does it mean we expect users to have CPUs with MCE recovery for
>>> > > > Device DAX?  Can we add an attributes like allow error-check &
>>> > > > fall-back?
>>> > >
>>> > > Yes, without MCE recovery device-dax mappings that consume errors
>>> > > will reboot. If an application needs the kernel protection it
>>> > > should be using filesystem-dax.
>>> >
>>> > Understood.  Are we going to provide sysfs "badblocks" for Device
>>> > DAX as it is also needed for ndctl clear-error?
>>>
>>> No, I had started that way, but badblocks really needs write(2) or
>>> fallocate(PUNCH_HOLE) support for clearing errors. Since we don't
>>> want to support write(2) and were NAKd from supporting fallocate()
>>> the only interface that was left was sending clear-error-DSM ioctls
>>> directly to  the nvdimm bus. Since that is a very libnvdimm specific
>>> interface it made sense to then add badblocks at the libnvdimm-region
>>> level. The "ndctl clear-error" command is there to do the translation
>>> of error offsets in user space and supersedes the need for the kernel
>>> to carry a badblocks file for device-dax.
>>
>> I am fine with using ndctl to clear errors.  What I need is to allow an
>> application to avoid accessing to bad blocks by reading a sysfs file
>> and managing the bad blocks list by itself since the kernel does not
>> protect it at page faults.  At least, data offset of Device DAX should
>> be provided for such application to do the translation by itself.
>
> I believe we already have all the data needed to calculate the data
> offset. Given the following sysfs path:
>
>     /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region1/dax1.1/dax/dax1.0
>
> ...we can find the associated namespace device from that dax1.1. From
> there we have the base address of the namespace and the size
> device-dax instance.
>
>     device_dax_data_offset == namespace_base + namespace_size - device_dax_size

Dave reminds me that we do have the data offset of the device-dax
instance at the libnvdimm level:

    /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region1/dax1.1/resource

...in this example, which maps to ndctl_dax_get_resource().

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 23:08               ` Dan Williams
@ 2017-05-03 23:25                 ` Kani, Toshimitsu
  2017-05-03 23:36                   ` Kani, Toshimitsu
  0 siblings, 1 reply; 14+ messages in thread
From: Kani, Toshimitsu @ 2017-05-03 23:25 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-kernel, dave.jiang, linux-nvdimm@lists.01.org

On Wed, 2017-05-03 at 16:08 -0700, Dan Williams wrote:
> On Wed, May 3, 2017 at 3:51 PM, Dan Williams <dan.j.williams@intel.co
> m> wrote:
> > On Wed, May 3, 2017 at 3:41 PM, Kani, Toshimitsu <toshi.kani@hpe.co
> > m> wrote:
> > > On Wed, 2017-05-03 at 14:48 -0700, Dan Williams wrote:
 :
> > 
> > I believe we already have all the data needed to calculate the data
> > offset. Given the following sysfs path:
> > 
> >     /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region1
> > /dax1.1/dax/dax1.0
> > 
> > ...we can find the associated namespace device from that dax1.1.
> > From
> > there we have the base address of the namespace and the size
> > device-dax instance.
> > 
> >     device_dax_data_offset == namespace_base + namespace_size -
> > device_dax_size
> 
> Dave reminds me that we do have the data offset of the device-dax
> instance at the libnvdimm level:
> 
>     /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region1/d
> ax1.1/resource
> 
> ...in this example, which maps to ndctl_dax_get_resource().

Thanks for the info!  I noticed why I did not catch this info before.

# ll /dev/dax*
crw------- 1 root root 251, 3 May  3 04:28 /dev/dax0.0

# pwd
/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region0/dax0.0

# grep . *
align:2097152
devtype:nd_dax
modalias:nd:t7
mode:none
numa_node:0
grep: power: Is a directory
grep: resource: No such device or address
grep: size: No such device or address
grep: subsystem: Is a directory
uevent:DEVTYPE=nd_dax
uevent:MODALIAS=nd:t7

But I noticed that "resource" and "size" that are under
".../region0/dax0.1" work.  Is this intended?

-Toshi

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 23:25                 ` Kani, Toshimitsu
@ 2017-05-03 23:36                   ` Kani, Toshimitsu
  2017-05-04  2:01                     ` Dan Williams
  0 siblings, 1 reply; 14+ messages in thread
From: Kani, Toshimitsu @ 2017-05-03 23:36 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-kernel, dave.jiang, linux-nvdimm@lists.01.org

On Wed, 2017-05-03 at 17:25 -0600, Toshi Kani wrote:
> On Wed, 2017-05-03 at 16:08 -0700, Dan Williams wrote:
> > On Wed, May 3, 2017 at 3:51 PM, Dan Williams <dan.j.williams@intel.
> > co
> > m> wrote:
> > > On Wed, May 3, 2017 at 3:41 PM, Kani, Toshimitsu <toshi.kani@hpe.
> > > co
> > > m> wrote:
> > > > On Wed, 2017-05-03 at 14:48 -0700, Dan Williams wrote:
> 
>  :
> > > 
> > > I believe we already have all the data needed to calculate the
> > > data
> > > offset. Given the following sysfs path:
> > > 
> > >     /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/regio
> > > n1
> > > /dax1.1/dax/dax1.0
> > > 
> > > ...we can find the associated namespace device from that dax1.1.
> > > From
> > > there we have the base address of the namespace and the size
> > > device-dax instance.
> > > 
> > >     device_dax_data_offset == namespace_base + namespace_size -
> > > device_dax_size
> > 
> > Dave reminds me that we do have the data offset of the device-dax
> > instance at the libnvdimm level:
> > 
> >     /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region1
> > /d
> > ax1.1/resource
> > 
> > ...in this example, which maps to ndctl_dax_get_resource().
> 
> Thanks for the info!  I noticed why I did not catch this info before.
> 
> # ll /dev/dax*
> crw------- 1 root root 251, 3 May  3 04:28 /dev/dax0.0
> 
> # pwd
> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region0/dax0.
> 0
> 
> # grep . *
> align:2097152
> devtype:nd_dax
> modalias:nd:t7
> mode:none
> numa_node:0
> grep: power: Is a directory
> grep: resource: No such device or address
> grep: size: No such device or address
> grep: subsystem: Is a directory
> uevent:DEVTYPE=nd_dax
> uevent:MODALIAS=nd:t7
> 
> But I noticed that "resource" and "size" that are under
> ".../region0/dax0.1" work.  Is this intended?

Here is an output from dax0.1 (I removed the size value).  Noticed that
mode is different.

# pwd
/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region0/dax0.1

# grep . *
align:2097152
grep: dax: Is a directory
grep: dax_region: Is a directory
devtype:nd_dax
grep: driver: Is a directory
modalias:nd:t7
mode:pmem
namespace:namespace0.0
numa_node:0
grep: power: Is a directory
resource:0x250200000
size:(size value)
grep: subsystem: Is a directory
uevent:DEVTYPE=nd_dax
uevent:DRIVER=dax_pmem
uevent:MODALIAS=nd:t7
uuid:8c71811f-260d-4788-8487-db88d829d393

Thanks,
-Toshi

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-03 23:36                   ` Kani, Toshimitsu
@ 2017-05-04  2:01                     ` Dan Williams
  2017-05-04 14:08                       ` Kani, Toshimitsu
  0 siblings, 1 reply; 14+ messages in thread
From: Dan Williams @ 2017-05-04  2:01 UTC (permalink / raw)
  To: Kani, Toshimitsu; +Cc: linux-kernel, Jiang, Dave, linux-nvdimm@lists.01.org

On Wed, May 3, 2017 at 4:36 PM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> On Wed, 2017-05-03 at 17:25 -0600, Toshi Kani wrote:
>> On Wed, 2017-05-03 at 16:08 -0700, Dan Williams wrote:
>> > On Wed, May 3, 2017 at 3:51 PM, Dan Williams <dan.j.williams@intel.
>> > co
>> > m> wrote:
>> > > On Wed, May 3, 2017 at 3:41 PM, Kani, Toshimitsu <toshi.kani@hpe.
>> > > co
>> > > m> wrote:
>> > > > On Wed, 2017-05-03 at 14:48 -0700, Dan Williams wrote:
>>
>>  :
>> > >
>> > > I believe we already have all the data needed to calculate the
>> > > data
>> > > offset. Given the following sysfs path:
>> > >
>> > >     /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/regio
>> > > n1
>> > > /dax1.1/dax/dax1.0
>> > >
>> > > ...we can find the associated namespace device from that dax1.1.
>> > > From
>> > > there we have the base address of the namespace and the size
>> > > device-dax instance.
>> > >
>> > >     device_dax_data_offset == namespace_base + namespace_size -
>> > > device_dax_size
>> >
>> > Dave reminds me that we do have the data offset of the device-dax
>> > instance at the libnvdimm level:
>> >
>> >     /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region1
>> > /d
>> > ax1.1/resource
>> >
>> > ...in this example, which maps to ndctl_dax_get_resource().
>>
>> Thanks for the info!  I noticed why I did not catch this info before.
>>
>> # ll /dev/dax*
>> crw------- 1 root root 251, 3 May  3 04:28 /dev/dax0.0
>>
>> # pwd
>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region0/dax0.
>> 0
>>
>> # grep . *
>> align:2097152
>> devtype:nd_dax
>> modalias:nd:t7
>> mode:none
>> numa_node:0
>> grep: power: Is a directory
>> grep: resource: No such device or address
>> grep: size: No such device or address
>> grep: subsystem: Is a directory
>> uevent:DEVTYPE=nd_dax
>> uevent:MODALIAS=nd:t7
>>
>> But I noticed that "resource" and "size" that are under
>> ".../region0/dax0.1" work.  Is this intended?

You mean that region0/dax0.1 and region0/dax0.1/dax/dax0.0 are
functional, but region0/dax0.0 is not? Yes, that is expected the
libnvdimm "struct nd_dax" device is on a different device numbering
scheme than the "struct dev_dax" instance. Depending on load and
namespace reconfiguration order you can expect those names to not
match. The dev_dax instance name is "dax region id"."sub-division
instance"

>
> Here is an output from dax0.1 (I removed the size value).  Noticed that
> mode is different.
>
> # pwd
> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region0/dax0.1
>
> # grep . *
> align:2097152
> grep: dax: Is a directory
> grep: dax_region: Is a directory
> devtype:nd_dax
> grep: driver: Is a directory
> modalias:nd:t7
> mode:pmem
> namespace:namespace0.0
> numa_node:0
> grep: power: Is a directory
> resource:0x250200000
> size:(size value)
> grep: subsystem: Is a directory
> uevent:DEVTYPE=nd_dax
> uevent:DRIVER=dax_pmem
> uevent:MODALIAS=nd:t7
> uuid:8c71811f-260d-4788-8487-db88d829d393

The "mode" in this instance is the mode for the struct page
allocation, i.e. whether it is coming from main memory "mem" or the
persistent memory itself "pmem" in this case.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] dax: add badblocks check to Device DAX
  2017-05-04  2:01                     ` Dan Williams
@ 2017-05-04 14:08                       ` Kani, Toshimitsu
  0 siblings, 0 replies; 14+ messages in thread
From: Kani, Toshimitsu @ 2017-05-04 14:08 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-kernel, dave.jiang, linux-nvdimm@lists.01.org

On Wed, 2017-05-03 at 19:01 -0700, Dan Williams wrote:
> On Wed, May 3, 2017 at 4:36 PM, Kani, Toshimitsu <toshi.kani@hpe.com>
> wrote:
> > On Wed, 2017-05-03 at 17:25 -0600, Toshi Kani wrote:
> > > On Wed, 2017-05-03 at 16:08 -0700, Dan Williams wrote:
> > > > On Wed, May 3, 2017 at 3:51 PM, Dan Williams
> > > > <dan.j.williams@intel.com> wrote:
> > > > > On Wed, May 3, 2017 at 3:41 PM, Kani, Toshimitsu
> > > > > <toshi.kani@hpe.com> wrote:
> > > > > > On Wed, 2017-05-03 at 14:48 -0700, Dan Williams wrote:
> > > 
> > >  :
> > > > > 
> > > > > I believe we already have all the data needed to calculate
> > > > > the data offset. Given the following sysfs path:
> > > > > 
> > > > >     /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/r
> > > > > egion1/dax1.1/dax/dax1.0
> > > > > 
> > > > > ...we can find the associated namespace device from that
> > > > > dax1.1.
> > > > > From there we have the base address of the namespace and the
> > > > > size device-dax instance.
> > > > > 
> > > > >     device_dax_data_offset == namespace_base + namespace_size
> > > > > -
> > > > > device_dax_size
> > > > 
> > > > Dave reminds me that we do have the data offset of the device-
> > > > dax instance at the libnvdimm level:
> > > > 
> > > >     /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/reg
> > > > ion1/dax1.1/resource
> > > > 
> > > > ...in this example, which maps to ndctl_dax_get_resource().
> > > 
> > > Thanks for the info!  I noticed why I did not catch this info
> > > before.
> > > 
> > > # ll /dev/dax*
> > > crw------- 1 root root 251, 3 May  3 04:28 /dev/dax0.0
> > > 
> > > # pwd
> > > /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region0/d
> > > ax0.0
> > > 
> > > # grep . *
> > > align:2097152
> > > devtype:nd_dax
> > > modalias:nd:t7
> > > mode:none
> > > numa_node:0
> > > grep: power: Is a directory
> > > grep: resource: No such device or address
> > > grep: size: No such device or address
> > > grep: subsystem: Is a directory
> > > uevent:DEVTYPE=nd_dax
> > > uevent:MODALIAS=nd:t7
> > > 
> > > But I noticed that "resource" and "size" that are under
> > > ".../region0/dax0.1" work.  Is this intended?
> 
> You mean that region0/dax0.1 and region0/dax0.1/dax/dax0.0 are
> functional, but region0/dax0.0 is not? Yes, that is expected the
> libnvdimm "struct nd_dax" device is on a different device numbering
> scheme than the "struct dev_dax" instance. Depending on load and
> namespace reconfiguration order you can expect those names to not
> match. The dev_dax instance name is "dax region id"."sub-division
> instance"

Oh, I see. You are right. Looks like I can use the symbolic link under
"class/dax" to avoid this numbering issue.

# ll /sys/class/dax/dax0.0
lrwxrwxrwx 1 root root 0 May  4 02:55 /sys/class/dax/dax0.0 ->
../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region0/dax0.1
/dax/dax0.0

> > 
> > Here is an output from dax0.1 (I removed the size value).  Noticed
> > that mode is different.
> > 
> > # pwd
> > /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region0/dax
> > 0.1
> > 
> > # grep . *
> > align:2097152
> > grep: dax: Is a directory
> > grep: dax_region: Is a directory
> > devtype:nd_dax
> > grep: driver: Is a directory
> > modalias:nd:t7
> > mode:pmem
> > namespace:namespace0.0
> > numa_node:0
> > grep: power: Is a directory
> > resource:0x250200000
> > size:(size value)
> > grep: subsystem: Is a directory
> > uevent:DEVTYPE=nd_dax
> > uevent:DRIVER=dax_pmem
> > uevent:MODALIAS=nd:t7
> > uuid:8c71811f-260d-4788-8487-db88d829d393
> 
> The "mode" in this instance is the mode for the struct page
> allocation, i.e. whether it is coming from main memory "mem" or the
> persistent memory itself "pmem" in this case.

Right. I was totally confused by the numbering of these files...

Thanks!
-Toshi

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2017-05-04 14:08 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-03 15:31 [RFC PATCH] dax: add badblocks check to Device DAX Toshi Kani
2017-05-03 15:52 ` Dan Williams
2017-05-03 16:09   ` Kani, Toshimitsu
2017-05-03 16:30     ` Dan Williams
2017-05-03 18:46       ` Kani, Toshimitsu
2017-05-03 21:48         ` Dan Williams
2017-05-03 21:56           ` Dave Jiang
2017-05-03 22:41           ` Kani, Toshimitsu
2017-05-03 22:51             ` Dan Williams
2017-05-03 23:08               ` Dan Williams
2017-05-03 23:25                 ` Kani, Toshimitsu
2017-05-03 23:36                   ` Kani, Toshimitsu
2017-05-04  2:01                     ` Dan Williams
2017-05-04 14:08                       ` Kani, Toshimitsu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).