All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] block: fix pfn_mkwrite() DAX fault handler
@ 2016-01-28 19:35 ` Ross Zwisler
  0 siblings, 0 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-01-28 19:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ross Zwisler, Alexander Viro, Andrew Morton, Dan Williams,
	Dave Chinner, Jan Kara, Matthew Wilcox, linux-fsdevel,
	linux-nvdimm

Previously the pfn_mkwrite() fault handler for raw block devices called
bldev_dax_fault() -> __dax_fault() to do a full DAX page fault.  Really
what the pfn_mkwrite() fault handler needs to do is call dax_pfn_mkwrite()
to make sure that the radix tree entry for the given PTE is marked as dirty
so that a follow-up fsync or msync call will flush it durably to media.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Fixes: 5a023cdba50c ("block: enable dax for raw block devices")
---
 fs/block_dev.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7b9cd49..fa0507a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1730,6 +1730,12 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return __dax_fault(vma, vmf, blkdev_get_block, NULL);
 }
 
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
+		struct vm_fault *vmf)
+{
+	return dax_pfn_mkwrite(vma, vmf);
+}
+
 static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd, unsigned int flags)
 {
@@ -1761,7 +1767,7 @@ static const struct vm_operations_struct blkdev_dax_vm_ops = {
 	.close		= blkdev_vm_close,
 	.fault		= blkdev_dax_fault,
 	.pmd_fault	= blkdev_dax_pmd_fault,
-	.pfn_mkwrite	= blkdev_dax_fault,
+	.pfn_mkwrite	= blkdev_dax_pfn_mkwrite,
 };
 
 static const struct vm_operations_struct blkdev_default_vm_ops = {
-- 
2.5.0


^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH 1/2] block: fix pfn_mkwrite() DAX fault handler
@ 2016-01-28 19:35 ` Ross Zwisler
  0 siblings, 0 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-01-28 19:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ross Zwisler, Alexander Viro, Andrew Morton, Dan Williams,
	Dave Chinner, Jan Kara, Matthew Wilcox, linux-fsdevel,
	linux-nvdimm

Previously the pfn_mkwrite() fault handler for raw block devices called
bldev_dax_fault() -> __dax_fault() to do a full DAX page fault.  Really
what the pfn_mkwrite() fault handler needs to do is call dax_pfn_mkwrite()
to make sure that the radix tree entry for the given PTE is marked as dirty
so that a follow-up fsync or msync call will flush it durably to media.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Fixes: 5a023cdba50c ("block: enable dax for raw block devices")
---
 fs/block_dev.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7b9cd49..fa0507a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1730,6 +1730,12 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return __dax_fault(vma, vmf, blkdev_get_block, NULL);
 }
 
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
+		struct vm_fault *vmf)
+{
+	return dax_pfn_mkwrite(vma, vmf);
+}
+
 static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd, unsigned int flags)
 {
@@ -1761,7 +1767,7 @@ static const struct vm_operations_struct blkdev_dax_vm_ops = {
 	.close		= blkdev_vm_close,
 	.fault		= blkdev_dax_fault,
 	.pmd_fault	= blkdev_dax_pmd_fault,
-	.pfn_mkwrite	= blkdev_dax_fault,
+	.pfn_mkwrite	= blkdev_dax_pfn_mkwrite,
 };
 
 static const struct vm_operations_struct blkdev_default_vm_ops = {
-- 
2.5.0

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-28 19:35 ` Ross Zwisler
@ 2016-01-28 19:35   ` Ross Zwisler
  -1 siblings, 0 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-01-28 19:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ross Zwisler, Alexander Viro, Andrew Morton, Dan Williams,
	Dave Chinner, Jan Kara, Matthew Wilcox, linux-fsdevel,
	linux-nvdimm

There are a number of places in dax.c that look up the struct block_device
associated with an inode.  Previously this was done by just using
inode->i_sb->s_bdev.  This is correct for inodes that exist within the
filesystems supported by DAX (ext2, ext4 & XFS), but when running DAX
against raw block devices this value is NULL.  This causes NULL pointer
dereferences when these block_device pointers are used.

Instead, for raw block devices we need to look up the struct block_device
using I_BDEV().  This patch fixes all the block_device lookups in dax.c so
that they work properly for both filesystems and raw block devices.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/dax.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 4fd6b0c..e60a5a7 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -32,6 +32,9 @@
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
 
+#define DAX_BDEV(inode) (S_ISBLK(inode->i_mode) ? I_BDEV(inode) \
+				: inode->i_sb->s_bdev)
+
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
 	struct request_queue *q = bdev->bd_queue;
@@ -65,7 +68,7 @@ static void dax_unmap_atomic(struct block_device *bdev,
  */
 int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
 {
-	struct block_device *bdev = inode->i_sb->s_bdev;
+	struct block_device *bdev = DAX_BDEV(inode);
 	struct blk_dax_ctl dax = {
 		.sector = block << (inode->i_blkbits - 9),
 		.size = _size,
@@ -246,7 +249,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 	loff_t end = pos + iov_iter_count(iter);
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
+	bh.b_bdev = DAX_BDEV(inode);
 
 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
 		struct address_space *mapping = inode->i_mapping;
@@ -468,7 +471,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
 		loff_t end)
 {
 	struct inode *inode = mapping->host;
-	struct block_device *bdev = inode->i_sb->s_bdev;
+	struct block_device *bdev = DAX_BDEV(inode);
 	pgoff_t start_index, end_index, pmd_index;
 	pgoff_t indices[PAGEVEC_SIZE];
 	struct pagevec pvec;
@@ -608,7 +611,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	memset(&bh, 0, sizeof(bh));
 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
-	bh.b_bdev = inode->i_sb->s_bdev;
+	bh.b_bdev = DAX_BDEV(inode);
 	bh.b_size = PAGE_SIZE;
 
  repeat:
@@ -827,7 +830,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
+	bh.b_bdev = DAX_BDEV(inode);
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
@@ -1080,7 +1083,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 	BUG_ON((offset + length) > PAGE_CACHE_SIZE);
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
+	bh.b_bdev = DAX_BDEV(inode);
 	bh.b_size = PAGE_CACHE_SIZE;
 	err = get_block(inode, index, &bh, 0);
 	if (err < 0)
-- 
2.5.0

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH 2/2] dax: fix bdev NULL pointer dereferences
@ 2016-01-28 19:35   ` Ross Zwisler
  0 siblings, 0 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-01-28 19:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ross Zwisler, Alexander Viro, Andrew Morton, Dan Williams,
	Dave Chinner, Jan Kara, Matthew Wilcox, linux-fsdevel,
	linux-nvdimm

There are a number of places in dax.c that look up the struct block_device
associated with an inode.  Previously this was done by just using
inode->i_sb->s_bdev.  This is correct for inodes that exist within the
filesystems supported by DAX (ext2, ext4 & XFS), but when running DAX
against raw block devices this value is NULL.  This causes NULL pointer
dereferences when these block_device pointers are used.

Instead, for raw block devices we need to look up the struct block_device
using I_BDEV().  This patch fixes all the block_device lookups in dax.c so
that they work properly for both filesystems and raw block devices.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/dax.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 4fd6b0c..e60a5a7 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -32,6 +32,9 @@
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
 
+#define DAX_BDEV(inode) (S_ISBLK(inode->i_mode) ? I_BDEV(inode) \
+				: inode->i_sb->s_bdev)
+
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
 	struct request_queue *q = bdev->bd_queue;
@@ -65,7 +68,7 @@ static void dax_unmap_atomic(struct block_device *bdev,
  */
 int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
 {
-	struct block_device *bdev = inode->i_sb->s_bdev;
+	struct block_device *bdev = DAX_BDEV(inode);
 	struct blk_dax_ctl dax = {
 		.sector = block << (inode->i_blkbits - 9),
 		.size = _size,
@@ -246,7 +249,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 	loff_t end = pos + iov_iter_count(iter);
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
+	bh.b_bdev = DAX_BDEV(inode);
 
 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
 		struct address_space *mapping = inode->i_mapping;
@@ -468,7 +471,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
 		loff_t end)
 {
 	struct inode *inode = mapping->host;
-	struct block_device *bdev = inode->i_sb->s_bdev;
+	struct block_device *bdev = DAX_BDEV(inode);
 	pgoff_t start_index, end_index, pmd_index;
 	pgoff_t indices[PAGEVEC_SIZE];
 	struct pagevec pvec;
@@ -608,7 +611,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	memset(&bh, 0, sizeof(bh));
 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
-	bh.b_bdev = inode->i_sb->s_bdev;
+	bh.b_bdev = DAX_BDEV(inode);
 	bh.b_size = PAGE_SIZE;
 
  repeat:
@@ -827,7 +830,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
+	bh.b_bdev = DAX_BDEV(inode);
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
@@ -1080,7 +1083,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 	BUG_ON((offset + length) > PAGE_CACHE_SIZE);
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
+	bh.b_bdev = DAX_BDEV(inode);
 	bh.b_size = PAGE_CACHE_SIZE;
 	err = get_block(inode, index, &bh, 0);
 	if (err < 0)
-- 
2.5.0

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-28 19:35   ` Ross Zwisler
@ 2016-01-28 20:21     ` Dan Williams
  -1 siblings, 0 replies; 66+ messages in thread
From: Dan Williams @ 2016-01-28 20:21 UTC (permalink / raw)
  To: Ross Zwisler
  Cc: linux-kernel, Alexander Viro, Andrew Morton, Dave Chinner,
	Jan Kara, Matthew Wilcox, linux-fsdevel, linux-nvdimm

On Thu, Jan 28, 2016 at 11:35 AM, Ross Zwisler
<ross.zwisler@linux.intel.com> wrote:
> There are a number of places in dax.c that look up the struct block_device
> associated with an inode.  Previously this was done by just using
> inode->i_sb->s_bdev.  This is correct for inodes that exist within the
> filesystems supported by DAX (ext2, ext4 & XFS), but when running DAX
> against raw block devices this value is NULL.  This causes NULL pointer
> dereferences when these block_device pointers are used.
>
> Instead, for raw block devices we need to look up the struct block_device
> using I_BDEV().  This patch fixes all the block_device lookups in dax.c so
> that they work properly for both filesystems and raw block devices.
>
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>

It's a bit odd to check if it is a raw device inode in
dax_clear_blocks() since there's no use case to clear blocks in that
case, but I can't think of a better alternative.

Acked-by: Dan Williams <dan.j.williams@intel.com>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
@ 2016-01-28 20:21     ` Dan Williams
  0 siblings, 0 replies; 66+ messages in thread
From: Dan Williams @ 2016-01-28 20:21 UTC (permalink / raw)
  To: Ross Zwisler
  Cc: linux-kernel, Alexander Viro, Andrew Morton, Dave Chinner,
	Jan Kara, Matthew Wilcox, linux-fsdevel,
	linux-nvdimm@lists.01.org

On Thu, Jan 28, 2016 at 11:35 AM, Ross Zwisler
<ross.zwisler@linux.intel.com> wrote:
> There are a number of places in dax.c that look up the struct block_device
> associated with an inode.  Previously this was done by just using
> inode->i_sb->s_bdev.  This is correct for inodes that exist within the
> filesystems supported by DAX (ext2, ext4 & XFS), but when running DAX
> against raw block devices this value is NULL.  This causes NULL pointer
> dereferences when these block_device pointers are used.
>
> Instead, for raw block devices we need to look up the struct block_device
> using I_BDEV().  This patch fixes all the block_device lookups in dax.c so
> that they work properly for both filesystems and raw block devices.
>
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>

It's a bit odd to check if it is a raw device inode in
dax_clear_blocks() since there's no use case to clear blocks in that
case, but I can't think of a better alternative.

Acked-by: Dan Williams <dan.j.williams@intel.com>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-28 19:35   ` Ross Zwisler
  (?)
  (?)
@ 2016-01-28 21:38   ` Christoph Hellwig
  2016-01-29 18:28     ` Ross Zwisler
  2016-02-02  0:02     ` Ross Zwisler
  -1 siblings, 2 replies; 66+ messages in thread
From: Christoph Hellwig @ 2016-01-28 21:38 UTC (permalink / raw)
  To: Ross Zwisler
  Cc: linux-kernel, Alexander Viro, Andrew Morton, Dan Williams,
	Dave Chinner, Jan Kara, Matthew Wilcox, linux-fsdevel,
	linux-nvdimm

On Thu, Jan 28, 2016 at 12:35:04PM -0700, Ross Zwisler wrote:
> There are a number of places in dax.c that look up the struct block_device
> associated with an inode.  Previously this was done by just using
> inode->i_sb->s_bdev.  This is correct for inodes that exist within the
> filesystems supported by DAX (ext2, ext4 & XFS), but when running DAX
> against raw block devices this value is NULL.  This causes NULL pointer
> dereferences when these block_device pointers are used.

It's also wrong for an XFS file system with a RT device..

> +#define DAX_BDEV(inode) (S_ISBLK(inode->i_mode) ? I_BDEV(inode) \
> +				: inode->i_sb->s_bdev)

.. but this isn't going to fix it.  You must use a bdev returned by
get_blocks or a similar file system method.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-28 21:38   ` Christoph Hellwig
@ 2016-01-29 18:28     ` Ross Zwisler
  2016-01-29 23:34       ` Ross Zwisler
  2016-01-30  5:28       ` Matthew Wilcox
  2016-02-02  0:02     ` Ross Zwisler
  1 sibling, 2 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-01-29 18:28 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Ross Zwisler, linux-kernel, Alexander Viro, Andrew Morton,
	Dan Williams, Dave Chinner, Jan Kara, Matthew Wilcox,
	linux-fsdevel, linux-nvdimm

On Thu, Jan 28, 2016 at 01:38:58PM -0800, Christoph Hellwig wrote:
> On Thu, Jan 28, 2016 at 12:35:04PM -0700, Ross Zwisler wrote:
> > There are a number of places in dax.c that look up the struct block_device
> > associated with an inode.  Previously this was done by just using
> > inode->i_sb->s_bdev.  This is correct for inodes that exist within the
> > filesystems supported by DAX (ext2, ext4 & XFS), but when running DAX
> > against raw block devices this value is NULL.  This causes NULL pointer
> > dereferences when these block_device pointers are used.
> 
> It's also wrong for an XFS file system with a RT device..
> 
> > +#define DAX_BDEV(inode) (S_ISBLK(inode->i_mode) ? I_BDEV(inode) \
> > +				: inode->i_sb->s_bdev)
> 
> .. but this isn't going to fix it.  You must use a bdev returned by
> get_blocks or a similar file system method.

I guess I need to go off and understand if we can have DAX mappings on such a
device.  If we can, we may have a problem - we can get the block_device from
get_block() in I/O path and the various fault paths, but we don't have access
to get_block() when flushing via dax_writeback_mapping_range().  We avoid
needing it the normal case by storing the sector results from get_block() in
the radix tree.

/me is off to play with RT devices...

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-29 18:28     ` Ross Zwisler
@ 2016-01-29 23:34       ` Ross Zwisler
  2016-01-30  0:18         ` Dan Williams
  2016-01-31 22:44         ` Dave Chinner
  2016-01-30  5:28       ` Matthew Wilcox
  1 sibling, 2 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-01-29 23:34 UTC (permalink / raw)
  To: Ross Zwisler, Christoph Hellwig, linux-kernel, Alexander Viro,
	Andrew Morton, Dan Williams, Dave Chinner, Jan Kara,
	Matthew Wilcox, linux-fsdevel, linux-nvdimm

On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
> On Thu, Jan 28, 2016 at 01:38:58PM -0800, Christoph Hellwig wrote:
> > On Thu, Jan 28, 2016 at 12:35:04PM -0700, Ross Zwisler wrote:
> > > There are a number of places in dax.c that look up the struct block_device
> > > associated with an inode.  Previously this was done by just using
> > > inode->i_sb->s_bdev.  This is correct for inodes that exist within the
> > > filesystems supported by DAX (ext2, ext4 & XFS), but when running DAX
> > > against raw block devices this value is NULL.  This causes NULL pointer
> > > dereferences when these block_device pointers are used.
> > 
> > It's also wrong for an XFS file system with a RT device..
> > 
> > > +#define DAX_BDEV(inode) (S_ISBLK(inode->i_mode) ? I_BDEV(inode) \
> > > +				: inode->i_sb->s_bdev)
> > 
> > .. but this isn't going to fix it.  You must use a bdev returned by
> > get_blocks or a similar file system method.
> 
> I guess I need to go off and understand if we can have DAX mappings on such a
> device.  If we can, we may have a problem - we can get the block_device from
> get_block() in I/O path and the various fault paths, but we don't have access
> to get_block() when flushing via dax_writeback_mapping_range().  We avoid
> needing it the normal case by storing the sector results from get_block() in
> the radix tree.
> 
> /me is off to play with RT devices...

Well, RT devices are completely broken as far as I can see.  I've reported the
breakage to the XFS list.  Anything I do that triggers a RT block allocation
in XFS causes a lockdep splat + a kernel BUG - I've tried regular pwrite(),
xfs_rtcp and mmap() + write to address.  Not a new bug either - happens just
the same with v4.4.  Happens with both PMEM and BRD, and has no relationship
to whether I'm using DAX or not.

Does it work for this patch to go in as-is since it fixes an immediate OOPS
with raw block devices + DAX, and when RT devices are alive again I'll figure
out how to make them work too?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-29 23:34       ` Ross Zwisler
@ 2016-01-30  0:18         ` Dan Williams
  2016-01-31 22:44         ` Dave Chinner
  1 sibling, 0 replies; 66+ messages in thread
From: Dan Williams @ 2016-01-30  0:18 UTC (permalink / raw)
  To: Ross Zwisler, Christoph Hellwig, linux-kernel, Alexander Viro,
	Andrew Morton, Dan Williams, Dave Chinner, Jan Kara,
	Matthew Wilcox, linux-fsdevel, linux-nvdimm

On Fri, Jan 29, 2016 at 3:34 PM, Ross Zwisler
<ross.zwisler@linux.intel.com> wrote:
> On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
>> On Thu, Jan 28, 2016 at 01:38:58PM -0800, Christoph Hellwig wrote:
>> > On Thu, Jan 28, 2016 at 12:35:04PM -0700, Ross Zwisler wrote:
>> > > There are a number of places in dax.c that look up the struct block_device
>> > > associated with an inode.  Previously this was done by just using
>> > > inode->i_sb->s_bdev.  This is correct for inodes that exist within the
>> > > filesystems supported by DAX (ext2, ext4 & XFS), but when running DAX
>> > > against raw block devices this value is NULL.  This causes NULL pointer
>> > > dereferences when these block_device pointers are used.
>> >
>> > It's also wrong for an XFS file system with a RT device..
>> >
>> > > +#define DAX_BDEV(inode) (S_ISBLK(inode->i_mode) ? I_BDEV(inode) \
>> > > +                         : inode->i_sb->s_bdev)
>> >
>> > .. but this isn't going to fix it.  You must use a bdev returned by
>> > get_blocks or a similar file system method.
>>
>> I guess I need to go off and understand if we can have DAX mappings on such a
>> device.  If we can, we may have a problem - we can get the block_device from
>> get_block() in I/O path and the various fault paths, but we don't have access
>> to get_block() when flushing via dax_writeback_mapping_range().  We avoid
>> needing it the normal case by storing the sector results from get_block() in
>> the radix tree.
>>
>> /me is off to play with RT devices...
>
> Well, RT devices are completely broken as far as I can see.  I've reported the
> breakage to the XFS list.  Anything I do that triggers a RT block allocation
> in XFS causes a lockdep splat + a kernel BUG - I've tried regular pwrite(),
> xfs_rtcp and mmap() + write to address.  Not a new bug either - happens just
> the same with v4.4.  Happens with both PMEM and BRD, and has no relationship
> to whether I'm using DAX or not.
>
> Does it work for this patch to go in as-is since it fixes an immediate OOPS
> with raw block devices + DAX, and when RT devices are alive again I'll figure
> out how to make them work too?

Can we step back and be clear about which lookups should be coming
from get_blocks().  Which ones are critical vs ones we just
opportunistically lookup for a debug print.

Right now xfs and ext4 are basically disagreeing on whether
get_blocks() reliably sets ->bh_bdev, and checking for a raw
block-device inode in dax_clear_blocks() does not make sense.  So this
all seems a bit confused.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-29 18:28     ` Ross Zwisler
  2016-01-29 23:34       ` Ross Zwisler
@ 2016-01-30  5:28       ` Matthew Wilcox
  2016-01-30  6:01         ` Dan Williams
  2016-02-01 14:51         ` Jan Kara
  1 sibling, 2 replies; 66+ messages in thread
From: Matthew Wilcox @ 2016-01-30  5:28 UTC (permalink / raw)
  To: Ross Zwisler, Christoph Hellwig, linux-kernel, Alexander Viro,
	Andrew Morton, Dan Williams, Dave Chinner, Jan Kara,
	linux-fsdevel, linux-nvdimm

On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
> I guess I need to go off and understand if we can have DAX mappings on such a
> device.  If we can, we may have a problem - we can get the block_device from
> get_block() in I/O path and the various fault paths, but we don't have access
> to get_block() when flushing via dax_writeback_mapping_range().  We avoid
> needing it the normal case by storing the sector results from get_block() in
> the radix tree.

I think we're doing it wrong by storing the sector in the radix tree; we'd
really need to store both the sector and the bdev which is too much data.

If we store the PFN of the underlying page instead, we don't have this
problem.  Instead, we have a different problem; of the device going
away under us.  I'm trying to find the code which tears down PTEs when
the device goes away, and I'm not seeing it.  What do we do about user
mappings of the device?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-30  5:28       ` Matthew Wilcox
@ 2016-01-30  6:01         ` Dan Williams
  2016-01-30  7:08           ` Jared Hulbert
  2016-01-31  2:32           ` Matthew Wilcox
  2016-02-01 14:51         ` Jan Kara
  1 sibling, 2 replies; 66+ messages in thread
From: Dan Williams @ 2016-01-30  6:01 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Ross Zwisler, Christoph Hellwig, linux-kernel, Alexander Viro,
	Andrew Morton, Dave Chinner, Jan Kara, linux-fsdevel,
	linux-nvdimm

On Fri, Jan 29, 2016 at 9:28 PM, Matthew Wilcox <willy@linux.intel.com> wrote:
> On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
>> I guess I need to go off and understand if we can have DAX mappings on such a
>> device.  If we can, we may have a problem - we can get the block_device from
>> get_block() in I/O path and the various fault paths, but we don't have access
>> to get_block() when flushing via dax_writeback_mapping_range().  We avoid
>> needing it the normal case by storing the sector results from get_block() in
>> the radix tree.
>
> I think we're doing it wrong by storing the sector in the radix tree; we'd
> really need to store both the sector and the bdev which is too much data.
>
> If we store the PFN of the underlying page instead, we don't have this
> problem.  Instead, we have a different problem; of the device going
> away under us.  I'm trying to find the code which tears down PTEs when
> the device goes away, and I'm not seeing it.  What do we do about user
> mappings of the device?
>

I deferred the dax tear down code until next cycle as Al rightly
pointed out some needed re-works:

https://lists.01.org/pipermail/linux-nvdimm/2016-January/003995.html

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-30  6:01         ` Dan Williams
@ 2016-01-30  7:08           ` Jared Hulbert
  2016-01-31  2:32           ` Matthew Wilcox
  1 sibling, 0 replies; 66+ messages in thread
From: Jared Hulbert @ 2016-01-30  7:08 UTC (permalink / raw)
  To: Dan Williams
  Cc: Matthew Wilcox, Ross Zwisler, Christoph Hellwig, linux-kernel,
	Alexander Viro, Andrew Morton, Dave Chinner, Jan Kara,
	linux-fsdevel, linux-nvdimm

On Fri, Jan 29, 2016 at 10:01 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Fri, Jan 29, 2016 at 9:28 PM, Matthew Wilcox <willy@linux.intel.com> wrote:
>> On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
>>> I guess I need to go off and understand if we can have DAX mappings on such a
>>> device.  If we can, we may have a problem - we can get the block_device from
>>> get_block() in I/O path and the various fault paths, but we don't have access
>>> to get_block() when flushing via dax_writeback_mapping_range().  We avoid
>>> needing it the normal case by storing the sector results from get_block() in
>>> the radix tree.
>>
>> I think we're doing it wrong by storing the sector in the radix tree; we'd
>> really need to store both the sector and the bdev which is too much data.
>>
>> If we store the PFN of the underlying page instead, we don't have this
>> problem.  Instead, we have a different problem; of the device going
>> away under us.  I'm trying to find the code which tears down PTEs when
>> the device goes away, and I'm not seeing it.  What do we do about user
>> mappings of the device?
>>
>
> I deferred the dax tear down code until next cycle as Al rightly
> pointed out some needed re-works:
>
> https://lists.01.org/pipermail/linux-nvdimm/2016-January/003995.html

If you store sectors in the radix and the device gets removed you
still have to unmap user mappings of PFNs.

So why is the device remove harder with the PFN vs bdev+sector radix
entry?  Either way you need a list of PFNs and their corresponding
PTE's, right?

And are we just talking graceful removal?  Any plans for device failures?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-30  6:01         ` Dan Williams
  2016-01-30  7:08           ` Jared Hulbert
@ 2016-01-31  2:32           ` Matthew Wilcox
  2016-01-31  6:12             ` Ross Zwisler
  2016-02-01 13:44             ` Matthew Wilcox
  1 sibling, 2 replies; 66+ messages in thread
From: Matthew Wilcox @ 2016-01-31  2:32 UTC (permalink / raw)
  To: Dan Williams
  Cc: Ross Zwisler, Christoph Hellwig, linux-kernel, Alexander Viro,
	Andrew Morton, Dave Chinner, Jan Kara, linux-fsdevel,
	linux-nvdimm

On Fri, Jan 29, 2016 at 10:01:13PM -0800, Dan Williams wrote:
> On Fri, Jan 29, 2016 at 9:28 PM, Matthew Wilcox <willy@linux.intel.com> wrote:
> > If we store the PFN of the underlying page instead, we don't have this
> > problem.  Instead, we have a different problem; of the device going
> > away under us.  I'm trying to find the code which tears down PTEs when
> > the device goes away, and I'm not seeing it.  What do we do about user
> > mappings of the device?
> 
> I deferred the dax tear down code until next cycle as Al rightly
> pointed out some needed re-works:
> 
> https://lists.01.org/pipermail/linux-nvdimm/2016-January/003995.html

Thanks; I eventually found it in my email somewhere over the Pacific.

I did probably 70% of the work needed to switch the radix tree over to
storing PFNs instead of sectors.  It seems viable, though it's a big
change from where we are today:

 fs/dax.c                   | 415 +++++++++++++++++++++++----------------------
 include/linux/dax.h        |   3 +-
 include/linux/pfn_t.h      |  33 +++-
 include/linux/radix-tree.h |   9 -
 4 files changed, 236 insertions(+), 224 deletions(-)

I'll try and get that finished off this week.

One concrete and easily-separable piece is that dax_clear_blocks() has
the wrong signature.  It currently takes an inode & block as parameters;
it has no way of finding out the correct block device.  It's only two
callers are filesystems (ext2 and xfs).  Those filesystems should be
passing the block_device instead of the inode.  But without the inode,
we can't convert a block number to a sector number, so we also need
to pass the sector number, not the block number.  It still has type
sector_t, annoyingly.

@@ -63,12 +238,11 @@ static void dax_unmap_atomic(struct block_device *bdev,
  * and hence this means the stack from this point must follow GFP_NOFS
  * semantics for all operations.
  */
-int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
+int dax_clear_blocks(struct block_device *bdev, sector_t sector, long size)
 {
-       struct block_device *bdev = inode->i_sb->s_bdev;
        struct blk_dax_ctl dax = {
-               .sector = block << (inode->i_blkbits - 9),
-               .size = _size,
+               .sector = sector,
+               .size = size,
        };
 
        might_sleep();

but I haven't looked at doing the conversion of xfs or ext2 to use that
new interface.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-31  2:32           ` Matthew Wilcox
@ 2016-01-31  6:12             ` Ross Zwisler
  2016-01-31 10:55               ` Matthew Wilcox
  2016-02-01 13:44             ` Matthew Wilcox
  1 sibling, 1 reply; 66+ messages in thread
From: Ross Zwisler @ 2016-01-31  6:12 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Dan Williams, linux-nvdimm, Dave Chinner, linux-kernel,
	Christoph Hellwig, Alexander Viro, Jan Kara, linux-fsdevel,
	Andrew Morton

> On Jan 30, 2016, at 7:32 PM, Matthew Wilcox <willy@linux.intel.com> wrote:
> 
>> On Fri, Jan 29, 2016 at 10:01:13PM -0800, Dan Williams wrote:
>>> On Fri, Jan 29, 2016 at 9:28 PM, Matthew Wilcox <willy@linux.intel.com> wrote:
>>> If we store the PFN of the underlying page instead, we don't have this
>>> problem.  Instead, we have a different problem; of the device going
>>> away under us.  I'm trying to find the code which tears down PTEs when
>>> the device goes away, and I'm not seeing it.  What do we do about user
>>> mappings of the device?
>> 
>> I deferred the dax tear down code until next cycle as Al rightly
>> pointed out some needed re-works:
>> 
>> https://lists.01.org/pipermail/linux-nvdimm/2016-January/003995.html
> 
> Thanks; I eventually found it in my email somewhere over the Pacific.
> 
> I did probably 70% of the work needed to switch the radix tree over to
> storing PFNs instead of sectors.  It seems viable, though it's a big
> change from where we are today:

At one point I had kaddrs in the radix tree, so I could just pull the addresses out
and flush them.  That would save us a pfn -> kaddrs conversion before flush.

Is there a reason to store pnfs instead of kaddrs in the radix tree?

> 
> fs/dax.c                   | 415 +++++++++++++++++++++++----------------------
> include/linux/dax.h        |   3 +-
> include/linux/pfn_t.h      |  33 +++-
> include/linux/radix-tree.h |   9 -
> 4 files changed, 236 insertions(+), 224 deletions(-)
> 
> I'll try and get that finished off this week.
> 
> One concrete and easily-separable piece is that dax_clear_blocks() has
> the wrong signature.  It currently takes an inode & block as parameters;
> it has no way of finding out the correct block device.  It's only two
> callers are filesystems (ext2 and xfs).  Those filesystems should be
> passing the block_device instead of the inode.  But without the inode,
> we can't convert a block number to a sector number, so we also need
> to pass the sector number, not the block number.  It still has type
> sector_t, annoyingly.
> 
> @@ -63,12 +238,11 @@ static void dax_unmap_atomic(struct block_device *bdev,
>  * and hence this means the stack from this point must follow GFP_NOFS
>  * semantics for all operations.
>  */
> -int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
> +int dax_clear_blocks(struct block_device *bdev, sector_t sector, long size)
> {
> -       struct block_device *bdev = inode->i_sb->s_bdev;
>        struct blk_dax_ctl dax = {
> -               .sector = block << (inode->i_blkbits - 9),
> -               .size = _size,
> +               .sector = sector,
> +               .size = size,
>        };
> 
>        might_sleep();
> 
> but I haven't looked at doing the conversion of xfs or ext2 to use that
> new interface.
> _______________________________________________
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-31  6:12             ` Ross Zwisler
@ 2016-01-31 10:55               ` Matthew Wilcox
  2016-01-31 16:38                 ` Dan Williams
  0 siblings, 1 reply; 66+ messages in thread
From: Matthew Wilcox @ 2016-01-31 10:55 UTC (permalink / raw)
  To: Ross Zwisler
  Cc: Dan Williams, linux-nvdimm, Dave Chinner, linux-kernel,
	Christoph Hellwig, Alexander Viro, Jan Kara, linux-fsdevel,
	Andrew Morton

On Sat, Jan 30, 2016 at 11:12:12PM -0700, Ross Zwisler wrote:
> > I did probably 70% of the work needed to switch the radix tree over to
> > storing PFNs instead of sectors.  It seems viable, though it's a big
> > change from where we are today:
> 
> At one point I had kaddrs in the radix tree, so I could just pull the addresses out
> and flush them.  That would save us a pfn -> kaddrs conversion before flush.
> 
> Is there a reason to store pnfs instead of kaddrs in the radix tree?

Once ARM, MIPS and SPARC get supported, they're going to need temporary
kernel addresses assigned to PFNs rather than permanent ones.  Also,
it'll be easier for teardown to delete PFNs associated with a particular
device than kaddrs associated with a particular device.  And it lets
us support more persistent memory on a 32-bit machine (also on a 64-bit
machine, but that's mostly theoretical)

+/*
+ * DAX uses the 'exceptional' entries to store PFNs in the radix tree.
+ * Bit 0 is clear (the radix tree uses this for its own purposes).  Bit
+ * 1 is set (to indicate an exceptional entry).  Bits 2 & 3 are PFN_DEV
+ * and PFN_MAP.  The top two bits denote the size of the entry (PTE, PMD,
+ * PUD, one reserved).  That leaves us 26 bits on 32-bit systems and 58
+ * bits on 64-bit systems, able to address 256GB and 1024EB respectively.
+ */

It's also pretty cheap to look up the kaddr from the pfn, at least on
64-bit architectures without cache aliasing problems:

+static void *dax_map_pfn(pfn_t pfn, unsigned long index)
+{
+       preempt_disable();
+       pagefault_disable();
+       return pfn_to_kaddr(pfn_t_to_pfn(pfn));
+}
+
+static void dax_unmap_pfn(void *vaddr)
+{
+       pagefault_enable();
+       preempt_enable();
+}

32-bit x86 is going to want to do something similar to
iomap_atomic_prot_pfn().  ARM/SPARC/MIPS will want something in the
kmap_atomic family.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-31 10:55               ` Matthew Wilcox
@ 2016-01-31 16:38                 ` Dan Williams
  2016-01-31 18:07                   ` Matthew Wilcox
  0 siblings, 1 reply; 66+ messages in thread
From: Dan Williams @ 2016-01-31 16:38 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Ross Zwisler, linux-nvdimm, Dave Chinner, linux-kernel,
	Christoph Hellwig, Alexander Viro, Jan Kara, linux-fsdevel,
	Andrew Morton

On Sun, Jan 31, 2016 at 2:55 AM, Matthew Wilcox <willy@linux.intel.com> wrote:
> On Sat, Jan 30, 2016 at 11:12:12PM -0700, Ross Zwisler wrote:
>> > I did probably 70% of the work needed to switch the radix tree over to
>> > storing PFNs instead of sectors.  It seems viable, though it's a big
>> > change from where we are today:
>>
>> At one point I had kaddrs in the radix tree, so I could just pull the addresses out
>> and flush them.  That would save us a pfn -> kaddrs conversion before flush.
>>
>> Is there a reason to store pnfs instead of kaddrs in the radix tree?
>
> Once ARM, MIPS and SPARC get supported, they're going to need temporary
> kernel addresses assigned to PFNs rather than permanent ones.  Also,
> it'll be easier for teardown to delete PFNs associated with a particular
> device than kaddrs associated with a particular device.  And it lets
> us support more persistent memory on a 32-bit machine (also on a 64-bit
> machine, but that's mostly theoretical)
>
> +/*
> + * DAX uses the 'exceptional' entries to store PFNs in the radix tree.
> + * Bit 0 is clear (the radix tree uses this for its own purposes).  Bit
> + * 1 is set (to indicate an exceptional entry).  Bits 2 & 3 are PFN_DEV
> + * and PFN_MAP.  The top two bits denote the size of the entry (PTE, PMD,
> + * PUD, one reserved).  That leaves us 26 bits on 32-bit systems and 58
> + * bits on 64-bit systems, able to address 256GB and 1024EB respectively.
> + */
>
> It's also pretty cheap to look up the kaddr from the pfn, at least on
> 64-bit architectures without cache aliasing problems:
>
> +static void *dax_map_pfn(pfn_t pfn, unsigned long index)
> +{
> +       preempt_disable();
> +       pagefault_disable();
> +       return pfn_to_kaddr(pfn_t_to_pfn(pfn));

pfn_to_kaddr() assumes persistent memory is direct mapped which is not
always the case.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-31 16:38                 ` Dan Williams
@ 2016-01-31 18:07                   ` Matthew Wilcox
  2016-01-31 18:18                     ` Dan Williams
  2016-01-31 19:51                     ` Dan Williams
  0 siblings, 2 replies; 66+ messages in thread
From: Matthew Wilcox @ 2016-01-31 18:07 UTC (permalink / raw)
  To: Dan Williams
  Cc: Ross Zwisler, linux-nvdimm, Dave Chinner, linux-kernel,
	Christoph Hellwig, Alexander Viro, Jan Kara, linux-fsdevel,
	Andrew Morton

On Sun, Jan 31, 2016 at 08:38:20AM -0800, Dan Williams wrote:
> On Sun, Jan 31, 2016 at 2:55 AM, Matthew Wilcox <willy@linux.intel.com> wrote:
> > On Sat, Jan 30, 2016 at 11:12:12PM -0700, Ross Zwisler wrote:
> >> Is there a reason to store pnfs instead of kaddrs in the radix tree?
> >
> > Once ARM, MIPS and SPARC get supported, they're going to need temporary
> > kernel addresses assigned to PFNs rather than permanent ones.  Also,
> > it'll be easier for teardown to delete PFNs associated with a particular
> > device than kaddrs associated with a particular device.  And it lets
> > us support more persistent memory on a 32-bit machine (also on a 64-bit
> > machine, but that's mostly theoretical)
> >
> > +/*
> > + * DAX uses the 'exceptional' entries to store PFNs in the radix tree.
> > + * Bit 0 is clear (the radix tree uses this for its own purposes).  Bit
> > + * 1 is set (to indicate an exceptional entry).  Bits 2 & 3 are PFN_DEV
> > + * and PFN_MAP.  The top two bits denote the size of the entry (PTE, PMD,
> > + * PUD, one reserved).  That leaves us 26 bits on 32-bit systems and 58
> > + * bits on 64-bit systems, able to address 256GB and 1024EB respectively.
> > + */
> >
> > It's also pretty cheap to look up the kaddr from the pfn, at least on
> > 64-bit architectures without cache aliasing problems:
> >
> > +static void *dax_map_pfn(pfn_t pfn, unsigned long index)
> > +{
> > +       preempt_disable();
> > +       pagefault_disable();
> > +       return pfn_to_kaddr(pfn_t_to_pfn(pfn));
> 
> pfn_to_kaddr() assumes persistent memory is direct mapped which is not
> always the case.

Yes.  This is just the default implementation of dax_map_pfn() which works
for most situations.  We can introduce more complex implementations of
dax_map_pfn() as necessary.  You make another excellent point for why
we should store PFNs in the radix tree instead of kaddrs :-)

One option that I've been looking at (primarily for x86-32) is
having an rbtree of PFN ranges that drivers add to when they register
peristent memory.  That would let us use the io_mapping_create_wc() /
io_mapping_map_atomic_wc() API.  But having great support for persistent
memory with 32-bit x86 kernels is very very low on my priority list.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-31 18:07                   ` Matthew Wilcox
@ 2016-01-31 18:18                     ` Dan Williams
  2016-01-31 18:27                       ` Matthew Wilcox
  2016-01-31 19:51                     ` Dan Williams
  1 sibling, 1 reply; 66+ messages in thread
From: Dan Williams @ 2016-01-31 18:18 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Ross Zwisler, linux-nvdimm, Dave Chinner, linux-kernel,
	Christoph Hellwig, Alexander Viro, Jan Kara, linux-fsdevel,
	Andrew Morton

On Sun, Jan 31, 2016 at 10:07 AM, Matthew Wilcox <willy@linux.intel.com> wrote:
> On Sun, Jan 31, 2016 at 08:38:20AM -0800, Dan Williams wrote:
>> On Sun, Jan 31, 2016 at 2:55 AM, Matthew Wilcox <willy@linux.intel.com> wrote:
>> > On Sat, Jan 30, 2016 at 11:12:12PM -0700, Ross Zwisler wrote:
>> >> Is there a reason to store pnfs instead of kaddrs in the radix tree?
>> >
>> > Once ARM, MIPS and SPARC get supported, they're going to need temporary
>> > kernel addresses assigned to PFNs rather than permanent ones.  Also,
>> > it'll be easier for teardown to delete PFNs associated with a particular
>> > device than kaddrs associated with a particular device.  And it lets
>> > us support more persistent memory on a 32-bit machine (also on a 64-bit
>> > machine, but that's mostly theoretical)
>> >
>> > +/*
>> > + * DAX uses the 'exceptional' entries to store PFNs in the radix tree.
>> > + * Bit 0 is clear (the radix tree uses this for its own purposes).  Bit
>> > + * 1 is set (to indicate an exceptional entry).  Bits 2 & 3 are PFN_DEV
>> > + * and PFN_MAP.  The top two bits denote the size of the entry (PTE, PMD,
>> > + * PUD, one reserved).  That leaves us 26 bits on 32-bit systems and 58
>> > + * bits on 64-bit systems, able to address 256GB and 1024EB respectively.
>> > + */
>> >
>> > It's also pretty cheap to look up the kaddr from the pfn, at least on
>> > 64-bit architectures without cache aliasing problems:
>> >
>> > +static void *dax_map_pfn(pfn_t pfn, unsigned long index)
>> > +{
>> > +       preempt_disable();
>> > +       pagefault_disable();
>> > +       return pfn_to_kaddr(pfn_t_to_pfn(pfn));
>>
>> pfn_to_kaddr() assumes persistent memory is direct mapped which is not
>> always the case.
>
> Yes.  This is just the default implementation of dax_map_pfn() which works
> for most situations.  We can introduce more complex implementations of
> dax_map_pfn() as necessary.  You make another excellent point for why
> we should store PFNs in the radix tree instead of kaddrs :-)

How much complexity do we want to add in support of an fsync/msync
mechanism that is not the recommended way to use DAX?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-31 18:18                     ` Dan Williams
@ 2016-01-31 18:27                       ` Matthew Wilcox
  2016-01-31 18:50                         ` Dan Williams
  0 siblings, 1 reply; 66+ messages in thread
From: Matthew Wilcox @ 2016-01-31 18:27 UTC (permalink / raw)
  To: Dan Williams
  Cc: Ross Zwisler, linux-nvdimm, Dave Chinner, linux-kernel,
	Christoph Hellwig, Alexander Viro, Jan Kara, linux-fsdevel,
	Andrew Morton

On Sun, Jan 31, 2016 at 10:18:46AM -0800, Dan Williams wrote:
> On Sun, Jan 31, 2016 at 10:07 AM, Matthew Wilcox <willy@linux.intel.com> wrote:
> > Yes.  This is just the default implementation of dax_map_pfn() which works
> > for most situations.  We can introduce more complex implementations of
> > dax_map_pfn() as necessary.  You make another excellent point for why
> > we should store PFNs in the radix tree instead of kaddrs :-)
> 
> How much complexity do we want to add in support of an fsync/msync
> mechanism that is not the recommended way to use DAX?

It actually makes the dax_io path much, much simpler.  And it's not
primarily about fixing fsync/msync.  It also makes the fault path cheaper
in the case where we're refaulting a page that's already been faulted
by another process (or was previously faulted by this process and now
needs to be faulted at a different address).

And it fixes the problem with filesystems that use multiple block_devices.
It also makes DAX much less reliant on buffer heads, which is good for
the problem that Jared raised where he doesn't have a block_device in
an embedded system.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-31 18:27                       ` Matthew Wilcox
@ 2016-01-31 18:50                         ` Dan Williams
  0 siblings, 0 replies; 66+ messages in thread
From: Dan Williams @ 2016-01-31 18:50 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Ross Zwisler, linux-nvdimm, Dave Chinner, linux-kernel,
	Christoph Hellwig, Alexander Viro, Jan Kara, linux-fsdevel,
	Andrew Morton

On Sun, Jan 31, 2016 at 10:27 AM, Matthew Wilcox <willy@linux.intel.com> wrote:
> On Sun, Jan 31, 2016 at 10:18:46AM -0800, Dan Williams wrote:
>> On Sun, Jan 31, 2016 at 10:07 AM, Matthew Wilcox <willy@linux.intel.com> wrote:
>> > Yes.  This is just the default implementation of dax_map_pfn() which works
>> > for most situations.  We can introduce more complex implementations of
>> > dax_map_pfn() as necessary.  You make another excellent point for why
>> > we should store PFNs in the radix tree instead of kaddrs :-)
>>
>> How much complexity do we want to add in support of an fsync/msync
>> mechanism that is not the recommended way to use DAX?
>
> It actually makes the dax_io path much, much simpler.  And it's not
> primarily about fixing fsync/msync.  It also makes the fault path cheaper
> in the case where we're refaulting a page that's already been faulted
> by another process (or was previously faulted by this process and now
> needs to be faulted at a different address).
>
> And it fixes the problem with filesystems that use multiple block_devices.
> It also makes DAX much less reliant on buffer heads, which is good for
> the problem that Jared raised where he doesn't have a block_device in
> an embedded system.

Oh I thought we were talking about what goes in the radix.  Sure,
de-emphasizing the usage of a block_device throughout the dax
implementation is interesting.  It also has some synergy with the
LSF/MM topic I'm writing up "pmem as storage device vs pmem as
memory".

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-31 18:07                   ` Matthew Wilcox
  2016-01-31 18:18                     ` Dan Williams
@ 2016-01-31 19:51                     ` Dan Williams
  1 sibling, 0 replies; 66+ messages in thread
From: Dan Williams @ 2016-01-31 19:51 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Ross Zwisler, linux-nvdimm, Dave Chinner, linux-kernel,
	Christoph Hellwig, Alexander Viro, Jan Kara, linux-fsdevel,
	Andrew Morton

On Sun, Jan 31, 2016 at 10:07 AM, Matthew Wilcox <willy@linux.intel.com> wrote:
> One option that I've been looking at (primarily for x86-32) is
> having an rbtree of PFN ranges that drivers add to when they register
> peristent memory.

On this specific point we do already have find_dev_pagemap().

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-29 23:34       ` Ross Zwisler
  2016-01-30  0:18         ` Dan Williams
@ 2016-01-31 22:44         ` Dave Chinner
  1 sibling, 0 replies; 66+ messages in thread
From: Dave Chinner @ 2016-01-31 22:44 UTC (permalink / raw)
  To: Ross Zwisler, Christoph Hellwig, linux-kernel, Alexander Viro,
	Andrew Morton, Dan Williams, Jan Kara, Matthew Wilcox,
	linux-fsdevel, linux-nvdimm

On Fri, Jan 29, 2016 at 04:34:30PM -0700, Ross Zwisler wrote:
> On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
> > On Thu, Jan 28, 2016 at 01:38:58PM -0800, Christoph Hellwig wrote:
> > > On Thu, Jan 28, 2016 at 12:35:04PM -0700, Ross Zwisler wrote:
> > > > There are a number of places in dax.c that look up the struct block_device
> > > > associated with an inode.  Previously this was done by just using
> > > > inode->i_sb->s_bdev.  This is correct for inodes that exist within the
> > > > filesystems supported by DAX (ext2, ext4 & XFS), but when running DAX
> > > > against raw block devices this value is NULL.  This causes NULL pointer
> > > > dereferences when these block_device pointers are used.
> > > 
> > > It's also wrong for an XFS file system with a RT device..
> > > 
> > > > +#define DAX_BDEV(inode) (S_ISBLK(inode->i_mode) ? I_BDEV(inode) \
> > > > +				: inode->i_sb->s_bdev)
> > > 
> > > .. but this isn't going to fix it.  You must use a bdev returned by
> > > get_blocks or a similar file system method.
> > 
> > I guess I need to go off and understand if we can have DAX mappings on such a
> > device.  If we can, we may have a problem - we can get the block_device from
> > get_block() in I/O path and the various fault paths, but we don't have access
> > to get_block() when flushing via dax_writeback_mapping_range().  We avoid
> > needing it the normal case by storing the sector results from get_block() in
> > the radix tree.
> > 
> > /me is off to play with RT devices...
> 
> Well, RT devices are completely broken as far as I can see.  I've reported the
> breakage to the XFS list.  Anything I do that triggers a RT block allocation
> in XFS causes a lockdep splat + a kernel BUG - I've tried regular pwrite(),

Set CONFIG_XFS_DEBUG=n (assert failure that can be ignored causing
the bug, and lockdep simply has an annotation problem) and it should
work.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-31  2:32           ` Matthew Wilcox
  2016-01-31  6:12             ` Ross Zwisler
@ 2016-02-01 13:44             ` Matthew Wilcox
  1 sibling, 0 replies; 66+ messages in thread
From: Matthew Wilcox @ 2016-02-01 13:44 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-nvdimm, Dave Chinner, linux-kernel, Christoph Hellwig,
	Alexander Viro, Jan Kara, linux-fsdevel, Andrew Morton

On Sun, Jan 31, 2016 at 01:32:47PM +1100, Matthew Wilcox wrote:
> On Fri, Jan 29, 2016 at 10:01:13PM -0800, Dan Williams wrote:
> > On Fri, Jan 29, 2016 at 9:28 PM, Matthew Wilcox <willy@linux.intel.com> wrote:
> > > If we store the PFN of the underlying page instead, we don't have this
> > > problem.  Instead, we have a different problem; of the device going
> > > away under us.  I'm trying to find the code which tears down PTEs when
> > > the device goes away, and I'm not seeing it.  What do we do about user
> > > mappings of the device?
> > 
> > I deferred the dax tear down code until next cycle as Al rightly
> > pointed out some needed re-works:
> > 
> > https://lists.01.org/pipermail/linux-nvdimm/2016-January/003995.html
> 
> Thanks; I eventually found it in my email somewhere over the Pacific.
> 
> I did probably 70% of the work needed to switch the radix tree over to
> storing PFNs instead of sectors.  It seems viable, though it's a big
> change from where we are today:

70%?!  Hah.  I'd done maybe 50%.  This isn't everything needed; I still
need to write radix_tree_replace().  But it's enough to get a flavour for
where this line of thinking takes us.  I think it ends up being cleaner
code, and possibly better performing.  I also think it points us back
in the direction of wanting an address_space operation to return a PFN
for the radix tree instead of handling buffer_heads directly in dax.c.

Ah well.  Time to sleep ...

>From 0321c30eeb189ad2da8dcc25623419e2ba9c6cee Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Sun, 31 Jan 2016 13:38:21 +1100
Subject: [PATCH] Giant non-compiling mess

Note that clear_pmem needs to be updated to set the needs_wmb() flag.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 fs/dax.c                   | 1127 +++++++++++++++++++++-----------------------
 include/linux/dax.h        |    3 +-
 include/linux/pfn_t.h      |   41 +-
 include/linux/radix-tree.h |    9 -
 include/linux/sched.h      |    1 +
 5 files changed, 565 insertions(+), 616 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index e9701d6..38b92b5 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -25,12 +25,132 @@
 #include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/pagevec.h>
+#include <linux/pfn_t.h>
 #include <linux/pmem.h>
+#include <linux/preempt.h>
 #include <linux/sched.h>
+#include <linux/sizes.h>
 #include <linux/uio.h>
 #include <linux/vmstat.h>
-#include <linux/pfn_t.h>
-#include <linux/sizes.h>
+
+/*
+ * 32-bit architectures want to override this to actually map/unmap
+ * their persistent memory.  ARM, SPARC & MIPS also want to override it
+ * to map the PFN at an address that uses the same cachelines as the
+ * userspace mapping (that's what 'index' is for)
+ */
+static void *dax_map_pfn(pfn_t pfn, unsigned long index)
+{
+	if (is_bad_pfn_t(pfn))
+		return NULL;
+	preempt_disable();
+	pagefault_disable();
+	return pfn_to_kaddr(pfn_t_to_pfn(pfn));
+}
+
+static void dax_unmap_pfn(void *addr)
+{
+	pagefault_enable();
+	preempt_enable();
+}
+
+/*
+ * DAX uses the 'exceptional' entries to store PFNs in the radix tree.
+ * Bit 0 is clear (the radix tree uses this for its own purposes).  Bit
+ * 1 is set (to indicate an exceptional entry).  Bits 2 & 3 are PFN_DEV
+ * and PFN_MAP.  The top two bits denote the size of the entry (PTE, PMD,
+ * PUD, one reserved).  That leaves us 26 bits on 32-bit systems and 58
+ * bits on 64-bit systems, able to address 256GB and 1024EB respectively.
+ */
+#define RADIX_DAX_SIZE_MASK	(0x3UL << (BITS_PER_LONG - 2))
+#define RADIX_TREE_MASK		(RADIX_TREE_INDIRECT_PTR | \
+				 RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_PFN_MASK	(~(RADIX_DAX_SIZE_MASK | RADIX_TREE_MASK))
+#define RADIX_DAX_SHIFT		4
+#define RADIX_DAX_PTE		(0x0UL << (BITS_PER_LONG - 2))
+#define RADIX_DAX_PMD		(0x1UL << (BITS_PER_LONG - 2))
+#define RADIX_DAX_PUD		(0x2UL << (BITS_PER_LONG - 2))
+#define RADIX_DAX_SIZE(entry)	((unsigned long)entry & RADIX_DAX_SIZE_MASK)
+#define RADIX_DAX_ENTRY(pfn, size) \
+	((void *)((pfn_t_to_pfn(pfn) << RADIX_DAX_SHIFT) | size))
+
+/* The 'colour' (ie low bits) within a PMD/PUD of a page offset. */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_CACHE_SHIFT) - 1)
+#define PG_PUD_COLOUR	((PUD_SIZE >> PAGE_CACHE_SHIFT) - 1)
+
+static pfn_t radix_to_pfn_t(void *entry, pgoff_t index)
+{
+	pfn_t pfn = { .val = (unsigned long)entry & RADIX_DAX_PFN_MASK };
+	unsigned offset = 0;
+
+	if (RADIX_DAX_SIZE(entry) == RADIX_DAX_PMD)
+		offset = index & PG_PMD_COLOUR;
+	else if (RADIX_DAX_SIZE(entry) == RADIX_DAX_PUD)
+		offset = index & PG_PUD_COLOUR;
+
+	return pfn_t_add(pfn, offset);
+}
+
+static void *pfn_to_radix(pfn_t pfn, unsigned long size)
+{
+	unsigned long value = pfn.val;
+	BUG_ON(value & RADIX_DAX_PFN_MASK);
+	return (void *)(value | size);
+}
+
+static unsigned size_to_order(unsigned long size)
+{
+	switch (size) {
+	case RADIX_DAX_PTE: return 0;
+	case RADIX_DAX_PMD: return PMD_SHIFT - PAGE_SHIFT;
+	case RADIX_DAX_PUD: return PUD_SHIFT - PAGE_SHIFT;
+	}
+	BUG();
+}
+
+static unsigned size_to_bytes(unsigned long size)
+{
+	switch (size) {
+	case RADIX_DAX_PTE: return PAGE_CACHE_SIZE;
+	case RADIX_DAX_PMD: return PMD_SIZE;
+	case RADIX_DAX_PUD: return PUD_SIZE;
+	}
+	BUG();
+}
+
+static int dax_add_radix_entry(struct address_space *mapping, pgoff_t index,
+				pfn_t pfn, unsigned long size, bool dirty)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	int count = 0;
+	void *entry;
+	unsigned order = size_to_order(size);
+
+	if (dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = radix_tree_lookup(page_tree, index);
+	if (!radix_tree_exceptional_entry(entry)) {
+		count = -EEXIST;
+		goto unlock;
+	} else if (entry) {
+		if (size <= RADIX_DAX_SIZE(entry))
+			goto dirty;
+	}
+	count = radix_tree_replace(page_tree, index, order,
+					pfn_to_radix(pfn, size));
+	if (count < 0)
+		goto unlock;
+
+	mapping->nrexceptional -= (count - 1);
+ dirty:
+	if (dirty)
+		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return count;
+}
 
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
@@ -58,17 +178,235 @@ static void dax_unmap_atomic(struct block_device *bdev,
 	blk_queue_exit(bdev->bd_queue);
 }
 
+static sector_t to_sector(const struct buffer_head *bh,
+		const struct inode *inode)
+{
+	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+
+	return sector;
+}
+
+static bool buffer_written(struct buffer_head *bh)
+{
+	return buffer_mapped(bh) && !buffer_unwritten(bh);
+}
+
+static int dax_replace_hole(struct address_space *mapping, pgoff_t index,
+				unsigned long size, pfn_t pfn)
+{
+	unsigned order = size_to_order(size);
+	int i, error;
+
+	for (i = 0; i < (1 << order); i++) {
+		struct page *page;
+ repeat:
+		page = find_get_entry(mapping, index + i);
+		if (!page || radix_tree_exceptional_entry(page))
+			continue;
+
+		lock_page(page);
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		delete_from_page_cache(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	/*
+	 * Somebody else could look in the radix tree and find nothing.
+	 * It's harmless though; they'll find the correct pfn by calling
+	 * the filesystem.
+	 */
+	error = dax_add_radix_entry(mapping, index, pfn, size, true);
+
+	unmap_mapping_range(mapping, index << PAGE_CACHE_SHIFT,
+				PAGE_CACHE_SIZE, 0);
+
+	return error;
+}
+
+static int dax_add_pfn_sized(struct address_space *mapping, pgoff_t index,
+				size_t size, bool write, pfn_t pfn,
+				unsigned long radix_size, unsigned entry_size)
+{
+	int error;
+	bool report = true;
+
+	while (size >= entry_size) {
+		error = dax_add_radix_entry(mapping, index, pfn,
+						radix_size, write);
+		if (error == -EEXIST)
+			error = dax_replace_hole(mapping, index, radix_size,
+						pfn);
+		if (error)
+			break;
+		report = false;
+
+		size -= entry_size;
+		pfn = pfn_t_add(pfn, entry_size / PAGE_CACHE_SIZE);
+		index += entry_size / PAGE_CACHE_SIZE;
+	}
+
+	return report ? error : 0;
+}
+
+static int dax_add_pfn_entries(struct address_space *mapping, pgoff_t index,
+				size_t size, bool write, pfn_t pfn)
+{
+	int error = 0;
+	int called = 0;
+	size_t max;
+
+	max = (PG_PMD_COLOUR + 1 - index) << PAGE_CACHE_SHIFT;
+	if (index & PG_PMD_COLOUR) {
+		error = dax_add_pfn_sized(mapping, index, min(size, max),
+				write, pfn, RADIX_DAX_PTE, PAGE_CACHE_SIZE);
+		called++;
+	}
+	size -= min(size, max);
+	if (error || !size)
+		goto out;
+	index += max >> PAGE_CACHE_SHIFT; 
+
+	max = (PG_PUD_COLOUR + 1 - index) << PAGE_CACHE_SHIFT;
+	if (index & PG_PUD_COLOUR) {
+		error = dax_add_pfn_sized(mapping, index, min(size, max),
+				write, pfn, RADIX_DAX_PMD, PMD_SIZE);
+		called++;
+	}
+	size -= min(size, max);
+	if (error || !size)
+		goto out;
+	index += max >> PMD_SHIFT; 
+
+	error = dax_add_pfn_sized(mapping, index, size,
+				write, pfn, RADIX_DAX_PUD, PUD_SIZE);
+	called++;
+	index += size >> PUD_SHIFT;
+	size = size & ~PMD_MASK;
+	if (error || !size)
+		goto out;
+
+	error = dax_add_pfn_sized(mapping, index, size,
+				write, pfn, RADIX_DAX_PMD, PMD_SIZE);
+	index += size >> PMD_SHIFT;
+	size = size & ~PAGE_CACHE_MASK;
+	if (error || !size)
+		return 0;
+
+	error = dax_add_pfn_sized(mapping, index, size,
+				write, pfn, RADIX_DAX_PTE, PAGE_CACHE_SIZE);
+ out:
+	if (called > 1)
+		error = 0;
+	return error;
+}
+
+/*
+ * Populate the page cache with as many pfns as the filesystem is willing
+ * to tell us about from a single call to get_block, starting at @index and
+ * continuing up to @max bytes.
+ */
+static int dax_create_pfns(struct address_space *mapping, pgoff_t index,
+				unsigned max, bool write, pfn_t *pfn,
+				get_block_t get_block, struct buffer_head *bh)
+{
+	struct inode *inode = mapping->host;
+	unsigned blkbits = inode->i_blkbits;
+	sector_t block = index << (PAGE_CACHE_SHIFT - blkbits);
+	struct blk_dax_ctl dax;
+	int error, result = 0;
+
+	bh->b_size = max;
+	bh->b_state = 0;
+	error = get_block(inode, block, bh, write);
+	if (error)
+		goto error;
+
+	if (!buffer_written(bh))
+		goto hole;
+
+	dax.sector = to_sector(bh, inode);
+	dax.size = bh->b_size;
+	error = dax_map_atomic(bh->b_bdev, &dax);
+	if (error < 0)
+		goto error;
+
+	/*
+	 * We may be about to write data to it, but now it's allocated,
+	 * and another thread will be able to find it in the page cache,
+	 * so we have to zero it otherwise there's a write vs fault race
+	 * that could expose stale data to an application.
+	 */
+	if (buffer_unwritten(bh) || buffer_new(bh)) {
+		clear_pmem(dax.addr, bh->b_size);
+		result = 1;
+	}
+
+	dax_unmap_atomic(bh->b_bdev, &dax);
+
+	error = dax_add_pfn_entries(mapping, index, bh->b_size,
+					write, dax.pfn);
+
+	/*
+	 * Even if we had an error adding the PFN to the radix tree,
+	 * the PFN is still good, so return it.
+	 */
+	*pfn = dax.pfn;
+	return error ? error : result;
+
+ hole:
+ error:
+	*pfn = bad_pfn_t;
+	return error;
+}
+
+/*
+ * Returns either a negative errno, 0 if no allocation had to be performed,
+ * or 1 if the filesystem allocated a block.
+ */
+static int dax_get_pfn(struct address_space *mapping, pgoff_t index,
+				size_t len, bool write, pfn_t *pfn,
+				get_block_t get_block, struct buffer_head *bh)
+{
+	void *entry;
+
+	rcu_read_lock();
+	entry = radix_tree_lookup(&mapping->page_tree, index);
+	rcu_read_unlock();
+
+	if (radix_tree_exceptional_entry(entry)) {
+		*pfn = radix_to_pfn_t(entry, index);
+		return 0;
+	}
+
+	if (entry) {
+		if (write)
+			return dax_create_pfns(mapping, index, len, true, pfn,
+						get_block, bh);
+	} else {
+		return dax_create_pfns(mapping, index, len, write, pfn,
+						get_block, bh);
+	}
+
+	*pfn = bad_pfn_t;
+	return 0;
+}
+
 /*
  * dax_clear_blocks() is called from within transaction context from XFS,
  * and hence this means the stack from this point must follow GFP_NOFS
  * semantics for all operations.
  */
-int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
+int dax_clear_blocks(struct block_device *bdev, sector_t sector, long size)
 {
-	struct block_device *bdev = inode->i_sb->s_bdev;
 	struct blk_dax_ctl dax = {
-		.sector = block << (inode->i_blkbits - 9),
-		.size = _size,
+		.sector = sector,
+		.size = size,
 	};
 
 	might_sleep();
@@ -91,133 +429,52 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
 }
 EXPORT_SYMBOL_GPL(dax_clear_blocks);
 
-/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
-static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
-		loff_t pos, loff_t end)
-{
-	loff_t final = end - pos + first; /* The final byte of the buffer */
-
-	if (first > 0)
-		clear_pmem(addr, first);
-	if (final < size)
-		clear_pmem(addr + final, size - final);
-}
-
-static bool buffer_written(struct buffer_head *bh)
-{
-	return buffer_mapped(bh) && !buffer_unwritten(bh);
-}
-
-/*
- * When ext4 encounters a hole, it returns without modifying the buffer_head
- * which means that we can't trust b_size.  To cope with this, we set b_state
- * to 0 before calling get_block and, if any bit is set, we know we can trust
- * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
- * and would save us time calling get_block repeatedly.
- */
-static bool buffer_size_valid(struct buffer_head *bh)
-{
-	return bh->b_state != 0;
-}
-
-
-static sector_t to_sector(const struct buffer_head *bh,
-		const struct inode *inode)
-{
-	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
-
-	return sector;
-}
-
 static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
-		      loff_t start, loff_t end, get_block_t get_block,
-		      struct buffer_head *bh)
+				loff_t start, loff_t end,
+				get_block_t get_block, struct buffer_head *bh)
 {
-	loff_t pos = start, max = start, bh_max = start;
-	bool hole = false, need_wmb = false;
-	struct block_device *bdev = NULL;
-	int rw = iov_iter_rw(iter), rc;
-	long map_len = 0;
-	struct blk_dax_ctl dax = {
-		.addr = (void __pmem *) ERR_PTR(-EIO),
-	};
+	loff_t pos = start;
+	int error = 0;
+	const int rw = iov_iter_rw(iter);
 
 	if (rw == READ)
 		end = min(end, i_size_read(inode));
 
-	while (pos < end) {
-		size_t len;
-		if (pos == max) {
-			unsigned blkbits = inode->i_blkbits;
-			long page = pos >> PAGE_SHIFT;
-			sector_t block = page << (PAGE_SHIFT - blkbits);
-			unsigned first = pos - (block << blkbits);
-			long size;
-
-			if (pos == bh_max) {
-				bh->b_size = PAGE_ALIGN(end - pos);
-				bh->b_state = 0;
-				rc = get_block(inode, block, bh, rw == WRITE);
-				if (rc)
-					break;
-				if (!buffer_size_valid(bh))
-					bh->b_size = 1 << blkbits;
-				bh_max = pos - first + bh->b_size;
-				bdev = bh->b_bdev;
-			} else {
-				unsigned done = bh->b_size -
-						(bh_max - (pos - first));
-				bh->b_blocknr += done >> blkbits;
-				bh->b_size -= done;
-			}
+	while (!error && pos < end) {
+		pgoff_t pgoff = pos >> PAGE_CACHE_SHIFT;
+		unsigned off = pos & ~PAGE_CACHE_MASK;
+		size_t len = end - pos;
+		pfn_t pfn;
+		void __pmem *addr;
 
-			hole = rw == READ && !buffer_written(bh);
-			if (hole) {
-				size = bh->b_size - first;
-			} else {
-				dax_unmap_atomic(bdev, &dax);
-				dax.sector = to_sector(bh, inode);
-				dax.size = bh->b_size;
-				map_len = dax_map_atomic(bdev, &dax);
-				if (map_len < 0) {
-					rc = map_len;
-					break;
-				}
-				if (buffer_unwritten(bh) || buffer_new(bh)) {
-					dax_new_buf(dax.addr, map_len, first,
-							pos, end);
-					need_wmb = true;
-				}
-				dax.addr += first;
-				size = map_len - first;
-			}
-			max = min(pos + size, end);
-		}
+		error = dax_get_pfn(inode->i_mapping, pgoff, len, rw == WRITE,
+						&pfn, get_block, bh);
+		if (error < 0)
+			break;
+		addr = dax_map_pfn(pfn, pgoff) + off;
 
-		if (iov_iter_rw(iter) == WRITE) {
-			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
-			need_wmb = true;
-		} else if (!hole)
-			len = copy_to_iter((void __force *) dax.addr, max - pos,
-					iter);
+		if (len > PAGE_CACHE_SIZE)
+			len = PAGE_CACHE_SIZE;
+
+		if (rw == WRITE) {
+			len = copy_from_iter_pmem(addr, len, iter);
+			current->needs_wmb = true;
+		} else if (addr)
+			len = copy_to_iter((void __force *) addr, len, iter);
 		else
-			len = iov_iter_zero(max - pos, iter);
+			len = iov_iter_zero(len, iter);
+		dax_unmap_pfn(addr - off);
 
-		if (!len) {
-			rc = -EFAULT;
-			break;
-		}
+		if (!len)
+			error = -EFAULT;
 
 		pos += len;
-		if (!IS_ERR(dax.addr))
-			dax.addr += len;
 	}
 
-	if (need_wmb)
+	if (current->needs_wmb)
 		wmb_pmem();
-	dax_unmap_atomic(bdev, &dax);
 
-	return (pos == start) ? rc : pos - start;
+	return (pos == start) ? error : pos - start;
 }
 
 /**
@@ -238,15 +495,14 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
  * is in progress.
  */
 ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
-		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
-		  dio_iodone_t end_io, int flags)
+			struct iov_iter *iter, loff_t pos,
+			get_block_t get_block, dio_iodone_t end_io, int flags)
 {
 	struct buffer_head bh;
 	ssize_t retval = -EINVAL;
 	loff_t end = pos + iov_iter_count(iter);
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
 
 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
 		struct address_space *mapping = inode->i_mapping;
@@ -277,124 +533,26 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 }
 EXPORT_SYMBOL_GPL(dax_do_io);
 
-/*
- * The user has performed a load from a hole in the file.  Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files.  We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
- */
-static int dax_load_hole(struct address_space *mapping, struct page *page,
-							struct vm_fault *vmf)
+static int copy_user_pfn(struct vm_fault *vmf, pfn_t pfn)
 {
-	if (!page)
-		page = find_or_create_page(mapping, vmf->pgoff,
-						vmf->gfp_mask | __GFP_ZERO);
-	if (!page)
-		return VM_FAULT_OOM;
-	vmf->page = page;
-	return VM_FAULT_LOCKED;
-}
+	void *vto, *vfrom;
 
-static int copy_user_bh(struct page *to, struct inode *inode,
-		struct buffer_head *bh, unsigned long vaddr)
-{
-	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = bh->b_size,
-	};
-	struct block_device *bdev = bh->b_bdev;
-	void *vto;
-
-	if (dax_map_atomic(bdev, &dax) < 0)
-		return PTR_ERR(dax.addr);
-	vto = kmap_atomic(to);
-	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
+	vfrom = dax_map_pfn(pfn, vmf->pgoff);
+	vto = kmap_atomic(vmf->cow_page);
+	copy_user_page(vto, vfrom, (unsigned long)vmf->virtual_address,
+			vmf->cow_page);
 	kunmap_atomic(vto);
-	dax_unmap_atomic(bdev, &dax);
+	dax_unmap_pfn(vfrom);
 	return 0;
 }
 
-#define NO_SECTOR -1
-#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))
-
-static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
-		sector_t sector, bool pmd_entry, bool dirty)
-{
-	struct radix_tree_root *page_tree = &mapping->page_tree;
-	pgoff_t pmd_index = DAX_PMD_INDEX(index);
-	int type, error = 0;
-	void *entry;
-
-	WARN_ON_ONCE(pmd_entry && !dirty);
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
-	spin_lock_irq(&mapping->tree_lock);
-
-	entry = radix_tree_lookup(page_tree, pmd_index);
-	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
-		index = pmd_index;
-		goto dirty;
-	}
-
-	entry = radix_tree_lookup(page_tree, index);
-	if (entry) {
-		type = RADIX_DAX_TYPE(entry);
-		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
-					type != RADIX_DAX_PMD)) {
-			error = -EIO;
-			goto unlock;
-		}
-
-		if (!pmd_entry || type == RADIX_DAX_PMD)
-			goto dirty;
-
-		/*
-		 * We only insert dirty PMD entries into the radix tree.  This
-		 * means we don't need to worry about removing a dirty PTE
-		 * entry and inserting a clean PMD entry, thus reducing the
-		 * range we would flush with a follow-up fsync/msync call.
-		 */
-		radix_tree_delete(&mapping->page_tree, index);
-		mapping->nrexceptional--;
-	}
-
-	if (sector == NO_SECTOR) {
-		/*
-		 * This can happen during correct operation if our pfn_mkwrite
-		 * fault raced against a hole punch operation.  If this
-		 * happens the pte that was hole punched will have been
-		 * unmapped and the radix tree entry will have been removed by
-		 * the time we are called, but the call will still happen.  We
-		 * will return all the way up to wp_pfn_shared(), where the
-		 * pte_same() check will fail, eventually causing page fault
-		 * to be retried by the CPU.
-		 */
-		goto unlock;
-	}
-
-	error = radix_tree_insert(page_tree, index,
-			RADIX_DAX_ENTRY(sector, pmd_entry));
-	if (error)
-		goto unlock;
-
-	mapping->nrexceptional++;
- dirty:
-	if (dirty)
-		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
- unlock:
-	spin_unlock_irq(&mapping->tree_lock);
-	return error;
-}
-
 static int dax_writeback_one(struct block_device *bdev,
 		struct address_space *mapping, pgoff_t index, void *entry)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	int type = RADIX_DAX_TYPE(entry);
+	unsigned size = RADIX_DAX_SIZE(entry);
 	struct radix_tree_node *node;
-	struct blk_dax_ctl dax;
+	void __pmem *addr;
 	void **slot;
 	int ret = 0;
 
@@ -412,38 +570,14 @@ static int dax_writeback_one(struct block_device *bdev,
 	/* another fsync thread may have already written back this entry */
 	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
 		goto unlock;
-
-	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
-		ret = -EIO;
-		goto unlock;
-	}
-
-	dax.sector = RADIX_DAX_SECTOR(entry);
-	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
 	spin_unlock_irq(&mapping->tree_lock);
 
-	/*
-	 * We cannot hold tree_lock while calling dax_map_atomic() because it
-	 * eventually calls cond_resched().
-	 */
-	ret = dax_map_atomic(bdev, &dax);
-	if (ret < 0)
-		return ret;
-
-	if (WARN_ON_ONCE(ret < dax.size)) {
-		ret = -EIO;
-		goto unmap;
-	}
-
-	wb_cache_pmem(dax.addr, dax.size);
+	addr = dax_map_pfn(radix_to_pfn_t(entry, index), index);
+	wb_cache_pmem(addr, size_to_bytes(size));
+	dax_unmap_pfn(addr);
 
 	spin_lock_irq(&mapping->tree_lock);
 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
-	spin_unlock_irq(&mapping->tree_lock);
- unmap:
-	dax_unmap_atomic(bdev, &dax);
-	return ret;
-
  unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 	return ret;
@@ -459,27 +593,17 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
 {
 	struct inode *inode = mapping->host;
 	struct block_device *bdev = inode->i_sb->s_bdev;
-	pgoff_t start_index, end_index, pmd_index;
+	pgoff_t start_index, end_index;
 	pgoff_t indices[PAGEVEC_SIZE];
 	struct pagevec pvec;
 	bool done = false;
 	int i, ret = 0;
-	void *entry;
 
 	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
 		return -EIO;
 
 	start_index = start >> PAGE_CACHE_SHIFT;
 	end_index = end >> PAGE_CACHE_SHIFT;
-	pmd_index = DAX_PMD_INDEX(start_index);
-
-	rcu_read_lock();
-	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
-	rcu_read_unlock();
-
-	/* see if the start of our range is covered by a PMD entry */
-	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
-		start_index = pmd_index;
 
 	tag_pages_for_writeback(mapping, start_index, end_index);
 
@@ -509,107 +633,80 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
-			struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	struct address_space *mapping = inode->i_mapping;
-	struct block_device *bdev = bh->b_bdev;
-	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = bh->b_size,
-	};
-	int error;
-
-	if (dax_map_atomic(bdev, &dax) < 0) {
-		error = PTR_ERR(dax.addr);
-		goto out;
-	}
-
-	if (buffer_unwritten(bh) || buffer_new(bh)) {
-		clear_pmem(dax.addr, PAGE_SIZE);
-		wmb_pmem();
-	}
-	dax_unmap_atomic(bdev, &dax);
-
-	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
-			vmf->flags & FAULT_FLAG_WRITE);
-	if (error)
-		goto out;
-
-	error = vm_insert_mixed(vma, vaddr, dax.pfn);
-
- out:
-	return error;
-}
-
 static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			get_block_t get_block, dax_iodone_t complete_unwritten)
 {
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 	struct page *page;
+	pfn_t pfn;
 	struct buffer_head bh;
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	unsigned blkbits = inode->i_blkbits;
-	sector_t block;
 	pgoff_t size;
 	int error;
 	int major = 0;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
 		return VM_FAULT_SIGBUS;
 
 	memset(&bh, 0, sizeof(bh));
-	block = (sector_t)vmf->pgoff << (PAGE_CACHE_SHIFT - blkbits);
-	bh.b_bdev = inode->i_sb->s_bdev;
-	bh.b_size = PAGE_CACHE_SIZE;
 
  repeat:
-	page = find_get_page(mapping, vmf->pgoff);
-	if (page) {
+	page = find_get_entry(mapping, vmf->pgoff);
+	if (radix_tree_exceptional_entry(page)) {
+		pfn = radix_to_pfn_t(page, vmf->pgoff);
+		page = NULL;
+	} else if (!page) {
+		error = dax_create_pfns(mapping, vmf->pgoff, PAGE_CACHE_SIZE,
+				write && !vmf->cow_page, &pfn, get_block, &bh);
+		if (error < 0)
+			goto out;
+
+		if (error) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+			error = 0;
+		}
+	} else {
 		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
 			page_cache_release(page);
 			return VM_FAULT_RETRY;
-		}
-		if (unlikely(page->mapping != mapping)) {
+		} else if (unlikely(page->mapping != mapping)) {
 			unlock_page(page);
 			page_cache_release(page);
 			goto repeat;
 		}
 	}
 
-	error = get_block(inode, block, &bh, 0);
-	if (!error && (bh.b_size < PAGE_CACHE_SIZE))
-		error = -EIO;		/* fs corruption? */
-	if (error)
-		goto unlock_page;
+	if (is_bad_pfn_t(pfn) && !vmf->cow_page) {
+		/*
+		 * Allocating a new page in the file would cause excessive
+		 * storage usage for workloads with sparse files.  We allocate
+		 * a page cache page instead.  We'll kick it out of the page
+		 * cache if it's ever written to, otherwise it will simply
+		 * fall out of the page cache under memory pressure without
+		 * ever having been dirtied.
+		 */
+		if (!page)
+			page = find_or_create_page(mapping, vmf->pgoff,
+						vmf->gfp_mask | __GFP_ZERO);
+		if (!page)
+			return VM_FAULT_OOM;
+		vmf->page = page;
+		return VM_FAULT_LOCKED;
+	}
 
-	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
-		if (vmf->flags & FAULT_FLAG_WRITE) {
-			error = get_block(inode, block, &bh, 1);
-			count_vm_event(PGMAJFAULT);
-			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-			major = VM_FAULT_MAJOR;
-			if (!error && (bh.b_size < PAGE_CACHE_SIZE))
-				error = -EIO;
+	if (vmf->cow_page) {
+		if (is_bad_pfn_t(pfn)) {
+			clear_user_highpage(vmf->cow_page, vaddr);
+		} else {
+			error = copy_user_pfn(vmf, pfn);
 			if (error)
 				goto unlock_page;
-		} else {
-			return dax_load_hole(mapping, page, vmf);
 		}
-	}
-
-	if (vmf->cow_page) {
-		struct page *new_page = vmf->cow_page;
-		if (buffer_written(&bh))
-			error = copy_user_bh(new_page, inode, &bh, vaddr);
-		else
-			clear_user_highpage(new_page, vaddr);
-		if (error)
-			goto unlock_page;
 		vmf->page = page;
 
 		/*
@@ -625,18 +722,10 @@ static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		return VM_FAULT_LOCKED;
 	}
 
-	/* Check we didn't race with a read fault installing a new page */
-	if (!page && major)
-		page = find_lock_page(mapping, vmf->pgoff);
+	if (current->needs_wmb)
+		wmb_pmem();
 
-	if (page) {
-		unmap_mapping_range(mapping, vmf->pgoff << PAGE_CACHE_SHIFT,
-							PAGE_CACHE_SIZE, 0);
-		delete_from_page_cache(page);
-		unlock_page(page);
-		page_cache_release(page);
-		page = NULL;
-	}
+	error = vm_insert_mixed(vma, vaddr, pfn);
 
 	/*
 	 * If we successfully insert the new mapping over an unwritten extent,
@@ -648,12 +737,11 @@ static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 * indicate what the callback should do via the uptodate variable, same
 	 * as for normal BH based IO completions.
 	 */
-	error = dax_insert_mapping(inode, &bh, vma, vmf);
 	if (buffer_unwritten(&bh)) {
 		if (complete_unwritten)
 			complete_unwritten(&bh, !error);
 		else
-			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
+			WARN_ON_ONCE(!write);
 	}
 
  out:
@@ -673,12 +761,6 @@ static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
- * more often than one might expect in the below function.
- */
-#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_CACHE_SHIFT) - 1)
-
 static void __dax_dbg(struct buffer_head *bh, unsigned long address,
 		const char *reason, const char *fn)
 {
@@ -697,98 +779,19 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
 
 #define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")
 
-static int dax_insert_pmd_mapping(struct inode *inode, struct buffer_head *bh,
-			struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	int major = 0;
-	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = PMD_SIZE,
-	};
-	struct block_device *bdev = bh->b_bdev;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
-	unsigned long address = (unsigned long)vmf->virtual_address;
-	long length = dax_map_atomic(bdev, &dax);
-
-	if (length < 0)
-		return VM_FAULT_SIGBUS;
-	if (length < PMD_SIZE) {
-		dax_pmd_dbg(bh, address, "dax-length too small");
-		goto unmap;
-	}
-
-	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
-		dax_pmd_dbg(bh, address, "pfn unaligned");
-		goto unmap;
-	}
-
-	if (!pfn_t_devmap(dax.pfn)) {
-		dax_pmd_dbg(bh, address, "pfn not in memmap");
-		goto unmap;
-	}
-
-	if (buffer_unwritten(bh) || buffer_new(bh)) {
-		clear_pmem(dax.addr, PMD_SIZE);
-		wmb_pmem();
-		count_vm_event(PGMAJFAULT);
-		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-		major = VM_FAULT_MAJOR;
-	}
-	dax_unmap_atomic(bdev, &dax);
-
-	/*
-	 * For PTE faults we insert a radix tree entry for reads, and leave
-	 * it clean.  Then on the first write we dirty the radix tree entry
-	 * via the dax_pfn_mkwrite() path.  This sequence allows the
-	 * dax_pfn_mkwrite() call to be simpler and avoid a call into
-	 * get_block() to translate the pgoff to a sector in order to be able
-	 * to create a new radix tree entry.
-	 *
-	 * The PMD path doesn't have an equivalent to dax_pfn_mkwrite(),
-	 * though, so for a read followed by a write we traverse all the way
-	 * through dax_pmd_fault() twice.  This means we can just skip
-	 * inserting a radix tree entry completely on the initial read and
-	 * just wait until the write to insert a dirty entry.
-	 */
-	if (write) {
-		int error = dax_radix_entry(vma->vm_file->f_mapping, vmf->pgoff,
-						dax.sector, true, true);
-		if (error) {
-			dax_pmd_dbg(bh, address,
-					"PMD radix insertion failed");
-			goto fallback;
-		}
-	}
-
-	dev_dbg(part_to_dev(bdev->bd_part),
-			"%s: %s addr: %lx pfn: %lx sect: %llx\n",
-			__func__, current->comm, address,
-			pfn_t_to_pfn(dax.pfn),
-			(unsigned long long) dax.sector);
-	return major | vmf_insert_pfn_pmd(vma, address, vmf->pmd,
-						dax.pfn, write);
- unmap:
-	dax_unmap_atomic(bdev, &dax);
- fallback:
-	count_vm_event(THP_FAULT_FALLBACK);
-	return VM_FAULT_FALLBACK;
-}
-
 static int dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		get_block_t get_block, dax_iodone_t complete_unwritten)
 {
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
+	void *entry;
+	pfn_t pfn;
 	struct buffer_head bh;
-	unsigned blkbits = inode->i_blkbits;
-	unsigned long address = (unsigned long)vmf->virtual_address;
-	unsigned long pmd_addr = address & PMD_MASK;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	unsigned long pmd_addr = vaddr & PMD_MASK;
 	pgoff_t size;
-	sector_t block;
-	int result;
-	bool alloc = false;
+	int result = 0;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	/* dax pmd mappings require pfn_t_devmap() */
 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
@@ -796,17 +799,17 @@ static int dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
-		split_huge_pmd(vma, vmf->pmd, address);
-		dax_pmd_dbg(NULL, address, "cow write");
+		split_huge_pmd(vma, vmf->pmd, vaddr);
+		dax_pmd_dbg(NULL, vaddr, "cow write");
 		return VM_FAULT_FALLBACK;
 	}
 	/* If the PMD would extend outside the VMA */
 	if (pmd_addr < vma->vm_start) {
-		dax_pmd_dbg(NULL, address, "vma start unaligned");
+		dax_pmd_dbg(NULL, vaddr, "vma start unaligned");
 		return VM_FAULT_FALLBACK;
 	}
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
-		dax_pmd_dbg(NULL, address, "vma end unaligned");
+		dax_pmd_dbg(NULL, vaddr, "vma end unaligned");
 		return VM_FAULT_FALLBACK;
 	}
 
@@ -815,76 +818,69 @@ static int dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		return VM_FAULT_SIGBUS;
 	/* If the PMD would cover blocks out of the file */
 	if ((vmf->pgoff | PG_PMD_COLOUR) >= size) {
-		dax_pmd_dbg(NULL, address,
+		dax_pmd_dbg(NULL, vaddr,
 				"offset + huge page size > file size");
 		return VM_FAULT_FALLBACK;
 	}
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
-	block = (sector_t)vmf->pgoff << (PAGE_CACHE_SHIFT - blkbits);
-
-	bh.b_size = PMD_SIZE;
-
-	if (get_block(inode, block, &bh, 0) != 0)
-		return VM_FAULT_SIGBUS;
-
-	if (!buffer_mapped(&bh) && write) {
-		if (get_block(inode, block, &bh, 1) != 0)
-			return VM_FAULT_SIGBUS;
-		alloc = true;
-	}
 
-	/*
-	 * If the filesystem isn't willing to tell us the length of a hole,
-	 * just fall back to PTEs.  Calling get_block 512 times in a loop
-	 * would be silly.
-	 */
-	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
-		dax_pmd_dbg(&bh, address, "allocated block too small");
-		return VM_FAULT_FALLBACK;
-	}
+	entry = find_get_entry(mapping, vmf->pgoff);
+	if (radix_tree_exceptional_entry(entry) &&
+				RADIX_DAX_SIZE(entry) >= RADIX_DAX_PMD) {
+		pfn = radix_to_pfn_t(entry, vmf->pgoff);
+	} else {
+		int error = dax_create_pfns(mapping, vmf->pgoff, PMD_SIZE,
+					write, &pfn, get_block, &bh);
+		if (error < 0)
+			goto fallback;
 
-	/*
-	 * If we allocated new storage, make sure no process has any
-	 * zero pages covering this hole
-	 */
-	if (alloc) {
-		loff_t lstart = vmf->pgoff << PAGE_CACHE_SHIFT;
-		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
+		if (error) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			result = VM_FAULT_MAJOR;
+			error = 0;
+		}
 
-		truncate_pagecache_range(inode, lstart, lend);
+		/*
+		 * We don't know if dax_create_pfns() was able to allocate
+		 * a contiguous aligned chunk, or whether it was only able
+		 * to do a partial allocation.
+		 */
+		entry = find_get_entry(mapping, vmf->pgoff);
+		if (!radix_tree_exceptional_entry(entry) ||
+				RADIX_DAX_SIZE(entry) < RADIX_DAX_PMD)
+			goto fallback;
 	}
 
-	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+	if (is_bad_pfn_t(pfn)) {
 		spinlock_t *ptl;
 		pmd_t entry, *pmd = vmf->pmd;
 		struct page *zero_page = get_huge_zero_page();
 
 		if (unlikely(!zero_page)) {
-			dax_pmd_dbg(&bh, address, "no zero page");
+			dax_pmd_dbg(&bh, vaddr, "no zero page");
 			goto fallback;
 		}
 
 		ptl = pmd_lock(vma->vm_mm, pmd);
 		if (!pmd_none(*pmd)) {
 			spin_unlock(ptl);
-			dax_pmd_dbg(&bh, address, "pmd already present");
+			dax_pmd_dbg(&bh, vaddr, "pmd already present");
 			goto fallback;
 		}
 
-		dev_dbg(part_to_dev(bh.b_bdev->bd_part),
-				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
-				__func__, current->comm, address,
-				(unsigned long long) to_sector(&bh, inode));
-
 		entry = mk_pmd(zero_page, vma->vm_page_prot);
 		entry = pmd_mkhuge(entry);
 		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
 		result = VM_FAULT_NOPAGE;
 		spin_unlock(ptl);
 	} else {
-		result = dax_insert_pmd_mapping(inode, &bh, vma, vmf);
+		if (current->needs_wmb)
+			wmb_pmem();
+
+		result |= vmf_insert_pfn_pmd(vma, vaddr, vmf->pmd, pfn,
+						write);
 	}
 
  out:
@@ -907,80 +903,21 @@ static int dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 #endif /* !CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-/*
- * The 'colour' (ie low bits) within a PUD of a page offset.  This comes up
- * more often than one might expect in the below function.
- */
-#define PG_PUD_COLOUR	((PUD_SIZE >> PAGE_CACHE_SHIFT) - 1)
-
 #define dax_pud_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pud")
 
-static int dax_insert_pud_mapping(struct inode *inode, struct buffer_head *bh,
-			struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	int major = 0;
-	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = PUD_SIZE,
-	};
-	struct block_device *bdev = bh->b_bdev;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
-	unsigned long address = (unsigned long)vmf->virtual_address;
-	long length = dax_map_atomic(bdev, &dax);
-
-	if (length < 0)
-		return VM_FAULT_SIGBUS;
-	if (length < PUD_SIZE) {
-		dax_pud_dbg(bh, address, "dax-length too small");
-		goto unmap;
-	}
-	if (pfn_t_to_pfn(dax.pfn) & PG_PUD_COLOUR) {
-		dax_pud_dbg(bh, address, "pfn unaligned");
-		goto unmap;
-	}
-
-	if (!pfn_t_devmap(dax.pfn)) {
-		dax_pud_dbg(bh, address, "pfn not in memmap");
-		goto unmap;
-	}
-
-	if (buffer_unwritten(bh) || buffer_new(bh)) {
-		clear_pmem(dax.addr, PUD_SIZE);
-		wmb_pmem();
-		count_vm_event(PGMAJFAULT);
-		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-		major = VM_FAULT_MAJOR;
-	}
-	dax_unmap_atomic(bdev, &dax);
-
-	dev_dbg(part_to_dev(bdev->bd_part),
-			"%s: %s addr: %lx pfn: %lx sect: %llx\n",
-			__func__, current->comm, address,
-			pfn_t_to_pfn(dax.pfn),
-			(unsigned long long) dax.sector);
-	return major | vmf_insert_pfn_pud(vma, address, vmf->pud,
-						dax.pfn, write);
- unmap:
-	dax_unmap_atomic(bdev, &dax);
-	count_vm_event(THP_FAULT_FALLBACK);
-	return VM_FAULT_FALLBACK;
-}
-
 static int dax_pud_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		get_block_t get_block, dax_iodone_t complete_unwritten)
 {
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
+	void *entry;
+	pfn_t pfn;
 	struct buffer_head bh;
-	unsigned blkbits = inode->i_blkbits;
-	unsigned long address = (unsigned long)vmf->virtual_address;
-	unsigned long pud_addr = address & PUD_MASK;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	unsigned long pud_addr = vaddr & PUD_MASK;
 	pgoff_t size;
-	sector_t block;
-	int result;
-	bool alloc = false;
+	int result = 0;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	/* dax pud mappings require pfn_t_devmap() */
 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
@@ -988,17 +925,17 @@ static int dax_pud_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
-		split_huge_pud(vma, vmf->pud, address);
-		dax_pud_dbg(NULL, address, "cow write");
+		split_huge_pud(vma, vmf->pud, vaddr);
+		dax_pud_dbg(NULL, vaddr, "cow write");
 		return VM_FAULT_FALLBACK;
 	}
 	/* If the PUD would extend outside the VMA */
 	if (pud_addr < vma->vm_start) {
-		dax_pud_dbg(NULL, address, "vma start unaligned");
+		dax_pud_dbg(NULL, vaddr, "vma start unaligned");
 		return VM_FAULT_FALLBACK;
 	}
 	if ((pud_addr + PUD_SIZE) > vma->vm_end) {
-		dax_pud_dbg(NULL, address, "vma end unaligned");
+		dax_pud_dbg(NULL, vaddr, "vma end unaligned");
 		return VM_FAULT_FALLBACK;
 	}
 
@@ -1007,52 +944,50 @@ static int dax_pud_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		return VM_FAULT_SIGBUS;
 	/* If the PUD would cover blocks out of the file */
 	if ((vmf->pgoff | PG_PUD_COLOUR) >= size) {
-		dax_pud_dbg(NULL, address,
+		dax_pud_dbg(NULL, vaddr,
 				"offset + huge page size > file size");
 		return VM_FAULT_FALLBACK;
 	}
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
-	block = (sector_t)vmf->pgoff << (PAGE_CACHE_SHIFT - blkbits);
 
-	bh.b_size = PUD_SIZE;
-
-	if (get_block(inode, block, &bh, 0) != 0)
-		return VM_FAULT_SIGBUS;
-
-	if (!buffer_mapped(&bh) && write) {
-		if (get_block(inode, block, &bh, 1) != 0)
-			return VM_FAULT_SIGBUS;
-		alloc = true;
-	}
-
-	/*
-	 * If the filesystem isn't willing to tell us the length of a hole,
-	 * just fall back to PMDs.  Calling get_block 512 times in a loop
-	 * would be silly.
-	 */
-	if (!buffer_size_valid(&bh) || bh.b_size < PUD_SIZE) {
-		dax_pud_dbg(&bh, address, "allocated block too small");
-		return VM_FAULT_FALLBACK;
-	}
+	entry = find_get_entry(mapping, vmf->pgoff);
+	if (radix_tree_exceptional_entry(entry) &&
+				RADIX_DAX_SIZE(entry) >= RADIX_DAX_PUD) {
+		pfn = radix_to_pfn_t(entry, vmf->pgoff);
+	} else {
+		int error = dax_create_pfns(mapping, vmf->pgoff, PUD_SIZE,
+					write, &pfn, get_block, &bh);
+		if (error < 0)
+			goto fallback;
 
-	/*
-	 * If we allocated new storage, make sure no process has any
-	 * zero pages covering this hole
-	 */
-	if (alloc) {
-		loff_t lstart = vmf->pgoff << PAGE_CACHE_SHIFT;
-		loff_t lend = lstart + PUD_SIZE - 1; /* inclusive */
+		if (error) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			result = VM_FAULT_MAJOR;
+			error = 0;
+		}
 
-		truncate_pagecache_range(inode, lstart, lend);
+		/*
+		 * We don't know if dax_create_pfns() was able to allocate
+		 * a contiguous aligned chunk, or whether it was only able
+		 * to do a partial allocation.
+		 */
+		entry = find_get_entry(mapping, vmf->pgoff);
+		if (!radix_tree_exceptional_entry(entry) ||
+				RADIX_DAX_SIZE(entry) < RADIX_DAX_PUD)
+			goto fallback;
 	}
 
-	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
-		dax_pud_dbg(&bh, address, "no zero page");
+	if (is_bad_pfn_t(pfn)) {
+		dax_pud_dbg(&bh, vaddr, "no zero page");
 		goto fallback;
 	} else {
-		result = dax_insert_pud_mapping(inode, &bh, vma, vmf);
+		if (current->needs_wmb)
+			wmb_pmem();
+
+		result |= vmf_insert_pfn_pud(vma, vaddr, vmf->pud, pfn,
+						write);
 	}
 
  out:
@@ -1113,17 +1048,13 @@ EXPORT_SYMBOL_GPL(dax_fault);
  */
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct file *file = vma->vm_file;
+	struct address_space *mapping = vma->vm_file->f_mapping;
+
+	spin_lock_irq(&mapping->tree_lock);
+	radix_tree_tag_set(&mapping->page_tree, vmf->pgoff,
+							PAGECACHE_TAG_DIRTY);
+	spin_unlock_irq(&mapping->tree_lock);
 
-	/*
-	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
-	 * RADIX_DAX_PTE entry already exists in the radix tree from a
-	 * previous call to dax_fault().  We just want to look up that PTE
-	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
-	 * saves us from having to make a call to get_block() here to look
-	 * up the sector.
-	 */
-	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8e58c36..0a6505d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -5,9 +5,10 @@
 #include <linux/mm.h>
 #include <asm/pgtable.h>
 
+int dax_clear_blocks(struct block_device *, sector_t sector, long size);
+
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
 		  get_block_t, dio_iodone_t, int flags);
-int dax_clear_blocks(struct inode *, sector_t block, long size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 07d18a8..95a7b50 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -8,21 +8,46 @@
  * PFN_SG_LAST - pfn references a page and is the last scatterlist entry
  * PFN_DEV - pfn is not covered by system memmap by default
  * PFN_MAP - pfn has a dynamic page mapping established by a device driver
+ *
+ * Note that DAX uses the same format for its radix tree entries.  The
+ * bottom two bits are used by the radix tree.
  */
-#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \
-		<< (BITS_PER_LONG - PAGE_SHIFT))
-#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1))
-#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2))
-#define PFN_DEV (1UL << (BITS_PER_LONG - 3))
-#define PFN_MAP (1UL << (BITS_PER_LONG - 4))
+#define PFN_FLAG_BITS	4
+#define PFN_FLAGS_MASK	((1 << PFN_FLAG_BITS) - 1)
+#define PFN_SG_CHAIN	0x1UL
+#define PFN_SG_LAST	0x2UL
+#define PFN_DEV		0x4UL
+#define PFN_MAP		0x8UL
 
 static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags)
 {
-	pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };
+	pfn_t pfn_t = { .val = (pfn << PFN_FLAG_BITS) |
+					(flags & PFN_FLAGS_MASK), };
 
 	return pfn_t;
 }
 
+static inline __must_check pfn_t pfn_t_add(const pfn_t pfn, int val)
+{
+	pfn_t tmp = pfn;
+	tmp.val += val << PFN_FLAG_BITS;
+	return tmp;
+}
+	
+/*
+ * It makes no sense to have both SG_CHAIN and SG_LAST set, so we could
+ * encode an errno in here if we need to.  Note that you can't put a
+ * bad_pfn_t in the radix tree because the radix tree uses the bottom bit
+ * for its own purposes.
+ */
+#define bad_pfn_t	((pfn_t) { .val = -1 })
+
+static inline bool is_bad_pfn_t(pfn_t pfn)
+{
+	return ((pfn.val & (PFN_SG_CHAIN | PFN_SG_LAST)) ==
+			  (PFN_SG_CHAIN | PFN_SG_LAST));
+}
+
 /* a default pfn to pfn_t conversion assumes that @pfn is pfn_valid() */
 static inline pfn_t pfn_to_pfn_t(unsigned long pfn)
 {
@@ -38,7 +63,7 @@ static inline bool pfn_t_has_page(pfn_t pfn)
 
 static inline unsigned long pfn_t_to_pfn(pfn_t pfn)
 {
-	return pfn.val & ~PFN_FLAGS_MASK;
+	return pfn.val >> PFN_FLAG_BITS;
 }
 
 static inline struct page *pfn_t_to_page(pfn_t pfn)
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 7c88ad1..57e7d87 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -51,15 +51,6 @@
 #define RADIX_TREE_EXCEPTIONAL_ENTRY	2
 #define RADIX_TREE_EXCEPTIONAL_SHIFT	2
 
-#define RADIX_DAX_MASK	0xf
-#define RADIX_DAX_SHIFT	4
-#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
-#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
-#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
-
 static inline int radix_tree_is_indirect_ptr(void *ptr)
 {
 	return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e95d8a..2cdfe76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1476,6 +1476,7 @@ struct task_struct {
 	/* unserialized, strictly 'current' */
 	unsigned in_execve:1; /* bit to tell LSMs we're in execve */
 	unsigned in_iowait:1;
+	unsigned needs_wmb:1;
 #ifdef CONFIG_MEMCG
 	unsigned memcg_may_oom:1;
 #ifndef CONFIG_SLOB
-- 
2.7.0.rc3

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-30  5:28       ` Matthew Wilcox
  2016-01-30  6:01         ` Dan Williams
@ 2016-02-01 14:51         ` Jan Kara
  2016-02-01 20:49           ` Matthew Wilcox
  2016-02-01 21:47           ` Dave Chinner
  1 sibling, 2 replies; 66+ messages in thread
From: Jan Kara @ 2016-02-01 14:51 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Ross Zwisler, Christoph Hellwig, linux-kernel, Alexander Viro,
	Andrew Morton, Dan Williams, Dave Chinner, Jan Kara,
	linux-fsdevel, linux-nvdimm

On Sat 30-01-16 00:28:33, Matthew Wilcox wrote:
> On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
> > I guess I need to go off and understand if we can have DAX mappings on such a
> > device.  If we can, we may have a problem - we can get the block_device from
> > get_block() in I/O path and the various fault paths, but we don't have access
> > to get_block() when flushing via dax_writeback_mapping_range().  We avoid
> > needing it the normal case by storing the sector results from get_block() in
> > the radix tree.
> 
> I think we're doing it wrong by storing the sector in the radix tree; we'd
> really need to store both the sector and the bdev which is too much data.
> 
> If we store the PFN of the underlying page instead, we don't have this
> problem.  Instead, we have a different problem; of the device going
> away under us.  I'm trying to find the code which tears down PTEs when
> the device goes away, and I'm not seeing it.  What do we do about user
> mappings of the device?

So I don't have a strong opinion whether storing PFN or sector is better.
Maybe PFN is somewhat more generic but OTOH turning DAX off for special
cases like inodes on XFS RT devices would be IMHO fine.

I'm somewhat concerned that there are several things in flight (page fault
rework, invalidation on device removal, issues with DAX access to block
devices Ross found) and this is IMHO the smallest trouble we have and changing
this seems relatively invasive. So could we settle the fault code and
similar stuff first and look into this somewhat later? Because frankly I
have some trouble following how all the pieces are going to fit together
and I'm afraid we'll introduce some non-trivial bugs when several
fundamental things are in flux in parallel.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-01 14:51         ` Jan Kara
@ 2016-02-01 20:49           ` Matthew Wilcox
  2016-02-01 21:47           ` Dave Chinner
  1 sibling, 0 replies; 66+ messages in thread
From: Matthew Wilcox @ 2016-02-01 20:49 UTC (permalink / raw)
  To: Jan Kara
  Cc: Ross Zwisler, Christoph Hellwig, linux-kernel, Alexander Viro,
	Andrew Morton, Dan Williams, Dave Chinner, Jan Kara,
	linux-fsdevel, linux-nvdimm

On Mon, Feb 01, 2016 at 03:51:47PM +0100, Jan Kara wrote:
> On Sat 30-01-16 00:28:33, Matthew Wilcox wrote:
> > I think we're doing it wrong by storing the sector in the radix tree; we'd
> > really need to store both the sector and the bdev which is too much data.
> > 
> > If we store the PFN of the underlying page instead, we don't have this
> > problem.  Instead, we have a different problem; of the device going
> > away under us.  I'm trying to find the code which tears down PTEs when
> > the device goes away, and I'm not seeing it.  What do we do about user
> > mappings of the device?
> 
> So I don't have a strong opinion whether storing PFN or sector is better.
> Maybe PFN is somewhat more generic but OTOH turning DAX off for special
> cases like inodes on XFS RT devices would be IMHO fine.

I'm not sure that's such a great idea.  RT devices might be the best
way to get aligned pages on storage.

> I'm somewhat concerned that there are several things in flight (page fault
> rework, invalidation on device removal, issues with DAX access to block
> devices Ross found) and this is IMHO the smallest trouble we have and changing
> this seems relatively invasive.

This isn't the minimal change to convert us from storing sectors to
storing PFNs.  This is a wholescale rework based around using the page
cache radix tree as the primary data structure instead of buffer heads.

> So could we settle the fault code and
> similar stuff first and look into this somewhat later? Because frankly I
> have some trouble following how all the pieces are going to fit together
> and I'm afraid we'll introduce some non-trivial bugs when several
> fundamental things are in flux in parallel.

We can, of course, do a much smaller patch that will use the radix tree
much less centrally.  And that might be the right way to go for 4.5.
With the extra couple of months we'll have, I hope that this redesign
can be the basis for the DAX code in 4.6.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-01 14:51         ` Jan Kara
  2016-02-01 20:49           ` Matthew Wilcox
@ 2016-02-01 21:47           ` Dave Chinner
  2016-02-02  6:06             ` Jared Hulbert
  2016-02-02 11:17             ` Jan Kara
  1 sibling, 2 replies; 66+ messages in thread
From: Dave Chinner @ 2016-02-01 21:47 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox, Ross Zwisler, Christoph Hellwig, linux-kernel,
	Alexander Viro, Andrew Morton, Dan Williams, Jan Kara,
	linux-fsdevel, linux-nvdimm

On Mon, Feb 01, 2016 at 03:51:47PM +0100, Jan Kara wrote:
> On Sat 30-01-16 00:28:33, Matthew Wilcox wrote:
> > On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
> > > I guess I need to go off and understand if we can have DAX mappings on such a
> > > device.  If we can, we may have a problem - we can get the block_device from
> > > get_block() in I/O path and the various fault paths, but we don't have access
> > > to get_block() when flushing via dax_writeback_mapping_range().  We avoid
> > > needing it the normal case by storing the sector results from get_block() in
> > > the radix tree.
> > 
> > I think we're doing it wrong by storing the sector in the radix tree; we'd
> > really need to store both the sector and the bdev which is too much data.
> > 
> > If we store the PFN of the underlying page instead, we don't have this
> > problem.  Instead, we have a different problem; of the device going
> > away under us.  I'm trying to find the code which tears down PTEs when
> > the device goes away, and I'm not seeing it.  What do we do about user
> > mappings of the device?
> 
> So I don't have a strong opinion whether storing PFN or sector is better.
> Maybe PFN is somewhat more generic but OTOH turning DAX off for special
> cases like inodes on XFS RT devices would be IMHO fine.

We need to support alternate devices.

There is a strong case for using the XFS RT device with DAX,
especially for applications that know they are going to always use
large/huge/giant pages to access their data files. The XFS RT device
can guarantee allocation is always aligned to large/huge/giant page
constraints right up to ENOSPC and throughout the production life of
the filesystem. We have no other filesystem capable of providing
such guarantees, which means the XFS RT device is uniquely suited to
certain aplications with DAX...

> I'm somewhat concerned that there are several things in flight (page fault
> rework, invalidation on device removal, issues with DAX access to block
> devices Ross found) and this is IMHO the smallest trouble we have and changing
> this seems relatively invasive. So could we settle the fault code and
> similar stuff first and look into this somewhat later? Because frankly I
> have some trouble following how all the pieces are going to fit together
> and I'm afraid we'll introduce some non-trivial bugs when several
> fundamental things are in flux in parallel.

Yup, there's way to many balls in the air at the moment.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-01-28 21:38   ` Christoph Hellwig
  2016-01-29 18:28     ` Ross Zwisler
@ 2016-02-02  0:02     ` Ross Zwisler
  2016-02-02  7:10       ` Dave Chinner
  2016-02-02 10:34       ` Jan Kara
  1 sibling, 2 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-02-02  0:02 UTC (permalink / raw)
  To: Christoph Hellwig, Jan Kara, Dave Chinner, Dan Williams
  Cc: Ross Zwisler, linux-kernel, Alexander Viro, Andrew Morton,
	Dan Williams, Dave Chinner, Jan Kara, Matthew Wilcox,
	linux-fsdevel, linux-nvdimm

On Thu, Jan 28, 2016 at 01:38:58PM -0800, Christoph Hellwig wrote:
> On Thu, Jan 28, 2016 at 12:35:04PM -0700, Ross Zwisler wrote:
> > There are a number of places in dax.c that look up the struct block_device
> > associated with an inode.  Previously this was done by just using
> > inode->i_sb->s_bdev.  This is correct for inodes that exist within the
> > filesystems supported by DAX (ext2, ext4 & XFS), but when running DAX
> > against raw block devices this value is NULL.  This causes NULL pointer
> > dereferences when these block_device pointers are used.
> 
> It's also wrong for an XFS file system with a RT device..
> 
> > +#define DAX_BDEV(inode) (S_ISBLK(inode->i_mode) ? I_BDEV(inode) \
> > +				: inode->i_sb->s_bdev)
> 
> .. but this isn't going to fix it.  You must use a bdev returned by
> get_blocks or a similar file system method.

Jan & Dave,

Before I start in on a solution to this issue I just wanted to confirm that
DAX can rely on the fact that the filesystem's get_block() call will reliably
set bh->b_bdev for non-error returns.  From this conversation between Jan &
Dave:

https://lkml.org/lkml/2016/1/7/723

"
  > No. The real problem is a long-standing abuse of struct buffer_head to be
  > used for passing block mapping information (it's on my todo list to remove
  > that at least from DAX code and use cleaner block mapping interface but
  > first I want basic DAX functionality to settle down to avoid unnecessary
  > conflicts). Filesystem is not supposed to touch bh->b_bdev.
  
  That has not been true for a long, long time. e.g. XFS always
  rewrites bh->b_bdev in get_blocks because the file may not reside on
  the primary block device of the filesystem. i.e.:
  
          /*
           * If this is a realtime file, data may be on a different device.
           * to that pointed to from the buffer_head b_bdev currently.
           */
          bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
  > If you need
  > that filled in, set it yourself in before passing bh to the block mapping
  > function.
  
  That may be true, but we cannot assume that the bdev coming back
  out of get_block is the same one that was passed in.
"

It sounds like this is always true for XFS, and from looking at the ext4 code
I think this is true there as well because bh->b_bdev is set in
ext4_dax_mmap_get_block() via map_bh().

Relying on the bh->b_bdev returned by get_block() is correct, yea?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-01 21:47           ` Dave Chinner
@ 2016-02-02  6:06             ` Jared Hulbert
  2016-02-02  6:46               ` Dan Williams
  2016-02-02 11:17             ` Jan Kara
  1 sibling, 1 reply; 66+ messages in thread
From: Jared Hulbert @ 2016-02-02  6:06 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Jan Kara, Matthew Wilcox, Ross Zwisler, Christoph Hellwig, LKML,
	Alexander Viro, Andrew Morton, Dan Williams, Jan Kara,
	Linux FS Devel, linux-nvdimm

On Mon, Feb 1, 2016 at 1:47 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Mon, Feb 01, 2016 at 03:51:47PM +0100, Jan Kara wrote:
>> On Sat 30-01-16 00:28:33, Matthew Wilcox wrote:
>> > On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
>> > > I guess I need to go off and understand if we can have DAX mappings on such a
>> > > device.  If we can, we may have a problem - we can get the block_device from
>> > > get_block() in I/O path and the various fault paths, but we don't have access
>> > > to get_block() when flushing via dax_writeback_mapping_range().  We avoid
>> > > needing it the normal case by storing the sector results from get_block() in
>> > > the radix tree.
>> >
>> > I think we're doing it wrong by storing the sector in the radix tree; we'd
>> > really need to store both the sector and the bdev which is too much data.
>> >
>> > If we store the PFN of the underlying page instead, we don't have this
>> > problem.  Instead, we have a different problem; of the device going
>> > away under us.  I'm trying to find the code which tears down PTEs when
>> > the device goes away, and I'm not seeing it.  What do we do about user
>> > mappings of the device?
>>
>> So I don't have a strong opinion whether storing PFN or sector is better.
>> Maybe PFN is somewhat more generic but OTOH turning DAX off for special
>> cases like inodes on XFS RT devices would be IMHO fine.
>
> We need to support alternate devices.

Embedded devices trying to use NOR Flash to free up RAM was
historically one of the more prevalent real world uses of the old
filemap_xip.c code although the users never made it to mainline.  So I
spent some time last week trying to figure out how to make a subset of
DAX not depend on CONFIG_BLOCK.  It was a very frustrating and
unfruitful experience.  I discarded my main conclusion as impractical,
but now that I see the difficultly DAX faces in dealing with
"alternate devices" especially some of the crazy stuff btrfs can do, I
wonder if it's not so crazy after all.

Lets stop calling bdev_direct_access() directly from DAX.  Let the
filesystems do it.

Sure we could enable generic_dax_direct_access() helper for the
filesystems that only support single devices to make it easy.  But XFS
and btrfs for example, have to do the work of figuring out what bdev
is required and then calling bdev_direct_access().

My reasoning is that the filesystem knows how to map inodes and
offsets to devices and sectors, no matter how complex that is.  It
would even enable a filesystem to intelligently use a mix of
direct_access and regular block devices down the road.  Of course it
would also make the block-less solution doable.

Good idea?  Stupid idea?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02  6:06             ` Jared Hulbert
@ 2016-02-02  6:46               ` Dan Williams
  2016-02-02  8:05                 ` Jared Hulbert
  0 siblings, 1 reply; 66+ messages in thread
From: Dan Williams @ 2016-02-02  6:46 UTC (permalink / raw)
  To: Jared Hulbert
  Cc: Dave Chinner, Jan Kara, Matthew Wilcox, Ross Zwisler,
	Christoph Hellwig, LKML, Alexander Viro, Andrew Morton, Jan Kara,
	Linux FS Devel, linux-nvdimm

On Mon, Feb 1, 2016 at 10:06 PM, Jared Hulbert <jaredeh@gmail.com> wrote:
> On Mon, Feb 1, 2016 at 1:47 PM, Dave Chinner <david@fromorbit.com> wrote:
>> On Mon, Feb 01, 2016 at 03:51:47PM +0100, Jan Kara wrote:
>>> On Sat 30-01-16 00:28:33, Matthew Wilcox wrote:
>>> > On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
>>> > > I guess I need to go off and understand if we can have DAX mappings on such a
>>> > > device.  If we can, we may have a problem - we can get the block_device from
>>> > > get_block() in I/O path and the various fault paths, but we don't have access
>>> > > to get_block() when flushing via dax_writeback_mapping_range().  We avoid
>>> > > needing it the normal case by storing the sector results from get_block() in
>>> > > the radix tree.
>>> >
>>> > I think we're doing it wrong by storing the sector in the radix tree; we'd
>>> > really need to store both the sector and the bdev which is too much data.
>>> >
>>> > If we store the PFN of the underlying page instead, we don't have this
>>> > problem.  Instead, we have a different problem; of the device going
>>> > away under us.  I'm trying to find the code which tears down PTEs when
>>> > the device goes away, and I'm not seeing it.  What do we do about user
>>> > mappings of the device?
>>>
>>> So I don't have a strong opinion whether storing PFN or sector is better.
>>> Maybe PFN is somewhat more generic but OTOH turning DAX off for special
>>> cases like inodes on XFS RT devices would be IMHO fine.
>>
>> We need to support alternate devices.
>
> Embedded devices trying to use NOR Flash to free up RAM was
> historically one of the more prevalent real world uses of the old
> filemap_xip.c code although the users never made it to mainline.  So I
> spent some time last week trying to figure out how to make a subset of
> DAX not depend on CONFIG_BLOCK.  It was a very frustrating and
> unfruitful experience.  I discarded my main conclusion as impractical,
> but now that I see the difficultly DAX faces in dealing with
> "alternate devices" especially some of the crazy stuff btrfs can do, I
> wonder if it's not so crazy after all.
>
> Lets stop calling bdev_direct_access() directly from DAX.  Let the
> filesystems do it.
>
> Sure we could enable generic_dax_direct_access() helper for the
> filesystems that only support single devices to make it easy.  But XFS
> and btrfs for example, have to do the work of figuring out what bdev
> is required and then calling bdev_direct_access().
>
> My reasoning is that the filesystem knows how to map inodes and
> offsets to devices and sectors, no matter how complex that is.  It
> would even enable a filesystem to intelligently use a mix of
> direct_access and regular block devices down the road.  Of course it
> would also make the block-less solution doable.
>
> Good idea?  Stupid idea?

The CONFIG_BLOCK=y case isn't going anywhere, so if anything it seems
the CONFIG_BLOCK=n is an incremental feature in its own right.  What
driver and what filesystem are looking to enable this XIP support in?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02  0:02     ` Ross Zwisler
@ 2016-02-02  7:10       ` Dave Chinner
  2016-02-02 10:34       ` Jan Kara
  1 sibling, 0 replies; 66+ messages in thread
From: Dave Chinner @ 2016-02-02  7:10 UTC (permalink / raw)
  To: Ross Zwisler, Christoph Hellwig, Jan Kara, Dan Williams,
	linux-kernel, Alexander Viro, Andrew Morton, Jan Kara,
	Matthew Wilcox, linux-fsdevel, linux-nvdimm

On Mon, Feb 01, 2016 at 05:02:12PM -0700, Ross Zwisler wrote:
> Relying on the bh->b_bdev returned by get_block() is correct, yea?

IMO, yes.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02  6:46               ` Dan Williams
@ 2016-02-02  8:05                 ` Jared Hulbert
  2016-02-02 16:51                   ` Dan Williams
  0 siblings, 1 reply; 66+ messages in thread
From: Jared Hulbert @ 2016-02-02  8:05 UTC (permalink / raw)
  To: Dan Williams
  Cc: Dave Chinner, Jan Kara, Matthew Wilcox, Ross Zwisler,
	Christoph Hellwig, LKML, Alexander Viro, Andrew Morton, Jan Kara,
	Linux FS Devel, linux-nvdimm

On Mon, Feb 1, 2016 at 10:46 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Mon, Feb 1, 2016 at 10:06 PM, Jared Hulbert <jaredeh@gmail.com> wrote:
>> On Mon, Feb 1, 2016 at 1:47 PM, Dave Chinner <david@fromorbit.com> wrote:
>>> On Mon, Feb 01, 2016 at 03:51:47PM +0100, Jan Kara wrote:
>>>> On Sat 30-01-16 00:28:33, Matthew Wilcox wrote:
>>>> > On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
>>>> > > I guess I need to go off and understand if we can have DAX mappings on such a
>>>> > > device.  If we can, we may have a problem - we can get the block_device from
>>>> > > get_block() in I/O path and the various fault paths, but we don't have access
>>>> > > to get_block() when flushing via dax_writeback_mapping_range().  We avoid
>>>> > > needing it the normal case by storing the sector results from get_block() in
>>>> > > the radix tree.
>>>> >
>>>> > I think we're doing it wrong by storing the sector in the radix tree; we'd
>>>> > really need to store both the sector and the bdev which is too much data.
>>>> >
>>>> > If we store the PFN of the underlying page instead, we don't have this
>>>> > problem.  Instead, we have a different problem; of the device going
>>>> > away under us.  I'm trying to find the code which tears down PTEs when
>>>> > the device goes away, and I'm not seeing it.  What do we do about user
>>>> > mappings of the device?
>>>>
>>>> So I don't have a strong opinion whether storing PFN or sector is better.
>>>> Maybe PFN is somewhat more generic but OTOH turning DAX off for special
>>>> cases like inodes on XFS RT devices would be IMHO fine.
>>>
>>> We need to support alternate devices.
>>
>> Embedded devices trying to use NOR Flash to free up RAM was
>> historically one of the more prevalent real world uses of the old
>> filemap_xip.c code although the users never made it to mainline.  So I
>> spent some time last week trying to figure out how to make a subset of
>> DAX not depend on CONFIG_BLOCK.  It was a very frustrating and
>> unfruitful experience.  I discarded my main conclusion as impractical,
>> but now that I see the difficultly DAX faces in dealing with
>> "alternate devices" especially some of the crazy stuff btrfs can do, I
>> wonder if it's not so crazy after all.
>>
>> Lets stop calling bdev_direct_access() directly from DAX.  Let the
>> filesystems do it.
>>
>> Sure we could enable generic_dax_direct_access() helper for the
>> filesystems that only support single devices to make it easy.  But XFS
>> and btrfs for example, have to do the work of figuring out what bdev
>> is required and then calling bdev_direct_access().
>>
>> My reasoning is that the filesystem knows how to map inodes and
>> offsets to devices and sectors, no matter how complex that is.  It
>> would even enable a filesystem to intelligently use a mix of
>> direct_access and regular block devices down the road.  Of course it
>> would also make the block-less solution doable.
>>
>> Good idea?  Stupid idea?
>
> The CONFIG_BLOCK=y case isn't going anywhere, so if anything it seems
> the CONFIG_BLOCK=n is an incremental feature in its own right.  What
> driver and what filesystem are looking to enable this XIP support in?

Well... as CONFIG_BLOCK was not required with filemap_xip.c for a
decade.  This CONFIG_BLOCK dependency is a result of an incremental
feature from a certain point of view ;)

The obvious 'driver' is physical RAM without a particular driver.
Remember please I'm talking about embedded.  RAM measured in MiB and
funky one off hardware etc.  In the embedded world there are lots of
ways that persistent memory has been supported in device specific ways
without the new fancypants NFIT and Intel instructions, so frankly
they don't fit in the PMEM stuff.  Maybe they could be supported in
PMEM but not without effort to bring embedded players to the table.

The other drivers are the MTD drivers, probably as read-only for now.
But the paradigm there isn't so different from what PMEM looks like
with asymmetric read/write capabilities.

The filesystem I'm concerned with is AXFS
(https://www.kernel.org/doc/ols/2008/ols2008v1-pages-211-218.pdf).
Which I've been planning on trying to merge again due to a recent
resurgence of interest.  The device model for AXFS is... weird.  It
can use one or two devices at a time of any mix of NOR MTD, NAND MTD,
block, and unmanaged physical memory.  It's a terribly useful model
for embedded.  Anyway AXFS is readonly so hacking in a read only
dax_fault_nodev() and dax_file_read() would work fine, looks easy
enough.  But... it would be cool if similar small embedded focused RW
filesystems were enabled.

I don't expect you to taint DAX with design requirements for this
stuff that it wasn't built for, nobody ends up happy in that case.
However, if enabling the filesystem to manage the bdev_direct_access()
interactions solves some of the "alternate device" problems you are
discussing here, then there is a chance we can accommodate both.
Sometimes that works.

So... Forget CONFIG_BLOCK=n entirely I didn't want that to be the
focus anyway.  Does it help to support the weirder XFS and btrfs
device models to enable the filesystem to handle the
bdev_direct_access() stuff?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02  0:02     ` Ross Zwisler
  2016-02-02  7:10       ` Dave Chinner
@ 2016-02-02 10:34       ` Jan Kara
  1 sibling, 0 replies; 66+ messages in thread
From: Jan Kara @ 2016-02-02 10:34 UTC (permalink / raw)
  To: Ross Zwisler
  Cc: Christoph Hellwig, Jan Kara, Dave Chinner, Dan Williams,
	linux-kernel, Alexander Viro, Andrew Morton, Jan Kara,
	Matthew Wilcox, linux-fsdevel, linux-nvdimm

On Mon 01-02-16 17:02:12, Ross Zwisler wrote:
> On Thu, Jan 28, 2016 at 01:38:58PM -0800, Christoph Hellwig wrote:
> > On Thu, Jan 28, 2016 at 12:35:04PM -0700, Ross Zwisler wrote:
> > > There are a number of places in dax.c that look up the struct block_device
> > > associated with an inode.  Previously this was done by just using
> > > inode->i_sb->s_bdev.  This is correct for inodes that exist within the
> > > filesystems supported by DAX (ext2, ext4 & XFS), but when running DAX
> > > against raw block devices this value is NULL.  This causes NULL pointer
> > > dereferences when these block_device pointers are used.
> > 
> > It's also wrong for an XFS file system with a RT device..
> > 
> > > +#define DAX_BDEV(inode) (S_ISBLK(inode->i_mode) ? I_BDEV(inode) \
> > > +				: inode->i_sb->s_bdev)
> > 
> > .. but this isn't going to fix it.  You must use a bdev returned by
> > get_blocks or a similar file system method.
> 
> Jan & Dave,
> 
> Before I start in on a solution to this issue I just wanted to confirm that
> DAX can rely on the fact that the filesystem's get_block() call will reliably
> set bh->b_bdev for non-error returns.  From this conversation between Jan &
> Dave:
> 
> https://lkml.org/lkml/2016/1/7/723
> 
> "
>   > No. The real problem is a long-standing abuse of struct buffer_head to be
>   > used for passing block mapping information (it's on my todo list to remove
>   > that at least from DAX code and use cleaner block mapping interface but
>   > first I want basic DAX functionality to settle down to avoid unnecessary
>   > conflicts). Filesystem is not supposed to touch bh->b_bdev.
>   
>   That has not been true for a long, long time. e.g. XFS always
>   rewrites bh->b_bdev in get_blocks because the file may not reside on
>   the primary block device of the filesystem. i.e.:
>   
>           /*
>            * If this is a realtime file, data may be on a different device.
>            * to that pointed to from the buffer_head b_bdev currently.
>            */
>           bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
>   > If you need
>   > that filled in, set it yourself in before passing bh to the block mapping
>   > function.
>   
>   That may be true, but we cannot assume that the bdev coming back
>   out of get_block is the same one that was passed in.
> "
> 
> It sounds like this is always true for XFS, and from looking at the ext4 code
> I think this is true there as well because bh->b_bdev is set in
> ext4_dax_mmap_get_block() via map_bh().
> 
> Relying on the bh->b_bdev returned by get_block() is correct, yea?

Yeah, sorry, I was confused. If the result is a mapped block (i.e. return
value of get_block callback is > 0), ext4 also sets bh->b_bdev via map_bh()
as you correctly point out. If the result is a hole or error, ext4 doesn't
set bh->b_bdev at all. So you can rely on bh->b_bdev.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-01 21:47           ` Dave Chinner
  2016-02-02  6:06             ` Jared Hulbert
@ 2016-02-02 11:17             ` Jan Kara
  2016-02-02 16:33               ` Dan Williams
  2016-02-02 18:41               ` Ross Zwisler
  1 sibling, 2 replies; 66+ messages in thread
From: Jan Kara @ 2016-02-02 11:17 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Jan Kara, Matthew Wilcox, Ross Zwisler, Christoph Hellwig,
	linux-kernel, Alexander Viro, Andrew Morton, Dan Williams,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Tue 02-02-16 08:47:30, Dave Chinner wrote:
> On Mon, Feb 01, 2016 at 03:51:47PM +0100, Jan Kara wrote:
> > On Sat 30-01-16 00:28:33, Matthew Wilcox wrote:
> > > On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
> > > > I guess I need to go off and understand if we can have DAX mappings on such a
> > > > device.  If we can, we may have a problem - we can get the block_device from
> > > > get_block() in I/O path and the various fault paths, but we don't have access
> > > > to get_block() when flushing via dax_writeback_mapping_range().  We avoid
> > > > needing it the normal case by storing the sector results from get_block() in
> > > > the radix tree.
> > > 
> > > I think we're doing it wrong by storing the sector in the radix tree; we'd
> > > really need to store both the sector and the bdev which is too much data.
> > > 
> > > If we store the PFN of the underlying page instead, we don't have this
> > > problem.  Instead, we have a different problem; of the device going
> > > away under us.  I'm trying to find the code which tears down PTEs when
> > > the device goes away, and I'm not seeing it.  What do we do about user
> > > mappings of the device?
> > 
> > So I don't have a strong opinion whether storing PFN or sector is better.
> > Maybe PFN is somewhat more generic but OTOH turning DAX off for special
> > cases like inodes on XFS RT devices would be IMHO fine.
> 
> We need to support alternate devices.
> 
> There is a strong case for using the XFS RT device with DAX,
> especially for applications that know they are going to always use
> large/huge/giant pages to access their data files. The XFS RT device
> can guarantee allocation is always aligned to large/huge/giant page
> constraints right up to ENOSPC and throughout the production life of
> the filesystem. We have no other filesystem capable of providing
> such guarantees, which means the XFS RT device is uniquely suited to
> certain aplications with DAX...

I see, thanks for explanation. So I'm OK with changing what is stored in
the radix tree to accommodate this use case but my reservation that we IHMO
have other more pressing things to fix remains...

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 11:17             ` Jan Kara
@ 2016-02-02 16:33               ` Dan Williams
  2016-02-02 16:46                 ` Jan Kara
  2016-02-02 18:41               ` Ross Zwisler
  1 sibling, 1 reply; 66+ messages in thread
From: Dan Williams @ 2016-02-02 16:33 UTC (permalink / raw)
  To: Jan Kara
  Cc: Dave Chinner, Matthew Wilcox, Ross Zwisler, Christoph Hellwig,
	linux-kernel, Alexander Viro, Andrew Morton, Jan Kara,
	linux-fsdevel, linux-nvdimm

On Tue, Feb 2, 2016 at 3:17 AM, Jan Kara <jack@suse.cz> wrote:
[..]
> I see, thanks for explanation. So I'm OK with changing what is stored in
> the radix tree to accommodate this use case but my reservation that we IHMO
> have other more pressing things to fix remains...

We don't need pfns in the radix to support XFS RT configurations.
Just call get_blocks() again and use the sector, or am I missing
something?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 16:33               ` Dan Williams
@ 2016-02-02 16:46                 ` Jan Kara
  2016-02-02 17:10                   ` Dan Williams
  0 siblings, 1 reply; 66+ messages in thread
From: Jan Kara @ 2016-02-02 16:46 UTC (permalink / raw)
  To: Dan Williams
  Cc: Jan Kara, Dave Chinner, Matthew Wilcox, Ross Zwisler,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Tue 02-02-16 08:33:56, Dan Williams wrote:
> On Tue, Feb 2, 2016 at 3:17 AM, Jan Kara <jack@suse.cz> wrote:
> [..]
> > I see, thanks for explanation. So I'm OK with changing what is stored in
> > the radix tree to accommodate this use case but my reservation that we IHMO
> > have other more pressing things to fix remains...
> 
> We don't need pfns in the radix to support XFS RT configurations.
> Just call get_blocks() again and use the sector, or am I missing
> something?

You are correct. But if you decide to pay the cost of additional
get_block() call, you only need the dirty tag in the radix tree and nothing
else. So my understanding was that the whole point of games with radix tree
is avoiding this extra get_block() calls for fsync().

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02  8:05                 ` Jared Hulbert
@ 2016-02-02 16:51                   ` Dan Williams
  2016-02-02 21:46                     ` Jared Hulbert
  0 siblings, 1 reply; 66+ messages in thread
From: Dan Williams @ 2016-02-02 16:51 UTC (permalink / raw)
  To: Jared Hulbert
  Cc: Dave Chinner, Jan Kara, Matthew Wilcox, Ross Zwisler,
	Christoph Hellwig, LKML, Alexander Viro, Andrew Morton, Jan Kara,
	Linux FS Devel, linux-nvdimm

On Tue, Feb 2, 2016 at 12:05 AM, Jared Hulbert <jaredeh@gmail.com> wrote:
[..]
> Well... as CONFIG_BLOCK was not required with filemap_xip.c for a
> decade.  This CONFIG_BLOCK dependency is a result of an incremental
> feature from a certain point of view ;)
>
> The obvious 'driver' is physical RAM without a particular driver.
> Remember please I'm talking about embedded.  RAM measured in MiB and
> funky one off hardware etc.  In the embedded world there are lots of
> ways that persistent memory has been supported in device specific ways
> without the new fancypants NFIT and Intel instructions,so frankly
> they don't fit in the PMEM stuff.  Maybe they could be supported in
> PMEM but not without effort to bring embedded players to the table.

Not sure what you're trying to say here.  An ACPI NFIT only feeds the
generic libnvdimm device model.  You don't need NFIT to get pmem.

> The other drivers are the MTD drivers, probably as read-only for now.
> But the paradigm there isn't so different from what PMEM looks like
> with asymmetric read/write capabilities.
>
> The filesystem I'm concerned with is AXFS
> (https://www.kernel.org/doc/ols/2008/ols2008v1-pages-211-218.pdf).
> Which I've been planning on trying to merge again due to a recent
> resurgence of interest.  The device model for AXFS is... weird.  It
> can use one or two devices at a time of any mix of NOR MTD, NAND MTD,
> block, and unmanaged physical memory.  It's a terribly useful model
> for embedded.  Anyway AXFS is readonly so hacking in a read only
> dax_fault_nodev() and dax_file_read() would work fine, looks easy
> enough.  But... it would be cool if similar small embedded focused RW
> filesystems were enabled.

Are those also out of tree?

> I don't expect you to taint DAX with design requirements for this
> stuff that it wasn't built for, nobody ends up happy in that case.
> However, if enabling the filesystem to manage the bdev_direct_access()
> interactions solves some of the "alternate device" problems you are
> discussing here, then there is a chance we can accommodate both.
> Sometimes that works.
>
> So... Forget CONFIG_BLOCK=n entirely I didn't want that to be the
> focus anyway.  Does it help to support the weirder XFS and btrfs
> device models to enable the filesystem to handle the
> bdev_direct_access() stuff?

It's not clear that it does.  We just clarified with xfs and ext4 that
we can really on get_blocks().  That solves the immediate concern with
multi-device filesystems.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 16:46                 ` Jan Kara
@ 2016-02-02 17:10                   ` Dan Williams
  2016-02-02 17:34                     ` Ross Zwisler
  0 siblings, 1 reply; 66+ messages in thread
From: Dan Williams @ 2016-02-02 17:10 UTC (permalink / raw)
  To: Jan Kara
  Cc: Dave Chinner, Matthew Wilcox, Ross Zwisler, Christoph Hellwig,
	linux-kernel, Alexander Viro, Andrew Morton, Jan Kara,
	linux-fsdevel, linux-nvdimm

On Tue, Feb 2, 2016 at 8:46 AM, Jan Kara <jack@suse.cz> wrote:
> On Tue 02-02-16 08:33:56, Dan Williams wrote:
>> On Tue, Feb 2, 2016 at 3:17 AM, Jan Kara <jack@suse.cz> wrote:
>> [..]
>> > I see, thanks for explanation. So I'm OK with changing what is stored in
>> > the radix tree to accommodate this use case but my reservation that we IHMO
>> > have other more pressing things to fix remains...
>>
>> We don't need pfns in the radix to support XFS RT configurations.
>> Just call get_blocks() again and use the sector, or am I missing
>> something?
>
> You are correct. But if you decide to pay the cost of additional
> get_block() call, you only need the dirty tag in the radix tree and nothing
> else. So my understanding was that the whole point of games with radix tree
> is avoiding this extra get_block() calls for fsync().
>

DAX-fsync() is already a potentially expensive operation to cover data
durability guarantees for DAX-unaware applications.  A DAX-aware
application is going to skip fsync, and the get_blocks() cost, to do
cache management itself.

Willy pointed out some other potential benefits, assuming a suitable
replacement for the protections afforded by the block-device
percpu_ref counter can be found.  However, optimizing for the
DAX-unaware-application case seems the wrong motivation to me.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 17:10                   ` Dan Williams
@ 2016-02-02 17:34                     ` Ross Zwisler
  2016-02-02 17:46                       ` Dan Williams
  2016-02-03 10:46                       ` Jan Kara
  0 siblings, 2 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-02-02 17:34 UTC (permalink / raw)
  To: Dan Williams
  Cc: Jan Kara, Dave Chinner, Matthew Wilcox, Ross Zwisler,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Tue, Feb 02, 2016 at 09:10:24AM -0800, Dan Williams wrote:
> On Tue, Feb 2, 2016 at 8:46 AM, Jan Kara <jack@suse.cz> wrote:
> > On Tue 02-02-16 08:33:56, Dan Williams wrote:
> >> On Tue, Feb 2, 2016 at 3:17 AM, Jan Kara <jack@suse.cz> wrote:
> >> [..]
> >> > I see, thanks for explanation. So I'm OK with changing what is stored in
> >> > the radix tree to accommodate this use case but my reservation that we IHMO
> >> > have other more pressing things to fix remains...
> >>
> >> We don't need pfns in the radix to support XFS RT configurations.
> >> Just call get_blocks() again and use the sector, or am I missing
> >> something?
> >
> > You are correct. But if you decide to pay the cost of additional
> > get_block() call, you only need the dirty tag in the radix tree and nothing
> > else. So my understanding was that the whole point of games with radix tree
> > is avoiding this extra get_block() calls for fsync().
> >
> 
> DAX-fsync() is already a potentially expensive operation to cover data
> durability guarantees for DAX-unaware applications.  A DAX-aware
> application is going to skip fsync, and the get_blocks() cost, to do
> cache management itself.
> 
> Willy pointed out some other potential benefits, assuming a suitable
> replacement for the protections afforded by the block-device
> percpu_ref counter can be found.  However, optimizing for the
> DAX-unaware-application case seems the wrong motivation to me.

Oh, no, the primary issue with calling get_block() in the fsync path isn't
performance.  It's that we don't have any idea what get_block() function to
call.

The fault handler calls all come from the filesystem directly, so they can
easily give us an appropriate get_block() function pointer.  But the
dax_writeback_mapping_range() calls come from the generic code in
mm/filemap.c, and don't know what get_block() to pass in.

During one iteration I had the calls to dax_writeback_mapping_range()
happening in the filesystem fsync code so that it could pass in get_block(),
but Dave Chinner pointed out that this misses other paths in the filesystem
that need to have things flushed via a call to filemap_write_and_wait_range().

In yet another iteration of this series I tried adding get_block() to struct
inode_operations so that I could access it from what is now
dax_writeback_mapping_range(), but this was shot down as well.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 17:34                     ` Ross Zwisler
@ 2016-02-02 17:46                       ` Dan Williams
  2016-02-02 17:47                         ` Dan Williams
  2016-02-02 18:46                         ` Matthew Wilcox
  2016-02-03 10:46                       ` Jan Kara
  1 sibling, 2 replies; 66+ messages in thread
From: Dan Williams @ 2016-02-02 17:46 UTC (permalink / raw)
  To: Ross Zwisler, Dan Williams, Jan Kara, Dave Chinner,
	Matthew Wilcox, Christoph Hellwig, linux-kernel, Alexander Viro,
	Andrew Morton, Jan Kara, linux-fsdevel, linux-nvdimm

On Tue, Feb 2, 2016 at 9:34 AM, Ross Zwisler
<ross.zwisler@linux.intel.com> wrote:
> On Tue, Feb 02, 2016 at 09:10:24AM -0800, Dan Williams wrote:
>> On Tue, Feb 2, 2016 at 8:46 AM, Jan Kara <jack@suse.cz> wrote:
>> > On Tue 02-02-16 08:33:56, Dan Williams wrote:
>> >> On Tue, Feb 2, 2016 at 3:17 AM, Jan Kara <jack@suse.cz> wrote:
>> >> [..]
>> >> > I see, thanks for explanation. So I'm OK with changing what is stored in
>> >> > the radix tree to accommodate this use case but my reservation that we IHMO
>> >> > have other more pressing things to fix remains...
>> >>
>> >> We don't need pfns in the radix to support XFS RT configurations.
>> >> Just call get_blocks() again and use the sector, or am I missing
>> >> something?
>> >
>> > You are correct. But if you decide to pay the cost of additional
>> > get_block() call, you only need the dirty tag in the radix tree and nothing
>> > else. So my understanding was that the whole point of games with radix tree
>> > is avoiding this extra get_block() calls for fsync().
>> >
>>
>> DAX-fsync() is already a potentially expensive operation to cover data
>> durability guarantees for DAX-unaware applications.  A DAX-aware
>> application is going to skip fsync, and the get_blocks() cost, to do
>> cache management itself.
>>
>> Willy pointed out some other potential benefits, assuming a suitable
>> replacement for the protections afforded by the block-device
>> percpu_ref counter can be found.  However, optimizing for the
>> DAX-unaware-application case seems the wrong motivation to me.
>
> Oh, no, the primary issue with calling get_block() in the fsync path isn't
> performance.  It's that we don't have any idea what get_block() function to
> call.
>
> The fault handler calls all come from the filesystem directly, so they can
> easily give us an appropriate get_block() function pointer.  But the
> dax_writeback_mapping_range() calls come from the generic code in
> mm/filemap.c, and don't know what get_block() to pass in.
>
> During one iteration I had the calls to dax_writeback_mapping_range()
> happening in the filesystem fsync code so that it could pass in get_block(),
> but Dave Chinner pointed out that this misses other paths in the filesystem
> that need to have things flushed via a call to filemap_write_and_wait_range().
>
> In yet another iteration of this series I tried adding get_block() to struct
> inode_operations so that I could access it from what is now
> dax_writeback_mapping_range(), but this was shot down as well.

Ugh, and we can't trigger it from where a filesystem normally syncs a
block device, becauDid you tryse we lose track of the inode radix
information at that level.

What a about a super_operation?  That seems the right level, given
we're currently doing:

inode->i_sb->s_bdev

...it does not seem terrible to instead do:

inode->i_sb->s_op->get_block()

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 17:46                       ` Dan Williams
@ 2016-02-02 17:47                         ` Dan Williams
  2016-02-02 18:24                           ` Ross Zwisler
  2016-02-02 18:46                         ` Matthew Wilcox
  1 sibling, 1 reply; 66+ messages in thread
From: Dan Williams @ 2016-02-02 17:47 UTC (permalink / raw)
  To: Ross Zwisler, Dan Williams, Jan Kara, Dave Chinner,
	Matthew Wilcox, Christoph Hellwig, linux-kernel, Alexander Viro,
	Andrew Morton, Jan Kara, linux-fsdevel, linux-nvdimm

On Tue, Feb 2, 2016 at 9:46 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Tue, Feb 2, 2016 at 9:34 AM, Ross Zwisler
> <ross.zwisler@linux.intel.com> wrote:
>> On Tue, Feb 02, 2016 at 09:10:24AM -0800, Dan Williams wrote:
>>> On Tue, Feb 2, 2016 at 8:46 AM, Jan Kara <jack@suse.cz> wrote:
>>> > On Tue 02-02-16 08:33:56, Dan Williams wrote:
>>> >> On Tue, Feb 2, 2016 at 3:17 AM, Jan Kara <jack@suse.cz> wrote:
>>> >> [..]
>>> >> > I see, thanks for explanation. So I'm OK with changing what is stored in
>>> >> > the radix tree to accommodate this use case but my reservation that we IHMO
>>> >> > have other more pressing things to fix remains...
>>> >>
>>> >> We don't need pfns in the radix to support XFS RT configurations.
>>> >> Just call get_blocks() again and use the sector, or am I missing
>>> >> something?
>>> >
>>> > You are correct. But if you decide to pay the cost of additional
>>> > get_block() call, you only need the dirty tag in the radix tree and nothing
>>> > else. So my understanding was that the whole point of games with radix tree
>>> > is avoiding this extra get_block() calls for fsync().
>>> >
>>>
>>> DAX-fsync() is already a potentially expensive operation to cover data
>>> durability guarantees for DAX-unaware applications.  A DAX-aware
>>> application is going to skip fsync, and the get_blocks() cost, to do
>>> cache management itself.
>>>
>>> Willy pointed out some other potential benefits, assuming a suitable
>>> replacement for the protections afforded by the block-device
>>> percpu_ref counter can be found.  However, optimizing for the
>>> DAX-unaware-application case seems the wrong motivation to me.
>>
>> Oh, no, the primary issue with calling get_block() in the fsync path isn't
>> performance.  It's that we don't have any idea what get_block() function to
>> call.
>>
>> The fault handler calls all come from the filesystem directly, so they can
>> easily give us an appropriate get_block() function pointer.  But the
>> dax_writeback_mapping_range() calls come from the generic code in
>> mm/filemap.c, and don't know what get_block() to pass in.
>>
>> During one iteration I had the calls to dax_writeback_mapping_range()
>> happening in the filesystem fsync code so that it could pass in get_block(),
>> but Dave Chinner pointed out that this misses other paths in the filesystem
>> that need to have things flushed via a call to filemap_write_and_wait_range().
>>
>> In yet another iteration of this series I tried adding get_block() to struct
>> inode_operations so that I could access it from what is now
>> dax_writeback_mapping_range(), but this was shot down as well.
>
> Ugh, and we can't trigger it from where a filesystem normally syncs a
> block device, becauDid you tryse we lose track of the inode radix

[ sorry, copy paste error ]

block device, because we lose track of the inode radix

> information at that level.
>
> What a about a super_operation?  That seems the right level, given
> we're currently doing:
>
> inode->i_sb->s_bdev
>
> ...it does not seem terrible to instead do:
>
> inode->i_sb->s_op->get_block()

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 17:47                         ` Dan Williams
@ 2016-02-02 18:24                           ` Ross Zwisler
  0 siblings, 0 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-02-02 18:24 UTC (permalink / raw)
  To: Dan Williams
  Cc: Ross Zwisler, Jan Kara, Dave Chinner, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Tue, Feb 02, 2016 at 09:47:37AM -0800, Dan Williams wrote:
> On Tue, Feb 2, 2016 at 9:46 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> > On Tue, Feb 2, 2016 at 9:34 AM, Ross Zwisler
> > <ross.zwisler@linux.intel.com> wrote:
> >> On Tue, Feb 02, 2016 at 09:10:24AM -0800, Dan Williams wrote:
> >>> On Tue, Feb 2, 2016 at 8:46 AM, Jan Kara <jack@suse.cz> wrote:
> >>> > On Tue 02-02-16 08:33:56, Dan Williams wrote:
> >>> >> On Tue, Feb 2, 2016 at 3:17 AM, Jan Kara <jack@suse.cz> wrote:
> >>> >> [..]
> >>> >> > I see, thanks for explanation. So I'm OK with changing what is stored in
> >>> >> > the radix tree to accommodate this use case but my reservation that we IHMO
> >>> >> > have other more pressing things to fix remains...
> >>> >>
> >>> >> We don't need pfns in the radix to support XFS RT configurations.
> >>> >> Just call get_blocks() again and use the sector, or am I missing
> >>> >> something?
> >>> >
> >>> > You are correct. But if you decide to pay the cost of additional
> >>> > get_block() call, you only need the dirty tag in the radix tree and nothing
> >>> > else. So my understanding was that the whole point of games with radix tree
> >>> > is avoiding this extra get_block() calls for fsync().
> >>> >
> >>>
> >>> DAX-fsync() is already a potentially expensive operation to cover data
> >>> durability guarantees for DAX-unaware applications.  A DAX-aware
> >>> application is going to skip fsync, and the get_blocks() cost, to do
> >>> cache management itself.
> >>>
> >>> Willy pointed out some other potential benefits, assuming a suitable
> >>> replacement for the protections afforded by the block-device
> >>> percpu_ref counter can be found.  However, optimizing for the
> >>> DAX-unaware-application case seems the wrong motivation to me.
> >>
> >> Oh, no, the primary issue with calling get_block() in the fsync path isn't
> >> performance.  It's that we don't have any idea what get_block() function to
> >> call.
> >>
> >> The fault handler calls all come from the filesystem directly, so they can
> >> easily give us an appropriate get_block() function pointer.  But the
> >> dax_writeback_mapping_range() calls come from the generic code in
> >> mm/filemap.c, and don't know what get_block() to pass in.
> >>
> >> During one iteration I had the calls to dax_writeback_mapping_range()
> >> happening in the filesystem fsync code so that it could pass in get_block(),
> >> but Dave Chinner pointed out that this misses other paths in the filesystem
> >> that need to have things flushed via a call to filemap_write_and_wait_range().
> >>
> >> In yet another iteration of this series I tried adding get_block() to struct
> >> inode_operations so that I could access it from what is now
> >> dax_writeback_mapping_range(), but this was shot down as well.
> >
> > Ugh, and we can't trigger it from where a filesystem normally syncs a
> > block device, becauDid you tryse we lose track of the inode radix
> 
> [ sorry, copy paste error ]
> 
> block device, because we lose track of the inode radix
> 
> > information at that level.
> >
> > What a about a super_operation?  That seems the right level, given
> > we're currently doing:
> >
> > inode->i_sb->s_bdev
> >
> > ...it does not seem terrible to instead do:
> >
> > inode->i_sb->s_op->get_block()

This seems promising.  I'll try and code it up.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 11:17             ` Jan Kara
  2016-02-02 16:33               ` Dan Williams
@ 2016-02-02 18:41               ` Ross Zwisler
  2016-02-02 18:53                 ` Ross Zwisler
  1 sibling, 1 reply; 66+ messages in thread
From: Ross Zwisler @ 2016-02-02 18:41 UTC (permalink / raw)
  To: Jan Kara
  Cc: Dave Chinner, Matthew Wilcox, Ross Zwisler, Christoph Hellwig,
	linux-kernel, Alexander Viro, Andrew Morton, Dan Williams,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Tue, Feb 02, 2016 at 12:17:23PM +0100, Jan Kara wrote:
> On Tue 02-02-16 08:47:30, Dave Chinner wrote:
> > On Mon, Feb 01, 2016 at 03:51:47PM +0100, Jan Kara wrote:
> > > On Sat 30-01-16 00:28:33, Matthew Wilcox wrote:
> > > > On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
> > > > > I guess I need to go off and understand if we can have DAX mappings on such a
> > > > > device.  If we can, we may have a problem - we can get the block_device from
> > > > > get_block() in I/O path and the various fault paths, but we don't have access
> > > > > to get_block() when flushing via dax_writeback_mapping_range().  We avoid
> > > > > needing it the normal case by storing the sector results from get_block() in
> > > > > the radix tree.
> > > > 
> > > > I think we're doing it wrong by storing the sector in the radix tree; we'd
> > > > really need to store both the sector and the bdev which is too much data.
> > > > 
> > > > If we store the PFN of the underlying page instead, we don't have this
> > > > problem.  Instead, we have a different problem; of the device going
> > > > away under us.  I'm trying to find the code which tears down PTEs when
> > > > the device goes away, and I'm not seeing it.  What do we do about user
> > > > mappings of the device?
> > > 
> > > So I don't have a strong opinion whether storing PFN or sector is better.
> > > Maybe PFN is somewhat more generic but OTOH turning DAX off for special
> > > cases like inodes on XFS RT devices would be IMHO fine.
> > 
> > We need to support alternate devices.
> > 
> > There is a strong case for using the XFS RT device with DAX,
> > especially for applications that know they are going to always use
> > large/huge/giant pages to access their data files. The XFS RT device
> > can guarantee allocation is always aligned to large/huge/giant page
> > constraints right up to ENOSPC and throughout the production life of
> > the filesystem. We have no other filesystem capable of providing
> > such guarantees, which means the XFS RT device is uniquely suited to
> > certain aplications with DAX...
> 
> I see, thanks for explanation. So I'm OK with changing what is stored in
> the radix tree to accommodate this use case but my reservation that we IHMO
> have other more pressing things to fix remains...

IMO this is pretty pressing - without it neither XFS RT devices nor DAX raw
block devices work.  The case has been made above for XFS RT devices, and with
DAX raw block devices we really need a fix because the current code will cause
a kernel BUG when a user tries to fsync/msync a raw block device mmap().  This
is especially bad because, unlike with filesystems where you mount with the
dax mount option, there is no opt-in step for raw block devices.

This has to be fixed - it seems like we either figure out how to fix DAX
fsync, or we have to disable DAX on raw block devices for a kernel cycle.  I'm
hoping for the former. :)

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 17:46                       ` Dan Williams
  2016-02-02 17:47                         ` Dan Williams
@ 2016-02-02 18:46                         ` Matthew Wilcox
  2016-02-02 18:59                           ` Dan Williams
  2016-02-03 11:09                           ` Jan Kara
  1 sibling, 2 replies; 66+ messages in thread
From: Matthew Wilcox @ 2016-02-02 18:46 UTC (permalink / raw)
  To: Dan Williams
  Cc: Ross Zwisler, Jan Kara, Dave Chinner, Christoph Hellwig,
	linux-kernel, Alexander Viro, Andrew Morton, Jan Kara,
	linux-fsdevel, linux-nvdimm

On Tue, Feb 02, 2016 at 09:46:21AM -0800, Dan Williams wrote:
> What a about a super_operation?  That seems the right level, given
> we're currently doing:
> 
> inode->i_sb->s_bdev
> 
> ...it does not seem terrible to instead do:
> 
> inode->i_sb->s_op->get_block()

The point is that filesystems have lots of different get_block operations,
and the right one to use depends not just on the inode, but also upon
what VFS function is being called, and in some filesystems the phase
of the moon, or the file open flags (so even inode->i_ops->get_block is
wrong; file->f_ops->get_block would be better, but of course we've lost
that by the point we're doing writeback).

I now realise that basing DAX around get_block & buffer_heads was a mistake.
I think the Right Solution (not for 4.5) is to ask filesystems to populate
the radix tree.  A flow somewhat like this:

1. VFS or VM calls filesystem (eg ->fault())
2. Filesystem calls DAX (eg dax_fault())
3. DAX looks in radix tree, finds no information.
4. DAX calls (NEW!) mapping->a_ops->populate_pfns
5. Filesystem looks up its internal data structure (eg extent tree) and
   calls dax_create_pfns() (see giant patch from yesterday, only instead of
   passing a get_block_t, the filesystem has already filled in a bh which
   describes the entire extent that this access happens to land in).
6. DAX continues to take care of calling bdev_direct_access() from
   dax_create_pfns().

After we have that step done, we can look at what it would take to
avoid calling bdev_direct_access for non-block-based filesystems.
That looks to me like just calling dax_add_pfn_entries() from their
->populate_pfns implementation.  And we introduce a CONFIG_BLOCK ifdef
around dax_create_pfns(), dax_clear_blocks() and dax_zero_page_range().
Or maybe modify dax_zero_page_range() to use the radix tree as above,
since it's probably a useful helper function.

Once we have buffer_head usage confined to a fairly small part of DAX,
we can look at replacing it with a more appropriate data structure with
better-defined contents.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 18:41               ` Ross Zwisler
@ 2016-02-02 18:53                 ` Ross Zwisler
  0 siblings, 0 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-02-02 18:53 UTC (permalink / raw)
  To: Ross Zwisler, Jan Kara, Dave Chinner, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Dan Williams, Jan Kara, linux-fsdevel, linux-nvdimm

On Tue, Feb 02, 2016 at 11:41:34AM -0700, Ross Zwisler wrote:
> On Tue, Feb 02, 2016 at 12:17:23PM +0100, Jan Kara wrote:
> > On Tue 02-02-16 08:47:30, Dave Chinner wrote:
> > > On Mon, Feb 01, 2016 at 03:51:47PM +0100, Jan Kara wrote:
> > > > On Sat 30-01-16 00:28:33, Matthew Wilcox wrote:
> > > > > On Fri, Jan 29, 2016 at 11:28:15AM -0700, Ross Zwisler wrote:
> > > > > > I guess I need to go off and understand if we can have DAX mappings on such a
> > > > > > device.  If we can, we may have a problem - we can get the block_device from
> > > > > > get_block() in I/O path and the various fault paths, but we don't have access
> > > > > > to get_block() when flushing via dax_writeback_mapping_range().  We avoid
> > > > > > needing it the normal case by storing the sector results from get_block() in
> > > > > > the radix tree.
> > > > > 
> > > > > I think we're doing it wrong by storing the sector in the radix tree; we'd
> > > > > really need to store both the sector and the bdev which is too much data.
> > > > > 
> > > > > If we store the PFN of the underlying page instead, we don't have this
> > > > > problem.  Instead, we have a different problem; of the device going
> > > > > away under us.  I'm trying to find the code which tears down PTEs when
> > > > > the device goes away, and I'm not seeing it.  What do we do about user
> > > > > mappings of the device?
> > > > 
> > > > So I don't have a strong opinion whether storing PFN or sector is better.
> > > > Maybe PFN is somewhat more generic but OTOH turning DAX off for special
> > > > cases like inodes on XFS RT devices would be IMHO fine.
> > > 
> > > We need to support alternate devices.
> > > 
> > > There is a strong case for using the XFS RT device with DAX,
> > > especially for applications that know they are going to always use
> > > large/huge/giant pages to access their data files. The XFS RT device
> > > can guarantee allocation is always aligned to large/huge/giant page
> > > constraints right up to ENOSPC and throughout the production life of
> > > the filesystem. We have no other filesystem capable of providing
> > > such guarantees, which means the XFS RT device is uniquely suited to
> > > certain aplications with DAX...
> > 
> > I see, thanks for explanation. So I'm OK with changing what is stored in
> > the radix tree to accommodate this use case but my reservation that we IHMO
> > have other more pressing things to fix remains...
> 
> IMO this is pretty pressing - without it neither XFS RT devices nor DAX raw
> block devices work.  The case has been made above for XFS RT devices, and with
> DAX raw block devices we really need a fix because the current code will cause
> a kernel BUG when a user tries to fsync/msync a raw block device mmap().  This
> is especially bad because, unlike with filesystems where you mount with the
> dax mount option, there is no opt-in step for raw block devices.
> 
> This has to be fixed - it seems like we either figure out how to fix DAX
> fsync, or we have to disable DAX on raw block devices for a kernel cycle.  I'm
> hoping for the former. :)

Well, I guess a third option would be to keep DAX raw block device in and just
take this patch as a temporary fix:

https://lkml.org/lkml/2016/1/28/679

This would leave XFS RT broken, though, so we may want to explicitly disable
DAX + XFS RT configs for now, but at least we wouldn't have the raw block
device kernel BUG.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 18:46                         ` Matthew Wilcox
@ 2016-02-02 18:59                           ` Dan Williams
  2016-02-02 20:14                             ` Matthew Wilcox
  2016-02-03 11:09                           ` Jan Kara
  1 sibling, 1 reply; 66+ messages in thread
From: Dan Williams @ 2016-02-02 18:59 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Ross Zwisler, Jan Kara, Dave Chinner, Christoph Hellwig,
	linux-kernel, Alexander Viro, Andrew Morton, Jan Kara,
	linux-fsdevel, linux-nvdimm

On Tue, Feb 2, 2016 at 10:46 AM, Matthew Wilcox <willy@linux.intel.com> wrote:
> On Tue, Feb 02, 2016 at 09:46:21AM -0800, Dan Williams wrote:
>> What a about a super_operation?  That seems the right level, given
>> we're currently doing:
>>
>> inode->i_sb->s_bdev
>>
>> ...it does not seem terrible to instead do:
>>
>> inode->i_sb->s_op->get_block()
>
> The point is that filesystems have lots of different get_block operations,
> and the right one to use depends not just on the inode, but also upon
> what VFS function is being called, and in some filesystems the phase
> of the moon, or the file open flags (so even inode->i_ops->get_block is
> wrong; file->f_ops->get_block would be better, but of course we've lost
> that by the point we're doing writeback).

True, but in this case we're just trying to resolve the bdev for a
inode / sector combination to already allocated storage.  So
get_block() is a misnomer, this is just get_bdev() to resolve a
super_block-inode+sector tuple to a bdev for cases when s_bdev is the
wrong answer.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 18:59                           ` Dan Williams
@ 2016-02-02 20:14                             ` Matthew Wilcox
  0 siblings, 0 replies; 66+ messages in thread
From: Matthew Wilcox @ 2016-02-02 20:14 UTC (permalink / raw)
  To: Dan Williams
  Cc: Ross Zwisler, Jan Kara, Dave Chinner, Christoph Hellwig,
	linux-kernel, Alexander Viro, Andrew Morton, Jan Kara,
	linux-fsdevel, linux-nvdimm

On Tue, Feb 02, 2016 at 10:59:41AM -0800, Dan Williams wrote:
> True, but in this case we're just trying to resolve the bdev for a
> inode / sector combination to already allocated storage.  So
> get_block() is a misnomer, this is just get_bdev() to resolve a
> super_block-inode+sector tuple to a bdev for cases when s_bdev is the
> wrong answer.

Then let's call it that.  i_ops->get_bdev(struct inode *inode, pgoff_t index)
And if it doesn't exist, use i_sb->s_bdev.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 16:51                   ` Dan Williams
@ 2016-02-02 21:46                     ` Jared Hulbert
  2016-02-03  0:34                       ` Matthew Wilcox
  0 siblings, 1 reply; 66+ messages in thread
From: Jared Hulbert @ 2016-02-02 21:46 UTC (permalink / raw)
  To: Dan Williams
  Cc: Dave Chinner, Jan Kara, Matthew Wilcox, Ross Zwisler,
	Christoph Hellwig, LKML, Alexander Viro, Andrew Morton, Jan Kara,
	Linux FS Devel, linux-nvdimm

On Tue, Feb 2, 2016 at 8:51 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Tue, Feb 2, 2016 at 12:05 AM, Jared Hulbert <jaredeh@gmail.com> wrote:
> [..]
>> Well... as CONFIG_BLOCK was not required with filemap_xip.c for a
>> decade.  This CONFIG_BLOCK dependency is a result of an incremental
>> feature from a certain point of view ;)
>>
>> The obvious 'driver' is physical RAM without a particular driver.
>> Remember please I'm talking about embedded.  RAM measured in MiB and
>> funky one off hardware etc.  In the embedded world there are lots of
>> ways that persistent memory has been supported in device specific ways
>> without the new fancypants NFIT and Intel instructions,so frankly
>> they don't fit in the PMEM stuff.  Maybe they could be supported in
>> PMEM but not without effort to bring embedded players to the table.
>
> Not sure what you're trying to say here.  An ACPI NFIT only feeds the
> generic libnvdimm device model.  You don't need NFIT to get pmem.

Right... I'm just not seeing how the libnvdimm device model fits, is
relevant, or useful to a persistent SRAM in embedded.  Therefore I
don't see some of the user will have a driver.

>> The other drivers are the MTD drivers, probably as read-only for now.
>> But the paradigm there isn't so different from what PMEM looks like
>> with asymmetric read/write capabilities.
>>
>> The filesystem I'm concerned with is AXFS
>> (https://www.kernel.org/doc/ols/2008/ols2008v1-pages-211-218.pdf).
>> Which I've been planning on trying to merge again due to a recent
>> resurgence of interest.  The device model for AXFS is... weird.  It
>> can use one or two devices at a time of any mix of NOR MTD, NAND MTD,
>> block, and unmanaged physical memory.  It's a terribly useful model
>> for embedded.  Anyway AXFS is readonly so hacking in a read only
>> dax_fault_nodev() and dax_file_read() would work fine, looks easy
>> enough.  But... it would be cool if similar small embedded focused RW
>> filesystems were enabled.
>
> Are those also out of tree?

Of course.  Merging embedded filesystems is little merging regular
filesystems except 98% of you reviewers don't want it merged.

>> I don't expect you to taint DAX with design requirements for this
>> stuff that it wasn't built for, nobody ends up happy in that case.
>> However, if enabling the filesystem to manage the bdev_direct_access()
>> interactions solves some of the "alternate device" problems you are
>> discussing here, then there is a chance we can accommodate both.
>> Sometimes that works.
>>
>> So... Forget CONFIG_BLOCK=n entirely I didn't want that to be the
>> focus anyway.  Does it help to support the weirder XFS and btrfs
>> device models to enable the filesystem to handle the
>> bdev_direct_access() stuff?
>
> It's not clear that it does.  We just clarified with xfs and ext4 that
> we can really on get_blocks().  That solves the immediate concern with
> multi-device filesystems.

IMO you're making DAX more complex by overly coupling to the bdev and
I think it could bite you later.  I submit this rework of the radix
tree and confusion about where to get the real bdev as evidence.  I'm
guessing that it won't be the last time.  It's unnecessary to couple
it like this, and in fact is not how the vfs has been layered in the
past.

The trouble with vfs work has been that it straddles the line between
mm and block, unfortunately that line is dark chasm with ill defined
boundaries.  DAX is even more exciting because it's trying to duct
tape the filesystem even closer to the mm system, one could argue it's
actually in some respects enabling the filesystem to bypass the mm
code.  On top of that DAX is designed to enable block based
filesystems to use RAM like devices.

Bolting the block device interface on to NVDIMM is a brilliant hack
and the right design choice, but it's still a hack.  The upside is it
enables the reuse of all this glorious legacy filesystem code which
does a pretty amazing job of handling what the pmem device
applications need considering they were designed to manage data on
platters of slow spinning rust.  How would DAX look like developed
with a filesystem purpose built for pmem?

To look at the the downside consider dax_fault().  Its called on a
fault to a user memory map, uses the filesystems get_block() to lookup
a sector so you can ask a block device to convert it to an address on
a DIMM.  Come on, that's awkward.  Everything around dax_fault() is
dripping with memory semantic interfaces, the dax_fault() call are
fundamentally about memory, the pmem calls are memory, the hardware is
memory, and yet it directly calls bdev_direct_access().  It's out of
place.

The legacy vfs/mm code didn't have this layering problem either.  Even
filemap_fault() that dax_fault() is modeled after doesn't call any
bdev methods directly, when it needs something it asks the filesystem
with a ->readpage().  The precedence is that you ask the filesystem
for what you need.  Look at the get_bdev() thing you've concluded you
need.  It _almost_ makes my point.  I just happen to be of the opinion
that you don't actually want or need the bdev, you want the pfn/kaddr
so you can flush or map or memcpy().

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 21:46                     ` Jared Hulbert
@ 2016-02-03  0:34                       ` Matthew Wilcox
  2016-02-03  1:21                         ` Jared Hulbert
  0 siblings, 1 reply; 66+ messages in thread
From: Matthew Wilcox @ 2016-02-03  0:34 UTC (permalink / raw)
  To: Jared Hulbert
  Cc: Dan Williams, Dave Chinner, Jan Kara, Ross Zwisler,
	Christoph Hellwig, LKML, Alexander Viro, Andrew Morton, Jan Kara,
	Linux FS Devel, linux-nvdimm

On Tue, Feb 02, 2016 at 01:46:06PM -0800, Jared Hulbert wrote:
> On Tue, Feb 2, 2016 at 8:51 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> >> The filesystem I'm concerned with is AXFS
> >> (https://www.kernel.org/doc/ols/2008/ols2008v1-pages-211-218.pdf).
> >> Which I've been planning on trying to merge again due to a recent
> >> resurgence of interest.  The device model for AXFS is... weird.  It
> >> can use one or two devices at a time of any mix of NOR MTD, NAND MTD,
> >> block, and unmanaged physical memory.  It's a terribly useful model
> >> for embedded.  Anyway AXFS is readonly so hacking in a read only
> >> dax_fault_nodev() and dax_file_read() would work fine, looks easy
> >> enough.  But... it would be cool if similar small embedded focused RW
> >> filesystems were enabled.
> >
> > Are those also out of tree?
> 
> Of course.  Merging embedded filesystems is little merging regular
> filesystems except 98% of you reviewers don't want it merged.

You should at least be able to get it into staging these days.  I mean,
look at some of the junk that's in staging ... and I don't think AXFS was
nearly as bad.

> IMO you're making DAX more complex by overly coupling to the bdev and
> I think it could bite you later.  I submit this rework of the radix
> tree and confusion about where to get the real bdev as evidence.  I'm
> guessing that it won't be the last time.  It's unnecessary to couple
> it like this, and in fact is not how the vfs has been layered in the
> past.

Huh?  The rework to use the radix tree for PFNs was done with one eye
firmly on your usage case.  Just because I had to thread the get_block
interface through it for the moment doesn't mean that I didn't have
the "how do we get rid of get_block entirely" question on my mind.

Using get_block seemed like the right idea three years ago.  I didn't
know just how fundamentally ext4 and XFS disagree on how it should be
used.

> To look at the the downside consider dax_fault().  Its called on a
> fault to a user memory map, uses the filesystems get_block() to lookup
> a sector so you can ask a block device to convert it to an address on
> a DIMM.  Come on, that's awkward.  Everything around dax_fault() is
> dripping with memory semantic interfaces, the dax_fault() call are
> fundamentally about memory, the pmem calls are memory, the hardware is
> memory, and yet it directly calls bdev_direct_access().  It's out of
> place.

What was out of place was the old 'get_xip_mem' in address_space
operations.  Returning a kernel virtual address and a PFN from a
filesystem operation?  That looks awful.  All the other operations deal
in struct pages, file offsets and occasionally sectors.  Of course, we
don't have a struct page, so a pfn makes sense, but the kernel virtual
address being returned was a gargantuan layering problem.

> The legacy vfs/mm code didn't have this layering problem either.  Even
> filemap_fault() that dax_fault() is modeled after doesn't call any
> bdev methods directly, when it needs something it asks the filesystem
> with a ->readpage().  The precedence is that you ask the filesystem
> for what you need.  Look at the get_bdev() thing you've concluded you
> need.  It _almost_ makes my point.  I just happen to be of the opinion
> that you don't actually want or need the bdev, you want the pfn/kaddr
> so you can flush or map or memcpy().

You want the pfn.  The device driver doesn't have enough information to
give you a (coherent with userspace) kaddr.  That's what (some future
arch-specific implementation of) dax_map_pfn() is for.  That's why it
takes 'index' as a parameter, so you can calculate where it'll be mapped
in userspace, and determine an appropriate kernel virtual address to
use for it.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-03  0:34                       ` Matthew Wilcox
@ 2016-02-03  1:21                         ` Jared Hulbert
  0 siblings, 0 replies; 66+ messages in thread
From: Jared Hulbert @ 2016-02-03  1:21 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Dan Williams, Dave Chinner, Jan Kara, Ross Zwisler,
	Christoph Hellwig, LKML, Alexander Viro, Andrew Morton, Jan Kara,
	Linux FS Devel, linux-nvdimm

On Tue, Feb 2, 2016 at 4:34 PM, Matthew Wilcox <willy@linux.intel.com> wrote:
> On Tue, Feb 02, 2016 at 01:46:06PM -0800, Jared Hulbert wrote:
>> On Tue, Feb 2, 2016 at 8:51 AM, Dan Williams <dan.j.williams@intel.com> wrote:
>> >> The filesystem I'm concerned with is AXFS
>> >> (https://www.kernel.org/doc/ols/2008/ols2008v1-pages-211-218.pdf).
>> >> Which I've been planning on trying to merge again due to a recent
>> >> resurgence of interest.  The device model for AXFS is... weird.  It
>> >> can use one or two devices at a time of any mix of NOR MTD, NAND MTD,
>> >> block, and unmanaged physical memory.  It's a terribly useful model
>> >> for embedded.  Anyway AXFS is readonly so hacking in a read only
>> >> dax_fault_nodev() and dax_file_read() would work fine, looks easy
>> >> enough.  But... it would be cool if similar small embedded focused RW
>> >> filesystems were enabled.
>> >
>> > Are those also out of tree?
>>
>> Of course.  Merging embedded filesystems is little merging regular
>> filesystems except 98% of you reviewers don't want it merged.
>
> You should at least be able to get it into staging these days.  I mean,
> look at some of the junk that's in staging ... and I don't think AXFS was
> nearly as bad.

Thanks....? ;)

>> IMO you're making DAX more complex by overly coupling to the bdev and
>> I think it could bite you later.  I submit this rework of the radix
>> tree and confusion about where to get the real bdev as evidence.  I'm
>> guessing that it won't be the last time.  It's unnecessary to couple
>> it like this, and in fact is not how the vfs has been layered in the
>> past.
>
> Huh?  The rework to use the radix tree for PFNs was done with one eye
> firmly on your usage case.  Just because I had to thread the get_block
> interface through it for the moment doesn't mean that I didn't have
> the "how do we get rid of get_block entirely" question on my mind.

Oh yeah.  I think we're on the same page.  But I'm not sure Dan is.  I
get the need to phase this in too.

> Using get_block seemed like the right idea three years ago.  I didn't
> know just how fundamentally ext4 and XFS disagree on how it should be
> used.

Sure.  I can see that.

>> To look at the the downside consider dax_fault().  Its called on a
>> fault to a user memory map, uses the filesystems get_block() to lookup
>> a sector so you can ask a block device to convert it to an address on
>> a DIMM.  Come on, that's awkward.  Everything around dax_fault() is
>> dripping with memory semantic interfaces, the dax_fault() call are
>> fundamentally about memory, the pmem calls are memory, the hardware is
>> memory, and yet it directly calls bdev_direct_access().  It's out of
>> place.
>
> What was out of place was the old 'get_xip_mem' in address_space
> operations.  Returning a kernel virtual address and a PFN from a
> filesystem operation?  That looks awful.

Yes.  Yes it does!  But at least my big hack was just one line. ;)
Nobody really even seemed to notice at the time.

>  All the other operations deal
> in struct pages, file offsets and occasionally sectors.  Of course, we
> don't have a struct page, so a pfn makes sense, but the kernel virtual
> address being returned was a gargantuan layering problem.

Well yes, but it was an expedient hack.

>> The legacy vfs/mm code didn't have this layering problem either.  Even
>> filemap_fault() that dax_fault() is modeled after doesn't call any
>> bdev methods directly, when it needs something it asks the filesystem
>> with a ->readpage().  The precedence is that you ask the filesystem
>> for what you need.  Look at the get_bdev() thing you've concluded you
>> need.  It _almost_ makes my point.  I just happen to be of the opinion
>> that you don't actually want or need the bdev, you want the pfn/kaddr
>> so you can flush or map or memcpy().
>
> You want the pfn.  The device driver doesn't have enough information to
> give you a (coherent with userspace) kaddr.  That's what (some future
> arch-specific implementation of) dax_map_pfn() is for.  That's why it
> takes 'index' as a parameter, so you can calculate where it'll be mapped
> in userspace, and determine an appropriate kernel virtual address to
> use for it.

Oh.... I think I'm just beginning to catch your vision for
dax_map_pfn().  I still don't get why we can't just do semi-arch
specific flushing instead of the alignment thing.  But that just might
be epic ignorance on my part.  Either way flush or magic alignments
dax_(un)map_pfn() would handle it, right?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 17:34                     ` Ross Zwisler
  2016-02-02 17:46                       ` Dan Williams
@ 2016-02-03 10:46                       ` Jan Kara
  2016-02-03 20:13                         ` Ross Zwisler
  2016-02-04 19:56                         ` Ross Zwisler
  1 sibling, 2 replies; 66+ messages in thread
From: Jan Kara @ 2016-02-03 10:46 UTC (permalink / raw)
  To: Ross Zwisler
  Cc: Dan Williams, Jan Kara, Dave Chinner, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Tue 02-02-16 10:34:56, Ross Zwisler wrote:
> On Tue, Feb 02, 2016 at 09:10:24AM -0800, Dan Williams wrote:
> > On Tue, Feb 2, 2016 at 8:46 AM, Jan Kara <jack@suse.cz> wrote:
> > > On Tue 02-02-16 08:33:56, Dan Williams wrote:
> > >> On Tue, Feb 2, 2016 at 3:17 AM, Jan Kara <jack@suse.cz> wrote:
> > >> [..]
> > >> > I see, thanks for explanation. So I'm OK with changing what is stored in
> > >> > the radix tree to accommodate this use case but my reservation that we IHMO
> > >> > have other more pressing things to fix remains...
> > >>
> > >> We don't need pfns in the radix to support XFS RT configurations.
> > >> Just call get_blocks() again and use the sector, or am I missing
> > >> something?
> > >
> > > You are correct. But if you decide to pay the cost of additional
> > > get_block() call, you only need the dirty tag in the radix tree and nothing
> > > else. So my understanding was that the whole point of games with radix tree
> > > is avoiding this extra get_block() calls for fsync().
> > >
> > 
> > DAX-fsync() is already a potentially expensive operation to cover data
> > durability guarantees for DAX-unaware applications.  A DAX-aware
> > application is going to skip fsync, and the get_blocks() cost, to do
> > cache management itself.
> > 
> > Willy pointed out some other potential benefits, assuming a suitable
> > replacement for the protections afforded by the block-device
> > percpu_ref counter can be found.  However, optimizing for the
> > DAX-unaware-application case seems the wrong motivation to me.
> 
> Oh, no, the primary issue with calling get_block() in the fsync path isn't
> performance.  It's that we don't have any idea what get_block() function to
> call.
> 
> The fault handler calls all come from the filesystem directly, so they can
> easily give us an appropriate get_block() function pointer.  But the
> dax_writeback_mapping_range() calls come from the generic code in
> mm/filemap.c, and don't know what get_block() to pass in.
> 
> During one iteration I had the calls to dax_writeback_mapping_range()
> happening in the filesystem fsync code so that it could pass in get_block(),
> but Dave Chinner pointed out that this misses other paths in the filesystem
> that need to have things flushed via a call to filemap_write_and_wait_range().

Let's clear this up a bit: The problem with using ->fsync() method is that
it doesn't get called for sync(2). We could use ->sync_fs() to flush caches
in case of sync(2) (that's what's happening for normal storage) but the
problem with PMEM is that "flush all cached data" operation effectively
means iterate through all modified pages and we didn't want to implement
this for DAX fsync code.

So we have decided to do cache flushing for DAX at a different point - mark
inodes which may have writes cached as dirty and use writeback code for the
cache flushing. But looking at it now, we have actually chosen a wrong
place to do the flushing in the writeback path - note that sync(2) writes
data via __writeback_single_inode() -> do_writepages() and thus doesn't
even get to filemap_write_and_wait().

So revisiting the decision I see two options:

1) Move the DAX flushing code from filemap_write_and_wait() into
->writepages() fs callback. There the filesystem can provide all the
information it needs including bdev, get_block callback, or whatever.

2) Back out even further and implement own tracking and iteration of inodes
to write.

So far I still think 2) is not worth the complexity (although it would
bring DAX code closer to how things behave with standard storage) so I
would go for 1).

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-02 18:46                         ` Matthew Wilcox
  2016-02-02 18:59                           ` Dan Williams
@ 2016-02-03 11:09                           ` Jan Kara
  1 sibling, 0 replies; 66+ messages in thread
From: Jan Kara @ 2016-02-03 11:09 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Dan Williams, Ross Zwisler, Jan Kara, Dave Chinner,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Tue 02-02-16 13:46:43, Matthew Wilcox wrote:
> On Tue, Feb 02, 2016 at 09:46:21AM -0800, Dan Williams wrote:
> > What a about a super_operation?  That seems the right level, given
> > we're currently doing:
> > 
> > inode->i_sb->s_bdev
> > 
> > ...it does not seem terrible to instead do:
> > 
> > inode->i_sb->s_op->get_block()
> 
> The point is that filesystems have lots of different get_block operations,
> and the right one to use depends not just on the inode, but also upon
> what VFS function is being called, and in some filesystems the phase
> of the moon, or the file open flags (so even inode->i_ops->get_block is
> wrong; file->f_ops->get_block would be better, but of course we've lost
> that by the point we're doing writeback).

See what I wrote to Ross. I think this particular issue needs to be solved
by moving the flushing to ->writepages() callback.

> I now realise that basing DAX around get_block & buffer_heads was a mistake.
> I think the Right Solution (not for 4.5) is to ask filesystems to populate
> the radix tree.  A flow somewhat like this:
> 
> 1. VFS or VM calls filesystem (eg ->fault())
> 2. Filesystem calls DAX (eg dax_fault())
> 3. DAX looks in radix tree, finds no information.
> 4. DAX calls (NEW!) mapping->a_ops->populate_pfns
> 5. Filesystem looks up its internal data structure (eg extent tree) and
>    calls dax_create_pfns() (see giant patch from yesterday, only instead of
>    passing a get_block_t, the filesystem has already filled in a bh which
>    describes the entire extent that this access happens to land in).
> 6. DAX continues to take care of calling bdev_direct_access() from
>    dax_create_pfns().
 
So I don't think that ->populate_pfns() is the right interface because it
doesn't really tell the filesystem what you want to do. It is essentially
like get_blocks() callback only you additionaly ask the fs to fill in the
mapping information into the radix tree. So it has the same problems as
get_blocks() callback in inode_operations (or superblock_operations,
aops, or anywhere else). History has taught us (there was get_blocks()
callback in inode operations in ancient times ;) that fs really needs to
know wider context to decide how exactly to fulfill the request.

I don't see anything obviously wrong with using radix tree as a primary
source of mapping information for DAX (after all we do that for page cache
as well where the mapping information is attached to pages in the radix
tree). But this seems independent of the get_blocks() vs something else
discussion.

And if your problem is with vaguely defined meaning of buffer_head flags
returned from get_blocks() callback, using the iomap interface (which XFS
currently uses for pNFS) would solve that.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-03 10:46                       ` Jan Kara
@ 2016-02-03 20:13                         ` Ross Zwisler
  2016-02-04  9:15                           ` Jan Kara
  2016-02-04 19:56                         ` Ross Zwisler
  1 sibling, 1 reply; 66+ messages in thread
From: Ross Zwisler @ 2016-02-03 20:13 UTC (permalink / raw)
  To: Jan Kara, Dave Chinner
  Cc: Ross Zwisler, Dan Williams, Matthew Wilcox, Christoph Hellwig,
	linux-kernel, Alexander Viro, Andrew Morton, linux-fsdevel,
	linux-nvdimm

On Wed, Feb 03, 2016 at 11:46:11AM +0100, Jan Kara wrote:
> On Tue 02-02-16 10:34:56, Ross Zwisler wrote:
<>
> > Oh, no, the primary issue with calling get_block() in the fsync path isn't
> > performance.  It's that we don't have any idea what get_block() function to
> > call.
> > 
> > The fault handler calls all come from the filesystem directly, so they can
> > easily give us an appropriate get_block() function pointer.  But the
> > dax_writeback_mapping_range() calls come from the generic code in
> > mm/filemap.c, and don't know what get_block() to pass in.
> > 
> > During one iteration I had the calls to dax_writeback_mapping_range()
> > happening in the filesystem fsync code so that it could pass in get_block(),
> > but Dave Chinner pointed out that this misses other paths in the filesystem
> > that need to have things flushed via a call to filemap_write_and_wait_range().
> 
> Let's clear this up a bit: The problem with using ->fsync() method is that
> it doesn't get called for sync(2). 

Ugh, yep.  We need to correct this.

> We could use ->sync_fs() to flush caches
> in case of sync(2) (that's what's happening for normal storage) but the
> problem with PMEM is that "flush all cached data" operation effectively
> means iterate through all modified pages and we didn't want to implement
> this for DAX fsync code.

I'm not sure what you mean by this - this is exactly what we implemented for
the DAX fsync code?  We iterate through all dirty pages, as recorded in the
radix tree, and flush them to media.

It seems like we could and should do the same thing for sync(2) and syncfs()?

> So we have decided to do cache flushing for DAX at a different point - mark
> inodes which may have writes cached as dirty and use writeback code for the
> cache flushing. But looking at it now, we have actually chosen a wrong
> place to do the flushing in the writeback path - note that sync(2) writes
> data via __writeback_single_inode() -> do_writepages() and thus doesn't
> even get to filemap_write_and_wait().

True - but I think we have to basically add another case for sync() regardless
of what we do, since AFAICS the fsync() and sync() paths never intersect.  So,
if we call into the DAX code at the filesystem level we'll need to do it in
both ->fsync and ->sync_fs/->writepages, and if we do it in common MM code
we'll need to do it in filemap_write_and_wait_range() and do_writepages() or
similar.

Here is the comment from Dave Chinner that had me move to having the calls to
dax_writeback_mapping_range() into the generic mm code:

   > Lastly, this flushing really needs to be inside
   > filemap_write_and_wait_range(), because we call the writeback code
   > from many more places than just fsync to ensure ordering of various
   > operations such that files are in known state before proceeding
   > (e.g. hole punch).

https://lkml.org/lkml/2015/11/16/847

Both ext4 & XFS call filemap_write_and_wait_range() outside of ->fsync for
hole punch, truncate, and block relocation (xfs_shift_file_space() &&
ext4_collapse_range()/ext4_insert_range()).

I think having DAX special case code sprinkled over all these call sites seems
like an indication that we've made a bad choice, and that we should centralize
in the mm layer with filemap_write_and_wait_range() and do_writepages().

OTOH, I think that it might be true that we don't actually need to cover all
these non-fsync/sync cases.  In the page cache case when we have dirty data in
the page cache, that data will be actively lost if we evict a dirty page cache
page without flushing it to media first.  For DAX, though, the data will
remain consistent with the physical address to which it was written regardless
of whether it's in the processor cache or not - really the only reason I see
to flush is in response to a fsync/msync/sync/syncfs so that our data is
durable on media in case of a power loss.  The case where we could throw dirty
data out of the page cache and essentially lose writes simply doesn't exist.

Does this sound correct, or am I missing something?

> So revisiting the decision I see two options:
> 
> 1) Move the DAX flushing code from filemap_write_and_wait() into
> ->writepages() fs callback. There the filesystem can provide all the
> information it needs including bdev, get_block callback, or whatever.

This seems fine as long as we add it to ->fsync as well since ->writepages is
never called in that path, and as long as we are okay with skipping DAX
writebacks on hole punch, truncate, and block relocation.

> 2) Back out even further and implement own tracking and iteration of inodes
> to write.
> 
> So far I still think 2) is not worth the complexity (although it would
> bring DAX code closer to how things behave with standard storage) so I
> would go for 1).

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-03 20:13                         ` Ross Zwisler
@ 2016-02-04  9:15                           ` Jan Kara
  2016-02-04 23:38                             ` Ross Zwisler
  2016-02-06 23:15                             ` Dave Chinner
  0 siblings, 2 replies; 66+ messages in thread
From: Jan Kara @ 2016-02-04  9:15 UTC (permalink / raw)
  To: Ross Zwisler
  Cc: Jan Kara, Dave Chinner, Dan Williams, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	linux-fsdevel, linux-nvdimm

On Wed 03-02-16 13:13:28, Ross Zwisler wrote:
> On Wed, Feb 03, 2016 at 11:46:11AM +0100, Jan Kara wrote:
> > On Tue 02-02-16 10:34:56, Ross Zwisler wrote:
> <>
> > > Oh, no, the primary issue with calling get_block() in the fsync path isn't
> > > performance.  It's that we don't have any idea what get_block() function to
> > > call.
> > > 
> > > The fault handler calls all come from the filesystem directly, so they can
> > > easily give us an appropriate get_block() function pointer.  But the
> > > dax_writeback_mapping_range() calls come from the generic code in
> > > mm/filemap.c, and don't know what get_block() to pass in.
> > > 
> > > During one iteration I had the calls to dax_writeback_mapping_range()
> > > happening in the filesystem fsync code so that it could pass in get_block(),
> > > but Dave Chinner pointed out that this misses other paths in the filesystem
> > > that need to have things flushed via a call to filemap_write_and_wait_range().
> > 
> > Let's clear this up a bit: The problem with using ->fsync() method is that
> > it doesn't get called for sync(2). 
> 
> Ugh, yep.  We need to correct this.
> 
> > We could use ->sync_fs() to flush caches
> > in case of sync(2) (that's what's happening for normal storage) but the
> > problem with PMEM is that "flush all cached data" operation effectively
> > means iterate through all modified pages and we didn't want to implement
> > this for DAX fsync code.
> 
> I'm not sure what you mean by this - this is exactly what we implemented for
> the DAX fsync code?  We iterate through all dirty pages, as recorded in the
> radix tree, and flush them to media.

I by "iterate through all modified pages" I mean "iterate through all
inodes with modified pages and for each inode iterate through all modified
pages". The "iterate through all inodes" part is actually the one which
adds to complexity - either you resort to iterating over all inodes
attached to sb->s_inodes_list which is unnecessarily slow (there can be
milions of inodes there and you may need to flush only a couple of them)
or you have to somehow track which inodes need flushing and so you have
another inode list to manage.

By marking inodes that need flushing dirty and treating indexes to flush as
dirty pages, we can use all the writeback & dirty inode tracking machinery
to track and iterate through dirty inodes for us.

> It seems like we could and should do the same thing for sync(2) and syncfs()?
> 
> > So we have decided to do cache flushing for DAX at a different point - mark
> > inodes which may have writes cached as dirty and use writeback code for the
> > cache flushing. But looking at it now, we have actually chosen a wrong
> > place to do the flushing in the writeback path - note that sync(2) writes
> > data via __writeback_single_inode() -> do_writepages() and thus doesn't
> > even get to filemap_write_and_wait().
> 
> True - but I think we have to basically add another case for sync()
> regardless of what we do, since AFAICS the fsync() and sync() paths never
> intersect.  So, if we call into the DAX code at the filesystem level
> we'll need to do it in both ->fsync and ->sync_fs/->writepages, and if we
> do it in common MM code we'll need to do it in
> filemap_write_and_wait_range() and do_writepages() or similar.

Both fsync(2) and sync(2) paths end up in do_writepages() and consequently
->writepages() callback. That's why I suggested moving calls to DAX code
into ->writepages(). The only trouble is that sometimes we have a
performance optimization checks for (mapping->nrpages > 0) which get in the
way. We need to update those to:

	(mapping->nrpages > 0 || (IS_DAX(inode) && mapping->nrexceptional > 0))

Probably we should introduce a helper function for this check so that
people adding new checks won't forget about DAX.

> Here is the comment from Dave Chinner that had me move to having the calls to
> dax_writeback_mapping_range() into the generic mm code:
> 
>    > Lastly, this flushing really needs to be inside
>    > filemap_write_and_wait_range(), because we call the writeback code
>    > from many more places than just fsync to ensure ordering of various
>    > operations such that files are in known state before proceeding
>    > (e.g. hole punch).
> 
> https://lkml.org/lkml/2015/11/16/847
> 
> Both ext4 & XFS call filemap_write_and_wait_range() outside of ->fsync for
> hole punch, truncate, and block relocation (xfs_shift_file_space() &&
> ext4_collapse_range()/ext4_insert_range()).
> 
> I think having DAX special case code sprinkled over all these call sites seems
> like an indication that we've made a bad choice, and that we should centralize
> in the mm layer with filemap_write_and_wait_range() and do_writepages().
> 
> OTOH, I think that it might be true that we don't actually need to cover all
> these non-fsync/sync cases.  In the page cache case when we have dirty data in
> the page cache, that data will be actively lost if we evict a dirty page cache
> page without flushing it to media first.  For DAX, though, the data will
> remain consistent with the physical address to which it was written regardless
> of whether it's in the processor cache or not - really the only reason I see
> to flush is in response to a fsync/msync/sync/syncfs so that our data is
> durable on media in case of a power loss.  The case where we could throw dirty
> data out of the page cache and essentially lose writes simply doesn't exist.
> 
> Does this sound correct, or am I missing something?

Yes, you are right. filemap_write_and_wait_range() actually doesn't
guarantee data durability. That function only means all dirty data has been
sent to storage and the storage has acknowledged them. This is noop for
PMEM. So we are perfectly fine ignoring calls to
filemap_write_and_wait_range(). What guarantees data durability are only
->fsync() and ->sync_fs() calls. But some code could get upset by seeing
that filemap_write_and_wait_range() didn't actually get rid of dirty pages
(in some special cases like inode eviction or similar). That's why I'd
choose one of the two options for consistency:

1) Treat inode indexes to flush as close to dirty pages as we can - this
means inode is dirty with all the tracking associated with it, radix tree
entries have dirty tag, we get rid of these in ->writepages(). We are close
to this currently.

2) Completely avoid the dirty tracking and writeback code and reimplement
everything in DAX code.

Because some hybrid between these is IMHO bound to provoke weird (and very
hard to find) bugs.

> > So revisiting the decision I see two options:
> > 
> > 1) Move the DAX flushing code from filemap_write_and_wait() into
> > ->writepages() fs callback. There the filesystem can provide all the
> > information it needs including bdev, get_block callback, or whatever.
> 
> This seems fine as long as we add it to ->fsync as well since ->writepages is
> never called in that path, and as long as we are okay with skipping DAX
> writebacks on hole punch, truncate, and block relocation.

Look at ext4_sync_file() -> filemap_write_and_wait_range() ->
__filemap_fdatawrite_range() -> do_writepages(). Except those nrpages > 0
checks which would need to be changed.

> > 2) Back out even further and implement own tracking and iteration of inodes
> > to write.
> > 
> > So far I still think 2) is not worth the complexity (although it would
> > bring DAX code closer to how things behave with standard storage) so I
> > would go for 1).

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-03 10:46                       ` Jan Kara
  2016-02-03 20:13                         ` Ross Zwisler
@ 2016-02-04 19:56                         ` Ross Zwisler
  2016-02-04 20:29                           ` Jan Kara
  1 sibling, 1 reply; 66+ messages in thread
From: Ross Zwisler @ 2016-02-04 19:56 UTC (permalink / raw)
  To: Jan Kara
  Cc: Ross Zwisler, Dan Williams, Dave Chinner, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Wed, Feb 03, 2016 at 11:46:11AM +0100, Jan Kara wrote:
> On Tue 02-02-16 10:34:56, Ross Zwisler wrote:
> > On Tue, Feb 02, 2016 at 09:10:24AM -0800, Dan Williams wrote:
> > > On Tue, Feb 2, 2016 at 8:46 AM, Jan Kara <jack@suse.cz> wrote:
> > > > On Tue 02-02-16 08:33:56, Dan Williams wrote:
> > > >> On Tue, Feb 2, 2016 at 3:17 AM, Jan Kara <jack@suse.cz> wrote:
> > > >> [..]
> > > >> > I see, thanks for explanation. So I'm OK with changing what is stored in
> > > >> > the radix tree to accommodate this use case but my reservation that we IHMO
> > > >> > have other more pressing things to fix remains...
> > > >>
> > > >> We don't need pfns in the radix to support XFS RT configurations.
> > > >> Just call get_blocks() again and use the sector, or am I missing
> > > >> something?
> > > >
> > > > You are correct. But if you decide to pay the cost of additional
> > > > get_block() call, you only need the dirty tag in the radix tree and nothing
> > > > else. So my understanding was that the whole point of games with radix tree
> > > > is avoiding this extra get_block() calls for fsync().
> > > >
> > > 
> > > DAX-fsync() is already a potentially expensive operation to cover data
> > > durability guarantees for DAX-unaware applications.  A DAX-aware
> > > application is going to skip fsync, and the get_blocks() cost, to do
> > > cache management itself.
> > > 
> > > Willy pointed out some other potential benefits, assuming a suitable
> > > replacement for the protections afforded by the block-device
> > > percpu_ref counter can be found.  However, optimizing for the
> > > DAX-unaware-application case seems the wrong motivation to me.
> > 
> > Oh, no, the primary issue with calling get_block() in the fsync path isn't
> > performance.  It's that we don't have any idea what get_block() function to
> > call.
> > 
> > The fault handler calls all come from the filesystem directly, so they can
> > easily give us an appropriate get_block() function pointer.  But the
> > dax_writeback_mapping_range() calls come from the generic code in
> > mm/filemap.c, and don't know what get_block() to pass in.
> > 
> > During one iteration I had the calls to dax_writeback_mapping_range()
> > happening in the filesystem fsync code so that it could pass in get_block(),
> > but Dave Chinner pointed out that this misses other paths in the filesystem
> > that need to have things flushed via a call to filemap_write_and_wait_range().
> 
> Let's clear this up a bit: The problem with using ->fsync() method is that
> it doesn't get called for sync(2). We could use ->sync_fs() to flush caches
> in case of sync(2) (that's what's happening for normal storage) but the
> problem with PMEM is that "flush all cached data" operation effectively
> means iterate through all modified pages and we didn't want to implement
> this for DAX fsync code.
> 
> So we have decided to do cache flushing for DAX at a different point - mark
> inodes which may have writes cached as dirty and use writeback code for the
> cache flushing. But looking at it now, we have actually chosen a wrong
> place to do the flushing in the writeback path - note that sync(2) writes
> data via __writeback_single_inode() -> do_writepages() and thus doesn't
> even get to filemap_write_and_wait().
> 
> So revisiting the decision I see two options:
> 
> 1) Move the DAX flushing code from filemap_write_and_wait() into
> ->writepages() fs callback. There the filesystem can provide all the
> information it needs including bdev, get_block callback, or whatever.
> 
> 2) Back out even further and implement own tracking and iteration of inodes
> to write.
> 
> So far I still think 2) is not worth the complexity (although it would
> bring DAX code closer to how things behave with standard storage) so I
> would go for 1).

Jan, just to clarify, are you proposing this change for v4.5 in the remaining
RCs as an alternative to the get_bdev() patch?

https://lkml.org/lkml/2016/2/2/941

Or can we move forward with get_bdev(), and try and figure out this new way of
calling fsync/msync for v4.6?  My main concern here is that changing how the
DAX sync code gets called will affect all three filesystems as well as MM, and
that it might be too much for RC inclusion...

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-04 19:56                         ` Ross Zwisler
@ 2016-02-04 20:29                           ` Jan Kara
  2016-02-04 22:19                             ` Ross Zwisler
  2016-02-05 22:25                             ` Ross Zwisler
  0 siblings, 2 replies; 66+ messages in thread
From: Jan Kara @ 2016-02-04 20:29 UTC (permalink / raw)
  To: Ross Zwisler
  Cc: Jan Kara, Dan Williams, Dave Chinner, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Thu 04-02-16 12:56:19, Ross Zwisler wrote:
> On Wed, Feb 03, 2016 at 11:46:11AM +0100, Jan Kara wrote:
> > On Tue 02-02-16 10:34:56, Ross Zwisler wrote:
> > > On Tue, Feb 02, 2016 at 09:10:24AM -0800, Dan Williams wrote:
> > > > On Tue, Feb 2, 2016 at 8:46 AM, Jan Kara <jack@suse.cz> wrote:
> > > > > On Tue 02-02-16 08:33:56, Dan Williams wrote:
> > > > >> On Tue, Feb 2, 2016 at 3:17 AM, Jan Kara <jack@suse.cz> wrote:
> > > > >> [..]
> > > > >> > I see, thanks for explanation. So I'm OK with changing what is stored in
> > > > >> > the radix tree to accommodate this use case but my reservation that we IHMO
> > > > >> > have other more pressing things to fix remains...
> > > > >>
> > > > >> We don't need pfns in the radix to support XFS RT configurations.
> > > > >> Just call get_blocks() again and use the sector, or am I missing
> > > > >> something?
> > > > >
> > > > > You are correct. But if you decide to pay the cost of additional
> > > > > get_block() call, you only need the dirty tag in the radix tree and nothing
> > > > > else. So my understanding was that the whole point of games with radix tree
> > > > > is avoiding this extra get_block() calls for fsync().
> > > > >
> > > > 
> > > > DAX-fsync() is already a potentially expensive operation to cover data
> > > > durability guarantees for DAX-unaware applications.  A DAX-aware
> > > > application is going to skip fsync, and the get_blocks() cost, to do
> > > > cache management itself.
> > > > 
> > > > Willy pointed out some other potential benefits, assuming a suitable
> > > > replacement for the protections afforded by the block-device
> > > > percpu_ref counter can be found.  However, optimizing for the
> > > > DAX-unaware-application case seems the wrong motivation to me.
> > > 
> > > Oh, no, the primary issue with calling get_block() in the fsync path isn't
> > > performance.  It's that we don't have any idea what get_block() function to
> > > call.
> > > 
> > > The fault handler calls all come from the filesystem directly, so they can
> > > easily give us an appropriate get_block() function pointer.  But the
> > > dax_writeback_mapping_range() calls come from the generic code in
> > > mm/filemap.c, and don't know what get_block() to pass in.
> > > 
> > > During one iteration I had the calls to dax_writeback_mapping_range()
> > > happening in the filesystem fsync code so that it could pass in get_block(),
> > > but Dave Chinner pointed out that this misses other paths in the filesystem
> > > that need to have things flushed via a call to filemap_write_and_wait_range().
> > 
> > Let's clear this up a bit: The problem with using ->fsync() method is that
> > it doesn't get called for sync(2). We could use ->sync_fs() to flush caches
> > in case of sync(2) (that's what's happening for normal storage) but the
> > problem with PMEM is that "flush all cached data" operation effectively
> > means iterate through all modified pages and we didn't want to implement
> > this for DAX fsync code.
> > 
> > So we have decided to do cache flushing for DAX at a different point - mark
> > inodes which may have writes cached as dirty and use writeback code for the
> > cache flushing. But looking at it now, we have actually chosen a wrong
> > place to do the flushing in the writeback path - note that sync(2) writes
> > data via __writeback_single_inode() -> do_writepages() and thus doesn't
> > even get to filemap_write_and_wait().
> > 
> > So revisiting the decision I see two options:
> > 
> > 1) Move the DAX flushing code from filemap_write_and_wait() into
> > ->writepages() fs callback. There the filesystem can provide all the
> > information it needs including bdev, get_block callback, or whatever.
> > 
> > 2) Back out even further and implement own tracking and iteration of inodes
> > to write.
> > 
> > So far I still think 2) is not worth the complexity (although it would
> > bring DAX code closer to how things behave with standard storage) so I
> > would go for 1).
> 
> Jan, just to clarify, are you proposing this change for v4.5 in the remaining
> RCs as an alternative to the get_bdev() patch?
> 
> https://lkml.org/lkml/2016/2/2/941

Yes, because I don't think anything like ->get_bdev() is needed at all.
Look: dax_do_io(), __dax_fault(), __dax_pmd_fault(), dax_zero_page_range()
don't really need bdev - we have agreed that get_block() fills that in just
fine.

dax_clear_blocks() has IMO just the wrong signature - it should take bdev
and not inode as an argument. Because combination inode + bdev sector
doesn't really make much sense.

dax_writeback_mapping_range() is the only remaining offender and it can
easily take bdev as an argument when called from ->writepages().

> Or can we move forward with get_bdev(), and try and figure out this new way of
> calling fsync/msync for v4.6?  My main concern here is that changing how the
> DAX sync code gets called will affect all three filesystems as well as MM, and
> that it might be too much for RC inclusion...

I think changes aren't very intrusive so we can feed them in during RC
phase and frankly, you have to move to using ->writepages() anyway to make
sync(2) work reliably.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-04 20:29                           ` Jan Kara
@ 2016-02-04 22:19                             ` Ross Zwisler
  2016-02-05 22:25                             ` Ross Zwisler
  1 sibling, 0 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-02-04 22:19 UTC (permalink / raw)
  To: Jan Kara
  Cc: Ross Zwisler, Dan Williams, Dave Chinner, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Thu, Feb 04, 2016 at 09:29:57PM +0100, Jan Kara wrote:
> On Thu 04-02-16 12:56:19, Ross Zwisler wrote:
> > On Wed, Feb 03, 2016 at 11:46:11AM +0100, Jan Kara wrote:
<>
> > > Let's clear this up a bit: The problem with using ->fsync() method is that
> > > it doesn't get called for sync(2). We could use ->sync_fs() to flush caches
> > > in case of sync(2) (that's what's happening for normal storage) but the
> > > problem with PMEM is that "flush all cached data" operation effectively
> > > means iterate through all modified pages and we didn't want to implement
> > > this for DAX fsync code.
> > > 
> > > So we have decided to do cache flushing for DAX at a different point - mark
> > > inodes which may have writes cached as dirty and use writeback code for the
> > > cache flushing. But looking at it now, we have actually chosen a wrong
> > > place to do the flushing in the writeback path - note that sync(2) writes
> > > data via __writeback_single_inode() -> do_writepages() and thus doesn't
> > > even get to filemap_write_and_wait().
> > > 
> > > So revisiting the decision I see two options:
> > > 
> > > 1) Move the DAX flushing code from filemap_write_and_wait() into
> > > ->writepages() fs callback. There the filesystem can provide all the
> > > information it needs including bdev, get_block callback, or whatever.
> > > 
> > > 2) Back out even further and implement own tracking and iteration of inodes
> > > to write.
> > > 
> > > So far I still think 2) is not worth the complexity (although it would
> > > bring DAX code closer to how things behave with standard storage) so I
> > > would go for 1).
> > 
> > Jan, just to clarify, are you proposing this change for v4.5 in the remaining
> > RCs as an alternative to the get_bdev() patch?
> > 
> > https://lkml.org/lkml/2016/2/2/941
> 
> Yes, because I don't think anything like ->get_bdev() is needed at all.
> Look: dax_do_io(), __dax_fault(), __dax_pmd_fault(), dax_zero_page_range()
> don't really need bdev - we have agreed that get_block() fills that in just
> fine.
> 
> dax_clear_blocks() has IMO just the wrong signature - it should take bdev
> and not inode as an argument. Because combination inode + bdev sector
> doesn't really make much sense.
> 
> dax_writeback_mapping_range() is the only remaining offender and it can
> easily take bdev as an argument when called from ->writepages().
> 
> > Or can we move forward with get_bdev(), and try and figure out this new way of
> > calling fsync/msync for v4.6?  My main concern here is that changing how the
> > DAX sync code gets called will affect all three filesystems as well as MM, and
> > that it might be too much for RC inclusion...
> 
> I think changes aren't very intrusive so we can feed them in during RC
> phase and frankly, you have to move to using ->writepages() anyway to make
> sync(2) work reliably.

Okay, sounds good.  I'll send it out once I've got it working & tested.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-04  9:15                           ` Jan Kara
@ 2016-02-04 23:38                             ` Ross Zwisler
  2016-02-06 23:15                             ` Dave Chinner
  1 sibling, 0 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-02-04 23:38 UTC (permalink / raw)
  To: Jan Kara
  Cc: Ross Zwisler, Dave Chinner, Dan Williams, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	linux-fsdevel, linux-nvdimm

On Thu, Feb 04, 2016 at 10:15:58AM +0100, Jan Kara wrote:
<>
> Yes, you are right. filemap_write_and_wait_range() actually doesn't
> guarantee data durability. That function only means all dirty data has been
> sent to storage and the storage has acknowledged them. This is noop for
> PMEM. So we are perfectly fine ignoring calls to
> filemap_write_and_wait_range(). What guarantees data durability are only
> ->fsync() and ->sync_fs() calls. But some code could get upset by seeing
> that filemap_write_and_wait_range() didn't actually get rid of dirty pages
> (in some special cases like inode eviction or similar). That's why I'd
> choose one of the two options for consistency:
> 
> 1) Treat inode indexes to flush as close to dirty pages as we can - this
> means inode is dirty with all the tracking associated with it, radix tree
> entries have dirty tag, we get rid of these in ->writepages(). We are close
> to this currently.

I think we're actually pretty far from this, at least for v4.5.  The issue is
that I don't think we can safely clear radix tree dirty entries during the DAX
code that is called via ->writepages().  To do this correctly we would need to
also mark the PTE as clean so that when the userspace process next writes to
their mmap mapping we would get a new fault to make the page writable. This
would allow us to re-dirty the DAX entry in the radix tree.

I implemented code to do this in v2 of my set, but ripped it out in v3:

https://lkml.org/lkml/2015/11/13/759 (DAX fsync v2)
https://lkml.org/lkml/2015/12/8/583  (DAX fsync v3)

The race that compelled this removal is described here:

https://lists.01.org/pipermail/linux-nvdimm/2016-January/004057.html

(sorry for all the links)

Anyway, for v4.5 I think whatever solution we come up with must be okay with
an ever growing list of dirty radix tree entries, as we currently have.  Are
you aware of a reason why this won't work, or was the cleaning of the radix
tree entries just a good goal to have?  (And I agree it is a good goal, I just
don't know how to do it safely.)

> 2) Completely avoid the dirty tracking and writeback code and reimplement
> everything in DAX code.
> 
> Because some hybrid between these is IMHO bound to provoke weird (and very
> hard to find) bugs.
> 
> > > So revisiting the decision I see two options:
> > > 
> > > 1) Move the DAX flushing code from filemap_write_and_wait() into
> > > ->writepages() fs callback. There the filesystem can provide all the
> > > information it needs including bdev, get_block callback, or whatever.
> > 
> > This seems fine as long as we add it to ->fsync as well since ->writepages is
> > never called in that path, and as long as we are okay with skipping DAX
> > writebacks on hole punch, truncate, and block relocation.
> 
> Look at ext4_sync_file() -> filemap_write_and_wait_range() ->
> __filemap_fdatawrite_range() -> do_writepages(). Except those nrpages > 0
> checks which would need to be changed.

Ah, cool, I missed this path.  Thank you for setting me straight.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-04 20:29                           ` Jan Kara
  2016-02-04 22:19                             ` Ross Zwisler
@ 2016-02-05 22:25                             ` Ross Zwisler
  2016-02-06 23:40                               ` Dave Chinner
  2016-02-07  8:38                               ` Christoph Hellwig
  1 sibling, 2 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-02-05 22:25 UTC (permalink / raw)
  To: Jan Kara
  Cc: Ross Zwisler, Dan Williams, Dave Chinner, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Thu, Feb 04, 2016 at 09:29:57PM +0100, Jan Kara wrote:
> I think changes aren't very intrusive so we can feed them in during RC
> phase and frankly, you have to move to using ->writepages() anyway to make
> sync(2) work reliably.

I've been looking into this a bit more, and I don't think we actually want to
have DAX flushing in response to sync(2) or syncfs(2).  Here's some text from
the BUGS section of the sync(2) man page:

BUGS
	According to the standard specification (e.g., POSIX.1-2001), sync()
	schedules the writes,  but  may  return before  the  actual  writing
	is done.  However, since version 1.3.20 Linux does actually wait.
	(This still does not guarantee data integrity: modern disks have large
	caches.) 

Based on this I don't believe that it is a requirement that sync and syncfs
actually flush the data durably to media before they return - they just need
to make sure it has been sent to the device, which is always true for all
writes PMEM via DAX, even without any calls to dax_writeback_mapping_range().

The fsync(2) man page, on the other hand, *does* require that a call to
fsync() will result in the data being durable on the media before the system
call returns.  From fsync(2):

	fsync()  transfers  ("flushes") all modified in-core data of (i.e.,
	modified buffer cache pages for) the file referred to by the file
	descriptor fd to the disk device (or other permanent  storage  device)
	so  that  all changed information  can  be retrieved even after the
	system crashed or was rebooted.  This includes writing through or
	flushing a disk cache if present. 

I think we are doing the right thing if we only call
dax_writeback_mapping_range() in response to fsync() and msync(), but skip it
in response to sync() and syncfs().  Practically this also simplifies things a
lot because we don't need to worry about cleaning the DAX inodes.  With my
naive implementation where I was calling dax_writeback_mapping_range() in
ext4_writepages(), we were flushing all DAX pages that had ever been dirtied
every 5 seconds in response to a periodic filesystem sync(), which I'm sure is
untenable.

Please let me know if anyone disagrees with any of the above.  Based on this,
I'm off to move the calls to dax_writeback_mapping_range() back into the
filesystem specific fsync()/msync() paths so that we can provide an
appropriate block device.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-04  9:15                           ` Jan Kara
  2016-02-04 23:38                             ` Ross Zwisler
@ 2016-02-06 23:15                             ` Dave Chinner
  2016-02-07  5:27                               ` Ross Zwisler
  1 sibling, 1 reply; 66+ messages in thread
From: Dave Chinner @ 2016-02-06 23:15 UTC (permalink / raw)
  To: Jan Kara
  Cc: Ross Zwisler, Dan Williams, Matthew Wilcox, Christoph Hellwig,
	linux-kernel, Alexander Viro, Andrew Morton, linux-fsdevel,
	linux-nvdimm

On Thu, Feb 04, 2016 at 10:15:58AM +0100, Jan Kara wrote:
> On Wed 03-02-16 13:13:28, Ross Zwisler wrote:
> > Here is the comment from Dave Chinner that had me move to having the calls to
> > dax_writeback_mapping_range() into the generic mm code:
> > 
> >    > Lastly, this flushing really needs to be inside
> >    > filemap_write_and_wait_range(), because we call the writeback code
> >    > from many more places than just fsync to ensure ordering of various
> >    > operations such that files are in known state before proceeding
> >    > (e.g. hole punch).
> > https://lkml.org/lkml/2015/11/16/847
.....
> > > So revisiting the decision I see two options:
> > > 
> > > 1) Move the DAX flushing code from filemap_write_and_wait() into
> > > ->writepages() fs callback. There the filesystem can provide all the
> > > information it needs including bdev, get_block callback, or whatever.
> > 
> > This seems fine as long as we add it to ->fsync as well since ->writepages is
> > never called in that path, and as long as we are okay with skipping DAX
> > writebacks on hole punch, truncate, and block relocation.
> 
> Look at ext4_sync_file() -> filemap_write_and_wait_range() ->
> __filemap_fdatawrite_range() -> do_writepages(). Except those nrpages > 0
> checks which would need to be changed.

Just to be clear: this is pretty much what I was implying was
necessary when I said that the DAX flushing needed to be "inside
filemap_write_and_wait_range".  And that's what I thought Ross was
planning on doing after that round discussion.  i.e. Ross said:

"If the race described above isn't an issue then I agree moving this
call out of the filesystems and down into the generic page writeback
code is probably the right thing to do."

https://lkml.org/lkml/2015/11/17/718

Realistically, what Jan is saying in this thread is exactly what I
said we needed to do way back when I first pointed out that fsync
was broken and dirty tracking in the mapping radix tree was still
needed for fsync to work effectively.

Clearly, haven't been following recent developments in this patchset
as closely as I should have - I did not think that such
micro-management was going to be necessary after the discussion taht
was had back in November.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-05 22:25                             ` Ross Zwisler
@ 2016-02-06 23:40                               ` Dave Chinner
  2016-02-07  6:43                                 ` Ross Zwisler
  2016-02-07  8:38                               ` Christoph Hellwig
  1 sibling, 1 reply; 66+ messages in thread
From: Dave Chinner @ 2016-02-06 23:40 UTC (permalink / raw)
  To: Ross Zwisler, Jan Kara, Dan Williams, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Fri, Feb 05, 2016 at 03:25:00PM -0700, Ross Zwisler wrote:
> On Thu, Feb 04, 2016 at 09:29:57PM +0100, Jan Kara wrote:
> > I think changes aren't very intrusive so we can feed them in during RC
> > phase and frankly, you have to move to using ->writepages() anyway to make
> > sync(2) work reliably.
> 
> I've been looking into this a bit more, and I don't think we actually want to
> have DAX flushing in response to sync(2) or syncfs(2).  Here's some text from
> the BUGS section of the sync(2) man page:
> 
> BUGS
> 	According to the standard specification (e.g., POSIX.1-2001), sync()
> 	schedules the writes,  but  may  return before  the  actual  writing
> 	is done.  However, since version 1.3.20 Linux does actually wait.
> 	(This still does not guarantee data integrity: modern disks have large
> 	caches.) 
> 
> Based on this I don't believe that it is a requirement that sync and syncfs
> actually flush the data durably to media before they return - they just need
> to make sure it has been sent to the device, which is always true for all
> writes PMEM via DAX, even without any calls to dax_writeback_mapping_range().

That's an assumption we've already pointed out as being platform
dependent, not to mention also being undesirable from a performance
point of view (writes are 10x slower into pmem than into the page
cache using the same physical memory!).

Further, the ordering constraints of modern filesystems mean that
they guarantee durability of data written back when sync() is run.
i.e.  ext4, XFS, btrfs, etc all ensure that sync guarantees data
integrity is maintained across all the data and metadata written
back during sync().

e.g. for XFS we do file size update transactions at IO completion.
sync() triggers writeback of data, then runs a transaction that
modifies the file size so that the data is valid on disk. We
absolutely need to ensure that this transaction is durable before
sync() returns, otherwise we lose that data if a failure occurs
immediately after sync() returns because the size update is not on
disk.

Users are right to complain when data written before a sync() call
is made does not accessible after a crash/reboot because we failed
to make it durable. That's why ->sync_fs(wait) is called at the end
of the sync() implementation - it enables filesystems to ensure all
data and metadata written during the sync processing is on durable
storage.

IOWs, we can't language-lawyer or weasel-word our way out of
providing durability guarantees for sync().

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-06 23:15                             ` Dave Chinner
@ 2016-02-07  5:27                               ` Ross Zwisler
  0 siblings, 0 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-02-07  5:27 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Jan Kara, Ross Zwisler, Dan Williams, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	linux-fsdevel, linux-nvdimm

On Sun, Feb 07, 2016 at 10:15:12AM +1100, Dave Chinner wrote:
> On Thu, Feb 04, 2016 at 10:15:58AM +0100, Jan Kara wrote:
> > On Wed 03-02-16 13:13:28, Ross Zwisler wrote:
> > > Here is the comment from Dave Chinner that had me move to having the calls to
> > > dax_writeback_mapping_range() into the generic mm code:
> > > 
> > >    > Lastly, this flushing really needs to be inside
> > >    > filemap_write_and_wait_range(), because we call the writeback code
> > >    > from many more places than just fsync to ensure ordering of various
> > >    > operations such that files are in known state before proceeding
> > >    > (e.g. hole punch).
> > > https://lkml.org/lkml/2015/11/16/847
> .....
> > > > So revisiting the decision I see two options:
> > > > 
> > > > 1) Move the DAX flushing code from filemap_write_and_wait() into
> > > > ->writepages() fs callback. There the filesystem can provide all the
> > > > information it needs including bdev, get_block callback, or whatever.
> > > 
> > > This seems fine as long as we add it to ->fsync as well since ->writepages is
> > > never called in that path, and as long as we are okay with skipping DAX
> > > writebacks on hole punch, truncate, and block relocation.
> > 
> > Look at ext4_sync_file() -> filemap_write_and_wait_range() ->
> > __filemap_fdatawrite_range() -> do_writepages(). Except those nrpages > 0
> > checks which would need to be changed.
> 
> Just to be clear: this is pretty much what I was implying was
> necessary when I said that the DAX flushing needed to be "inside
> filemap_write_and_wait_range".  And that's what I thought Ross was
> planning on doing after that round discussion.  i.e. Ross said:
> 
> "If the race described above isn't an issue then I agree moving this
> call out of the filesystems and down into the generic page writeback
> code is probably the right thing to do."
> 
> https://lkml.org/lkml/2015/11/17/718
> 
> Realistically, what Jan is saying in this thread is exactly what I
> said we needed to do way back when I first pointed out that fsync
> was broken and dirty tracking in the mapping radix tree was still
> needed for fsync to work effectively.

Here, let me try and quickly summarize what is going on.

1) The DAX fsync set was merged into v4.5-rc1, it does use the radix tree for
tracking dirty PTE and PMD pages, and we do currently call into the DAX sync
code via filemap_write_and_wait_range() as you initially suggested.

2) During testing of raw block devices + DAX I noticed that the struct
block_device that we were using for DAX operations was incorrect.  For the
fault handlers, etc. we can just get the correct bdev via get_block(), which
is passed in as a function pointer, but for the flushing code we don't have
access to get_block().  This is also an issue for XFS real-time devices,
whenever we get those working.

In short, somehow we need to get dax_writeback_mapping_range() a valid bdev.
Right now it is called via filemap_write_and_wait_range(), which can't provide
either the bdev nor a get_block() function pointer.  So, our options seem to
be:
  a) Move the calls to dax_writeback_mapping_range() into the filesystems
  (what Jan is suggesting, i.e. ->writepages())
  b) Keep the calls to dax_writeback_mapping_range() in the mm code, and
  provide a generic way to ask a filesystem for an inode's bdev.  I did a
  version of this using a superblock operation here:
  https://lkml.org/lkml/2016/2/2/941

3) During the review and discussion for the above problems, Jan noticed that
the flushing code wasn't being called for sync() and syncfs().  Clearly from
your other response (https://lkml.org/lkml/2016/2/6/168) you think this is
incorrect.  Regardless, the above issue remains -
dax_writeback_mapping_range() needs a bdev.  Do we move the calls into the
filesystem so the fs can provide a bdev, or do we we create a generic method
for DAX to ask the fs for the correct bdev for an inode?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-06 23:40                               ` Dave Chinner
@ 2016-02-07  6:43                                 ` Ross Zwisler
  2016-02-08 13:48                                   ` Jan Kara
  0 siblings, 1 reply; 66+ messages in thread
From: Ross Zwisler @ 2016-02-07  6:43 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Ross Zwisler, Jan Kara, Dan Williams, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Sun, Feb 07, 2016 at 10:40:53AM +1100, Dave Chinner wrote:
> On Fri, Feb 05, 2016 at 03:25:00PM -0700, Ross Zwisler wrote:
> > On Thu, Feb 04, 2016 at 09:29:57PM +0100, Jan Kara wrote:
> > > I think changes aren't very intrusive so we can feed them in during RC
> > > phase and frankly, you have to move to using ->writepages() anyway to make
> > > sync(2) work reliably.
> > 
> > I've been looking into this a bit more, and I don't think we actually want to
> > have DAX flushing in response to sync(2) or syncfs(2).  Here's some text from
> > the BUGS section of the sync(2) man page:
> > 
> > BUGS
> > 	According to the standard specification (e.g., POSIX.1-2001), sync()
> > 	schedules the writes,  but  may  return before  the  actual  writing
> > 	is done.  However, since version 1.3.20 Linux does actually wait.
> > 	(This still does not guarantee data integrity: modern disks have large
> > 	caches.) 
> > 
> > Based on this I don't believe that it is a requirement that sync and syncfs
> > actually flush the data durably to media before they return - they just need
> > to make sure it has been sent to the device, which is always true for all
> > writes PMEM via DAX, even without any calls to dax_writeback_mapping_range().
> 
> That's an assumption we've already pointed out as being platform
> dependent, not to mention also being undesirable from a performance
> point of view (writes are 10x slower into pmem than into the page
> cache using the same physical memory!).
> 
> Further, the ordering constraints of modern filesystems mean that
> they guarantee durability of data written back when sync() is run.
> i.e.  ext4, XFS, btrfs, etc all ensure that sync guarantees data
> integrity is maintained across all the data and metadata written
> back during sync().
> 
> e.g. for XFS we do file size update transactions at IO completion.
> sync() triggers writeback of data, then runs a transaction that
> modifies the file size so that the data is valid on disk. We
> absolutely need to ensure that this transaction is durable before
> sync() returns, otherwise we lose that data if a failure occurs
> immediately after sync() returns because the size update is not on
> disk.
> 
> Users are right to complain when data written before a sync() call
> is made does not accessible after a crash/reboot because we failed
> to make it durable. That's why ->sync_fs(wait) is called at the end
> of the sync() implementation - it enables filesystems to ensure all
> data and metadata written during the sync processing is on durable
> storage.
> 
> IOWs, we can't language-lawyer or weasel-word our way out of
> providing durability guarantees for sync().

To be clear, we are only talking about user data that was mmaped.  All I/Os
initiated by the filesystem for metadata, and all user issued I/Os will be
durable on media before the I/O is completed.  Nothing we do or don't do for
fsync, msync, sync or syncfs() will break filesystem metadata guarantees.

I think the question then is: "Do users currently rely on data written to an
mmap to be durable on media after a sync() or syncfs(), in spite of the BUGS
section quoted above?"

If the answer for this is "yes", then I agree that we need to enhance the
current DAX fsync code to cover the sync use case.

However, as stated previously I don't think that it is as easy as just moving
the call to dax_writeback_mapping_range() to the sync() call path.  On my
system sync() is called every 5 seconds, and flushing every page of every DAX
mmap that has ever been dirtied every 5 seconds is unreasonable.

If we need to support the sync() use case with our DAX mmap flushing code, I
think the only way to do that is to clear out radix entries as we flush them,
so that the sync calls that happen every 5 seconds are only flushing new
writes.

I think we're actually pretty far from this, at least for v4.5.  The issue is
that I don't think we can safely clear radix tree dirty entries during the DAX
code that is called via ->writepages().  To do this correctly we would need to
also mark the PTE as clean so that when the userspace process next writes to
their mmap mapping we would get a new fault to make the page writable. This
would allow us to re-dirty the DAX entry in the radix tree.

I implemented code to do this in v2 of my set, but ripped it out in v3:

https://lkml.org/lkml/2015/11/13/759 (DAX fsync v2)
https://lkml.org/lkml/2015/12/8/583  (DAX fsync v3)

The race that compelled this removal is described here:

https://lists.01.org/pipermail/linux-nvdimm/2016-January/004057.html

If this is really where we need to be, that's fine, we'll have to figure out
some way to defeat the race and get this code into v4.6.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-05 22:25                             ` Ross Zwisler
  2016-02-06 23:40                               ` Dave Chinner
@ 2016-02-07  8:38                               ` Christoph Hellwig
  2016-02-08 15:55                                 ` Ross Zwisler
  1 sibling, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2016-02-07  8:38 UTC (permalink / raw)
  To: Ross Zwisler, Jan Kara, Dan Williams, Dave Chinner,
	Matthew Wilcox, Christoph Hellwig, linux-kernel, Alexander Viro,
	Andrew Morton, Jan Kara, linux-fsdevel, linux-nvdimm

On Fri, Feb 05, 2016 at 03:25:00PM -0700, Ross Zwisler wrote:
> 	According to the standard specification (e.g., POSIX.1-2001), sync()
> 	schedules the writes,  but  may  return before  the  actual  writing
> 	is done.  However, since version 1.3.20 Linux does actually wait.
> 	(This still does not guarantee data integrity: modern disks have large
> 	caches.) 
> 
> Based on this I don't believe that it is a requirement that sync and syncfs
> actually flush the data durably to media before they return - they just need
> to make sure it has been sent to the device, which is always true for all
> writes PMEM via DAX, even without any calls to dax_writeback_mapping_range().

For Linux there is a requirement to not return before the data is on disk,
and our users rely on it.  The man page is rather confusing, and I'll see
if I can fix it up.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-07  6:43                                 ` Ross Zwisler
@ 2016-02-08 13:48                                   ` Jan Kara
  0 siblings, 0 replies; 66+ messages in thread
From: Jan Kara @ 2016-02-08 13:48 UTC (permalink / raw)
  To: Ross Zwisler
  Cc: Dave Chinner, Jan Kara, Dan Williams, Matthew Wilcox,
	Christoph Hellwig, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Sat 06-02-16 23:43:49, Ross Zwisler wrote:
> On Sun, Feb 07, 2016 at 10:40:53AM +1100, Dave Chinner wrote:
> > On Fri, Feb 05, 2016 at 03:25:00PM -0700, Ross Zwisler wrote:
> > > On Thu, Feb 04, 2016 at 09:29:57PM +0100, Jan Kara wrote:
> > > > I think changes aren't very intrusive so we can feed them in during RC
> > > > phase and frankly, you have to move to using ->writepages() anyway to make
> > > > sync(2) work reliably.
> > > 
> > > I've been looking into this a bit more, and I don't think we actually want to
> > > have DAX flushing in response to sync(2) or syncfs(2).  Here's some text from
> > > the BUGS section of the sync(2) man page:
> > > 
> > > BUGS
> > > 	According to the standard specification (e.g., POSIX.1-2001), sync()
> > > 	schedules the writes,  but  may  return before  the  actual  writing
> > > 	is done.  However, since version 1.3.20 Linux does actually wait.
> > > 	(This still does not guarantee data integrity: modern disks have large
> > > 	caches.) 
> > > 
> > > Based on this I don't believe that it is a requirement that sync and syncfs
> > > actually flush the data durably to media before they return - they just need
> > > to make sure it has been sent to the device, which is always true for all
> > > writes PMEM via DAX, even without any calls to dax_writeback_mapping_range().
> > 
> > That's an assumption we've already pointed out as being platform
> > dependent, not to mention also being undesirable from a performance
> > point of view (writes are 10x slower into pmem than into the page
> > cache using the same physical memory!).
> > 
> > Further, the ordering constraints of modern filesystems mean that
> > they guarantee durability of data written back when sync() is run.
> > i.e.  ext4, XFS, btrfs, etc all ensure that sync guarantees data
> > integrity is maintained across all the data and metadata written
> > back during sync().
> > 
> > e.g. for XFS we do file size update transactions at IO completion.
> > sync() triggers writeback of data, then runs a transaction that
> > modifies the file size so that the data is valid on disk. We
> > absolutely need to ensure that this transaction is durable before
> > sync() returns, otherwise we lose that data if a failure occurs
> > immediately after sync() returns because the size update is not on
> > disk.
> > 
> > Users are right to complain when data written before a sync() call
> > is made does not accessible after a crash/reboot because we failed
> > to make it durable. That's why ->sync_fs(wait) is called at the end
> > of the sync() implementation - it enables filesystems to ensure all
> > data and metadata written during the sync processing is on durable
> > storage.
> > 
> > IOWs, we can't language-lawyer or weasel-word our way out of
> > providing durability guarantees for sync().
> 
> To be clear, we are only talking about user data that was mmaped.  All I/Os
> initiated by the filesystem for metadata, and all user issued I/Os will be
> durable on media before the I/O is completed.  Nothing we do or don't do for
> fsync, msync, sync or syncfs() will break filesystem metadata guarantees.

Yes, but be careful here:

So far ext4 (and AFAIK XFS as well) have depended on the fact that
blkdev_issue_flush() (or WRITE_FLUSH request) will flush all the volatile
caches for the storage. We do such calls / writes from transaction commit
code to make sure that all metadata *and data* writes reach permanent
storage before we make some metadata changes visible. This is necessary so
that we don't expose uninitialized block contents after a power failure
(think of making i_size increase / block allocation visible after power
failure without data being durable). 

For PMEM, we ignore blkdev_issue_flush() / WRITE_FLUSH requests on the
grounds that writes are either done bypassing caches (the case for metadata
IO and IO done via dax_do_io()). Currently we are fine even for mmap
because both ext4 & XFS zero out allocated blocks using non-cached writes
so even though latest data needn't be on persistent storage we won't expose
stale data. But it is a catch that may hit us in future. So from this POV
flushing caches from ->writepages() would also make PMEM give more similar
guarantees to fs as ordinary block storage.

> I think the question then is: "Do users currently rely on data written to an
> mmap to be durable on media after a sync() or syncfs(), in spite of the BUGS
> section quoted above?"
> 
> If the answer for this is "yes", then I agree that we need to enhance the
> current DAX fsync code to cover the sync use case.

IMO the answer is yes.

> However, as stated previously I don't think that it is as easy as just moving
> the call to dax_writeback_mapping_range() to the sync() call path.  On my
> system sync() is called every 5 seconds, and flushing every page of every DAX
> mmap that has ever been dirtied every 5 seconds is unreasonable.

It is not sync() that is called but background writeback. But you are
correct that ->writepages() will get called for a dirty inode every 5
seconds.

> If we need to support the sync() use case with our DAX mmap flushing code, I
> think the only way to do that is to clear out radix entries as we flush them,
> so that the sync calls that happen every 5 seconds are only flushing new
> writes.

I agree and I forgot about the issue that we currently don't clear out
dirty radix tree entries for DAX. I also agree that clearing of written
radix tree entries needs to be implemented before flushing caches from
->writepages() is practical.

> I think we're actually pretty far from this, at least for v4.5.  The issue is
> that I don't think we can safely clear radix tree dirty entries during the DAX
> code that is called via ->writepages().  To do this correctly we would need to
> also mark the PTE as clean so that when the userspace process next writes to
> their mmap mapping we would get a new fault to make the page writable. This
> would allow us to re-dirty the DAX entry in the radix tree.

Yup, so I agree that clearing of radix tree entries probably is not a 4.5
material.

> I implemented code to do this in v2 of my set, but ripped it out in v3:
> 
> https://lkml.org/lkml/2015/11/13/759 (DAX fsync v2)
> https://lkml.org/lkml/2015/12/8/583  (DAX fsync v3)
> 
> The race that compelled this removal is described here:
> 
> https://lists.01.org/pipermail/linux-nvdimm/2016-January/004057.html
> 
> If this is really where we need to be, that's fine, we'll have to figure out
> some way to defeat the race and get this code into v4.6.

I think we need to find a way to safely clear radix tree entries anyway.
And I agree that without page lock it is not easy. I see two possibilities
here:

1) Protect dax_writeback_mapping_range() with exclusive
EXT4_I(inode)->i_mmap_sem and provide new callback called from
wp_page_reuse() to mark index dirty in the radix tree (that would take
shared i_mmap_sem). This is relatively expensive but it should work.

Or

2) We could devote one bit in the radix tree exceptional entry as a bitlock
equivalent to page lock. This will likely scale better but it is more
intrusive.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
  2016-02-07  8:38                               ` Christoph Hellwig
@ 2016-02-08 15:55                                 ` Ross Zwisler
  0 siblings, 0 replies; 66+ messages in thread
From: Ross Zwisler @ 2016-02-08 15:55 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Ross Zwisler, Jan Kara, Dan Williams, Dave Chinner,
	Matthew Wilcox, linux-kernel, Alexander Viro, Andrew Morton,
	Jan Kara, linux-fsdevel, linux-nvdimm

On Sun, Feb 07, 2016 at 12:38:14AM -0800, Christoph Hellwig wrote:
> On Fri, Feb 05, 2016 at 03:25:00PM -0700, Ross Zwisler wrote:
> > 	According to the standard specification (e.g., POSIX.1-2001), sync()
> > 	schedules the writes,  but  may  return before  the  actual  writing
> > 	is done.  However, since version 1.3.20 Linux does actually wait.
> > 	(This still does not guarantee data integrity: modern disks have large
> > 	caches.) 
> > 
> > Based on this I don't believe that it is a requirement that sync and syncfs
> > actually flush the data durably to media before they return - they just need
> > to make sure it has been sent to the device, which is always true for all
> > writes PMEM via DAX, even without any calls to dax_writeback_mapping_range().
> 
> For Linux there is a requirement to not return before the data is on disk,
> and our users rely on it.  The man page is rather confusing, and I'll see
> if I can fix it up.

Okay, thank you for the clarification. :)

^ permalink raw reply	[flat|nested] 66+ messages in thread

end of thread, other threads:[~2016-02-08 15:55 UTC | newest]

Thread overview: 66+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-28 19:35 [PATCH 1/2] block: fix pfn_mkwrite() DAX fault handler Ross Zwisler
2016-01-28 19:35 ` Ross Zwisler
2016-01-28 19:35 ` [PATCH 2/2] dax: fix bdev NULL pointer dereferences Ross Zwisler
2016-01-28 19:35   ` Ross Zwisler
2016-01-28 20:21   ` Dan Williams
2016-01-28 20:21     ` Dan Williams
2016-01-28 21:38   ` Christoph Hellwig
2016-01-29 18:28     ` Ross Zwisler
2016-01-29 23:34       ` Ross Zwisler
2016-01-30  0:18         ` Dan Williams
2016-01-31 22:44         ` Dave Chinner
2016-01-30  5:28       ` Matthew Wilcox
2016-01-30  6:01         ` Dan Williams
2016-01-30  7:08           ` Jared Hulbert
2016-01-31  2:32           ` Matthew Wilcox
2016-01-31  6:12             ` Ross Zwisler
2016-01-31 10:55               ` Matthew Wilcox
2016-01-31 16:38                 ` Dan Williams
2016-01-31 18:07                   ` Matthew Wilcox
2016-01-31 18:18                     ` Dan Williams
2016-01-31 18:27                       ` Matthew Wilcox
2016-01-31 18:50                         ` Dan Williams
2016-01-31 19:51                     ` Dan Williams
2016-02-01 13:44             ` Matthew Wilcox
2016-02-01 14:51         ` Jan Kara
2016-02-01 20:49           ` Matthew Wilcox
2016-02-01 21:47           ` Dave Chinner
2016-02-02  6:06             ` Jared Hulbert
2016-02-02  6:46               ` Dan Williams
2016-02-02  8:05                 ` Jared Hulbert
2016-02-02 16:51                   ` Dan Williams
2016-02-02 21:46                     ` Jared Hulbert
2016-02-03  0:34                       ` Matthew Wilcox
2016-02-03  1:21                         ` Jared Hulbert
2016-02-02 11:17             ` Jan Kara
2016-02-02 16:33               ` Dan Williams
2016-02-02 16:46                 ` Jan Kara
2016-02-02 17:10                   ` Dan Williams
2016-02-02 17:34                     ` Ross Zwisler
2016-02-02 17:46                       ` Dan Williams
2016-02-02 17:47                         ` Dan Williams
2016-02-02 18:24                           ` Ross Zwisler
2016-02-02 18:46                         ` Matthew Wilcox
2016-02-02 18:59                           ` Dan Williams
2016-02-02 20:14                             ` Matthew Wilcox
2016-02-03 11:09                           ` Jan Kara
2016-02-03 10:46                       ` Jan Kara
2016-02-03 20:13                         ` Ross Zwisler
2016-02-04  9:15                           ` Jan Kara
2016-02-04 23:38                             ` Ross Zwisler
2016-02-06 23:15                             ` Dave Chinner
2016-02-07  5:27                               ` Ross Zwisler
2016-02-04 19:56                         ` Ross Zwisler
2016-02-04 20:29                           ` Jan Kara
2016-02-04 22:19                             ` Ross Zwisler
2016-02-05 22:25                             ` Ross Zwisler
2016-02-06 23:40                               ` Dave Chinner
2016-02-07  6:43                                 ` Ross Zwisler
2016-02-08 13:48                                   ` Jan Kara
2016-02-07  8:38                               ` Christoph Hellwig
2016-02-08 15:55                                 ` Ross Zwisler
2016-02-02 18:41               ` Ross Zwisler
2016-02-02 18:53                 ` Ross Zwisler
2016-02-02  0:02     ` Ross Zwisler
2016-02-02  7:10       ` Dave Chinner
2016-02-02 10:34       ` Jan Kara

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.