From: Matthew Wilcox <willy@linux.intel.com>
To: Dan Williams <dan.j.williams@intel.com>
Cc: linux-nvdimm <linux-nvdimm@ml01.01.org>,
	Dave Chinner <david@fromorbit.com>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	Christoph Hellwig <hch@infradead.org>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Jan Kara <jack@suse.com>,
	linux-fsdevel <linux-fsdevel@vger.kernel.org>,
	Andrew Morton <akpm@linux-foundation.org>
Subject: Re: [PATCH 2/2] dax: fix bdev NULL pointer dereferences
Date: Mon, 1 Feb 2016 08:44:10 -0500
Message-ID: <20160201134410.GD2948@linux.intel.com>
In-Reply-To: <20160131023247.GZ2948@linux.intel.com>

On Sun, Jan 31, 2016 at 01:32:47PM +1100, Matthew Wilcox wrote:
> On Fri, Jan 29, 2016 at 10:01:13PM -0800, Dan Williams wrote:
> > On Fri, Jan 29, 2016 at 9:28 PM, Matthew Wilcox <willy@linux.intel.com> wrote:
> > > If we store the PFN of the underlying page instead, we don't have this
> > > problem.  Instead, we have a different problem; of the device going
> > > away under us.  I'm trying to find the code which tears down PTEs when
> > > the device goes away, and I'm not seeing it.  What do we do about user
> > > mappings of the device?
> > 
> > I deferred the dax tear down code until next cycle as Al rightly
> > pointed out some needed re-works:
> > 
> > https://lists.01.org/pipermail/linux-nvdimm/2016-January/003995.html
> 
> Thanks; I eventually found it in my email somewhere over the Pacific.
> 
> I did probably 70% of the work needed to switch the radix tree over to
> storing PFNs instead of sectors.  It seems viable, though it's a big
> change from where we are today:

70%?!  Hah.  I'd done maybe 50%.  This isn't everything needed; I still
need to write radix_tree_replace().  But it's enough to get a flavour for
where this line of thinking takes us.  I think it ends up being cleaner
code, and possibly better performing.  I also think it points us back
in the direction of wanting an address_space operation to return a PFN
for the radix tree instead of handling buffer_heads directly in dax.c.
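
For the record, here is the contract dax_add_radix_entry() assumes for the
missing radix_tree_replace() (a sketch only; the return convention is
inferred from the nrexceptional accounting in the patch, not final):

	/*
	 * Replace every entry covered by [index, index + (1 << order))
	 * with a single entry of the given order.  Returns the number of
	 * entries replaced, or a negative errno.
	 */
	int radix_tree_replace(struct radix_tree_root *root,
			unsigned long index, unsigned order, void *item);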

Ah well.  Time to sleep ...

From 0321c30eeb189ad2da8dcc25623419e2ba9c6cee Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Sun, 31 Jan 2016 13:38:21 +1100
Subject: [PATCH] Giant non-compiling mess

Note that clear_pmem needs to be updated to set the current->needs_wmb flag.
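
Something like this, say (a sketch; the existing cache writeback in
clear_pmem() is elided):

	static inline void clear_pmem(void __pmem *addr, size_t size)
	{
		memset((void __force *)addr, 0, size);
		/* ... write back [addr, addr + size) as today ... */
		current->needs_wmb = 1;	/* caller issues wmb_pmem() */
	}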

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 fs/dax.c                   | 1127 +++++++++++++++++++++-----------------------
 include/linux/dax.h        |    3 +-
 include/linux/pfn_t.h      |   41 +-
 include/linux/radix-tree.h |    9 -
 include/linux/sched.h      |    1 +
 5 files changed, 565 insertions(+), 616 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index e9701d6..38b92b5 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -25,12 +25,132 @@
 #include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/pagevec.h>
+#include <linux/pfn_t.h>
 #include <linux/pmem.h>
+#include <linux/preempt.h>
 #include <linux/sched.h>
+#include <linux/sizes.h>
 #include <linux/uio.h>
 #include <linux/vmstat.h>
-#include <linux/pfn_t.h>
-#include <linux/sizes.h>
+
+/*
+ * 32-bit architectures want to override this to actually map/unmap
+ * their persistent memory.  ARM, SPARC & MIPS also want to override it
+ * to map the PFN at an address that uses the same cachelines as the
+ * userspace mapping (that's what 'index' is for).
+ */
+static void *dax_map_pfn(pfn_t pfn, unsigned long index)
+{
+	if (is_bad_pfn_t(pfn))
+		return NULL;
+	preempt_disable();
+	pagefault_disable();
+	return pfn_to_kaddr(pfn_t_to_pfn(pfn));
+}
+
+static void dax_unmap_pfn(void *addr)
+{
+	pagefault_enable();
+	preempt_enable();
+}
+
+/*
+ * DAX uses the 'exceptional' entries to store PFNs in the radix tree.
+ * Bit 0 is clear (the radix tree uses this for its own purposes).  Bit
+ * 1 is set (to indicate an exceptional entry).  Bits 2 & 3 are PFN_DEV
+ * and PFN_MAP.  The top two bits denote the size of the entry (PTE, PMD,
+ * PUD, one reserved).  That leaves us 26 bits on 32-bit systems and 58
+ * bits on 64-bit systems, able to address 256GB and 1024EB respectively.
+ */
+#define RADIX_DAX_SIZE_MASK	(0x3UL << (BITS_PER_LONG - 2))
+#define RADIX_TREE_MASK		(RADIX_TREE_INDIRECT_PTR | \
+				 RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_PFN_MASK	(~(RADIX_DAX_SIZE_MASK | RADIX_TREE_MASK))
+#define RADIX_DAX_SHIFT		4
+#define RADIX_DAX_PTE		(0x0UL << (BITS_PER_LONG - 2))
+#define RADIX_DAX_PMD		(0x1UL << (BITS_PER_LONG - 2))
+#define RADIX_DAX_PUD		(0x2UL << (BITS_PER_LONG - 2))
+#define RADIX_DAX_SIZE(entry)	((unsigned long)entry & RADIX_DAX_SIZE_MASK)
+#define RADIX_DAX_ENTRY(pfn, size) \
+	((void *)((pfn_t_to_pfn(pfn) << RADIX_DAX_SHIFT) | size))
+
+/* The 'colour' (ie low bits) within a PMD/PUD of a page offset. */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_CACHE_SHIFT) - 1)
+#define PG_PUD_COLOUR	((PUD_SIZE >> PAGE_CACHE_SHIFT) - 1)
+
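+/*
+ * Convert a radix tree entry back to a pfn_t, adjusting for the offset
+ * of @index within a PMD or PUD sized entry.
+ */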
+static pfn_t radix_to_pfn_t(void *entry, pgoff_t index)
+{
+	pfn_t pfn = { .val = (unsigned long)entry & RADIX_DAX_PFN_MASK };
+	unsigned offset = 0;
+
+	if (RADIX_DAX_SIZE(entry) == RADIX_DAX_PMD)
+		offset = index & PG_PMD_COLOUR;
+	else if (RADIX_DAX_SIZE(entry) == RADIX_DAX_PUD)
+		offset = index & PG_PUD_COLOUR;
+
+	return pfn_t_add(pfn, offset);
+}
+
+static void *pfn_to_radix(pfn_t pfn, unsigned long size)
+{
+	unsigned long value = pfn.val;
+	BUG_ON(value & RADIX_DAX_SIZE_MASK);
+	return (void *)(value | size | RADIX_TREE_EXCEPTIONAL_ENTRY);
+}
+
+static unsigned size_to_order(unsigned long size)
+{
+	switch (size) {
+	case RADIX_DAX_PTE: return 0;
+	case RADIX_DAX_PMD: return PMD_SHIFT - PAGE_SHIFT;
+	case RADIX_DAX_PUD: return PUD_SHIFT - PAGE_SHIFT;
+	}
+	BUG();
+}
+
+static unsigned size_to_bytes(unsigned long size)
+{
+	switch (size) {
+	case RADIX_DAX_PTE: return PAGE_CACHE_SIZE;
+	case RADIX_DAX_PMD: return PMD_SIZE;
+	case RADIX_DAX_PUD: return PUD_SIZE;
+	}
+	BUG();
+}
+
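+/*
+ * Insert @pfn at @index, replacing any smaller DAX entries it covers.
+ * Returns the number of entries replaced (assuming that is what the
+ * as-yet-unwritten radix_tree_replace() returns), -EEXIST if a page
+ * cache page occupies the slot, or a negative errno.
+ */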
+static int dax_add_radix_entry(struct address_space *mapping, pgoff_t index,
+				pfn_t pfn, unsigned long size, bool dirty)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	int count = 0;
+	void *entry;
+	unsigned order = size_to_order(size);
+
+	if (dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = radix_tree_lookup(page_tree, index);
+	if (entry && !radix_tree_exceptional_entry(entry)) {
+		count = -EEXIST;
+		goto unlock;
+	} else if (entry) {
+		if (size <= RADIX_DAX_SIZE(entry))
+			goto dirty;
+	}
+	count = radix_tree_replace(page_tree, index, order,
+					pfn_to_radix(pfn, size));
+	if (count < 0)
+		goto unlock;
+
+	mapping->nrexceptional -= (count - 1);
+ dirty:
+	if (dirty)
+		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return count;
+}
 
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
@@ -58,17 +178,235 @@ static void dax_unmap_atomic(struct block_device *bdev,
 	blk_queue_exit(bdev->bd_queue);
 }
 
+static sector_t to_sector(const struct buffer_head *bh,
+		const struct inode *inode)
+{
+	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+
+	return sector;
+}
+
+static bool buffer_written(struct buffer_head *bh)
+{
+	return buffer_mapped(bh) && !buffer_unwritten(bh);
+}
+
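+/*
+ * A read fault filled this range with zero pages; now that blocks are
+ * allocated, kick those pages out of the page cache, install the real
+ * PFN entry and unmap any user mappings of the zero pages.
+ */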
+static int dax_replace_hole(struct address_space *mapping, pgoff_t index,
+				unsigned long size, pfn_t pfn)
+{
+	unsigned order = size_to_order(size);
+	int i, error;
+
+	for (i = 0; i < (1 << order); i++) {
+		struct page *page;
+ repeat:
+		page = find_get_entry(mapping, index + i);
+		if (!page || radix_tree_exceptional_entry(page))
+			continue;
+
+		lock_page(page);
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		delete_from_page_cache(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	/*
+	 * Somebody else could look in the radix tree and find nothing.
+	 * It's harmless though; they'll find the correct pfn by calling
+	 * the filesystem.
+	 */
+	error = dax_add_radix_entry(mapping, index, pfn, size, true);
+
+	unmap_mapping_range(mapping, index << PAGE_CACHE_SHIFT,
+				PAGE_CACHE_SIZE, 0);
+
+	return error;
+}
+
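+/*
+ * Add @entry_size sized entries until @size is exhausted.  An error is
+ * returned only if no entry could be added at all; partial success is
+ * reported as success.
+ */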
+static int dax_add_pfn_sized(struct address_space *mapping, pgoff_t index,
+				size_t size, bool write, pfn_t pfn,
+				unsigned long radix_size, unsigned entry_size)
+{
+	int error;
+	bool report = true;
+
+	while (size >= entry_size) {
+		error = dax_add_radix_entry(mapping, index, pfn,
+						radix_size, write);
+		if (error == -EEXIST)
+			error = dax_replace_hole(mapping, index, radix_size,
+						pfn);
+		if (error)
+			break;
+		report = false;
+
+		size -= entry_size;
+		pfn = pfn_t_add(pfn, entry_size / PAGE_CACHE_SIZE);
+		index += entry_size / PAGE_CACHE_SIZE;
+	}
+
+	return report ? error : 0;
+}
+
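+/*
+ * Carve [index, index + size) into naturally aligned chunks: PTEs up to
+ * the first PMD boundary, PMDs up to the first PUD boundary, PUDs, then
+ * the PMD and PTE sized remainders.  Only a failure in the first chunk
+ * is reported to the caller.
+ */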
+static int dax_add_pfn_entries(struct address_space *mapping, pgoff_t index,
+				size_t size, bool write, pfn_t pfn)
+{
+	int error = 0;
+	int called = 0;
+	size_t max;
+
+	max = (PG_PMD_COLOUR + 1 - index) << PAGE_CACHE_SHIFT;
+	if (index & PG_PMD_COLOUR) {
+		error = dax_add_pfn_sized(mapping, index, min(size, max),
+				write, pfn, RADIX_DAX_PTE, PAGE_CACHE_SIZE);
+		called++;
+	}
+	size -= min(size, max);
+	if (error || !size)
+		goto out;
+	index += max >> PAGE_CACHE_SHIFT;
+
+	max = (PG_PUD_COLOUR + 1 - index) << PAGE_CACHE_SHIFT;
+	if (index & PG_PUD_COLOUR) {
+		error = dax_add_pfn_sized(mapping, index, min(size, max),
+				write, pfn, RADIX_DAX_PMD, PMD_SIZE);
+		called++;
+	}
+	size -= min(size, max);
+	if (error || !size)
+		goto out;
+	index += max >> PMD_SHIFT;
+
+	error = dax_add_pfn_sized(mapping, index, size,
+				write, pfn, RADIX_DAX_PUD, PUD_SIZE);
+	called++;
+	index += size >> PUD_SHIFT;
+	size = size & ~PUD_MASK;
+	if (error || !size)
+		goto out;
+
+	error = dax_add_pfn_sized(mapping, index, size,
+				write, pfn, RADIX_DAX_PMD, PMD_SIZE);
+	index += size >> PMD_SHIFT;
+	size = size & ~PAGE_CACHE_MASK;
+	if (error || !size)
+		return 0;
+
+	error = dax_add_pfn_sized(mapping, index, size,
+				write, pfn, RADIX_DAX_PTE, PAGE_CACHE_SIZE);
+ out:
+	if (called > 1)
+		error = 0;
+	return error;
+}
+
+/*
+ * Populate the page cache with as many pfns as the filesystem is willing
+ * to tell us about from a single call to get_block, starting at @index and
+ * continuing up to @max bytes.
+ */
+static int dax_create_pfns(struct address_space *mapping, pgoff_t index,
+				unsigned max, bool write, pfn_t *pfn,
+				get_block_t get_block, struct buffer_head *bh)
+{
+	struct inode *inode = mapping->host;
+	unsigned blkbits = inode->i_blkbits;
+	sector_t block = index << (PAGE_CACHE_SHIFT - blkbits);
+	struct blk_dax_ctl dax;
+	int error, result = 0;
+
+	bh->b_size = max;
+	bh->b_state = 0;
+	error = get_block(inode, block, bh, write);
+	if (error)
+		goto error;
+
+	if (!buffer_written(bh))
+		goto hole;
+
+	dax.sector = to_sector(bh, inode);
+	dax.size = bh->b_size;
+	error = dax_map_atomic(bh->b_bdev, &dax);
+	if (error < 0)
+		goto error;
+
+	/*
+	 * We may be about to write data to it, but now it's allocated,
+	 * and another thread will be able to find it in the page cache,
+	 * so we have to zero it; otherwise there's a write vs fault race
+	 * that could expose stale data to an application.
+	 */
+	if (buffer_unwritten(bh) || buffer_new(bh)) {
+		clear_pmem(dax.addr, bh->b_size);
+		result = 1;
+	}
+
+	dax_unmap_atomic(bh->b_bdev, &dax);
+
+	error = dax_add_pfn_entries(mapping, index, bh->b_size,
+					write, dax.pfn);
+
+	/*
+	 * Even if we had an error adding the PFN to the radix tree,
+	 * the PFN is still good, so return it.
+	 */
+	*pfn = dax.pfn;
+	return error ? error : result;
+
+ hole:
+ error:
+	*pfn = bad_pfn_t;
+	return error;
+}
+
+/*
+ * Returns either a negative errno, 0 if no allocation had to be performed,
+ * or 1 if the filesystem allocated a block.
+ */
+static int dax_get_pfn(struct address_space *mapping, pgoff_t index,
+				size_t len, bool write, pfn_t *pfn,
+				get_block_t get_block, struct buffer_head *bh)
+{
+	void *entry;
+
+	rcu_read_lock();
+	entry = radix_tree_lookup(&mapping->page_tree, index);
+	rcu_read_unlock();
+
+	if (radix_tree_exceptional_entry(entry)) {
+		*pfn = radix_to_pfn_t(entry, index);
+		return 0;
+	}
+
+	if (entry) {
+		if (write)
+			return dax_create_pfns(mapping, index, len, true, pfn,
+						get_block, bh);
+	} else {
+		return dax_create_pfns(mapping, index, len, write, pfn,
+						get_block, bh);
+	}
+
+	*pfn = bad_pfn_t;
+	return 0;
+}
+
 /*
  * dax_clear_blocks() is called from within transaction context from XFS,
  * and hence this means the stack from this point must follow GFP_NOFS
  * semantics for all operations.
  */
-int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
+int dax_clear_blocks(struct block_device *bdev, sector_t sector, long size)
 {
-	struct block_device *bdev = inode->i_sb->s_bdev;
 	struct blk_dax_ctl dax = {
-		.sector = block << (inode->i_blkbits - 9),
-		.size = _size,
+		.sector = sector,
+		.size = size,
 	};
 
 	might_sleep();
@@ -91,133 +429,52 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
 }
 EXPORT_SYMBOL_GPL(dax_clear_blocks);
 
-/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
-static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
-		loff_t pos, loff_t end)
-{
-	loff_t final = end - pos + first; /* The final byte of the buffer */
-
-	if (first > 0)
-		clear_pmem(addr, first);
-	if (final < size)
-		clear_pmem(addr + final, size - final);
-}
-
-static bool buffer_written(struct buffer_head *bh)
-{
-	return buffer_mapped(bh) && !buffer_unwritten(bh);
-}
-
-/*
- * When ext4 encounters a hole, it returns without modifying the buffer_head
- * which means that we can't trust b_size.  To cope with this, we set b_state
- * to 0 before calling get_block and, if any bit is set, we know we can trust
- * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
- * and would save us time calling get_block repeatedly.
- */
-static bool buffer_size_valid(struct buffer_head *bh)
-{
-	return bh->b_state != 0;
-}
-
-
-static sector_t to_sector(const struct buffer_head *bh,
-		const struct inode *inode)
-{
-	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
-
-	return sector;
-}
-
 static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
-		      loff_t start, loff_t end, get_block_t get_block,
-		      struct buffer_head *bh)
+				loff_t start, loff_t end,
+				get_block_t get_block, struct buffer_head *bh)
 {
-	loff_t pos = start, max = start, bh_max = start;
-	bool hole = false, need_wmb = false;
-	struct block_device *bdev = NULL;
-	int rw = iov_iter_rw(iter), rc;
-	long map_len = 0;
-	struct blk_dax_ctl dax = {
-		.addr = (void __pmem *) ERR_PTR(-EIO),
-	};
+	loff_t pos = start;
+	int error = 0;
+	const int rw = iov_iter_rw(iter);
 
 	if (rw == READ)
 		end = min(end, i_size_read(inode));
 
-	while (pos < end) {
-		size_t len;
-		if (pos == max) {
-			unsigned blkbits = inode->i_blkbits;
-			long page = pos >> PAGE_SHIFT;
-			sector_t block = page << (PAGE_SHIFT - blkbits);
-			unsigned first = pos - (block << blkbits);
-			long size;
-
-			if (pos == bh_max) {
-				bh->b_size = PAGE_ALIGN(end - pos);
-				bh->b_state = 0;
-				rc = get_block(inode, block, bh, rw == WRITE);
-				if (rc)
-					break;
-				if (!buffer_size_valid(bh))
-					bh->b_size = 1 << blkbits;
-				bh_max = pos - first + bh->b_size;
-				bdev = bh->b_bdev;
-			} else {
-				unsigned done = bh->b_size -
-						(bh_max - (pos - first));
-				bh->b_blocknr += done >> blkbits;
-				bh->b_size -= done;
-			}
+	while (!error && pos < end) {
+		pgoff_t pgoff = pos >> PAGE_CACHE_SHIFT;
+		unsigned off = pos & ~PAGE_CACHE_MASK;
+		size_t len = end - pos;
+		pfn_t pfn;
+		void __pmem *addr;
 
-			hole = rw == READ && !buffer_written(bh);
-			if (hole) {
-				size = bh->b_size - first;
-			} else {
-				dax_unmap_atomic(bdev, &dax);
-				dax.sector = to_sector(bh, inode);
-				dax.size = bh->b_size;
-				map_len = dax_map_atomic(bdev, &dax);
-				if (map_len < 0) {
-					rc = map_len;
-					break;
-				}
-				if (buffer_unwritten(bh) || buffer_new(bh)) {
-					dax_new_buf(dax.addr, map_len, first,
-							pos, end);
-					need_wmb = true;
-				}
-				dax.addr += first;
-				size = map_len - first;
-			}
-			max = min(pos + size, end);
-		}
+		error = dax_get_pfn(inode->i_mapping, pgoff, len, rw == WRITE,
+						&pfn, get_block, bh);
+		if (error < 0)
+			break;
+		addr = dax_map_pfn(pfn, pgoff) + off;
 
-		if (iov_iter_rw(iter) == WRITE) {
-			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
-			need_wmb = true;
-		} else if (!hole)
-			len = copy_to_iter((void __force *) dax.addr, max - pos,
-					iter);
+		if (len > PAGE_CACHE_SIZE)
+			len = PAGE_CACHE_SIZE;
+
+		if (rw == WRITE) {
+			len = copy_from_iter_pmem(addr, len, iter);
+			current->needs_wmb = true;
+		} else if (addr)
+			len = copy_to_iter((void __force *) addr, len, iter);
 		else
-			len = iov_iter_zero(max - pos, iter);
+			len = iov_iter_zero(len, iter);
+		dax_unmap_pfn(addr - off);
 
-		if (!len) {
-			rc = -EFAULT;
-			break;
-		}
+		if (!len)
+			error = -EFAULT;
 
 		pos += len;
-		if (!IS_ERR(dax.addr))
-			dax.addr += len;
 	}
 
-	if (need_wmb)
+	if (current->needs_wmb)
 		wmb_pmem();
-	dax_unmap_atomic(bdev, &dax);
 
-	return (pos == start) ? rc : pos - start;
+	return (pos == start) ? error : pos - start;
 }
 
 /**
@@ -238,15 +495,14 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
  * is in progress.
  */
 ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
-		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
-		  dio_iodone_t end_io, int flags)
+			struct iov_iter *iter, loff_t pos,
+			get_block_t get_block, dio_iodone_t end_io, int flags)
 {
 	struct buffer_head bh;
 	ssize_t retval = -EINVAL;
 	loff_t end = pos + iov_iter_count(iter);
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
 
 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
 		struct address_space *mapping = inode->i_mapping;
@@ -277,124 +533,26 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 }
 EXPORT_SYMBOL_GPL(dax_do_io);
 
-/*
- * The user has performed a load from a hole in the file.  Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files.  We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
- */
-static int dax_load_hole(struct address_space *mapping, struct page *page,
-							struct vm_fault *vmf)
+static int copy_user_pfn(struct vm_fault *vmf, pfn_t pfn)
 {
-	if (!page)
-		page = find_or_create_page(mapping, vmf->pgoff,
-						vmf->gfp_mask | __GFP_ZERO);
-	if (!page)
-		return VM_FAULT_OOM;
-	vmf->page = page;
-	return VM_FAULT_LOCKED;
-}
+	void *vto, *vfrom;
 
-static int copy_user_bh(struct page *to, struct inode *inode,
-		struct buffer_head *bh, unsigned long vaddr)
-{
-	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = bh->b_size,
-	};
-	struct block_device *bdev = bh->b_bdev;
-	void *vto;
-
-	if (dax_map_atomic(bdev, &dax) < 0)
-		return PTR_ERR(dax.addr);
-	vto = kmap_atomic(to);
-	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
+	vfrom = dax_map_pfn(pfn, vmf->pgoff);
+	vto = kmap_atomic(vmf->cow_page);
+	copy_user_page(vto, vfrom, (unsigned long)vmf->virtual_address,
+			vmf->cow_page);
 	kunmap_atomic(vto);
-	dax_unmap_atomic(bdev, &dax);
+	dax_unmap_pfn(vfrom);
 	return 0;
 }
 
-#define NO_SECTOR -1
-#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))
-
-static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
-		sector_t sector, bool pmd_entry, bool dirty)
-{
-	struct radix_tree_root *page_tree = &mapping->page_tree;
-	pgoff_t pmd_index = DAX_PMD_INDEX(index);
-	int type, error = 0;
-	void *entry;
-
-	WARN_ON_ONCE(pmd_entry && !dirty);
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
-	spin_lock_irq(&mapping->tree_lock);
-
-	entry = radix_tree_lookup(page_tree, pmd_index);
-	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
-		index = pmd_index;
-		goto dirty;
-	}
-
-	entry = radix_tree_lookup(page_tree, index);
-	if (entry) {
-		type = RADIX_DAX_TYPE(entry);
-		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
-					type != RADIX_DAX_PMD)) {
-			error = -EIO;
-			goto unlock;
-		}
-
-		if (!pmd_entry || type == RADIX_DAX_PMD)
-			goto dirty;
-
-		/*
-		 * We only insert dirty PMD entries into the radix tree.  This
-		 * means we don't need to worry about removing a dirty PTE
-		 * entry and inserting a clean PMD entry, thus reducing the
-		 * range we would flush with a follow-up fsync/msync call.
-		 */
-		radix_tree_delete(&mapping->page_tree, index);
-		mapping->nrexceptional--;
-	}
-
-	if (sector == NO_SECTOR) {
-		/*
-		 * This can happen during correct operation if our pfn_mkwrite
-		 * fault raced against a hole punch operation.  If this
-		 * happens the pte that was hole punched will have been
-		 * unmapped and the radix tree entry will have been removed by
-		 * the time we are called, but the call will still happen.  We
-		 * will return all the way up to wp_pfn_shared(), where the
-		 * pte_same() check will fail, eventually causing page fault
-		 * to be retried by the CPU.
-		 */
-		goto unlock;
-	}
-
-	error = radix_tree_insert(page_tree, index,
-			RADIX_DAX_ENTRY(sector, pmd_entry));
-	if (error)
-		goto unlock;
-
-	mapping->nrexceptional++;
- dirty:
-	if (dirty)
-		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
- unlock:
-	spin_unlock_irq(&mapping->tree_lock);
-	return error;
-}
-
 static int dax_writeback_one(struct block_device *bdev,
 		struct address_space *mapping, pgoff_t index, void *entry)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	int type = RADIX_DAX_TYPE(entry);
+	unsigned size = RADIX_DAX_SIZE(entry);
 	struct radix_tree_node *node;
-	struct blk_dax_ctl dax;
+	void __pmem *addr;
 	void **slot;
 	int ret = 0;
 
@@ -412,38 +570,14 @@ static int dax_writeback_one(struct block_device *bdev,
 	/* another fsync thread may have already written back this entry */
 	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
 		goto unlock;
-
-	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
-		ret = -EIO;
-		goto unlock;
-	}
-
-	dax.sector = RADIX_DAX_SECTOR(entry);
-	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
 	spin_unlock_irq(&mapping->tree_lock);
 
-	/*
-	 * We cannot hold tree_lock while calling dax_map_atomic() because it
-	 * eventually calls cond_resched().
-	 */
-	ret = dax_map_atomic(bdev, &dax);
-	if (ret < 0)
-		return ret;
-
-	if (WARN_ON_ONCE(ret < dax.size)) {
-		ret = -EIO;
-		goto unmap;
-	}
-
-	wb_cache_pmem(dax.addr, dax.size);
+	addr = dax_map_pfn(radix_to_pfn_t(entry, index), index);
+	wb_cache_pmem(addr, size_to_bytes(size));
+	dax_unmap_pfn(addr);
 
 	spin_lock_irq(&mapping->tree_lock);
 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
-	spin_unlock_irq(&mapping->tree_lock);
- unmap:
-	dax_unmap_atomic(bdev, &dax);
-	return ret;
-
  unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 	return ret;
@@ -459,27 +593,17 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
 {
 	struct inode *inode = mapping->host;
 	struct block_device *bdev = inode->i_sb->s_bdev;
-	pgoff_t start_index, end_index, pmd_index;
+	pgoff_t start_index, end_index;
 	pgoff_t indices[PAGEVEC_SIZE];
 	struct pagevec pvec;
 	bool done = false;
 	int i, ret = 0;
-	void *entry;
 
 	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
 		return -EIO;
 
 	start_index = start >> PAGE_CACHE_SHIFT;
 	end_index = end >> PAGE_CACHE_SHIFT;
-	pmd_index = DAX_PMD_INDEX(start_index);
-
-	rcu_read_lock();
-	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
-	rcu_read_unlock();
-
-	/* see if the start of our range is covered by a PMD entry */
-	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
-		start_index = pmd_index;
 
 	tag_pages_for_writeback(mapping, start_index, end_index);
 
@@ -509,107 +633,80 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
-			struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	struct address_space *mapping = inode->i_mapping;
-	struct block_device *bdev = bh->b_bdev;
-	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = bh->b_size,
-	};
-	int error;
-
-	if (dax_map_atomic(bdev, &dax) < 0) {
-		error = PTR_ERR(dax.addr);
-		goto out;
-	}
-
-	if (buffer_unwritten(bh) || buffer_new(bh)) {
-		clear_pmem(dax.addr, PAGE_SIZE);
-		wmb_pmem();
-	}
-	dax_unmap_atomic(bdev, &dax);
-
-	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
-			vmf->flags & FAULT_FLAG_WRITE);
-	if (error)
-		goto out;
-
-	error = vm_insert_mixed(vma, vaddr, dax.pfn);
-
- out:
-	return error;
-}
-
 static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			get_block_t get_block, dax_iodone_t complete_unwritten)
 {
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 	struct page *page;
+	pfn_t pfn = bad_pfn_t;
 	struct buffer_head bh;
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	unsigned blkbits = inode->i_blkbits;
-	sector_t block;
 	pgoff_t size;
 	int error;
 	int major = 0;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
 		return VM_FAULT_SIGBUS;
 
 	memset(&bh, 0, sizeof(bh));
-	block = (sector_t)vmf->pgoff << (PAGE_CACHE_SHIFT - blkbits);
-	bh.b_bdev = inode->i_sb->s_bdev;
-	bh.b_size = PAGE_CACHE_SIZE;
 
  repeat:
-	page = find_get_page(mapping, vmf->pgoff);
-	if (page) {
+	page = find_get_entry(mapping, vmf->pgoff);
+	if (radix_tree_exceptional_entry(page)) {
+		pfn = radix_to_pfn_t(page, vmf->pgoff);
+		page = NULL;
+	} else if (!page) {
+		error = dax_create_pfns(mapping, vmf->pgoff, PAGE_CACHE_SIZE,
+				write && !vmf->cow_page, &pfn, get_block, &bh);
+		if (error < 0)
+			goto out;
+
+		if (error) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+			error = 0;
+		}
+	} else {
 		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
 			page_cache_release(page);
 			return VM_FAULT_RETRY;
-		}
-		if (unlikely(page->mapping != mapping)) {
+		} else if (unlikely(page->mapping != mapping)) {
 			unlock_page(page);
 			page_cache_release(page);
 			goto repeat;
 		}
 	}
 
-	error = get_block(inode, block, &bh, 0);
-	if (!error && (bh.b_size < PAGE_CACHE_SIZE))
-		error = -EIO;		/* fs corruption? */
-	if (error)
-		goto unlock_page;
+	if (is_bad_pfn_t(pfn) && !vmf->cow_page) {
+		/*
+		 * Allocating a new page in the file would cause excessive
+		 * storage usage for workloads with sparse files.  We allocate
+		 * a page cache page instead.  We'll kick it out of the page
+		 * cache if it's ever written to, otherwise it will simply
+		 * fall out of the page cache under memory pressure without
+		 * ever having been dirtied.
+		 */
+		if (!page)
+			page = find_or_create_page(mapping, vmf->pgoff,
+						vmf->gfp_mask | __GFP_ZERO);
+		if (!page)
+			return VM_FAULT_OOM;
+		vmf->page = page;
+		return VM_FAULT_LOCKED;
+	}
 
-	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
-		if (vmf->flags & FAULT_FLAG_WRITE) {
-			error = get_block(inode, block, &bh, 1);
-			count_vm_event(PGMAJFAULT);
-			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-			major = VM_FAULT_MAJOR;
-			if (!error && (bh.b_size < PAGE_CACHE_SIZE))
-				error = -EIO;
+	if (vmf->cow_page) {
+		if (is_bad_pfn_t(pfn)) {
+			clear_user_highpage(vmf->cow_page, vaddr);
+		} else {
+			error = copy_user_pfn(vmf, pfn);
 			if (error)
 				goto unlock_page;
-		} else {
-			return dax_load_hole(mapping, page, vmf);
 		}
-	}
-
-	if (vmf->cow_page) {
-		struct page *new_page = vmf->cow_page;
-		if (buffer_written(&bh))
-			error = copy_user_bh(new_page, inode, &bh, vaddr);
-		else
-			clear_user_highpage(new_page, vaddr);
-		if (error)
-			goto unlock_page;
 		vmf->page = page;
 
 		/*
@@ -625,18 +722,10 @@ static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		return VM_FAULT_LOCKED;
 	}
 
-	/* Check we didn't race with a read fault installing a new page */
-	if (!page && major)
-		page = find_lock_page(mapping, vmf->pgoff);
+	if (current->needs_wmb)
+		wmb_pmem();
 
-	if (page) {
-		unmap_mapping_range(mapping, vmf->pgoff << PAGE_CACHE_SHIFT,
-							PAGE_CACHE_SIZE, 0);
-		delete_from_page_cache(page);
-		unlock_page(page);
-		page_cache_release(page);
-		page = NULL;
-	}
+	error = vm_insert_mixed(vma, vaddr, pfn);
 
 	/*
 	 * If we successfully insert the new mapping over an unwritten extent,
@@ -648,12 +737,11 @@ static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 * indicate what the callback should do via the uptodate variable, same
 	 * as for normal BH based IO completions.
 	 */
-	error = dax_insert_mapping(inode, &bh, vma, vmf);
 	if (buffer_unwritten(&bh)) {
 		if (complete_unwritten)
 			complete_unwritten(&bh, !error);
 		else
-			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
+			WARN_ON_ONCE(!write);
 	}
 
  out:
@@ -673,12 +761,6 @@ static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
- * more often than one might expect in the below function.
- */
-#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_CACHE_SHIFT) - 1)
-
 static void __dax_dbg(struct buffer_head *bh, unsigned long address,
 		const char *reason, const char *fn)
 {
@@ -697,98 +779,19 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
 
 #define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")
 
-static int dax_insert_pmd_mapping(struct inode *inode, struct buffer_head *bh,
-			struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	int major = 0;
-	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = PMD_SIZE,
-	};
-	struct block_device *bdev = bh->b_bdev;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
-	unsigned long address = (unsigned long)vmf->virtual_address;
-	long length = dax_map_atomic(bdev, &dax);
-
-	if (length < 0)
-		return VM_FAULT_SIGBUS;
-	if (length < PMD_SIZE) {
-		dax_pmd_dbg(bh, address, "dax-length too small");
-		goto unmap;
-	}
-
-	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
-		dax_pmd_dbg(bh, address, "pfn unaligned");
-		goto unmap;
-	}
-
-	if (!pfn_t_devmap(dax.pfn)) {
-		dax_pmd_dbg(bh, address, "pfn not in memmap");
-		goto unmap;
-	}
-
-	if (buffer_unwritten(bh) || buffer_new(bh)) {
-		clear_pmem(dax.addr, PMD_SIZE);
-		wmb_pmem();
-		count_vm_event(PGMAJFAULT);
-		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-		major = VM_FAULT_MAJOR;
-	}
-	dax_unmap_atomic(bdev, &dax);
-
-	/*
-	 * For PTE faults we insert a radix tree entry for reads, and leave
-	 * it clean.  Then on the first write we dirty the radix tree entry
-	 * via the dax_pfn_mkwrite() path.  This sequence allows the
-	 * dax_pfn_mkwrite() call to be simpler and avoid a call into
-	 * get_block() to translate the pgoff to a sector in order to be able
-	 * to create a new radix tree entry.
-	 *
-	 * The PMD path doesn't have an equivalent to dax_pfn_mkwrite(),
-	 * though, so for a read followed by a write we traverse all the way
-	 * through dax_pmd_fault() twice.  This means we can just skip
-	 * inserting a radix tree entry completely on the initial read and
-	 * just wait until the write to insert a dirty entry.
-	 */
-	if (write) {
-		int error = dax_radix_entry(vma->vm_file->f_mapping, vmf->pgoff,
-						dax.sector, true, true);
-		if (error) {
-			dax_pmd_dbg(bh, address,
-					"PMD radix insertion failed");
-			goto fallback;
-		}
-	}
-
-	dev_dbg(part_to_dev(bdev->bd_part),
-			"%s: %s addr: %lx pfn: %lx sect: %llx\n",
-			__func__, current->comm, address,
-			pfn_t_to_pfn(dax.pfn),
-			(unsigned long long) dax.sector);
-	return major | vmf_insert_pfn_pmd(vma, address, vmf->pmd,
-						dax.pfn, write);
- unmap:
-	dax_unmap_atomic(bdev, &dax);
- fallback:
-	count_vm_event(THP_FAULT_FALLBACK);
-	return VM_FAULT_FALLBACK;
-}
-
 static int dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		get_block_t get_block, dax_iodone_t complete_unwritten)
 {
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
+	void *entry;
+	pfn_t pfn;
 	struct buffer_head bh;
-	unsigned blkbits = inode->i_blkbits;
-	unsigned long address = (unsigned long)vmf->virtual_address;
-	unsigned long pmd_addr = address & PMD_MASK;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	unsigned long pmd_addr = vaddr & PMD_MASK;
 	pgoff_t size;
-	sector_t block;
-	int result;
-	bool alloc = false;
+	int result = 0;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	/* dax pmd mappings require pfn_t_devmap() */
 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
@@ -796,17 +799,17 @@ static int dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
-		split_huge_pmd(vma, vmf->pmd, address);
-		dax_pmd_dbg(NULL, address, "cow write");
+		split_huge_pmd(vma, vmf->pmd, vaddr);
+		dax_pmd_dbg(NULL, vaddr, "cow write");
 		return VM_FAULT_FALLBACK;
 	}
 	/* If the PMD would extend outside the VMA */
 	if (pmd_addr < vma->vm_start) {
-		dax_pmd_dbg(NULL, address, "vma start unaligned");
+		dax_pmd_dbg(NULL, vaddr, "vma start unaligned");
 		return VM_FAULT_FALLBACK;
 	}
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
-		dax_pmd_dbg(NULL, address, "vma end unaligned");
+		dax_pmd_dbg(NULL, vaddr, "vma end unaligned");
 		return VM_FAULT_FALLBACK;
 	}
 
@@ -815,76 +818,69 @@ static int dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		return VM_FAULT_SIGBUS;
 	/* If the PMD would cover blocks out of the file */
 	if ((vmf->pgoff | PG_PMD_COLOUR) >= size) {
-		dax_pmd_dbg(NULL, address,
+		dax_pmd_dbg(NULL, vaddr,
 				"offset + huge page size > file size");
 		return VM_FAULT_FALLBACK;
 	}
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
-	block = (sector_t)vmf->pgoff << (PAGE_CACHE_SHIFT - blkbits);
-
-	bh.b_size = PMD_SIZE;
-
-	if (get_block(inode, block, &bh, 0) != 0)
-		return VM_FAULT_SIGBUS;
-
-	if (!buffer_mapped(&bh) && write) {
-		if (get_block(inode, block, &bh, 1) != 0)
-			return VM_FAULT_SIGBUS;
-		alloc = true;
-	}
 
-	/*
-	 * If the filesystem isn't willing to tell us the length of a hole,
-	 * just fall back to PTEs.  Calling get_block 512 times in a loop
-	 * would be silly.
-	 */
-	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
-		dax_pmd_dbg(&bh, address, "allocated block too small");
-		return VM_FAULT_FALLBACK;
-	}
+	entry = find_get_entry(mapping, vmf->pgoff);
+	if (radix_tree_exceptional_entry(entry) &&
+				RADIX_DAX_SIZE(entry) >= RADIX_DAX_PMD) {
+		pfn = radix_to_pfn_t(entry, vmf->pgoff);
+	} else {
+		int error = dax_create_pfns(mapping, vmf->pgoff, PMD_SIZE,
+					write, &pfn, get_block, &bh);
+		if (error < 0)
+			goto fallback;
 
-	/*
-	 * If we allocated new storage, make sure no process has any
-	 * zero pages covering this hole
-	 */
-	if (alloc) {
-		loff_t lstart = vmf->pgoff << PAGE_CACHE_SHIFT;
-		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
+		if (error) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			result = VM_FAULT_MAJOR;
+			error = 0;
+		}
 
-		truncate_pagecache_range(inode, lstart, lend);
+		/*
+		 * We don't know if dax_create_pfns() was able to allocate
+		 * a contiguous aligned chunk, or whether it was only able
+		 * to do a partial allocation.
+		 */
+		entry = find_get_entry(mapping, vmf->pgoff);
+		if (!radix_tree_exceptional_entry(entry) ||
+				RADIX_DAX_SIZE(entry) < RADIX_DAX_PMD)
+			goto fallback;
 	}
 
-	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+	if (is_bad_pfn_t(pfn)) {
 		spinlock_t *ptl;
 		pmd_t entry, *pmd = vmf->pmd;
 		struct page *zero_page = get_huge_zero_page();
 
 		if (unlikely(!zero_page)) {
-			dax_pmd_dbg(&bh, address, "no zero page");
+			dax_pmd_dbg(&bh, vaddr, "no zero page");
 			goto fallback;
 		}
 
 		ptl = pmd_lock(vma->vm_mm, pmd);
 		if (!pmd_none(*pmd)) {
 			spin_unlock(ptl);
-			dax_pmd_dbg(&bh, address, "pmd already present");
+			dax_pmd_dbg(&bh, vaddr, "pmd already present");
 			goto fallback;
 		}
 
-		dev_dbg(part_to_dev(bh.b_bdev->bd_part),
-				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
-				__func__, current->comm, address,
-				(unsigned long long) to_sector(&bh, inode));
-
 		entry = mk_pmd(zero_page, vma->vm_page_prot);
 		entry = pmd_mkhuge(entry);
 		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
 		result = VM_FAULT_NOPAGE;
 		spin_unlock(ptl);
 	} else {
-		result = dax_insert_pmd_mapping(inode, &bh, vma, vmf);
+		if (current->needs_wmb)
+			wmb_pmem();
+
+		result |= vmf_insert_pfn_pmd(vma, vaddr, vmf->pmd, pfn,
+						write);
 	}
 
  out:
@@ -907,80 +903,21 @@ static int dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 #endif /* !CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-/*
- * The 'colour' (ie low bits) within a PUD of a page offset.  This comes up
- * more often than one might expect in the below function.
- */
-#define PG_PUD_COLOUR	((PUD_SIZE >> PAGE_CACHE_SHIFT) - 1)
-
 #define dax_pud_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pud")
 
-static int dax_insert_pud_mapping(struct inode *inode, struct buffer_head *bh,
-			struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	int major = 0;
-	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = PUD_SIZE,
-	};
-	struct block_device *bdev = bh->b_bdev;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
-	unsigned long address = (unsigned long)vmf->virtual_address;
-	long length = dax_map_atomic(bdev, &dax);
-
-	if (length < 0)
-		return VM_FAULT_SIGBUS;
-	if (length < PUD_SIZE) {
-		dax_pud_dbg(bh, address, "dax-length too small");
-		goto unmap;
-	}
-	if (pfn_t_to_pfn(dax.pfn) & PG_PUD_COLOUR) {
-		dax_pud_dbg(bh, address, "pfn unaligned");
-		goto unmap;
-	}
-
-	if (!pfn_t_devmap(dax.pfn)) {
-		dax_pud_dbg(bh, address, "pfn not in memmap");
-		goto unmap;
-	}
-
-	if (buffer_unwritten(bh) || buffer_new(bh)) {
-		clear_pmem(dax.addr, PUD_SIZE);
-		wmb_pmem();
-		count_vm_event(PGMAJFAULT);
-		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-		major = VM_FAULT_MAJOR;
-	}
-	dax_unmap_atomic(bdev, &dax);
-
-	dev_dbg(part_to_dev(bdev->bd_part),
-			"%s: %s addr: %lx pfn: %lx sect: %llx\n",
-			__func__, current->comm, address,
-			pfn_t_to_pfn(dax.pfn),
-			(unsigned long long) dax.sector);
-	return major | vmf_insert_pfn_pud(vma, address, vmf->pud,
-						dax.pfn, write);
- unmap:
-	dax_unmap_atomic(bdev, &dax);
-	count_vm_event(THP_FAULT_FALLBACK);
-	return VM_FAULT_FALLBACK;
-}
-
 static int dax_pud_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		get_block_t get_block, dax_iodone_t complete_unwritten)
 {
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
+	void *entry;
+	pfn_t pfn;
 	struct buffer_head bh;
-	unsigned blkbits = inode->i_blkbits;
-	unsigned long address = (unsigned long)vmf->virtual_address;
-	unsigned long pud_addr = address & PUD_MASK;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	unsigned long pud_addr = vaddr & PUD_MASK;
 	pgoff_t size;
-	sector_t block;
-	int result;
-	bool alloc = false;
+	int result = 0;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	/* dax pud mappings require pfn_t_devmap() */
 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
@@ -988,17 +925,17 @@ static int dax_pud_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
-		split_huge_pud(vma, vmf->pud, address);
-		dax_pud_dbg(NULL, address, "cow write");
+		split_huge_pud(vma, vmf->pud, vaddr);
+		dax_pud_dbg(NULL, vaddr, "cow write");
 		return VM_FAULT_FALLBACK;
 	}
 	/* If the PUD would extend outside the VMA */
 	if (pud_addr < vma->vm_start) {
-		dax_pud_dbg(NULL, address, "vma start unaligned");
+		dax_pud_dbg(NULL, vaddr, "vma start unaligned");
 		return VM_FAULT_FALLBACK;
 	}
 	if ((pud_addr + PUD_SIZE) > vma->vm_end) {
-		dax_pud_dbg(NULL, address, "vma end unaligned");
+		dax_pud_dbg(NULL, vaddr, "vma end unaligned");
 		return VM_FAULT_FALLBACK;
 	}
 
@@ -1007,52 +944,50 @@ static int dax_pud_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		return VM_FAULT_SIGBUS;
 	/* If the PUD would cover blocks out of the file */
 	if ((vmf->pgoff | PG_PUD_COLOUR) >= size) {
-		dax_pud_dbg(NULL, address,
+		dax_pud_dbg(NULL, vaddr,
 				"offset + huge page size > file size");
 		return VM_FAULT_FALLBACK;
 	}
 
 	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
-	block = (sector_t)vmf->pgoff << (PAGE_CACHE_SHIFT - blkbits);
 
-	bh.b_size = PUD_SIZE;
-
-	if (get_block(inode, block, &bh, 0) != 0)
-		return VM_FAULT_SIGBUS;
-
-	if (!buffer_mapped(&bh) && write) {
-		if (get_block(inode, block, &bh, 1) != 0)
-			return VM_FAULT_SIGBUS;
-		alloc = true;
-	}
-
-	/*
-	 * If the filesystem isn't willing to tell us the length of a hole,
-	 * just fall back to PMDs.  Calling get_block 512 times in a loop
-	 * would be silly.
-	 */
-	if (!buffer_size_valid(&bh) || bh.b_size < PUD_SIZE) {
-		dax_pud_dbg(&bh, address, "allocated block too small");
-		return VM_FAULT_FALLBACK;
-	}
+	entry = find_get_entry(mapping, vmf->pgoff);
+	if (radix_tree_exceptional_entry(entry) &&
+				RADIX_DAX_SIZE(entry) >= RADIX_DAX_PUD) {
+		pfn = radix_to_pfn_t(entry, vmf->pgoff);
+	} else {
+		int error = dax_create_pfns(mapping, vmf->pgoff, PUD_SIZE,
+					write, &pfn, get_block, &bh);
+		if (error < 0)
+			goto fallback;
 
-	/*
-	 * If we allocated new storage, make sure no process has any
-	 * zero pages covering this hole
-	 */
-	if (alloc) {
-		loff_t lstart = vmf->pgoff << PAGE_CACHE_SHIFT;
-		loff_t lend = lstart + PUD_SIZE - 1; /* inclusive */
+		if (error) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			result = VM_FAULT_MAJOR;
+			error = 0;
+		}
 
-		truncate_pagecache_range(inode, lstart, lend);
+		/*
+		 * We don't know if dax_create_pfns() was able to allocate
+		 * a contiguous aligned chunk, or whether it was only able
+		 * to do a partial allocation.
+		 */
+		entry = find_get_entry(mapping, vmf->pgoff);
+		if (!radix_tree_exceptional_entry(entry) ||
+				RADIX_DAX_SIZE(entry) < RADIX_DAX_PUD)
+			goto fallback;
 	}
 
-	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
-		dax_pud_dbg(&bh, address, "no zero page");
+	if (is_bad_pfn_t(pfn)) {
+		dax_pud_dbg(&bh, vaddr, "no zero page");
 		goto fallback;
 	} else {
-		result = dax_insert_pud_mapping(inode, &bh, vma, vmf);
+		if (current->needs_wmb)
+			wmb_pmem();
+
+		result |= vmf_insert_pfn_pud(vma, vaddr, vmf->pud, pfn,
+						write);
 	}
 
  out:
@@ -1113,17 +1048,13 @@ EXPORT_SYMBOL_GPL(dax_fault);
  */
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct file *file = vma->vm_file;
+	struct address_space *mapping = vma->vm_file->f_mapping;
+
+	spin_lock_irq(&mapping->tree_lock);
+	radix_tree_tag_set(&mapping->page_tree, vmf->pgoff,
+							PAGECACHE_TAG_DIRTY);
+	spin_unlock_irq(&mapping->tree_lock);
 
-	/*
-	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
-	 * RADIX_DAX_PTE entry already exists in the radix tree from a
-	 * previous call to dax_fault().  We just want to look up that PTE
-	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
-	 * saves us from having to make a call to get_block() here to look
-	 * up the sector.
-	 */
-	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8e58c36..0a6505d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -5,9 +5,10 @@
 #include <linux/mm.h>
 #include <asm/pgtable.h>
 
+int dax_clear_blocks(struct block_device *, sector_t sector, long size);
+
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
 		  get_block_t, dio_iodone_t, int flags);
-int dax_clear_blocks(struct inode *, sector_t block, long size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 07d18a8..95a7b50 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -8,21 +8,46 @@
  * PFN_SG_LAST - pfn references a page and is the last scatterlist entry
  * PFN_DEV - pfn is not covered by system memmap by default
  * PFN_MAP - pfn has a dynamic page mapping established by a device driver
+ *
+ * Note that DAX uses the same format for its radix tree entries.  The
+ * bottom two bits are used by the radix tree.
  */
-#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \
-		<< (BITS_PER_LONG - PAGE_SHIFT))
-#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1))
-#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2))
-#define PFN_DEV (1UL << (BITS_PER_LONG - 3))
-#define PFN_MAP (1UL << (BITS_PER_LONG - 4))
+#define PFN_FLAG_BITS	4
+#define PFN_FLAGS_MASK	((1 << PFN_FLAG_BITS) - 1)
+#define PFN_SG_CHAIN	0x1UL
+#define PFN_SG_LAST	0x2UL
+#define PFN_DEV		0x4UL
+#define PFN_MAP		0x8UL
 
 static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags)
 {
-	pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };
+	pfn_t pfn_t = { .val = (pfn << PFN_FLAG_BITS) |
+					(flags & PFN_FLAGS_MASK), };
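+	/* e.g. __pfn_to_pfn_t(0x1234, PFN_DEV | PFN_MAP).val == 0x1234c */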
 
 	return pfn_t;
 }
 
+static inline __must_check pfn_t pfn_t_add(const pfn_t pfn, int val)
+{
+	pfn_t tmp = pfn;
+	tmp.val += val << PFN_FLAG_BITS;
+	return tmp;
+}
+
+/*
+ * It makes no sense to have both SG_CHAIN and SG_LAST set, so we could
+ * encode an errno in here if we need to.  Note that you can't put a
+ * bad_pfn_t in the radix tree because the radix tree uses the bottom bit
+ * for its own purposes.
+ */
+#define bad_pfn_t	((pfn_t) { .val = -1 })
+
+static inline bool is_bad_pfn_t(pfn_t pfn)
+{
+	return ((pfn.val & (PFN_SG_CHAIN | PFN_SG_LAST)) ==
+			  (PFN_SG_CHAIN | PFN_SG_LAST));
+}
+
 /* a default pfn to pfn_t conversion assumes that @pfn is pfn_valid() */
 static inline pfn_t pfn_to_pfn_t(unsigned long pfn)
 {
@@ -38,7 +63,7 @@ static inline bool pfn_t_has_page(pfn_t pfn)
 
 static inline unsigned long pfn_t_to_pfn(pfn_t pfn)
 {
-	return pfn.val & ~PFN_FLAGS_MASK;
+	return pfn.val >> PFN_FLAG_BITS;
 }
 
 static inline struct page *pfn_t_to_page(pfn_t pfn)
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 7c88ad1..57e7d87 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -51,15 +51,6 @@
 #define RADIX_TREE_EXCEPTIONAL_ENTRY	2
 #define RADIX_TREE_EXCEPTIONAL_SHIFT	2
 
-#define RADIX_DAX_MASK	0xf
-#define RADIX_DAX_SHIFT	4
-#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
-#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
-#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
-
 static inline int radix_tree_is_indirect_ptr(void *ptr)
 {
 	return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e95d8a..2cdfe76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1476,6 +1476,7 @@ struct task_struct {
 	/* unserialized, strictly 'current' */
 	unsigned in_execve:1; /* bit to tell LSMs we're in execve */
 	unsigned in_iowait:1;
+	unsigned needs_wmb:1;
 #ifdef CONFIG_MEMCG
 	unsigned memcg_may_oom:1;
 #ifndef CONFIG_SLOB
-- 
2.7.0.rc3
