linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* sub-page blocksize support in iomap non-buffer head path v3
@ 2018-05-23 14:46 Christoph Hellwig
  2018-05-23 14:46 ` [PATCH 1/2] iomap: add support for sub-pagesize buffered I/O without buffer heads Christoph Hellwig
  2018-05-23 14:46 ` [PATCH 2/2] xfs: add support for sub-pagesize writeback without buffer_heads Christoph Hellwig
  0 siblings, 2 replies; 7+ messages in thread
From: Christoph Hellwig @ 2018-05-23 14:46 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Hi all,

this series adds support for buffered I/O without buffer heads for
block size < PAGE_SIZE to the iomap and XFS code.

A git tree is available at:

    git://git.infradead.org/users/hch/xfs.git xfs-iomap-read xfs-remove-bufferheads.2

Gitweb:

    http://git.infradead.org/users/hch/xfs.git/shortlog/refs/heads/xfs-remove-bufferheads.2

Changes since v2:
 - rebased

Changes since v1:
 - call iomap_page_create in page_mkwrite to fix generic/095
 - split into a separate series

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/2] iomap: add support for sub-pagesize buffered I/O without buffer heads
  2018-05-23 14:46 sub-page blocksize support in iomap non-buffer head path v3 Christoph Hellwig
@ 2018-05-23 14:46 ` Christoph Hellwig
  2018-05-25 17:17   ` Brian Foster
  2018-05-23 14:46 ` [PATCH 2/2] xfs: add support for sub-pagesize writeback without buffer_heads Christoph Hellwig
  1 sibling, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2018-05-23 14:46 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

After already supporting a simple implementation of buffered writes for
the blocksize == PAGE_SIZE case in the last commit this adds full support
even for smaller block sizes.   There are three bits of per-block
information in the buffer_head structure that really matter for the iomap
read and write path:

 - uptodate status (BH_uptodate)
 - marked as currently under read I/O (BH_Async_Read)
 - marked as currently under write I/O (BH_Async_Write)

Instead of having new per-block structures this now adds a per-page
structure called struct iomap_page to track this information in a slightly
different form:

 - a bitmap for the per-block uptodate status.  For the worst case of a
   64k page size system this bitmap needs to contain 128 bits.  For the
   typical 4k page size case it only needs 8 bits, although we still
   need a full unsigned long due to the way the atomic bitmap API works.
 - two atomic_t counters are used to track the outstanding read and write
   counts

There is quite a bit of boilerplate code as the buffered I/O path uses
various helper methods, but the actual code is very straightforward.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c            | 247 +++++++++++++++++++++++++++++++++++++++---
 include/linux/iomap.h |  31 ++++++
 2 files changed, 260 insertions(+), 18 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index debb859a8a14..ea746e0287f9 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -17,6 +17,7 @@
 #include <linux/iomap.h>
 #include <linux/uaccess.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
@@ -104,6 +105,107 @@ iomap_sector(struct iomap *iomap, loff_t pos)
 	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
 }
 
+static struct iomap_page *
+iomap_page_create(struct inode *inode, struct page *page)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (iop || i_blocksize(inode) == PAGE_SIZE)
+		return iop;
+
+	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
+	atomic_set(&iop->read_count, 0);
+	atomic_set(&iop->write_count, 0);
+	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+	set_page_private(page, (unsigned long)iop);
+	SetPagePrivate(page);
+	return iop;
+}
+
+/*
+ * Calculate the range inside the page that we actually need to read.
+ */
+static void
+iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
+		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
+{
+	unsigned poff = *pos & (PAGE_SIZE - 1);
+	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+
+	if (iop) {
+		unsigned block_size = i_blocksize(inode);
+		unsigned first = poff >> inode->i_blkbits;
+		unsigned last = (poff + plen - 1) >> inode->i_blkbits;
+		unsigned int i;
+
+		/* move forward for each leading block marked uptodate */
+		for (i = first; i <= last; i++) {
+			if (!test_bit(i, iop->uptodate))
+				break;
+			*pos += block_size;
+			poff += block_size;
+			plen -= block_size;
+		}
+
+		/* truncate len if we find any trailing uptodate block(s) */
+		for ( ; i <= last; i++) {
+			if (test_bit(i, iop->uptodate)) {
+				plen -= (last - i + 1) * block_size;
+				break;
+			}
+		}
+	}
+
+	*offp = poff;
+	*lenp = plen;
+}
+
+static void
+iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned first = off >> inode->i_blkbits;
+	unsigned last = (off + len - 1) >> inode->i_blkbits;
+	unsigned int i;
+	bool uptodate = true;
+
+	if (iop) {
+		for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
+			if (i >= first && i <= last)
+				set_bit(i, iop->uptodate);
+			else if (!test_bit(i, iop->uptodate))
+				uptodate = false;
+		}
+	}
+
+	if (uptodate && !PageError(page))
+		SetPageUptodate(page);
+}
+
+static void
+iomap_read_finish(struct iomap_page *iop, struct page *page)
+{
+	if (!iop || atomic_dec_and_test(&iop->read_count))
+		unlock_page(page);
+}
+
+static void
+iomap_read_page_end_io(struct bio_vec *bvec, int error)
+{
+	struct page *page = bvec->bv_page;
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (unlikely(error)) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
+	}
+
+	iomap_read_finish(iop, page);
+}
+
 static void
 iomap_read_end_io(struct bio *bio)
 {
@@ -112,7 +214,7 @@ iomap_read_end_io(struct bio *bio)
 	int i;
 
 	bio_for_each_segment_all(bvec, bio, i)
-		page_endio(bvec->bv_page, false, error);
+		iomap_read_page_end_io(bvec, error);
 	bio_put(bio);
 }
 
@@ -142,18 +244,19 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 {
 	struct iomap_readpage_ctx *ctx = data;
 	struct page *page = ctx->cur_page;
-	unsigned poff = pos & (PAGE_SIZE - 1);
-	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+	struct iomap_page *iop = iomap_page_create(inode, page);
 	bool is_contig = false;
+	loff_t orig_pos = pos;
+	unsigned poff, plen;
 	sector_t sector;
 
-	/* we don't support blocksize < PAGE_SIZE quite yet: */
-	WARN_ON_ONCE(pos != page_offset(page));
-	WARN_ON_ONCE(plen != PAGE_SIZE);
+	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
+	if (plen == 0)
+		goto done;
 
 	if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
 		zero_user(page, poff, plen);
-		SetPageUptodate(page);
+		iomap_set_range_uptodate(page, poff, plen);
 		goto done;
 	}
 
@@ -169,6 +272,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		is_contig = true;
 	}
 
+	/*
+	 * If we start a new segment we need to increase the read count, and we
+	 * need to do so before submitting any previous full bio to make sure
+	 * that we don't prematurely unlock the page.
+	 */
+	if (iop)
+		atomic_inc(&iop->read_count);
+
 	if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
 		if (ctx->bio)
 			submit_bio(ctx->bio);
@@ -177,7 +288,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 	__bio_add_page(ctx->bio, page, plen, poff);
 done:
-	return plen;
+	return pos - orig_pos + plen;
 }
 
 int
@@ -188,8 +299,6 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
 	unsigned poff;
 	loff_t ret;
 
-	WARN_ON_ONCE(page_has_buffers(page));
-
 	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
 		ret = iomap_apply(inode, page_offset(page) + poff,
 				PAGE_SIZE - poff, 0, ops, &ctx,
@@ -295,6 +404,92 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
 }
 EXPORT_SYMBOL_GPL(iomap_readpages);
 
+int
+iomap_is_partially_uptodate(struct page *page, unsigned long from,
+		unsigned long count)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned first = from >> inode->i_blkbits;
+	unsigned last = (from + count - 1) >> inode->i_blkbits;
+	unsigned i;
+
+	if (iop) {
+		for (i = first; i <= last; i++)
+			if (!test_bit(i, iop->uptodate))
+				return 0;
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
+
+int
+iomap_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	/*
+	 * mm accommodates an old ext3 case where clean pages might not have had
+	 * the dirty bit cleared. Thus, it can send actual dirty pages to
+	 * ->releasepage() via shrink_active_list(), skip those here.
+	 */
+	if (PageDirty(page) || PageWriteback(page))
+		return 0;
+
+	if (iop) {
+		WARN_ON_ONCE(atomic_read(&iop->read_count));
+		WARN_ON_ONCE(atomic_read(&iop->write_count));
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		kfree(iop);
+	}
+	return 1;
+}
+EXPORT_SYMBOL_GPL(iomap_releasepage);
+
+void
+iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
+{
+	/*
+	 * If we are invalidating the entire page, clear the dirty state from it
+	 * and release it to avoid unnecessary buildup of the LRU.
+	 */
+	if (offset == 0 && len == PAGE_SIZE) {
+		cancel_dirty_page(page);
+		iomap_releasepage(page, GFP_NOIO);
+	}
+}
+EXPORT_SYMBOL_GPL(iomap_invalidatepage);
+
+#ifdef CONFIG_MIGRATION
+int
+iomap_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode)
+{
+	int ret;
+
+	ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+	if (ret != MIGRATEPAGE_SUCCESS)
+		return ret;
+
+	if (page_has_private(page)) {
+		ClearPagePrivate(page);
+		set_page_private(newpage, page_private(page));
+		set_page_private(page, 0);
+		SetPagePrivate(newpage);
+	}
+
+	if (mode != MIGRATE_SYNC_NO_COPY)
+		migrate_page_copy(newpage, page);
+	else
+		migrate_page_states(newpage, page);
+	return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(iomap_migrate_page);
+#endif /* CONFIG_MIGRATION */
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -318,6 +513,7 @@ iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
 
 	if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
 		zero_user_segments(page, poff, from, to, poff + plen);
+		iomap_set_range_uptodate(page, poff, plen);
 		return 0;
 	}
 
@@ -333,6 +529,7 @@ static int
 __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
 		struct page *page, struct iomap *iomap)
 {
+	struct iomap_page *iop = iomap_page_create(inode, page);
 	loff_t block_size = i_blocksize(inode);
 	loff_t block_start = pos & ~(block_size - 1);
 	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
@@ -340,15 +537,29 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
 	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);
 	unsigned from = pos & (PAGE_SIZE - 1);
 	unsigned to = from + len;
-
-	WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+	int status;
 
 	if (PageUptodate(page))
 		return 0;
-	if (from <= poff && to >= poff + plen)
-		return 0;
-	return iomap_read_page_sync(inode, block_start, page,
-			poff, plen, from, to, iomap);
+
+	do {
+		iomap_adjust_read_range(inode, iop, &block_start,
+				block_end - block_start, &poff, &plen);
+		if (plen == 0)
+			break;
+
+		if ((from > poff && from < poff + plen) ||
+		    (to > poff && to < poff + plen)) {
+			status = iomap_read_page_sync(inode, block_start, page,
+					poff, plen, from, to, iomap);
+			if (status)
+				return status;
+		}
+
+		block_start += plen;
+	} while (poff + plen < PAGE_SIZE);
+
+	return 0;
 }
 
 static int
@@ -429,7 +640,7 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 	if (unlikely(copied < len && !PageUptodate(page))) {
 		copied = 0;
 	} else {
-		SetPageUptodate(page);
+		iomap_set_range_uptodate(page, pos & (PAGE_SIZE - 1), len);
 		iomap_set_page_dirty(page);
 	}
 	return __generic_write_end(inode, pos, copied, page);
@@ -741,7 +952,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 		block_commit_write(page, 0, length);
 	} else {
 		WARN_ON_ONCE(!PageUptodate(page));
-		WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+		iomap_page_create(inode, page);
 	}
 
 	return length;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4d3d9d0cd69f..7f8787a1bbce 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -2,6 +2,9 @@
 #ifndef LINUX_IOMAP_H
 #define LINUX_IOMAP_H 1
 
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/mm.h>
 #include <linux/types.h>
 
 struct address_space;
@@ -88,12 +91,40 @@ struct iomap_ops {
 			ssize_t written, unsigned flags, struct iomap *iomap);
 };
 
+/*
+ * Structure allocated for each page when block size < PAGE_SIZE to track
+ * sub-page uptodate status and I/O completions.
+ */
+struct iomap_page {
+	atomic_t		read_count;
+	atomic_t		write_count;
+	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+};
+
+static inline struct iomap_page *to_iomap_page(struct page *page)
+{
+	if (page_has_private(page))
+		return (struct iomap_page *)page_private(page);
+	return NULL;
+}
+
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);
 int iomap_readpage(struct page *page, const struct iomap_ops *ops);
 int iomap_readpages(struct address_space *mapping, struct list_head *pages,
 		unsigned nr_pages, const struct iomap_ops *ops);
 int iomap_set_page_dirty(struct page *page);
+int iomap_is_partially_uptodate(struct page *page, unsigned long from,
+		unsigned long count);
+int iomap_releasepage(struct page *page, gfp_t gfp_mask);
+void iomap_invalidatepage(struct page *page, unsigned int offset,
+		unsigned int len);
+#ifdef CONFIG_MIGRATION
+int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode);
+#else
+#define iomap_migrate_page NULL
+#endif
 int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
-- 
2.17.0

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 2/2] xfs: add support for sub-pagesize writeback without buffer_heads
  2018-05-23 14:46 sub-page blocksize support in iomap non-buffer head path v3 Christoph Hellwig
  2018-05-23 14:46 ` [PATCH 1/2] iomap: add support for sub-pagesize buffered I/O without buffer heads Christoph Hellwig
@ 2018-05-23 14:46 ` Christoph Hellwig
  2018-05-25 17:17   ` Brian Foster
  1 sibling, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2018-05-23 14:46 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Switch to using the iomap_page structure for checking sub-page uptodate
status and tracking sub-page I/O completion status, and remove large
quantities of boilerplate code working around buffer heads.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c  | 536 +++++++--------------------------------------
 fs/xfs/xfs_buf.h   |   1 -
 fs/xfs/xfs_iomap.c |   3 -
 fs/xfs/xfs_super.c |   2 +-
 fs/xfs/xfs_trace.h |  18 +-
 5 files changed, 79 insertions(+), 481 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index efa2cbb27d67..d279929e53fb 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -32,9 +32,6 @@
 #include "xfs_bmap_util.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_reflink.h"
-#include <linux/gfp.h>
-#include <linux/mpage.h>
-#include <linux/pagevec.h>
 #include <linux/writeback.h>
 
 /*
@@ -46,25 +43,6 @@ struct xfs_writepage_ctx {
 	struct xfs_ioend	*ioend;
 };
 
-void
-xfs_count_page_state(
-	struct page		*page,
-	int			*delalloc,
-	int			*unwritten)
-{
-	struct buffer_head	*bh, *head;
-
-	*delalloc = *unwritten = 0;
-
-	bh = head = page_buffers(page);
-	do {
-		if (buffer_unwritten(bh))
-			(*unwritten) = 1;
-		else if (buffer_delay(bh))
-			(*delalloc) = 1;
-	} while ((bh = bh->b_this_page) != head);
-}
-
 struct block_device *
 xfs_find_bdev_for_inode(
 	struct inode		*inode)
@@ -97,67 +75,17 @@ xfs_finish_page_writeback(
 	struct bio_vec		*bvec,
 	int			error)
 {
+	struct iomap_page	*iop = to_iomap_page(bvec->bv_page);
+
 	if (error) {
 		SetPageError(bvec->bv_page);
 		mapping_set_error(inode->i_mapping, -EIO);
 	}
-	end_page_writeback(bvec->bv_page);
-}
 
-/*
- * We're now finished for good with this page.  Update the page state via the
- * associated buffer_heads, paying attention to the start and end offsets that
- * we need to process on the page.
- *
- * Note that we open code the action in end_buffer_async_write here so that we
- * only have to iterate over the buffers attached to the page once.  This is not
- * only more efficient, but also ensures that we only calls end_page_writeback
- * at the end of the iteration, and thus avoids the pitfall of having the page
- * and buffers potentially freed after every call to end_buffer_async_write.
- */
-static void
-xfs_finish_buffer_writeback(
-	struct inode		*inode,
-	struct bio_vec		*bvec,
-	int			error)
-{
-	struct buffer_head	*head = page_buffers(bvec->bv_page), *bh = head;
-	bool			busy = false;
-	unsigned int		off = 0;
-	unsigned long		flags;
-
-	ASSERT(bvec->bv_offset < PAGE_SIZE);
-	ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
-	ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
-	ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
-
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
-	do {
-		if (off >= bvec->bv_offset &&
-		    off < bvec->bv_offset + bvec->bv_len) {
-			ASSERT(buffer_async_write(bh));
-			ASSERT(bh->b_end_io == NULL);
-
-			if (error) {
-				mark_buffer_write_io_error(bh);
-				clear_buffer_uptodate(bh);
-				SetPageError(bvec->bv_page);
-			} else {
-				set_buffer_uptodate(bh);
-			}
-			clear_buffer_async_write(bh);
-			unlock_buffer(bh);
-		} else if (buffer_async_write(bh)) {
-			ASSERT(buffer_locked(bh));
-			busy = true;
-		}
-		off += bh->b_size;
-	} while ((bh = bh->b_this_page) != head);
-	bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
-	local_irq_restore(flags);
+	ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
+	ASSERT(!iop || atomic_read(&iop->write_count) > 0);
 
-	if (!busy)
+	if (!iop || atomic_dec_and_test(&iop->write_count))
 		end_page_writeback(bvec->bv_page);
 }
 
@@ -191,12 +119,8 @@ xfs_destroy_ioend(
 			next = bio->bi_private;
 
 		/* walk each page on bio, ending page IO on them */
-		bio_for_each_segment_all(bvec, bio, i) {
-			if (page_has_buffers(bvec->bv_page))
-				xfs_finish_buffer_writeback(inode, bvec, error);
-			else
-				xfs_finish_page_writeback(inode, bvec, error);
-		}
+		bio_for_each_segment_all(bvec, bio, i)
+			xfs_finish_page_writeback(inode, bvec, error);
 		bio_put(bio);
 	}
 
@@ -638,6 +562,7 @@ xfs_add_to_ioend(
 	struct inode		*inode,
 	xfs_off_t		offset,
 	struct page		*page,
+	struct iomap_page	*iop,
 	struct xfs_writepage_ctx *wpc,
 	struct writeback_control *wbc,
 	struct list_head	*iolist)
@@ -661,100 +586,27 @@ xfs_add_to_ioend(
 				bdev, sector);
 	}
 
-	/*
-	 * If the block doesn't fit into the bio we need to allocate a new
-	 * one.  This shouldn't happen more than once for a given block.
-	 */
-	while (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len)
-		xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
+	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+		if (iop)
+			atomic_inc(&iop->write_count);
+		if (bio_full(wpc->ioend->io_bio))
+			xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
+		__bio_add_page(wpc->ioend->io_bio, page, len, poff);
+	}
 
 	wpc->ioend->io_size += len;
 }
 
-STATIC void
-xfs_map_buffer(
-	struct inode		*inode,
-	struct buffer_head	*bh,
-	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset)
-{
-	sector_t		bn;
-	struct xfs_mount	*m = XFS_I(inode)->i_mount;
-	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
-	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
-
-	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-
-	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
-	      ((offset - iomap_offset) >> inode->i_blkbits);
-
-	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
-
-	bh->b_blocknr = bn;
-	set_buffer_mapped(bh);
-}
-
-STATIC void
-xfs_map_at_offset(
-	struct inode		*inode,
-	struct buffer_head	*bh,
-	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset)
-{
-	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-
-	lock_buffer(bh);
-	xfs_map_buffer(inode, bh, imap, offset);
-	set_buffer_mapped(bh);
-	clear_buffer_delay(bh);
-	clear_buffer_unwritten(bh);
-
-	/*
-	 * If this is a realtime file, data may be on a different device.
-	 * to that pointed to from the buffer_head b_bdev currently. We can't
-	 * trust that the bufferhead has a already been mapped correctly, so
-	 * set the bdev now.
-	 */
-	bh->b_bdev = xfs_find_bdev_for_inode(inode);
-	bh->b_end_io = NULL;
-	set_buffer_async_write(bh);
-	set_buffer_uptodate(bh);
-	clear_buffer_dirty(bh);
-}
-
-STATIC void
-xfs_vm_invalidatepage(
-	struct page		*page,
-	unsigned int		offset,
-	unsigned int		length)
-{
-	trace_xfs_invalidatepage(page->mapping->host, page, offset,
-				 length);
-
-	/*
-	 * If we are invalidating the entire page, clear the dirty state from it
-	 * so that we can check for attempts to release dirty cached pages in
-	 * xfs_vm_releasepage().
-	 */
-	if (offset == 0 && length >= PAGE_SIZE)
-		cancel_dirty_page(page);
-	block_invalidatepage(page, offset, length);
-}
-
 /*
- * If the page has delalloc buffers on it, we need to punch them out before we
- * invalidate the page. If we don't, we leave a stale delalloc mapping on the
- * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
- * is done on that same region - the delalloc extent is returned when none is
- * supposed to be there.
+ * If the page has delalloc blocks on it, we need to punch them out before we
+ * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
+ * inode that can trip up a later direct I/O read operation on the same region.
  *
- * We prevent this by truncating away the delalloc regions on the page before
- * invalidating it. Because they are delalloc, we can do this without needing a
- * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
- * truncation without a transaction as there is no space left for block
- * reservation (typically why we see a ENOSPC in writeback).
+ * We prevent this by truncating away the delalloc regions on the page.  Because
+ * they are delalloc, we can do this without needing a transaction. Indeed - if
+ * we get ENOSPC errors, we have to be able to do this truncation without a
+ * transaction as there is no space left for block reservation (typically why we
+ * see a ENOSPC in writeback).
  */
 STATIC void
 xfs_aops_discard_page(
@@ -768,7 +620,7 @@ xfs_aops_discard_page(
 	int			error;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		goto out_invalidate;
+		goto out;
 
 	xfs_alert(mp,
 		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
@@ -778,15 +630,15 @@ xfs_aops_discard_page(
 			PAGE_SIZE / i_blocksize(inode));
 	if (error && !XFS_FORCED_SHUTDOWN(mp))
 		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
-out_invalidate:
-	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
+out:
+	iomap_invalidatepage(page, 0, PAGE_SIZE);
 }
 
 /*
  * We implement an immediate ioend submission policy here to avoid needing to
  * chain multiple ioends and hence nest mempool allocations which can violate
  * forward progress guarantees we need to provide. The current ioend we are
- * adding buffers to is cached on the writepage context, and if the new buffer
+ * adding blocks to is cached on the writepage context, and if the new block
  * does not append to the cached ioend it will create a new ioend and cache that
  * instead.
  *
@@ -807,41 +659,28 @@ xfs_writepage_map(
 	uint64_t		end_offset)
 {
 	LIST_HEAD(submit_list);
+	struct iomap_page	*iop = to_iomap_page(page);
+	unsigned		len = i_blocksize(inode);
 	struct xfs_ioend	*ioend, *next;
-	struct buffer_head	*bh = NULL;
-	ssize_t			len = i_blocksize(inode);
-	int			error = 0;
-	int			count = 0;
-	loff_t			file_offset;	/* file offset of page */
-	unsigned		poffset;	/* offset into page */
+	int			error = 0, count = 0, i;
+	u64			file_offset;	/* file offset of page */
 
-	if (page_has_buffers(page))
-		bh = page_buffers(page);
+	ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
+	ASSERT(!iop || atomic_read(&iop->write_count) == 0);
 
 	/*
-	 * Walk the blocks on the page, and we we run off then end of the
-	 * current map or find the current map invalid, grab a new one.
-	 * We only use bufferheads here to check per-block state - they no
-	 * longer control the iteration through the page. This allows us to
-	 * replace the bufferhead with some other state tracking mechanism in
-	 * future.
+	 * Walk through the page to find areas to write back. If we run off the
+	 * end of the current map or find the current map invalid, grab a new
+	 * one.
 	 */
-	for (poffset = 0, file_offset = page_offset(page);
-	     poffset < PAGE_SIZE;
-	     poffset += len, file_offset += len) {
-		/* past the range we are writing, so nothing more to write. */
-		if (file_offset >= end_offset)
-			break;
-
+	for (i = 0, file_offset = page_offset(page);
+	     i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
+	     i++, file_offset += len) {
 		/*
 		 * Block does not contain valid data, skip it.
 		 */
-		if (bh && !buffer_uptodate(bh)) {
-			if (PageUptodate(page))
-				ASSERT(buffer_mapped(bh));
-			bh = bh->b_this_page;
+		if (iop && !test_bit(i, iop->uptodate))
 			continue;
-		}
 
 		/*
 		 * If we don't have a valid map, now it's time to get a new one
@@ -854,52 +693,33 @@ xfs_writepage_map(
 			error = xfs_map_blocks(inode, file_offset, &wpc->imap,
 					     &wpc->io_type);
 			if (error)
-				goto out;
+				break;
 		}
 
-		if (wpc->io_type == XFS_IO_HOLE) {
-			/*
-			 * set_page_dirty dirties all buffers in a page, independent
-			 * of their state.  The dirty state however is entirely
-			 * meaningless for holes (!mapped && uptodate), so check we did
-			 * have a buffer covering a hole here and continue.
-			 */
-			if (bh)
-				bh = bh->b_this_page;
-			continue;
-		}
-
-		if (bh) {
-			xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
-			bh = bh->b_this_page;
+		if (wpc->io_type != XFS_IO_HOLE) {
+			xfs_add_to_ioend(inode, file_offset, page, iop, wpc,
+				wbc, &submit_list);
+			count++;
 		}
-		xfs_add_to_ioend(inode, file_offset, page, wpc, wbc,
-				&submit_list);
-		count++;
 	}
 
 	ASSERT(wpc->ioend || list_empty(&submit_list));
-
-out:
 	ASSERT(PageLocked(page));
 	ASSERT(!PageWriteback(page));
 
 	/*
-	 * On error, we have to fail the ioend here because we have locked
-	 * buffers in the ioend. If we don't do this, we'll deadlock
-	 * invalidating the page as that tries to lock the buffers on the page.
-	 * Also, because we may have set pages under writeback, we have to make
-	 * sure we run IO completion to mark the error state of the IO
-	 * appropriately, so we can't cancel the ioend directly here. That means
-	 * we have to mark this page as under writeback if we included any
-	 * buffers from it in the ioend chain so that completion treats it
-	 * correctly.
+	 * On error, we have to fail the ioend here because we may have set
+	 * pages under writeback.  We have to make sure we run IO completion to
+	 * mark the error state of the IO appropriately, so we can't cancel the
+	 * ioend directly here.  That means we have to mark this page as under
+	 * writeback if we included any blocks from it in the ioend chain so
+	 * that completion treats it correctly.
 	 *
 	 * If we didn't include the page in the ioend, the on error we can
 	 * simply discard and unlock it as there are no other users of the page
-	 * or it's buffers right now. The caller will still need to trigger
-	 * submission of outstanding ioends on the writepage context so they are
-	 * treated correctly on error.
+	 * now.  The caller will still need to trigger submission of outstanding
+	 * ioends on the writepage context so they are treated correctly on
+	 * error.
 	 */
 	if (unlikely(error)) {
 		if (!count) {
@@ -940,8 +760,8 @@ xfs_writepage_map(
 	}
 
 	/*
-	 * We can end up here with no error and nothing to write if we race with
-	 * a partial page truncate on a sub-page block sized filesystem.
+	 * We can end up here with no error and nothing to write only if we race
+	 * with a partial page truncate on a sub-page block sized filesystem.
 	 */
 	if (!count)
 		end_page_writeback(page);
@@ -956,7 +776,6 @@ xfs_writepage_map(
  * For delalloc space on the page we need to allocate space and flush it.
  * For unwritten space on the page we need to start the conversion to
  * regular allocated space.
- * For any other dirty buffer heads on the page we should flush them.
  */
 STATIC int
 xfs_do_writepage(
@@ -1110,168 +929,6 @@ xfs_dax_writepages(
 			xfs_find_bdev_for_inode(mapping->host), wbc);
 }
 
-/*
- * Called to move a page into cleanable state - and from there
- * to be released. The page should already be clean. We always
- * have buffer heads in this call.
- *
- * Returns 1 if the page is ok to release, 0 otherwise.
- */
-STATIC int
-xfs_vm_releasepage(
-	struct page		*page,
-	gfp_t			gfp_mask)
-{
-	int			delalloc, unwritten;
-
-	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
-
-	/*
-	 * mm accommodates an old ext3 case where clean pages might not have had
-	 * the dirty bit cleared. Thus, it can send actual dirty pages to
-	 * ->releasepage() via shrink_active_list(). Conversely,
-	 * block_invalidatepage() can send pages that are still marked dirty but
-	 * otherwise have invalidated buffers.
-	 *
-	 * We want to release the latter to avoid unnecessary buildup of the
-	 * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
-	 * that are entirely invalidated and need to be released.  Hence the
-	 * only time we should get dirty pages here is through
-	 * shrink_active_list() and so we can simply skip those now.
-	 *
-	 * warn if we've left any lingering delalloc/unwritten buffers on clean
-	 * or invalidated pages we are about to release.
-	 */
-	if (PageDirty(page))
-		return 0;
-
-	xfs_count_page_state(page, &delalloc, &unwritten);
-
-	if (WARN_ON_ONCE(delalloc))
-		return 0;
-	if (WARN_ON_ONCE(unwritten))
-		return 0;
-
-	return try_to_free_buffers(page);
-}
-
-/*
- * If this is O_DIRECT or the mpage code calling tell them how large the mapping
- * is, so that we can avoid repeated get_blocks calls.
- *
- * If the mapping spans EOF, then we have to break the mapping up as the mapping
- * for blocks beyond EOF must be marked new so that sub block regions can be
- * correctly zeroed. We can't do this for mappings within EOF unless the mapping
- * was just allocated or is unwritten, otherwise the callers would overwrite
- * existing data with zeros. Hence we have to split the mapping into a range up
- * to and including EOF, and a second mapping for beyond EOF.
- */
-static void
-xfs_map_trim_size(
-	struct inode		*inode,
-	sector_t		iblock,
-	struct buffer_head	*bh_result,
-	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset,
-	ssize_t			size)
-{
-	xfs_off_t		mapping_size;
-
-	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
-	mapping_size <<= inode->i_blkbits;
-
-	ASSERT(mapping_size > 0);
-	if (mapping_size > size)
-		mapping_size = size;
-	if (offset < i_size_read(inode) &&
-	    (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) {
-		/* limit mapping to block that spans EOF */
-		mapping_size = roundup_64(i_size_read(inode) - offset,
-					  i_blocksize(inode));
-	}
-	if (mapping_size > LONG_MAX)
-		mapping_size = LONG_MAX;
-
-	bh_result->b_size = mapping_size;
-}
-
-static int
-xfs_get_blocks(
-	struct inode		*inode,
-	sector_t		iblock,
-	struct buffer_head	*bh_result,
-	int			create)
-{
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	xfs_fileoff_t		offset_fsb, end_fsb;
-	int			error = 0;
-	int			lockmode = 0;
-	struct xfs_bmbt_irec	imap;
-	int			nimaps = 1;
-	xfs_off_t		offset;
-	ssize_t			size;
-
-	BUG_ON(create);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-	offset = (xfs_off_t)iblock << inode->i_blkbits;
-	ASSERT(bh_result->b_size >= i_blocksize(inode));
-	size = bh_result->b_size;
-
-	if (offset >= i_size_read(inode))
-		return 0;
-
-	/*
-	 * Direct I/O is usually done on preallocated files, so try getting
-	 * a block mapping without an exclusive lock first.
-	 */
-	lockmode = xfs_ilock_data_map_shared(ip);
-
-	ASSERT(offset <= mp->m_super->s_maxbytes);
-	if (offset > mp->m_super->s_maxbytes - size)
-		size = mp->m_super->s_maxbytes - offset;
-	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
-	offset_fsb = XFS_B_TO_FSBT(mp, offset);
-
-	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
-			&nimaps, 0);
-	if (error)
-		goto out_unlock;
-	if (!nimaps) {
-		trace_xfs_get_blocks_notfound(ip, offset, size);
-		goto out_unlock;
-	}
-
-	trace_xfs_get_blocks_found(ip, offset, size,
-		imap.br_state == XFS_EXT_UNWRITTEN ?
-			XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
-	xfs_iunlock(ip, lockmode);
-
-	/* trim mapping down to size requested */
-	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
-
-	/*
-	 * For unwritten extents do not report a disk address in the buffered
-	 * read case (treat as if we're reading into a hole).
-	 */
-	if (xfs_bmap_is_real_extent(&imap))
-		xfs_map_buffer(inode, bh_result, &imap, offset);
-
-	/*
-	 * If this is a realtime file, data may be on a different device.
-	 * to that pointed to from the buffer_head b_bdev currently.
-	 */
-	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
-	return 0;
-
-out_unlock:
-	xfs_iunlock(ip, lockmode);
-	return error;
-}
-
 STATIC sector_t
 xfs_vm_bmap(
 	struct address_space	*mapping,
@@ -1301,9 +958,7 @@ xfs_vm_readpage(
 	struct page		*page)
 {
 	trace_xfs_vm_readpage(page->mapping->host, 1);
-	if (i_blocksize(page->mapping->host) == PAGE_SIZE)
-		return iomap_readpage(page, &xfs_iomap_ops);
-	return mpage_readpage(page, xfs_get_blocks);
+	return iomap_readpage(page, &xfs_iomap_ops);
 }
 
 STATIC int
@@ -1314,65 +969,26 @@ xfs_vm_readpages(
 	unsigned		nr_pages)
 {
 	trace_xfs_vm_readpages(mapping->host, nr_pages);
-	if (i_blocksize(mapping->host) == PAGE_SIZE)
-		return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
-	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
+	return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
 }
 
-/*
- * This is basically a copy of __set_page_dirty_buffers() with one
- * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
- * dirty, we'll never be able to clean them because we don't write buffers
- * beyond EOF, and that means we can't invalidate pages that span EOF
- * that have been marked dirty. Further, the dirty state can leak into
- * the file interior if the file is extended, resulting in all sorts of
- * bad things happening as the state does not match the underlying data.
- *
- * XXX: this really indicates that bufferheads in XFS need to die. Warts like
- * this only exist because of bufferheads and how the generic code manages them.
- */
-STATIC int
-xfs_vm_set_page_dirty(
-	struct page		*page)
+static int
+xfs_vm_releasepage(
+	struct page		*page,
+	gfp_t			gfp_mask)
 {
-	struct address_space	*mapping = page->mapping;
-	struct inode		*inode = mapping->host;
-	loff_t			end_offset;
-	loff_t			offset;
-	int			newly_dirty;
-
-	if (unlikely(!mapping))
-		return !TestSetPageDirty(page);
-
-	end_offset = i_size_read(inode);
-	offset = page_offset(page);
-
-	spin_lock(&mapping->private_lock);
-	if (page_has_buffers(page)) {
-		struct buffer_head *head = page_buffers(page);
-		struct buffer_head *bh = head;
+	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
+	return iomap_releasepage(page, gfp_mask);
+}
 
-		do {
-			if (offset < end_offset)
-				set_buffer_dirty(bh);
-			bh = bh->b_this_page;
-			offset += i_blocksize(inode);
-		} while (bh != head);
-	}
-	/*
-	 * Lock out page->mem_cgroup migration to keep PageDirty
-	 * synchronized with per-memcg dirty page counters.
-	 */
-	lock_page_memcg(page);
-	newly_dirty = !TestSetPageDirty(page);
-	spin_unlock(&mapping->private_lock);
-
-	if (newly_dirty)
-		__set_page_dirty(page, mapping, 1);
-	unlock_page_memcg(page);
-	if (newly_dirty)
-		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-	return newly_dirty;
+static void
+xfs_vm_invalidatepage(
+	struct page		*page,
+	unsigned int		offset,
+	unsigned int		length)
+{
+	trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
+	iomap_invalidatepage(page, offset, length);
 }
 
 static int
@@ -1390,13 +1006,13 @@ const struct address_space_operations xfs_address_space_operations = {
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
-	.set_page_dirty		= xfs_vm_set_page_dirty,
+	.set_page_dirty		= iomap_set_page_dirty,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.bmap			= xfs_vm_bmap,
 	.direct_IO		= noop_direct_IO,
-	.migratepage		= buffer_migrate_page,
-	.is_partially_uptodate  = block_is_partially_uptodate,
+	.migratepage		= iomap_migrate_page,
+	.is_partially_uptodate  = iomap_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 	.swap_activate		= xfs_iomap_swapfile_activate,
 };
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index f5f2b71c2fde..f3fa197bd272 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -24,7 +24,6 @@
 #include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/dax.h>
-#include <linux/buffer_head.h>
 #include <linux/uio.h>
 #include <linux/list_lru.h>
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 93c40da3378a..c646d84cd55e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1031,9 +1031,6 @@ xfs_file_iomap_begin(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (i_blocksize(inode) < PAGE_SIZE)
-		iomap->flags |= IOMAP_F_BUFFER_HEAD;
-
 	if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
 			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
 		/* Reserve delalloc blocks for regular writeback. */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 39e5ec3d407f..a9f23ec95216 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1866,7 +1866,7 @@ MODULE_ALIAS_FS("xfs");
 STATIC int __init
 xfs_init_zones(void)
 {
-	xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+	xfs_ioend_bioset = bioset_create(4 * (PAGE_SIZE / SECTOR_SIZE),
 			offsetof(struct xfs_ioend, io_inline_bio),
 			BIOSET_NEED_BVECS);
 	if (!xfs_ioend_bioset)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ed8f774944ba..e4dc7c7f3da9 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1165,33 +1165,23 @@ DECLARE_EVENT_CLASS(xfs_page_class,
 		__field(loff_t, size)
 		__field(unsigned long, offset)
 		__field(unsigned int, length)
-		__field(int, delalloc)
-		__field(int, unwritten)
 	),
 	TP_fast_assign(
-		int delalloc = -1, unwritten = -1;
-
-		if (page_has_buffers(page))
-			xfs_count_page_state(page, &delalloc, &unwritten);
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = XFS_I(inode)->i_ino;
 		__entry->pgoff = page_offset(page);
 		__entry->size = i_size_read(inode);
 		__entry->offset = off;
 		__entry->length = len;
-		__entry->delalloc = delalloc;
-		__entry->unwritten = unwritten;
 	),
 	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
-		  "length %x delalloc %d unwritten %d",
+		  "length %x",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->pgoff,
 		  __entry->size,
 		  __entry->offset,
-		  __entry->length,
-		  __entry->delalloc,
-		  __entry->unwritten)
+		  __entry->length)
 )
 
 #define DEFINE_PAGE_EVENT(name)		\
@@ -1275,9 +1265,6 @@ DEFINE_EVENT(xfs_imap_class, name,	\
 	TP_ARGS(ip, offset, count, type, irec))
 DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
 DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
 DEFINE_IOMAP_EVENT(xfs_iomap_found);
 
@@ -1316,7 +1303,6 @@ DEFINE_EVENT(xfs_simple_io_class, name,	\
 	TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
-DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
 DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
 DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
 DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
-- 
2.17.0

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/2] iomap: add support for sub-pagesize buffered I/O without buffer heads
  2018-05-23 14:46 ` [PATCH 1/2] iomap: add support for sub-pagesize buffered I/O without buffer heads Christoph Hellwig
@ 2018-05-25 17:17   ` Brian Foster
  2018-05-28  6:50     ` Christoph Hellwig
  0 siblings, 1 reply; 7+ messages in thread
From: Brian Foster @ 2018-05-25 17:17 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 23, 2018 at 04:46:45PM +0200, Christoph Hellwig wrote:
> After already supporting a simple implementation of buffered writes for
> the blocksize == PAGE_SIZE case in the last commit this adds full support
> even for smaller block sizes.   There are three bits of per-block
> information in the buffer_head structure that really matter for the iomap
> read and write path:
> 
>  - uptodate status (BH_uptodate)
>  - marked as currently under read I/O (BH_Async_Read)
>  - marked as currently under write I/O (BH_Async_Write)
> 
> Instead of having new per-block structures this now adds a per-page
> structure called struct iomap_page to track this information in a slightly
> different form:
> 
>  - a bitmap for the per-block uptodate status.  For worst case of a 64k
>    page size system this bitmap needs to contain 128 bits.  For the
>    typical 4k page size case it only needs 8 bits, although we still
>    need a full unsigned long due to the way the atomic bitmap API works.
>  - two atomic_t counters are used to track the outstanding read and write
>    counts
> 
> There is quite a bit of boilerplate code as the buffered I/O path uses
> various helper methods, but the actual code is very straight forward.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap.c            | 247 +++++++++++++++++++++++++++++++++++++++---
>  include/linux/iomap.h |  31 ++++++
>  2 files changed, 260 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/iomap.c b/fs/iomap.c
> index debb859a8a14..ea746e0287f9 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
...
> @@ -104,6 +105,107 @@ iomap_sector(struct iomap *iomap, loff_t pos)
>  	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
>  }
>  
> +static struct iomap_page *
> +iomap_page_create(struct inode *inode, struct page *page)
> +{
> +	struct iomap_page *iop = to_iomap_page(page);
> +
> +	if (iop || i_blocksize(inode) == PAGE_SIZE)
> +		return iop;
> +
> +	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
> +	atomic_set(&iop->read_count, 0);
> +	atomic_set(&iop->write_count, 0);
> +	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
> +	set_page_private(page, (unsigned long)iop);
> +	SetPagePrivate(page);

The buffer head implementation does a get/put page when the private
state is set. I'm not quite sure why that is tbh, but do you know
whether we need that here or not?

> +	return iop;
> +}
> +
...
> @@ -142,18 +244,19 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  {
>  	struct iomap_readpage_ctx *ctx = data;
>  	struct page *page = ctx->cur_page;
> -	unsigned poff = pos & (PAGE_SIZE - 1);
> -	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
> +	struct iomap_page *iop = iomap_page_create(inode, page);
>  	bool is_contig = false;
> +	loff_t orig_pos = pos;
> +	unsigned poff, plen;
>  	sector_t sector;
>  
> -	/* we don't support blocksize < PAGE_SIZE quite yet: */
> -	WARN_ON_ONCE(pos != page_offset(page));
> -	WARN_ON_ONCE(plen != PAGE_SIZE);
> +	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
> +	if (plen == 0)
> +		goto done;
>  
>  	if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
>  		zero_user(page, poff, plen);
> -		SetPageUptodate(page);
> +		iomap_set_range_uptodate(page, poff, plen);
>  		goto done;
>  	}
>  
> @@ -169,6 +272,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  		is_contig = true;
>  	}
>  
> +	/*
> +	 * If we start a new segment we need to increase the read count, and we
> +	 * need to do so before submitting any previous full bio to make sure
> +	 * that we don't prematurely unlock the page.
> +	 */
> +	if (iop)
> +		atomic_inc(&iop->read_count);
> +
>  	if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
>  		if (ctx->bio)
>  			submit_bio(ctx->bio);
> @@ -177,7 +288,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  
>  	__bio_add_page(ctx->bio, page, plen, poff);
>  done:
> -	return plen;
> +	return pos - orig_pos + plen;

A brief comment here (or above the adjust_read_range() call) to explain
the final length calculation would be helpful. E.g., it looks like
leading uptodate blocks are part of the read while trailing uptodate
blocks can be truncated by the above call.

>  }
>  
>  int
> @@ -188,8 +299,6 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
>  	unsigned poff;
>  	loff_t ret;
>  
> -	WARN_ON_ONCE(page_has_buffers(page));
> -
>  	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
>  		ret = iomap_apply(inode, page_offset(page) + poff,
>  				PAGE_SIZE - poff, 0, ops, &ctx,
> @@ -295,6 +404,92 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
>  }
>  EXPORT_SYMBOL_GPL(iomap_readpages);
>  
> +int
> +iomap_is_partially_uptodate(struct page *page, unsigned long from,
> +		unsigned long count)
> +{
> +	struct iomap_page *iop = to_iomap_page(page);
> +	struct inode *inode = page->mapping->host;
> +	unsigned first = from >> inode->i_blkbits;
> +	unsigned last = (from + count - 1) >> inode->i_blkbits;
> +	unsigned i;
> +

block_is_partially_uptodate() has this check:

        if (from < blocksize && to > PAGE_SIZE - blocksize)
                return 0;

... which looks like it checks that the range is actually partial wrt the
block size. The only callers check the page first, but I'm still not
sure why it returns 0 in that case. Any idea?

> +	if (iop) {
> +		for (i = first; i <= last; i++)
> +			if (!test_bit(i, iop->uptodate))
> +				return 0;
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
> +
...
> +
> +void
> +iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
> +{
> +	/*
> +	 * If we are invalidating the entire page, clear the dirty state from it
> +	 * and release it to avoid unnecessary buildup of the LRU.
> +	 */
> +	if (offset == 0 && len == PAGE_SIZE) {
> +		cancel_dirty_page(page);
> +		iomap_releasepage(page, GFP_NOIO);

Seems like this should probably be calling ->releasepage().

> +	}
> +}
> +EXPORT_SYMBOL_GPL(iomap_invalidatepage);
> +
...
> @@ -333,6 +529,7 @@ static int
>  __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
>  		struct page *page, struct iomap *iomap)
>  {
> +	struct iomap_page *iop = iomap_page_create(inode, page);
>  	loff_t block_size = i_blocksize(inode);
>  	loff_t block_start = pos & ~(block_size - 1);
>  	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
> @@ -340,15 +537,29 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
>  	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);

poff/plen are now initialized here and in iomap_adjust_read_range().
Perhaps drop this one so the semantics of these being set by the latter
is a bit clearer?

>  	unsigned from = pos & (PAGE_SIZE - 1);
>  	unsigned to = from + len;
> -
> -	WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
> +	int status;
>  
>  	if (PageUptodate(page))
>  		return 0;
> -	if (from <= poff && to >= poff + plen)
> -		return 0;
> -	return iomap_read_page_sync(inode, block_start, page,
> -			poff, plen, from, to, iomap);
> +
> +	do {
> +		iomap_adjust_read_range(inode, iop, &block_start,
> +				block_end - block_start, &poff, &plen);
> +		if (plen == 0)
> +			break;
> +
> +		if ((from > poff && from < poff + plen) ||
> +		    (to > poff && to < poff + plen)) {
> +			status = iomap_read_page_sync(inode, block_start, page,
> +					poff, plen, from, to, iomap);
> +			if (status)
> +				return status;
> +		}
> +
> +		block_start += plen;
> +	} while (poff + plen < PAGE_SIZE);

Something like while (block_start < block_end) would seem a bit more
clear here as well.

Brian

> +
> +	return 0;
>  }
>  
>  static int
> @@ -429,7 +640,7 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
>  	if (unlikely(copied < len && !PageUptodate(page))) {
>  		copied = 0;
>  	} else {
> -		SetPageUptodate(page);
> +		iomap_set_range_uptodate(page, pos & (PAGE_SIZE - 1), len);
>  		iomap_set_page_dirty(page);
>  	}
>  	return __generic_write_end(inode, pos, copied, page);
> @@ -741,7 +952,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
>  		block_commit_write(page, 0, length);
>  	} else {
>  		WARN_ON_ONCE(!PageUptodate(page));
> -		WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
> +		iomap_page_create(inode, page);
>  	}
>  
>  	return length;
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 4d3d9d0cd69f..7f8787a1bbce 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -2,6 +2,9 @@
>  #ifndef LINUX_IOMAP_H
>  #define LINUX_IOMAP_H 1
>  
> +#include <linux/atomic.h>
> +#include <linux/bitmap.h>
> +#include <linux/mm.h>
>  #include <linux/types.h>
>  
>  struct address_space;
> @@ -88,12 +91,40 @@ struct iomap_ops {
>  			ssize_t written, unsigned flags, struct iomap *iomap);
>  };
>  
> +/*
> + * Structure allocate for each page when block size < PAGE_SIZE to track
> + * sub-page uptodate status and I/O completions.
> + */
> +struct iomap_page {
> +	atomic_t		read_count;
> +	atomic_t		write_count;
> +	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
> +};
> +
> +static inline struct iomap_page *to_iomap_page(struct page *page)
> +{
> +	if (page_has_private(page))
> +		return (struct iomap_page *)page_private(page);
> +	return NULL;
> +}
> +
>  ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
>  		const struct iomap_ops *ops);
>  int iomap_readpage(struct page *page, const struct iomap_ops *ops);
>  int iomap_readpages(struct address_space *mapping, struct list_head *pages,
>  		unsigned nr_pages, const struct iomap_ops *ops);
>  int iomap_set_page_dirty(struct page *page);
> +int iomap_is_partially_uptodate(struct page *page, unsigned long from,
> +		unsigned long count);
> +int iomap_releasepage(struct page *page, gfp_t gfp_mask);
> +void iomap_invalidatepage(struct page *page, unsigned int offset,
> +		unsigned int len);
> +#ifdef CONFIG_MIGRATION
> +int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
> +		struct page *page, enum migrate_mode mode);
> +#else
> +#define iomap_migrate_page NULL
> +#endif
>  int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
>  		const struct iomap_ops *ops);
>  int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
> -- 
> 2.17.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] xfs: add support for sub-pagesize writeback without buffer_heads
  2018-05-23 14:46 ` [PATCH 2/2] xfs: add support for sub-pagesize writeback without buffer_heads Christoph Hellwig
@ 2018-05-25 17:17   ` Brian Foster
  2018-05-28  6:57     ` Christoph Hellwig
  0 siblings, 1 reply; 7+ messages in thread
From: Brian Foster @ 2018-05-25 17:17 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 23, 2018 at 04:46:46PM +0200, Christoph Hellwig wrote:
> Switch to using the iomap_page structure for checking sub-page uptodate
> status and track sub-page I/O completion status, and remove large
> quantities of boilerplate code working around buffer heads.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_aops.c  | 536 +++++++--------------------------------------
>  fs/xfs/xfs_buf.h   |   1 -
>  fs/xfs/xfs_iomap.c |   3 -
>  fs/xfs/xfs_super.c |   2 +-
>  fs/xfs/xfs_trace.h |  18 +-
>  5 files changed, 79 insertions(+), 481 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index efa2cbb27d67..d279929e53fb 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
...
> @@ -768,7 +620,7 @@ xfs_aops_discard_page(
>  	int			error;
>  
>  	if (XFS_FORCED_SHUTDOWN(mp))
> -		goto out_invalidate;
> +		goto out;
>  
>  	xfs_alert(mp,
>  		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
> @@ -778,15 +630,15 @@ xfs_aops_discard_page(
>  			PAGE_SIZE / i_blocksize(inode));
>  	if (error && !XFS_FORCED_SHUTDOWN(mp))
>  		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
> -out_invalidate:
> -	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
> +out:
> +	iomap_invalidatepage(page, 0, PAGE_SIZE);

All this does is lose the tracepoint. I don't think this call needs to
change. The rest looks Ok to me, but I still need to run some tests on
the whole thing.

Brian

>  }
>  
>  /*
>   * We implement an immediate ioend submission policy here to avoid needing to
>   * chain multiple ioends and hence nest mempool allocations which can violate
>   * forward progress guarantees we need to provide. The current ioend we are
> - * adding buffers to is cached on the writepage context, and if the new buffer
> + * adding blocks to is cached on the writepage context, and if the new block
>   * does not append to the cached ioend it will create a new ioend and cache that
>   * instead.
>   *
> @@ -807,41 +659,28 @@ xfs_writepage_map(
>  	uint64_t		end_offset)
>  {
>  	LIST_HEAD(submit_list);
> +	struct iomap_page	*iop = to_iomap_page(page);
> +	unsigned		len = i_blocksize(inode);
>  	struct xfs_ioend	*ioend, *next;
> -	struct buffer_head	*bh = NULL;
> -	ssize_t			len = i_blocksize(inode);
> -	int			error = 0;
> -	int			count = 0;
> -	loff_t			file_offset;	/* file offset of page */
> -	unsigned		poffset;	/* offset into page */
> +	int			error = 0, count = 0, i;
> +	u64			file_offset;	/* file offset of page */
>  
> -	if (page_has_buffers(page))
> -		bh = page_buffers(page);
> +	ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
> +	ASSERT(!iop || atomic_read(&iop->write_count) == 0);
>  
>  	/*
> -	 * Walk the blocks on the page, and we we run off then end of the
> -	 * current map or find the current map invalid, grab a new one.
> -	 * We only use bufferheads here to check per-block state - they no
> -	 * longer control the iteration through the page. This allows us to
> -	 * replace the bufferhead with some other state tracking mechanism in
> -	 * future.
> +	 * Walk through the page to find areas to write back. If we run off the
> +	 * end of the current map or find the current map invalid, grab a new
> +	 * one.
>  	 */
> -	for (poffset = 0, file_offset = page_offset(page);
> -	     poffset < PAGE_SIZE;
> -	     poffset += len, file_offset += len) {
> -		/* past the range we are writing, so nothing more to write. */
> -		if (file_offset >= end_offset)
> -			break;
> -
> +	for (i = 0, file_offset = page_offset(page);
> +	     i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
> +	     i++, file_offset += len) {
>  		/*
>  		 * Block does not contain valid data, skip it.
>  		 */
> -		if (bh && !buffer_uptodate(bh)) {
> -			if (PageUptodate(page))
> -				ASSERT(buffer_mapped(bh));
> -			bh = bh->b_this_page;
> +		if (iop && !test_bit(i, iop->uptodate))
>  			continue;
> -		}
>  
>  		/*
>  		 * If we don't have a valid map, now it's time to get a new one
> @@ -854,52 +693,33 @@ xfs_writepage_map(
>  			error = xfs_map_blocks(inode, file_offset, &wpc->imap,
>  					     &wpc->io_type);
>  			if (error)
> -				goto out;
> +				break;
>  		}
>  
> -		if (wpc->io_type == XFS_IO_HOLE) {
> -			/*
> -			 * set_page_dirty dirties all buffers in a page, independent
> -			 * of their state.  The dirty state however is entirely
> -			 * meaningless for holes (!mapped && uptodate), so check we did
> -			 * have a buffer covering a hole here and continue.
> -			 */
> -			if (bh)
> -				bh = bh->b_this_page;
> -			continue;
> -		}
> -
> -		if (bh) {
> -			xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
> -			bh = bh->b_this_page;
> +		if (wpc->io_type != XFS_IO_HOLE) {
> +			xfs_add_to_ioend(inode, file_offset, page, iop, wpc,
> +				wbc, &submit_list);
> +			count++;
>  		}
> -		xfs_add_to_ioend(inode, file_offset, page, wpc, wbc,
> -				&submit_list);
> -		count++;
>  	}
>  
>  	ASSERT(wpc->ioend || list_empty(&submit_list));
> -
> -out:
>  	ASSERT(PageLocked(page));
>  	ASSERT(!PageWriteback(page));
>  
>  	/*
> -	 * On error, we have to fail the ioend here because we have locked
> -	 * buffers in the ioend. If we don't do this, we'll deadlock
> -	 * invalidating the page as that tries to lock the buffers on the page.
> -	 * Also, because we may have set pages under writeback, we have to make
> -	 * sure we run IO completion to mark the error state of the IO
> -	 * appropriately, so we can't cancel the ioend directly here. That means
> -	 * we have to mark this page as under writeback if we included any
> -	 * buffers from it in the ioend chain so that completion treats it
> -	 * correctly.
> +	 * On error, we have to fail the ioend here because we may have set
> +	 * pages under writeback, we have to make sure we run IO completion to
> +	 * mark the error state of the IO appropriately, so we can't cancel the
> +	 * ioend directly here.  That means we have to mark this page as under
> +	 * writeback if we included any blocks from it in the ioend chain so
> +	 * that completion treats it correctly.
>  	 *
>  	 * If we didn't include the page in the ioend, the on error we can
>  	 * simply discard and unlock it as there are no other users of the page
> -	 * or it's buffers right now. The caller will still need to trigger
> -	 * submission of outstanding ioends on the writepage context so they are
> -	 * treated correctly on error.
> +	 * now.  The caller will still need to trigger submission of outstanding
> +	 * ioends on the writepage context so they are treated correctly on
> +	 * error.
>  	 */
>  	if (unlikely(error)) {
>  		if (!count) {
> @@ -940,8 +760,8 @@ xfs_writepage_map(
>  	}
>  
>  	/*
> -	 * We can end up here with no error and nothing to write if we race with
> -	 * a partial page truncate on a sub-page block sized filesystem.
> +	 * We can end up here with no error and nothing to write only if we race
> +	 * with a partial page truncate on a sub-page block sized filesystem.
>  	 */
>  	if (!count)
>  		end_page_writeback(page);
> @@ -956,7 +776,6 @@ xfs_writepage_map(
>   * For delalloc space on the page we need to allocate space and flush it.
>   * For unwritten space on the page we need to start the conversion to
>   * regular allocated space.
> - * For any other dirty buffer heads on the page we should flush them.
>   */
>  STATIC int
>  xfs_do_writepage(
> @@ -1110,168 +929,6 @@ xfs_dax_writepages(
>  			xfs_find_bdev_for_inode(mapping->host), wbc);
>  }
>  
> -/*
> - * Called to move a page into cleanable state - and from there
> - * to be released. The page should already be clean. We always
> - * have buffer heads in this call.
> - *
> - * Returns 1 if the page is ok to release, 0 otherwise.
> - */
> -STATIC int
> -xfs_vm_releasepage(
> -	struct page		*page,
> -	gfp_t			gfp_mask)
> -{
> -	int			delalloc, unwritten;
> -
> -	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
> -
> -	/*
> -	 * mm accommodates an old ext3 case where clean pages might not have had
> -	 * the dirty bit cleared. Thus, it can send actual dirty pages to
> -	 * ->releasepage() via shrink_active_list(). Conversely,
> -	 * block_invalidatepage() can send pages that are still marked dirty but
> -	 * otherwise have invalidated buffers.
> -	 *
> -	 * We want to release the latter to avoid unnecessary buildup of the
> -	 * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
> -	 * that are entirely invalidated and need to be released.  Hence the
> -	 * only time we should get dirty pages here is through
> -	 * shrink_active_list() and so we can simply skip those now.
> -	 *
> -	 * warn if we've left any lingering delalloc/unwritten buffers on clean
> -	 * or invalidated pages we are about to release.
> -	 */
> -	if (PageDirty(page))
> -		return 0;
> -
> -	xfs_count_page_state(page, &delalloc, &unwritten);
> -
> -	if (WARN_ON_ONCE(delalloc))
> -		return 0;
> -	if (WARN_ON_ONCE(unwritten))
> -		return 0;
> -
> -	return try_to_free_buffers(page);
> -}
> -
> -/*
> - * If this is O_DIRECT or the mpage code calling tell them how large the mapping
> - * is, so that we can avoid repeated get_blocks calls.
> - *
> - * If the mapping spans EOF, then we have to break the mapping up as the mapping
> - * for blocks beyond EOF must be marked new so that sub block regions can be
> - * correctly zeroed. We can't do this for mappings within EOF unless the mapping
> - * was just allocated or is unwritten, otherwise the callers would overwrite
> - * existing data with zeros. Hence we have to split the mapping into a range up
> - * to and including EOF, and a second mapping for beyond EOF.
> - */
> -static void
> -xfs_map_trim_size(
> -	struct inode		*inode,
> -	sector_t		iblock,
> -	struct buffer_head	*bh_result,
> -	struct xfs_bmbt_irec	*imap,
> -	xfs_off_t		offset,
> -	ssize_t			size)
> -{
> -	xfs_off_t		mapping_size;
> -
> -	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
> -	mapping_size <<= inode->i_blkbits;
> -
> -	ASSERT(mapping_size > 0);
> -	if (mapping_size > size)
> -		mapping_size = size;
> -	if (offset < i_size_read(inode) &&
> -	    (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) {
> -		/* limit mapping to block that spans EOF */
> -		mapping_size = roundup_64(i_size_read(inode) - offset,
> -					  i_blocksize(inode));
> -	}
> -	if (mapping_size > LONG_MAX)
> -		mapping_size = LONG_MAX;
> -
> -	bh_result->b_size = mapping_size;
> -}
> -
> -static int
> -xfs_get_blocks(
> -	struct inode		*inode,
> -	sector_t		iblock,
> -	struct buffer_head	*bh_result,
> -	int			create)
> -{
> -	struct xfs_inode	*ip = XFS_I(inode);
> -	struct xfs_mount	*mp = ip->i_mount;
> -	xfs_fileoff_t		offset_fsb, end_fsb;
> -	int			error = 0;
> -	int			lockmode = 0;
> -	struct xfs_bmbt_irec	imap;
> -	int			nimaps = 1;
> -	xfs_off_t		offset;
> -	ssize_t			size;
> -
> -	BUG_ON(create);
> -
> -	if (XFS_FORCED_SHUTDOWN(mp))
> -		return -EIO;
> -
> -	offset = (xfs_off_t)iblock << inode->i_blkbits;
> -	ASSERT(bh_result->b_size >= i_blocksize(inode));
> -	size = bh_result->b_size;
> -
> -	if (offset >= i_size_read(inode))
> -		return 0;
> -
> -	/*
> -	 * Direct I/O is usually done on preallocated files, so try getting
> -	 * a block mapping without an exclusive lock first.
> -	 */
> -	lockmode = xfs_ilock_data_map_shared(ip);
> -
> -	ASSERT(offset <= mp->m_super->s_maxbytes);
> -	if (offset > mp->m_super->s_maxbytes - size)
> -		size = mp->m_super->s_maxbytes - offset;
> -	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
> -	offset_fsb = XFS_B_TO_FSBT(mp, offset);
> -
> -	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
> -			&nimaps, 0);
> -	if (error)
> -		goto out_unlock;
> -	if (!nimaps) {
> -		trace_xfs_get_blocks_notfound(ip, offset, size);
> -		goto out_unlock;
> -	}
> -
> -	trace_xfs_get_blocks_found(ip, offset, size,
> -		imap.br_state == XFS_EXT_UNWRITTEN ?
> -			XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
> -	xfs_iunlock(ip, lockmode);
> -
> -	/* trim mapping down to size requested */
> -	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
> -
> -	/*
> -	 * For unwritten extents do not report a disk address in the buffered
> -	 * read case (treat as if we're reading into a hole).
> -	 */
> -	if (xfs_bmap_is_real_extent(&imap))
> -		xfs_map_buffer(inode, bh_result, &imap, offset);
> -
> -	/*
> -	 * If this is a realtime file, data may be on a different device.
> -	 * to that pointed to from the buffer_head b_bdev currently.
> -	 */
> -	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
> -	return 0;
> -
> -out_unlock:
> -	xfs_iunlock(ip, lockmode);
> -	return error;
> -}
> -
>  STATIC sector_t
>  xfs_vm_bmap(
>  	struct address_space	*mapping,
> @@ -1301,9 +958,7 @@ xfs_vm_readpage(
>  	struct page		*page)
>  {
>  	trace_xfs_vm_readpage(page->mapping->host, 1);
> -	if (i_blocksize(page->mapping->host) == PAGE_SIZE)
> -		return iomap_readpage(page, &xfs_iomap_ops);
> -	return mpage_readpage(page, xfs_get_blocks);
> +	return iomap_readpage(page, &xfs_iomap_ops);
>  }
>  
>  STATIC int
> @@ -1314,65 +969,26 @@ xfs_vm_readpages(
>  	unsigned		nr_pages)
>  {
>  	trace_xfs_vm_readpages(mapping->host, nr_pages);
> -	if (i_blocksize(mapping->host) == PAGE_SIZE)
> -		return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
> -	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
> +	return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
>  }
>  
> -/*
> - * This is basically a copy of __set_page_dirty_buffers() with one
> - * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
> - * dirty, we'll never be able to clean them because we don't write buffers
> - * beyond EOF, and that means we can't invalidate pages that span EOF
> - * that have been marked dirty. Further, the dirty state can leak into
> - * the file interior if the file is extended, resulting in all sorts of
> - * bad things happening as the state does not match the underlying data.
> - *
> - * XXX: this really indicates that bufferheads in XFS need to die. Warts like
> - * this only exist because of bufferheads and how the generic code manages them.
> - */
> -STATIC int
> -xfs_vm_set_page_dirty(
> -	struct page		*page)
> +static int
> +xfs_vm_releasepage(
> +	struct page		*page,
> +	gfp_t			gfp_mask)
>  {
> -	struct address_space	*mapping = page->mapping;
> -	struct inode		*inode = mapping->host;
> -	loff_t			end_offset;
> -	loff_t			offset;
> -	int			newly_dirty;
> -
> -	if (unlikely(!mapping))
> -		return !TestSetPageDirty(page);
> -
> -	end_offset = i_size_read(inode);
> -	offset = page_offset(page);
> -
> -	spin_lock(&mapping->private_lock);
> -	if (page_has_buffers(page)) {
> -		struct buffer_head *head = page_buffers(page);
> -		struct buffer_head *bh = head;
> +	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
> +	return iomap_releasepage(page, gfp_mask);
> +}
>  
> -		do {
> -			if (offset < end_offset)
> -				set_buffer_dirty(bh);
> -			bh = bh->b_this_page;
> -			offset += i_blocksize(inode);
> -		} while (bh != head);
> -	}
> -	/*
> -	 * Lock out page->mem_cgroup migration to keep PageDirty
> -	 * synchronized with per-memcg dirty page counters.
> -	 */
> -	lock_page_memcg(page);
> -	newly_dirty = !TestSetPageDirty(page);
> -	spin_unlock(&mapping->private_lock);
> -
> -	if (newly_dirty)
> -		__set_page_dirty(page, mapping, 1);
> -	unlock_page_memcg(page);
> -	if (newly_dirty)
> -		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> -	return newly_dirty;
> +static void
> +xfs_vm_invalidatepage(
> +	struct page		*page,
> +	unsigned int		offset,
> +	unsigned int		length)
> +{
> +	trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
> +	iomap_invalidatepage(page, offset, length);
>  }
>  
>  static int
> @@ -1390,13 +1006,13 @@ const struct address_space_operations xfs_address_space_operations = {
>  	.readpages		= xfs_vm_readpages,
>  	.writepage		= xfs_vm_writepage,
>  	.writepages		= xfs_vm_writepages,
> -	.set_page_dirty		= xfs_vm_set_page_dirty,
> +	.set_page_dirty		= iomap_set_page_dirty,
>  	.releasepage		= xfs_vm_releasepage,
>  	.invalidatepage		= xfs_vm_invalidatepage,
>  	.bmap			= xfs_vm_bmap,
>  	.direct_IO		= noop_direct_IO,
> -	.migratepage		= buffer_migrate_page,
> -	.is_partially_uptodate  = block_is_partially_uptodate,
> +	.migratepage		= iomap_migrate_page,
> +	.is_partially_uptodate  = iomap_is_partially_uptodate,
>  	.error_remove_page	= generic_error_remove_page,
>  	.swap_activate		= xfs_iomap_swapfile_activate,
>  };
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index f5f2b71c2fde..f3fa197bd272 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -24,7 +24,6 @@
>  #include <linux/mm.h>
>  #include <linux/fs.h>
>  #include <linux/dax.h>
> -#include <linux/buffer_head.h>
>  #include <linux/uio.h>
>  #include <linux/list_lru.h>
>  
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 93c40da3378a..c646d84cd55e 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1031,9 +1031,6 @@ xfs_file_iomap_begin(
>  	if (XFS_FORCED_SHUTDOWN(mp))
>  		return -EIO;
>  
> -	if (i_blocksize(inode) < PAGE_SIZE)
> -		iomap->flags |= IOMAP_F_BUFFER_HEAD;
> -
>  	if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
>  			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
>  		/* Reserve delalloc blocks for regular writeback. */
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 39e5ec3d407f..a9f23ec95216 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1866,7 +1866,7 @@ MODULE_ALIAS_FS("xfs");
>  STATIC int __init
>  xfs_init_zones(void)
>  {
> -	xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
> +	xfs_ioend_bioset = bioset_create(4 * (PAGE_SIZE / SECTOR_SIZE),
>  			offsetof(struct xfs_ioend, io_inline_bio),
>  			BIOSET_NEED_BVECS);
>  	if (!xfs_ioend_bioset)
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index ed8f774944ba..e4dc7c7f3da9 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1165,33 +1165,23 @@ DECLARE_EVENT_CLASS(xfs_page_class,
>  		__field(loff_t, size)
>  		__field(unsigned long, offset)
>  		__field(unsigned int, length)
> -		__field(int, delalloc)
> -		__field(int, unwritten)
>  	),
>  	TP_fast_assign(
> -		int delalloc = -1, unwritten = -1;
> -
> -		if (page_has_buffers(page))
> -			xfs_count_page_state(page, &delalloc, &unwritten);
>  		__entry->dev = inode->i_sb->s_dev;
>  		__entry->ino = XFS_I(inode)->i_ino;
>  		__entry->pgoff = page_offset(page);
>  		__entry->size = i_size_read(inode);
>  		__entry->offset = off;
>  		__entry->length = len;
> -		__entry->delalloc = delalloc;
> -		__entry->unwritten = unwritten;
>  	),
>  	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
> -		  "length %x delalloc %d unwritten %d",
> +		  "length %x",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  __entry->ino,
>  		  __entry->pgoff,
>  		  __entry->size,
>  		  __entry->offset,
> -		  __entry->length,
> -		  __entry->delalloc,
> -		  __entry->unwritten)
> +		  __entry->length)
>  )
>  
>  #define DEFINE_PAGE_EVENT(name)		\
> @@ -1275,9 +1265,6 @@ DEFINE_EVENT(xfs_imap_class, name,	\
>  	TP_ARGS(ip, offset, count, type, irec))
>  DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
>  DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
> -DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
> -DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
> -DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
>  DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
>  DEFINE_IOMAP_EVENT(xfs_iomap_found);
>  
> @@ -1316,7 +1303,6 @@ DEFINE_EVENT(xfs_simple_io_class, name,	\
>  	TP_ARGS(ip, offset, count))
>  DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
>  DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
> -DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
>  DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
>  DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
>  DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
> -- 
> 2.17.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/2] iomap: add support for sub-pagesize buffered I/O without buffer heads
  2018-05-25 17:17   ` Brian Foster
@ 2018-05-28  6:50     ` Christoph Hellwig
  0 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2018-05-28  6:50 UTC (permalink / raw)
  To: Brian Foster
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Fri, May 25, 2018 at 01:17:02PM -0400, Brian Foster wrote:
> > +static struct iomap_page *
> > +iomap_page_create(struct inode *inode, struct page *page)
> > +{
> > +	struct iomap_page *iop = to_iomap_page(page);
> > +
> > +	if (iop || i_blocksize(inode) == PAGE_SIZE)
> > +		return iop;
> > +
> > +	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
> > +	atomic_set(&iop->read_count, 0);
> > +	atomic_set(&iop->write_count, 0);
> > +	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
> > +	set_page_private(page, (unsigned long)iop);
> > +	SetPagePrivate(page);
> 
> The buffer head implementation does a get/put page when the private
> state is set. I'm not quite sure why that is tbh, but do you know
> whether we need that here or not?

I don't really see any good reason why that would be needed, as we need
a successful ->releasepage return to drop the page from the page cache.
I'll look around a little more if there is any other reason for it -
adding a get/put page pair here would be easy to do, so maybe we should just
cargo-cult it in to be on the safe side.

> > -	return plen;
> > +	return pos - orig_pos + plen;
> 
> A brief comment here (or above the adjust_read_range() call) to explain
> the final length calculation would be helpful. E.g., it looks like
> leading uptodate blocks are part of the read while trailing uptodate
> blocks can be truncated by the above call.

Ok.

> > +int
> > +iomap_is_partially_uptodate(struct page *page, unsigned long from,
> > +		unsigned long count)
> > +{
> > +	struct iomap_page *iop = to_iomap_page(page);
> > +	struct inode *inode = page->mapping->host;
> > +	unsigned first = from >> inode->i_blkbits;
> > +	unsigned last = (from + count - 1) >> inode->i_blkbits;
> > +	unsigned i;
> > +
> 
> block_is_partially_uptodate() has this check:
> 
>         if (from < blocksize && to > PAGE_SIZE - blocksize)
>                 return 0;
> 
> ... which looks like it checks that the range is actually partial wrt
> block size. The only callers check the page first, but I'm still not
> sure why it returns 0 in that case. Any idea?

The calling convention is generally pretty insane.  I plan to clean
this up, but didn't want to grow my XFS-related series even more.

> > +{
> > +	/*
> > +	 * If we are invalidating the entire page, clear the dirty state from it
> > +	 * and release it to avoid unnecessary buildup of the LRU.
> > +	 */
> > +	if (offset == 0 && len == PAGE_SIZE) {
> > +		cancel_dirty_page(page);
> > +		iomap_releasepage(page, GFP_NOIO);
> 
> Seems like this should probably be calling ->releasepage().

Not really.  I don't want the fs in the loop here.  My other option
was to have a iomap_page_free helper called here and in ->releasepage.
Maybe I'll move back to that if it is less confusing.

> > @@ -333,6 +529,7 @@ static int
> >  __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
> >  		struct page *page, struct iomap *iomap)
> >  {
> > +	struct iomap_page *iop = iomap_page_create(inode, page);
> >  	loff_t block_size = i_blocksize(inode);
> >  	loff_t block_start = pos & ~(block_size - 1);
> >  	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
> > @@ -340,15 +537,29 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
> >  	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);
> 
> poff/plen are now initialized here and in iomap_adjust_read_range().
> Perhaps drop this one so the semantic of these being set by the latter
> is a bit more clear?

Yes, will do.

> > +
> > +	do {
> > +		iomap_adjust_read_range(inode, iop, &block_start,
> > +				block_end - block_start, &poff, &plen);
> > +		if (plen == 0)
> > +			break;
> > +
> > +		if ((from > poff && from < poff + plen) ||
> > +		    (to > poff && to < poff + plen)) {
> > +			status = iomap_read_page_sync(inode, block_start, page,
> > +					poff, plen, from, to, iomap);
> > +			if (status)
> > +				return status;
> > +		}
> > +
> > +		block_start += plen;
> > +	} while (poff + plen < PAGE_SIZE);
> 
> Something like while (block_start < block_end) would seem a bit more
> clear here as well.

I'll look into it.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] xfs: add support for sub-pagesize writeback without buffer_heads
  2018-05-25 17:17   ` Brian Foster
@ 2018-05-28  6:57     ` Christoph Hellwig
  0 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2018-05-28  6:57 UTC (permalink / raw)
  To: Brian Foster
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Fri, May 25, 2018 at 01:17:15PM -0400, Brian Foster wrote:
> On Wed, May 23, 2018 at 04:46:46PM +0200, Christoph Hellwig wrote:
> > Switch to using the iomap_page structure for checking sub-page uptodate
> > status and track sub-page I/O completion status, and remove large
> > quantities of boilerplate code working around buffer heads.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  fs/xfs/xfs_aops.c  | 536 +++++++--------------------------------------
> >  fs/xfs/xfs_buf.h   |   1 -
> >  fs/xfs/xfs_iomap.c |   3 -
> >  fs/xfs/xfs_super.c |   2 +-
> >  fs/xfs/xfs_trace.h |  18 +-
> >  5 files changed, 79 insertions(+), 481 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> > index efa2cbb27d67..d279929e53fb 100644
> > --- a/fs/xfs/xfs_aops.c
> > +++ b/fs/xfs/xfs_aops.c
> ...
> > @@ -768,7 +620,7 @@ xfs_aops_discard_page(
> >  	int			error;
> >  
> >  	if (XFS_FORCED_SHUTDOWN(mp))
> > -		goto out_invalidate;
> > +		goto out;
> >  
> >  	xfs_alert(mp,
> >  		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
> > @@ -778,15 +630,15 @@ xfs_aops_discard_page(
> >  			PAGE_SIZE / i_blocksize(inode));
> >  	if (error && !XFS_FORCED_SHUTDOWN(mp))
> >  		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
> > -out_invalidate:
> > -	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
> > +out:
> > +	iomap_invalidatepage(page, 0, PAGE_SIZE);
> 
> All this does is lose the tracepoint. I don't think this call needs to
> change. The rest looks Ok to me, but I still need to run some tests on
> the whole thing.

Ok.  I actually had it that way, then thought we shouldn't need the
invalidatepage without bufferheads, but it turns out we still do and
added it back this way.  I'll go back to start and won't collect $200..

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2018-05-28  6:57 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-23 14:46 sub-page blocksize support in iomap non-buffer head path v3 Christoph Hellwig
2018-05-23 14:46 ` [PATCH 1/2] iomap: add support for sub-pagesize buffered I/O without buffer heads Christoph Hellwig
2018-05-25 17:17   ` Brian Foster
2018-05-28  6:50     ` Christoph Hellwig
2018-05-23 14:46 ` [PATCH 2/2] xfs: add support for sub-pagesize writeback without buffer_heads Christoph Hellwig
2018-05-25 17:17   ` Brian Foster
2018-05-28  6:57     ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).