* stop using buffer heads in xfs and iomap
@ 2018-05-09  7:47 Christoph Hellwig
  2018-05-09  7:47 ` [PATCH 01/33] block: add a lower-level bio_add_page interface Christoph Hellwig
                   ` (33 more replies)
  0 siblings, 34 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:47 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Hi all,

this series adds support for reading blocks from disk using the iomap
interface, and then gradually switches the buffered I/O path to not
require buffer heads.  It has survived xfstests for 1k and 4k block
sizes.

There are various small changes to the core VFS, block and readahead
code to make this happen.


A git tree is available at:

    git://git.infradead.org/users/hch/xfs.git xfs-remove-bufferheads

Gitweb:

    http://git.infradead.org/users/hch/xfs.git/shortlog/refs/heads/xfs-remove-bufferheads


* [PATCH 01/33] block: add a lower-level bio_add_page interface
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
@ 2018-05-09  7:47 ` Christoph Hellwig
  2018-05-09 15:12   ` Matthew Wilcox
                     ` (2 more replies)
  2018-05-09  7:47 ` [PATCH 02/33] fs: factor out a __generic_write_end helper Christoph Hellwig
                   ` (32 subsequent siblings)
  33 siblings, 3 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:47 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

For the upcoming removal of buffer heads in XFS we need to keep track of
the number of outstanding writeback requests per page.  For this we need
to know if bio_add_page merged a region with the previous bvec or not.
Instead of adding additional arguments, this refactors bio_add_page to
be implemented using three lower-level helpers that users like XFS can
call directly if they care about the merge decisions.
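
As a sketch of the intended use (hypothetical caller code, not part of
this patch), a caller that cares about the merge decision can open code
what bio_add_page does internally and react to the outcome:

	/*
	 * Hypothetical sketch of a caller that needs to know whether a
	 * merge happened, e.g. to count I/O segments per page.
	 */
	static int example_add_page(struct bio *bio, struct page *page,
			unsigned int len, unsigned int off, unsigned int *segs)
	{
		if (!__bio_try_merge_page(bio, page, len, off)) {
			if (bio_full(bio))
				return 0;	/* caller must submit and start a new bio */
			__bio_add_page(bio, page, len, off);
			(*segs)++;		/* a new segment was started */
		}
		return len;
	}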

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/bio.c         | 87 ++++++++++++++++++++++++++++++---------------
 include/linux/bio.h |  9 +++++
 2 files changed, 67 insertions(+), 29 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 53e0f0a1ed94..6ceba6adbf42 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -773,7 +773,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
 			return 0;
 	}
 
-	if (bio->bi_vcnt >= bio->bi_max_vecs)
+	if (bio_full(bio))
 		return 0;
 
 	/*
@@ -820,6 +820,59 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
 }
 EXPORT_SYMBOL(bio_add_pc_page);
 
+/**
+ * __bio_try_merge_page - try adding data to an existing bvec
+ * @bio: destination bio
+ * @page: page to add
+ * @len: length of the range to add
+ * @off: offset into @page
+ *
+ * Try adding the data described at @page + @off to the last bvec of @bio.
+ * Return %true on success or %false on failure.  A successful merge is
+ * common for file systems with a block size smaller than the page size.
+ */
+bool __bio_try_merge_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int off)
+{
+	if (bio->bi_vcnt > 0) {
+		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+		if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
+			bv->bv_len += len;
+			bio->bi_iter.bi_size += len;
+			return true;
+		}
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(__bio_try_merge_page);
+
+/**
+ * __bio_add_page - add page to a bio in a new segment
+ * @bio: destination bio
+ * @page: page to add
+ * @len: length of the range to add
+ * @off: offset into @page
+ *
+ * Add the data at @page + @off to @bio as a new bvec.  The caller must
+ * ensure that @bio has space for another bvec.
+ */
+void __bio_add_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int off)
+{
+	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
+
+	WARN_ON_ONCE(bio_full(bio));
+
+	bv->bv_page = page;
+	bv->bv_offset = off;
+	bv->bv_len = len;
+
+	bio->bi_iter.bi_size += len;
+	bio->bi_vcnt++;
+}
+EXPORT_SYMBOL_GPL(__bio_add_page);
+
 /**
  *	bio_add_page	-	attempt to add page to bio
  *	@bio: destination bio
@@ -833,40 +886,16 @@ EXPORT_SYMBOL(bio_add_pc_page);
 int bio_add_page(struct bio *bio, struct page *page,
 		 unsigned int len, unsigned int offset)
 {
-	struct bio_vec *bv;
-
 	/*
 	 * cloned bio must not modify vec list
 	 */
 	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
 		return 0;
-
-	/*
-	 * For filesystems with a blocksize smaller than the pagesize
-	 * we will often be called with the same page as last time and
-	 * a consecutive offset.  Optimize this special case.
-	 */
-	if (bio->bi_vcnt > 0) {
-		bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
-		if (page == bv->bv_page &&
-		    offset == bv->bv_offset + bv->bv_len) {
-			bv->bv_len += len;
-			goto done;
-		}
+	if (!__bio_try_merge_page(bio, page, len, offset)) {
+		if (bio_full(bio))
+			return 0;
+		__bio_add_page(bio, page, len, offset);
 	}
-
-	if (bio->bi_vcnt >= bio->bi_max_vecs)
-		return 0;
-
-	bv		= &bio->bi_io_vec[bio->bi_vcnt];
-	bv->bv_page	= page;
-	bv->bv_len	= len;
-	bv->bv_offset	= offset;
-
-	bio->bi_vcnt++;
-done:
-	bio->bi_iter.bi_size += len;
 	return len;
 }
 EXPORT_SYMBOL(bio_add_page);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ce547a25e8ae..3e73c8bc25ea 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -123,6 +123,11 @@ static inline void *bio_data(struct bio *bio)
 	return NULL;
 }
 
+static inline bool bio_full(struct bio *bio)
+{
+	return bio->bi_vcnt >= bio->bi_max_vecs;
+}
+
 /*
  * will die
  */
@@ -470,6 +475,10 @@ void bio_chain(struct bio *, struct bio *);
 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
+bool __bio_try_merge_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int off);
+void __bio_add_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
 struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,
-- 
2.17.0


* [PATCH 02/33] fs: factor out a __generic_write_end helper
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
  2018-05-09  7:47 ` [PATCH 01/33] block: add a lower-level bio_add_page interface Christoph Hellwig
@ 2018-05-09  7:47 ` Christoph Hellwig
  2018-05-09 15:15   ` Matthew Wilcox
  2018-05-09  7:48 ` [PATCH 03/33] fs: move page_cache_seek_hole_data to iomap.c Christoph Hellwig
                   ` (31 subsequent siblings)
  33 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:47 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Factor out the bits of the buffer.c based write_end implementations that
don't know about buffer_heads so that they can be reused by other
implementations.
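
As a sketch (hypothetical code, not part of this patch), a write_end
implementation that never attaches buffer_heads could now be built on
the helper:

	/*
	 * Hypothetical buffer_head-free write_end built on the new
	 * helper; the iomap code added later in this series fills
	 * this role for real.
	 */
	static int example_write_end(struct inode *inode, loff_t pos,
			unsigned copied, struct page *page)
	{
		flush_dcache_page(page);
		/* assumes write_begin brought the page fully uptodate */
		set_page_dirty(page);
		return __generic_write_end(inode, pos, copied, page);
	}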

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/buffer.c   | 68 +++++++++++++++++++++++++++------------------------
 fs/internal.h |  2 ++
 2 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 249b83fafe48..923391702f51 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2076,6 +2076,40 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
 }
 EXPORT_SYMBOL(block_write_begin);
 
+int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
+		struct page *page)
+{
+	loff_t old_size = inode->i_size;
+	bool i_size_changed = false;
+
+	/*
+	 * No need to use i_size_read() here, the i_size cannot change under us
+	 * because we hold i_rwsem.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos + copied > inode->i_size) {
+		i_size_write(inode, pos + copied);
+		i_size_changed = true;
+	}
+
+	unlock_page(page);
+	put_page(page);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		mark_inode_dirty(inode);
+	return copied;
+}
+
 int block_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
@@ -2116,42 +2150,12 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
-	struct inode *inode = mapping->host;
-	loff_t old_size = inode->i_size;
-	int i_size_changed = 0;
-
 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	/*
-	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold i_mutex.
-	 *
-	 * But it's important to update i_size while still holding page lock:
-	 * page writeout could otherwise come in and zero beyond i_size.
-	 */
-	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
-		i_size_changed = 1;
-	}
-
-	unlock_page(page);
-	put_page(page);
-
-	if (old_size < pos)
-		pagecache_isize_extended(inode, old_size, pos);
-	/*
-	 * Don't mark the inode dirty under page lock. First, it unnecessarily
-	 * makes the holding time of page lock longer. Second, it forces lock
-	 * ordering of page lock and transaction start for journaling
-	 * filesystems.
-	 */
-	if (i_size_changed)
-		mark_inode_dirty(inode);
-
-	return copied;
+	return __generic_write_end(mapping->host, pos, copied, page);
 }
 EXPORT_SYMBOL(generic_write_end);
 
+
 /*
  * block_is_partially_uptodate checks whether buffers within a page are
  * uptodate or not.
diff --git a/fs/internal.h b/fs/internal.h
index e08972db0303..b955232d3d49 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,6 +43,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
 extern void guard_bio_eod(int rw, struct bio *bio);
 extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
 		get_block_t *get_block, struct iomap *iomap);
+int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
+		struct page *page);
 
 /*
  * char_dev.c
-- 
2.17.0


* [PATCH 03/33] fs: move page_cache_seek_hole_data to iomap.c
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
  2018-05-09  7:47 ` [PATCH 01/33] block: add a lower-level bio_add_page interface Christoph Hellwig
  2018-05-09  7:47 ` [PATCH 02/33] fs: factor out a __generic_write_end helper Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 04/33] fs: remove the buffer_unwritten check in page_seek_hole_data Christoph Hellwig
                   ` (30 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

This function is only used by the iomap code, depends on being called
from it, and will soon stop poking into buffer head internals.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/buffer.c                 | 114 -----------------------------------
 fs/iomap.c                  | 116 ++++++++++++++++++++++++++++++++++++
 include/linux/buffer_head.h |   2 -
 3 files changed, 116 insertions(+), 116 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 923391702f51..71ea9a29e9d5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3431,120 +3431,6 @@ int bh_submit_read(struct buffer_head *bh)
 }
 EXPORT_SYMBOL(bh_submit_read);
 
-/*
- * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
- *
- * Returns the offset within the file on success, and -ENOENT otherwise.
- */
-static loff_t
-page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
-{
-	loff_t offset = page_offset(page);
-	struct buffer_head *bh, *head;
-	bool seek_data = whence == SEEK_DATA;
-
-	if (lastoff < offset)
-		lastoff = offset;
-
-	bh = head = page_buffers(page);
-	do {
-		offset += bh->b_size;
-		if (lastoff >= offset)
-			continue;
-
-		/*
-		 * Unwritten extents that have data in the page cache covering
-		 * them can be identified by the BH_Unwritten state flag.
-		 * Pages with multiple buffers might have a mix of holes, data
-		 * and unwritten extents - any buffer with valid data in it
-		 * should have BH_Uptodate flag set on it.
-		 */
-
-		if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
-			return lastoff;
-
-		lastoff = offset;
-	} while ((bh = bh->b_this_page) != head);
-	return -ENOENT;
-}
-
-/*
- * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
- *
- * Within unwritten extents, the page cache determines which parts are holes
- * and which are data: unwritten and uptodate buffer heads count as data;
- * everything else counts as a hole.
- *
- * Returns the resulting offset on successs, and -ENOENT otherwise.
- */
-loff_t
-page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
-			  int whence)
-{
-	pgoff_t index = offset >> PAGE_SHIFT;
-	pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
-	loff_t lastoff = offset;
-	struct pagevec pvec;
-
-	if (length <= 0)
-		return -ENOENT;
-
-	pagevec_init(&pvec);
-
-	do {
-		unsigned nr_pages, i;
-
-		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
-						end - 1);
-		if (nr_pages == 0)
-			break;
-
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			/*
-			 * At this point, the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or
-			 * even swizzled back from swapper_space to tmpfs file
-			 * mapping.  However, page->index will not change
-			 * because we have a reference on the page.
-                         *
-			 * If current page offset is beyond where we've ended,
-			 * we've found a hole.
-                         */
-			if (whence == SEEK_HOLE &&
-			    lastoff < page_offset(page))
-				goto check_range;
-
-			lock_page(page);
-			if (likely(page->mapping == inode->i_mapping) &&
-			    page_has_buffers(page)) {
-				lastoff = page_seek_hole_data(page, lastoff, whence);
-				if (lastoff >= 0) {
-					unlock_page(page);
-					goto check_range;
-				}
-			}
-			unlock_page(page);
-			lastoff = page_offset(page) + PAGE_SIZE;
-		}
-		pagevec_release(&pvec);
-	} while (index < end);
-
-	/* When no page at lastoff and we are not done, we found a hole. */
-	if (whence != SEEK_HOLE)
-		goto not_found;
-
-check_range:
-	if (lastoff < offset + length)
-		goto out;
-not_found:
-	lastoff = -ENOENT;
-out:
-	pagevec_release(&pvec);
-	return lastoff;
-}
-
 void __init buffer_init(void)
 {
 	unsigned long nrpages;
diff --git a/fs/iomap.c b/fs/iomap.c
index afd163586aa0..13f518c7d3be 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
+#include <linux/pagevec.h>
 #include <linux/file.h>
 #include <linux/uio.h>
 #include <linux/backing-dev.h>
@@ -587,6 +588,121 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
 }
 EXPORT_SYMBOL_GPL(iomap_fiemap);
 
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
+ *
+ * Returns the offset within the file on success, and -ENOENT otherwise.
+ */
+static loff_t
+page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
+{
+	loff_t offset = page_offset(page);
+	struct buffer_head *bh, *head;
+	bool seek_data = whence == SEEK_DATA;
+
+	if (lastoff < offset)
+		lastoff = offset;
+
+	bh = head = page_buffers(page);
+	do {
+		offset += bh->b_size;
+		if (lastoff >= offset)
+			continue;
+
+		/*
+		 * Unwritten extents that have data in the page cache covering
+		 * them can be identified by the BH_Unwritten state flag.
+		 * Pages with multiple buffers might have a mix of holes, data
+		 * and unwritten extents - any buffer with valid data in it
+		 * should have BH_Uptodate flag set on it.
+		 */
+
+		if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
+			return lastoff;
+
+		lastoff = offset;
+	} while ((bh = bh->b_this_page) != head);
+	return -ENOENT;
+}
+
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ *
+ * Within unwritten extents, the page cache determines which parts are holes
+ * and which are data: unwritten and uptodate buffer heads count as data;
+ * everything else counts as a hole.
+ *
+ * Returns the resulting offset on success, and -ENOENT otherwise.
+ */
+static loff_t
+page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
+		int whence)
+{
+	pgoff_t index = offset >> PAGE_SHIFT;
+	pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+	loff_t lastoff = offset;
+	struct pagevec pvec;
+
+	if (length <= 0)
+		return -ENOENT;
+
+	pagevec_init(&pvec);
+
+	do {
+		unsigned nr_pages, i;
+
+		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+						end - 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point, the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or
+			 * even swizzled back from swapper_space to tmpfs file
+			 * mapping.  However, page->index will not change
+			 * because we have a reference on the page.
+			 *
+			 * If current page offset is beyond where we've ended,
+			 * we've found a hole.
+			 */
+			if (whence == SEEK_HOLE &&
+			    lastoff < page_offset(page))
+				goto check_range;
+
+			lock_page(page);
+			if (likely(page->mapping == inode->i_mapping) &&
+			    page_has_buffers(page)) {
+				lastoff = page_seek_hole_data(page, lastoff, whence);
+				if (lastoff >= 0) {
+					unlock_page(page);
+					goto check_range;
+				}
+			}
+			unlock_page(page);
+			lastoff = page_offset(page) + PAGE_SIZE;
+		}
+		pagevec_release(&pvec);
+	} while (index < end);
+
+	/* When no page at lastoff and we are not done, we found a hole. */
+	if (whence != SEEK_HOLE)
+		goto not_found;
+
+check_range:
+	if (lastoff < offset + length)
+		goto out;
+not_found:
+	lastoff = -ENOENT;
+out:
+	pagevec_release(&pvec);
+	return lastoff;
+}
+
+
 static loff_t
 iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
 		      void *data, struct iomap *iomap)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 894e5d125de6..96225a77c112 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -205,8 +205,6 @@ void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize);
 int bh_uptodate_or_lock(struct buffer_head *bh);
 int bh_submit_read(struct buffer_head *bh);
-loff_t page_cache_seek_hole_data(struct inode *inode, loff_t offset,
-				 loff_t length, int whence);
 
 extern int buffer_heads_over_limit;
 
-- 
2.17.0


* [PATCH 04/33] fs: remove the buffer_unwritten check in page_seek_hole_data
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (2 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 03/33] fs: move page_cache_seek_hole_data to iomap.c Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-17 11:33   ` Andreas Grünbacher
  2018-05-09  7:48 ` [PATCH 05/33] fs: use ->is_partially_uptodate in page_cache_seek_hole_data Christoph Hellwig
                   ` (29 subsequent siblings)
  33 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

We only call into this function through the iomap iterators, so we already
know the extent is unwritten.  That makes the buffer_unwritten check
redundant: whether an unwritten range counts as data is decided by the
BH_Uptodate flag that is ORed into the result anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 13f518c7d3be..a739f3f995d9 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -610,14 +610,9 @@ page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
 			continue;
 
 		/*
-		 * Unwritten extents that have data in the page cache covering
-		 * them can be identified by the BH_Unwritten state flag.
-		 * Pages with multiple buffers might have a mix of holes, data
-		 * and unwritten extents - any buffer with valid data in it
-		 * should have BH_Uptodate flag set on it.
+		 * Any buffer with valid data in it should have BH_Uptodate set.
 		 */
-
-		if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
+		if (buffer_uptodate(bh) == seek_data)
 			return lastoff;
 
 		lastoff = offset;
-- 
2.17.0


* [PATCH 05/33] fs: use ->is_partially_uptodate in page_cache_seek_hole_data
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (3 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 04/33] fs: remove the buffer_unwritten check in page_seek_hole_data Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 06/33] mm: give the 'ret' variable a better name in __do_page_cache_readahead Christoph Hellwig
                   ` (28 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

This way the implementation doesn't depend on buffer_head internals.
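
The address_space operation relied on here has the following shape
(quoting the existing contract from struct address_space_operations,
not new code):

	/*
	 * Returns non-zero when the byte range [from, from + count) of
	 * @page is uptodate.
	 */
	int (*is_partially_uptodate)(struct page *page, unsigned long from,
			unsigned long count);

buffer_head based file systems already implement this with
block_is_partially_uptodate(), so seeking keeps working for them without
this code poking into buffer_head internals directly.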

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c | 83 +++++++++++++++++++++++++++---------------------------
 1 file changed, 42 insertions(+), 41 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index a739f3f995d9..b3592183b1a0 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -593,31 +593,54 @@ EXPORT_SYMBOL_GPL(iomap_fiemap);
  *
  * Returns the offset within the file on success, and -ENOENT otherwise.
  */
-static loff_t
-page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
+static bool
+page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
+		int whence)
 {
-	loff_t offset = page_offset(page);
-	struct buffer_head *bh, *head;
+	const struct address_space_operations *ops = inode->i_mapping->a_ops;
+	unsigned int bsize = i_blocksize(inode), off;
 	bool seek_data = whence == SEEK_DATA;
+	loff_t poff = page_offset(page);
 
-	if (lastoff < offset)
-		lastoff = offset;
-
-	bh = head = page_buffers(page);
-	do {
-		offset += bh->b_size;
-		if (lastoff >= offset)
-			continue;
+	if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
+		return false;
 
+	if (*lastoff < poff) {
 		/*
-		 * Any buffer with valid data in it should have BH_Uptodate set.
+		 * Last offset smaller than the start of the page means we found
+		 * a hole:
 		 */
-		if (buffer_uptodate(bh) == seek_data)
-			return lastoff;
+		if (whence == SEEK_HOLE)
+			return true;
+		*lastoff = poff;
+	}
 
-		lastoff = offset;
-	} while ((bh = bh->b_this_page) != head);
-	return -ENOENT;
+	/*
+	 * Just check the page unless we can and should check block ranges:
+	 */
+	if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) {
+		if (PageUptodate(page) == seek_data)
+			return true;
+		return false;
+	}
+
+	lock_page(page);
+	if (unlikely(page->mapping != inode->i_mapping))
+		goto out_unlock_not_found;
+
+	for (off = 0; off < PAGE_SIZE; off += bsize) {
+		if ((*lastoff & ~PAGE_MASK) >= off + bsize)
+			continue;
+		if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
+			unlock_page(page);
+			return true;
+		}
+		*lastoff = poff + off + bsize;
+	}
+
+out_unlock_not_found:
+	unlock_page(page);
+	return false;
 }
 
 /*
@@ -654,30 +677,8 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
-			/*
-			 * At this point, the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or
-			 * even swizzled back from swapper_space to tmpfs file
-			 * mapping.  However, page->index will not change
-			 * because we have a reference on the page.
-                         *
-			 * If current page offset is beyond where we've ended,
-			 * we've found a hole.
-                         */
-			if (whence == SEEK_HOLE &&
-			    lastoff < page_offset(page))
+			if (page_seek_hole_data(inode, page, &lastoff, whence))
 				goto check_range;
-
-			lock_page(page);
-			if (likely(page->mapping == inode->i_mapping) &&
-			    page_has_buffers(page)) {
-				lastoff = page_seek_hole_data(page, lastoff, whence);
-				if (lastoff >= 0) {
-					unlock_page(page);
-					goto check_range;
-				}
-			}
-			unlock_page(page);
 			lastoff = page_offset(page) + PAGE_SIZE;
 		}
 		pagevec_release(&pvec);
-- 
2.17.0


* [PATCH 06/33] mm: give the 'ret' variable a better name in __do_page_cache_readahead
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (4 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 05/33] fs: use ->is_partially_uptodate in page_cache_seek_hole_data Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09 15:45   ` Matthew Wilcox
  2018-05-09  7:48 ` [PATCH 07/33] mm: split ->readpages calls to avoid non-contiguous page lists Christoph Hellwig
                   ` (27 subsequent siblings)
  33 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

It counts the number of pages acted on, so name it nr_pages to make that
obvious.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 mm/readahead.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index 539bbb6c1fad..16d0cb1e2616 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -156,7 +156,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	unsigned long end_index;	/* The last page we want to read */
 	LIST_HEAD(page_pool);
 	int page_idx;
-	int ret = 0;
+	int nr_pages = 0;
 	loff_t isize = i_size_read(inode);
 	gfp_t gfp_mask = readahead_gfp_mask(mapping);
 
@@ -187,7 +187,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 		list_add(&page->lru, &page_pool);
 		if (page_idx == nr_to_read - lookahead_size)
 			SetPageReadahead(page);
-		ret++;
+		nr_pages++;
 	}
 
 	/*
@@ -195,11 +195,11 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	 * uptodate then the caller will launch readpage again, and
 	 * will then handle the error.
 	 */
-	if (ret)
-		read_pages(mapping, filp, &page_pool, ret, gfp_mask);
+	if (nr_pages)
+		read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
 	BUG_ON(!list_empty(&page_pool));
 out:
-	return ret;
+	return nr_pages;
 }
 
 /*
-- 
2.17.0


* [PATCH 07/33] mm: split ->readpages calls to avoid non-contiguous page lists
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (5 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 06/33] mm: give the 'ret' variable a better name in __do_page_cache_readahead Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09 15:46   ` Matthew Wilcox
  2018-05-09  7:48 ` [PATCH 08/33] iomap: use __bio_add_page in iomap_dio_zero Christoph Hellwig
                   ` (26 subsequent siblings)
  33 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

That way file systems don't have to scan for non-contiguous pages and
work around them.  It also kicks off I/O earlier, allowing it to finish
earlier and reduce latency.
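
For example (an illustrative sketch): readahead of pages 0-7 where page 3
is already cached now results in two read_pages() calls, one for pages
0-2 and one for pages 4-7, instead of a single call that would hand the
file system a discontiguous page list.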

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 mm/readahead.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index 16d0cb1e2616..3f608e00286d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -177,8 +177,18 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 		rcu_read_lock();
 		page = radix_tree_lookup(&mapping->i_pages, page_offset);
 		rcu_read_unlock();
-		if (page && !radix_tree_exceptional_entry(page))
+		if (page && !radix_tree_exceptional_entry(page)) {
+			/*
+			 * Page already present?  Kick off the current batch of
+			 * contiguous pages before continuing with the next
+			 * batch.
+			 */
+			if (nr_pages)
+				read_pages(mapping, filp, &page_pool, nr_pages,
+						gfp_mask);
+			nr_pages = 0;
 			continue;
+		}
 
 		page = __page_cache_alloc(gfp_mask);
 		if (!page)
-- 
2.17.0


* [PATCH 08/33] iomap: use __bio_add_page in iomap_dio_zero
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (6 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 07/33] mm: split ->readpages calls to avoid non-contiguous page lists Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 09/33] iomap: add an iomap_sector helper Christoph Hellwig
                   ` (25 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

We don't need any merging logic, and this also replaces a BUG_ON with a
WARN_ON_ONCE inside __bio_add_page for the impossible overflow condition.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index b3592183b1a0..58bb39bac72b 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -951,8 +951,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 	bio->bi_end_io = iomap_dio_bio_end_io;
 
 	get_page(page);
-	if (bio_add_page(bio, page, len, 0) != len)
-		BUG();
+	__bio_add_page(bio, page, len, 0);
 	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
 
 	atomic_inc(&dio->ref);
-- 
2.17.0


* [PATCH 09/33] iomap: add an iomap_sector helper
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (7 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 08/33] iomap: use __bio_add_page in iomap_dio_zero Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 10/33] iomap: add an iomap-based bmap implementation Christoph Hellwig
                   ` (24 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Factor the repeated calculation of the on-disk sector for a given logical
block into a little helper.
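
As a worked example (illustrative numbers, assuming 512-byte sectors,
i.e. SECTOR_SHIFT == 9): for a mapping with iomap->addr = 1048576 (the
on-disk byte address the extent starts at) and iomap->offset = 65536
(the file offset it starts at), the file position pos = 69632 yields
sector (1048576 + 69632 - 65536) >> 9 = 2056.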

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 58bb39bac72b..af525cb47339 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -96,6 +96,12 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 	return written ? written : ret;
 }
 
+static sector_t
+iomap_sector(struct iomap *iomap, loff_t pos)
+{
+	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
+}
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -353,11 +359,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
 static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
 		struct iomap *iomap)
 {
-	sector_t sector = (iomap->addr +
-			   (pos & PAGE_MASK) - iomap->offset) >> 9;
-
-	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
-			offset, bytes);
+	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
+			iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
 }
 
 static loff_t
@@ -945,8 +948,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 
 	bio = bio_alloc(GFP_KERNEL, 1);
 	bio_set_dev(bio, iomap->bdev);
-	bio->bi_iter.bi_sector =
-		(iomap->addr + pos - iomap->offset) >> 9;
+	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
 	bio->bi_private = dio;
 	bio->bi_end_io = iomap_dio_bio_end_io;
 
@@ -1027,8 +1029,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 
 		bio = bio_alloc(GFP_KERNEL, nr_pages);
 		bio_set_dev(bio, iomap->bdev);
-		bio->bi_iter.bi_sector =
-			(iomap->addr + pos - iomap->offset) >> 9;
+		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
 		bio->bi_write_hint = dio->iocb->ki_hint;
 		bio->bi_private = dio;
 		bio->bi_end_io = iomap_dio_bio_end_io;
-- 
2.17.0


* [PATCH 10/33] iomap: add an iomap-based bmap implementation
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (8 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 09/33] iomap: add an iomap_sector helper Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09 16:46   ` Darrick J. Wong
  2018-05-09  7:48 ` [PATCH 11/33] iomap: add an iomap-based readpage and readpages implementation Christoph Hellwig
                   ` (23 subsequent siblings)
  33 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

This adds a simple iomap-based implementation of the legacy ->bmap
interface.  Note that we can't easily add checks for rt or reflink
files, so these will have to remain in the callers.  This interface
just needs to die..

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c            | 29 +++++++++++++++++++++++++++++
 include/linux/iomap.h |  3 +++
 2 files changed, 32 insertions(+)

diff --git a/fs/iomap.c b/fs/iomap.c
index af525cb47339..049e0c4aacac 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1201,3 +1201,32 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_dio_rw);
+
+static loff_t
+iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	sector_t *bno = data;
+
+	if (iomap->type == IOMAP_MAPPED)
+		*bno = (iomap->addr + pos - iomap->offset) >> inode->i_blkbits;
+	return 0;
+}
+
+/* legacy ->bmap interface.  0 is the error return (!) */
+sector_t
+iomap_bmap(struct address_space *mapping, sector_t bno,
+		const struct iomap_ops *ops)
+{
+	struct inode *inode = mapping->host;
+	loff_t pos = bno << inode->i_blkbits;
+	unsigned blocksize = i_blocksize(inode);
+
+	if (filemap_write_and_wait(mapping))
+		return 0;
+
+	bno = 0;
+	iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
+	return bno;
+}
+EXPORT_SYMBOL_GPL(iomap_bmap);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 19a07de28212..07f73224c38b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -4,6 +4,7 @@
 
 #include <linux/types.h>
 
+struct address_space;
 struct fiemap_extent_info;
 struct inode;
 struct iov_iter;
@@ -95,6 +96,8 @@ loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
 		const struct iomap_ops *ops);
 loff_t iomap_seek_data(struct inode *inode, loff_t offset,
 		const struct iomap_ops *ops);
+sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
+		const struct iomap_ops *ops);
 
 /*
  * Flags for direct I/O ->end_io:
-- 
2.17.0


* [PATCH 11/33] iomap: add an iomap-based readpage and readpages implementation
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (9 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 10/33] iomap: add an iomap-based bmap implementation Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-10  1:17   ` Dave Chinner
  2018-05-09  7:48 ` [PATCH 12/33] xfs: use iomap_bmap Christoph Hellwig
                   ` (22 subsequent siblings)
  33 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Simply use iomap_apply to iterate over the file and submit a bio for
each non-uptodate but mapped region and zero everything else.  Note that
as-is this cannot be used for file systems with a blocksize smaller than
the page size, but that support will be added later.
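
As a sketch of how a file system wires this up (hypothetical example_*
names; the XFS conversion follows later in this series):

	static int example_read_iomap_begin(struct inode *inode, loff_t pos,
			loff_t length, unsigned flags, struct iomap *iomap)
	{
		/* map [pos, pos + length) to disk, or report a hole */
		iomap->offset = pos;
		iomap->length = length;
		iomap->type = IOMAP_HOLE;	/* IOMAP_MAPPED once allocated */
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->bdev = inode->i_sb->s_bdev;
		return 0;
	}

	static const struct iomap_ops example_iomap_ops = {
		.iomap_begin	= example_read_iomap_begin,
	};

	static int example_readpage(struct file *unused, struct page *page)
	{
		return iomap_readpage(page, &example_iomap_ops);
	}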

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c            | 195 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/iomap.h |   4 +
 2 files changed, 198 insertions(+), 1 deletion(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 049e0c4aacac..967bd31540fe 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016 Christoph Hellwig.
+ * Copyright (c) 2016-2018 Christoph Hellwig.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -18,6 +18,7 @@
 #include <linux/uaccess.h>
 #include <linux/gfp.h>
 #include <linux/mm.h>
+#include <linux/mm_inline.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
@@ -102,6 +103,198 @@ iomap_sector(struct iomap *iomap, loff_t pos)
 	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
 }
 
+static inline bool
+iomap_block_needs_zeroing(struct inode *inode, loff_t pos, struct iomap *iomap)
+{
+	return iomap->type != IOMAP_MAPPED || pos > i_size_read(inode);
+}
+
+static void
+iomap_read_end_io(struct bio *bio)
+{
+	int error = blk_status_to_errno(bio->bi_status);
+	struct bio_vec *bvec;
+	int i;
+
+	bio_for_each_segment_all(bvec, bio, i)
+		page_endio(bvec->bv_page, false, error);
+	bio_put(bio);
+}
+
+static struct bio *
+iomap_read_bio_alloc(struct iomap *iomap, sector_t sector, loff_t length)
+{
+	int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	struct bio *bio = bio_alloc(GFP_NOFS, min(BIO_MAX_PAGES, nr_vecs));
+
+	bio->bi_opf = REQ_OP_READ;
+	bio->bi_iter.bi_sector = sector;
+	bio_set_dev(bio, iomap->bdev);
+	bio->bi_end_io = iomap_read_end_io;
+	return bio;
+}
+
+struct iomap_readpage_ctx {
+	struct page		*cur_page;
+	bool			cur_page_in_bio;
+	struct bio		*bio;
+	struct list_head	*pages;
+};
+
+static loff_t
+iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iomap_readpage_ctx *ctx = data;
+	struct page *page = ctx->cur_page;
+	unsigned poff = pos & (PAGE_SIZE - 1);
+	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+	bool is_contig = false;
+	sector_t sector;
+
+	/* we don't support blocksize < PAGE_SIZE quite yet: */
+	WARN_ON_ONCE(pos != page_offset(page));
+	WARN_ON_ONCE(plen != PAGE_SIZE);
+
+	if (iomap_block_needs_zeroing(inode, pos, iomap)) {
+		zero_user(page, poff, plen);
+		SetPageUptodate(page);
+		goto done;
+	}
+
+	ctx->cur_page_in_bio = true;
+
+	/*
+	 * Try to merge into a previous segment if we can.
+	 */
+	sector = iomap_sector(iomap, pos);
+	if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
+		if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+			goto done;
+		is_contig = true;
+	}
+
+	if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
+		if (ctx->bio)
+			submit_bio(ctx->bio);
+		ctx->bio = iomap_read_bio_alloc(iomap, sector, length);
+	}
+
+	__bio_add_page(ctx->bio, page, plen, poff);
+done:
+	return plen;
+}
+
+int
+iomap_readpage(struct page *page, const struct iomap_ops *ops)
+{
+	struct iomap_readpage_ctx ctx = { .cur_page = page };
+	struct inode *inode = page->mapping->host;
+	unsigned poff;
+	loff_t ret;
+
+	WARN_ON_ONCE(page_has_buffers(page));
+
+	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
+		ret = iomap_apply(inode, page_offset(page) + poff,
+				PAGE_SIZE - poff, 0, ops, &ctx,
+				iomap_readpage_actor);
+		if (ret <= 0) {
+			SetPageError(page);
+			break;
+		}
+	}
+
+	if (ctx.bio)
+		submit_bio(ctx.bio);
+	else
+		unlock_page(page);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_readpage);
+
+static struct page *
+iomap_next_page(struct inode *inode, struct list_head *pages, loff_t end,
+		loff_t *done)
+{
+	while (!list_empty(pages)) {
+		struct page *page = lru_to_page(pages);
+
+		if (page_offset(page) >= end)
+			break;
+
+		list_del(&page->lru);
+		if (!add_to_page_cache_lru(page, inode->i_mapping, page->index,
+				GFP_NOFS))
+			return page;
+
+		*done += PAGE_SIZE;
+		put_page(page);
+	}
+
+	return NULL;
+}
+
+static loff_t
+iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	struct iomap_readpage_ctx *ctx = data;
+	loff_t done, ret;
+
+	for (done = 0; done < length; done += ret) {
+		if (ctx->cur_page && ((pos + done) & (PAGE_SIZE - 1)) == 0) {
+			if (!ctx->cur_page_in_bio)
+				unlock_page(ctx->cur_page);
+			put_page(ctx->cur_page);
+			ctx->cur_page = NULL;
+		}
+		if (!ctx->cur_page) {
+			ctx->cur_page = iomap_next_page(inode, ctx->pages,
+					pos + length, &done);
+			if (!ctx->cur_page)
+				break;
+			ctx->cur_page_in_bio = false;
+		}
+		ret = iomap_readpage_actor(inode, pos + done, length - done,
+				ctx, iomap);
+	}
+
+	return done;
+}
+
+int
+iomap_readpages(struct address_space *mapping, struct list_head *pages,
+		unsigned nr_pages, const struct iomap_ops *ops)
+{
+	struct iomap_readpage_ctx ctx = { .pages = pages };
+	loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
+	loff_t last = page_offset(list_entry(pages->next, struct page, lru));
+	loff_t length = last - pos + PAGE_SIZE, ret = 0;
+
+	while (length > 0) {
+		ret = iomap_apply(mapping->host, pos, length, 0, ops,
+				&ctx, iomap_readpages_actor);
+		if (ret <= 0)
+			break;
+		pos += ret;
+		length -= ret;
+	}
+
+	ret = 0;
+
+	if (ctx.bio)
+		submit_bio(ctx.bio);
+	if (ctx.cur_page) {
+		if (!ctx.cur_page_in_bio)
+			unlock_page(ctx.cur_page);
+		put_page(ctx.cur_page);
+	}
+	WARN_ON_ONCE(ret && !list_empty(ctx.pages));
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_readpages);
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 07f73224c38b..4710789620e7 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -9,6 +9,7 @@ struct fiemap_extent_info;
 struct inode;
 struct iov_iter;
 struct kiocb;
+struct page;
 struct vm_area_struct;
 struct vm_fault;
 
@@ -83,6 +84,9 @@ struct iomap_ops {
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);
+int iomap_readpage(struct page *page, const struct iomap_ops *ops);
+int iomap_readpages(struct address_space *mapping, struct list_head *pages,
+		unsigned nr_pages, const struct iomap_ops *ops);
 int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
-- 
2.17.0


* [PATCH 12/33] xfs: use iomap_bmap
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (10 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 11/33] iomap: add an iomap-based readpage and readpages implementation Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 13/33] xfs: use iomap for blocksize == PAGE_SIZE readpage and readpages Christoph Hellwig
                   ` (21 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Switch to the iomap based bmap implementation to get rid of one of the
last users of xfs_get_blocks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 0ab824f574ed..8b06be0a80da 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1378,10 +1378,9 @@ xfs_vm_bmap(
 	struct address_space	*mapping,
 	sector_t		block)
 {
-	struct inode		*inode = (struct inode *)mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_inode	*ip = XFS_I(mapping->host);
 
-	trace_xfs_vm_bmap(XFS_I(inode));
+	trace_xfs_vm_bmap(ip);
 
 	/*
 	 * The swap code (ab-)uses ->bmap to get a block mapping and then
@@ -1394,9 +1393,7 @@ xfs_vm_bmap(
 	 */
 	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
 		return 0;
-
-	filemap_write_and_wait(mapping);
-	return generic_block_bmap(mapping, block, xfs_get_blocks);
+	return iomap_bmap(mapping, block, &xfs_iomap_ops);
 }
 
 STATIC int
-- 
2.17.0


* [PATCH 13/33] xfs: use iomap for blocksize == PAGE_SIZE readpage and readpages
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (11 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 12/33] xfs: use iomap_bmap Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 14/33] xfs: simplify xfs_bmap_punch_delalloc_range Christoph Hellwig
                   ` (20 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

For file systems with a block size that equals the page size we never do
partial reads, so we can use the buffer_head-less iomap versions of
readpage and readpages without conflicting with the buffer_head structures
created later in write_begin.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8b06be0a80da..8e4d01e76fc8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1402,6 +1402,8 @@ xfs_vm_readpage(
 	struct page		*page)
 {
 	trace_xfs_vm_readpage(page->mapping->host, 1);
+	if (i_blocksize(page->mapping->host) == PAGE_SIZE)
+		return iomap_readpage(page, &xfs_iomap_ops);
 	return mpage_readpage(page, xfs_get_blocks);
 }
 
@@ -1413,6 +1415,8 @@ xfs_vm_readpages(
 	unsigned		nr_pages)
 {
 	trace_xfs_vm_readpages(mapping->host, nr_pages);
+	if (i_blocksize(mapping->host) == PAGE_SIZE)
+		return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
 }
 
-- 
2.17.0


* [PATCH 14/33] xfs: simplify xfs_bmap_punch_delalloc_range
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (12 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 13/33] xfs: use iomap for blocksize == PAGE_SIZE readpage and readpages Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 15/33] xfs: simplify xfs_aops_discard_page Christoph Hellwig
                   ` (19 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Instead of using xfs_bmapi_read to find delalloc extents and then punch
them out using xfs_bunmapi, opencode the loop to iterate over the extents
and call xfs_bmap_del_extent_delay directly.  This both simplifies the
code and reduces the number of extent tree lookups required.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_bmap_util.c | 78 ++++++++++++++----------------------------
 1 file changed, 25 insertions(+), 53 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 8cd8c412f52d..7d2ba4cc8fba 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -695,12 +695,10 @@ xfs_getbmap(
 }
 
 /*
- * dead simple method of punching delalyed allocation blocks from a range in
- * the inode. Walks a block at a time so will be slow, but is only executed in
- * rare error cases so the overhead is not critical. This will always punch out
- * both the start and end blocks, even if the ranges only partially overlap
- * them, so it is up to the caller to ensure that partial blocks are not
- * passed in.
+ * Dead simple method of punching delayed allocation blocks from a range in
+ * the inode.  This will always punch out both the start and end blocks, even
+ * if the ranges only partially overlap them, so it is up to the caller to
+ * ensure that partial blocks are not passed in.
  */
 int
 xfs_bmap_punch_delalloc_range(
@@ -708,63 +706,37 @@ xfs_bmap_punch_delalloc_range(
 	xfs_fileoff_t		start_fsb,
 	xfs_fileoff_t		length)
 {
-	xfs_fileoff_t		remaining = length;
+	struct xfs_ifork	*ifp = &ip->i_df;
+	struct xfs_bmbt_irec	got, del;
+	struct xfs_iext_cursor	icur;
 	int			error = 0;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
-	do {
-		int		done;
-		xfs_bmbt_irec_t	imap;
-		int		nimaps = 1;
-		xfs_fsblock_t	firstblock;
-		struct xfs_defer_ops dfops;
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+		if (error)
+			return error;
+	}
 
-		/*
-		 * Map the range first and check that it is a delalloc extent
-		 * before trying to unmap the range. Otherwise we will be
-		 * trying to remove a real extent (which requires a
-		 * transaction) or a hole, which is probably a bad idea...
-		 */
-		error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
-				       XFS_BMAPI_ENTIRE);
+	if (!xfs_iext_lookup_extent(ip, ifp, start_fsb, &icur, &got))
+		return 0;
 
-		if (error) {
-			/* something screwed, just bail */
-			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-				xfs_alert(ip->i_mount,
-			"Failed delalloc mapping lookup ino %lld fsb %lld.",
-						ip->i_ino, start_fsb);
-			}
+	do {
+		if (got.br_startoff >= start_fsb + length)
 			break;
-		}
-		if (!nimaps) {
-			/* nothing there */
-			goto next_block;
-		}
-		if (imap.br_startblock != DELAYSTARTBLOCK) {
-			/* been converted, ignore */
-			goto next_block;
-		}
-		WARN_ON(imap.br_blockcount == 0);
+		if (!isnullstartblock(got.br_startblock))
+			continue;
 
-		/*
-		 * Note: while we initialise the firstblock/dfops pair, they
-		 * should never be used because blocks should never be
-		 * allocated or freed for a delalloc extent and hence we need
-		 * don't cancel or finish them after the xfs_bunmapi() call.
-		 */
-		xfs_defer_init(&dfops, &firstblock);
-		error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
-					&dfops, &done);
+		del = got;
+		xfs_trim_extent(&del, start_fsb, length);
+		error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur,
+				&got, &del);
 		if (error)
 			break;
-
-		ASSERT(!xfs_defer_has_unfinished_work(&dfops));
-next_block:
-		start_fsb++;
-		remaining--;
-	} while(remaining > 0);
+		if (!xfs_iext_get_extent(ifp, &icur, &got))
+			break;
+	} while (xfs_iext_next_extent(ifp, &icur, &got));
 
 	return error;
 }
-- 
2.17.0


* [PATCH 15/33] xfs: simplify xfs_aops_discard_page
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (13 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 14/33] xfs: simplify xfs_bmap_punch_delalloc_range Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 16/33] xfs: move locking into xfs_bmap_punch_delalloc_range Christoph Hellwig
                   ` (18 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Instead of looking at the buffer heads to see if a block is delalloc,
just call xfs_bmap_punch_delalloc_range on the whole page - this will
leave any non-delalloc block intact and handle the iteration for us.  As
a side effect one more place stops caring about buffer heads and we can
remove the xfs_check_page_type function entirely.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 85 +++++------------------------------------------
 1 file changed, 9 insertions(+), 76 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8e4d01e76fc8..313449ac0288 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -711,49 +711,6 @@ xfs_map_at_offset(
 	clear_buffer_unwritten(bh);
 }
 
-/*
- * Test if a given page contains at least one buffer of a given @type.
- * If @check_all_buffers is true, then we walk all the buffers in the page to
- * try to find one of the type passed in. If it is not set, then the caller only
- * needs to check the first buffer on the page for a match.
- */
-STATIC bool
-xfs_check_page_type(
-	struct page		*page,
-	unsigned int		type,
-	bool			check_all_buffers)
-{
-	struct buffer_head	*bh;
-	struct buffer_head	*head;
-
-	if (PageWriteback(page))
-		return false;
-	if (!page->mapping)
-		return false;
-	if (!page_has_buffers(page))
-		return false;
-
-	bh = head = page_buffers(page);
-	do {
-		if (buffer_unwritten(bh)) {
-			if (type == XFS_IO_UNWRITTEN)
-				return true;
-		} else if (buffer_delay(bh)) {
-			if (type == XFS_IO_DELALLOC)
-				return true;
-		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
-			if (type == XFS_IO_OVERWRITE)
-				return true;
-		}
-
-		/* If we are only checking the first buffer, we are done now. */
-		if (!check_all_buffers)
-			break;
-	} while ((bh = bh->b_this_page) != head);
-
-	return false;
-}
-
 STATIC void
 xfs_vm_invalidatepage(
 	struct page		*page,
@@ -785,9 +742,6 @@ xfs_vm_invalidatepage(
  * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
  * truncation without a transaction as there is no space left for block
  * reservation (typically why we see a ENOSPC in writeback).
- *
- * This is not a performance critical path, so for now just do the punching a
- * buffer head at a time.
  */
 STATIC void
 xfs_aops_discard_page(
@@ -795,47 +749,26 @@ xfs_aops_discard_page(
 {
 	struct inode		*inode = page->mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct buffer_head	*bh, *head;
+	struct xfs_mount	*mp = ip->i_mount;
 	loff_t			offset = page_offset(page);
+	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, offset);
+	int			error;
 
-	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
-		goto out_invalidate;
-
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+	if (XFS_FORCED_SHUTDOWN(mp))
 		goto out_invalidate;
 
-	xfs_alert(ip->i_mount,
+	xfs_alert(mp,
 		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
 			page, ip->i_ino, offset);
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	bh = head = page_buffers(page);
-	do {
-		int		error;
-		xfs_fileoff_t	start_fsb;
-
-		if (!buffer_delay(bh))
-			goto next_buffer;
-
-		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
-		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
-		if (error) {
-			/* something screwed, just bail */
-			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-				xfs_alert(ip->i_mount,
-			"page discard unable to remove delalloc mapping.");
-			}
-			break;
-		}
-next_buffer:
-		offset += i_blocksize(inode);
-
-	} while ((bh = bh->b_this_page) != head);
-
+	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+			PAGE_SIZE / i_blocksize(inode));
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	if (error && !XFS_FORCED_SHUTDOWN(mp))
+		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
 out_invalidate:
 	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
-	return;
 }
 
 static int
-- 
2.17.0


* [PATCH 16/33] xfs: move locking into xfs_bmap_punch_delalloc_range
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (14 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 15/33] xfs: simplify xfs_aops_discard_page Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 17/33] xfs: make xfs_writepage_map extent map centric Christoph Hellwig
                   ` (17 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Both callers want the same locking, so do it only once.
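
As a minimal userspace sketch of the resulting pattern - a pthread mutex
standing in for the XFS inode lock, with illustrative names rather than
the kernel API:

	#include <pthread.h>

	static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;

	/* the actual work; assumes ilock is held */
	static int punch_delalloc_locked(long start_fsb, long count)
	{
		return 0;
	}

	/*
	 * The callee now takes the lock itself, so both callers lose their
	 * lock/unlock pairs and every error path funnels through the single
	 * unlock below.
	 */
	static int punch_delalloc_range(long start_fsb, long count)
	{
		int error;

		pthread_mutex_lock(&ilock);
		error = punch_delalloc_locked(start_fsb, count);
		pthread_mutex_unlock(&ilock);
		return error;
	}

	int main(void)
	{
		return punch_delalloc_range(0, 1);
	}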

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c      | 2 --
 fs/xfs/xfs_bmap_util.c | 7 ++++---
 fs/xfs/xfs_iomap.c     | 3 ---
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 313449ac0288..196c5daa6dce 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -761,10 +761,8 @@ xfs_aops_discard_page(
 		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
 			page, ip->i_ino, offset);
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
 			PAGE_SIZE / i_blocksize(inode));
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	if (error && !XFS_FORCED_SHUTDOWN(mp))
 		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
 out_invalidate:
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 7d2ba4cc8fba..82189ea97319 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -711,12 +711,11 @@ xfs_bmap_punch_delalloc_range(
 	struct xfs_iext_cursor	icur;
 	int			error = 0;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
 		if (error)
-			return error;
+			goto out_unlock;
 	}
 
 	if (!xfs_iext_lookup_extent(ip, ifp, start_fsb, &icur, &got))
@@ -738,6 +737,8 @@ xfs_bmap_punch_delalloc_range(
 			break;
 	} while (xfs_iext_next_extent(ifp, &icur, &got));
 
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 046469fcc1b8..407bbd978d1b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1167,11 +1167,8 @@ xfs_file_iomap_end_delalloc(
 		truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
 					 XFS_FSB_TO_B(mp, end_fsb) - 1);
 
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
 					       end_fsb - start_fsb);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
 		if (error && !XFS_FORCED_SHUTDOWN(mp)) {
 			xfs_alert(mp, "%s: unable to clean up ino %lld",
 				__func__, ip->i_ino);
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 17/33] xfs: make xfs_writepage_map extent map centric
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (15 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 16/33] xfs: move locking into xfs_bmap_punch_delalloc_range Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 18/33] xfs: remove the now unused XFS_BMAPI_IGSTATE flag Christoph Hellwig
                   ` (16 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm, Dave Chinner

From: Dave Chinner <dchinner@redhat.com>

xfs_writepage_map() iterates over the bufferheads on a page to decide
what sort of IO to do and what actions to take.  However, when it comes
to reflink and deciding when it needs to execute a COW operation, we no
longer look at the bufferhead state but instead we ignore that and look
up internal state held in the COW fork extent list.

This means xfs_writepage_map() is somewhat confused. It does stuff, then
ignores it, then tries to handle the impedance mismatch by shovelling the
results inside the existing mapping code.  It works, but it's a bit of a
mess and it makes it hard to fix the cached map bug that the writepage
code currently has.

To unify the two different mechanisms, we first have to choose a direction.
That's already been set - we're de-emphasising bufferheads so they are no
longer a control structure, as we need to do that to allow for eventual
removal.  Hence we need to move away from looking at bufferhead state to
determine what operations we need to perform.

We can't completely get rid of bufferheads yet - they do contain some
state that is absolutely necessary, such as whether that part of the page
contains valid data or not (buffer_uptodate()).  Other state in the
bufferhead is redundant:

	BH_dirty - the page is dirty, so we can ignore this and just
		write it
	BH_delay - we have delalloc extent info in the DATA fork extent
		tree
	BH_unwritten - same as BH_delay
	BH_mapped - indicates we've already used it once for IO and it is
		mapped to a disk address. Needs to be ignored for COW
		blocks.

The BH_mapped flag is an interesting case - it's supposed to indicate that
it's already mapped to disk and so we can just use it "as is".  In theory,
we don't even have to do an extent lookup to find where to write it to,
but we have to do that anyway to determine we are actually writing over a
valid extent.  Hence it's not even serving the purpose of avoiding an
extent lookup during writeback, and so we can pretty much ignore it.
Especially as we have to ignore it for COW operations...

Therefore, use the extent map as the source of information to tell us
what actions we need to take and what sort of IO we should perform.  The
first step is integrating xfs_map_blocks() and xfs_map_cow() and having
xfs_map_blocks() set the io type according to what it looks up.  This
means it can easily handle both normal overwrite and COW cases.  The
only thing we also need to add is the ability to return hole mappings.

We need to return and cache hole mappings now for the case of multiple
blocks per page.  We no longer use the BH_mapped flag to indicate a block
a hole, so we have to get that info from xfs_map_blocks().  We cache it so
that holes that span two pages don't need separate lookups.  This allows us
to avoid ever doing write IO over a hole, too.

Further, we need to drop the XFS_BMAPI_IGSTATE flag so that we don't
combine contiguous written and unwritten extents into a single map.  The
io type needs to match the extent type we are writing to so that we run the
correct IO completion routine for the IO. There is scope for optimisation
that would allow us to re-instate the XFS_BMAPI_IGSTATE flag, but this
requires tweaks to code outside the scope of this change.

Now that we have xfs_map_blocks() returning both a cached map and the type
of IO we need to perform, we can rewrite xfs_writepage_map() to drop all
the bufferhead control. It's also much simplified because it doesn't need
to explicitly handle COW operations.  Instead of iterating bufferheads, it
iterates blocks within the page and then looks up what per-block state is
required from the appropriate bufferhead.  It then validates the cached
map, and if it's not valid, we get a new map.  If we don't get a valid map
or it's over a hole, we skip the block.

At this point, we have to remap the bufferhead via xfs_map_at_offset().
As previously noted, we had to do this even if the buffer was already
mapped as the mapping would be stale for XFS_IO_DELALLOC, XFS_IO_UNWRITTEN
and XFS_IO_COW IO types.  With xfs_map_blocks() now controlling the type,
even XFS_IO_OVERWRITE types need remapping, as converted-but-not-yet-
written delalloc extents beyond EOF can be reported as XFS_IO_OVERWRITE.
Bufferheads that span such regions still need their BH_Delay flags cleared
and their block numbers calculated, so we now unconditionally map each
bufferhead before submission.

But wait! There's more - remember the old "treat unwritten extents as
holes on read" hack?  Yeah, that means we can have a dirty page with
unmapped, unwritten bufferheads that contain data!  What makes these so
special is that the unwritten "hole" bufferheads do not have a valid block
device pointer, so if we attempt to write them xfs_add_to_ioend() blows
up. So we make xfs_map_at_offset() do the "realtime or data device"
lookup from the inode and ignore what was or wasn't put into the
bufferhead when the buffer was instantiated.

The astute reader will have realised by now that this code treats
unwritten extents in multiple-blocks-per-page situations differently.
If we get any combination of unwritten blocks on a dirty page that contain
valid data in the page, we're going to convert them to real extents.  This
can actually be a win, because it means that pages with interleaving
unwritten and written blocks will get converted to a single written extent
with zeros replacing the interspersed unwritten blocks.  This is actually
good for reducing extent list and conversion overhead, and it means we
issue a contiguous IO instead of lots of little ones.  The downside is
that we use up a little extra IO bandwidth.  Neither of these seem like a
bad thing given that spinning disks are seek sensitive, and SSDs/pmem have
bandwidth to burn and the lower IO latency/CPU overhead of fewer, larger
IOs will result in better performance on them...

As a result of all this, the only state we actually care about from the
bufferhead is a single flag - BH_Uptodate. We still use the bufferhead to
pass some information to the bio via xfs_add_to_ioend(), but that is
trivial to separate and pass explicitly.  This means we really only need
1 bit of state per block per page from the buffered write path in the
writeback path.  Everything else we do with the bufferhead is purely to
make the buffered IO front end continue to work correctly, i.e. we've
pretty much marginalised bufferheads in the writeback path completely.
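
To make the new decision ladder concrete, here is a self-contained
userspace sketch of the classification xfs_map_blocks() now performs.
The types and sentinel values are simplified stand-ins: in the kernel
the delalloc case is detected with isnullstartblock(), and the COW fork
is checked before the data fork lookup.

	#include <stdio.h>
	#include <stdbool.h>

	enum io_type { IO_HOLE, IO_DELALLOC, IO_UNWRITTEN, IO_OVERWRITE, IO_COW };

	#define HOLESTARTBLOCK		-2LL	/* modelled sentinels, not the */
	#define DELAYSTARTBLOCK	-1LL	/* real kernel definitions */

	struct irec {
		long long	startoff;	/* file offset, in fs blocks */
		long long	startblock;	/* disk block or sentinel */
		bool		unwritten;	/* extent state */
	};

	static enum io_type classify(bool covered_by_cow, bool found,
				     const struct irec *imap)
	{
		if (covered_by_cow)
			return IO_COW;		/* COW fork wins over data fork */
		if (!found || imap->startblock == HOLESTARTBLOCK)
			return IO_HOLE;		/* nothing to write here */
		if (imap->startblock == DELAYSTARTBLOCK)
			return IO_DELALLOC;	/* needs real allocation */
		if (imap->unwritten)
			return IO_UNWRITTEN;	/* allocated, not yet written */
		return IO_OVERWRITE;		/* plain overwrite */
	}

	int main(void)
	{
		struct irec delalloc = { 0, DELAYSTARTBLOCK, false };

		printf("%d\n", classify(false, true, &delalloc)); /* IO_DELALLOC */
		return 0;
	}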

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
[hch: forward port + slight refactoring]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 277 +++++++++++++++++++++-------------------------
 fs/xfs/xfs_aops.h |   4 +-
 2 files changed, 130 insertions(+), 151 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 196c5daa6dce..6ad43829c89a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -378,78 +378,101 @@ xfs_map_blocks(
 	struct inode		*inode,
 	loff_t			offset,
 	struct xfs_bmbt_irec	*imap,
-	int			type)
+	int			*type)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			count = i_blocksize(inode);
 	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			whichfork = XFS_DATA_FORK;
 	int			error = 0;
-	int			bmapi_flags = XFS_BMAPI_ENTIRE;
 	int			nimaps = 1;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	/*
-	 * Truncate can race with writeback since writeback doesn't take the
-	 * iolock and truncate decreases the file size before it starts
-	 * truncating the pages between new_size and old_size.  Therefore, we
-	 * can end up in the situation where writeback gets a CoW fork mapping
-	 * but the truncate makes the mapping invalid and we end up in here
-	 * trying to get a new mapping.  Bail out here so that we simply never
-	 * get a valid mapping and so we drop the write altogether.  The page
-	 * truncation will kill the contents anyway.
-	 */
-	if (type == XFS_IO_COW && offset > i_size_read(inode))
-		return 0;
-
-	ASSERT(type != XFS_IO_COW);
-	if (type == XFS_IO_UNWRITTEN)
-		bmapi_flags |= XFS_BMAPI_IGSTATE;
-
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       (ip->i_df.if_flags & XFS_IFEXTENTS));
 	ASSERT(offset <= mp->m_super->s_maxbytes);
 
+	if (xfs_is_reflink_inode(ip) &&
+	    xfs_reflink_find_cow_mapping(ip, offset, imap)) {
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		/*
+		 * Truncate can race with writeback since writeback doesn't
+		 * take the iolock and truncate decreases the file size before
+		 * it starts truncating the pages between new_size and old_size.
+		 * Therefore, we can end up in the situation where writeback
+		 * gets a CoW fork mapping but the truncate makes the mapping
+		 * invalid and we end up in here trying to get a new mapping.
+		 * bail out here so that we simply never get a valid mapping
+		 * and so we drop the write altogether.  The page truncation
+		 * will kill the contents anyway.
+		 */
+		if (offset > i_size_read(inode))
+			return 0;
+		whichfork = XFS_COW_FORK;
+		*type = XFS_IO_COW;
+		goto done;
+	}
+
 	if (offset > mp->m_super->s_maxbytes - count)
 		count = mp->m_super->s_maxbytes - offset;
 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
-				imap, &nimaps, bmapi_flags);
-	/*
-	 * Truncate an overwrite extent if there's a pending CoW
-	 * reservation before the end of this extent.  This forces us
-	 * to come back to writepage to take care of the CoW.
-	 */
-	if (nimaps && type == XFS_IO_OVERWRITE)
+				imap, &nimaps, XFS_BMAPI_ENTIRE);
+	if (!nimaps) {
+		/*
+		 * Lookup returns no match? Beyond eof? regardless,
+		 * return it as a hole so we don't write it
+		 */
+		imap->br_startoff = offset_fsb;
+		imap->br_blockcount = end_fsb - offset_fsb;
+		imap->br_startblock = HOLESTARTBLOCK;
+		*type = XFS_IO_HOLE;
+	} else if (imap->br_startblock == HOLESTARTBLOCK) {
+		/* landed in a hole */
+		*type = XFS_IO_HOLE;
+	} else if (isnullstartblock(imap->br_startblock)) {
+		/* got a delalloc extent */
+		*type = XFS_IO_DELALLOC;
+	} else {
+		/*
+		 * Got an existing extent for overwrite.  Truncate it if there
+		 * is a pending CoW reservation before the end of this extent,
+		 * so that we pick up the COW extents in the next iteration.
+		 */
 		xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);
+		if (imap->br_state == XFS_EXT_UNWRITTEN)
+			*type = XFS_IO_UNWRITTEN;
+		else
+			*type = XFS_IO_OVERWRITE;
+	}
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
 	if (error)
 		return error;
 
-	if (type == XFS_IO_DELALLOC &&
-	    (!nimaps || isnullstartblock(imap->br_startblock))) {
-		error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset,
-				imap);
-		if (!error)
-			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
-		return error;
-	}
-
-#ifdef DEBUG
-	if (type == XFS_IO_UNWRITTEN) {
-		ASSERT(nimaps);
-		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+done:
+	switch (*type) {
+	case XFS_IO_HOLE:
+	case XFS_IO_OVERWRITE:
+	case XFS_IO_UNWRITTEN:
+		/* nothing to do! */
+		trace_xfs_map_blocks_found(ip, offset, count, *type, imap);
+		return 0;
+	case XFS_IO_DELALLOC:
+	case XFS_IO_COW:
+		error = xfs_iomap_write_allocate(ip, whichfork, offset, imap);
+		if (error)
+			return error;
+		trace_xfs_map_blocks_alloc(ip, offset, count, *type, imap);
+		return 0;
+	default:
+		ASSERT(1);
+		return -EFSCORRUPTED;
 	}
-#endif
-	if (nimaps)
-		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
-	return 0;
 }
 
 STATIC bool
@@ -709,6 +732,14 @@ xfs_map_at_offset(
 	set_buffer_mapped(bh);
 	clear_buffer_delay(bh);
 	clear_buffer_unwritten(bh);
+
+	/*
+	 * If this is a realtime file, data may be on a different device
+	 * to that pointed to from the buffer_head b_bdev currently. We can't
+	 * trust that the bufferhead has already been mapped correctly, so
+	 * set the bdev now.
+	 */
+	bh->b_bdev = xfs_find_bdev_for_inode(inode);
 }
 
 STATIC void
@@ -769,56 +800,6 @@ xfs_aops_discard_page(
 	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
 }
 
-static int
-xfs_map_cow(
-	struct xfs_writepage_ctx *wpc,
-	struct inode		*inode,
-	loff_t			offset,
-	unsigned int		*new_type)
-{
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_bmbt_irec	imap;
-	bool			is_cow = false;
-	int			error;
-
-	/*
-	 * If we already have a valid COW mapping keep using it.
-	 */
-	if (wpc->io_type == XFS_IO_COW) {
-		wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
-		if (wpc->imap_valid) {
-			*new_type = XFS_IO_COW;
-			return 0;
-		}
-	}
-
-	/*
-	 * Else we need to check if there is a COW mapping at this offset.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-	if (!is_cow)
-		return 0;
-
-	/*
-	 * And if the COW mapping has a delayed extent here we need to
-	 * allocate real space for it now.
-	 */
-	if (isnullstartblock(imap.br_startblock)) {
-		error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
-				&imap);
-		if (error)
-			return error;
-	}
-
-	wpc->io_type = *new_type = XFS_IO_COW;
-	wpc->imap_valid = true;
-	wpc->imap = imap;
-	return 0;
-}
-
 /*
  * We implement an immediate ioend submission policy here to avoid needing to
  * chain multiple ioends and hence nest mempool allocations which can violate
@@ -845,85 +826,81 @@ xfs_writepage_map(
 {
 	LIST_HEAD(submit_list);
 	struct xfs_ioend	*ioend, *next;
-	struct buffer_head	*bh, *head;
+	struct buffer_head	*bh;
 	ssize_t			len = i_blocksize(inode);
-	uint64_t		offset;
 	int			error = 0;
 	int			count = 0;
-	int			uptodate = 1;
-	unsigned int		new_type;
+	bool			uptodate = true;
+	loff_t			file_offset;	/* file offset of page */
+	unsigned		poffset;	/* offset into page */
 
-	bh = head = page_buffers(page);
-	offset = page_offset(page);
-	do {
-		if (offset >= end_offset)
+	/*
+	 * Walk the blocks on the page, and if we run off the end of the
+	 * current map or find the current map invalid, grab a new one.
+	 * We only use bufferheads here to check per-block state - they no
+	 * longer control the iteration through the page. This allows us to
+	 * replace the bufferhead with some other state tracking mechanism in
+	 * future.
+	 */
+	file_offset = page_offset(page);
+	bh = page_buffers(page);
+	for (poffset = 0;
+	     poffset < PAGE_SIZE;
+	     poffset += len, file_offset += len, bh = bh->b_this_page) {
+		/* past the range we are writing, so nothing more to write. */
+		if (file_offset >= end_offset)
 			break;
-		if (!buffer_uptodate(bh))
-			uptodate = 0;
 
 		/*
-		 * set_page_dirty dirties all buffers in a page, independent
-		 * of their state.  The dirty state however is entirely
-		 * meaningless for holes (!mapped && uptodate), so skip
-		 * buffers covering holes here.
+		 * Block does not contain valid data, skip it, mark the current
+		 * map as invalid because we have a discontiguity. This ensures
+		 * we put subsequent writeable buffers into a new ioend.
 		 */
-		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
-			wpc->imap_valid = false;
-			continue;
-		}
-
-		if (buffer_unwritten(bh))
-			new_type = XFS_IO_UNWRITTEN;
-		else if (buffer_delay(bh))
-			new_type = XFS_IO_DELALLOC;
-		else if (buffer_uptodate(bh))
-			new_type = XFS_IO_OVERWRITE;
-		else {
+		if (!buffer_uptodate(bh)) {
 			if (PageUptodate(page))
 				ASSERT(buffer_mapped(bh));
-			/*
-			 * This buffer is not uptodate and will not be
-			 * written to disk.  Ensure that we will put any
-			 * subsequent writeable buffers into a new
-			 * ioend.
-			 */
+			uptodate = false;
 			wpc->imap_valid = false;
 			continue;
 		}
 
-		if (xfs_is_reflink_inode(XFS_I(inode))) {
-			error = xfs_map_cow(wpc, inode, offset, &new_type);
-			if (error)
-				goto out;
-		}
-
-		if (wpc->io_type != new_type) {
-			wpc->io_type = new_type;
-			wpc->imap_valid = false;
-		}
-
+		/* Check to see if current map spans this file offset */
 		if (wpc->imap_valid)
 			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
-							 offset);
+							 file_offset);
+		/*
+		 * If we don't have a valid map, now it's time to get a new one
+		 * for this offset.  This will convert delayed allocations
+		 * (including COW ones) into real extents.  If we return without
+		 * a valid map, it means we landed in a hole and we skip the
+		 * block.
+		 */
 		if (!wpc->imap_valid) {
-			error = xfs_map_blocks(inode, offset, &wpc->imap,
-					     wpc->io_type);
+			error = xfs_map_blocks(inode, file_offset, &wpc->imap,
+					     &wpc->io_type);
 			if (error)
 				goto out;
 			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
-							 offset);
+							 file_offset);
 		}
-		if (wpc->imap_valid) {
-			lock_buffer(bh);
-			if (wpc->io_type != XFS_IO_OVERWRITE)
-				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
-			xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
-			count++;
+
+		if (!wpc->imap_valid || wpc->io_type == XFS_IO_HOLE) {
+			/*
+			 * set_page_dirty dirties all buffers in a page, independent
+			 * of their state.  The dirty state however is entirely
+			 * meaningless for holes (!mapped && uptodate), so check we did
+			 * have a buffer covering a hole here and continue.
+			 */
+			continue;
 		}
 
-	} while (offset += len, ((bh = bh->b_this_page) != head));
+		lock_buffer(bh);
+		xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
+		xfs_add_to_ioend(inode, bh, file_offset, wpc, wbc, &submit_list);
+		count++;
+	}
 
-	if (uptodate && bh == head)
+	if (uptodate && poffset == PAGE_SIZE)
 		SetPageUptodate(page);
 
 	ASSERT(wpc->ioend || list_empty(&submit_list));
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 69346d460dfa..b2ef5b661761 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -29,6 +29,7 @@ enum {
 	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
 	XFS_IO_OVERWRITE,	/* covers already allocated extent */
 	XFS_IO_COW,		/* covers copy-on-write extent */
+	XFS_IO_HOLE,		/* covers region without any block allocation */
 };
 
 #define XFS_IO_TYPES \
@@ -36,7 +37,8 @@ enum {
 	{ XFS_IO_DELALLOC,		"delalloc" }, \
 	{ XFS_IO_UNWRITTEN,		"unwritten" }, \
 	{ XFS_IO_OVERWRITE,		"overwrite" }, \
-	{ XFS_IO_COW,			"CoW" }
+	{ XFS_IO_COW,			"CoW" }, \
+	{ XFS_IO_HOLE,			"hole" }
 
 /*
  * Structure for buffered I/O completions.
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 18/33] xfs: remove the now unused XFS_BMAPI_IGSTATE flag
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (16 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 17/33] xfs: make xfs_writepage_map extent map centric Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 19/33] xfs: remove xfs_reflink_find_cow_mapping Christoph Hellwig
                   ` (15 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap.c | 6 ++----
 fs/xfs/libxfs/xfs_bmap.h | 3 ---
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 6a7c2f03ea11..30a2242a1eba 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3785,8 +3785,7 @@ xfs_bmapi_update_map(
 		   mval[-1].br_startblock != HOLESTARTBLOCK &&
 		   mval->br_startblock == mval[-1].br_startblock +
 					  mval[-1].br_blockcount &&
-		   ((flags & XFS_BMAPI_IGSTATE) ||
-			mval[-1].br_state == mval->br_state)) {
+		   mval[-1].br_state == mval->br_state) {
 		ASSERT(mval->br_startoff ==
 		       mval[-1].br_startoff + mval[-1].br_blockcount);
 		mval[-1].br_blockcount += mval->br_blockcount;
@@ -3831,7 +3830,7 @@ xfs_bmapi_read(
 
 	ASSERT(*nmap >= 1);
 	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
-			   XFS_BMAPI_IGSTATE|XFS_BMAPI_COWFORK)));
+			   XFS_BMAPI_COWFORK)));
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
 
 	if (unlikely(XFS_TEST_ERROR(
@@ -4275,7 +4274,6 @@ xfs_bmapi_write(
 
 	ASSERT(*nmap >= 1);
 	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
-	ASSERT(!(flags & XFS_BMAPI_IGSTATE));
 	ASSERT(tp != NULL ||
 	       (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
 			(XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 2b766b37096d..2c6da709a521 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -79,8 +79,6 @@ struct xfs_extent_free_item
 #define XFS_BMAPI_METADATA	0x002	/* mapping metadata not user data */
 #define XFS_BMAPI_ATTRFORK	0x004	/* use attribute fork not data */
 #define XFS_BMAPI_PREALLOC	0x008	/* preallocation op: unwritten space */
-#define XFS_BMAPI_IGSTATE	0x010	/* Ignore state - */
-					/* combine contig. space */
 #define XFS_BMAPI_CONTIG	0x020	/* must allocate only one extent */
 /*
  * unwritten extent conversion - this needs write cache flushing and no additional
@@ -121,7 +119,6 @@ struct xfs_extent_free_item
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
 	{ XFS_BMAPI_ATTRFORK,	"ATTRFORK" }, \
 	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
-	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
 	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
 	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
 	{ XFS_BMAPI_ZERO,	"ZERO" }, \
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 19/33] xfs: remove xfs_reflink_find_cow_mapping
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (17 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 18/33] xfs: remove the now unused XFS_BMAPI_IGSTATE flag Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 20/33] xfs: remove xfs_reflink_trim_irec_to_next_cow Christoph Hellwig
                   ` (14 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

We only have one caller left, and open coding the simple extent list
lookup in it allows us both to make the code more understandable and to
reuse calculations and variables already present.
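
The open-coded check reduces to a small predicate.  As a standalone
sketch with a toy extent list (simplified types; iext_lookup() is a
stand-in for xfs_iext_lookup_extent(), which returns the first extent
ending above the offset):

	#include <stdio.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct irec { long long startoff, blockcount; };

	static bool iext_lookup(const struct irec *list, size_t n,
				long long offset_fsb, struct irec *got)
	{
		for (size_t i = 0; i < n; i++) {
			if (list[i].startoff + list[i].blockcount > offset_fsb) {
				*got = list[i];
				return true;
			}
		}
		return false;
	}

	/* covered only if that extent starts at or before the offset */
	static bool covered_by_cow(const struct irec *cow, size_t n,
				   long long offset_fsb)
	{
		struct irec got;

		return iext_lookup(cow, n, offset_fsb, &got) &&
		       got.startoff <= offset_fsb;
	}

	int main(void)
	{
		const struct irec cow[] = { { 10, 5 } };  /* reservation 10..14 */

		printf("%d %d\n", covered_by_cow(cow, 1, 12),	/* 1: inside */
		       covered_by_cow(cow, 1, 8));		/* 0: before it */
		return 0;
	}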

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c    | 17 ++++++++++++-----
 fs/xfs/xfs_reflink.c | 30 ------------------------------
 fs/xfs/xfs_reflink.h |  2 --
 fs/xfs/xfs_trace.h   |  1 -
 4 files changed, 12 insertions(+), 38 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 6ad43829c89a..41616629dd13 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -385,6 +385,7 @@ xfs_map_blocks(
 	ssize_t			count = i_blocksize(inode);
 	xfs_fileoff_t		offset_fsb, end_fsb;
 	int			whichfork = XFS_DATA_FORK;
+	struct xfs_iext_cursor	icur;
 	int			error = 0;
 	int			nimaps = 1;
 
@@ -396,8 +397,18 @@ xfs_map_blocks(
 	       (ip->i_df.if_flags & XFS_IFEXTENTS));
 	ASSERT(offset <= mp->m_super->s_maxbytes);
 
+	if (offset > mp->m_super->s_maxbytes - count)
+		count = mp->m_super->s_maxbytes - offset;
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+	/*
+	 * Check if this offset is covered by a COW extent, and if so use
+	 * it directly instead of looking up anything in the data fork.
+	 */
 	if (xfs_is_reflink_inode(ip) &&
-	    xfs_reflink_find_cow_mapping(ip, offset, imap)) {
+	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, imap) &&
+	    imap->br_startoff <= offset_fsb) {
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		/*
 		 * Truncate can race with writeback since writeback doesn't
@@ -417,10 +428,6 @@ xfs_map_blocks(
 		goto done;
 	}
 
-	if (offset > mp->m_super->s_maxbytes - count)
-		count = mp->m_super->s_maxbytes - offset;
-	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
-	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
 				imap, &nimaps, XFS_BMAPI_ENTIRE);
 	if (!nimaps) {
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cdbd342a5249..3776b7bbd8c6 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -484,36 +484,6 @@ xfs_reflink_allocate_cow(
 	return error;
 }
 
-/*
- * Find the CoW reservation for a given byte offset of a file.
- */
-bool
-xfs_reflink_find_cow_mapping(
-	struct xfs_inode		*ip,
-	xfs_off_t			offset,
-	struct xfs_bmbt_irec		*imap)
-{
-	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	xfs_fileoff_t			offset_fsb;
-	struct xfs_bmbt_irec		got;
-	struct xfs_iext_cursor		icur;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
-
-	if (!xfs_is_reflink_inode(ip))
-		return false;
-	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
-	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got))
-		return false;
-	if (got.br_startoff > offset_fsb)
-		return false;
-
-	trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
-			&got);
-	*imap = got;
-	return true;
-}
-
 /*
  * Trim an extent to end at the next CoW reservation past offset_fsb.
  */
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 701487bab468..15a456492667 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -32,8 +32,6 @@ extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
 extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count);
-extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
-		struct xfs_bmbt_irec *imap);
 extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
 		xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap);
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8955254b900e..aa284f840d33 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3220,7 +3220,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
 DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
 
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
-DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec);
 
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 20/33] xfs: remove xfs_reflink_trim_irec_to_next_cow
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (18 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 19/33] xfs: remove xfs_reflink_find_cow_mapping Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 21/33] xfs: simplify xfs_map_blocks by using xfs_iext_lookup_extent directly Christoph Hellwig
                   ` (13 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

In the only caller we just did a lookup in the COW extent tree for
the same offset.  Reuse that result and save a lookup, as well as
shortening the ilock hold time.
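
A worked example of the inlined trim with hypothetical numbers: given an
overwrite extent covering fsb 100..119 and a COW reservation starting at
fsb 112, the map is cut back to 100..111, so the next xfs_map_blocks()
call lands on the COW extent.

	#include <stdio.h>
	#include <stdbool.h>

	struct irec { long long startoff, blockcount; };

	/* simplified form of the trim now done inline in xfs_map_blocks() */
	static void trim_to_next_cow(struct irec *imap, bool cow_valid,
				     long long cow_fsb)
	{
		if (cow_valid && cow_fsb < imap->startoff + imap->blockcount)
			imap->blockcount = cow_fsb - imap->startoff;
	}

	int main(void)
	{
		struct irec imap = { 100, 20 };		/* fsb 100..119 */

		trim_to_next_cow(&imap, true, 112);	/* COW starts at 112 */
		printf("%lld..%lld\n", imap.startoff,
		       imap.startoff + imap.blockcount - 1);	/* 100..111 */
		return 0;
	}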

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c    | 25 +++++++++++++++++--------
 fs/xfs/xfs_reflink.c | 33 ---------------------------------
 fs/xfs/xfs_reflink.h |  2 --
 3 files changed, 17 insertions(+), 43 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 41616629dd13..09fb10be1256 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -383,11 +383,12 @@ xfs_map_blocks(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			count = i_blocksize(inode);
-	xfs_fileoff_t		offset_fsb, end_fsb;
+	xfs_fileoff_t		offset_fsb, end_fsb, cow_fsb = 0;
 	int			whichfork = XFS_DATA_FORK;
 	struct xfs_iext_cursor	icur;
 	int			error = 0;
 	int			nimaps = 1;
+	bool			cow_valid = false;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
@@ -407,8 +408,11 @@ xfs_map_blocks(
 	 * it directly instead of looking up anything in the data fork.
 	 */
 	if (xfs_is_reflink_inode(ip) &&
-	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, imap) &&
-	    imap->br_startoff <= offset_fsb) {
+	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, imap)) {
+		cow_fsb = imap->br_startoff;
+		cow_valid = true;
+	}
+	if (cow_valid && cow_fsb <= offset_fsb) {
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		/*
 		 * Truncate can race with writeback since writeback doesn't
@@ -430,6 +434,10 @@ xfs_map_blocks(
 
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
 				imap, &nimaps, XFS_BMAPI_ENTIRE);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	if (error)
+		return error;
+
 	if (!nimaps) {
 		/*
 		 * Lookup returns no match? Beyond eof? regardless,
@@ -451,16 +459,17 @@ xfs_map_blocks(
 		 * is a pending CoW reservation before the end of this extent,
 		 * so that we pick up the COW extents in the next iteration.
 		 */
-		xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);
+		if (cow_valid &&
+		    cow_fsb < imap->br_startoff + imap->br_blockcount) {
+			imap->br_blockcount = cow_fsb - imap->br_startoff;
+			trace_xfs_reflink_trim_irec(ip, imap);
+		}
+
 		if (imap->br_state == XFS_EXT_UNWRITTEN)
 			*type = XFS_IO_UNWRITTEN;
 		else
 			*type = XFS_IO_OVERWRITE;
 	}
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-	if (error)
-		return error;
-
 done:
 	switch (*type) {
 	case XFS_IO_HOLE:
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 3776b7bbd8c6..8231109f6256 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -484,39 +484,6 @@ xfs_reflink_allocate_cow(
 	return error;
 }
 
-/*
- * Trim an extent to end at the next CoW reservation past offset_fsb.
- */
-void
-xfs_reflink_trim_irec_to_next_cow(
-	struct xfs_inode		*ip,
-	xfs_fileoff_t			offset_fsb,
-	struct xfs_bmbt_irec		*imap)
-{
-	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	struct xfs_bmbt_irec		got;
-	struct xfs_iext_cursor		icur;
-
-	if (!xfs_is_reflink_inode(ip))
-		return;
-
-	/* Find the extent in the CoW fork. */
-	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got))
-		return;
-
-	/* This is the extent before; try sliding up one. */
-	if (got.br_startoff < offset_fsb) {
-		if (!xfs_iext_next_extent(ifp, &icur, &got))
-			return;
-	}
-
-	if (got.br_startoff >= imap->br_startoff + imap->br_blockcount)
-		return;
-
-	imap->br_blockcount = got.br_startoff - imap->br_startoff;
-	trace_xfs_reflink_trim_irec(ip, imap);
-}
-
 /*
  * Cancel CoW reservations for some block range of an inode.
  *
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 15a456492667..e8d4d50c629f 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -32,8 +32,6 @@ extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
 extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count);
-extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
-		xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap);
 
 extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
 		struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 21/33] xfs: simplify xfs_map_blocks by using xfs_iext_lookup_extent directly
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (19 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 20/33] xfs: remove xfs_reflink_trim_irec_to_next_cow Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 22/33] xfs: don't clear imap_valid for non-uptodate buffers Christoph Hellwig
                   ` (12 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

xfs_bmapi_read adds no value in xfs_map_blocks.  Replace it with a
direct call to the low-level extent lookup function.
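
A standalone sketch of the hole synthesis this relies on (simplified
types and a toy extent list): a failed lookup is turned into a fake
extent starting at end_fsb, and any extent starting above the offset is
reported back as a hole covering the gap.

	#include <stdio.h>
	#include <stdbool.h>

	#define HOLESTARTBLOCK	-2LL	/* modelled sentinel */

	struct irec { long long startoff, blockcount, startblock; };

	/* toy extent list: one real extent at fsb 8..11 */
	static const struct irec extents[] = { { 8, 4, 1000 } };

	static bool iext_lookup(long long offset_fsb, struct irec *got)
	{
		for (unsigned i = 0; i < sizeof(extents) / sizeof(*extents); i++) {
			if (extents[i].startoff + extents[i].blockcount > offset_fsb) {
				*got = extents[i];
				return true;
			}
		}
		return false;
	}

	static void lookup_for_writeback(long long offset_fsb, long long end_fsb,
					 struct irec *imap)
	{
		if (!iext_lookup(offset_fsb, imap))
			imap->startoff = end_fsb;	/* fake a hole past EOF */

		if (imap->startoff > offset_fsb) {
			/* landed in a hole or beyond EOF */
			imap->blockcount = imap->startoff - offset_fsb;
			imap->startoff = offset_fsb;
			imap->startblock = HOLESTARTBLOCK;
		}
	}

	int main(void)
	{
		struct irec imap = { 0 };

		lookup_for_writeback(0, 16, &imap);
		printf("%lld+%lld at %lld\n", imap.startoff,	/* 0+8 at -2: */
		       imap.blockcount, imap.startblock);	/* hole 0..7 */
		return 0;
	}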

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 09fb10be1256..07d5255a0f9f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -387,7 +387,6 @@ xfs_map_blocks(
 	int			whichfork = XFS_DATA_FORK;
 	struct xfs_iext_cursor	icur;
 	int			error = 0;
-	int			nimaps = 1;
 	bool			cow_valid = false;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
@@ -432,24 +431,16 @@ xfs_map_blocks(
 		goto done;
 	}
 
-	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
-				imap, &nimaps, XFS_BMAPI_ENTIRE);
+	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, imap))
+		imap->br_startoff = end_fsb;	/* fake a hole past EOF */
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-	if (error)
-		return error;
 
-	if (!nimaps) {
-		/*
-		 * Lookup returns no match? Beyond eof? regardless,
-		 * return it as a hole so we don't write it
-		 */
+	if (imap->br_startoff > offset_fsb) {
+		/* landed in a hole or beyond EOF */
+		imap->br_blockcount = imap->br_startoff - offset_fsb;
 		imap->br_startoff = offset_fsb;
-		imap->br_blockcount = end_fsb - offset_fsb;
 		imap->br_startblock = HOLESTARTBLOCK;
 		*type = XFS_IO_HOLE;
-	} else if (imap->br_startblock == HOLESTARTBLOCK) {
-		/* landed in a hole */
-		*type = XFS_IO_HOLE;
 	} else if (isnullstartblock(imap->br_startblock)) {
 		/* got a delalloc extent */
 		*type = XFS_IO_DELALLOC;
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 22/33] xfs: don't clear imap_valid for non-uptodate buffers
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (20 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 21/33] xfs: simplify xfs_map_blocks by using xfs_iext_lookup_extent directly Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 23/33] xfs: remove the imap_valid flag Christoph Hellwig
                   ` (11 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Finding a buffer that isn't uptodate doesn't invalidate the mapping for
any given block.  The last_block check will already take care of starting
another ioend as soon as we find any non-uptodate buffer, and if the current
mapping doesn't include the next uptodate buffer the xfs_imap_valid check
will take care of it.
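
The contiguity test this relies on, as a minimal sketch (simplified from
the checks in xfs_add_to_ioend()): a block only joins the current ioend
if it continues it both on disk and in the file, so a skipped
non-uptodate block naturally forces a new ioend without the cached map
being invalidated.

	#include <stdio.h>
	#include <stdbool.h>

	struct ioend { long long io_offset, io_size; };

	static bool can_append(const struct ioend *io, long long last_block,
			       long long blocknr, long long offset)
	{
		return blocknr == last_block + 1 &&
		       offset == io->io_offset + io->io_size;
	}

	int main(void)
	{
		struct ioend io = { 4096, 4096 };	/* one 4k block at 4096 */

		/* a contiguous block appends, a skipped block leaves a gap */
		printf("%d %d\n", can_append(&io, 16, 17, 8192),	/* 1 */
		       can_append(&io, 16, 17, 12288));			/* 0 */
		return 0;
	}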

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 07d5255a0f9f..5da2e99b0559 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -859,15 +859,12 @@ xfs_writepage_map(
 			break;
 
 		/*
-		 * Block does not contain valid data, skip it, mark the current
-		 * map as invalid because we have a discontiguity. This ensures
-		 * we put subsequent writeable buffers into a new ioend.
+		 * Block does not contain valid data, skip it.
 		 */
 		if (!buffer_uptodate(bh)) {
 			if (PageUptodate(page))
 				ASSERT(buffer_mapped(bh));
 			uptodate = false;
-			wpc->imap_valid = false;
 			continue;
 		}
 
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 23/33] xfs: remove the imap_valid flag
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (21 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 22/33] xfs: don't clear imap_valid for non-uptodate buffers Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 24/33] xfs: don't look at buffer heads in xfs_add_to_ioend Christoph Hellwig
                   ` (10 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Simplify the way we check for a valid imap - we know we have a valid
mapping after xfs_map_blocks returned successfully, and we know we can
call xfs_imap_valid on any imap, as it will always fail on a
zero-initialized map.
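
The property being relied on, as a tiny standalone check (fs-block
based, simplified from the byte-based xfs_imap_valid()): a zero-filled
map has br_blockcount == 0, so no offset can ever fall inside it.

	#include <assert.h>
	#include <string.h>

	struct irec { long long startoff, blockcount; };

	static int imap_valid(const struct irec *imap, long long offset_fsb)
	{
		return offset_fsb >= imap->startoff &&
		       offset_fsb < imap->startoff + imap->blockcount;
	}

	int main(void)
	{
		struct irec imap;

		memset(&imap, 0, sizeof(imap));	/* zero-initialized map */
		assert(!imap_valid(&imap, 0));	/* can never be valid */
		return 0;
	}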

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 5da2e99b0559..7ebd686cb723 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -42,7 +42,6 @@
  */
 struct xfs_writepage_ctx {
 	struct xfs_bmbt_irec    imap;
-	bool			imap_valid;
 	unsigned int		io_type;
 	struct xfs_ioend	*ioend;
 	sector_t		last_block;
@@ -868,10 +867,6 @@ xfs_writepage_map(
 			continue;
 		}
 
-		/* Check to see if current map spans this file offset */
-		if (wpc->imap_valid)
-			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
-							 file_offset);
 		/*
 		 * If we don't have a valid map, now it's time to get a new one
 		 * for this offset.  This will convert delayed allocations
@@ -879,16 +874,14 @@ xfs_writepage_map(
 		 * a valid map, it means we landed in a hole and we skip the
 		 * block.
 		 */
-		if (!wpc->imap_valid) {
+		if (!xfs_imap_valid(inode, &wpc->imap, file_offset)) {
 			error = xfs_map_blocks(inode, file_offset, &wpc->imap,
 					     &wpc->io_type);
 			if (error)
 				goto out;
-			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
-							 file_offset);
 		}
 
-		if (!wpc->imap_valid || wpc->io_type == XFS_IO_HOLE) {
+		if (wpc->io_type == XFS_IO_HOLE) {
 			/*
 			 * set_page_dirty dirties all buffers in a page, independent
 			 * of their state.  The dirty state however is entirely
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 24/33] xfs: don't look at buffer heads in xfs_add_to_ioend
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (22 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 23/33] xfs: remove the imap_valid flag Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 25/33] xfs: move all writeback buffer_head manipulation into xfs_map_at_offset Christoph Hellwig
                   ` (9 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Calculate all information for the bio directly from the passed-in
arguments, without requiring a buffer_head structure.
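
The sector arithmetic this boils down to, as a standalone sketch (4k
filesystem blocks assumed; in the real code xfs_fsb_to_db() also
accounts for the realtime device):

	#include <stdio.h>

	#define BLKBITS		12	/* assumed 4k fs blocks */
	#define FSB_TO_B(fsb)	((long long)(fsb) << BLKBITS)
	#define FSB_TO_DB(fsb)	((long long)(fsb) << (BLKBITS - 9))

	/*
	 * 512-byte sector for a file byte offset inside the mapped extent,
	 * computed from the extent map instead of from bh->b_blocknr.
	 */
	static long long map_sector(long long br_startoff, long long br_startblock,
				    long long offset)
	{
		return FSB_TO_DB(br_startblock) +
		       ((offset - FSB_TO_B(br_startoff)) >> 9);
	}

	int main(void)
	{
		/* file fsb 2 maps to disk fsb 100; byte 12288 is file fsb 3 */
		printf("%lld\n", map_sector(2, 100, 12288)); /* 808 = fsb 101 */
		return 0;
	}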

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 68 ++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 36 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 7ebd686cb723..f6d28e6aa911 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -44,7 +44,6 @@ struct xfs_writepage_ctx {
 	struct xfs_bmbt_irec    imap;
 	unsigned int		io_type;
 	struct xfs_ioend	*ioend;
-	sector_t		last_block;
 };
 
 void
@@ -545,11 +544,6 @@ xfs_start_page_writeback(
 	unlock_page(page);
 }
 
-static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
-{
-	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
-}
-
 /*
  * Submit the bio for an ioend. We are passed an ioend with a bio attached to
  * it, and we submit that bio. The ioend may be used for multiple bio
@@ -604,27 +598,20 @@ xfs_submit_ioend(
 	return 0;
 }
 
-static void
-xfs_init_bio_from_bh(
-	struct bio		*bio,
-	struct buffer_head	*bh)
-{
-	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-	bio_set_dev(bio, bh->b_bdev);
-}
-
 static struct xfs_ioend *
 xfs_alloc_ioend(
 	struct inode		*inode,
 	unsigned int		type,
 	xfs_off_t		offset,
-	struct buffer_head	*bh)
+	struct block_device	*bdev,
+	sector_t		sector)
 {
 	struct xfs_ioend	*ioend;
 	struct bio		*bio;
 
 	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
-	xfs_init_bio_from_bh(bio, bh);
+	bio_set_dev(bio, bdev);
+	bio->bi_iter.bi_sector = sector;
 
 	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
 	INIT_LIST_HEAD(&ioend->io_list);
@@ -649,13 +636,14 @@ static void
 xfs_chain_bio(
 	struct xfs_ioend	*ioend,
 	struct writeback_control *wbc,
-	struct buffer_head	*bh)
+	struct block_device	*bdev,
+	sector_t		sector)
 {
 	struct bio *new;
 
 	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
-	xfs_init_bio_from_bh(new, bh);
-
+	bio_set_dev(new, bdev);
+	new->bi_iter.bi_sector = sector;
 	bio_chain(ioend->io_bio, new);
 	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
 	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
@@ -665,39 +653,45 @@ xfs_chain_bio(
 }
 
 /*
- * Test to see if we've been building up a completion structure for
- * earlier buffers -- if so, we try to append to this ioend if we
- * can, otherwise we finish off any current ioend and start another.
- * Return the ioend we finished off so that the caller can submit it
- * once it has finished processing the dirty page.
+ * Test to see if we have an existing ioend structure that we could append to
+ * first, otherwise finish off the current ioend and start another.
  */
 STATIC void
 xfs_add_to_ioend(
 	struct inode		*inode,
-	struct buffer_head	*bh,
 	xfs_off_t		offset,
+	struct page		*page,
 	struct xfs_writepage_ctx *wpc,
 	struct writeback_control *wbc,
 	struct list_head	*iolist)
 {
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
+	unsigned		len = i_blocksize(inode);
+	unsigned		poff = offset & (PAGE_SIZE - 1);
+	sector_t		sector;
+
+	sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
+		((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
+
 	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
-	    bh->b_blocknr != wpc->last_block + 1 ||
+	    sector != bio_end_sector(wpc->ioend->io_bio) ||
 	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
 		if (wpc->ioend)
 			list_add(&wpc->ioend->io_list, iolist);
-		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
+		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
+				bdev, sector);
 	}
 
 	/*
-	 * If the buffer doesn't fit into the bio we need to allocate a new
-	 * one.  This shouldn't happen more than once for a given buffer.
+	 * If the block doesn't fit into the bio we need to allocate a new
+	 * one.  This shouldn't happen more than once for a given block.
 	 */
-	while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
-		xfs_chain_bio(wpc->ioend, wbc, bh);
+	while (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len)
+		xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
 
-	wpc->ioend->io_size += bh->b_size;
-	wpc->last_block = bh->b_blocknr;
-	xfs_start_buffer_writeback(bh);
+	wpc->ioend->io_size += len;
 }
 
 STATIC void
@@ -893,7 +887,9 @@ xfs_writepage_map(
 
 		lock_buffer(bh);
 		xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
-		xfs_add_to_ioend(inode, bh, file_offset, wpc, wbc, &submit_list);
+		xfs_add_to_ioend(inode, file_offset, page, wpc, wbc,
+				&submit_list);
+		xfs_start_buffer_writeback(bh);
 		count++;
 	}
 
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 25/33] xfs: move all writeback buffer_head manipulation into xfs_map_at_offset
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (23 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 24/33] xfs: don't look at buffer heads in xfs_add_to_ioend Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 26/33] xfs: allow writeback on pages without buffer heads Christoph Hellwig
                   ` (8 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

This keeps it in a single place so it can be made optional more easily.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f6d28e6aa911..c76c943473be 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -505,21 +505,6 @@ xfs_imap_valid(
 		offset < imap->br_startoff + imap->br_blockcount;
 }
 
-STATIC void
-xfs_start_buffer_writeback(
-	struct buffer_head	*bh)
-{
-	ASSERT(buffer_mapped(bh));
-	ASSERT(buffer_locked(bh));
-	ASSERT(!buffer_delay(bh));
-	ASSERT(!buffer_unwritten(bh));
-
-	bh->b_end_io = NULL;
-	set_buffer_async_write(bh);
-	set_buffer_uptodate(bh);
-	clear_buffer_dirty(bh);
-}
-
 STATIC void
 xfs_start_page_writeback(
 	struct page		*page,
@@ -728,6 +713,7 @@ xfs_map_at_offset(
 	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 
+	lock_buffer(bh);
 	xfs_map_buffer(inode, bh, imap, offset);
 	set_buffer_mapped(bh);
 	clear_buffer_delay(bh);
@@ -740,6 +726,10 @@ xfs_map_at_offset(
 	 * set the bdev now.
 	 */
 	bh->b_bdev = xfs_find_bdev_for_inode(inode);
+	bh->b_end_io = NULL;
+	set_buffer_async_write(bh);
+	set_buffer_uptodate(bh);
+	clear_buffer_dirty(bh);
 }
 
 STATIC void
@@ -885,11 +875,9 @@ xfs_writepage_map(
 			continue;
 		}
 
-		lock_buffer(bh);
 		xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
 		xfs_add_to_ioend(inode, file_offset, page, wpc, wbc,
 				&submit_list);
-		xfs_start_buffer_writeback(bh);
 		count++;
 	}
 
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 26/33] xfs: allow writeback on pages without buffer heads
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (24 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 25/33] xfs: move all writeback buffer_head manipulation into xfs_map_at_offset Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 27/33] xfs: remove xfs_start_page_writeback Christoph Hellwig
                   ` (7 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

We'll soon allow these through changes in the iomap write_begin and
page_mkwrite implementations, so get ready for them.  After the previous
refactoring this is as simple as not maintaining the bh variable if
the page doesn't have private data, skipping the non-uptodate buffer
check in that case in the writepage path, and adding a new per-page
I/O completion handler that skips all buffer head manipulation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 47 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index c76c943473be..879599f723b6 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -91,6 +91,19 @@ xfs_find_daxdev_for_inode(
 		return mp->m_ddev_targp->bt_daxdev;
 }
 
+static void
+xfs_finish_page_writeback(
+	struct inode		*inode,
+	struct bio_vec		*bvec,
+	int			error)
+{
+	if (error) {
+		SetPageError(bvec->bv_page);
+		mapping_set_error(inode->i_mapping, -EIO);
+	}
+	end_page_writeback(bvec->bv_page);
+}
+
 /*
  * We're now finished for good with this page.  Update the page state via the
  * associated buffer_heads, paying attention to the start and end offsets that
@@ -103,7 +116,7 @@ xfs_find_daxdev_for_inode(
  * and buffers potentially freed after every call to end_buffer_async_write.
  */
 static void
-xfs_finish_page_writeback(
+xfs_finish_buffer_writeback(
 	struct inode		*inode,
 	struct bio_vec		*bvec,
 	int			error)
@@ -178,9 +191,12 @@ xfs_destroy_ioend(
 			next = bio->bi_private;
 
 		/* walk each page on bio, ending page IO on them */
-		bio_for_each_segment_all(bvec, bio, i)
-			xfs_finish_page_writeback(inode, bvec, error);
-
+		bio_for_each_segment_all(bvec, bio, i) {
+			if (page_has_buffers(bvec->bv_page))
+				xfs_finish_buffer_writeback(inode, bvec, error);
+			else
+				xfs_finish_page_writeback(inode, bvec, error);
+		}
 		bio_put(bio);
 	}
 
@@ -816,7 +832,7 @@ xfs_writepage_map(
 {
 	LIST_HEAD(submit_list);
 	struct xfs_ioend	*ioend, *next;
-	struct buffer_head	*bh;
+	struct buffer_head	*bh = NULL;
 	ssize_t			len = i_blocksize(inode);
 	int			error = 0;
 	int			count = 0;
@@ -824,6 +840,9 @@ xfs_writepage_map(
 	loff_t			file_offset;	/* file offset of page */
 	unsigned		poffset;	/* offset into page */
 
+	if (page_has_buffers(page))
+		bh = page_buffers(page);
+
 	/*
 	 * Walk the blocks on the page, and if we run off the end of the
 	 * current map or find the current map invalid, grab a new one.
@@ -832,11 +851,9 @@ xfs_writepage_map(
 	 * replace the bufferhead with some other state tracking mechanism in
 	 * future.
 	 */
-	file_offset = page_offset(page);
-	bh = page_buffers(page);
-	for (poffset = 0;
+	for (poffset = 0, file_offset = page_offset(page);
 	     poffset < PAGE_SIZE;
-	     poffset += len, file_offset += len, bh = bh->b_this_page) {
+	     poffset += len, file_offset += len) {
 		/* past the range we are writing, so nothing more to write. */
 		if (file_offset >= end_offset)
 			break;
@@ -844,10 +861,11 @@ xfs_writepage_map(
 		/*
 		 * Block does not contain valid data, skip it.
 		 */
-		if (!buffer_uptodate(bh)) {
+		if (bh && !buffer_uptodate(bh)) {
 			if (PageUptodate(page))
 				ASSERT(buffer_mapped(bh));
 			uptodate = false;
+			bh = bh->b_this_page;
 			continue;
 		}
 
@@ -872,10 +890,15 @@ xfs_writepage_map(
 			 * meaningless for holes (!mapped && uptodate), so check we did
 			 * have a buffer covering a hole here and continue.
 			 */
+			if (bh)
+				bh = bh->b_this_page;
 			continue;
 		}
 
-		xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
+		if (bh) {
+			xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
+			bh = bh->b_this_page;
+		}
 		xfs_add_to_ioend(inode, file_offset, page, wpc, wbc,
 				&submit_list);
 		count++;
@@ -960,8 +983,6 @@ xfs_do_writepage(
 
 	trace_xfs_writepage(inode, page, 0, 0);
 
-	ASSERT(page_has_buffers(page));
-
 	/*
 	 * Refuse to write the page out if we are called from reclaim context.
 	 *
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 27/33] xfs: remove xfs_start_page_writeback
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (25 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 26/33] xfs: allow writeback on pages without buffer heads Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 28/33] xfs: refactor the tail of xfs_writepage_map Christoph Hellwig
                   ` (6 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

This helper only has two callers, one of them with a constant error
argument.  Remove it to make pending changes to the code a little easier.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 47 +++++++++++++++++++++--------------------------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 879599f723b6..6b39792270aa 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -521,30 +521,6 @@ xfs_imap_valid(
 		offset < imap->br_startoff + imap->br_blockcount;
 }
 
-STATIC void
-xfs_start_page_writeback(
-	struct page		*page,
-	int			clear_dirty)
-{
-	ASSERT(PageLocked(page));
-	ASSERT(!PageWriteback(page));
-
-	/*
-	 * if the page was not fully cleaned, we need to ensure that the higher
-	 * layers come back to it correctly. That means we need to keep the page
-	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
-	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
-	 * write this page in this writeback sweep will be made.
-	 */
-	if (clear_dirty) {
-		clear_page_dirty_for_io(page);
-		set_page_writeback(page);
-	} else
-		set_page_writeback_keepwrite(page);
-
-	unlock_page(page);
-}
-
 /*
  * Submit the bio for an ioend. We are passed an ioend with a bio attached to
  * it, and we submit that bio. The ioend may be used for multiple bio
@@ -910,6 +886,9 @@ xfs_writepage_map(
 	ASSERT(wpc->ioend || list_empty(&submit_list));
 
 out:
+	ASSERT(PageLocked(page));
+	ASSERT(!PageWriteback(page));
+
 	/*
 	 * On error, we have to fail the ioend here because we have locked
 	 * buffers in the ioend. If we don't do this, we'll deadlock
@@ -928,7 +907,21 @@ xfs_writepage_map(
 	 * treated correctly on error.
 	 */
 	if (count) {
-		xfs_start_page_writeback(page, !error);
+		/*
+		 * If the page was not fully cleaned, we need to ensure that the
+		 * higher layers come back to it correctly.  That means we need
+		 * to keep the page dirty, and for WB_SYNC_ALL writeback we need
+		 * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
+		 * so another attempt to write this page in this writeback sweep
+		 * will be made.
+		 */
+		if (error) {
+			set_page_writeback_keepwrite(page);
+		} else {
+			clear_page_dirty_for_io(page);
+			set_page_writeback(page);
+		}
+		unlock_page(page);
 
 		/*
 		 * Preserve the original error if there was one, otherwise catch
@@ -953,7 +946,9 @@ xfs_writepage_map(
 		 * race with a partial page truncate on a sub-page block sized
 		 * filesystem. In that case we need to mark the page clean.
 		 */
-		xfs_start_page_writeback(page, 1);
+		clear_page_dirty_for_io(page);
+		set_page_writeback(page);
+		unlock_page(page);
 		end_page_writeback(page);
 	}
 
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 28/33] xfs: refactor the tail of xfs_writepage_map
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (26 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 27/33] xfs: remove xfs_start_page_writeback Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 29/33] xfs: do not set the page uptodate in xfs_writepage_map Christoph Hellwig
                   ` (5 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Rejuggle how we deal with the different error vs non-error and
have-ioends vs no-ioends cases to keep the fast path streamlined and
the duplicated code to a minimum.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 65 +++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 33 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 6b39792270aa..dc82d4e71a64 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -906,7 +906,14 @@ xfs_writepage_map(
 	 * submission of outstanding ioends on the writepage context so they are
 	 * treated correctly on error.
 	 */
-	if (count) {
+	if (unlikely(error)) {
+		if (!count) {
+			xfs_aops_discard_page(page);
+			ClearPageUptodate(page);
+			unlock_page(page);
+			goto done;
+		}
+
 		/*
 		 * If the page was not fully cleaned, we need to ensure that the
 		 * higher layers come back to it correctly.  That means we need
@@ -915,43 +922,35 @@ xfs_writepage_map(
 		 * so another attempt to write this page in this writeback sweep
 		 * will be made.
 		 */
-		if (error) {
-			set_page_writeback_keepwrite(page);
-		} else {
-			clear_page_dirty_for_io(page);
-			set_page_writeback(page);
-		}
-		unlock_page(page);
-
-		/*
-		 * Preserve the original error if there was one, otherwise catch
-		 * submission errors here and propagate into subsequent ioend
-		 * submissions.
-		 */
-		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
-			int error2;
-
-			list_del_init(&ioend->io_list);
-			error2 = xfs_submit_ioend(wbc, ioend, error);
-			if (error2 && !error)
-				error = error2;
-		}
-	} else if (error) {
-		xfs_aops_discard_page(page);
-		ClearPageUptodate(page);
-		unlock_page(page);
+		set_page_writeback_keepwrite(page);
 	} else {
-		/*
-		 * We can end up here with no error and nothing to write if we
-		 * race with a partial page truncate on a sub-page block sized
-		 * filesystem. In that case we need to mark the page clean.
-		 */
 		clear_page_dirty_for_io(page);
 		set_page_writeback(page);
-		unlock_page(page);
-		end_page_writeback(page);
 	}
 
+	unlock_page(page);
+
+	/*
+	 * Preserve the original error if there was one, otherwise catch
+	 * submission errors here and propagate into subsequent ioend
+	 * submissions.
+	 */
+	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+		int error2;
+
+		list_del_init(&ioend->io_list);
+		error2 = xfs_submit_ioend(wbc, ioend, error);
+		if (error2 && !error)
+			error = error2;
+	}
+
+	/*
+	 * We can end up here with no error and nothing to write if we race with
+	 * a partial page truncate on a sub-page block sized filesystem.
+	 */
+	if (!count)
+		end_page_writeback(page);
+done:
 	mapping_set_error(page->mapping, error);
 	return error;
 }
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 29/33] xfs: do not set the page uptodate in xfs_writepage_map
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (27 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 28/33] xfs: refactor the tail of xfs_writepage_map Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 30/33] iomap: add initial support for writes without buffer heads Christoph Hellwig
                   ` (4 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

We already track the page uptodate status based on the buffer uptodate
status, which is updated whenever reading or zeroing blocks.

This code has been there since a ptool commit in 2002, which
claims to:

    "merge" the 2.4 fsx fix for block size < page size to 2.5.  This needed
    major changes to actually fit.

and isn't present in other writepage implementations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index dc82d4e71a64..dc92f23b0ea4 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -812,7 +812,6 @@ xfs_writepage_map(
 	ssize_t			len = i_blocksize(inode);
 	int			error = 0;
 	int			count = 0;
-	bool			uptodate = true;
 	loff_t			file_offset;	/* file offset of page */
 	unsigned		poffset;	/* offset into page */
 
@@ -840,7 +839,6 @@ xfs_writepage_map(
 		if (bh && !buffer_uptodate(bh)) {
 			if (PageUptodate(page))
 				ASSERT(buffer_mapped(bh));
-			uptodate = false;
 			bh = bh->b_this_page;
 			continue;
 		}
@@ -880,9 +878,6 @@ xfs_writepage_map(
 		count++;
 	}
 
-	if (uptodate && poffset == PAGE_SIZE)
-		SetPageUptodate(page);
-
 	ASSERT(wpc->ioend || list_empty(&submit_list));
 
 out:
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 30/33] iomap: add initial support for writes without buffer heads
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (28 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 29/33] xfs: do not set the page uptodate in xfs_writepage_map Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 31/33] iomap: add support for sub-pagesize buffered I/O " Christoph Hellwig
                   ` (3 subsequent siblings)
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

For now this is limited to blocksize == PAGE_SIZE, where we can simply
read in the full page in write begin, and just set the whole page dirty
after copying data into it.  This code is enabled by default and XFS
will now be fed pages without buffer heads in ->writepage and
->writepages.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 120 insertions(+), 9 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 967bd31540fe..a3861945504f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -308,6 +308,56 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
 }
 
+static int
+iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
+		unsigned poff, unsigned plen, struct iomap *iomap)
+{
+	struct bio_vec bvec;
+	struct bio bio;
+	int ret;
+
+	bio_init(&bio, &bvec, 1);
+	bio.bi_opf = REQ_OP_READ;
+	bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
+	bio_set_dev(&bio, iomap->bdev);
+	__bio_add_page(&bio, page, plen, poff);
+	ret = submit_bio_wait(&bio);
+	if (ret < 0 && iomap_block_needs_zeroing(inode, block_start, iomap))
+		zero_user(page, poff, plen);
+	return ret;
+}
+
+static int
+__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
+		struct page *page, struct iomap *iomap)
+{
+	loff_t block_size = i_blocksize(inode);
+	loff_t block_start = pos & ~(block_size - 1);
+	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
+	unsigned poff = block_start & (PAGE_SIZE - 1);
+	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);
+	int status;
+
+	if (PageUptodate(page))
+		return 0;
+
+	if (iomap_block_needs_zeroing(inode, block_start, iomap)) {
+		unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
+		unsigned pend = poff + plen;
+
+		if (poff < from || pend > to)
+			zero_user_segments(page, poff, from, to, pend);
+	} else {
+		status = iomap_read_page_sync(inode, block_start, page,
+				poff, plen, iomap);
+		if (status < 0)
+			return status;
+		SetPageUptodate(page);
+	}
+
+	return 0;
+}
+
 static int
 iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, struct iomap *iomap)
@@ -325,7 +375,10 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 	if (!page)
 		return -ENOMEM;
 
-	status = __block_write_begin_int(page, pos, len, NULL, iomap);
+	if (i_blocksize(inode) == PAGE_SIZE)
+		status = __iomap_write_begin(inode, pos, len, page, iomap);
+	else
+		status = __block_write_begin_int(page, pos, len, NULL, iomap);
 	if (unlikely(status)) {
 		unlock_page(page);
 		put_page(page);
@@ -338,12 +391,63 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 	return status;
 }
 
+static int
+iomap_set_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	int newly_dirty;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	/*
+	 * Lock out page->mem_cgroup migration to keep PageDirty
+	 * synchronized with per-memcg dirty page counters.
+	 */
+	lock_page_memcg(page);
+	newly_dirty = !TestSetPageDirty(page);
+	if (newly_dirty)
+		__set_page_dirty(page, mapping, 0);
+	unlock_page_memcg(page);
+
+	if (newly_dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	return newly_dirty;
+}
+
+static int
+__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+		unsigned copied, struct page *page, struct iomap *iomap)
+{
+	unsigned start = pos & (PAGE_SIZE - 1);
+	int ret;
+
+	if (unlikely(copied < len)) {
+		/* see block_write_end() for an explanation */
+		if (!PageUptodate(page))
+			copied = 0;
+		if (iomap_block_needs_zeroing(inode, pos, iomap))
+			zero_user(page, start + copied, len - copied);
+	}
+
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	iomap_set_page_dirty(page);
+	ret = __generic_write_end(inode, pos, copied, page);
+	if (ret < len)
+		iomap_write_failed(inode, pos, len);
+	return ret;
+}
+
 static int
 iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
-		unsigned copied, struct page *page)
+		unsigned copied, struct page *page, struct iomap *iomap)
 {
 	int ret;
 
+	if (i_blocksize(inode) == PAGE_SIZE)
+		return __iomap_write_end(inode, pos, len, copied, page, iomap);
+
 	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
 			copied, page, NULL);
 	if (ret < len)
@@ -400,7 +504,8 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 		flush_dcache_page(page);
 
-		status = iomap_write_end(inode, pos, bytes, copied, page);
+		status = iomap_write_end(inode, pos, bytes, copied, page,
+				iomap);
 		if (unlikely(status < 0))
 			break;
 		copied = status;
@@ -494,7 +599,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 		WARN_ON_ONCE(!PageUptodate(page));
 
-		status = iomap_write_end(inode, pos, bytes, bytes, page);
+		status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
 		if (unlikely(status <= 0)) {
 			if (WARN_ON_ONCE(status == 0))
 				return -EIO;
@@ -546,7 +651,7 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
 	zero_user(page, offset, bytes);
 	mark_page_accessed(page);
 
-	return iomap_write_end(inode, pos, bytes, bytes, page);
+	return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
 }
 
 static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
@@ -632,11 +737,14 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 	struct page *page = data;
 	int ret;
 
-	ret = __block_write_begin_int(page, pos, length, NULL, iomap);
-	if (ret)
-		return ret;
+	if (i_blocksize(inode) != PAGE_SIZE) {
+		ret = __block_write_begin_int(page, pos, length, NULL, iomap);
+		if (ret)
+			return ret;
+
+		block_commit_write(page, 0, length);
+	}
 
-	block_commit_write(page, 0, length);
 	return length;
 }
 
@@ -663,6 +771,9 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
 	else
 		length = PAGE_SIZE;
 
+	if (i_blocksize(inode) == PAGE_SIZE)
+		WARN_ON_ONCE(!PageUptodate(page));
+
 	offset = page_offset(page);
 	while (length > 0) {
 		ret = iomap_apply(inode, offset, length,
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 31/33] iomap: add support for sub-pagesize buffered I/O without buffer heads
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (29 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 30/33] iomap: add initial support for writes without buffer heads Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-14 16:00   ` Goldwyn Rodrigues
  2018-05-09  7:48 ` [PATCH 32/33] xfs: add support for sub-pagesize writeback without buffer_heads Christoph Hellwig
                   ` (2 subsequent siblings)
  33 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

After already supporting a simple implementation of buffered writes for
the blocksize == PAGE_SIZE case in the last commit, this adds full
support even for smaller block sizes.  There are three bits of per-block
information in the buffer_head structure that really matter for the iomap
read and write path:

 - uptodate status (BH_uptodate)
 - marked as currently under read I/O (BH_Async_Read)
 - marked as currently under write I/O (BH_Async_Write)

Instead of having new per-block structures this now adds a per-page
structure called struct iomap_page to track this information in a slightly
different form:

 - a bitmap for the per-block uptodate status.  For the worst case of a
   64k page size system this bitmap needs to contain 128 bits.  For the
   typical 4k page size case it only needs 8 bits, although we still
   need a full unsigned long due to the way the atomic bitmap API works.
 - two atomic_t counters that track the outstanding read and write I/Os

There is quite a bit of boilerplate code as the buffered I/O path uses
various helper methods, but the actual code is very straightforward.
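
As an illustration, the per-block bookkeeping reduces to shift
arithmetic on the offset within the page.  A minimal sketch (assuming
the iomap_page fields added below; example_set_uptodate is a
hypothetical name, the real helper is iomap_set_range_uptodate):

/*
 * Mark the blocks backing [off, off + len) uptodate.  On a 4k page
 * with 1k blocks (i_blkbits == 10), off = 1024 and len = 2048 set
 * bits 1 and 2 in iop->uptodate.
 */
static void
example_set_uptodate(struct inode *inode, struct iomap_page *iop,
		unsigned off, unsigned len)
{
	unsigned first = off >> inode->i_blkbits;
	unsigned last = (off + len - 1) >> inode->i_blkbits;
	unsigned i;

	for (i = first; i <= last; i++)
		set_bit(i, iop->uptodate);
}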

In this commit the code can't actually be used yet, as we need to
switch from the old implementation to the new one together with the
XFS writeback code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c            | 262 +++++++++++++++++++++++++++++++++++++-----
 include/linux/iomap.h |  32 ++++++
 2 files changed, 264 insertions(+), 30 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index a3861945504f..4e7ac6aa88ef 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -17,6 +17,7 @@
 #include <linux/iomap.h>
 #include <linux/uaccess.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
@@ -109,6 +110,107 @@ iomap_block_needs_zeroing(struct inode *inode, loff_t pos, struct iomap *iomap)
        return iomap->type != IOMAP_MAPPED || pos > i_size_read(inode);
 }
 
+static struct iomap_page *
+iomap_page_create(struct inode *inode, struct page *page)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (iop || i_blocksize(inode) == PAGE_SIZE)
+		return iop;
+
+	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
+	atomic_set(&iop->read_count, 0);
+	atomic_set(&iop->write_count, 0);
+	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+	set_page_private(page, (unsigned long)iop);
+	SetPagePrivate(page);
+	return iop;
+}
+
+/*
+ * Calculate the range inside the page that we actually need to read.
+ */
+static void
+iomap_read_calculate_range(struct inode *inode, struct iomap_page *iop,
+		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
+{
+	unsigned poff = *pos & (PAGE_SIZE - 1);
+	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+
+	if (iop) {
+		unsigned block_size = i_blocksize(inode);
+		unsigned first = poff >> inode->i_blkbits;
+		unsigned last = (poff + plen - 1) >> inode->i_blkbits;
+		unsigned int i;
+
+		/* move forward for each leading block marked uptodate */
+		for (i = first; i <= last; i++) {
+			if (!test_bit(i, iop->uptodate))
+				break;
+			*pos += block_size;
+			poff += block_size;
+			plen -= block_size;
+		}
+
+		/* truncate len if we find any trailing uptodate block(s) */
+		for ( ; i <= last; i++) {
+			if (test_bit(i, iop->uptodate)) {
+				plen -= (last - i + 1) * block_size;
+				break;
+			}
+		}
+	}
+
+	*offp = poff;
+	*lenp = plen;
+}
+
+static void
+iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned first = off >> inode->i_blkbits;
+	unsigned last = (off + len - 1) >> inode->i_blkbits;
+	unsigned int i;
+	bool uptodate = true;
+
+	if (iop) {
+		for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
+			if (i >= first && i <= last)
+				set_bit(i, iop->uptodate);
+			else if (!test_bit(i, iop->uptodate))
+				uptodate = false;
+		}
+	}
+
+	if (uptodate && !PageError(page))
+		SetPageUptodate(page);
+}
+
+static void
+iomap_read_finish(struct iomap_page *iop, struct page *page)
+{
+	if (!iop || atomic_dec_and_test(&iop->read_count))
+		unlock_page(page);
+}
+
+static void
+iomap_read_page_end_io(struct bio_vec *bvec, int error)
+{
+	struct page *page = bvec->bv_page;
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (unlikely(error)) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
+	}
+
+	iomap_read_finish(iop, page);
+}
+
 static void
 iomap_read_end_io(struct bio *bio)
 {
@@ -117,7 +219,7 @@ iomap_read_end_io(struct bio *bio)
 	int i;
 
 	bio_for_each_segment_all(bvec, bio, i)
-		page_endio(bvec->bv_page, false, error);
+		iomap_read_page_end_io(bvec, error);
 	bio_put(bio);
 }
 
@@ -147,18 +249,19 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 {
 	struct iomap_readpage_ctx *ctx = data;
 	struct page *page = ctx->cur_page;
-	unsigned poff = pos & (PAGE_SIZE - 1);
-	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+	struct iomap_page *iop = iomap_page_create(inode, page);
 	bool is_contig = false;
+	loff_t orig_pos = pos;
+	unsigned poff, plen;
 	sector_t sector;
 
-	/* we don't support blocksize < PAGE_SIZE quite yet: */
-	WARN_ON_ONCE(pos != page_offset(page));
-	WARN_ON_ONCE(plen != PAGE_SIZE);
+	iomap_read_calculate_range(inode, iop, &pos, length, &poff, &plen);
+	if (plen == 0)
+		goto done;
 
 	if (iomap_block_needs_zeroing(inode, pos, iomap)) {
 		zero_user(page, poff, plen);
-		SetPageUptodate(page);
+		iomap_set_range_uptodate(page, poff, plen);
 		goto done;
 	}
 
@@ -174,6 +277,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		is_contig = true;
 	}
 
+	/*
+	 * If we start a new segment we need to increase the read count, and we
+	 * need to do so before submitting any previous full bio to make sure
+	 * that we don't prematurely unlock the page.
+	 */
+	if (iop)
+		atomic_inc(&iop->read_count);
+
 	if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
 		if (ctx->bio)
 			submit_bio(ctx->bio);
@@ -182,7 +293,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 	__bio_add_page(ctx->bio, page, plen, poff);
 done:
-	return plen;
+	return pos - orig_pos + plen;
 }
 
 int
@@ -193,8 +304,6 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
 	unsigned poff;
 	loff_t ret;
 
-	WARN_ON_ONCE(page_has_buffers(page));
-
 	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
 		ret = iomap_apply(inode, page_offset(page) + poff,
 				PAGE_SIZE - poff, 0, ops, &ctx,
@@ -295,6 +404,90 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
 }
 EXPORT_SYMBOL_GPL(iomap_readpages);
 
+int
+iomap_is_partially_uptodate(struct page *page, unsigned long from,
+		unsigned long count)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned first = from >> inode->i_blkbits;
+	unsigned last = (from + count - 1) >> inode->i_blkbits;
+	unsigned i;
+
+	if (iop) {
+		for (i = first; i <= last; i++)
+			if (!test_bit(i, iop->uptodate))
+				return 0;
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
+
+int
+iomap_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	/*
+	 * mm accommodates an old ext3 case where clean pages might not have had
+	 * the dirty bit cleared. Thus, it can send actual dirty pages to
+	 * ->releasepage() via shrink_active_list(); skip those here.
+	 */
+	if (PageDirty(page) || PageWriteback(page))
+		return 0;
+
+	if (iop) {
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		kfree(iop);
+	}
+	return 1;
+}
+EXPORT_SYMBOL_GPL(iomap_releasepage);
+
+void
+iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
+{
+	/*
+	 * If we are invalidating the entire page, clear the dirty state from it
+	 * and release it to avoid unnecessary buildup of the LRU.
+	 */
+	if (offset == 0 && len == PAGE_SIZE) {
+		cancel_dirty_page(page);
+		iomap_releasepage(page, 0);
+	}
+}
+EXPORT_SYMBOL_GPL(iomap_invalidatepage);
+
+#ifdef CONFIG_MIGRATION
+int
+iomap_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode)
+{
+	int ret;
+
+	ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+	if (ret != MIGRATEPAGE_SUCCESS)
+		return ret;
+
+	if (page_has_private(page)) {
+		ClearPagePrivate(page);
+		set_page_private(newpage, page_private(page));
+		set_page_private(page, 0);
+		SetPagePrivate(newpage);
+	}
+
+	if (mode != MIGRATE_SYNC_NO_COPY)
+		migrate_page_copy(newpage, page);
+	else
+		migrate_page_states(newpage, page);
+	return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(iomap_migrate_page);
+#endif /* CONFIG_MIGRATION */
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -331,28 +524,37 @@ static int
 __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
 		struct page *page, struct iomap *iomap)
 {
+	struct iomap_page *iop = iomap_page_create(inode, page);
 	loff_t block_size = i_blocksize(inode);
 	loff_t block_start = pos & ~(block_size - 1);
 	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
-	unsigned poff = block_start & (PAGE_SIZE - 1);
-	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);
-	int status;
+	int status = 0;
 
-	if (PageUptodate(page))
-		return 0;
+	while (!PageUptodate(page)) {
+		unsigned poff, plen;
 
-	if (iomap_block_needs_zeroing(inode, block_start, iomap)) {
-		unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
-		unsigned pend = poff + plen;
+		iomap_read_calculate_range(inode, iop, &block_start,
+				block_end - block_start, &poff, &plen);
+		if (plen == 0)
+			break;
 
-		if (poff < from || pend > to)
-			zero_user_segments(page, poff, from, to, pend);
-	} else {
-		status = iomap_read_page_sync(inode, block_start, page,
-				poff, plen, iomap);
-		if (status < 0)
-			return status;
-		SetPageUptodate(page);
+		if (iomap_block_needs_zeroing(inode, block_start, iomap)) {
+			unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
+			unsigned pend = poff + plen;
+
+			if (poff < from || pend > to)
+				zero_user_segments(page, poff, from, to, pend);
+		} else {
+			status = iomap_read_page_sync(inode, block_start,
+					page, poff, plen, iomap);
+			if (status)
+				return status;
+			iomap_set_range_uptodate(page, poff, plen);
+		}
+
+		if (poff + plen >= PAGE_SIZE)
+			break;
+		block_start += plen;
 	}
 
 	return 0;
@@ -391,7 +593,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 	return status;
 }
 
-static int
+int
 iomap_set_page_dirty(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
@@ -414,6 +616,7 @@ iomap_set_page_dirty(struct page *page)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	return newly_dirty;
 }
+EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
 
 static int
 __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
@@ -431,7 +634,7 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 	}
 
 	flush_dcache_page(page);
-	SetPageUptodate(page);
+	iomap_set_range_uptodate(page, start, len);
 	iomap_set_page_dirty(page);
 	ret = __generic_write_end(inode, pos, copied, page);
 	if (ret < len)
@@ -771,8 +974,7 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
 	else
 		length = PAGE_SIZE;
 
-	if (i_blocksize(inode) == PAGE_SIZE)
-		WARN_ON_ONCE(!PageUptodate(page));
+	WARN_ON_ONCE(!PageUptodate(page));
 
 	offset = page_offset(page);
 	while (length > 0) {
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4710789620e7..fe432a0f02aa 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -2,6 +2,9 @@
 #ifndef LINUX_IOMAP_H
 #define LINUX_IOMAP_H 1
 
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/mm.h>
 #include <linux/types.h>
 
 struct address_space;
@@ -82,11 +85,40 @@ struct iomap_ops {
 			ssize_t written, unsigned flags, struct iomap *iomap);
 };
 
+/*
+ * Structure allocated for each page when block size < PAGE_SIZE to track
+ * sub-page uptodate status and I/O completions.
+ */
+struct iomap_page {
+	atomic_t		read_count;
+	atomic_t		write_count;
+	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+};
+
+static inline struct iomap_page *to_iomap_page(struct page *page)
+{
+	if (page_has_private(page))
+		return (struct iomap_page *)page_private(page);
+	return NULL;
+}
+
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);
 int iomap_readpage(struct page *page, const struct iomap_ops *ops);
 int iomap_readpages(struct address_space *mapping, struct list_head *pages,
 		unsigned nr_pages, const struct iomap_ops *ops);
+int iomap_set_page_dirty(struct page *page);
+int iomap_is_partially_uptodate(struct page *page, unsigned long from,
+		unsigned long count);
+int iomap_releasepage(struct page *page, gfp_t gfp_mask);
+void iomap_invalidatepage(struct page *page, unsigned int offset,
+		unsigned int len);
+#ifdef CONFIG_MIGRATION
+int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode);
+#else
+#define iomap_migrate_page NULL
+#endif
 int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 32/33] xfs: add support for sub-pagesize writeback without buffer_heads
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (30 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 31/33] iomap: add support for sub-pagesize buffered I/O " Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-09  7:48 ` [PATCH 33/33] fs: remove __block_write_begin and iomap_to_bh Christoph Hellwig
  2018-05-10 15:13 ` stop using buffer heads in xfs and iomap Darrick J. Wong
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Switch to using the iomap_page structure for checking sub-page uptodate
status and tracking sub-page I/O completion status, and remove tons of
boilerplate code working around buffer heads.

This also flips the switch in iomap.c to actually enable the new
buffered write code for blocksize < PAGE_SIZE.
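
The heart of the writeback side is the per-page write_count:
submission takes one count per bio segment that covers the page, and
completion ends page writeback once the last segment finishes.
Distilled from the patch below:

	/* submission (xfs_add_to_ioend): one count per new segment */
	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
		if (iop)
			atomic_inc(&iop->write_count);
		if (bio_full(wpc->ioend->io_bio))
			xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
		__bio_add_page(wpc->ioend->io_bio, page, len, poff);
	}

	/* completion (xfs_finish_page_writeback): last one out */
	if (!iop || atomic_dec_and_test(&iop->write_count))
		end_page_writeback(bvec->bv_page);

Note that merging into an existing bvec deliberately does not bump the
count, as the merged range completes together with the bvec it joined.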

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c         |  35 +--
 fs/xfs/xfs_aops.c  | 518 ++++++---------------------------------------
 fs/xfs/xfs_buf.h   |   1 -
 fs/xfs/xfs_super.c |   2 +-
 fs/xfs/xfs_trace.h |  18 +-
 5 files changed, 72 insertions(+), 502 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 4e7ac6aa88ef..78bbbfadf499 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -577,10 +577,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 	if (!page)
 		return -ENOMEM;
 
-	if (i_blocksize(inode) == PAGE_SIZE)
-		status = __iomap_write_begin(inode, pos, len, page, iomap);
-	else
-		status = __block_write_begin_int(page, pos, len, NULL, iomap);
+	status = __iomap_write_begin(inode, pos, len, page, iomap);
 	if (unlikely(status)) {
 		unlock_page(page);
 		put_page(page);
@@ -619,7 +616,7 @@ iomap_set_page_dirty(struct page *page)
 EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
 
 static int
-__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 		unsigned copied, struct page *page, struct iomap *iomap)
 {
 	unsigned start = pos & (PAGE_SIZE - 1);
@@ -642,22 +639,6 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 	return ret;
 }
 
-static int
-iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
-		unsigned copied, struct page *page, struct iomap *iomap)
-{
-	int ret;
-
-	if (i_blocksize(inode) == PAGE_SIZE)
-		return __iomap_write_end(inode, pos, len, copied, page, iomap);
-
-	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
-			copied, page, NULL);
-	if (ret < len)
-		iomap_write_failed(inode, pos, len);
-	return ret;
-}
-
 static loff_t
 iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
@@ -933,21 +914,11 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 }
 EXPORT_SYMBOL_GPL(iomap_truncate_page);
 
+/* no need to do anything, mkwrite just needs to ensure blocks are allocated */
 static loff_t
 iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 		void *data, struct iomap *iomap)
 {
-	struct page *page = data;
-	int ret;
-
-	if (i_blocksize(inode) != PAGE_SIZE) {
-		ret = __block_write_begin_int(page, pos, length, NULL, iomap);
-		if (ret)
-			return ret;
-
-		block_commit_write(page, 0, length);
-	}
-
 	return length;
 }
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index dc92f23b0ea4..540ba97826c9 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -32,9 +32,6 @@
 #include "xfs_bmap_util.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_reflink.h"
-#include <linux/gfp.h>
-#include <linux/mpage.h>
-#include <linux/pagevec.h>
 #include <linux/writeback.h>
 
 /*
@@ -46,25 +43,6 @@ struct xfs_writepage_ctx {
 	struct xfs_ioend	*ioend;
 };
 
-void
-xfs_count_page_state(
-	struct page		*page,
-	int			*delalloc,
-	int			*unwritten)
-{
-	struct buffer_head	*bh, *head;
-
-	*delalloc = *unwritten = 0;
-
-	bh = head = page_buffers(page);
-	do {
-		if (buffer_unwritten(bh))
-			(*unwritten) = 1;
-		else if (buffer_delay(bh))
-			(*delalloc) = 1;
-	} while ((bh = bh->b_this_page) != head);
-}
-
 struct block_device *
 xfs_find_bdev_for_inode(
 	struct inode		*inode)
@@ -91,73 +69,39 @@ xfs_find_daxdev_for_inode(
 		return mp->m_ddev_targp->bt_daxdev;
 }
 
+static int
+xfs_vm_releasepage(
+	struct page		*page,
+	gfp_t			gfp_mask)
+{
+	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
+	return iomap_releasepage(page, gfp_mask);
+}
+
+static void
+xfs_vm_invalidatepage(
+	struct page		*page,
+	unsigned int		offset,
+	unsigned int		length)
+{
+	trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
+	iomap_invalidatepage(page, offset, length);
+}
+
 static void
 xfs_finish_page_writeback(
 	struct inode		*inode,
 	struct bio_vec		*bvec,
 	int			error)
 {
+	struct iomap_page	*iop = to_iomap_page(bvec->bv_page);
+
 	if (error) {
 		SetPageError(bvec->bv_page);
 		mapping_set_error(inode->i_mapping, -EIO);
 	}
-	end_page_writeback(bvec->bv_page);
-}
 
-/*
- * We're now finished for good with this page.  Update the page state via the
- * associated buffer_heads, paying attention to the start and end offsets that
- * we need to process on the page.
- *
- * Note that we open code the action in end_buffer_async_write here so that we
- * only have to iterate over the buffers attached to the page once.  This is not
- * only more efficient, but also ensures that we only calls end_page_writeback
- * at the end of the iteration, and thus avoids the pitfall of having the page
- * and buffers potentially freed after every call to end_buffer_async_write.
- */
-static void
-xfs_finish_buffer_writeback(
-	struct inode		*inode,
-	struct bio_vec		*bvec,
-	int			error)
-{
-	struct buffer_head	*head = page_buffers(bvec->bv_page), *bh = head;
-	bool			busy = false;
-	unsigned int		off = 0;
-	unsigned long		flags;
-
-	ASSERT(bvec->bv_offset < PAGE_SIZE);
-	ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
-	ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
-	ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
-
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
-	do {
-		if (off >= bvec->bv_offset &&
-		    off < bvec->bv_offset + bvec->bv_len) {
-			ASSERT(buffer_async_write(bh));
-			ASSERT(bh->b_end_io == NULL);
-
-			if (error) {
-				mark_buffer_write_io_error(bh);
-				clear_buffer_uptodate(bh);
-				SetPageError(bvec->bv_page);
-			} else {
-				set_buffer_uptodate(bh);
-			}
-			clear_buffer_async_write(bh);
-			unlock_buffer(bh);
-		} else if (buffer_async_write(bh)) {
-			ASSERT(buffer_locked(bh));
-			busy = true;
-		}
-		off += bh->b_size;
-	} while ((bh = bh->b_this_page) != head);
-	bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
-	local_irq_restore(flags);
-
-	if (!busy)
+	if (!iop || atomic_dec_and_test(&iop->write_count))
 		end_page_writeback(bvec->bv_page);
 }
 
@@ -191,12 +135,8 @@ xfs_destroy_ioend(
 			next = bio->bi_private;
 
 		/* walk each page on bio, ending page IO on them */
-		bio_for_each_segment_all(bvec, bio, i) {
-			if (page_has_buffers(bvec->bv_page))
-				xfs_finish_buffer_writeback(inode, bvec, error);
-			else
-				xfs_finish_page_writeback(inode, bvec, error);
-		}
+		bio_for_each_segment_all(bvec, bio, i)
+			xfs_finish_page_writeback(inode, bvec, error);
 		bio_put(bio);
 	}
 
@@ -638,6 +578,7 @@ xfs_add_to_ioend(
 	struct inode		*inode,
 	xfs_off_t		offset,
 	struct page		*page,
+	struct iomap_page	*iop,
 	struct xfs_writepage_ctx *wpc,
 	struct writeback_control *wbc,
 	struct list_head	*iolist)
@@ -661,90 +602,19 @@ xfs_add_to_ioend(
 				bdev, sector);
 	}
 
-	/*
-	 * If the block doesn't fit into the bio we need to allocate a new
-	 * one.  This shouldn't happen more than once for a given block.
-	 */
-	while (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len)
-		xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
+	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+		if (iop)
+			atomic_inc(&iop->write_count);
+		if (bio_full(wpc->ioend->io_bio))
+			xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
+		__bio_add_page(wpc->ioend->io_bio, page, len, poff);
+	}
 
 	wpc->ioend->io_size += len;
 }
 
-STATIC void
-xfs_map_buffer(
-	struct inode		*inode,
-	struct buffer_head	*bh,
-	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset)
-{
-	sector_t		bn;
-	struct xfs_mount	*m = XFS_I(inode)->i_mount;
-	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
-	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
-
-	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-
-	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
-	      ((offset - iomap_offset) >> inode->i_blkbits);
-
-	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
-
-	bh->b_blocknr = bn;
-	set_buffer_mapped(bh);
-}
-
-STATIC void
-xfs_map_at_offset(
-	struct inode		*inode,
-	struct buffer_head	*bh,
-	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset)
-{
-	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-
-	lock_buffer(bh);
-	xfs_map_buffer(inode, bh, imap, offset);
-	set_buffer_mapped(bh);
-	clear_buffer_delay(bh);
-	clear_buffer_unwritten(bh);
-
-	/*
-	 * If this is a realtime file, data may be on a different device.
-	 * to that pointed to from the buffer_head b_bdev currently. We can't
-	 * trust that the bufferhead has a already been mapped correctly, so
-	 * set the bdev now.
-	 */
-	bh->b_bdev = xfs_find_bdev_for_inode(inode);
-	bh->b_end_io = NULL;
-	set_buffer_async_write(bh);
-	set_buffer_uptodate(bh);
-	clear_buffer_dirty(bh);
-}
-
-STATIC void
-xfs_vm_invalidatepage(
-	struct page		*page,
-	unsigned int		offset,
-	unsigned int		length)
-{
-	trace_xfs_invalidatepage(page->mapping->host, page, offset,
-				 length);
-
-	/*
-	 * If we are invalidating the entire page, clear the dirty state from it
-	 * so that we can check for attempts to release dirty cached pages in
-	 * xfs_vm_releasepage().
-	 */
-	if (offset == 0 && length >= PAGE_SIZE)
-		cancel_dirty_page(page);
-	block_invalidatepage(page, offset, length);
-}
-
 /*
- * If the page has delalloc buffers on it, we need to punch them out before we
+ * If the page has delalloc blocks on it, we need to punch them out before we
  * invalidate the page. If we don't, we leave a stale delalloc mapping on the
  * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
  * is done on that same region - the delalloc extent is returned when none is
@@ -786,7 +656,7 @@ xfs_aops_discard_page(
  * We implement an immediate ioend submission policy here to avoid needing to
  * chain multiple ioends and hence nest mempool allocations which can violate
  * forward progress guarantees we need to provide. The current ioend we are
- * adding buffers to is cached on the writepage context, and if the new buffer
+ * adding blocks to is cached on the writepage context, and if the new block
  * does not append to the cached ioend it will create a new ioend and cache that
  * instead.
  *
@@ -807,41 +677,27 @@ xfs_writepage_map(
 	uint64_t		end_offset)
 {
 	LIST_HEAD(submit_list);
+	struct iomap_page	*iop = to_iomap_page(page);
+	unsigned		len = i_blocksize(inode);
 	struct xfs_ioend	*ioend, *next;
-	struct buffer_head	*bh = NULL;
-	ssize_t			len = i_blocksize(inode);
-	int			error = 0;
-	int			count = 0;
+	int			error = 0, count = 0, i;
 	loff_t			file_offset;	/* file offset of page */
-	unsigned		poffset;	/* offset into page */
 
-	if (page_has_buffers(page))
-		bh = page_buffers(page);
+	ASSERT(!iop || atomic_read(&iop->write_count) == 0);
 
 	/*
-	 * Walk the blocks on the page, and we we run off then end of the
-	 * current map or find the current map invalid, grab a new one.
-	 * We only use bufferheads here to check per-block state - they no
-	 * longer control the iteration through the page. This allows us to
-	 * replace the bufferhead with some other state tracking mechanism in
-	 * future.
+	 * Walk through the page to find areas to write back. If we run off the
+	 * end of the current map or find the current map invalid, grab a new
+	 * one.
 	 */
-	for (poffset = 0, file_offset = page_offset(page);
-	     poffset < PAGE_SIZE;
-	     poffset += len, file_offset += len) {
-		/* past the range we are writing, so nothing more to write. */
-		if (file_offset >= end_offset)
-			break;
-
+	for (i = 0, file_offset = page_offset(page);
+	     i < PAGE_SIZE >> inode->i_blkbits && file_offset < end_offset;
+	     i++, file_offset += len) {
 		/*
 		 * Block does not contain valid data, skip it.
 		 */
-		if (bh && !buffer_uptodate(bh)) {
-			if (PageUptodate(page))
-				ASSERT(buffer_mapped(bh));
-			bh = bh->b_this_page;
+		if (iop && !test_bit(i, iop->uptodate))
 			continue;
-		}
 
 		/*
 		 * If we don't have a valid map, now it's time to get a new one
@@ -854,52 +710,33 @@ xfs_writepage_map(
 			error = xfs_map_blocks(inode, file_offset, &wpc->imap,
 					     &wpc->io_type);
 			if (error)
-				goto out;
-		}
-
-		if (wpc->io_type == XFS_IO_HOLE) {
-			/*
-			 * set_page_dirty dirties all buffers in a page, independent
-			 * of their state.  The dirty state however is entirely
-			 * meaningless for holes (!mapped && uptodate), so check we did
-			 * have a buffer covering a hole here and continue.
-			 */
-			if (bh)
-				bh = bh->b_this_page;
-			continue;
+				break;
 		}
 
-		if (bh) {
-			xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
-			bh = bh->b_this_page;
+		if (wpc->io_type != XFS_IO_HOLE) {
+			xfs_add_to_ioend(inode, file_offset, page, iop, wpc,
+				wbc, &submit_list);
+			count++;
 		}
-		xfs_add_to_ioend(inode, file_offset, page, wpc, wbc,
-				&submit_list);
-		count++;
 	}
 
 	ASSERT(wpc->ioend || list_empty(&submit_list));
-
-out:
 	ASSERT(PageLocked(page));
 	ASSERT(!PageWriteback(page));
 
 	/*
-	 * On error, we have to fail the ioend here because we have locked
-	 * buffers in the ioend. If we don't do this, we'll deadlock
-	 * invalidating the page as that tries to lock the buffers on the page.
-	 * Also, because we may have set pages under writeback, we have to make
-	 * sure we run IO completion to mark the error state of the IO
-	 * appropriately, so we can't cancel the ioend directly here. That means
-	 * we have to mark this page as under writeback if we included any
-	 * buffers from it in the ioend chain so that completion treats it
-	 * correctly.
+	 * On error, we have to fail the ioend here because we may have set
+	 * pages under writeback, we have to make sure we run IO completion to
+	 * mark the error state of the IO appropriately, so we can't cancel the
+	 * ioend directly here.  That means we have to mark this page as under
+	 * writeback if we included any blocks from it in the ioend chain so
+	 * that completion treats it correctly.
 	 *
 	 * If we didn't include the page in the ioend, then on error we can
 	 * simply discard and unlock it as there are no other users of the page
-	 * or it's buffers right now. The caller will still need to trigger
-	 * submission of outstanding ioends on the writepage context so they are
-	 * treated correctly on error.
+	 * now.  The caller will still need to trigger submission of outstanding
+	 * ioends on the writepage context so they are treated correctly on
+	 * error.
 	 */
 	if (unlikely(error)) {
 		if (!count) {
@@ -940,8 +777,8 @@ xfs_writepage_map(
 	}
 
 	/*
-	 * We can end up here with no error and nothing to write if we race with
-	 * a partial page truncate on a sub-page block sized filesystem.
+	 * We can end up here with no error and nothing to write only if we race
+	 * with a partial page truncate on a sub-page block sized filesystem.
 	 */
 	if (!count)
 		end_page_writeback(page);
@@ -956,7 +793,6 @@ xfs_writepage_map(
  * For delalloc space on the page we need to allocate space and flush it.
  * For unwritten space on the page we need to start the conversion to
  * regular allocated space.
- * For any other dirty buffer heads on the page we should flush them.
  */
 STATIC int
 xfs_do_writepage(
@@ -1110,168 +946,6 @@ xfs_dax_writepages(
 			xfs_find_bdev_for_inode(mapping->host), wbc);
 }
 
-/*
- * Called to move a page into cleanable state - and from there
- * to be released. The page should already be clean. We always
- * have buffer heads in this call.
- *
- * Returns 1 if the page is ok to release, 0 otherwise.
- */
-STATIC int
-xfs_vm_releasepage(
-	struct page		*page,
-	gfp_t			gfp_mask)
-{
-	int			delalloc, unwritten;
-
-	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
-
-	/*
-	 * mm accommodates an old ext3 case where clean pages might not have had
-	 * the dirty bit cleared. Thus, it can send actual dirty pages to
-	 * ->releasepage() via shrink_active_list(). Conversely,
-	 * block_invalidatepage() can send pages that are still marked dirty but
-	 * otherwise have invalidated buffers.
-	 *
-	 * We want to release the latter to avoid unnecessary buildup of the
-	 * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
-	 * that are entirely invalidated and need to be released.  Hence the
-	 * only time we should get dirty pages here is through
-	 * shrink_active_list() and so we can simply skip those now.
-	 *
-	 * warn if we've left any lingering delalloc/unwritten buffers on clean
-	 * or invalidated pages we are about to release.
-	 */
-	if (PageDirty(page))
-		return 0;
-
-	xfs_count_page_state(page, &delalloc, &unwritten);
-
-	if (WARN_ON_ONCE(delalloc))
-		return 0;
-	if (WARN_ON_ONCE(unwritten))
-		return 0;
-
-	return try_to_free_buffers(page);
-}
-
-/*
- * If this is O_DIRECT or the mpage code calling tell them how large the mapping
- * is, so that we can avoid repeated get_blocks calls.
- *
- * If the mapping spans EOF, then we have to break the mapping up as the mapping
- * for blocks beyond EOF must be marked new so that sub block regions can be
- * correctly zeroed. We can't do this for mappings within EOF unless the mapping
- * was just allocated or is unwritten, otherwise the callers would overwrite
- * existing data with zeros. Hence we have to split the mapping into a range up
- * to and including EOF, and a second mapping for beyond EOF.
- */
-static void
-xfs_map_trim_size(
-	struct inode		*inode,
-	sector_t		iblock,
-	struct buffer_head	*bh_result,
-	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset,
-	ssize_t			size)
-{
-	xfs_off_t		mapping_size;
-
-	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
-	mapping_size <<= inode->i_blkbits;
-
-	ASSERT(mapping_size > 0);
-	if (mapping_size > size)
-		mapping_size = size;
-	if (offset < i_size_read(inode) &&
-	    (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) {
-		/* limit mapping to block that spans EOF */
-		mapping_size = roundup_64(i_size_read(inode) - offset,
-					  i_blocksize(inode));
-	}
-	if (mapping_size > LONG_MAX)
-		mapping_size = LONG_MAX;
-
-	bh_result->b_size = mapping_size;
-}
-
-static int
-xfs_get_blocks(
-	struct inode		*inode,
-	sector_t		iblock,
-	struct buffer_head	*bh_result,
-	int			create)
-{
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	xfs_fileoff_t		offset_fsb, end_fsb;
-	int			error = 0;
-	int			lockmode = 0;
-	struct xfs_bmbt_irec	imap;
-	int			nimaps = 1;
-	xfs_off_t		offset;
-	ssize_t			size;
-
-	BUG_ON(create);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-	offset = (xfs_off_t)iblock << inode->i_blkbits;
-	ASSERT(bh_result->b_size >= i_blocksize(inode));
-	size = bh_result->b_size;
-
-	if (offset >= i_size_read(inode))
-		return 0;
-
-	/*
-	 * Direct I/O is usually done on preallocated files, so try getting
-	 * a block mapping without an exclusive lock first.
-	 */
-	lockmode = xfs_ilock_data_map_shared(ip);
-
-	ASSERT(offset <= mp->m_super->s_maxbytes);
-	if (offset > mp->m_super->s_maxbytes - size)
-		size = mp->m_super->s_maxbytes - offset;
-	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
-	offset_fsb = XFS_B_TO_FSBT(mp, offset);
-
-	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
-			&nimaps, 0);
-	if (error)
-		goto out_unlock;
-	if (!nimaps) {
-		trace_xfs_get_blocks_notfound(ip, offset, size);
-		goto out_unlock;
-	}
-
-	trace_xfs_get_blocks_found(ip, offset, size,
-		imap.br_state == XFS_EXT_UNWRITTEN ?
-			XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
-	xfs_iunlock(ip, lockmode);
-
-	/* trim mapping down to size requested */
-	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
-
-	/*
-	 * For unwritten extents do not report a disk address in the buffered
-	 * read case (treat as if we're reading into a hole).
-	 */
-	if (xfs_bmap_is_real_extent(&imap))
-		xfs_map_buffer(inode, bh_result, &imap, offset);
-
-	/*
-	 * If this is a realtime file, data may be on a different device.
-	 * to that pointed to from the buffer_head b_bdev currently.
-	 */
-	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
-	return 0;
-
-out_unlock:
-	xfs_iunlock(ip, lockmode);
-	return error;
-}
-
 STATIC sector_t
 xfs_vm_bmap(
 	struct address_space	*mapping,
@@ -1301,9 +975,7 @@ xfs_vm_readpage(
 	struct page		*page)
 {
 	trace_xfs_vm_readpage(page->mapping->host, 1);
-	if (i_blocksize(page->mapping->host) == PAGE_SIZE)
-		return iomap_readpage(page, &xfs_iomap_ops);
-	return mpage_readpage(page, xfs_get_blocks);
+	return iomap_readpage(page, &xfs_iomap_ops);
 }
 
 STATIC int
@@ -1314,65 +986,7 @@ xfs_vm_readpages(
 	unsigned		nr_pages)
 {
 	trace_xfs_vm_readpages(mapping->host, nr_pages);
-	if (i_blocksize(mapping->host) == PAGE_SIZE)
-		return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
-	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
-}
-
-/*
- * This is basically a copy of __set_page_dirty_buffers() with one
- * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
- * dirty, we'll never be able to clean them because we don't write buffers
- * beyond EOF, and that means we can't invalidate pages that span EOF
- * that have been marked dirty. Further, the dirty state can leak into
- * the file interior if the file is extended, resulting in all sorts of
- * bad things happening as the state does not match the underlying data.
- *
- * XXX: this really indicates that bufferheads in XFS need to die. Warts like
- * this only exist because of bufferheads and how the generic code manages them.
- */
-STATIC int
-xfs_vm_set_page_dirty(
-	struct page		*page)
-{
-	struct address_space	*mapping = page->mapping;
-	struct inode		*inode = mapping->host;
-	loff_t			end_offset;
-	loff_t			offset;
-	int			newly_dirty;
-
-	if (unlikely(!mapping))
-		return !TestSetPageDirty(page);
-
-	end_offset = i_size_read(inode);
-	offset = page_offset(page);
-
-	spin_lock(&mapping->private_lock);
-	if (page_has_buffers(page)) {
-		struct buffer_head *head = page_buffers(page);
-		struct buffer_head *bh = head;
-
-		do {
-			if (offset < end_offset)
-				set_buffer_dirty(bh);
-			bh = bh->b_this_page;
-			offset += i_blocksize(inode);
-		} while (bh != head);
-	}
-	/*
-	 * Lock out page->mem_cgroup migration to keep PageDirty
-	 * synchronized with per-memcg dirty page counters.
-	 */
-	lock_page_memcg(page);
-	newly_dirty = !TestSetPageDirty(page);
-	spin_unlock(&mapping->private_lock);
-
-	if (newly_dirty)
-		__set_page_dirty(page, mapping, 1);
-	unlock_page_memcg(page);
-	if (newly_dirty)
-		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-	return newly_dirty;
+	return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
 }
 
 const struct address_space_operations xfs_address_space_operations = {
@@ -1380,13 +994,13 @@ const struct address_space_operations xfs_address_space_operations = {
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
-	.set_page_dirty		= xfs_vm_set_page_dirty,
+	.set_page_dirty		= iomap_set_page_dirty,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.bmap			= xfs_vm_bmap,
 	.direct_IO		= noop_direct_IO,
-	.migratepage		= buffer_migrate_page,
-	.is_partially_uptodate  = block_is_partially_uptodate,
+	.migratepage		= iomap_migrate_page,
+	.is_partially_uptodate  = iomap_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 };
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index edced162a674..3a8389c64241 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -24,7 +24,6 @@
 #include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/dax.h>
-#include <linux/buffer_head.h>
 #include <linux/uio.h>
 #include <linux/list_lru.h>
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d71424052917..637dd6d8bbe2 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1845,7 +1845,7 @@ MODULE_ALIAS_FS("xfs");
 STATIC int __init
 xfs_init_zones(void)
 {
-	xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+	xfs_ioend_bioset = bioset_create(4 * (PAGE_SIZE / SECTOR_SIZE),
 			offsetof(struct xfs_ioend, io_inline_bio),
 			BIOSET_NEED_BVECS);
 	if (!xfs_ioend_bioset)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aa284f840d33..5d0ede4fb164 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1168,33 +1168,23 @@ DECLARE_EVENT_CLASS(xfs_page_class,
 		__field(loff_t, size)
 		__field(unsigned long, offset)
 		__field(unsigned int, length)
-		__field(int, delalloc)
-		__field(int, unwritten)
 	),
 	TP_fast_assign(
-		int delalloc = -1, unwritten = -1;
-
-		if (page_has_buffers(page))
-			xfs_count_page_state(page, &delalloc, &unwritten);
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = XFS_I(inode)->i_ino;
 		__entry->pgoff = page_offset(page);
 		__entry->size = i_size_read(inode);
 		__entry->offset = off;
 		__entry->length = len;
-		__entry->delalloc = delalloc;
-		__entry->unwritten = unwritten;
 	),
 	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
-		  "length %x delalloc %d unwritten %d",
+		  "length %x",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->pgoff,
 		  __entry->size,
 		  __entry->offset,
-		  __entry->length,
-		  __entry->delalloc,
-		  __entry->unwritten)
+		  __entry->length)
 )
 
 #define DEFINE_PAGE_EVENT(name)		\
@@ -1278,9 +1268,6 @@ DEFINE_EVENT(xfs_imap_class, name,	\
 	TP_ARGS(ip, offset, count, type, irec))
 DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
 DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
 DEFINE_IOMAP_EVENT(xfs_iomap_found);
 
@@ -1319,7 +1306,6 @@ DEFINE_EVENT(xfs_simple_io_class, name,	\
 	TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
-DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
 DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
 DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
 DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 33/33] fs: remove __block_write_begin and iomap_to_bh
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (31 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 32/33] xfs: add support for sub-pagesize writeback without buffer_heads Christoph Hellwig
@ 2018-05-09  7:48 ` Christoph Hellwig
  2018-05-10 15:13 ` stop using buffer heads in xfs and iomap Darrick J. Wong
  33 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:48 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm

Now that the iomap buffered write code has stopped using bufferheads,
these aren't used anymore.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/buffer.c   | 76 ++++-----------------------------------------------
 fs/internal.h |  2 --
 2 files changed, 5 insertions(+), 73 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 71ea9a29e9d5..5cdcaa6230ed 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -22,7 +22,6 @@
 #include <linux/sched/signal.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
-#include <linux/iomap.h>
 #include <linux/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
@@ -1863,62 +1862,8 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
 
-static void
-iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
-		struct iomap *iomap)
-{
-	loff_t offset = block << inode->i_blkbits;
-
-	bh->b_bdev = iomap->bdev;
-
-	/*
-	 * Block points to offset in file we need to map, iomap contains
-	 * the offset at which the map starts. If the map ends before the
-	 * current block, then do not map the buffer and let the caller
-	 * handle it.
-	 */
-	BUG_ON(offset >= iomap->offset + iomap->length);
-
-	switch (iomap->type) {
-	case IOMAP_HOLE:
-		/*
-		 * If the buffer is not up to date or beyond the current EOF,
-		 * we need to mark it as new to ensure sub-block zeroing is
-		 * executed if necessary.
-		 */
-		if (!buffer_uptodate(bh) ||
-		    (offset >= i_size_read(inode)))
-			set_buffer_new(bh);
-		break;
-	case IOMAP_DELALLOC:
-		if (!buffer_uptodate(bh) ||
-		    (offset >= i_size_read(inode)))
-			set_buffer_new(bh);
-		set_buffer_uptodate(bh);
-		set_buffer_mapped(bh);
-		set_buffer_delay(bh);
-		break;
-	case IOMAP_UNWRITTEN:
-		/*
-		 * For unwritten regions, we always need to ensure that
-		 * sub-block writes cause the regions in the block we are not
-		 * writing to are zeroed. Set the buffer as new to ensure this.
-		 */
-		set_buffer_new(bh);
-		set_buffer_unwritten(bh);
-		/* FALLTHRU */
-	case IOMAP_MAPPED:
-		if (offset >= i_size_read(inode))
-			set_buffer_new(bh);
-		bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
-				inode->i_blkbits;
-		set_buffer_mapped(bh);
-		break;
-	}
-}
-
-int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
-		get_block_t *get_block, struct iomap *iomap)
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block)
 {
 	unsigned from = pos & (PAGE_SIZE - 1);
 	unsigned to = from + len;
@@ -1954,14 +1899,9 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
 			clear_buffer_new(bh);
 		if (!buffer_mapped(bh)) {
 			WARN_ON(bh->b_size != blocksize);
-			if (get_block) {
-				err = get_block(inode, block, bh, 1);
-				if (err)
-					break;
-			} else {
-				iomap_to_bh(inode, block, bh, iomap);
-			}
-
+			err = get_block(inode, block, bh, 1);
+			if (err)
+				break;
 			if (buffer_new(bh)) {
 				clean_bdev_bh_alias(bh);
 				if (PageUptodate(page)) {
@@ -2001,12 +1941,6 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
 		page_zero_new_buffers(page, from, to);
 	return err;
 }
-
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-		get_block_t *get_block)
-{
-	return __block_write_begin_int(page, pos, len, get_block, NULL);
-}
 EXPORT_SYMBOL(__block_write_begin);
 
 static int __block_commit_write(struct inode *inode, struct page *page,
diff --git a/fs/internal.h b/fs/internal.h
index b955232d3d49..47764f739075 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -41,8 +41,6 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
  * buffer.c
  */
 extern void guard_bio_eod(int rw, struct bio *bio);
-extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
-		get_block_t *get_block, struct iomap *iomap);
 int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
 		struct page *page);
 
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* Re: [PATCH 01/33] block: add a lower-level bio_add_page interface
  2018-05-09  7:47 ` [PATCH 01/33] block: add a lower-level bio_add_page interface Christoph Hellwig
@ 2018-05-09 15:12   ` Matthew Wilcox
  2018-05-10  6:40     ` Christoph Hellwig
  2018-05-10  8:52   ` Ming Lei
  2018-05-16  5:06   ` Ritesh Harjani
  2 siblings, 1 reply; 66+ messages in thread
From: Matthew Wilcox @ 2018-05-09 15:12 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 09, 2018 at 09:47:58AM +0200, Christoph Hellwig wrote:
> +/**
> + * __bio_try_merge_page - try adding data to an existing bvec
> + * @bio: destination bio
> + * @page: page to add
> + * @len: length of the range to add
> + * @off: offset into @page
> + *
> + * Try adding the data described at @page + @offset to the last bvec of @bio.
> + * Return %true on success or %false on failure.  This can happen frequently
> + * for file systems with a block size smaller than the page size.
> + */

Could we make this:

/**
 * __bio_try_merge_page() - Try appending data to an existing bvec.
 * @bio: Destination bio.
 * @page: Page to add.
 * @len: Length of the data to add.
 * @off: Offset of the data in @page.
 *
 * Try to add the data at @page + @off to the last bvec of @bio.  This is
 * a useful optimisation for file systems with a block size smaller than
 * the page size.
 *
 * Context: Any context.
 * Return: %true on success or %false on failure.
 */

(page, len, off) is a bit weird to me.  Usually we do (page, off, len).

> +/**
> + * __bio_add_page - add page to a bio in a new segment
> + * @bio: destination bio
> + * @page: page to add
> + * @len: length of the range to add
> + * @off: offset into @page
> + *
> + * Add the data at @page + @offset to @bio as a new bvec.  The caller must
> + * ensure that @bio has space for another bvec.
> + */

/**
 * __bio_add_page - Add page to a bio in a new segment.
 * @bio: Destination bio.
 * @page: Page to add.
 * @len: Length of the data to add.
 * @off: Offset of the data in @page.
 *
 * Add the data at @page + @off to @bio as a new bvec.  The caller must
 * ensure that @bio has space for another bvec.
 *
 * Context: Any context.
 */

> +static inline bool bio_full(struct bio *bio)
> +{
> +	return bio->bi_vcnt >= bio->bi_max_vecs;
> +}

I really like this helper.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 02/33] fs: factor out a __generic_write_end helper
  2018-05-09  7:47 ` [PATCH 02/33] fs: factor out a __generic_write_end helper Christoph Hellwig
@ 2018-05-09 15:15   ` Matthew Wilcox
  2018-05-10  6:40     ` Christoph Hellwig
  0 siblings, 1 reply; 66+ messages in thread
From: Matthew Wilcox @ 2018-05-09 15:15 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 09, 2018 at 09:47:59AM +0200, Christoph Hellwig wrote:
>  }
>  EXPORT_SYMBOL(generic_write_end);
>  
> +
>  /*

Spurious?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 06/33] mm: give the 'ret' variable a better name __do_page_cache_readahead
  2018-05-09  7:48 ` [PATCH 06/33] mm: give the 'ret' variable a better name __do_page_cache_readahead Christoph Hellwig
@ 2018-05-09 15:45   ` Matthew Wilcox
  2018-05-10  6:41     ` Christoph Hellwig
  0 siblings, 1 reply; 66+ messages in thread
From: Matthew Wilcox @ 2018-05-09 15:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 09, 2018 at 09:48:03AM +0200, Christoph Hellwig wrote:
> It counts the number of pages acted on, so name it nr_pages to make that
> obvious.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Yes!

Also, it can't return an error, so how about changing it to unsigned int?
And deleting the error check from the caller?
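
I.e. the prototype would become something like:

	unsigned int __do_page_cache_readahead(struct address_space *mapping,
			struct file *filp, pgoff_t offset,
			unsigned long nr_to_read, unsigned long lookahead_size);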

>  mm/readahead.c | 10 +++++-----
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 539bbb6c1fad..16d0cb1e2616 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -156,7 +156,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
>  	unsigned long end_index;	/* The last page we want to read */
>  	LIST_HEAD(page_pool);
>  	int page_idx;
> -	int ret = 0;
> +	int nr_pages = 0;
>  	loff_t isize = i_size_read(inode);
>  	gfp_t gfp_mask = readahead_gfp_mask(mapping);
>  
> @@ -187,7 +187,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
>  		list_add(&page->lru, &page_pool);
>  		if (page_idx == nr_to_read - lookahead_size)
>  			SetPageReadahead(page);
> -		ret++;
> +		nr_pages++;
>  	}
>  
>  	/*
> @@ -195,11 +195,11 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
>  	 * uptodate then the caller will launch readpage again, and
>  	 * will then handle the error.
>  	 */
> -	if (ret)
> -		read_pages(mapping, filp, &page_pool, ret, gfp_mask);
> +	if (nr_pages)
> +		read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
>  	BUG_ON(!list_empty(&page_pool));
>  out:
> -	return ret;
> +	return nr_pages;
>  }
>  
>  /*
> -- 
> 2.17.0
> 

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 07/33] mm: split ->readpages calls to avoid non-contiguous pages lists
  2018-05-09  7:48 ` [PATCH 07/33] mm: split ->readpages calls to avoid non-contiguous pages lists Christoph Hellwig
@ 2018-05-09 15:46   ` Matthew Wilcox
  0 siblings, 0 replies; 66+ messages in thread
From: Matthew Wilcox @ 2018-05-09 15:46 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 09, 2018 at 09:48:04AM +0200, Christoph Hellwig wrote:
> That way file systems don't have to go spotting for non-contiguous pages
> and work around them.  It also kicks off I/O earlier, allowing it to
> finish earlier and reduce latency.

Makes sense.

> +			/*
> +			 * Page already present?  Kick off the current batch of
> +			 * contiguous pages before continueing with the next

"continuing" (no 'e')

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 10/33] iomap: add an iomap-based bmap implementation
  2018-05-09  7:48 ` [PATCH 10/33] iomap: add an iomap-based bmap implementation Christoph Hellwig
@ 2018-05-09 16:46   ` Darrick J. Wong
  2018-05-10  6:42     ` Christoph Hellwig
  0 siblings, 1 reply; 66+ messages in thread
From: Darrick J. Wong @ 2018-05-09 16:46 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 09, 2018 at 09:48:07AM +0200, Christoph Hellwig wrote:
> This adds a simple iomap-based implementation of the legacy ->bmap
> interface.  Note that we can't easily add checks for rt or reflink
> files, so these will have to remain in the callers.  This interface
> just needs to die..

You /can/ check these...

if (iomap->bdev != inode->i_sb->s_bdev)
	return 0;
if (iomap->flags & IOMAP_F_SHARED)
	return 0;

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap.c            | 29 +++++++++++++++++++++++++++++
>  include/linux/iomap.h |  3 +++
>  2 files changed, 32 insertions(+)
> 
> diff --git a/fs/iomap.c b/fs/iomap.c
> index af525cb47339..049e0c4aacac 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -1201,3 +1201,32 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(iomap_dio_rw);
> +
> +static loff_t
> +iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
> +		void *data, struct iomap *iomap)
> +{
> +	sector_t *bno = data;
> +
> +	if (iomap->type == IOMAP_MAPPED)
> +		*bno = (iomap->addr + pos - iomap->offset) >> inode->i_blkbits;

Does this need to be careful w.r.t. overflow on systems where sector_t
is a 32-bit unsigned long?

Also, ioctl_fibmap() typecasts the returned sector_t to an int, which
also seems broken.  I agree the interface needs to die, but ioctls take
a long time to deprecate.

--D

> +	return 0;
> +}
> +
> +/* legacy ->bmap interface.  0 is the error return (!) */
> +sector_t
> +iomap_bmap(struct address_space *mapping, sector_t bno,
> +		const struct iomap_ops *ops)
> +{
> +	struct inode *inode = mapping->host;
> +	loff_t pos = bno << inode->i_blkbits;
> +	unsigned blocksize = i_blocksize(inode);
> +
> +	if (filemap_write_and_wait(mapping))
> +		return 0;
> +
> +	bno = 0;
> +	iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
> +	return bno;
> +}
> +EXPORT_SYMBOL_GPL(iomap_bmap);
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 19a07de28212..07f73224c38b 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -4,6 +4,7 @@
>  
>  #include <linux/types.h>
>  
> +struct address_space;
>  struct fiemap_extent_info;
>  struct inode;
>  struct iov_iter;
> @@ -95,6 +96,8 @@ loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
>  		const struct iomap_ops *ops);
>  loff_t iomap_seek_data(struct inode *inode, loff_t offset,
>  		const struct iomap_ops *ops);
> +sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
> +		const struct iomap_ops *ops);
>  
>  /*
>   * Flags for direct I/O ->end_io:
> -- 
> 2.17.0

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 11/33] iomap: add an iomap-based readpage and readpages implementation
  2018-05-09  7:48 ` [PATCH 11/33] iomap: add an iomap-based readpage and readpages implementation Christoph Hellwig
@ 2018-05-10  1:17   ` Dave Chinner
  2018-05-10  6:44     ` Christoph Hellwig
  0 siblings, 1 reply; 66+ messages in thread
From: Dave Chinner @ 2018-05-10  1:17 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 09, 2018 at 09:48:08AM +0200, Christoph Hellwig wrote:
> Simply use iomap_apply to iterate over the file and submit a bio for
> each non-uptodate but mapped region and zero everything else.  Note that
> as-is this can not be used for file systems with a blocksize smaller than
> the page size, but that support will be added later.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
.....
> +int
> +iomap_readpages(struct address_space *mapping, struct list_head *pages,
> +		unsigned nr_pages, const struct iomap_ops *ops)
> +{
> +	struct iomap_readpage_ctx ctx = { .pages = pages };
> +	loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
> +	loff_t last = page_offset(list_entry(pages->next, struct page, lru));
> +	loff_t length = last - pos + PAGE_SIZE, ret = 0;
> +
> +	while (length > 0) {
> +		ret = iomap_apply(mapping->host, pos, length, 0, ops,
> +				&ctx, iomap_readpages_actor);
> +		if (ret <= 0)
> +			break;
> +		pos += ret;
> +		length -= ret;
> +	}
> +
> +	ret = 0;

This means the function will always return zero, regardless of
whether iomap_apply returned an error or not.

> +	if (ctx.bio)
> +		submit_bio(ctx.bio);
> +	if (ctx.cur_page) {
> +		if (!ctx.cur_page_in_bio)
> +			unlock_page(ctx.cur_page);
> +		put_page(ctx.cur_page);
> +	}
> +	WARN_ON_ONCE(ret && !list_empty(ctx.pages));

And this warning will never trigger. Was this intended behaviour?
If it is, it needs a comment, because it looks wrong....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 01/33] block: add a lower-level bio_add_page interface
  2018-05-09 15:12   ` Matthew Wilcox
@ 2018-05-10  6:40     ` Christoph Hellwig
  2018-05-10 21:49       ` Andreas Dilger
  0 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-10  6:40 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 09, 2018 at 08:12:43AM -0700, Matthew Wilcox wrote:
> (page, len, off) is a bit weird to me.  Usually we do (page, off, len).

That's what I'd usually do, too.  But this odd convention is what
bio_add_page uses, so I decided to stick to that instead of having two
different conventions in one family of functions.
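
For reference, that existing prototype is:

	int bio_add_page(struct bio *bio, struct page *page,
			 unsigned int len, unsigned int offset);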

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 02/33] fs: factor out a __generic_write_end helper
  2018-05-09 15:15   ` Matthew Wilcox
@ 2018-05-10  6:40     ` Christoph Hellwig
  0 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-10  6:40 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 09, 2018 at 08:15:56AM -0700, Matthew Wilcox wrote:
> On Wed, May 09, 2018 at 09:47:59AM +0200, Christoph Hellwig wrote:
> >  }
> >  EXPORT_SYMBOL(generic_write_end);
> >  
> > +
> >  /*
> 
> Spurious?

Yes.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 06/33] mm: give the 'ret' variable a better name __do_page_cache_readahead
  2018-05-09 15:45   ` Matthew Wilcox
@ 2018-05-10  6:41     ` Christoph Hellwig
  0 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-10  6:41 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 09, 2018 at 08:45:01AM -0700, Matthew Wilcox wrote:
> On Wed, May 09, 2018 at 09:48:03AM +0200, Christoph Hellwig wrote:
> > It counts the number of pages acted on, so name it nr_pages to make that
> > obvious.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Yes!
> 
> Also, it can't return an error, so how about changing it to unsigned int?
> And deleting the error check from the caller?

I'll take a look at that.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 10/33] iomap: add an iomap-based bmap implementation
  2018-05-09 16:46   ` Darrick J. Wong
@ 2018-05-10  6:42     ` Christoph Hellwig
  2018-05-10 15:08       ` Darrick J. Wong
  0 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-10  6:42 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 09, 2018 at 09:46:28AM -0700, Darrick J. Wong wrote:
> On Wed, May 09, 2018 at 09:48:07AM +0200, Christoph Hellwig wrote:
> > This adds a simple iomap-based implementation of the legacy ->bmap
> > interface.  Note that we can't easily add checks for rt or reflink
> > files, so these will have to remain in the callers.  This interface
> > just needs to die..
> 
> You /can/ check these...
> 
> if (iomap->bdev != inode->i_sb->s_bdev)
> 	return 0;
> if (iomap->flags & IOMAP_F_SHARED)
> 	return 0;

The latter only checks for a shared extent, not a file with possibly
shared extents.  I'd rather keep the check for a file with possibly
shared extents.
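
E.g. the caller-side check could look roughly like this (a sketch, not
taken verbatim from the series):

static sector_t
xfs_vm_bmap(struct address_space *mapping, sector_t block)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
	 * bypasses the file system for the actual I/O, so we can't allow
	 * it on reflink or realtime inodes.
	 */
	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
		return 0;
	return iomap_bmap(mapping, block, &xfs_iomap_ops);
}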

> > +static loff_t
> > +iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
> > +		void *data, struct iomap *iomap)
> > +{
> > +	sector_t *bno = data;
> > +
> > +	if (iomap->type == IOMAP_MAPPED)
> > +		*bno = (iomap->addr + pos - iomap->offset) >> inode->i_blkbits;
> 
> Does this need to be careful w.r.t. overflow on systems where sector_t
> is a 32-bit unsigned long?
> 
> Also, ioctl_fibmap() typecasts the returned sector_t to an int, which
> also seems broken.  I agree the interface needs to die, but ioctls take
> a long time to deprecate.

Not much we can do about the interface.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 11/33] iomap: add an iomap-based readpage and readpages implementation
  2018-05-10  1:17   ` Dave Chinner
@ 2018-05-10  6:44     ` Christoph Hellwig
  0 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-10  6:44 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Thu, May 10, 2018 at 11:17:58AM +1000, Dave Chinner wrote:
> > +		if (ret <= 0)
> > +			break;
> > +		pos += ret;
> > +		length -= ret;
> > +	}
> > +
> > +	ret = 0;
> 
> This means the function will always return zero, regardless of
> whether iomap_apply returned an error or not.
> 
> > +	if (ctx.bio)
> > +		submit_bio(ctx.bio);
> > +	if (ctx.cur_page) {
> > +		if (!ctx.cur_page_in_bio)
> > +			unlock_page(ctx.cur_page);
> > +		put_page(ctx.cur_page);
> > +	}
> > +	WARN_ON_ONCE(ret && !list_empty(ctx.pages));
> 
> And this warning will never trigger. Was this intended behaviour?
> If it is, it needs a comment, because it looks wrong....

Yes, the break should have been a goto out that jumps to just after the
ret = 0 assignment, so that an error from iomap_apply is preserved.
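
I.e. roughly this, based on the posted code:

	while (length > 0) {
		ret = iomap_apply(mapping->host, pos, length, 0, ops,
				&ctx, iomap_readpages_actor);
		if (ret <= 0)
			goto out;	/* preserve the error for the caller */
		pos += ret;
		length -= ret;
	}
	ret = 0;
out:
	if (ctx.bio)
		submit_bio(ctx.bio);
	if (ctx.cur_page) {
		if (!ctx.cur_page_in_bio)
			unlock_page(ctx.cur_page);
		put_page(ctx.cur_page);
	}
	WARN_ON_ONCE(ret && !list_empty(ctx.pages));
	return ret;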

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 01/33] block: add a lower-level bio_add_page interface
  2018-05-09  7:47 ` [PATCH 01/33] block: add a lower-level bio_add_page interface Christoph Hellwig
  2018-05-09 15:12   ` Matthew Wilcox
@ 2018-05-10  8:52   ` Ming Lei
  2018-05-11  6:24     ` Christoph Hellwig
  2018-05-16  5:06   ` Ritesh Harjani
  2 siblings, 1 reply; 66+ messages in thread
From: Ming Lei @ 2018-05-10  8:52 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: open list:XFS FILESYSTEM, Linux FS Devel, linux-block, linux-mm

On Wed, May 9, 2018 at 3:47 PM, Christoph Hellwig <hch@lst.de> wrote:
> For the upcoming removal of buffer heads in XFS we need to keep track of
> the number of outstanding writeback requests per page.  For this we need
> to know if bio_add_page merged a region with the previous bvec or not.
> Instead of adding additional arguments this refactors bio_add_page to
> be implemented using three lower level helpers which users like XFS can
> use directly if they care about the merge decisions.

The merge policy may be transparent to the fs, for example with
multipage bvecs.

>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/bio.c         | 87 ++++++++++++++++++++++++++++++---------------
>  include/linux/bio.h |  9 +++++
>  2 files changed, 67 insertions(+), 29 deletions(-)
>
> diff --git a/block/bio.c b/block/bio.c
> index 53e0f0a1ed94..6ceba6adbf42 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -773,7 +773,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
>                         return 0;
>         }
>
> -       if (bio->bi_vcnt >= bio->bi_max_vecs)
> +       if (bio_full(bio))
>                 return 0;
>
>         /*
> @@ -820,6 +820,59 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
>  }
>  EXPORT_SYMBOL(bio_add_pc_page);
>
> +/**
> + * __bio_try_merge_page - try adding data to an existing bvec
> + * @bio: destination bio
> + * @page: page to add
> + * @len: length of the range to add
> + * @off: offset into @page
> + *
> + * Try adding the data described at @page + @offset to the last bvec of @bio.
> + * Return %true on success or %false on failure.  This can happen frequently
> + * for file systems with a block size smaller than the page size.
> + */
> +bool __bio_try_merge_page(struct bio *bio, struct page *page,
> +               unsigned int len, unsigned int off)
> +{
> +       if (bio->bi_vcnt > 0) {
> +               struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
> +
> +               if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
> +                       bv->bv_len += len;
> +                       bio->bi_iter.bi_size += len;
> +                       return true;
> +               }
> +       }
> +       return false;
> +}
> +EXPORT_SYMBOL_GPL(__bio_try_merge_page);
> +
> +/**
> + * __bio_add_page - add page to a bio in a new segment
> + * @bio: destination bio
> + * @page: page to add
> + * @len: length of the range to add
> + * @off: offset into @page
> + *
> + * Add the data at @page + @offset to @bio as a new bvec.  The caller must
> + * ensure that @bio has space for another bvec.
> + */
> +void __bio_add_page(struct bio *bio, struct page *page,
> +               unsigned int len, unsigned int off)
> +{
> +       struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
> +
> +       WARN_ON_ONCE(bio_full(bio));
> +
> +       bv->bv_page = page;
> +       bv->bv_offset = off;
> +       bv->bv_len = len;
> +
> +       bio->bi_iter.bi_size += len;
> +       bio->bi_vcnt++;
> +}
> +EXPORT_SYMBOL_GPL(__bio_add_page);

Given that both __bio_try_merge_page() and __bio_add_page() are
exported, please add a WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))
check; otherwise both may be misused by external users.
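
Something like this, i.e. the posted helper with the suggested check
added:

void __bio_add_page(struct bio *bio, struct page *page,
		unsigned int len, unsigned int off)
{
	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];

	/* cloned bios share the original bi_io_vec, never grow them */
	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
	WARN_ON_ONCE(bio_full(bio));

	bv->bv_page = page;
	bv->bv_offset = off;
	bv->bv_len = len;

	bio->bi_iter.bi_size += len;
	bio->bi_vcnt++;
}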

-- 
Ming Lei

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 10/33] iomap: add an iomap-based bmap implementation
  2018-05-10  6:42     ` Christoph Hellwig
@ 2018-05-10 15:08       ` Darrick J. Wong
  2018-05-11  6:25         ` Christoph Hellwig
  0 siblings, 1 reply; 66+ messages in thread
From: Darrick J. Wong @ 2018-05-10 15:08 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Thu, May 10, 2018 at 08:42:50AM +0200, Christoph Hellwig wrote:
> On Wed, May 09, 2018 at 09:46:28AM -0700, Darrick J. Wong wrote:
> > On Wed, May 09, 2018 at 09:48:07AM +0200, Christoph Hellwig wrote:
> > > This adds a simple iomap-based implementation of the legacy ->bmap
> > > interface.  Note that we can't easily add checks for rt or reflink
> > > files, so these will have to remain in the callers.  This interface
> > > just needs to die..
> > 
> > You /can/ check these...
> > 
> > if (iomap->bdev != inode->i_sb->s_bdev)
> > 	return 0;
> > if (iomap->flags & IOMAP_F_SHARED)
> > 	return 0;
> 
> The latter only checks for a shared extent, not a file with possibly
> shared extents.  I'd rather keep the check for a file with possibly
> shared extents.

<nod>

> > > +static loff_t
> > > +iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
> > > +		void *data, struct iomap *iomap)
> > > +{
> > > +	sector_t *bno = data;
> > > +
> > > +	if (iomap->type == IOMAP_MAPPED)
> > > +		*bno = (iomap->addr + pos - iomap->offset) >> inode->i_blkbits;
> > 
> > Does this need to be careful w.r.t. overflow on systems where sector_t
> > is a 32-bit unsigned long?
> > 
> > Also, ioctl_fibmap() typecasts the returned sector_t to an int, which
> > also seems broken.  I agree the interface needs to die, but ioctls take
> > a long time to deprecate.
> 
> Not much we can do about the interface.

Yes, the interface is fubar, but if file /foo maps to block 8589934720
(i.e. 2^33 + 128), do we return the truncated result 128?

--D

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: stop using buffer heads in xfs and iomap
  2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
                   ` (32 preceding siblings ...)
  2018-05-09  7:48 ` [PATCH 33/33] fs: remove __block_write_begin and iomap_to_bh Christoph Hellwig
@ 2018-05-10 15:13 ` Darrick J. Wong
  2018-05-11  6:22   ` Christoph Hellwig
  33 siblings, 1 reply; 66+ messages in thread
From: Darrick J. Wong @ 2018-05-10 15:13 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 09, 2018 at 09:47:57AM +0200, Christoph Hellwig wrote:
> Hi all,
> 
> this series adds support for reading blocks from disk using the iomap
> interface, and then gradually switched the buffered I/O path to not
> require buffer heads.  It has survived xfstests for 1k and 4k block
> size.
> 
> There are various small changes to the core VFS, block and readahead
> code to make this happen.
> 
> 
> A git tree is available at:
> 
>     git://git.infradead.org/users/hch/xfs.git xfs-remove-bufferheads
> 
> Gitweb:
> 
>     http://git.infradead.org/users/hch/xfs.git/shortlog/refs/heads/xfs-remove-bufferheads

I ran xfstests on this for fun last night but hung in g/095:

FSTYP         -- xfs (debug)
PLATFORM      -- Linux/x86_64 submarine-djwong-mtr01 4.17.0-rc4-djw
MKFS_OPTIONS  -- -f -m reflink=1,rmapbt=1, -i sparse=1, -b size=1024, /dev/sdf
MOUNT_OPTIONS -- /dev/sdf /opt

FWIW the stock v4 and the 'v5 with everything and 4k blocks' vms
passed, so I guess there's a bug somewhere in the sub-page block size
code paths...

--D

[ 2586.943205] run fstests generic/095 at 2018-05-09 23:28:01
[ 2587.252740] XFS (sdf): Unmounting Filesystem
[ 2587.908441] XFS (sdf): Mounting V5 Filesystem
[ 2587.914685] XFS (sdf): Ending clean mount
[ 2702.258764] INFO: task kworker/u10:3:11834 blocked for more than 60 seconds.
[ 2702.261734]       Tainted: G        W         4.17.0-rc4-djw #2
[ 2702.263607] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2702.265600] kworker/u10:3   D11984 11834      2 0x80000000
[ 2702.273445] Workqueue: writeback wb_workfn (flush-8:80)
[ 2702.274751] Call Trace:
[ 2702.275339]  ? __schedule+0x3e4/0xa70
[ 2702.276112]  ? blk_flush_plug_list+0xe4/0x280
[ 2702.277086]  schedule+0x40/0x90
[ 2702.277967]  io_schedule+0x16/0x40
[ 2702.278774]  __lock_page+0x12d/0x160
[ 2702.279680]  ? page_cache_tree_insert+0x100/0x100
[ 2702.280712]  write_cache_pages+0x32c/0x530
[ 2702.281820]  ? xfs_add_to_ioend+0x350/0x350 [xfs]
[ 2702.292350]  xfs_vm_writepages+0x57/0x80 [xfs]
[ 2702.294048]  do_writepages+0x1a/0x70
[ 2702.295068]  __writeback_single_inode+0x59/0x800
[ 2702.296118]  writeback_sb_inodes+0x282/0x550
[ 2702.297039]  __writeback_inodes_wb+0x87/0xb0
[ 2702.298173]  wb_writeback+0x430/0x5d0
[ 2702.299332]  ? wb_workfn+0x448/0x740
[ 2702.300578]  wb_workfn+0x448/0x740
[ 2702.301434]  ? lock_acquire+0xab/0x200
[ 2702.305413]  process_one_work+0x1ef/0x650
[ 2702.306687]  worker_thread+0x4d/0x3e0
[ 2702.307671]  kthread+0x106/0x140
[ 2702.308473]  ? rescuer_thread+0x340/0x340
[ 2702.309442]  ? kthread_delayed_work_timer_fn+0x90/0x90
[ 2702.310995]  ret_from_fork+0x3a/0x50
[ 2702.312088] INFO: task fio:2618 blocked for more than 60 seconds.
[ 2702.313395]       Tainted: G        W         4.17.0-rc4-djw #2
[ 2702.315139] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2702.316820] fio             D14224  2618   2612 0x00000000
[ 2702.318050] Call Trace:
[ 2702.318757]  ? __schedule+0x3e4/0xa70
[ 2702.319639]  ? rwsem_down_read_failed+0x7f/0x170
[ 2702.320798]  schedule+0x40/0x90
[ 2702.321630]  rwsem_down_read_failed+0x128/0x170
[ 2702.322752]  ? current_time+0x18/0x70
[ 2702.323857]  ? xfs_file_dio_aio_read+0x6d/0x1c0 [xfs]
[ 2702.325162]  ? call_rwsem_down_read_failed+0x14/0x30
[ 2702.326423]  call_rwsem_down_read_failed+0x14/0x30
[ 2702.328393]  ? xfs_ilock+0x28f/0x330 [xfs]
[ 2702.329539]  down_read_nested+0x9d/0xa0
[ 2702.330452]  xfs_ilock+0x28f/0x330 [xfs]
[ 2702.331427]  xfs_file_dio_aio_read+0x6d/0x1c0 [xfs]
[ 2702.332590]  xfs_file_read_iter+0x9a/0xb0 [xfs]
[ 2702.333992]  __vfs_read+0x136/0x1a0
[ 2702.335133]  vfs_read+0xa3/0x150
[ 2702.336129]  ksys_read+0x45/0xa0
[ 2702.337085]  do_syscall_64+0x56/0x180
[ 2702.337985]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 2702.339537] RIP: 0033:0x7ff4c152751d
[ 2702.340623] RSP: 002b:00007fffa13c93b0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000
[ 2702.342774] RAX: ffffffffffffffda RBX: 0000000000a0a2c0 RCX: 00007ff4c152751d
[ 2702.344269] RDX: 0000000000000400 RSI: 0000000000a17c00 RDI: 0000000000000005
[ 2702.345801] RBP: 00007ff4a6f57000 R08: 0800000000002000 R09: 0000000000000004
[ 2702.347342] R10: 0000000000000001 R11: 0000000000000293 R12: 0000000000000000
[ 2702.348861] R13: 0000000000000400 R14: 0000000000a0a2e8 R15: 00007ff4a6f57000
[ 2702.351298] INFO: task fio:2619 blocked for more than 60 seconds.
[ 2702.353158]       Tainted: G        W         4.17.0-rc4-djw #2
[ 2702.355103] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2702.357586] fio             D14224  2619   2612 0x00000000
[ 2702.359181] Call Trace:
[ 2702.359815]  ? __schedule+0x3e4/0xa70
[ 2702.360708]  ? rwsem_down_read_failed+0x7f/0x170
[ 2702.361880]  schedule+0x40/0x90
[ 2702.362727]  rwsem_down_read_failed+0x128/0x170
[ 2702.363811]  ? current_time+0x18/0x70
[ 2702.364775]  ? xfs_file_dio_aio_read+0x6d/0x1c0 [xfs]
[ 2702.365994]  ? call_rwsem_down_read_failed+0x14/0x30
[ 2702.367217]  call_rwsem_down_read_failed+0x14/0x30
[ 2702.368411]  ? xfs_ilock+0x28f/0x330 [xfs]
[ 2702.369445]  down_read_nested+0x9d/0xa0
[ 2702.370420]  xfs_ilock+0x28f/0x330 [xfs]
[ 2702.371454]  xfs_file_dio_aio_read+0x6d/0x1c0 [xfs]
[ 2702.372665]  xfs_file_read_iter+0x9a/0xb0 [xfs]
[ 2702.373780]  __vfs_read+0x136/0x1a0
[ 2702.374708]  vfs_read+0xa3/0x150
[ 2702.375521]  ksys_read+0x45/0xa0
[ 2702.376318]  do_syscall_64+0x56/0x180
[ 2702.377207]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 2702.378839] RIP: 0033:0x7ff4c152751d
[ 2702.379625] RSP: 002b:00007fffa13c93b0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000
[ 2702.381190] RAX: ffffffffffffffda RBX: 0000000000a0a2c0 RCX: 00007ff4c152751d
[ 2702.382720] RDX: 0000000000000400 RSI: 0000000000a17c00 RDI: 0000000000000005
[ 2702.384575] RBP: 00007ff4a6f66b48 R08: 1000000000020000 R09: 0000000000000004
[ 2702.386233] R10: 0000000000000001 R11: 0000000000000293 R12: 0000000000000000
[ 2702.387928] R13: 0000000000000400 R14: 0000000000a0a2e8 R15: 00007ff4a6f66b48
[ 2702.389643] INFO: task fio:2620 blocked for more than 60 seconds.
[ 2702.391114]       Tainted: G        W         4.17.0-rc4-djw #2
[ 2702.393061] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2702.395659] fio             D14224  2620   2612 0x00000000
[ 2702.397589] Call Trace:
[ 2702.398474]  ? __schedule+0x3e4/0xa70
[ 2702.399445]  ? rwsem_down_read_failed+0x7f/0x170
[ 2702.400590]  schedule+0x40/0x90
[ 2702.401753]  rwsem_down_read_failed+0x128/0x170
[ 2702.402902]  ? current_time+0x18/0x70
[ 2702.403874]  ? xfs_file_dio_aio_read+0x6d/0x1c0 [xfs]
[ 2702.405080]  ? call_rwsem_down_read_failed+0x14/0x30
[ 2702.406597]  call_rwsem_down_read_failed+0x14/0x30
[ 2702.407918]  ? xfs_ilock+0x28f/0x330 [xfs]
[ 2702.409035]  down_read_nested+0x9d/0xa0
[ 2702.410284]  xfs_ilock+0x28f/0x330 [xfs]
[ 2702.411697]  xfs_file_dio_aio_read+0x6d/0x1c0 [xfs]
[ 2702.413413]  xfs_file_read_iter+0x9a/0xb0 [xfs]
[ 2702.414658]  __vfs_read+0x136/0x1a0
[ 2702.415855]  vfs_read+0xa3/0x150
[ 2702.416789]  ksys_read+0x45/0xa0
[ 2702.417956]  do_syscall_64+0x56/0x180
[ 2702.419108]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 2702.420636] RIP: 0033:0x7ff4c152751d
[ 2702.421861] RSP: 002b:00007fffa13c93b0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000
[ 2702.424332] RAX: ffffffffffffffda RBX: 0000000000a0a2c0 RCX: 00007ff4c152751d
[ 2702.426572] RDX: 0000000000000400 RSI: 0000000000a17c00 RDI: 0000000000000005
[ 2702.428719] RBP: 00007ff4a6f76690 R08: 0001000000000000 R09: 0000000000000004
[ 2702.430587] R10: 0000000000000001 R11: 0000000000000293 R12: 0000000000000000
[ 2702.432113] R13: 0000000000000400 R14: 0000000000a0a2e8 R15: 00007ff4a6f76690
[ 2702.433608] INFO: task fio:2621 blocked for more than 60 seconds.
[ 2702.434932]       Tainted: G        W         4.17.0-rc4-djw #2
[ 2702.436202] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2702.437846] fio             D14272  2621   2612 0x00000000
[ 2702.439057] Call Trace:
[ 2702.439633]  ? __schedule+0x3e4/0xa70
[ 2702.440438]  ? rwsem_down_read_failed+0x7f/0x170
[ 2702.441443]  schedule+0x40/0x90
[ 2702.442149]  rwsem_down_read_failed+0x128/0x170
[ 2702.443199]  ? xfs_file_dio_aio_read+0x6d/0x1c0 [xfs]
[ 2702.444275]  ? call_rwsem_down_read_failed+0x14/0x30
[ 2702.445315]  call_rwsem_down_read_failed+0x14/0x30
[ 2702.446370]  ? xfs_ilock+0x28f/0x330 [xfs]
[ 2702.447300]  down_read_nested+0x9d/0xa0
[ 2702.448172]  xfs_ilock+0x28f/0x330 [xfs]
[ 2702.449059]  xfs_file_dio_aio_read+0x6d/0x1c0 [xfs]
[ 2702.450166]  xfs_file_read_iter+0x9a/0xb0 [xfs]
[ 2702.451204]  __vfs_read+0x136/0x1a0
[ 2702.451991]  vfs_read+0xa3/0x150
[ 2702.452715]  ksys_read+0x45/0xa0
[ 2702.453462]  do_syscall_64+0x56/0x180
[ 2702.454277]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 2702.455406] RIP: 0033:0x7ff4c152751d
[ 2702.456206] RSP: 002b:00007fffa13c93b0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000
[ 2702.457820] RAX: ffffffffffffffda RBX: 0000000000a0a2c0 RCX: 00007ff4c152751d
[ 2702.459361] RDX: 0000000000000400 RSI: 0000000000a17c00 RDI: 0000000000000005
[ 2702.460871] RBP: 00007ff4a6f861d8 R08: 0000000000800000 R09: 0000000000000004
[ 2702.462366] R10: 0000000000000001 R11: 0000000000000293 R12: 0000000000000000
[ 2702.463863] R13: 0000000000000400 R14: 0000000000a0a2e8 R15: 00007ff4a6f861d8
[ 2702.465332] INFO: task fio:2622 blocked for more than 60 seconds.
[ 2702.466634]       Tainted: G        W         4.17.0-rc4-djw #2
[ 2702.467880] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2702.469534] fio             D14296  2622   2612 0x00000000
[ 2702.470742] Call Trace:
[ 2702.471312]  ? __schedule+0x3e4/0xa70
[ 2702.472116]  ? rwsem_down_read_failed+0x7f/0x170
[ 2702.473115]  schedule+0x40/0x90
[ 2702.473831]  rwsem_down_read_failed+0x128/0x170
[ 2702.474911]  ? xfs_file_dio_aio_read+0x6d/0x1c0 [xfs]
[ 2702.476014]  ? call_rwsem_down_read_failed+0x14/0x30
[ 2702.477085]  call_rwsem_down_read_failed+0x14/0x30
[ 2702.478171]  ? xfs_ilock+0x28f/0x330 [xfs]
[ 2702.479114]  down_read_nested+0x9d/0xa0
[ 2702.479999]  xfs_ilock+0x28f/0x330 [xfs]
[ 2702.481052]  xfs_file_dio_aio_read+0x6d/0x1c0 [xfs]
[ 2702.482155]  xfs_file_read_iter+0x9a/0xb0 [xfs]
[ 2702.483461]  __vfs_read+0x136/0x1a0
[ 2702.484542]  vfs_read+0xa3/0x150
[ 2702.485512]  ksys_read+0x45/0xa0
[ 2702.486458]  do_syscall_64+0x56/0x180
[ 2702.487597]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 2702.489099] RIP: 0033:0x7ff4c152751d
[ 2702.490149] RSP: 002b:00007fffa13c93b0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000
[ 2702.492324] RAX: ffffffffffffffda RBX: 0000000000a0a2c0 RCX: 00007ff4c152751d
[ 2702.494580] RDX: 0000000000000400 RSI: 0000000000a17c00 RDI: 0000000000000005
[ 2702.496814] RBP: 00007ff4a6f95d20 R08: 0400000000000000 R09: 0000000000000004
[ 2702.498931] R10: 0000000000000001 R11: 0000000000000293 R12: 0000000000000000
[ 2702.501035] R13: 0000000000000400 R14: 0000000000a0a2e8 R15: 00007ff4a6f95d20
[ 2702.503181] INFO: task fio:2623 blocked for more than 60 seconds.
[ 2702.504980]       Tainted: G        W         4.17.0-rc4-djw #2
[ 2702.506658] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2702.508464] fio             D14240  2623   2612 0x00000000
[ 2702.509927] Call Trace:
[ 2702.510802]  ? __schedule+0x3e4/0xa70
[ 2702.511832]  ? blk_flush_plug_list+0xe4/0x280
[ 2702.512872]  schedule+0x40/0x90
[ 2702.513782]  rwsem_down_read_failed+0x128/0x170
[ 2702.515267]  ? xfs_file_dio_aio_write+0xa8/0x470 [xfs]
[ 2702.516814]  ? call_rwsem_down_read_failed+0x14/0x30
[ 2702.518340]  call_rwsem_down_read_failed+0x14/0x30
[ 2702.520179]  ? xfs_ilock+0x28f/0x330 [xfs]
[ 2702.521639]  down_read_nested+0x9d/0xa0
[ 2702.522945]  xfs_ilock+0x28f/0x330 [xfs]
[ 2702.524196]  xfs_file_dio_aio_write+0xa8/0x470 [xfs]
[ 2702.525682]  xfs_file_write_iter+0x7b/0xb0 [xfs]
[ 2702.527109]  aio_write+0x133/0x1c0
[ 2702.528170]  ? lock_acquire+0xab/0x200
[ 2702.529282]  ? __might_fault+0x36/0x80
[ 2702.530421]  ? do_io_submit+0x41b/0x8d0
[ 2702.531617]  do_io_submit+0x41b/0x8d0
[ 2702.532758]  ? do_syscall_64+0x56/0x180
[ 2702.533950]  ? do_io_submit+0x8d0/0x8d0
[ 2702.535053]  do_syscall_64+0x56/0x180
[ 2702.536048]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 2702.537236] RIP: 0033:0x7ff4c1c57697
[ 2702.538382] RSP: 002b:00007fffa13c9348 EFLAGS: 00000212 ORIG_RAX: 00000000000000d1
[ 2702.540538] RAX: ffffffffffffffda RBX: 0000000000a0a440 RCX: 00007ff4c1c57697
[ 2702.553411] RDX: 0000000000a08320 RSI: 0000000000000008 RDI: 00007ff4a4ef7000
[ 2702.556112] RBP: 0000000000000000 R08: 0000000000000008 R09: 0000000000a097e0
[ 2702.559331] R10: 00000000000001f0 R11: 0000000000000212 R12: 00007ff4a6fa5868
[ 2702.562872] R13: 0000000000a0a480 R14: 0000000000000000 R15: 00007ff4a6fa5870
[ 2702.566224] INFO: task fio:2624 blocked for more than 60 seconds.
[ 2702.568916]       Tainted: G        W         4.17.0-rc4-djw #2
[ 2702.571389] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2702.574938] fio             D14112  2624   2612 0x00000000
[ 2702.577412] Call Trace:
[ 2702.578442]  ? __schedule+0x3e4/0xa70
[ 2702.579995]  ? blk_flush_plug_list+0xe4/0x280
[ 2702.581662]  schedule+0x40/0x90
[ 2702.582905]  rwsem_down_read_failed+0x128/0x170
[ 2702.584628]  ? iomap_apply+0xd5/0x110
[ 2702.586113]  ? xfs_file_dio_aio_write+0xa8/0x470 [xfs]
[ 2702.588344]  ? call_rwsem_down_read_failed+0x14/0x30
[ 2702.590381]  call_rwsem_down_read_failed+0x14/0x30
[ 2702.592192]  ? xfs_ilock+0x28f/0x330 [xfs]
[ 2702.593850]  down_read_nested+0x9d/0xa0
[ 2702.595522]  xfs_ilock+0x28f/0x330 [xfs]
[ 2702.596803]  xfs_file_dio_aio_write+0xa8/0x470 [xfs]
[ 2702.598362]  xfs_file_write_iter+0x7b/0xb0 [xfs]
[ 2702.599758]  aio_write+0x133/0x1c0
[ 2702.600699]  ? lock_acquire+0xab/0x200
[ 2702.601902]  ? __might_fault+0x36/0x80
[ 2702.603056]  ? do_io_submit+0x41b/0x8d0
[ 2702.604191]  do_io_submit+0x41b/0x8d0
[ 2702.605368]  ? do_syscall_64+0x56/0x180
[ 2702.606499]  ? do_io_submit+0x8d0/0x8d0
[ 2702.607643]  do_syscall_64+0x56/0x180
[ 2702.608721]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 2702.610164] RIP: 0033:0x7ff4c1c57697
[ 2702.611202] RSP: 002b:00007fffa13c9348 EFLAGS: 00000212 ORIG_RAX: 00000000000000d1
[ 2702.613435] RAX: ffffffffffffffda RBX: 0000000000a0a440 RCX: 00007ff4c1c57697
[ 2702.615344] RDX: 0000000000a08320 RSI: 0000000000000008 RDI: 00007ff4a4ef6000
[ 2702.617378] RBP: 0000000000000000 R08: 0000000000000008 R09: 0000000000a097e0
[ 2702.619156] R10: 00000000000001f0 R11: 0000000000000212 R12: 00007ff4a6fb53b0
[ 2702.620699] R13: 0000000000a0a480 R14: 0000000000000000 R15: 00007ff4a6fb53b8
[ 2702.622672] INFO: task fio:2625 blocked for more than 60 seconds.
[ 2702.624119]       Tainted: G        W         4.17.0-rc4-djw #2
[ 2702.625533] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2702.627422] fio             D14072  2625   2612 0x00000000
[ 2702.628745] Call Trace:
[ 2702.629393]  ? __schedule+0x3e4/0xa70
[ 2702.630308]  ? blk_flush_plug_list+0xe4/0x280
[ 2702.631402]  schedule+0x40/0x90
[ 2702.632191]  rwsem_down_read_failed+0x128/0x170
[ 2702.633428]  ? iomap_apply+0xd5/0x110
[ 2702.634418]  ? xfs_file_dio_aio_write+0xa8/0x470 [xfs]
[ 2702.635707]  ? call_rwsem_down_read_failed+0x14/0x30
[ 2702.636899]  call_rwsem_down_read_failed+0x14/0x30
[ 2702.638269]  ? xfs_ilock+0x28f/0x330 [xfs]
[ 2702.639300]  down_read_nested+0x9d/0xa0
[ 2702.640190]  xfs_ilock+0x28f/0x330 [xfs]
[ 2702.641091]  xfs_file_dio_aio_write+0xa8/0x470 [xfs]
[ 2702.642385]  xfs_file_write_iter+0x7b/0xb0 [xfs]
[ 2702.643624]  aio_write+0x133/0x1c0
[ 2702.644550]  ? lock_acquire+0xab/0x200
[ 2702.645443]  ? __might_fault+0x36/0x80
[ 2702.646404]  ? do_io_submit+0x41b/0x8d0
[ 2702.647924]  do_io_submit+0x41b/0x8d0
[ 2702.649437]  ? do_syscall_64+0x56/0x180
[ 2702.651040]  ? do_io_submit+0x8d0/0x8d0
[ 2702.652597]  do_syscall_64+0x56/0x180
[ 2702.654063]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 2702.655804] RIP: 0033:0x7ff4c1c57697
[ 2702.657254] RSP: 002b:00007fffa13c9348 EFLAGS: 00000212 ORIG_RAX: 00000000000000d1
[ 2702.660132] RAX: ffffffffffffffda RBX: 0000000000a12600 RCX: 00007ff4c1c57697
[ 2702.662115] RDX: 0000000000a08320 RSI: 0000000000000008 RDI: 00007ff4a4ef5000
[ 2702.663694] RBP: 0000000000000000 R08: 0000000000000008 R09: 0000000000a097e0
[ 2702.665239] R10: 00000000000001f0 R11: 0000000000000212 R12: 00007ff4a6fc4ef8
[ 2702.666850] R13: 0000000000a0a440 R14: 0000000000000000 R15: 00007ff4a6fc4f00
[ 2702.668419] INFO: task fio:2626 blocked for more than 60 seconds.
[ 2702.669747]       Tainted: G        W         4.17.0-rc4-djw #2
[ 2702.671300] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2702.673099] fio             D13752  2626   2612 0x00000000
[ 2702.674841] Call Trace:
[ 2702.675719]  ? __schedule+0x3e4/0xa70
[ 2702.676960]  ? blk_flush_plug_list+0xe4/0x280
[ 2702.678387]  schedule+0x40/0x90
[ 2702.679383]  rwsem_down_read_failed+0x128/0x170
[ 2702.680667]  ? iomap_apply+0xd5/0x110
[ 2702.681779]  ? xfs_file_dio_aio_write+0xa8/0x470 [xfs]
[ 2702.683244]  ? call_rwsem_down_read_failed+0x14/0x30
[ 2702.684683]  call_rwsem_down_read_failed+0x14/0x30
[ 2702.686049]  ? xfs_ilock+0x28f/0x330 [xfs]
[ 2702.687279]  down_read_nested+0x9d/0xa0
[ 2702.688431]  xfs_ilock+0x28f/0x330 [xfs]
[ 2702.689521]  xfs_file_dio_aio_write+0xa8/0x470 [xfs]
[ 2702.690735]  xfs_file_write_iter+0x7b/0xb0 [xfs]
[ 2702.691848]  aio_write+0x133/0x1c0
[ 2702.692679]  ? lock_acquire+0xab/0x200
[ 2702.693565]  ? __might_fault+0x36/0x80
[ 2702.694557]  ? do_io_submit+0x41b/0x8d0
[ 2702.695626]  do_io_submit+0x41b/0x8d0
[ 2702.696461]  ? do_syscall_64+0x56/0x180
[ 2702.697327]  ? do_io_submit+0x8d0/0x8d0
[ 2702.698228]  do_syscall_64+0x56/0x180
[ 2702.699109]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 2702.700224] RIP: 0033:0x7ff4c1c57697
[ 2702.701060] RSP: 002b:00007fffa13c9348 EFLAGS: 00000212 ORIG_RAX: 00000000000000d1
[ 2702.703147] RAX: ffffffffffffffda RBX: 0000000000a0a440 RCX: 00007ff4c1c57697
[ 2702.704913] RDX: 0000000000a08320 RSI: 0000000000000008 RDI: 00007ff4a4ef4000
[ 2702.706747] RBP: 0000000000000000 R08: 0000000000000008 R09: 0000000000a097e0
[ 2702.708506] R10: 00000000000001f0 R11: 0000000000000212 R12: 00007ff4a6fd4a40
[ 2702.710047] R13: 0000000000a0a480 R14: 0000000000000000 R15: 00007ff4a6fd4a48
[ 2702.711685] INFO: lockdep is turned off.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 01/33] block: add a lower-level bio_add_page interface
  2018-05-10  6:40     ` Christoph Hellwig
@ 2018-05-10 21:49       ` Andreas Dilger
  2018-05-11  6:29           ` Christoph Hellwig
  0 siblings, 1 reply; 66+ messages in thread
From: Andreas Dilger @ 2018-05-10 21:49 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Matthew Wilcox, linux-xfs, linux-fsdevel, linux-block, linux-mm

[-- Attachment #1: Type: text/plain, Size: 1059 bytes --]

On May 10, 2018, at 12:40 AM, Christoph Hellwig <hch@lst.de> wrote:
> 
> On Wed, May 09, 2018 at 08:12:43AM -0700, Matthew Wilcox wrote:
>> (page, len, off) is a bit weird to me.  Usually we do (page, off, len).
> 
> That's what I'd usually do, too.  But this odd convention is what
> bio_add_page uses, so I decided to stick to that instead of having two
> different conventions in one family of functions.

Would it make sense to change the bio_add_page() and bio_add_pc_page()
to use the more common convention instead of continuing the spread of
this non-standard calling convention?  This is doubly problematic since
"off" and "len" are both unsigned int values so it is easy to get them
mixed up, and just reordering the bio_add_page() arguments would not
generate any errors.
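
For example, both of these calls compile cleanly, even though the second
one is silently wrong (illustrative fragment, not real callers):

	bio_add_page(bio, page, len, off);	/* correct: len, then off */
	bio_add_page(bio, page, off, len);	/* compiles fine, corrupts the bvec */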

One option would be to rename this function bio_page_add() so that any
unconverted callers generate build errors, or to first add bio_page_add(),
mark bio_add_page() deprecated, and allow a short transition period.
There are about 50 uses under drivers/ and 50 uses under fs/.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: stop using buffer heads in xfs and iomap
  2018-05-10 15:13 ` stop using buffer heads in xfs and iomap Darrick J. Wong
@ 2018-05-11  6:22   ` Christoph Hellwig
  2018-05-11  6:39     ` Darrick J. Wong
  0 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-11  6:22 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Thu, May 10, 2018 at 08:13:03AM -0700, Darrick J. Wong wrote:
> I ran xfstests on this for fun last night but hung in g/095:
> 
> FSTYP         -- xfs (debug)
> PLATFORM      -- Linux/x86_64 submarine-djwong-mtr01 4.17.0-rc4-djw
> MKFS_OPTIONS  -- -f -m reflink=1,rmapbt=1, -i sparse=1, -b size=1024, /dev/sdf
> MOUNT_OPTIONS -- /dev/sdf /opt
> 
> FWIW the stock v4 and the 'v5 with everything and 4k blocks' vms
> passed, so I guess there's a bug somewhere in the sub-page block size
> code paths...

I haven't seen that in my -b size=1024 -m reflink=1 testing.  I'll try
your exact setup above, too.  Is this a disk or an SSD?  How much memory
and how many CPUs?

Btw, I think the series might be worthwhile even if we have to delay
the sub-page blocksize support - the blocksize == pagesize code is
basically entirely separate and already very useful.  Only the last
three patches contain the small blocksize support, without that we'll
just continue using buffer heads for that case for now.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 01/33] block: add a lower-level bio_add_page interface
  2018-05-10  8:52   ` Ming Lei
@ 2018-05-11  6:24     ` Christoph Hellwig
  0 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-11  6:24 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, open list:XFS FILESYSTEM, Linux FS Devel,
	linux-block, linux-mm

On Thu, May 10, 2018 at 04:52:00PM +0800, Ming Lei wrote:
> On Wed, May 9, 2018 at 3:47 PM, Christoph Hellwig <hch@lst.de> wrote:
> > For the upcoming removal of buffer heads in XFS we need to keep track of
> > the number of outstanding writeback requests per page.  For this we need
> > to know if bio_add_page merged a region with the previous bvec or not.
> > Instead of adding additional arguments this refactors bio_add_page to
> > be implemented using three lower level helpers which users like XFS can
> > use directly if they care about the merge decisions.
> 
> The merge policy may be transparent to the fs, for example with
> multipage bvecs.

The whole point of this series is to make it explicit.  That will have to
be carried over into a multipage bvec world.  That means the current
__bio_try_merge_page will have to remain as-is in that new world order,
but we'd also add a new __bio_try_merge_segment which merges beyond the
page.  For the iomap and xfs code we'd first call __bio_try_merge_page;
if that fails, increment the read/write count, and only after that
call __bio_try_merge_segment.
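
A minimal sketch of that call order, where __bio_try_merge_segment is
still hypothetical and assumed to take the same arguments as
__bio_try_merge_page:

static void iomap_add_page_to_bio(struct bio *bio, struct page *page,
		unsigned int len, unsigned int off, atomic_t *io_count)
{
	/* contiguous with the last bvec, inside the same page? */
	if (__bio_try_merge_page(bio, page, len, off))
		return;

	/* a new page goes under I/O, so account for it first */
	atomic_inc(io_count);

	/* then try a cross-page merge, else start a new segment */
	if (!__bio_try_merge_segment(bio, page, len, off))
		__bio_add_page(bio, page, len, off);
}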

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 10/33] iomap: add an iomap-based bmap implementation
  2018-05-10 15:08       ` Darrick J. Wong
@ 2018-05-11  6:25         ` Christoph Hellwig
  2018-05-12  1:56           ` Darrick J. Wong
  0 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-11  6:25 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Thu, May 10, 2018 at 08:08:38AM -0700, Darrick J. Wong wrote:
> > > > +	sector_t *bno = data;
> > > > +
> > > > +	if (iomap->type == IOMAP_MAPPED)
> > > > +		*bno = (iomap->addr + pos - iomap->offset) >> inode->i_blkbits;
> > > 
> > > Does this need to be careful w.r.t. overflow on systems where sector_t
> > > is a 32-bit unsigned long?
> > > 
> > > Also, ioctl_fibmap() typecasts the returned sector_t to an int, which
> > > also seems broken.  I agree the interface needs to die, but ioctls take
> > > a long time to deprecate.
> > 
> > Not much we can do about the interface.
> 
> Yes, the interface is fubar, but if file /foo maps to block 8589934720
> then do we return the truncated result 128?

Then we'll get a corrupt result.  What do you think we could do here,
either in the old or the new code?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 01/33] block: add a lower-level bio_add_page interface
  2018-05-10 21:49       ` Andreas Dilger
@ 2018-05-11  6:29           ` Christoph Hellwig
  0 siblings, 0 replies; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-11  6:29 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Christoph Hellwig, Matthew Wilcox, linux-xfs, linux-fsdevel,
	linux-block, linux-mm, axboe

On Thu, May 10, 2018 at 03:49:53PM -0600, Andreas Dilger wrote:
> Would it make sense to change the bio_add_page() and bio_add_pc_page()
> to use the more common convention instead of continuing the spread of
> this non-standard calling convention?  This is doubly problematic since
> "off" and "len" are both unsigned int values so it is easy to get them
> mixed up, and just reordering the bio_add_page() arguments would not
> generate any errors.

We have more than hundred callers.  I don't think we want to create
so much churn just to clean things up a bit without any measurable
benefit.  And even if you want to clean it up I'd rather keep it
away from my iomap/xfs buffered I/O series :)

> One option would be to rename this function bio_page_add() so that any
> unconverted callers generate build errors, or to first add bio_page_add(),
> mark bio_add_page() deprecated, and allow a short transition period.
> There are about 50 uses under drivers/ and 50 uses under fs/.

If you think the churn is worthwhile send a separate series for that.
My two new functions should have very few callers even by then, so
feel free to just update them as well.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: stop using buffer heads in xfs and iomap
  2018-05-11  6:22   ` Christoph Hellwig
@ 2018-05-11  6:39     ` Darrick J. Wong
  0 siblings, 0 replies; 66+ messages in thread
From: Darrick J. Wong @ 2018-05-11  6:39 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Fri, May 11, 2018 at 08:22:08AM +0200, Christoph Hellwig wrote:
> On Thu, May 10, 2018 at 08:13:03AM -0700, Darrick J. Wong wrote:
> > I ran xfstests on this for fun last night but hung in g/095:
> > 
> > FSTYP         -- xfs (debug)
> > PLATFORM      -- Linux/x86_64 submarine-djwong-mtr01 4.17.0-rc4-djw
> > MKFS_OPTIONS  -- -f -m reflink=1,rmapbt=1, -i sparse=1, -b size=1024, /dev/sdf
> > MOUNT_OPTIONS -- /dev/sdf /opt
> > 
> > FWIW the stock v4 and the 'v5 with everything and 4k blocks' vms
> > passed, so I guess there's a bug somewhere in the sub-page block size
> > code paths...
> 
> I haven't seen that in my -b size=1024 -m reflink=1 testing.  I'll try
> your exact setup above, too.  Is this a disk or an SSD?  How much memory
> and how many CPUs?

4 CPUs in a VM on a Nehalem-era machine, 2GB RAM, two 2.3GB virtio-scsi
disks...

...the VM host itself is a quad-core Nehalem, 24G RAM, atop an ext4 fs
on spinning rust in a raid1.

> Btw, I think the series might be worthwhile even if we have to delay
> the sub-page blocksize support - the blocksize == pagesize code is
> basically entirely separate and already very useful.  Only the last
> three patches contain the small blocksize support, without that we'll
> just continue using buffer heads for that case for now.

<shrug> I'll keep reading, it seemed generally ok until I hit the
sub-page part and my eyes glazed over. :)

--D

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 10/33] iomap: add an iomap-based bmap implementation
  2018-05-11  6:25         ` Christoph Hellwig
@ 2018-05-12  1:56           ` Darrick J. Wong
  0 siblings, 0 replies; 66+ messages in thread
From: Darrick J. Wong @ 2018-05-12  1:56 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm

On Fri, May 11, 2018 at 08:25:27AM +0200, Christoph Hellwig wrote:
> On Thu, May 10, 2018 at 08:08:38AM -0700, Darrick J. Wong wrote:
> > > > > +	sector_t *bno = data;
> > > > > +
> > > > > +	if (iomap->type == IOMAP_MAPPED)
> > > > > +		*bno = (iomap->addr + pos - iomap->offset) >> inode->i_blkbits;
> > > > 
> > > > Does this need to be careful w.r.t. overflow on systems where sector_t
> > > > is a 32-bit unsigned long?
> > > > 
> > > > Also, ioctl_fibmap() typecasts the returned sector_t to an int, which
> > > > also seems broken.  I agree the interface needs to die, but ioctls take
> > > > a long time to deprecate.
> > > 
> > > Not much we can do about the interface.
> > 
> > Yes, the interface is fubar, but if file /foo maps to block 8589934720
> > then do we return the truncated result 128?
> 
> Then we'll get a corrupt result.  What do you think we could do here,
> either in the old or the new code?

I think the only thing we /can/ do is figure out if we'd be truncating
the result, dump a warning to the kernel log, and return 0, because we
don't want smartypants FIBMAP callers to be using crap block pointers.

Something like this for the bmap implementation...

uint64_t mapping = iomap->addr;

#ifndef CONFIG_LBDAF
/* without CONFIG_LBDAF, sector_t is a 32-bit unsigned long */
if (mapping > ULONG_MAX) {
	/* Do not truncate results. */
	return 0;
}
#endif

...and in the bmap ioctl...

sector_t mapping = ...;

if (mapping > INT_MAX) {
	WARN(1, "would truncate bmap result, go fix your stupid program");
	return 0;
}

--D

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 31/33] iomap: add support for sub-pagesize buffered I/O without buffer heads
  2018-05-09  7:48 ` [PATCH 31/33] iomap: add support for sub-pagesize buffered I/O " Christoph Hellwig
@ 2018-05-14 16:00   ` Goldwyn Rodrigues
  2018-05-15  7:26     ` Christoph Hellwig
  0 siblings, 1 reply; 66+ messages in thread
From: Goldwyn Rodrigues @ 2018-05-14 16:00 UTC (permalink / raw)
  To: Christoph Hellwig, linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm



On 05/09/2018 02:48 AM, Christoph Hellwig wrote:
> After already supporting a simple implementation of buffered writes for
> the blocksize == PAGE_SIZE case in the last commit this adds full support
> even for smaller block sizes.   There are three bits of per-block
> information in the buffer_head structure that really matter for the iomap
> read and write path:
> 
>  - uptodate status (BH_uptodate)
>  - marked as currently under read I/O (BH_Async_Read)
>  - marked as currently under write I/O (BH_Async_Write)
> 
> Instead of having new per-block structures this now adds a per-page
> structure called struct iomap_page to track this information in a slightly
> different form:
> 
>  - a bitmap for the per-block uptodate status.  For worst case of a 64k
>    page size system this bitmap needs to contain 128 bits.  For the
>    typical 4k page size case it only needs 8 bits, although we still
>    need a full unsigned long due to the way the atomic bitmap API works.
>  - two atomic_t counters are used to track the outstanding read and write
>    counts
> 
> There is quite a bit of boilerplate code as the buffered I/O path uses
> various helper methods, but the actual code is very straightforward.
> 
> In this commit the code can't actually be used yet, as we need to
> switch from the old implementation to the new one together with the
> XFS writeback code.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap.c            | 262 +++++++++++++++++++++++++++++++++++++-----
>  include/linux/iomap.h |  32 ++++++
>  2 files changed, 264 insertions(+), 30 deletions(-)
> 
> diff --git a/fs/iomap.c b/fs/iomap.c
> index a3861945504f..4e7ac6aa88ef 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -17,6 +17,7 @@
>  #include <linux/iomap.h>
>  #include <linux/uaccess.h>
>  #include <linux/gfp.h>
> +#include <linux/migrate.h>
>  #include <linux/mm.h>
>  #include <linux/mm_inline.h>
>  #include <linux/swap.h>
> @@ -109,6 +110,107 @@ iomap_block_needs_zeroing(struct inode *inode, loff_t pos, struct iomap *iomap)
>         return iomap->type != IOMAP_MAPPED || pos > i_size_read(inode);
>  }
>  
> +static struct iomap_page *
> +iomap_page_create(struct inode *inode, struct page *page)
> +{
> +	struct iomap_page *iop = to_iomap_page(page);
> +
> +	if (iop || i_blocksize(inode) == PAGE_SIZE)
> +		return iop;

Why is this an equality comparison? Shouldn't this be >= to
include filesystem block sizes greater than PAGE_SIZE?
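
(For context: per the commit message above, the per-page state this
patch adds boils down to roughly the following.  This is a sketch
reconstructed from that description -- the field names are guesses,
since the include/linux/iomap.h hunk isn't quoted here.)

struct iomap_page {
	atomic_t	read_count;	/* outstanding reads on this page */
	atomic_t	write_count;	/* outstanding writes on this page */
	/* one uptodate bit per block: 64k / 512 = 128 bits worst case */
	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
};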

-- 
Goldwyn

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 31/33] iomap: add support for sub-pagesize buffered I/O without buffer heads
  2018-05-14 16:00   ` Goldwyn Rodrigues
@ 2018-05-15  7:26     ` Christoph Hellwig
  2018-05-15 13:47       ` Goldwyn Rodrigues
  0 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-15  7:26 UTC (permalink / raw)
  To: Goldwyn Rodrigues
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Mon, May 14, 2018 at 11:00:08AM -0500, Goldwyn Rodrigues wrote:
> > +	if (iop || i_blocksize(inode) == PAGE_SIZE)
> > +		return iop;
> 
> Why is this an equality comparison? Shouldn't this be >= to
> include filesystem block sizes greater than PAGE_SIZE?

Which filesystems would that be that have tested and working
blocksize > PAGE_SIZE support using iomap?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 31/33] iomap: add support for sub-pagesize buffered I/O without buffer heads
  2018-05-15  7:26     ` Christoph Hellwig
@ 2018-05-15 13:47       ` Goldwyn Rodrigues
  2018-05-16  5:46         ` Dave Chinner
  0 siblings, 1 reply; 66+ messages in thread
From: Goldwyn Rodrigues @ 2018-05-15 13:47 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm



On 05/15/2018 02:26 AM, Christoph Hellwig wrote:
> On Mon, May 14, 2018 at 11:00:08AM -0500, Goldwyn Rodrigues wrote:
>>> +	if (iop || i_blocksize(inode) == PAGE_SIZE)
>>> +		return iop;
>>
>> Why is this an equality comparison? Shouldn't this be >= to
>> include filesystem block sizes greater than PAGE_SIZE?
> 
> Which filesystems would that be that have tested and working
> blocksize > PAGE_SIZE support using iomap?

Oh, I assumed iomap would work for filesystems with block size greater
than PAGE_SIZE.

-- 
Goldwyn

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 01/33] block: add a lower-level bio_add_page interface
  2018-05-11  6:29           ` Christoph Hellwig
@ 2018-05-15 16:47             ` Jens Axboe
  -1 siblings, 0 replies; 66+ messages in thread
From: Jens Axboe @ 2018-05-15 16:47 UTC (permalink / raw)
  To: Christoph Hellwig, Andreas Dilger
  Cc: Matthew Wilcox, linux-xfs, linux-fsdevel, linux-block, linux-mm

On 5/11/18 12:29 AM, Christoph Hellwig wrote:
> On Thu, May 10, 2018 at 03:49:53PM -0600, Andreas Dilger wrote:
>> Would it make sense to change the bio_add_page() and bio_add_pc_page()
>> to use the more common convention instead of continuing the spread of
>> this non-standard calling convention?  This is doubly problematic since
>> "off" and "len" are both unsigned int values so it is easy to get them
>> mixed up, and just reordering the bio_add_page() arguments would not
>> generate any errors.
> 
> We have more than a hundred callers.  I don't think we want to create
> so much churn just to clean things up a bit without any measurable
> benefit.  And even if you want to clean it up I'd rather keep it
> away from my iomap/xfs buffered I/O series :)

Yeah, let's not do that - I know someone that always gets really grumpy
when changes like that are made. So given that, I think we should retain
the argument order that we already have for __bio_try_merge_page()
as well.
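
To make the hazard concrete (hypothetical caller; bio and page are
assumed set up elsewhere -- because len and off are both unsigned int,
the swapped call compiles without a peep):

	unsigned int len = 512, off = 1024;

	bio_add_page(bio, page, len, off);	/* intended: (page, len, offset) */
	bio_add_page(bio, page, off, len);	/* swapped: compiles cleanly, does the wrong I/O */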

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 01/33] block: add a lower-level bio_add_page interface
  2018-05-09  7:47 ` [PATCH 01/33] block: add a lower-level bio_add_page interface Christoph Hellwig
  2018-05-09 15:12   ` Matthew Wilcox
  2018-05-10  8:52   ` Ming Lei
@ 2018-05-16  5:06   ` Ritesh Harjani
  2018-05-16 18:05     ` Christoph Hellwig
  2 siblings, 1 reply; 66+ messages in thread
From: Ritesh Harjani @ 2018-05-16  5:06 UTC (permalink / raw)
  To: Christoph Hellwig, linux-xfs; +Cc: linux-fsdevel, linux-block, linux-mm



On 5/9/2018 1:17 PM, Christoph Hellwig wrote:
> For the upcoming removal of buffer heads in XFS we need to keep track of
> the number of outstanding writeback requests per page.  For this we need
> to know if bio_add_page merged a region with the previous bvec or not.
> Instead of adding additional arguments this refactors bio_add_page to
> be implemented using three lower level helpers which users like XFS can
> use directly if they care about the merge decisions.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   block/bio.c         | 87 ++++++++++++++++++++++++++++++---------------
>   include/linux/bio.h |  9 +++++
>   2 files changed, 67 insertions(+), 29 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 53e0f0a1ed94..6ceba6adbf42 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -773,7 +773,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
>   			return 0;
>   	}
>   
> -	if (bio->bi_vcnt >= bio->bi_max_vecs)
> +	if (bio_full(bio))
>   		return 0;
>   
>   	/*
> @@ -820,6 +820,59 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
>   }
>   EXPORT_SYMBOL(bio_add_pc_page);
>   
> +/**
> + * __bio_try_merge_page - try adding data to an existing bvec
> + * @bio: destination bio
> + * @page: page to add
> + * @len: length of the range to add
> + * @off: offset into @page
> + *
> + * Try adding the data described at @page + @offset to the last bvec of @bio.
> + * Return %true on success or %false on failure.  This can happen frequently
> + * for file systems with a block size smaller than the page size.
> + */
> +bool __bio_try_merge_page(struct bio *bio, struct page *page,
> +		unsigned int len, unsigned int off)
> +{
> +	if (bio->bi_vcnt > 0) {
> +		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
> +
> +		if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
> +			bv->bv_len += len;
> +			bio->bi_iter.bi_size += len;
> +			return true;
> +		}
> +	}
> +	return false;
> +}
> +EXPORT_SYMBOL_GPL(__bio_try_merge_page);
> +
> +/**
> + * __bio_add_page - add page to a bio in a new segment
> + * @bio: destination bio
> + * @page: page to add
> + * @len: length of the range to add
> + * @off: offset into @page
> + *
> + * Add the data at @page + @offset to @bio as a new bvec.  The caller must
> + * ensure that @bio has space for another bvec.
> + */
> +void __bio_add_page(struct bio *bio, struct page *page,
> +		unsigned int len, unsigned int off)
> +{
> +	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
> +
> +	WARN_ON_ONCE(bio_full(bio));

Please correct my understanding here; I am still new to this.

1. If bio_full is true, that means there is no space left in bio->bi_io_vec[], no?
Then why do we still proceed with only a warning?
Originally bio_add_page would return after checking bio_full, and
callers can still call __bio_add_page directly, right?

2. Also, the bi_io_vec array allocated will only be up to bio->bi_max_vecs
in size, right?
I could not follow the bvec_alloc function very well,
mainly when nr_iovecs > inline_vecs. So how and where is it ensured
that we get _nr_iovecs_ allocated from the bvec pool?

Hmm, tricky. Please help me understand this.
1. So we have slabs of different sizes defined in bvec_slabs,
and when an allocation request for nr_iovecs comes in
we try to grab the appropriately sized slab from bvec_slabs
and return it. If that slab allocation does not succeed,
we fall back to mempool_alloc.

2. If the above is correct, why don't we set bio->bi_max_vecs to the size
of the slab instead of keeping it at the nr_iovecs the user requested?
(in bio_alloc_bioset)


3. Could you please help me understand why we still allow __bio_add_page
to work on a cloned bio? Why not WARN and return as in the original code?

4. OK, I see that in patch 32 you first check bio_full and then call
xfs_chain_bio. But there, too, I think you make sure that the new
ioend->io_bio is the chained bio, which is not full.

Apologies if the above doesn't make sense.

> +
> +	bv->bv_page = page;
> +	bv->bv_offset = off;
> +	bv->bv_len = len;
> +
> +	bio->bi_iter.bi_size += len;
> +	bio->bi_vcnt++;
> +}
> +EXPORT_SYMBOL_GPL(__bio_add_page);
> +
>   /**
>    *	bio_add_page	-	attempt to add page to bio
>    *	@bio: destination bio
> @@ -833,40 +886,16 @@ EXPORT_SYMBOL(bio_add_pc_page);
>   int bio_add_page(struct bio *bio, struct page *page,
>   		 unsigned int len, unsigned int offset)
>   {
> -	struct bio_vec *bv;
> -
>   	/*
>   	 * cloned bio must not modify vec list
>   	 */
>   	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
>   		return 0;
> -
> -	/*
> -	 * For filesystems with a blocksize smaller than the pagesize
> -	 * we will often be called with the same page as last time and
> -	 * a consecutive offset.  Optimize this special case.
> -	 */
> -	if (bio->bi_vcnt > 0) {
> -		bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
> -
> -		if (page == bv->bv_page &&
> -		    offset == bv->bv_offset + bv->bv_len) {
> -			bv->bv_len += len;
> -			goto done;
> -		}
> +	if (!__bio_try_merge_page(bio, page, len, offset)) {
> +		if (bio_full(bio))
> +			return 0;
> +		__bio_add_page(bio, page, len, offset);
>   	}
> -
> -	if (bio->bi_vcnt >= bio->bi_max_vecs)
> -		return 0;
Originally here we were supposed to return and not proceed further.
Should __bio_add_page not have similar checks to safeguard against
crossing the bi_io_vec[] boundary?


> -
> -	bv		= &bio->bi_io_vec[bio->bi_vcnt];
> -	bv->bv_page	= page;
> -	bv->bv_len	= len;
> -	bv->bv_offset	= offset;
> -
> -	bio->bi_vcnt++;
> -done:
> -	bio->bi_iter.bi_size += len;
>   	return len;
>   }
>   EXPORT_SYMBOL(bio_add_page);
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index ce547a25e8ae..3e73c8bc25ea 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -123,6 +123,11 @@ static inline void *bio_data(struct bio *bio)
>   	return NULL;
>   }
>   
> +static inline bool bio_full(struct bio *bio)
> +{
> +	return bio->bi_vcnt >= bio->bi_max_vecs;
> +}
> +
>   /*
>    * will die
>    */
> @@ -470,6 +475,10 @@ void bio_chain(struct bio *, struct bio *);
>   extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
>   extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
>   			   unsigned int, unsigned int);
> +bool __bio_try_merge_page(struct bio *bio, struct page *page,
> +		unsigned int len, unsigned int off);
> +void __bio_add_page(struct bio *bio, struct page *page,
> +		unsigned int len, unsigned int off);
>   int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
>   struct rq_map_data;
>   extern struct bio *bio_map_user_iov(struct request_queue *,
> 

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, 
Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a 
Linux Foundation Collaborative Project.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 31/33] iomap: add support for sub-pagesize buffered I/O without buffer heads
  2018-05-15 13:47       ` Goldwyn Rodrigues
@ 2018-05-16  5:46         ` Dave Chinner
  0 siblings, 0 replies; 66+ messages in thread
From: Dave Chinner @ 2018-05-16  5:46 UTC (permalink / raw)
  To: Goldwyn Rodrigues
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Tue, May 15, 2018 at 08:47:25AM -0500, Goldwyn Rodrigues wrote:
> On 05/15/2018 02:26 AM, Christoph Hellwig wrote:
> > On Mon, May 14, 2018 at 11:00:08AM -0500, Goldwyn Rodrigues wrote:
> >>> +	if (iop || i_blocksize(inode) == PAGE_SIZE)
> >>> +		return iop;
> >>
> >> Why is this an equality comparison? Shouldn't this be >= to
> >> include filesystem block sizes greater than PAGE_SIZE?
> > 
> > Which filesystems would that be that have tested and working
> > blocksize > PAGE_SIZE support using iomap?
> 
> Oh, I assumed iomap would work for filesystems with block size greater
> than PAGE_SIZE.

It will eventually, but first we've got to remove the iomap
infrastructure's and the filesystems' dependencies on bufferheads....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 01/33] block: add a lower-level bio_add_page interface
  2018-05-16  5:06   ` Ritesh Harjani
@ 2018-05-16 18:05     ` Christoph Hellwig
  2018-05-17  4:18       ` Ritesh Harjani
  0 siblings, 1 reply; 66+ messages in thread
From: Christoph Hellwig @ 2018-05-16 18:05 UTC (permalink / raw)
  To: Ritesh Harjani
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, linux-block, linux-mm

On Wed, May 16, 2018 at 10:36:14AM +0530, Ritesh Harjani wrote:
> 1. If bio_full is true, that means there is no space left in bio->bi_io_vec[], no?
> Then why do we still proceed with only a warning?
> Originally bio_add_page would return after checking bio_full, and
> callers can still call __bio_add_page directly, right?

If you don't know whether the bio is full or not, don't use __bio_add_page;
keep using bio_add_page.  The WARN_ON is just a debug tool to catch
cases where the developer used it incorrectly.

> 2. If the above is correct, why don't we set bio->bi_max_vecs to the size
> of the slab instead of keeping it at the nr_iovecs the user requested?
> (in bio_alloc_bioset)

Because we limit the user to the number that the user requested.  Not
that this patch changes anything about that.
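
That limit is applied in bio_alloc_bioset().  Roughly, and from memory
of the current code (so double-check the source):

	/* bio_alloc_bioset(), abridged: the bvec array may come from a
	 * larger slab, but the bio is capped at the caller's request. */
	bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
	...
	bio->bi_max_vecs = nr_iovecs;
	bio->bi_io_vec = bvl;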

> 3. Could you please help me understand why we still allow __bio_add_page
> to work on a cloned bio? Why not WARN and return as in the original code?

It doesn't work, and I have now added the WARN_ON to deal with any
incorrect usage.

>> -	if (bio->bi_vcnt >= bio->bi_max_vecs)
>> -		return 0;
> Originally here we were supposed to return and not proceed further.
> Should __bio_add_page not have similar checks to safeguard against
> crossing the bi_io_vec[] boundary?

No, __bio_add_page is the "I know what I am doing" interface.
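
To spell that out, a minimal sketch of the intended calling pattern
(add_page_checked is a made-up name; this just mirrors how
bio_add_page() itself is built from the new helpers in this patch):

static bool add_page_checked(struct bio *bio, struct page *page,
		unsigned int len, unsigned int off)
{
	/* Fast path: extend the previous bvec if contiguous. */
	if (__bio_try_merge_page(bio, page, len, off))
		return true;
	/* The caller, not __bio_add_page(), checks for space. */
	if (bio_full(bio))
		return false;	/* submit or chain the bio, then retry */
	__bio_add_page(bio, page, len, off);
	return true;
}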

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 01/33] block: add a lower-level bio_add_page interface
  2018-05-16 18:05     ` Christoph Hellwig
@ 2018-05-17  4:18       ` Ritesh Harjani
  0 siblings, 0 replies; 66+ messages in thread
From: Ritesh Harjani @ 2018-05-17  4:18 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, linux-block, linux-mm



On 5/16/2018 11:35 PM, Christoph Hellwig wrote:
> On Wed, May 16, 2018 at 10:36:14AM +0530, Ritesh Harjani wrote:
>> 1. If bio_full is true, that means there is no space left in bio->bi_io_vec[], no?
>> Then why do we still proceed with only a warning?
>> Originally bio_add_page would return after checking bio_full, and
>> callers can still call __bio_add_page directly, right?
> 
> If you don't know whether the bio is full or not, don't use __bio_add_page;
> keep using bio_add_page.  The WARN_ON is just a debug tool to catch
> cases where the developer used it incorrectly.
> 
>> 2. If the above is correct, why don't we set bio->bi_max_vecs to the size
>> of the slab instead of keeping it at the nr_iovecs the user requested?
>> (in bio_alloc_bioset)
> 
> Because we limit the user to the number that the user requested.  Not
> that this patch changes anything about that.
> 
>> 3. Could you please help me understand why we still allow __bio_add_page
>> to work on a cloned bio? Why not WARN and return as in the original code?
> 
> It doesn't work, and I have now added the WARN_ON to deal with any
> incorrect usage.
> 
>>> -	if (bio->bi_vcnt >= bio->bi_max_vecs)
>>> -		return 0;
>> Originally here we were supposed to return and not proceed further.
>> Should __bio_add_page not have similar checks to safeguard against
>> crossing the bi_io_vec[] boundary?
> 
> No, __bio_add_page is the "I know what I am doing" interface.
> 

Thanks for explaining. I guess I missed the comment at the top of
__bio_add_page:
"The caller must ensure that @bio has space for another bvec"

This discussion helped me understand bios and bio_vecs a bit better.

Thanks!!
Ritesh


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, 
Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a 
Linux Foundation Collaborative Project.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 04/33] fs: remove the buffer_unwritten check in page_seek_hole_data
  2018-05-09  7:48 ` [PATCH 04/33] fs: remove the buffer_unwritten check in page_seek_hole_data Christoph Hellwig
@ 2018-05-17 11:33   ` Andreas Grünbacher
  0 siblings, 0 replies; 66+ messages in thread
From: Andreas Grünbacher @ 2018-05-17 11:33 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-xfs, Linux FS-devel Mailing List, linux-block, linux-mm

2018-05-09 9:48 GMT+02:00 Christoph Hellwig <hch@lst.de>:
> We only call into this function through the iomap iterators, so we already
> know the buffer is unwritten.  In addition to that we always require the
> uptodate flag that is ORed with the result anyway.

Please update the page_cache_seek_hole_data description as well:

--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -647,8 +647,8 @@
  * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
  *
  * Within unwritten extents, the page cache determines which parts are holes
- * and which are data: unwritten and uptodate buffer heads count as data;
- * everything else counts as a hole.
+ * and which are data: uptodate buffer heads count as data; everything else
+ * counts as a hole.
  *
  * Returns the resulting offset on successs, and -ENOENT otherwise.
  */

Thanks,
Andreas

^ permalink raw reply	[flat|nested] 66+ messages in thread

end of thread, other threads:[~2018-05-17 11:33 UTC | newest]

Thread overview: 66+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-09  7:47 stop using buffer heads in xfs and iomap Christoph Hellwig
2018-05-09  7:47 ` [PATCH 01/33] block: add a lower-level bio_add_page interface Christoph Hellwig
2018-05-09 15:12   ` Matthew Wilcox
2018-05-10  6:40     ` Christoph Hellwig
2018-05-10 21:49       ` Andreas Dilger
2018-05-11  6:29         ` Christoph Hellwig
2018-05-11  6:29           ` Christoph Hellwig
2018-05-15 16:47           ` Jens Axboe
2018-05-15 16:47             ` Jens Axboe
2018-05-10  8:52   ` Ming Lei
2018-05-11  6:24     ` Christoph Hellwig
2018-05-16  5:06   ` Ritesh Harjani
2018-05-16 18:05     ` Christoph Hellwig
2018-05-17  4:18       ` Ritesh Harjani
2018-05-09  7:47 ` [PATCH 02/33] fs: factor out a __generic_write_end helper Christoph Hellwig
2018-05-09 15:15   ` Matthew Wilcox
2018-05-10  6:40     ` Christoph Hellwig
2018-05-09  7:48 ` [PATCH 03/33] fs: move page_cache_seek_hole_data to iomap.c Christoph Hellwig
2018-05-09  7:48 ` [PATCH 04/33] fs: remove the buffer_unwritten check in page_seek_hole_data Christoph Hellwig
2018-05-17 11:33   ` Andreas Grünbacher
2018-05-09  7:48 ` [PATCH 05/33] fs: use ->is_partially_uptodate in page_cache_seek_hole_data Christoph Hellwig
2018-05-09  7:48 ` [PATCH 06/33] mm: give the 'ret' variable a better name __do_page_cache_readahead Christoph Hellwig
2018-05-09 15:45   ` Matthew Wilcox
2018-05-10  6:41     ` Christoph Hellwig
2018-05-09  7:48 ` [PATCH 07/33] mm: split ->readpages calls to avoid non-contiguous pages lists Christoph Hellwig
2018-05-09 15:46   ` Matthew Wilcox
2018-05-09  7:48 ` [PATCH 08/33] iomap: use __bio_add_page in iomap_dio_zero Christoph Hellwig
2018-05-09  7:48 ` [PATCH 09/33] iomap: add a iomap_sector helper Christoph Hellwig
2018-05-09  7:48 ` [PATCH 10/33] iomap: add an iomap-based bmap implementation Christoph Hellwig
2018-05-09 16:46   ` Darrick J. Wong
2018-05-10  6:42     ` Christoph Hellwig
2018-05-10 15:08       ` Darrick J. Wong
2018-05-11  6:25         ` Christoph Hellwig
2018-05-12  1:56           ` Darrick J. Wong
2018-05-09  7:48 ` [PATCH 11/33] iomap: add an iomap-based readpage and readpages implementation Christoph Hellwig
2018-05-10  1:17   ` Dave Chinner
2018-05-10  6:44     ` Christoph Hellwig
2018-05-09  7:48 ` [PATCH 12/33] xfs: use iomap_bmap Christoph Hellwig
2018-05-09  7:48 ` [PATCH 13/33] xfs: use iomap for blocksize == PAGE_SIZE readpage and readpages Christoph Hellwig
2018-05-09  7:48 ` [PATCH 14/33] xfs: simplify xfs_bmap_punch_delalloc_range Christoph Hellwig
2018-05-09  7:48 ` [PATCH 15/33] xfs: simplify xfs_aops_discard_page Christoph Hellwig
2018-05-09  7:48 ` [PATCH 16/33] xfs: move locking into xfs_bmap_punch_delalloc_range Christoph Hellwig
2018-05-09  7:48 ` [PATCH 17/33] xfs: make xfs_writepage_map extent map centric Christoph Hellwig
2018-05-09  7:48 ` [PATCH 18/33] xfs: remove the now unused XFS_BMAPI_IGSTATE flag Christoph Hellwig
2018-05-09  7:48 ` [PATCH 19/33] xfs: remove xfs_reflink_find_cow_mapping Christoph Hellwig
2018-05-09  7:48 ` [PATCH 20/33] xfs: remove xfs_reflink_trim_irec_to_next_cow Christoph Hellwig
2018-05-09  7:48 ` [PATCH 21/33] xfs: simplify xfs_map_blocks by using xfs_iext_lookup_extent directly Christoph Hellwig
2018-05-09  7:48 ` [PATCH 22/33] xfs: don't clear imap_valid for a non-uptodate buffers Christoph Hellwig
2018-05-09  7:48 ` [PATCH 23/33] xfs: remove the imap_valid flag Christoph Hellwig
2018-05-09  7:48 ` [PATCH 24/33] xfs: don't look at buffer heads in xfs_add_to_ioend Christoph Hellwig
2018-05-09  7:48 ` [PATCH 25/33] xfs: move all writeback buffer_head manipulation into xfs_map_at_offset Christoph Hellwig
2018-05-09  7:48 ` [PATCH 26/33] xfs: allow writeback on pages without buffer heads Christoph Hellwig
2018-05-09  7:48 ` [PATCH 27/33] xfs: remove xfs_start_page_writeback Christoph Hellwig
2018-05-09  7:48 ` [PATCH 28/33] xfs: refactor the tail of xfs_writepage_map Christoph Hellwig
2018-05-09  7:48 ` [PATCH 29/33] xfs: do not set the page uptodate in xfs_writepage_map Christoph Hellwig
2018-05-09  7:48 ` [PATCH 30/33] iomap: add initial support for writes without buffer heads Christoph Hellwig
2018-05-09  7:48 ` [PATCH 31/33] iomap: add support for sub-pagesize buffered I/O " Christoph Hellwig
2018-05-14 16:00   ` Goldwyn Rodrigues
2018-05-15  7:26     ` Christoph Hellwig
2018-05-15 13:47       ` Goldwyn Rodrigues
2018-05-16  5:46         ` Dave Chinner
2018-05-09  7:48 ` [PATCH 32/33] xfs: add support for sub-pagesize writeback without buffer_heads Christoph Hellwig
2018-05-09  7:48 ` [PATCH 33/33] fs: remove __block_write_begin and iomap_to_bh Christoph Hellwig
2018-05-10 15:13 ` stop using buffer heads in xfs and iomap Darrick J. Wong
2018-05-11  6:22   ` Christoph Hellwig
2018-05-11  6:39     ` Darrick J. Wong
