From: Christoph Hellwig <hch@lst.de>
To: linux-xfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Subject: [PATCH 23/24] iomap: add support for sub-pagesize buffered I/O without buffer heads
Date: Fri, 15 Jun 2018 15:02:08 +0200	[thread overview]
Message-ID: <20180615130209.1970-24-hch@lst.de> (raw)
In-Reply-To: <20180615130209.1970-1-hch@lst.de>

After adding a simple implementation of buffered writes for the
blocksize == PAGE_SIZE case in the previous commit, this adds full
support for smaller block sizes as well.  There are three bits of
per-block information in the buffer_head structure that really matter
for the iomap read and write path:

 - uptodate status (BH_Uptodate)
 - marked as currently under read I/O (BH_Async_Read)
 - marked as currently under write I/O (BH_Async_Write)
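
For reference, these map to the accessor helpers that BUFFER_FNS()
generates in include/linux/buffer_head.h; in the legacy path the
per-block checks look roughly like this (sketch only, not code from
this patch):

	buffer_uptodate(bh)	/* tests BH_Uptodate    */
	buffer_async_read(bh)	/* tests BH_Async_Read  */
	buffer_async_write(bh)	/* tests BH_Async_Write */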

Instead of having new per-block structures this now adds a per-page
structure called struct iomap_page to track this information in a slightly
different form:

 - a bitmap for the per-block uptodate status.  For the worst case of a
   64k page size system this bitmap needs to contain 128 bits.  For the
   typical 4k page size case it only needs 8 bits, although we still
   need a full unsigned long due to the way the atomic bitmap API works
   (see the sizing sketch below)
 - two atomic_t counters are used to track the outstanding read and write
   counts
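
To make the sizing concrete (a quick sanity check of the declaration
that appears in the include/linux/iomap.h hunk below):

	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
	/* 64k pages: 65536 / 512 = 128 bits -> two 64-bit words        */
	/*  4k pages:  4096 / 512 =   8 bits -> still one unsigned long */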

There is quite a bit of boilerplate code as the buffered I/O path uses
various helper methods, but the actual code is very straightforward.
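
The one subtle helper is iomap_adjust_read_range(); a worked example of
what it computes (illustrative numbers, not from the patch), assuming
1k blocks in a 4k page with blocks 0 and 3 already uptodate and the
caller asking for the whole page:

	/*
	 * start:                  poff = 0,    plen = 4096
	 * skip uptodate block 0:  poff = 1024, plen = 3072, *pos += 1024
	 * block 1 not uptodate:   leading scan stops
	 * trailing block 3 seen:  plen -= (3 - 3 + 1) * 1024
	 * result:                 poff = 1024, plen = 2048 (blocks 1 and 2)
	 */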

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c            | 262 ++++++++++++++++++++++++++++++++++++++----
 include/linux/iomap.h |  31 +++++
 2 files changed, 272 insertions(+), 21 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index a504077bb38f..c59d1922991d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -17,6 +17,7 @@
 #include <linux/iomap.h>
 #include <linux/uaccess.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
@@ -104,6 +105,121 @@ iomap_sector(struct iomap *iomap, loff_t pos)
 	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
 }
 
+static struct iomap_page *
+iomap_page_create(struct inode *inode, struct page *page)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (iop || i_blocksize(inode) == PAGE_SIZE)
+		return iop;
+
+	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
+	atomic_set(&iop->read_count, 0);
+	atomic_set(&iop->write_count, 0);
+	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+	set_page_private(page, (unsigned long)iop);
+	SetPagePrivate(page);
+	return iop;
+}
+
+static void
+iomap_page_release(struct page *page)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (!iop)
+		return;
+	WARN_ON_ONCE(atomic_read(&iop->read_count));
+	WARN_ON_ONCE(atomic_read(&iop->write_count));
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	kfree(iop);
+}
+
+/*
+ * Calculate the range inside the page that we actually need to read.
+ */
+static void
+iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
+		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
+{
+	unsigned poff = *pos & (PAGE_SIZE - 1);
+	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+
+	if (iop) {
+		unsigned block_size = i_blocksize(inode);
+		unsigned first = poff >> inode->i_blkbits;
+		unsigned last = (poff + plen - 1) >> inode->i_blkbits;
+		unsigned int i;
+
+		/* move forward for each leading block marked uptodate */
+		for (i = first; i <= last; i++) {
+			if (!test_bit(i, iop->uptodate))
+				break;
+			*pos += block_size;
+			poff += block_size;
+			plen -= block_size;
+		}
+
+		/* truncate len if we find any trailing uptodate block(s) */
+		for ( ; i <= last; i++) {
+			if (test_bit(i, iop->uptodate)) {
+				plen -= (last - i + 1) * block_size;
+				break;
+			}
+		}
+	}
+
+	*offp = poff;
+	*lenp = plen;
+}
+
+static void
+iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned first = off >> inode->i_blkbits;
+	unsigned last = (off + len - 1) >> inode->i_blkbits;
+	unsigned int i;
+	bool uptodate = true;
+
+	if (iop) {
+		for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
+			if (i >= first && i <= last)
+				set_bit(i, iop->uptodate);
+			else if (!test_bit(i, iop->uptodate))
+				uptodate = false;
+		}
+	}
+
+	if (uptodate && !PageError(page))
+		SetPageUptodate(page);
+}
+
+static void
+iomap_read_finish(struct iomap_page *iop, struct page *page)
+{
+	if (!iop || atomic_dec_and_test(&iop->read_count))
+		unlock_page(page);
+}
+
+static void
+iomap_read_page_end_io(struct bio_vec *bvec, int error)
+{
+	struct page *page = bvec->bv_page;
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (unlikely(error)) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
+	}
+
+	iomap_read_finish(iop, page);
+}
+
 static void
 iomap_read_inline_data(struct inode *inode, struct page *page,
 		struct iomap *iomap)
@@ -132,7 +248,7 @@ iomap_read_end_io(struct bio *bio)
 	int i;
 
 	bio_for_each_segment_all(bvec, bio, i)
-		page_endio(bvec->bv_page, false, error);
+		iomap_read_page_end_io(bvec, error);
 	bio_put(bio);
 }
 
@@ -150,18 +266,19 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 {
 	struct iomap_readpage_ctx *ctx = data;
 	struct page *page = ctx->cur_page;
-	unsigned poff = pos & (PAGE_SIZE - 1);
-	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+	struct iomap_page *iop = iomap_page_create(inode, page);
 	bool is_contig = false;
+	loff_t orig_pos = pos;
+	unsigned poff, plen;
 	sector_t sector;
 
-	/* we don't support blocksize < PAGE_SIZE quite yet. */
-	WARN_ON_ONCE(pos != page_offset(page));
-	WARN_ON_ONCE(plen != PAGE_SIZE);
+	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
+	if (plen == 0)
+		goto done;
 
 	if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
 		zero_user(page, poff, plen);
-		SetPageUptodate(page);
+		iomap_set_range_uptodate(page, poff, plen);
 		goto done;
 	}
 
@@ -177,6 +294,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		is_contig = true;
 	}
 
+	/*
+	 * If we start a new segment we need to increase the read count, and we
+	 * need to do so before submitting any previous full bio to make sure
+	 * that we don't prematurely unlock the page.
+	 */
+	if (iop)
+		atomic_inc(&iop->read_count);
+
 	if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
 		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
 		int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -197,7 +322,13 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 	__bio_add_page(ctx->bio, page, plen, poff);
 done:
-	return plen;
+	/*
+	 * Move the caller beyond our range so that it keeps making progress.
+	 * For that we have to include any leading non-uptodate ranges, but
+	 * we can skip trailing ones as they will be handled in the next
+	 * iteration.
+	 */
+	return pos - orig_pos + plen;
 }
 
 int
@@ -208,8 +339,6 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
 	unsigned poff;
 	loff_t ret;
 
-	WARN_ON_ONCE(page_has_buffers(page));
-
 	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
 		ret = iomap_apply(inode, page_offset(page) + poff,
 				PAGE_SIZE - poff, 0, ops, &ctx,
@@ -335,6 +464,84 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
 }
 EXPORT_SYMBOL_GPL(iomap_readpages);
 
+int
+iomap_is_partially_uptodate(struct page *page, unsigned long from,
+		unsigned long count)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned first = from >> inode->i_blkbits;
+	unsigned last = (from + count - 1) >> inode->i_blkbits;
+	unsigned i;
+
+	if (iop) {
+		for (i = first; i <= last; i++)
+			if (!test_bit(i, iop->uptodate))
+				return 0;
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
+
+int
+iomap_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	/*
+	 * mm accommodates an old ext3 case where clean pages might not have had
+	 * the dirty bit cleared. Thus, it can send actual dirty pages to
+	 * ->releasepage() via shrink_active_list(); skip those here.
+	 */
+	if (PageDirty(page) || PageWriteback(page))
+		return 0;
+	iomap_page_release(page);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(iomap_releasepage);
+
+void
+iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
+{
+	/*
+	 * If we are invalidating the entire page, clear the dirty state from it
+	 * and release it to avoid unnecessary buildup of the LRU.
+	 */
+	if (offset == 0 && len == PAGE_SIZE) {
+		WARN_ON_ONCE(PageWriteback(page));
+		cancel_dirty_page(page);
+		iomap_page_release(page);
+	}
+}
+EXPORT_SYMBOL_GPL(iomap_invalidatepage);
+
+#ifdef CONFIG_MIGRATION
+int
+iomap_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode)
+{
+	int ret;
+
+	ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+	if (ret != MIGRATEPAGE_SUCCESS)
+		return ret;
+
+	if (page_has_private(page)) {
+		ClearPagePrivate(page);
+		set_page_private(newpage, page_private(page));
+		set_page_private(page, 0);
+		SetPagePrivate(newpage);
+	}
+
+	if (mode != MIGRATE_SYNC_NO_COPY)
+		migrate_page_copy(newpage, page);
+	else
+		migrate_page_states(newpage, page);
+	return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(iomap_migrate_page);
+#endif /* CONFIG_MIGRATION */
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -358,6 +565,7 @@ iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
 
 	if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
 		zero_user_segments(page, poff, from, to, poff + plen);
+		iomap_set_range_uptodate(page, poff, plen);
 		return 0;
 	}
 
@@ -373,21 +581,33 @@ static int
 __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
 		struct page *page, struct iomap *iomap)
 {
+	struct iomap_page *iop = iomap_page_create(inode, page);
 	loff_t block_size = i_blocksize(inode);
 	loff_t block_start = pos & ~(block_size - 1);
 	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
-	unsigned poff = block_start & (PAGE_SIZE - 1);
-	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);
-	unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
-
-	WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+	unsigned from = pos & (PAGE_SIZE - 1), to = from + len, poff, plen;
+	int status = 0;
 
 	if (PageUptodate(page))
 		return 0;
-	if (from <= poff && to >= poff + plen)
-		return 0;
-	return iomap_read_page_sync(inode, block_start, page,
-			poff, plen, from, to, iomap);
+
+	do {
+		iomap_adjust_read_range(inode, iop, &block_start,
+				block_end - block_start, &poff, &plen);
+		if (plen == 0)
+			break;
+
+		if ((from > poff && from < poff + plen) ||
+		    (to > poff && to < poff + plen)) {
+			status = iomap_read_page_sync(inode, block_start, page,
+					poff, plen, from, to, iomap);
+			if (status)
+				break;
+		}
+
+	} while ((block_start += plen) < block_end);
+
+	return status;
 }
 
 static int
@@ -470,7 +690,7 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 	if (unlikely(copied < len && !PageUptodate(page))) {
 		copied = 0;
 	} else {
-		SetPageUptodate(page);
+		iomap_set_range_uptodate(page, pos & (PAGE_SIZE - 1), len);
 		iomap_set_page_dirty(page);
 	}
 	return __generic_write_end(inode, pos, copied, page);
@@ -806,7 +1026,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 		block_commit_write(page, 0, length);
 	} else {
 		WARN_ON_ONCE(!PageUptodate(page));
-		WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+		iomap_page_create(inode, page);
 	}
 
 	return length;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index c2706cfb27c7..60b196c54dd6 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -2,6 +2,9 @@
 #ifndef LINUX_IOMAP_H
 #define LINUX_IOMAP_H 1
 
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/mm.h>
 #include <linux/types.h>
 
 struct address_space;
@@ -99,12 +102,40 @@ struct iomap_ops {
 			ssize_t written, unsigned flags, struct iomap *iomap);
 };
 
+/*
+ * Structure allocated for each page when block size < PAGE_SIZE to track
+ * sub-page uptodate status and I/O completions.
+ */
+struct iomap_page {
+	atomic_t		read_count;
+	atomic_t		write_count;
+	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+};
+
+static inline struct iomap_page *to_iomap_page(struct page *page)
+{
+	if (page_has_private(page))
+		return (struct iomap_page *)page_private(page);
+	return NULL;
+}
+
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);
 int iomap_readpage(struct page *page, const struct iomap_ops *ops);
 int iomap_readpages(struct address_space *mapping, struct list_head *pages,
 		unsigned nr_pages, const struct iomap_ops *ops);
 int iomap_set_page_dirty(struct page *page);
+int iomap_is_partially_uptodate(struct page *page, unsigned long from,
+		unsigned long count);
+int iomap_releasepage(struct page *page, gfp_t gfp_mask);
+void iomap_invalidatepage(struct page *page, unsigned int offset,
+		unsigned int len);
+#ifdef CONFIG_MIGRATION
+int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode);
+#else
+#define iomap_migrate_page NULL
+#endif
 int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
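
For context on how a filesystem would consume the new exports: a
minimal, hypothetical wiring into an address_space_operations table
("myfs" and myfs_iomap_ops are made-up names; the real xfs conversion
happens elsewhere in this series):

	static int myfs_readpage(struct file *file, struct page *page)
	{
		return iomap_readpage(page, &myfs_iomap_ops);
	}

	static const struct address_space_operations myfs_aops = {
		.readpage		= myfs_readpage,
		.set_page_dirty		= iomap_set_page_dirty,
		.is_partially_uptodate	= iomap_is_partially_uptodate,
		.releasepage		= iomap_releasepage,
		.invalidatepage		= iomap_invalidatepage,
		.migratepage		= iomap_migrate_page,
	};
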
-- 
2.17.1
