linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Andreas Gruenbacher <agruenba@redhat.com>
To: cluster-devel@redhat.com, Christoph Hellwig <hch@lst.de>
Cc: linux-fsdevel@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>
Subject: [PATCH v4 09/11] gfs2: iomap direct I/O support
Date: Mon, 14 May 2018 17:36:22 +0200	[thread overview]
Message-ID: <20180514153624.29598-10-agruenba@redhat.com> (raw)
In-Reply-To: <20180514153624.29598-1-agruenba@redhat.com>

With that, the direct_IO address space operation can be all but
eliminated: only a dummy remains which indicates that the filesystem
supports direct I/O.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/aops.c |  92 +--------------------------
 fs/gfs2/bmap.c |  11 +++-
 fs/gfs2/file.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 170 insertions(+), 102 deletions(-)

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 3d9633175aa8..d676ee63ab2b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -84,12 +84,6 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
 	return 0;
 }
 
-static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
-				 struct buffer_head *bh_result, int create)
-{
-	return gfs2_block_map(inode, lblock, bh_result, 0);
-}
-
 /**
  * gfs2_writepage_common - Common bits of writepage
  * @page: The page to be written
@@ -1021,94 +1015,12 @@ static void gfs2_invalidatepage(struct page *page, unsigned int offset,
 		try_to_release_page(page, 0);
 }
 
-/**
- * gfs2_ok_for_dio - check that dio is valid on this file
- * @ip: The inode
- * @offset: The offset at which we are reading or writing
- *
- * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
- *          1 (to accept the i/o request)
- */
-static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset)
-{
-	/*
-	 * Should we return an error here? I can't see that O_DIRECT for
-	 * a stuffed file makes any sense. For now we'll silently fall
-	 * back to buffered I/O
-	 */
-	if (gfs2_is_stuffed(ip))
-		return 0;
-
-	if (offset >= i_size_read(&ip->i_inode))
-		return 0;
-	return 1;
-}
-
-
-
 static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	struct address_space *mapping = inode->i_mapping;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	loff_t offset = iocb->ki_pos;
-	struct gfs2_holder gh;
-	int rv;
-
 	/*
-	 * Deferred lock, even if its a write, since we do no allocation
-	 * on this path. All we need change is atime, and this lock mode
-	 * ensures that other nodes have flushed their buffered read caches
-	 * (i.e. their page cache entries for this inode). We do not,
-	 * unfortunately have the option of only flushing a range like
-	 * the VFS does.
+	 * We just need the method present so that open/fcntl allow direct I/O.
 	 */
-	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
-	rv = gfs2_glock_nq(&gh);
-	if (rv)
-		goto out_uninit;
-	rv = gfs2_ok_for_dio(ip, offset);
-	if (rv != 1)
-		goto out; /* dio not valid, fall back to buffered i/o */
-
-	/*
-	 * Now since we are holding a deferred (CW) lock at this point, you
-	 * might be wondering why this is ever needed. There is a case however
-	 * where we've granted a deferred local lock against a cached exclusive
-	 * glock. That is ok provided all granted local locks are deferred, but
-	 * it also means that it is possible to encounter pages which are
-	 * cached and possibly also mapped. So here we check for that and sort
-	 * them out ahead of the dio. The glock state machine will take care of
-	 * everything else.
-	 *
-	 * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
-	 * the first place, mapping->nr_pages will always be zero.
-	 */
-	if (mapping->nrpages) {
-		loff_t lstart = offset & ~(PAGE_SIZE - 1);
-		loff_t len = iov_iter_count(iter);
-		loff_t end = PAGE_ALIGN(offset + len) - 1;
-
-		rv = 0;
-		if (len == 0)
-			goto out;
-		if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
-			unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
-		rv = filemap_write_and_wait_range(mapping, lstart, end);
-		if (rv)
-			goto out;
-		if (iov_iter_rw(iter) == WRITE)
-			truncate_inode_pages_range(mapping, lstart, end);
-	}
-
-	rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				  gfs2_get_block_direct, NULL, NULL, 0);
-out:
-	gfs2_glock_dq(&gh);
-out_uninit:
-	gfs2_holder_uninit(&gh);
-	return rv;
+	return -EINVAL;
 }
 
 /**
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 7ac08660c990..6f57e2bfab8e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1053,7 +1053,14 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 
 	trace_gfs2_iomap_start(ip, pos, length, flags);
 	if (flags & IOMAP_WRITE) {
-		ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap);
+		if (flags & IOMAP_DIRECT) {
+			ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+			release_metapath(&mp);
+			if (iomap->type != IOMAP_MAPPED)
+				ret = -ENOTBLK;
+		} else {
+			ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap);
+		}
 	} else {
 		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
 		release_metapath(&mp);
@@ -1124,7 +1131,7 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_trans *tr = current->journal_info;
 
-	if (!(flags & IOMAP_WRITE))
+	if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE)
 		return 0;
 
 	gfs2_ordered_add_inode(ip);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c2467988f96f..5fb2ce779bf5 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -690,6 +690,122 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 	return ret ? ret : ret1;
 }
 
+static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *file = iocb->ki_filp;
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+	size_t count = iov_iter_count(to);
+	struct gfs2_holder gh;
+	ssize_t ret;
+
+	if (!count)
+		return 0; /* skip atime */
+
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret)
+		goto out_uninit;
+
+	/* fall back to buffered I/O for stuffed files */
+	ret = -ENOTBLK;
+	if (gfs2_is_stuffed(ip))
+		goto out;
+
+	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL);
+
+out:
+	gfs2_glock_dq(&gh);
+out_uninit:
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct address_space *mapping = inode->i_mapping;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	size_t len = iov_iter_count(from);
+	loff_t offset = iocb->ki_pos;
+	struct gfs2_holder gh;
+	ssize_t ret;
+
+	/*
+	 * Deferred lock, even if its a write, since we do no allocation on
+	 * this path. All we need to change is the atime, and this lock mode
+	 * ensures that other nodes have flushed their buffered read caches
+	 * (i.e. their page cache entries for this inode). We do not,
+	 * unfortunately, have the option of only flushing a range like the
+	 * VFS does.
+	 */
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret)
+		goto out_uninit;
+
+	/* Silently fall back to buffered I/O for stuffed files */
+	if (gfs2_is_stuffed(ip))
+		goto out;
+
+	/* Silently fall back to buffered I/O when writing beyond EOF */
+	if (offset + len > i_size_read(&ip->i_inode))
+		goto out;
+
+	/*
+	 * Now since we are holding a deferred (CW) lock at this point, you
+	 * might be wondering why this is ever needed. There is a case however
+	 * where we've granted a deferred local lock against a cached exclusive
+	 * glock. That is ok provided all granted local locks are deferred, but
+	 * it also means that it is possible to encounter pages which are
+	 * cached and possibly also mapped. So here we check for that and sort
+	 * them out ahead of the dio. The glock state machine will take care of
+	 * everything else.
+	 *
+	 * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
+	 * the first place, mapping->nr_pages will always be zero.
+	 */
+	if (mapping->nrpages) {
+		loff_t lstart = offset & ~(PAGE_SIZE - 1);
+		loff_t len = iov_iter_count(from);
+		loff_t end = PAGE_ALIGN(offset + len) - 1;
+
+		ret = 0;
+		if (len == 0)
+			goto out;
+		if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+			unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
+		ret = filemap_write_and_wait_range(mapping, lstart, end);
+		if (ret)
+			goto out;
+		if (iov_iter_rw(from) == WRITE)
+			truncate_inode_pages_range(mapping, lstart, end);
+	}
+
+	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL);
+
+out:
+	gfs2_glock_dq(&gh);
+out_uninit:
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	ssize_t ret;
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		ret = gfs2_file_direct_read(iocb, to);
+		if (likely(ret != -ENOTBLK))
+			goto out;
+		iocb->ki_flags &= ~IOCB_DIRECT;
+	}
+	ret = generic_file_read_iter(iocb, to);
+out:
+	return ret;
+}
+
 /**
  * gfs2_file_write_iter - Perform a write to a file
  * @iocb: The io context
@@ -707,7 +823,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct gfs2_inode *ip = GFS2_I(inode);
-	ssize_t ret;
+	ssize_t written = 0, ret;
 
 	ret = gfs2_rsqa_alloc(ip);
 	if (ret)
@@ -724,9 +840,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		gfs2_glock_dq_uninit(&gh);
 	}
 
-	if (iocb->ki_flags & IOCB_DIRECT)
-		return generic_file_write_iter(iocb, from);
-
 	inode_lock(inode);
 	ret = generic_write_checks(iocb, from);
 	if (ret <= 0)
@@ -743,19 +856,55 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret)
 		goto out2;
 
-	ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		struct address_space *mapping = file->f_mapping;
+		loff_t pos, endbyte;
+		ssize_t buffered;
+
+		written = gfs2_file_direct_write(iocb, from);
+		if (written < 0 || !iov_iter_count(from))
+			goto out2;
+
+		ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+		if (unlikely(ret < 0))
+			goto out2;
+		buffered = ret;
+
+		/*
+		 * We need to ensure that the page cache pages are written to
+		 * disk and invalidated to preserve the expected O_DIRECT
+		 * semantics.
+		 */
+		pos = iocb->ki_pos;
+		endbyte = pos + buffered - 1;
+		ret = filemap_write_and_wait_range(mapping, pos, endbyte);
+		if (!ret) {
+			iocb->ki_pos += buffered;
+			written += buffered;
+			invalidate_mapping_pages(mapping,
+						 pos >> PAGE_SHIFT,
+						 endbyte >> PAGE_SHIFT);
+		} else {
+			/*
+			 * We don't know how much we wrote, so just return
+			 * the number of bytes which were direct-written
+			 */
+		}
+	} else {
+		ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+		if (likely(ret > 0))
+			iocb->ki_pos += ret;
+	}
 
 out2:
 	current->backing_dev_info = NULL;
 out:
 	inode_unlock(inode);
 	if (likely(ret > 0)) {
-		iocb->ki_pos += ret;
-
 		/* Handle various SYNC-type writes */
 		ret = generic_write_sync(iocb, ret);
 	}
-	return ret;
+	return written ? written : ret;
 }
 
 static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
@@ -1157,7 +1306,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 
 const struct file_operations gfs2_file_fops = {
 	.llseek		= gfs2_llseek,
-	.read_iter	= generic_file_read_iter,
+	.read_iter	= gfs2_file_read_iter,
 	.write_iter	= gfs2_file_write_iter,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.mmap		= gfs2_mmap,
@@ -1187,7 +1336,7 @@ const struct file_operations gfs2_dir_fops = {
 
 const struct file_operations gfs2_file_fops_nolock = {
 	.llseek		= gfs2_llseek,
-	.read_iter	= generic_file_read_iter,
+	.read_iter	= gfs2_file_read_iter,
 	.write_iter	= gfs2_file_write_iter,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.mmap		= gfs2_mmap,
-- 
2.17.0

  parent reply	other threads:[~2018-05-14 15:36 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-05-14 15:36 [PATCH v4 00/11] gfs2 iomap write support Andreas Gruenbacher
2018-05-14 15:36 ` [PATCH v4 01/11] gfs2: Update find_metapath comment Andreas Gruenbacher
2018-05-14 15:36 ` [PATCH v4 02/11] gfs2: hole_size improvement Andreas Gruenbacher
2018-05-14 15:36 ` [PATCH v4 03/11] gfs2: gfs2_stuffed_write_end cleanup Andreas Gruenbacher
2018-05-14 15:36 ` [PATCH v4 04/11] gfs2: Remove ordered write mode handling from gfs2_trans_add_data Andreas Gruenbacher
2018-05-14 15:36 ` [PATCH v4 05/11] gfs2: Iomap cleanups and improvements Andreas Gruenbacher
2018-05-14 15:36 ` [PATCH v4 06/11] iomap: Add write_{begin,end} iomap operations Andreas Gruenbacher
2018-05-15  1:11   ` Dave Chinner
2018-05-15  7:22     ` Christoph Hellwig
2018-05-15  8:16       ` Andreas Gruenbacher
2018-05-18 16:04         ` Christoph Hellwig
2018-05-25 17:58           ` Andreas Grünbacher
2018-05-28 13:02             ` Christoph Hellwig
2018-05-14 15:36 ` [PATCH v4 07/11] gfs2: iomap buffered write support Andreas Gruenbacher
2018-05-14 15:36 ` [PATCH v4 08/11] gfs2: gfs2_extent_length cleanup Andreas Gruenbacher
2018-05-14 15:36 ` Andreas Gruenbacher [this message]
2018-05-15  7:31   ` [PATCH v4 09/11] gfs2: iomap direct I/O support Christoph Hellwig
2018-05-16 20:36     ` Andreas Gruenbacher
2018-05-14 15:36 ` [PATCH v4 10/11] gfs2: Remove gfs2_write_{begin,end} Andreas Gruenbacher
2018-05-14 15:36 ` [PATCH v4 11/11] iomap: Complete partial direct I/O writes synchronously Andreas Gruenbacher
2018-05-15  7:24   ` Christoph Hellwig
2018-05-16 20:27     ` Andreas Gruenbacher
2018-05-18 15:56       ` Christoph Hellwig
2018-05-18 18:35         ` Andreas Grünbacher

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180514153624.29598-10-agruenba@redhat.com \
    --to=agruenba@redhat.com \
    --cc=cluster-devel@redhat.com \
    --cc=hch@lst.de \
    --cc=linux-fsdevel@vger.kernel.org \
    --subject='Re: [PATCH v4 09/11] gfs2: iomap direct I/O support' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).