All of lore.kernel.org
 help / color / mirror / Atom feed
From: Dave Chinner <david@fromorbit.com>
To: linux-xfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org, avi@scylladb.com, andres@anarazel.de
Subject: [PATCH 5/6] xfs: split unaligned DIO write code out
Date: Tue, 12 Jan 2021 12:07:45 +1100	[thread overview]
Message-ID: <20210112010746.1154363-6-david@fromorbit.com> (raw)
In-Reply-To: <20210112010746.1154363-1-david@fromorbit.com>

From: Dave Chinner <dchinner@redhat.com>

The unaligned DIO write path is more convulted than the normal path,
and we are about to make it more complex. Keep the block aligned
fast path dio write code trim and simple by splitting out the
unaligned DIO code from it.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_file.c | 177 +++++++++++++++++++++++++++++-----------------
 1 file changed, 113 insertions(+), 64 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 512833ce1d41..bba33be17eff 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -508,7 +508,7 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
 };
 
 /*
- * xfs_file_dio_aio_write - handle direct IO writes
+ * Handle block aligned direct IO writes
  *
  * Lock the inode appropriately to prepare for and issue a direct IO write.
  * By separating it from the buffered write path we remove all the tricky to
@@ -518,35 +518,88 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
  * until we're sure the bytes at the new EOF have been zeroed and/or the cached
  * pages are flushed out.
  *
- * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_write_aligned(
+	struct xfs_inode	*ip,
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	int			iolock = XFS_IOLOCK_SHARED;
+	size_t			count;
+	ssize_t			ret;
+	struct iomap_dio_rw_args args = {
+		.iocb			= iocb,
+		.iter			= from,
+		.ops			= &xfs_direct_write_iomap_ops,
+		.dops			= &xfs_dio_write_ops,
+		.wait_for_completion	= is_sync_kiocb(iocb),
+		.nonblocking		= (iocb->ki_flags & IOCB_NOWAIT),
+	};
+
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
+	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out;
+	count = iov_iter_count(from);
+
+	/*
+	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
+	 * the iolock back to shared if we had to take the exclusive lock in
+	 * xfs_file_aio_write_checks() for other reasons.
+	 */
+	if (iolock == XFS_IOLOCK_EXCL) {
+		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		iolock = XFS_IOLOCK_SHARED;
+	}
+
+	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+	ret = iomap_dio_rw(&args);
+out:
+	if (iolock)
+		xfs_iunlock(ip, iolock);
+
+	/*
+	 * No fallback to buffered IO after short writes for XFS, direct I/O
+	 * will either complete fully or return an error.
+	 */
+	ASSERT(ret < 0 || ret == count);
+	return ret;
+}
+
+/*
+ * Handle block unaligned direct IO writes
+ *
+ * In most cases direct IO writes will be done holding IOLOCK_SHARED
  * allowing them to be done in parallel with reads and other direct IO writes.
  * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- * needs to do sub-block zeroing and that requires serialisation against other
+ * may need to do sub-block zeroing and that requires serialisation against other
  * direct IOs to the same block. In this case we need to serialise the
  * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer.  To avoid the problem with aio, we also need to wait for
+ * the dio layer.
+ *
+ * To provide the same serialisation for AIO, we also need to wait for
  * outstanding IOs to complete so that unwritten extent conversion is completed
  * before we try to map the overlapping block. This is currently implemented by
  * hitting it with a big hammer (i.e. inode_dio_wait()).
  *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
+ * This means that unaligned dio writes alwys block. There is no "nowait" fast
+ * path in this code - if IOCB_NOWAIT is set we simply return -EAGAIN up front
+ * and we don't have to worry about that anymore.
  */
-STATIC ssize_t
-xfs_file_dio_aio_write(
+static ssize_t
+xfs_file_dio_write_unaligned(
+	struct xfs_inode	*ip,
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
-	struct file		*file = iocb->ki_filp;
-	struct address_space	*mapping = file->f_mapping;
-	struct inode		*inode = mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			ret = 0;
-	int			unaligned_io = 0;
-	int			iolock;
-	size_t			count = iov_iter_count(from);
-	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
+	int			iolock = XFS_IOLOCK_EXCL;
+	size_t			count;
+	ssize_t			ret;
 	struct iomap_dio_rw_args args = {
 		.iocb			= iocb,
 		.iter			= from,
@@ -556,49 +609,25 @@ xfs_file_dio_aio_write(
 		.nonblocking		= (iocb->ki_flags & IOCB_NOWAIT),
 	};
 
-	/* DIO must be aligned to device logical sector size */
-	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
-		return -EINVAL;
-
 	/*
-	 * Don't take the exclusive iolock here unless the I/O is unaligned to
-	 * the file system block size.  We don't need to consider the EOF
-	 * extension case here because xfs_file_aio_write_checks() will relock
-	 * the inode as necessary for EOF zeroing cases and fill out the new
-	 * inode size as appropriate.
+	 * This must be the only IO in-flight. Wait on it before we
+	 * release the iolock to prevent subsequent overlapping IO.
 	 */
-	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
-		unaligned_io = 1;
-
-		/*
-		 * This must be the only IO in-flight. Wait on it before we
-		 * release the iolock to prevent subsequent overlapping IO.
-		 */
-		args.wait_for_completion = true;
+	args.wait_for_completion = true;
 
-		/*
-		 * We can't properly handle unaligned direct I/O to reflink
-		 * files yet, as we can't unshare a partial block.
-		 */
-		if (xfs_is_cow_inode(ip)) {
-			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
-			return -ENOTBLK;
-		}
-		iolock = XFS_IOLOCK_EXCL;
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
+	/*
+	 * We can't properly handle unaligned direct I/O to reflink
+	 * files yet, as we can't unshare a partial block.
+	 */
+	if (xfs_is_cow_inode(ip)) {
+		trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
+		return -ENOTBLK;
 	}
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		/* unaligned dio always waits, bail */
-		if (unaligned_io)
-			return -EAGAIN;
-		if (!xfs_ilock_nowait(ip, iolock))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
-	}
+	/* unaligned dio always waits, bail */
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EAGAIN;
+	xfs_ilock(ip, iolock);
 
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
@@ -612,13 +641,7 @@ xfs_file_dio_aio_write(
 	 * iolock if we had to take the exclusive lock in
 	 * xfs_file_aio_write_checks() for other reasons.
 	 */
-	if (unaligned_io) {
-		inode_dio_wait(inode);
-	} else if (iolock == XFS_IOLOCK_EXCL) {
-		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
-		iolock = XFS_IOLOCK_SHARED;
-	}
-
+	inode_dio_wait(VFS_I(ip));
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 	ret = iomap_dio_rw(&args);
 out:
@@ -633,6 +656,32 @@ xfs_file_dio_aio_write(
 	return ret;
 }
 
+static ssize_t
+xfs_file_dio_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
+	size_t			count = iov_iter_count(from);
+
+	/* DIO must be aligned to device logical sector size */
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+		return -EINVAL;
+
+	/*
+	 * Don't take the exclusive iolock here unless the I/O is unaligned to
+	 * the file system block size.  We don't need to consider the EOF
+	 * extension case here because xfs_file_aio_write_checks() will relock
+	 * the inode as necessary for EOF zeroing cases and fill out the new
+	 * inode size as appropriate.
+	 */
+	if ((iocb->ki_pos | count) & mp->m_blockmask)
+		return xfs_file_dio_write_unaligned(ip, iocb, from);
+	return xfs_file_dio_write_aligned(ip, iocb, from);
+}
+
 static noinline ssize_t
 xfs_file_dax_write(
 	struct kiocb		*iocb,
@@ -783,7 +832,7 @@ xfs_file_write_iter(
 		 * CoW.  In all other directio scenarios we do not
 		 * allow an operation to fall back to buffered mode.
 		 */
-		ret = xfs_file_dio_aio_write(iocb, from);
+		ret = xfs_file_dio_write(iocb, from);
 		if (ret != -ENOTBLK)
 			return ret;
 	}
-- 
2.28.0


  parent reply	other threads:[~2021-01-12  1:08 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-01-12  1:07 [RFC] xfs: reduce sub-block DIO serialisation Dave Chinner
2021-01-12  1:07 ` [PATCH 1/6] iomap: convert iomap_dio_rw() to an args structure Dave Chinner
2021-01-12  1:22   ` Damien Le Moal
2021-01-12  1:40   ` Darrick J. Wong
2021-01-12  1:53     ` Dave Chinner
2021-01-12 10:31   ` Christoph Hellwig
2021-01-12  1:07 ` [PATCH 2/6] iomap: move DIO NOWAIT setup up into filesystems Dave Chinner
2021-01-12  1:07 ` [PATCH 3/6] xfs: factor out a xfs_ilock_iocb helper Dave Chinner
2021-01-12  1:07 ` [PATCH 4/6] xfs: make xfs_file_aio_write_checks IOCB_NOWAIT-aware Dave Chinner
2021-01-12  1:07 ` Dave Chinner [this message]
2021-01-12 10:37   ` [PATCH 5/6] xfs: split unaligned DIO write code out Christoph Hellwig
2021-01-12  1:07 ` [PATCH 6/6] xfs: reduce exclusive locking on unaligned dio Dave Chinner
2021-01-12 10:42   ` Christoph Hellwig
2021-01-12 17:01     ` Brian Foster
2021-01-12 17:10       ` Christoph Hellwig
2021-01-12 22:06       ` Dave Chinner
2021-01-12  8:01 ` [RFC] xfs: reduce sub-block DIO serialisation Avi Kivity
2021-01-12 22:13   ` Dave Chinner
2021-01-13  8:00     ` Avi Kivity
2021-01-13 20:38       ` Dave Chinner
2021-01-14  6:48         ` Avi Kivity
2021-01-17 21:34           ` Dave Chinner
2021-01-18  7:41             ` Avi Kivity
     [not found] ` <CACz=WechdgSnVHQsg0LKjMiG8kHLujBshmc270yrdjxfpffmDQ@mail.gmail.com>
2021-01-17 21:36   ` Dave Chinner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210112010746.1154363-6-david@fromorbit.com \
    --to=david@fromorbit.com \
    --cc=andres@anarazel.de \
    --cc=avi@scylladb.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-xfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.