All of lore.kernel.org
 help / color / mirror / Atom feed
From: Christoph Hellwig <hch@lst.de>
To: linux-xfs@vger.kernel.org
Cc: Avi Kivity <avi@scylladb.com>, Brian Foster <bfoster@redhat.com>
Subject: [PATCH 3/3] xfs: try to avoid the iolock exclusive for non-aligned direct writes
Date: Mon, 11 Jan 2021 17:12:12 +0100	[thread overview]
Message-ID: <20210111161212.1414034-4-hch@lst.de> (raw)
In-Reply-To: <20210111161212.1414034-1-hch@lst.de>

We only need the exclusive iolock for direct writes to protect sub-block
zeroing after an allocation or conversion of unwritten extents, and the
synchronous execution of these writes is also only needed because the
iolock is dropped early for the dodgy i_dio_count synchronisation.

Always start out with the shared iolock in xfs_file_dio_aio_write for
non-appending writes and only upgrade it to exclusive if the start and
end of the write range are not already allocated and in written
state.  This means one or two extra lookups in the in-core extent tree,
but with our btree data structure those lookups are very cheap and do
not show up in profiles on NVMe hardware for me.  On the other hand
avoiding the lock allows for a high concurrency using aio or io_uring.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Avi Kivity <avi@scylladb.com>
Suggested-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_file.c | 127 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 96 insertions(+), 31 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1470fc4f2e0255..59d4c6e90f06c1 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -521,6 +521,57 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
 	.end_io		= xfs_dio_write_end_io,
 };
 
+static int
+xfs_dio_write_exclusive(
+	struct kiocb		*iocb,
+	size_t			count,
+	bool			*exclusive_io)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = &ip->i_df;
+	loff_t			offset = iocb->ki_pos;
+	loff_t			end = offset + count;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, end);
+	struct xfs_bmbt_irec	got = { };
+	struct xfs_iext_cursor	icur;
+	int			ret;
+
+	*exclusive_io = true;
+
+	/*
+	 * Bmap information not read in yet or no blocks allocated at all?
+	 */
+	if (!(ifp->if_flags & XFS_IFEXTENTS) || !ip->i_d.di_nblocks)
+		return 0;
+
+	ret = xfs_ilock_iocb(iocb, XFS_ILOCK_SHARED);
+	if (ret)
+		return ret;
+
+	if (offset & mp->m_blockmask) {
+		if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got) ||
+		    got.br_startoff > offset_fsb ||
+		    got.br_state == XFS_EXT_UNWRITTEN)
+		    	goto out_unlock;
+	}
+
+	if ((end & mp->m_blockmask) &&
+	    got.br_startoff + got.br_blockcount <= end_fsb) {
+		if (!xfs_iext_lookup_extent(ip, ifp, end_fsb, &icur, &got) ||
+		    got.br_startoff > end_fsb ||
+		    got.br_state == XFS_EXT_UNWRITTEN)
+		    	goto out_unlock;
+	}
+
+	*exclusive_io = false;
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return ret;
+}
+
 /*
  * xfs_file_dio_aio_write - handle direct IO writes
  *
@@ -557,8 +608,9 @@ xfs_file_dio_aio_write(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			ret = 0;
-	int			unaligned_io = 0;
-	int			iolock;
+	int			iolock = XFS_IOLOCK_SHARED;
+	bool			subblock_io = false;
+	bool			exclusive_io = false;
 	size_t			count = iov_iter_count(from);
 	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
 
@@ -566,45 +618,58 @@ xfs_file_dio_aio_write(
 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 		return -EINVAL;
 
-	/*
-	 * Don't take the exclusive iolock here unless the I/O is unaligned to
-	 * the file system block size.  We don't need to consider the EOF
-	 * extension case here because xfs_file_aio_write_checks() will relock
-	 * the inode as necessary for EOF zeroing cases and fill out the new
-	 * inode size as appropriate.
-	 */
+	/* I/O that is not aligned to the fsblock size will need special care */
 	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
-		unaligned_io = 1;
+	    ((iocb->ki_pos + count) & mp->m_blockmask))
+		subblock_io = true;
 
-		/*
-		 * We can't properly handle unaligned direct I/O to reflink
-		 * files yet, as we can't unshare a partial block.
-		 */
-		if (xfs_is_cow_inode(ip)) {
-			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
-			return -ENOTBLK;
-		}
-		iolock = XFS_IOLOCK_EXCL;
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
+	/*
+	 * We can't properly handle unaligned direct I/O to reflink files yet,
+	 * as we can't unshare a partial block.
+	 */
+	if (subblock_io && xfs_is_cow_inode(ip)) {
+		trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
+		return -ENOTBLK;
 	}
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		/* unaligned dio always waits, bail */
-		if (unaligned_io)
-			return -EAGAIN;
-		if (!xfs_ilock_nowait(ip, iolock))
+	/*
+	 * Racy shortcut for obvious appends to avoid too much relocking:
+	 */
+	if (iocb->ki_pos > i_size_read(inode)) {
+		if (iocb->ki_flags & IOCB_NOWAIT)
 			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
+		iolock = XFS_IOLOCK_EXCL;
 	}
 
+relock:
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 	count = iov_iter_count(from);
 
+	/*
+	 * Upgrade to an exclusive lock and force synchronous completion if the
+	 * I/O will require partial block zeroing.
+	 * We don't need to consider the EOF extension case here because
+	 * xfs_file_aio_write_checks() will relock the inode as necessary for
+	 * EOF zeroing cases and fill out the new inode size as appropriate.
+	 */
+	if (iolock != XFS_IOLOCK_EXCL && subblock_io) {
+		ret = xfs_dio_write_exclusive(iocb, count, &exclusive_io);
+		if (ret)
+			goto out;
+		if (exclusive_io) {
+			xfs_iunlock(ip, iolock);
+			if (iocb->ki_flags & IOCB_NOWAIT)
+				return -EAGAIN;
+			iolock = XFS_IOLOCK_EXCL;
+			goto relock;
+		}
+	}
+
 	/*
 	 * If we are doing unaligned IO, we can't allow any other overlapping IO
 	 * in-flight at the same time or we risk data corruption. Wait for all
@@ -612,7 +677,7 @@ xfs_file_dio_aio_write(
 	 * iolock if we had to take the exclusive lock in
 	 * xfs_file_aio_write_checks() for other reasons.
 	 */
-	if (unaligned_io) {
+	if (exclusive_io) {
 		inode_dio_wait(inode);
 	} else if (iolock == XFS_IOLOCK_EXCL) {
 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
@@ -626,7 +691,7 @@ xfs_file_dio_aio_write(
 	 */
 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
 			   &xfs_dio_write_ops,
-			   is_sync_kiocb(iocb) || unaligned_io);
+			   is_sync_kiocb(iocb) || exclusive_io);
 out:
 	if (iolock)
 		xfs_iunlock(ip, iolock);
-- 
2.29.2


  parent reply	other threads:[~2021-01-11 16:13 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-01-11 16:12 improve sub-block size direct I/O concurrency Christoph Hellwig
2021-01-11 16:12 ` [PATCH 1/3] xfs: factor out a xfs_ilock_iocb helper Christoph Hellwig
2021-01-11 18:55   ` Brian Foster
2021-01-11 16:12 ` [PATCH 2/3] xfs: make xfs_file_aio_write_checks IOCB_NOWAIT-aware Christoph Hellwig
2021-01-11 18:55   ` Brian Foster
2021-01-11 16:12 ` Christoph Hellwig [this message]
2021-01-11 18:59   ` [PATCH 3/3] xfs: try to avoid the iolock exclusive for non-aligned direct writes Brian Foster
2021-01-11 19:14     ` Christoph Hellwig
2021-01-11 19:49       ` Brian Foster
2021-01-11 20:52   ` Dave Chinner
2021-01-11 20:45 ` improve sub-block size direct I/O concurrency Dave Chinner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210111161212.1414034-4-hch@lst.de \
    --to=hch@lst.de \
    --cc=avi@scylladb.com \
    --cc=bfoster@redhat.com \
    --cc=linux-xfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.