linux-xfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* reduce sub-block DIO serialisation v2
@ 2021-01-18 19:35 Christoph Hellwig
  2021-01-18 19:35 ` [PATCH 01/11] xfs: factor out a xfs_ilock_iocb helper Christoph Hellwig
                   ` (10 more replies)
  0 siblings, 11 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi

This takes the approach from Dave, but adds a new flag instead of abusing
the nowait one, and keeps a simpler calling convention for iomap_dio_rw.

Changes since v1:
 - rename the new flags
 - add an EOF check for subblock I/O
 - minor cleanups

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH 01/11] xfs: factor out a xfs_ilock_iocb helper
  2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
@ 2021-01-18 19:35 ` Christoph Hellwig
  2021-01-20 18:41   ` Darrick J. Wong
  2021-01-18 19:35 ` [PATCH 02/11] xfs: make xfs_file_aio_write_checks IOCB_NOWAIT-aware Christoph Hellwig
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi, Dave Chinner, Brian Foster

Add a helper to factor out the nowait locking logic for the read/write
helpers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_file.c | 55 +++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5b0f93f738372d..c441cddfa4acbc 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -197,6 +197,23 @@ xfs_file_fsync(
 	return error;
 }
 
+static int
+xfs_ilock_iocb(
+	struct kiocb		*iocb,
+	unsigned int		lock_mode)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!xfs_ilock_nowait(ip, lock_mode))
+			return -EAGAIN;
+	} else {
+		xfs_ilock(ip, lock_mode);
+	}
+
+	return 0;
+}
+
 STATIC ssize_t
 xfs_file_dio_aio_read(
 	struct kiocb		*iocb,
@@ -213,12 +230,9 @@ xfs_file_dio_aio_read(
 
 	file_accessed(iocb->ki_filp);
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
 			is_sync_kiocb(iocb));
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -240,13 +254,9 @@ xfs_file_dax_read(
 	if (!count)
 		return 0; /* skip atime */
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
@@ -264,12 +274,9 @@ xfs_file_buffered_aio_read(
 
 	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
 	ret = generic_file_read_iter(iocb, to);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
@@ -608,13 +615,9 @@ xfs_file_dax_write(
 	size_t			count;
 	loff_t			pos;
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, iolock))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
-	}
-
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 02/11] xfs: make xfs_file_aio_write_checks IOCB_NOWAIT-aware
  2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
  2021-01-18 19:35 ` [PATCH 01/11] xfs: factor out a xfs_ilock_iocb helper Christoph Hellwig
@ 2021-01-18 19:35 ` Christoph Hellwig
       [not found]   ` <CACz=WeeaqMrGM53pJF0C_Wt2JuavTOnOV26-osPviYLUpqUmFw@mail.gmail.com>
  2021-01-20 18:42   ` Darrick J. Wong
  2021-01-18 19:35 ` [PATCH 03/11] xfs: cleanup the read/write helper naming Christoph Hellwig
                   ` (8 subsequent siblings)
  10 siblings, 2 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi, Dave Chinner, Brian Foster

Ensure we don't block on the iolock or wait for I/O in
xfs_file_aio_write_checks if the caller asked to avoid that.

Fixes: 29a5d29ec181 ("xfs: nowait aio support")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_file.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c441cddfa4acbc..fb4e6f2852bb8b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -335,7 +335,14 @@ xfs_file_aio_write_checks(
 	if (error <= 0)
 		return error;
 
-	error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		error = break_layout(inode, false);
+		if (error == -EWOULDBLOCK)
+			error = -EAGAIN;
+	} else {
+		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+	}
+
 	if (error)
 		return error;
 
@@ -346,7 +353,11 @@ xfs_file_aio_write_checks(
 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
 		xfs_iunlock(ip, *iolock);
 		*iolock = XFS_IOLOCK_EXCL;
-		xfs_ilock(ip, *iolock);
+		error = xfs_ilock_iocb(iocb, *iolock);
+		if (error) {
+			*iolock = 0;
+			return error;
+		}
 		goto restart;
 	}
 	/*
@@ -368,6 +379,10 @@ xfs_file_aio_write_checks(
 	isize = i_size_read(inode);
 	if (iocb->ki_pos > isize) {
 		spin_unlock(&ip->i_flags_lock);
+
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return -EAGAIN;
+
 		if (!drained_dio) {
 			if (*iolock == XFS_IOLOCK_SHARED) {
 				xfs_iunlock(ip, *iolock);
@@ -593,7 +608,8 @@ xfs_file_dio_aio_write(
 			   &xfs_dio_write_ops,
 			   is_sync_kiocb(iocb) || unaligned_io);
 out:
-	xfs_iunlock(ip, iolock);
+	if (iolock)
+		xfs_iunlock(ip, iolock);
 
 	/*
 	 * No fallback to buffered IO after short writes for XFS, direct I/O
@@ -632,7 +648,8 @@ xfs_file_dax_write(
 		error = xfs_setfilesize(ip, pos, ret);
 	}
 out:
-	xfs_iunlock(ip, iolock);
+	if (iolock)
+		xfs_iunlock(ip, iolock);
 	if (error)
 		return error;
 
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 03/11] xfs: cleanup the read/write helper naming
  2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
  2021-01-18 19:35 ` [PATCH 01/11] xfs: factor out a xfs_ilock_iocb helper Christoph Hellwig
  2021-01-18 19:35 ` [PATCH 02/11] xfs: make xfs_file_aio_write_checks IOCB_NOWAIT-aware Christoph Hellwig
@ 2021-01-18 19:35 ` Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:43   ` Darrick J. Wong
  2021-01-18 19:35 ` [PATCH 04/11] xfs: remove the buffered I/O fallback assert Christoph Hellwig
                   ` (7 subsequent siblings)
  10 siblings, 2 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi, Dave Chinner

Drop a few pointless aio_ prefixes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_file.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index fb4e6f2852bb8b..ae7313ccaa11ed 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -215,7 +215,7 @@ xfs_ilock_iocb(
 }
 
 STATIC ssize_t
-xfs_file_dio_aio_read(
+xfs_file_dio_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
@@ -265,7 +265,7 @@ xfs_file_dax_read(
 }
 
 STATIC ssize_t
-xfs_file_buffered_aio_read(
+xfs_file_buffered_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
@@ -300,9 +300,9 @@ xfs_file_read_iter(
 	if (IS_DAX(inode))
 		ret = xfs_file_dax_read(iocb, to);
 	else if (iocb->ki_flags & IOCB_DIRECT)
-		ret = xfs_file_dio_aio_read(iocb, to);
+		ret = xfs_file_dio_read(iocb, to);
 	else
-		ret = xfs_file_buffered_aio_read(iocb, to);
+		ret = xfs_file_buffered_read(iocb, to);
 
 	if (ret > 0)
 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
@@ -317,7 +317,7 @@ xfs_file_read_iter(
  * if called for a direct write beyond i_size.
  */
 STATIC ssize_t
-xfs_file_aio_write_checks(
+xfs_file_write_checks(
 	struct kiocb		*iocb,
 	struct iov_iter		*from,
 	int			*iolock)
@@ -502,7 +502,7 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
 };
 
 /*
- * xfs_file_dio_aio_write - handle direct IO writes
+ * xfs_file_dio_write - handle direct IO writes
  *
  * Lock the inode appropriately to prepare for and issue a direct IO write.
  * By separating it from the buffered write path we remove all the tricky to
@@ -527,7 +527,7 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
  * negative return values.
  */
 STATIC ssize_t
-xfs_file_dio_aio_write(
+xfs_file_dio_write(
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
@@ -549,7 +549,7 @@ xfs_file_dio_aio_write(
 	/*
 	 * Don't take the exclusive iolock here unless the I/O is unaligned to
 	 * the file system block size.  We don't need to consider the EOF
-	 * extension case here because xfs_file_aio_write_checks() will relock
+	 * extension case here because xfs_file_write_checks() will relock
 	 * the inode as necessary for EOF zeroing cases and fill out the new
 	 * inode size as appropriate.
 	 */
@@ -580,7 +580,7 @@ xfs_file_dio_aio_write(
 		xfs_ilock(ip, iolock);
 	}
 
-	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	ret = xfs_file_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 	count = iov_iter_count(from);
@@ -590,7 +590,7 @@ xfs_file_dio_aio_write(
 	 * in-flight at the same time or we risk data corruption. Wait for all
 	 * other IO to drain before we submit. If the IO is aligned, demote the
 	 * iolock if we had to take the exclusive lock in
-	 * xfs_file_aio_write_checks() for other reasons.
+	 * xfs_file_write_checks() for other reasons.
 	 */
 	if (unaligned_io) {
 		inode_dio_wait(inode);
@@ -634,7 +634,7 @@ xfs_file_dax_write(
 	ret = xfs_ilock_iocb(iocb, iolock);
 	if (ret)
 		return ret;
-	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	ret = xfs_file_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 
@@ -663,7 +663,7 @@ xfs_file_dax_write(
 }
 
 STATIC ssize_t
-xfs_file_buffered_aio_write(
+xfs_file_buffered_write(
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
@@ -682,7 +682,7 @@ xfs_file_buffered_aio_write(
 	iolock = XFS_IOLOCK_EXCL;
 	xfs_ilock(ip, iolock);
 
-	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	ret = xfs_file_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 
@@ -769,12 +769,12 @@ xfs_file_write_iter(
 		 * CoW.  In all other directio scenarios we do not
 		 * allow an operation to fall back to buffered mode.
 		 */
-		ret = xfs_file_dio_aio_write(iocb, from);
+		ret = xfs_file_dio_write(iocb, from);
 		if (ret != -ENOTBLK)
 			return ret;
 	}
 
-	return xfs_file_buffered_aio_write(iocb, from);
+	return xfs_file_buffered_write(iocb, from);
 }
 
 static void
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 04/11] xfs: remove the buffered I/O fallback assert
  2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2021-01-18 19:35 ` [PATCH 03/11] xfs: cleanup the read/write helper naming Christoph Hellwig
@ 2021-01-18 19:35 ` Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:43   ` Darrick J. Wong
  2021-01-18 19:35 ` [PATCH 05/11] xfs: simplify the read/write tracepoints Christoph Hellwig
                   ` (6 subsequent siblings)
  10 siblings, 2 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi, Dave Chinner

The iomap code has been designed from the start not to do magic fallback,
so remove the assert in preparation for further code cleanups.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_file.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ae7313ccaa11ed..97836ec53397d4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -610,12 +610,6 @@ xfs_file_dio_write(
 out:
 	if (iolock)
 		xfs_iunlock(ip, iolock);
-
-	/*
-	 * No fallback to buffered IO after short writes for XFS, direct I/O
-	 * will either complete fully or return an error.
-	 */
-	ASSERT(ret < 0 || ret == count);
 	return ret;
 }
 
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 05/11] xfs: simplify the read/write tracepoints
  2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
                   ` (3 preceding siblings ...)
  2021-01-18 19:35 ` [PATCH 04/11] xfs: remove the buffered I/O fallback assert Christoph Hellwig
@ 2021-01-18 19:35 ` Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:45   ` Darrick J. Wong
  2021-01-18 19:35 ` [PATCH 06/11] xfs: improve the reflink_bounce_dio_write tracepoint Christoph Hellwig
                   ` (5 subsequent siblings)
  10 siblings, 2 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi, Dave Chinner

Pass the iocb and iov_iter to the tracepoints and leave decoding of
actual arguments to the code only run when tracing is enabled.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_file.c  | 20 ++++++++------------
 fs/xfs/xfs_trace.h | 18 +++++++++---------
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 97836ec53397d4..aa64e78fc3c467 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -220,12 +220,11 @@ xfs_file_dio_read(
 	struct iov_iter		*to)
 {
 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
-	size_t			count = iov_iter_count(to);
 	ssize_t			ret;
 
-	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
+	trace_xfs_file_direct_read(iocb, to);
 
-	if (!count)
+	if (!iov_iter_count(to))
 		return 0; /* skip atime */
 
 	file_accessed(iocb->ki_filp);
@@ -246,12 +245,11 @@ xfs_file_dax_read(
 	struct iov_iter		*to)
 {
 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
-	size_t			count = iov_iter_count(to);
 	ssize_t			ret = 0;
 
-	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+	trace_xfs_file_dax_read(iocb, to);
 
-	if (!count)
+	if (!iov_iter_count(to))
 		return 0; /* skip atime */
 
 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
@@ -272,7 +270,7 @@ xfs_file_buffered_read(
 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
 	ssize_t			ret;
 
-	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+	trace_xfs_file_buffered_read(iocb, to);
 
 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
 	if (ret)
@@ -599,7 +597,7 @@ xfs_file_dio_write(
 		iolock = XFS_IOLOCK_SHARED;
 	}
 
-	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+	trace_xfs_file_direct_write(iocb, from);
 	/*
 	 * If unaligned, this is the only IO in-flight. Wait on it before we
 	 * release the iolock to prevent subsequent overlapping IO.
@@ -622,7 +620,6 @@ xfs_file_dax_write(
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			iolock = XFS_IOLOCK_EXCL;
 	ssize_t			ret, error = 0;
-	size_t			count;
 	loff_t			pos;
 
 	ret = xfs_ilock_iocb(iocb, iolock);
@@ -633,9 +630,8 @@ xfs_file_dax_write(
 		goto out;
 
 	pos = iocb->ki_pos;
-	count = iov_iter_count(from);
 
-	trace_xfs_file_dax_write(ip, count, pos);
+	trace_xfs_file_dax_write(iocb, from);
 	ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
 		i_size_write(inode, iocb->ki_pos);
@@ -683,7 +679,7 @@ xfs_file_buffered_write(
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
 
-	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+	trace_xfs_file_buffered_write(iocb, from);
 	ret = iomap_file_buffered_write(iocb, from,
 			&xfs_buffered_write_iomap_ops);
 	if (likely(ret >= 0))
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 5a263ae3d4f008..a6d04d860a565e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1287,8 +1287,8 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
 )
 
 DECLARE_EVENT_CLASS(xfs_file_class,
-	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
-	TP_ARGS(ip, count, offset),
+	TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),
+	TP_ARGS(iocb, iter),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
@@ -1297,11 +1297,11 @@ DECLARE_EVENT_CLASS(xfs_file_class,
 		__field(size_t, count)
 	),
 	TP_fast_assign(
-		__entry->dev = VFS_I(ip)->i_sb->s_dev;
-		__entry->ino = ip->i_ino;
-		__entry->size = ip->i_d.di_size;
-		__entry->offset = offset;
-		__entry->count = count;
+		__entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
+		__entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino;
+		__entry->size = XFS_I(file_inode(iocb->ki_filp))->i_d.di_size;
+		__entry->offset = iocb->ki_pos;
+		__entry->count = iov_iter_count(iter);
 	),
 	TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1313,8 +1313,8 @@ DECLARE_EVENT_CLASS(xfs_file_class,
 
 #define DEFINE_RW_EVENT(name)		\
 DEFINE_EVENT(xfs_file_class, name,	\
-	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),	\
-	TP_ARGS(ip, count, offset))
+	TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),		\
+	TP_ARGS(iocb, iter))
 DEFINE_RW_EVENT(xfs_file_buffered_read);
 DEFINE_RW_EVENT(xfs_file_direct_read);
 DEFINE_RW_EVENT(xfs_file_dax_read);
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 06/11] xfs: improve the reflink_bounce_dio_write tracepoint
  2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
                   ` (4 preceding siblings ...)
  2021-01-18 19:35 ` [PATCH 05/11] xfs: simplify the read/write tracepoints Christoph Hellwig
@ 2021-01-18 19:35 ` Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:45   ` Darrick J. Wong
  2021-01-18 19:35 ` [PATCH 07/11] xfs: split unaligned DIO write code out Christoph Hellwig
                   ` (4 subsequent siblings)
  10 siblings, 2 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi, Dave Chinner

Use a more suitable event class.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_file.c  | 2 +-
 fs/xfs/xfs_trace.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index aa64e78fc3c467..a696bd34f71d21 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -560,7 +560,7 @@ xfs_file_dio_write(
 		 * files yet, as we can't unshare a partial block.
 		 */
 		if (xfs_is_cow_inode(ip)) {
-			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
+			trace_xfs_reflink_bounce_dio_write(iocb, from);
 			return -ENOTBLK;
 		}
 		iolock = XFS_IOLOCK_EXCL;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a6d04d860a565e..0cfd65cd67c190 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1321,6 +1321,8 @@ DEFINE_RW_EVENT(xfs_file_dax_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
 DEFINE_RW_EVENT(xfs_file_dax_write);
+DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
+
 
 DECLARE_EVENT_CLASS(xfs_imap_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
@@ -3294,8 +3296,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
 
-DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
-
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 07/11] xfs: split unaligned DIO write code out
  2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
                   ` (5 preceding siblings ...)
  2021-01-18 19:35 ` [PATCH 06/11] xfs: improve the reflink_bounce_dio_write tracepoint Christoph Hellwig
@ 2021-01-18 19:35 ` Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:46   ` Darrick J. Wong
  2021-01-18 19:35 ` [PATCH 08/11] iomap: rename the flags variable in __iomap_dio_rw Christoph Hellwig
                   ` (3 subsequent siblings)
  10 siblings, 2 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi, Dave Chinner

From: Dave Chinner <dchinner@redhat.com>

The unaligned DIO write path is more convoluted than the normal path,
and we are about to make it more complex. Keep the block aligned
fast path dio write code trim and simple by splitting out the
unaligned DIO code from it.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
[hch: rebased, fixed a few minor nits]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_file.c | 168 +++++++++++++++++++++++++---------------------
 1 file changed, 92 insertions(+), 76 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a696bd34f71d21..bffd7240cefb7f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -500,117 +500,133 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
 };
 
 /*
- * xfs_file_dio_write - handle direct IO writes
+ * Handle block aligned direct IO writes
  *
  * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By separating it from the buffered write path we remove all the tricky to
- * follow locking changes and looping.
  *
  * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
  * until we're sure the bytes at the new EOF have been zeroed and/or the cached
  * pages are flushed out.
+ */
+static noinline ssize_t
+xfs_file_dio_write_aligned(
+	struct xfs_inode	*ip,
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	int			iolock = XFS_IOLOCK_SHARED;
+	ssize_t			ret;
+
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
+	ret = xfs_file_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
+	 * the iolock back to shared if we had to take the exclusive lock in
+	 * xfs_file_write_checks() for other reasons.
+	 */
+	if (iolock == XFS_IOLOCK_EXCL) {
+		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		iolock = XFS_IOLOCK_SHARED;
+	}
+	trace_xfs_file_direct_write(iocb, from);
+	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+			   &xfs_dio_write_ops, is_sync_kiocb(iocb));
+out_unlock:
+	if (iolock)
+		xfs_iunlock(ip, iolock);
+	return ret;
+}
+
+/*
+ * Handle block unaligned direct IO writes
+ *
+ * In most cases direct IO writes will be done holding IOLOCK_SHARED, allowing
+ * them to be done in parallel with reads and other direct IO writes.  However,
+ * if the I/O is not aligned to filesystem blocks, the direct I/O layer may
+ * need to do sub-block zeroing and that requires serialisation against other
+ * direct I/Os to the same block. In this case we need to serialise the
+ * submission of the unaligned I/Os so that we don't get racing block zeroing in
+ * the dio layer.
  *
- * In most cases the direct IO writes will be done holding IOLOCK_SHARED
- * allowing them to be done in parallel with reads and other direct IO writes.
- * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- * needs to do sub-block zeroing and that requires serialisation against other
- * direct IOs to the same block. In this case we need to serialise the
- * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer.  To avoid the problem with aio, we also need to wait for
+ * To provide the same serialisation for AIO, we also need to wait for
  * outstanding IOs to complete so that unwritten extent conversion is completed
  * before we try to map the overlapping block. This is currently implemented by
  * hitting it with a big hammer (i.e. inode_dio_wait()).
  *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
+ * This means that unaligned dio writes always block. There is no "nowait" fast
+ * path in this code - if IOCB_NOWAIT is set we simply return -EAGAIN up front
+ * and we don't have to worry about that anymore.
  */
-STATIC ssize_t
-xfs_file_dio_write(
+static noinline ssize_t
+xfs_file_dio_write_unaligned(
+	struct xfs_inode	*ip,
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
-	struct file		*file = iocb->ki_filp;
-	struct address_space	*mapping = file->f_mapping;
-	struct inode		*inode = mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			ret = 0;
-	int			unaligned_io = 0;
-	int			iolock;
-	size_t			count = iov_iter_count(from);
-	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
+	int			iolock = XFS_IOLOCK_EXCL;
+	ssize_t			ret;
 
-	/* DIO must be aligned to device logical sector size */
-	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
-		return -EINVAL;
+	/* unaligned dio always waits, bail */
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EAGAIN;
+	xfs_ilock(ip, iolock);
 
 	/*
-	 * Don't take the exclusive iolock here unless the I/O is unaligned to
-	 * the file system block size.  We don't need to consider the EOF
-	 * extension case here because xfs_file_write_checks() will relock
-	 * the inode as necessary for EOF zeroing cases and fill out the new
-	 * inode size as appropriate.
+	 * We can't properly handle unaligned direct I/O to reflink files yet,
+	 * as we can't unshare a partial block.
 	 */
-	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
-		unaligned_io = 1;
-
-		/*
-		 * We can't properly handle unaligned direct I/O to reflink
-		 * files yet, as we can't unshare a partial block.
-		 */
-		if (xfs_is_cow_inode(ip)) {
-			trace_xfs_reflink_bounce_dio_write(iocb, from);
-			return -ENOTBLK;
-		}
-		iolock = XFS_IOLOCK_EXCL;
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
-	}
-
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		/* unaligned dio always waits, bail */
-		if (unaligned_io)
-			return -EAGAIN;
-		if (!xfs_ilock_nowait(ip, iolock))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
+	if (xfs_is_cow_inode(ip)) {
+		trace_xfs_reflink_bounce_dio_write(iocb, from);
+		ret = -ENOTBLK;
+		goto out_unlock;
 	}
 
 	ret = xfs_file_write_checks(iocb, from, &iolock);
 	if (ret)
-		goto out;
-	count = iov_iter_count(from);
+		goto out_unlock;
 
 	/*
-	 * If we are doing unaligned IO, we can't allow any other overlapping IO
-	 * in-flight at the same time or we risk data corruption. Wait for all
-	 * other IO to drain before we submit. If the IO is aligned, demote the
-	 * iolock if we had to take the exclusive lock in
-	 * xfs_file_write_checks() for other reasons.
+	 * If we are doing unaligned I/O, we can't allow any other overlapping
+	 * I/O in-flight at the same time or we risk data corruption. Wait for
+	 * all other I/O to drain before we submit.
 	 */
-	if (unaligned_io) {
-		inode_dio_wait(inode);
-	} else if (iolock == XFS_IOLOCK_EXCL) {
-		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
-		iolock = XFS_IOLOCK_SHARED;
-	}
+	inode_dio_wait(VFS_I(ip));
 
-	trace_xfs_file_direct_write(iocb, from);
 	/*
-	 * If unaligned, this is the only IO in-flight. Wait on it before we
-	 * release the iolock to prevent subsequent overlapping IO.
+	 * This must be the only I/O in-flight. Wait on it before we release the
+	 * iolock to prevent subsequent overlapping I/O.
 	 */
+	trace_xfs_file_direct_write(iocb, from);
 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
-			   &xfs_dio_write_ops,
-			   is_sync_kiocb(iocb) || unaligned_io);
-out:
+			   &xfs_dio_write_ops, true);
+out_unlock:
 	if (iolock)
 		xfs_iunlock(ip, iolock);
 	return ret;
 }
 
+static ssize_t
+xfs_file_dio_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
+	size_t			count = iov_iter_count(from);
+
+	/* DIO must be aligned to device logical sector size */
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+		return -EINVAL;
+	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+		return xfs_file_dio_write_unaligned(ip, iocb, from);
+	return xfs_file_dio_write_aligned(ip, iocb, from);
+}
+
 static noinline ssize_t
 xfs_file_dax_write(
 	struct kiocb		*iocb,
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 08/11] iomap: rename the flags variable in __iomap_dio_rw
  2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
                   ` (6 preceding siblings ...)
  2021-01-18 19:35 ` [PATCH 07/11] xfs: split unaligned DIO write code out Christoph Hellwig
@ 2021-01-18 19:35 ` Christoph Hellwig
  2021-01-18 20:34   ` Dave Chinner
                     ` (2 more replies)
  2021-01-18 19:35 ` [PATCH 09/11] iomap: pass a flags argument to iomap_dio_rw Christoph Hellwig
                   ` (2 subsequent siblings)
  10 siblings, 3 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi

Rename flags to iomap_flags to make the usage a little more clear.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/direct-io.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 933f234d5becd0..604103ab76f9c5 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -427,7 +427,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	loff_t pos = iocb->ki_pos;
 	loff_t end = iocb->ki_pos + count - 1, ret = 0;
-	unsigned int flags = IOMAP_DIRECT;
+	unsigned int iomap_flags = IOMAP_DIRECT;
 	struct blk_plug plug;
 	struct iomap_dio *dio;
 
@@ -461,7 +461,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		if (iter_is_iovec(iter))
 			dio->flags |= IOMAP_DIO_DIRTY;
 	} else {
-		flags |= IOMAP_WRITE;
+		iomap_flags |= IOMAP_WRITE;
 		dio->flags |= IOMAP_DIO_WRITE;
 
 		/* for data sync or sync, we need sync completion processing */
@@ -483,7 +483,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			ret = -EAGAIN;
 			goto out_free_dio;
 		}
-		flags |= IOMAP_NOWAIT;
+		iomap_flags |= IOMAP_NOWAIT;
 	}
 
 	ret = filemap_write_and_wait_range(mapping, pos, end);
@@ -514,7 +514,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 
 	blk_start_plug(&plug);
 	do {
-		ret = iomap_apply(inode, pos, count, flags, ops, dio,
+		ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio,
 				iomap_dio_actor);
 		if (ret <= 0) {
 			/* magic error code to fall back to buffered I/O */
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 09/11] iomap: pass a flags argument to iomap_dio_rw
  2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
                   ` (7 preceding siblings ...)
  2021-01-18 19:35 ` [PATCH 08/11] iomap: rename the flags variable in __iomap_dio_rw Christoph Hellwig
@ 2021-01-18 19:35 ` Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:17   ` Darrick J. Wong
  2021-01-18 19:35 ` [PATCH 10/11] iomap: add a IOMAP_DIO_UNALIGNED flag Christoph Hellwig
  2021-01-18 19:35 ` [PATCH 11/11] xfs: reduce exclusive locking on unaligned dio Christoph Hellwig
  10 siblings, 2 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi

Pass a set of flags to iomap_dio_rw instead of the boolean
wait_for_completion argument.  The IOMAP_DIO_FORCE_WAIT flag
replaces wait_for_completion, but to simplify the callers it
only needs to be passed when the iocb is not already
synchronous.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/file.c       |  7 +++----
 fs/ext4/file.c        |  5 ++---
 fs/gfs2/file.c        |  7 ++-----
 fs/iomap/direct-io.c  | 11 +++++------
 fs/xfs/xfs_file.c     |  7 +++----
 fs/zonefs/super.c     |  4 ++--
 include/linux/iomap.h | 10 ++++++++--
 7 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0e41459b8de667..ddfd2e2adedf58 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1949,8 +1949,8 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 		goto buffered;
 	}
 
-	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops,
-			     &btrfs_dio_ops, is_sync_kiocb(iocb));
+	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+			     0);
 
 	btrfs_inode_unlock(inode, ilock_flags);
 
@@ -3622,8 +3622,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
 		return 0;
 
 	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
-	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-			   is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 	return ret;
 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 349b27f0dda0cb..194f5d00fa3267 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -74,8 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		return generic_file_read_iter(iocb, to);
 	}
 
-	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
-			   is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
 	inode_unlock_shared(inode);
 
 	file_accessed(iocb->ki_filp);
@@ -550,7 +549,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ilock_shared)
 		iomap_ops = &ext4_iomap_overwrite_ops;
 	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
-			   is_sync_kiocb(iocb) || unaligned_io || extend);
+			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
 	if (ret == -ENOTBLK)
 		ret = 0;
 
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b39b339feddc93..89609c2997177a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -797,9 +797,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
 	if (ret)
 		goto out_uninit;
 
-	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
-			   is_sync_kiocb(iocb));
-
+	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
 	gfs2_glock_dq(gh);
 out_uninit:
 	gfs2_holder_uninit(gh);
@@ -833,8 +831,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 	if (offset + len > i_size_read(&ip->i_inode))
 		goto out;
 
-	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
-			   is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
 	if (ret == -ENOTBLK)
 		ret = 0;
 out:
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 604103ab76f9c5..32dbbf7dd4aadb 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -420,13 +420,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 struct iomap_dio *
 __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion)
+		unsigned int dio_flags)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = file_inode(iocb->ki_filp);
 	size_t count = iov_iter_count(iter);
 	loff_t pos = iocb->ki_pos;
 	loff_t end = iocb->ki_pos + count - 1, ret = 0;
+	bool wait_for_completion =
+		is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
 	unsigned int iomap_flags = IOMAP_DIRECT;
 	struct blk_plug plug;
 	struct iomap_dio *dio;
@@ -434,9 +436,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (!count)
 		return NULL;
 
-	if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
-		return ERR_PTR(-EIO);
-
 	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
 	if (!dio)
 		return ERR_PTR(-ENOMEM);
@@ -598,11 +597,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
 ssize_t
 iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion)
+		unsigned int flags)
 {
 	struct iomap_dio *dio;
 
-	dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion);
+	dio = __iomap_dio_rw(iocb, iter, ops, dops, flags);
 	if (IS_ERR_OR_NULL(dio))
 		return PTR_ERR_OR_ZERO(dio);
 	return iomap_dio_complete(dio);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index bffd7240cefb7f..b181db42f2f32f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -232,8 +232,7 @@ xfs_file_dio_read(
 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
 	if (ret)
 		return ret;
-	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
-			is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	return ret;
@@ -535,7 +534,7 @@ xfs_file_dio_write_aligned(
 	}
 	trace_xfs_file_direct_write(iocb, from);
 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
-			   &xfs_dio_write_ops, is_sync_kiocb(iocb));
+			   &xfs_dio_write_ops, 0);
 out_unlock:
 	if (iolock)
 		xfs_iunlock(ip, iolock);
@@ -603,7 +602,7 @@ xfs_file_dio_write_unaligned(
 	 */
 	trace_xfs_file_direct_write(iocb, from);
 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
-			   &xfs_dio_write_ops, true);
+			   &xfs_dio_write_ops, IOMAP_DIO_FORCE_WAIT);
 out_unlock:
 	if (iolock)
 		xfs_iunlock(ip, iolock);
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index bec47f2d074beb..0e7ab0bc00ae8e 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -780,7 +780,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 		ret = zonefs_file_dio_append(iocb, from);
 	else
 		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
-				   &zonefs_write_dio_ops, sync);
+				   &zonefs_write_dio_ops, 0);
 	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
 	    (ret > 0 || ret == -EIOCBQUEUED)) {
 		if (ret > 0)
@@ -917,7 +917,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		}
 		file_accessed(iocb->ki_filp);
 		ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
-				   &zonefs_read_dio_ops, is_sync_kiocb(iocb));
+				   &zonefs_read_dio_ops, 0);
 	} else {
 		ret = generic_file_read_iter(iocb, to);
 		if (ret == -EIO)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 5bd3cac4df9cb4..b322598dc10ec0 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -256,12 +256,18 @@ struct iomap_dio_ops {
 			struct bio *bio, loff_t file_offset);
 };
 
+/*
+ * Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not
+ * synchronous.
+ */
+#define IOMAP_DIO_FORCE_WAIT	(1 << 0)
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion);
+		unsigned int flags);
 struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion);
+		unsigned int flags);
 ssize_t iomap_dio_complete(struct iomap_dio *dio);
 int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
 
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 10/11] iomap: add a IOMAP_DIO_UNALIGNED flag
  2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
                   ` (8 preceding siblings ...)
  2021-01-18 19:35 ` [PATCH 09/11] iomap: pass a flags argument to iomap_dio_rw Christoph Hellwig
@ 2021-01-18 19:35 ` Christoph Hellwig
  2021-01-18 20:45   ` Dave Chinner
                     ` (2 more replies)
  2021-01-18 19:35 ` [PATCH 11/11] xfs: reduce exclusive locking on unaligned dio Christoph Hellwig
  10 siblings, 3 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi

Add a flag to signal an I/O that is not file system block aligned.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/direct-io.c  | 7 +++++++
 include/linux/iomap.h | 8 ++++++++
 2 files changed, 15 insertions(+)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 32dbbf7dd4aadb..d93019ee4c9e3e 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -485,6 +485,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		iomap_flags |= IOMAP_NOWAIT;
 	}
 
+	if (dio_flags & IOMAP_DIO_UNALIGNED) {
+		ret = -EAGAIN;
+		if (pos >= dio->i_size)
+			goto out_free_dio;
+		iomap_flags |= IOMAP_UNALIGNED;
+	}
+
 	ret = filemap_write_and_wait_range(mapping, pos, end);
 	if (ret)
 		goto out_free_dio;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index b322598dc10ec0..2fa94ec9583d0a 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -122,6 +122,7 @@ struct iomap_page_ops {
 #define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
 #define IOMAP_DIRECT		(1 << 4) /* direct I/O */
 #define IOMAP_NOWAIT		(1 << 5) /* do not block */
+#define IOMAP_UNALIGNED		(1 << 6) /* do not allocate blocks */
 
 struct iomap_ops {
 	/*
@@ -262,6 +263,13 @@ struct iomap_dio_ops {
  */
 #define IOMAP_DIO_FORCE_WAIT	(1 << 0)
 
+/*
+ * Direct I/O that is not aligned to the file system block.  Do not allocate
+ * blocks and do not zero partial blocks, fall back to the caller by returning
+ * -EAGAIN instead.
+ */
+#define IOMAP_DIO_UNALIGNED	(1 << 1)
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int flags);
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 11/11] xfs: reduce exclusive locking on unaligned dio
  2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
                   ` (9 preceding siblings ...)
  2021-01-18 19:35 ` [PATCH 10/11] iomap: add a IOMAP_DIO_UNALIGNED flag Christoph Hellwig
@ 2021-01-18 19:35 ` Christoph Hellwig
  2021-01-18 20:55   ` Dave Chinner
  2021-01-20 18:40   ` Darrick J. Wong
  10 siblings, 2 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-18 19:35 UTC (permalink / raw)
  To: linux-xfs; +Cc: linux-fsdevel, avi, Dave Chinner

From: Dave Chinner <dchinner@redhat.com>

Attempt shared locking for unaligned DIO, but only if the
underlying extent is already allocated and in written state. On
failure, retry with the existing exclusive locking.

Test case is fio randrw of 512 byte IOs using AIO and an iodepth of
32 IOs.

Vanilla:

  READ: bw=4560KiB/s (4670kB/s), 4560KiB/s-4560KiB/s (4670kB/s-4670kB/s), io=134MiB (140MB), run=30001-30001msec
  WRITE: bw=4567KiB/s (4676kB/s), 4567KiB/s-4567KiB/s (4676kB/s-4676kB/s), io=134MiB (140MB), run=30001-30001msec

Patched:
   READ: bw=37.6MiB/s (39.4MB/s), 37.6MiB/s-37.6MiB/s (39.4MB/s-39.4MB/s), io=1127MiB (1182MB), run=30002-30002msec
  WRITE: bw=37.6MiB/s (39.4MB/s), 37.6MiB/s-37.6MiB/s (39.4MB/s-39.4MB/s), io=1128MiB (1183MB), run=30002-30002msec

That's an improvement from ~18k IOPS to ~150k IOPS, which is
about the IOPS limit of the VM block device setup I'm testing on.

4kB block IO comparison:

   READ: bw=296MiB/s (310MB/s), 296MiB/s-296MiB/s (310MB/s-310MB/s), io=8868MiB (9299MB), run=30002-30002msec
  WRITE: bw=296MiB/s (310MB/s), 296MiB/s-296MiB/s (310MB/s-310MB/s), io=8878MiB (9309MB), run=30002-30002msec

Which is ~150k IOPS, same as what the test gets for sub-block
AIO+DIO writes with this patch.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
[hch: rebased, split unaligned from nowait]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_file.c  | 87 ++++++++++++++++++++++++++++++++--------------
 fs/xfs/xfs_iomap.c | 31 ++++++++++++-----
 2 files changed, 84 insertions(+), 34 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b181db42f2f32f..4e475e750148db 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -544,22 +544,35 @@ xfs_file_dio_write_aligned(
 /*
  * Handle block unaligned direct IO writes
  *
- * In most cases direct IO writes will be done holding IOLOCK_SHARED, allowing
- * them to be done in parallel with reads and other direct IO writes.  However,
- * if the I/O is not aligned to filesystem blocks, the direct I/O layer may
- * need to do sub-block zeroing and that requires serialisation against other
- * direct I/Os to the same block. In this case we need to serialise the
- * submission of the unaligned I/Os so that we don't get racing block zeroing in
- * the dio layer.
+ * In most cases direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * may need to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In the case where sub-block zeroing is not
+ * required, we can do concurrent sub-block dios to the same block successfully.
  *
- * To provide the same serialisation for AIO, we also need to wait for
+ * Hence we have two cases here - the shared, optimistic fast path for written
+ * extents, and everything else that needs exclusive IO path access across the
+ * entire IO.
+ *
+ * For the first case, we do all the checks we need at the mapping layer in the
+ * DIO code as part of the existing NOWAIT infrastructure. Hence all we need to
+ * do to support concurrent subblock dio is first try a non-blocking submission.
+ * If that returns -EAGAIN, then we simply repeat the IO submission with full
+ * IO exclusivity guaranteed so that we avoid racing sub-block zeroing.
+ *
+ * The only wrinkle in this case is that the iomap DIO code always does
+ * partial tail sub-block zeroing for post-EOF writes. Hence for any IO that
+ * _ends_ past the current EOF we need to run with full exclusivity. Note that
+ * we also check for the start of IO being beyond EOF because then zeroing
+ * between the old EOF and the start of the IO is required and that also
+ * requires exclusivity. Hence we avoid lock cycles and blocking under
+ * IOCB_NOWAIT for this situation, too.
+ *
+ * To provide the exclusivity required when using AIO, we also need to wait for
  * outstanding IOs to complete so that unwritten extent conversion is completed
  * before we try to map the overlapping block. This is currently implemented by
  * hitting it with a big hammer (i.e. inode_dio_wait()).
- *
- * This means that unaligned dio writes always block. There is no "nowait" fast
- * path in this code - if IOCB_NOWAIT is set we simply return -EAGAIN up front
- * and we don't have to worry about that anymore.
  */
 static noinline ssize_t
 xfs_file_dio_write_unaligned(
@@ -567,13 +580,27 @@ xfs_file_dio_write_unaligned(
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
-	int			iolock = XFS_IOLOCK_EXCL;
+	size_t			isize = i_size_read(VFS_I(ip));
+	size_t			count = iov_iter_count(from);
+	int			iolock = XFS_IOLOCK_SHARED;
+	unsigned int		flags = IOMAP_DIO_UNALIGNED;
 	ssize_t			ret;
 
-	/* unaligned dio always waits, bail */
-	if (iocb->ki_flags & IOCB_NOWAIT)
-		return -EAGAIN;
-	xfs_ilock(ip, iolock);
+	/*
+	 * Extending writes need exclusivity because of the sub-block zeroing
+	 * that the DIO code always does for partial tail blocks beyond EOF.
+	 */
+	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
+retry_exclusive:
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return -EAGAIN;
+		iolock = XFS_IOLOCK_EXCL;
+		flags = IOMAP_DIO_FORCE_WAIT;
+	}
+
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
 
 	/*
 	 * We can't properly handle unaligned direct I/O to reflink files yet,
@@ -590,19 +617,27 @@ xfs_file_dio_write_unaligned(
 		goto out_unlock;
 
 	/*
-	 * If we are doing unaligned I/O, we can't allow any other overlapping
-	 * I/O in-flight at the same time or we risk data corruption. Wait for
-	 * all other I/O to drain before we submit.
+	 * If we are doing exclusive unaligned IO, we can't allow any other
+	 * overlapping IO in-flight at the same time or we risk data corruption.
+	 * Wait for all other IO to drain before we submit.
 	 */
-	inode_dio_wait(VFS_I(ip));
+	if (!(flags & IOMAP_DIO_UNALIGNED))
+		inode_dio_wait(VFS_I(ip));
 
-	/*
-	 * This must be the only I/O in-flight. Wait on it before we release the
-	 * iolock to prevent subsequent overlapping I/O.
-	 */
 	trace_xfs_file_direct_write(iocb, from);
 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
-			   &xfs_dio_write_ops, IOMAP_DIO_FORCE_WAIT);
+			   &xfs_dio_write_ops, flags);
+	/*
+	 * Retry unaligned IO with exclusive blocking semantics if the DIO
+	 * layer rejected it for mapping or locking reasons. If we are doing
+	 * nonblocking user IO, propagate the error.
+	 */
+	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
+		ASSERT(flags & IOMAP_DIO_UNALIGNED);
+		xfs_iunlock(ip, iolock);
+		goto retry_exclusive;
+	}
+
 out_unlock:
 	if (iolock)
 		xfs_iunlock(ip, iolock);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7b9ff824e82d48..dc8c86e98b99bf 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -784,15 +784,30 @@ xfs_direct_write_iomap_begin(
 		goto allocate_blocks;
 
 	/*
-	 * NOWAIT IO needs to span the entire requested IO with a single map so
-	 * that we avoid partial IO failures due to the rest of the IO range not
-	 * covered by this map triggering an EAGAIN condition when it is
-	 * subsequently mapped and aborting the IO.
+	 * NOWAIT and unaligned IO needs to span the entire requested IO with a
+	 * single map so that we avoid partial IO failures due to the rest of
+	 * the IO range not covered by this map triggering an EAGAIN condition
+	 * when it is subsequently mapped and aborting the IO.
 	 */
-	if ((flags & IOMAP_NOWAIT) &&
-	    !imap_spans_range(&imap, offset_fsb, end_fsb)) {
+	if (flags & (IOMAP_NOWAIT | IOMAP_UNALIGNED)) {
 		error = -EAGAIN;
-		goto out_unlock;
+		if (!imap_spans_range(&imap, offset_fsb, end_fsb))
+			goto out_unlock;
+	}
+
+	/*
+	 * For unaligned I/O we can't convert an unwritten extent if the I/O is
+	 * not block size aligned, as such a conversion would have to do
+	 * sub-block zeroing, and that can only be done under an exclusive
+	 * IOLOCK. Hence if this is not a written extent, return EAGAIN to tell
+	 * the caller to try again.
+	 */
+	if (flags & IOMAP_UNALIGNED) {
+		error = -EAGAIN;
+		if (imap.br_state != XFS_EXT_NORM &&
+		    ((offset & mp->m_blockmask) ||
+		     ((offset + length) & mp->m_blockmask)))
+			goto out_unlock;
 	}
 
 	xfs_iunlock(ip, lockmode);
@@ -801,7 +816,7 @@ xfs_direct_write_iomap_begin(
 
 allocate_blocks:
 	error = -EAGAIN;
-	if (flags & IOMAP_NOWAIT)
+	if (flags & (IOMAP_NOWAIT | IOMAP_UNALIGNED))
 		goto out_unlock;
 
 	/*
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH 08/11] iomap: rename the flags variable in __iomap_dio_rw
  2021-01-18 19:35 ` [PATCH 08/11] iomap: rename the flags variable in __iomap_dio_rw Christoph Hellwig
@ 2021-01-18 20:34   ` Dave Chinner
  2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:46   ` Darrick J. Wong
  2 siblings, 0 replies; 42+ messages in thread
From: Dave Chinner @ 2021-01-18 20:34 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi

On Mon, Jan 18, 2021 at 08:35:13PM +0100, Christoph Hellwig wrote:
> Rename flags to iomap_flags to make the usage a little more clear.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Makes sense.

Reviewed-by: Dave Chinner <dchinner@redhat.com>
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/11] iomap: add a IOMAP_DIO_UNALIGNED flag
  2021-01-18 19:35 ` [PATCH 10/11] iomap: add a IOMAP_DIO_UNALIGNED flag Christoph Hellwig
@ 2021-01-18 20:45   ` Dave Chinner
  2021-01-18 21:41   ` Matthew Wilcox
  2021-01-20 18:47   ` Darrick J. Wong
  2 siblings, 0 replies; 42+ messages in thread
From: Dave Chinner @ 2021-01-18 20:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi

On Mon, Jan 18, 2021 at 08:35:15PM +0100, Christoph Hellwig wrote:
> Add a flag to signal an I/O that is not file system block aligned.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap/direct-io.c  | 7 +++++++
>  include/linux/iomap.h | 8 ++++++++
>  2 files changed, 15 insertions(+)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 32dbbf7dd4aadb..d93019ee4c9e3e 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -485,6 +485,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		iomap_flags |= IOMAP_NOWAIT;
>  	}
>  
> +	if (dio_flags & IOMAP_DIO_UNALIGNED) {
> +		ret = -EAGAIN;
> +		if (pos >= dio->i_size)
> +			goto out_free_dio;

This also needs to check for pos+len > dio->i_size on a write as
iomap_dio_rw_actor will do unconditional sub-block zeroing in that
case, too.

> +		iomap_flags |= IOMAP_UNALIGNED;
> +	}
> +
>  	ret = filemap_write_and_wait_range(mapping, pos, end);
>  	if (ret)
>  		goto out_free_dio;
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index b322598dc10ec0..2fa94ec9583d0a 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -122,6 +122,7 @@ struct iomap_page_ops {
>  #define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
>  #define IOMAP_DIRECT		(1 << 4) /* direct I/O */
>  #define IOMAP_NOWAIT		(1 << 5) /* do not block */
> +#define IOMAP_UNALIGNED		(1 << 6) /* do not allocate blocks */
>  
>  struct iomap_ops {
>  	/*
> @@ -262,6 +263,13 @@ struct iomap_dio_ops {
>   */
>  #define IOMAP_DIO_FORCE_WAIT	(1 << 0)
>  
> +/*
> + * Direct I/O that is not aligned to the file system block.  Do not allocate
> + * blocks and do not zero partial blocks, fall back to the caller by returning
> + * -EAGAIN instead.
> + */
> +#define IOMAP_DIO_UNALIGNED	(1 << 1)

I'd describe it a little bit differently, clearly indicating that
this is for optional behaviour and not needed on all unaligned DIO.

/*
 * Filesystems may need to special case DIO that is not aligned to
 * block boundaries. If they set IOMAP_DIO_UNALIGNED on an unaligned
 * IO, then do not allocate blocks or zero partial blocks, but
 * instead fall back to the caller by returning -EAGAIN so they can
 * handle these conditions correctly.
 */

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 11/11] xfs: reduce exclusive locking on unaligned dio
  2021-01-18 19:35 ` [PATCH 11/11] xfs: reduce exclusive locking on unaligned dio Christoph Hellwig
@ 2021-01-18 20:55   ` Dave Chinner
  2021-01-20 16:36     ` Christoph Hellwig
  2021-01-20 18:40   ` Darrick J. Wong
  1 sibling, 1 reply; 42+ messages in thread
From: Dave Chinner @ 2021-01-18 20:55 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:16PM +0100, Christoph Hellwig wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Attempt shared locking for unaligned DIO, but only if the the
> underlying extent is already allocated and in written state. On
> failure, retry with the existing exclusive locking.
....
> @@ -590,19 +617,27 @@ xfs_file_dio_write_unaligned(
>  		goto out_unlock;
>  
>  	/*
> -	 * If we are doing unaligned I/O, we can't allow any other overlapping
> -	 * I/O in-flight at the same time or we risk data corruption. Wait for
> -	 * all other I/O to drain before we submit.
> +	 * If we are doing exclusive unaligned IO, we can't allow any other
> +	 * overlapping IO in-flight at the same time or we risk data corruption.
> +	 * Wait for all other IO to drain before we submit.
>  	 */
> -	inode_dio_wait(VFS_I(ip));
> +	if (!(flags & IOMAP_DIO_UNALIGNED))
> +		inode_dio_wait(VFS_I(ip));
>  
> -	/*
> -	 * This must be the only I/O in-flight. Wait on it before we release the
> -	 * iolock to prevent subsequent overlapping I/O.
> -	 */
>  	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> -			   &xfs_dio_write_ops, IOMAP_DIO_FORCE_WAIT);
> +			   &xfs_dio_write_ops, flags);
> +	/*
> +	 * Retry unaligned IO with exclusive blocking semantics if the DIO
> +	 * layer rejected it for mapping or locking reasons. If we are doing
> +	 * nonblocking user IO, propagate the error.
> +	 */
> +	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
> +		ASSERT(flags & IOMAP_DIO_UNALIGNED);
> +		xfs_iunlock(ip, iolock);
> +		goto retry_exclusive;
> +	}
> +
>  out_unlock:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);

Do we ever get here without holding the iolock anymore?

> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 7b9ff824e82d48..dc8c86e98b99bf 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -784,15 +784,30 @@ xfs_direct_write_iomap_begin(
>  		goto allocate_blocks;
>  
>  	/*
> -	 * NOWAIT IO needs to span the entire requested IO with a single map so
> -	 * that we avoid partial IO failures due to the rest of the IO range not
> -	 * covered by this map triggering an EAGAIN condition when it is
> -	 * subsequently mapped and aborting the IO.
> +	 * NOWAIT and unaligned IO needs to span the entire requested IO with a
> +	 * single map so that we avoid partial IO failures due to the rest of
> +	 * the IO range not covered by this map triggering an EAGAIN condition
> +	 * when it is subsequently mapped and aborting the IO.
>  	 */
> -	if ((flags & IOMAP_NOWAIT) &&
> -	    !imap_spans_range(&imap, offset_fsb, end_fsb)) {
> +	if (flags & (IOMAP_NOWAIT | IOMAP_UNALIGNED)) {
>  		error = -EAGAIN;
> -		goto out_unlock;
> +		if (!imap_spans_range(&imap, offset_fsb, end_fsb))
> +			goto out_unlock;
> +	}
> +
> +	/*
> +	 * For unsigned I/O we can't convert an unwritten extents if the I/O is
> +	 * not block size aligned, as such a conversion would have to do
> +	 * sub-block zeroing, and that can only be done under an exclusive
> +	 * IOLOCK. Hence if this is not a written extent, return EAGAIN to tell
> +	 * the caller to try again.
> +	 */

A few typos in that comment :)

	/*
	 * For unaligned IO, we cannot convert unwritten extents without
	 * requiring sub-block zeroing. This can only be done under an exclusive
	 * IOLOCK, hence return -EAGAIN if this is not a written extent to tell
	 * the caller to try again.
	 */

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/11] iomap: add a IOMAP_DIO_UNALIGNED flag
  2021-01-18 19:35 ` [PATCH 10/11] iomap: add a IOMAP_DIO_UNALIGNED flag Christoph Hellwig
  2021-01-18 20:45   ` Dave Chinner
@ 2021-01-18 21:41   ` Matthew Wilcox
  2021-01-20 16:40     ` Christoph Hellwig
  2021-01-20 18:47   ` Darrick J. Wong
  2 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox @ 2021-01-18 21:41 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi

On Mon, Jan 18, 2021 at 08:35:15PM +0100, Christoph Hellwig wrote:
> Add a flag to signal an I/O that is not file system block aligned.
> +	if (dio_flags & IOMAP_DIO_UNALIGNED) {

There are a number of things that DIO has to be aligned to -- memory
addresses, for example.  Can we be a little more verbose about what is
unaligned here?  eg

	if (dio_flags & IOMAP_DIO_FS_UNALIGNED)

(or FSBLK_UNALIGNED, or ... something).


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 03/11] xfs: cleanup the read/write helper naming
  2021-01-18 19:35 ` [PATCH 03/11] xfs: cleanup the read/write helper naming Christoph Hellwig
@ 2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:43   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2021-01-19 15:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:08PM +0100, Christoph Hellwig wrote:
> Drop a few pointless aio_ prefixes.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_file.c | 30 +++++++++++++++---------------
>  1 file changed, 15 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index fb4e6f2852bb8b..ae7313ccaa11ed 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -215,7 +215,7 @@ xfs_ilock_iocb(
>  }
>  
>  STATIC ssize_t
> -xfs_file_dio_aio_read(
> +xfs_file_dio_read(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*to)
>  {
> @@ -265,7 +265,7 @@ xfs_file_dax_read(
>  }
>  
>  STATIC ssize_t
> -xfs_file_buffered_aio_read(
> +xfs_file_buffered_read(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*to)
>  {
> @@ -300,9 +300,9 @@ xfs_file_read_iter(
>  	if (IS_DAX(inode))
>  		ret = xfs_file_dax_read(iocb, to);
>  	else if (iocb->ki_flags & IOCB_DIRECT)
> -		ret = xfs_file_dio_aio_read(iocb, to);
> +		ret = xfs_file_dio_read(iocb, to);
>  	else
> -		ret = xfs_file_buffered_aio_read(iocb, to);
> +		ret = xfs_file_buffered_read(iocb, to);
>  
>  	if (ret > 0)
>  		XFS_STATS_ADD(mp, xs_read_bytes, ret);
> @@ -317,7 +317,7 @@ xfs_file_read_iter(
>   * if called for a direct write beyond i_size.
>   */
>  STATIC ssize_t
> -xfs_file_aio_write_checks(
> +xfs_file_write_checks(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*from,
>  	int			*iolock)
> @@ -502,7 +502,7 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
>  };
>  
>  /*
> - * xfs_file_dio_aio_write - handle direct IO writes
> + * xfs_file_dio_write - handle direct IO writes
>   *
>   * Lock the inode appropriately to prepare for and issue a direct IO write.
>   * By separating it from the buffered write path we remove all the tricky to
> @@ -527,7 +527,7 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
>   * negative return values.
>   */
>  STATIC ssize_t
> -xfs_file_dio_aio_write(
> +xfs_file_dio_write(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*from)
>  {
> @@ -549,7 +549,7 @@ xfs_file_dio_aio_write(
>  	/*
>  	 * Don't take the exclusive iolock here unless the I/O is unaligned to
>  	 * the file system block size.  We don't need to consider the EOF
> -	 * extension case here because xfs_file_aio_write_checks() will relock
> +	 * extension case here because xfs_file_write_checks() will relock
>  	 * the inode as necessary for EOF zeroing cases and fill out the new
>  	 * inode size as appropriate.
>  	 */
> @@ -580,7 +580,7 @@ xfs_file_dio_aio_write(
>  		xfs_ilock(ip, iolock);
>  	}
>  
> -	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
> +	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
>  		goto out;
>  	count = iov_iter_count(from);
> @@ -590,7 +590,7 @@ xfs_file_dio_aio_write(
>  	 * in-flight at the same time or we risk data corruption. Wait for all
>  	 * other IO to drain before we submit. If the IO is aligned, demote the
>  	 * iolock if we had to take the exclusive lock in
> -	 * xfs_file_aio_write_checks() for other reasons.
> +	 * xfs_file_write_checks() for other reasons.
>  	 */
>  	if (unaligned_io) {
>  		inode_dio_wait(inode);
> @@ -634,7 +634,7 @@ xfs_file_dax_write(
>  	ret = xfs_ilock_iocb(iocb, iolock);
>  	if (ret)
>  		return ret;
> -	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
> +	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
>  		goto out;
>  
> @@ -663,7 +663,7 @@ xfs_file_dax_write(
>  }
>  
>  STATIC ssize_t
> -xfs_file_buffered_aio_write(
> +xfs_file_buffered_write(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*from)
>  {
> @@ -682,7 +682,7 @@ xfs_file_buffered_aio_write(
>  	iolock = XFS_IOLOCK_EXCL;
>  	xfs_ilock(ip, iolock);
>  
> -	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
> +	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
>  		goto out;
>  
> @@ -769,12 +769,12 @@ xfs_file_write_iter(
>  		 * CoW.  In all other directio scenarios we do not
>  		 * allow an operation to fall back to buffered mode.
>  		 */
> -		ret = xfs_file_dio_aio_write(iocb, from);
> +		ret = xfs_file_dio_write(iocb, from);
>  		if (ret != -ENOTBLK)
>  			return ret;
>  	}
>  
> -	return xfs_file_buffered_aio_write(iocb, from);
> +	return xfs_file_buffered_write(iocb, from);
>  }
>  
>  static void
> -- 
> 2.29.2
> 


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 04/11] xfs: remove the buffered I/O fallback assert
  2021-01-18 19:35 ` [PATCH 04/11] xfs: remove the buffered I/O fallback assert Christoph Hellwig
@ 2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:43   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2021-01-19 15:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:09PM +0100, Christoph Hellwig wrote:
> The iomap code has been designed from the start not to do magic fallback,
> so remove the assert in preparation for further code cleanups.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_file.c | 6 ------
>  1 file changed, 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index ae7313ccaa11ed..97836ec53397d4 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -610,12 +610,6 @@ xfs_file_dio_write(
>  out:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);
> -
> -	/*
> -	 * No fallback to buffered IO after short writes for XFS, direct I/O
> -	 * will either complete fully or return an error.
> -	 */
> -	ASSERT(ret < 0 || ret == count);
>  	return ret;
>  }
>  
> -- 
> 2.29.2
> 


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 05/11] xfs: simplify the read/write tracepoints
  2021-01-18 19:35 ` [PATCH 05/11] xfs: simplify the read/write tracepoints Christoph Hellwig
@ 2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:45   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2021-01-19 15:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:10PM +0100, Christoph Hellwig wrote:
> Pass the iocb and iov_iter to the tracepoints and leave decoding of
> actual arguments to the code only run when tracing is enabled.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_file.c  | 20 ++++++++------------
>  fs/xfs/xfs_trace.h | 18 +++++++++---------
>  2 files changed, 17 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 97836ec53397d4..aa64e78fc3c467 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -220,12 +220,11 @@ xfs_file_dio_read(
>  	struct iov_iter		*to)
>  {
>  	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
> -	size_t			count = iov_iter_count(to);
>  	ssize_t			ret;
>  
> -	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
> +	trace_xfs_file_direct_read(iocb, to);
>  
> -	if (!count)
> +	if (!iov_iter_count(to))
>  		return 0; /* skip atime */
>  
>  	file_accessed(iocb->ki_filp);
> @@ -246,12 +245,11 @@ xfs_file_dax_read(
>  	struct iov_iter		*to)
>  {
>  	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
> -	size_t			count = iov_iter_count(to);
>  	ssize_t			ret = 0;
>  
> -	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
> +	trace_xfs_file_dax_read(iocb, to);
>  
> -	if (!count)
> +	if (!iov_iter_count(to))
>  		return 0; /* skip atime */
>  
>  	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
> @@ -272,7 +270,7 @@ xfs_file_buffered_read(
>  	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
>  	ssize_t			ret;
>  
> -	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
> +	trace_xfs_file_buffered_read(iocb, to);
>  
>  	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
>  	if (ret)
> @@ -599,7 +597,7 @@ xfs_file_dio_write(
>  		iolock = XFS_IOLOCK_SHARED;
>  	}
>  
> -	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
> +	trace_xfs_file_direct_write(iocb, from);
>  	/*
>  	 * If unaligned, this is the only IO in-flight. Wait on it before we
>  	 * release the iolock to prevent subsequent overlapping IO.
> @@ -622,7 +620,6 @@ xfs_file_dax_write(
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	int			iolock = XFS_IOLOCK_EXCL;
>  	ssize_t			ret, error = 0;
> -	size_t			count;
>  	loff_t			pos;
>  
>  	ret = xfs_ilock_iocb(iocb, iolock);
> @@ -633,9 +630,8 @@ xfs_file_dax_write(
>  		goto out;
>  
>  	pos = iocb->ki_pos;
> -	count = iov_iter_count(from);
>  
> -	trace_xfs_file_dax_write(ip, count, pos);
> +	trace_xfs_file_dax_write(iocb, from);
>  	ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
>  	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
>  		i_size_write(inode, iocb->ki_pos);
> @@ -683,7 +679,7 @@ xfs_file_buffered_write(
>  	/* We can write back this queue in page reclaim */
>  	current->backing_dev_info = inode_to_bdi(inode);
>  
> -	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
> +	trace_xfs_file_buffered_write(iocb, from);
>  	ret = iomap_file_buffered_write(iocb, from,
>  			&xfs_buffered_write_iomap_ops);
>  	if (likely(ret >= 0))
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 5a263ae3d4f008..a6d04d860a565e 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1287,8 +1287,8 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
>  )
>  
>  DECLARE_EVENT_CLASS(xfs_file_class,
> -	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
> -	TP_ARGS(ip, count, offset),
> +	TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),
> +	TP_ARGS(iocb, iter),
>  	TP_STRUCT__entry(
>  		__field(dev_t, dev)
>  		__field(xfs_ino_t, ino)
> @@ -1297,11 +1297,11 @@ DECLARE_EVENT_CLASS(xfs_file_class,
>  		__field(size_t, count)
>  	),
>  	TP_fast_assign(
> -		__entry->dev = VFS_I(ip)->i_sb->s_dev;
> -		__entry->ino = ip->i_ino;
> -		__entry->size = ip->i_d.di_size;
> -		__entry->offset = offset;
> -		__entry->count = count;
> +		__entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
> +		__entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino;
> +		__entry->size = XFS_I(file_inode(iocb->ki_filp))->i_d.di_size;
> +		__entry->offset = iocb->ki_pos;
> +		__entry->count = iov_iter_count(iter);
>  	),
>  	TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
> @@ -1313,8 +1313,8 @@ DECLARE_EVENT_CLASS(xfs_file_class,
>  
>  #define DEFINE_RW_EVENT(name)		\
>  DEFINE_EVENT(xfs_file_class, name,	\
> -	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),	\
> -	TP_ARGS(ip, count, offset))
> +	TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),		\
> +	TP_ARGS(iocb, iter))
>  DEFINE_RW_EVENT(xfs_file_buffered_read);
>  DEFINE_RW_EVENT(xfs_file_direct_read);
>  DEFINE_RW_EVENT(xfs_file_dax_read);
> -- 
> 2.29.2
> 


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 06/11] xfs: improve the reflink_bounce_dio_write tracepoint
  2021-01-18 19:35 ` [PATCH 06/11] xfs: improve the reflink_bounce_dio_write tracepoint Christoph Hellwig
@ 2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:45   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2021-01-19 15:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:11PM +0100, Christoph Hellwig wrote:
> Use a more suitable event class.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_file.c  | 2 +-
>  fs/xfs/xfs_trace.h | 4 ++--
>  2 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index aa64e78fc3c467..a696bd34f71d21 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -560,7 +560,7 @@ xfs_file_dio_write(
>  		 * files yet, as we can't unshare a partial block.
>  		 */
>  		if (xfs_is_cow_inode(ip)) {
> -			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
> +			trace_xfs_reflink_bounce_dio_write(iocb, from);
>  			return -ENOTBLK;
>  		}
>  		iolock = XFS_IOLOCK_EXCL;
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index a6d04d860a565e..0cfd65cd67c190 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1321,6 +1321,8 @@ DEFINE_RW_EVENT(xfs_file_dax_read);
>  DEFINE_RW_EVENT(xfs_file_buffered_write);
>  DEFINE_RW_EVENT(xfs_file_direct_write);
>  DEFINE_RW_EVENT(xfs_file_dax_write);
> +DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
> +
>  
>  DECLARE_EVENT_CLASS(xfs_imap_class,
>  	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
> @@ -3294,8 +3296,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
>  
> -DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
> -
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
> -- 
> 2.29.2
> 


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 07/11] xfs: split unaligned DIO write code out
  2021-01-18 19:35 ` [PATCH 07/11] xfs: split unaligned DIO write code out Christoph Hellwig
@ 2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:46   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2021-01-19 15:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:12PM +0100, Christoph Hellwig wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> The unaligned DIO write path is more convoluted than the normal path,
> and we are about to make it more complex. Keep the block aligned
> fast path dio write code trim and simple by splitting out the
> unaligned DIO code from it.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> [hch: rebased, fixed a few minor nits]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_file.c | 168 +++++++++++++++++++++++++---------------------
>  1 file changed, 92 insertions(+), 76 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index a696bd34f71d21..bffd7240cefb7f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -500,117 +500,133 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
>  };
>  
>  /*
> - * xfs_file_dio_write - handle direct IO writes
> + * Handle block aligned direct IO writes
>   *
>   * Lock the inode appropriately to prepare for and issue a direct IO write.
> - * By separating it from the buffered write path we remove all the tricky to
> - * follow locking changes and looping.
>   *
>   * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
>   * until we're sure the bytes at the new EOF have been zeroed and/or the cached
>   * pages are flushed out.
> + */
> +static noinline ssize_t
> +xfs_file_dio_write_aligned(
> +	struct xfs_inode	*ip,
> +	struct kiocb		*iocb,
> +	struct iov_iter		*from)
> +{
> +	int			iolock = XFS_IOLOCK_SHARED;
> +	ssize_t			ret;
> +
> +	ret = xfs_ilock_iocb(iocb, iolock);
> +	if (ret)
> +		return ret;
> +	ret = xfs_file_write_checks(iocb, from, &iolock);
> +	if (ret)
> +		goto out_unlock;
> +
> +	/*
> +	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
> +	 * the iolock back to shared if we had to take the exclusive lock in
> +	 * xfs_file_write_checks() for other reasons.
> +	 */
> +	if (iolock == XFS_IOLOCK_EXCL) {
> +		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> +		iolock = XFS_IOLOCK_SHARED;
> +	}
> +	trace_xfs_file_direct_write(iocb, from);
> +	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> +			   &xfs_dio_write_ops, is_sync_kiocb(iocb));
> +out_unlock:
> +	if (iolock)
> +		xfs_iunlock(ip, iolock);
> +	return ret;
> +}
> +
> +/*
> + * Handle block unaligned direct IO writes
> + *
> + * In most cases direct IO writes will be done holding IOLOCK_SHARED, allowing
> + * them to be done in parallel with reads and other direct IO writes.  However,
> + * if the I/O is not aligned to filesystem blocks, the direct I/O layer may
> + * need to do sub-block zeroing and that requires serialisation against other
> + * direct I/Os to the same block. In this case we need to serialise the
> + * submission of the unaligned I/Os so that we don't get racing block zeroing in
> + * the dio layer.
>   *
> - * In most cases the direct IO writes will be done holding IOLOCK_SHARED
> - * allowing them to be done in parallel with reads and other direct IO writes.
> - * However, if the IO is not aligned to filesystem blocks, the direct IO layer
> - * needs to do sub-block zeroing and that requires serialisation against other
> - * direct IOs to the same block. In this case we need to serialise the
> - * submission of the unaligned IOs so that we don't get racing block zeroing in
> - * the dio layer.  To avoid the problem with aio, we also need to wait for
> + * To provide the same serialisation for AIO, we also need to wait for
>   * outstanding IOs to complete so that unwritten extent conversion is completed
>   * before we try to map the overlapping block. This is currently implemented by
>   * hitting it with a big hammer (i.e. inode_dio_wait()).
>   *
> - * Returns with locks held indicated by @iolock and errors indicated by
> - * negative return values.
> + * This means that unaligned dio writes always block. There is no "nowait" fast
> + * path in this code - if IOCB_NOWAIT is set we simply return -EAGAIN up front
> + * and we don't have to worry about that anymore.
>   */
> -STATIC ssize_t
> -xfs_file_dio_write(
> +static noinline ssize_t
> +xfs_file_dio_write_unaligned(
> +	struct xfs_inode	*ip,
>  	struct kiocb		*iocb,
>  	struct iov_iter		*from)
>  {
> -	struct file		*file = iocb->ki_filp;
> -	struct address_space	*mapping = file->f_mapping;
> -	struct inode		*inode = mapping->host;
> -	struct xfs_inode	*ip = XFS_I(inode);
> -	struct xfs_mount	*mp = ip->i_mount;
> -	ssize_t			ret = 0;
> -	int			unaligned_io = 0;
> -	int			iolock;
> -	size_t			count = iov_iter_count(from);
> -	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
> +	int			iolock = XFS_IOLOCK_EXCL;
> +	ssize_t			ret;
>  
> -	/* DIO must be aligned to device logical sector size */
> -	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> -		return -EINVAL;
> +	/* unaligned dio always waits, bail */
> +	if (iocb->ki_flags & IOCB_NOWAIT)
> +		return -EAGAIN;
> +	xfs_ilock(ip, iolock);
>  
>  	/*
> -	 * Don't take the exclusive iolock here unless the I/O is unaligned to
> -	 * the file system block size.  We don't need to consider the EOF
> -	 * extension case here because xfs_file_write_checks() will relock
> -	 * the inode as necessary for EOF zeroing cases and fill out the new
> -	 * inode size as appropriate.
> +	 * We can't properly handle unaligned direct I/O to reflink files yet,
> +	 * as we can't unshare a partial block.
>  	 */
> -	if ((iocb->ki_pos & mp->m_blockmask) ||
> -	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
> -		unaligned_io = 1;
> -
> -		/*
> -		 * We can't properly handle unaligned direct I/O to reflink
> -		 * files yet, as we can't unshare a partial block.
> -		 */
> -		if (xfs_is_cow_inode(ip)) {
> -			trace_xfs_reflink_bounce_dio_write(iocb, from);
> -			return -ENOTBLK;
> -		}
> -		iolock = XFS_IOLOCK_EXCL;
> -	} else {
> -		iolock = XFS_IOLOCK_SHARED;
> -	}
> -
> -	if (iocb->ki_flags & IOCB_NOWAIT) {
> -		/* unaligned dio always waits, bail */
> -		if (unaligned_io)
> -			return -EAGAIN;
> -		if (!xfs_ilock_nowait(ip, iolock))
> -			return -EAGAIN;
> -	} else {
> -		xfs_ilock(ip, iolock);
> +	if (xfs_is_cow_inode(ip)) {
> +		trace_xfs_reflink_bounce_dio_write(iocb, from);
> +		ret = -ENOTBLK;
> +		goto out_unlock;
>  	}
>  
>  	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
> -		goto out;
> -	count = iov_iter_count(from);
> +		goto out_unlock;
>  
>  	/*
> -	 * If we are doing unaligned IO, we can't allow any other overlapping IO
> -	 * in-flight at the same time or we risk data corruption. Wait for all
> -	 * other IO to drain before we submit. If the IO is aligned, demote the
> -	 * iolock if we had to take the exclusive lock in
> -	 * xfs_file_write_checks() for other reasons.
> +	 * If we are doing unaligned I/O, we can't allow any other overlapping
> +	 * I/O in-flight at the same time or we risk data corruption. Wait for
> +	 * all other I/O to drain before we submit.
>  	 */
> -	if (unaligned_io) {
> -		inode_dio_wait(inode);
> -	} else if (iolock == XFS_IOLOCK_EXCL) {
> -		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> -		iolock = XFS_IOLOCK_SHARED;
> -	}
> +	inode_dio_wait(VFS_I(ip));
>  
> -	trace_xfs_file_direct_write(iocb, from);
>  	/*
> -	 * If unaligned, this is the only IO in-flight. Wait on it before we
> -	 * release the iolock to prevent subsequent overlapping IO.
> +	 * This must be the only I/O in-flight. Wait on it before we release the
> +	 * iolock to prevent subsequent overlapping I/O.
>  	 */
> +	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> -			   &xfs_dio_write_ops,
> -			   is_sync_kiocb(iocb) || unaligned_io);
> -out:
> +			   &xfs_dio_write_ops, true);
> +out_unlock:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);
>  	return ret;
>  }
>  
> +static ssize_t
> +xfs_file_dio_write(
> +	struct kiocb		*iocb,
> +	struct iov_iter		*from)
> +{
> +	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
> +	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
> +	size_t			count = iov_iter_count(from);
> +
> +	/* DIO must be aligned to device logical sector size */
> +	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> +		return -EINVAL;
> +	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
> +		return xfs_file_dio_write_unaligned(ip, iocb, from);
> +	return xfs_file_dio_write_aligned(ip, iocb, from);
> +}
> +
>  static noinline ssize_t
>  xfs_file_dax_write(
>  	struct kiocb		*iocb,
> -- 
> 2.29.2
> 


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 08/11] iomap: rename the flags variable in __iomap_dio_rw
  2021-01-18 19:35 ` [PATCH 08/11] iomap: rename the flags variable in __iomap_dio_rw Christoph Hellwig
  2021-01-18 20:34   ` Dave Chinner
@ 2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:46   ` Darrick J. Wong
  2 siblings, 0 replies; 42+ messages in thread
From: Brian Foster @ 2021-01-19 15:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi

On Mon, Jan 18, 2021 at 08:35:13PM +0100, Christoph Hellwig wrote:
> Rename flags to iomap_flags to make the usage a little more clear.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/iomap/direct-io.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 933f234d5becd0..604103ab76f9c5 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -427,7 +427,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	size_t count = iov_iter_count(iter);
>  	loff_t pos = iocb->ki_pos;
>  	loff_t end = iocb->ki_pos + count - 1, ret = 0;
> -	unsigned int flags = IOMAP_DIRECT;
> +	unsigned int iomap_flags = IOMAP_DIRECT;
>  	struct blk_plug plug;
>  	struct iomap_dio *dio;
>  
> @@ -461,7 +461,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		if (iter_is_iovec(iter))
>  			dio->flags |= IOMAP_DIO_DIRTY;
>  	} else {
> -		flags |= IOMAP_WRITE;
> +		iomap_flags |= IOMAP_WRITE;
>  		dio->flags |= IOMAP_DIO_WRITE;
>  
>  		/* for data sync or sync, we need sync completion processing */
> @@ -483,7 +483,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  			ret = -EAGAIN;
>  			goto out_free_dio;
>  		}
> -		flags |= IOMAP_NOWAIT;
> +		iomap_flags |= IOMAP_NOWAIT;
>  	}
>  
>  	ret = filemap_write_and_wait_range(mapping, pos, end);
> @@ -514,7 +514,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  
>  	blk_start_plug(&plug);
>  	do {
> -		ret = iomap_apply(inode, pos, count, flags, ops, dio,
> +		ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio,
>  				iomap_dio_actor);
>  		if (ret <= 0) {
>  			/* magic error code to fall back to buffered I/O */
> -- 
> 2.29.2
> 


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 09/11] iomap: pass a flags argument to iomap_dio_rw
  2021-01-18 19:35 ` [PATCH 09/11] iomap: pass a flags argument to iomap_dio_rw Christoph Hellwig
@ 2021-01-19 15:23   ` Brian Foster
  2021-01-20 18:17   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Brian Foster @ 2021-01-19 15:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi

On Mon, Jan 18, 2021 at 08:35:14PM +0100, Christoph Hellwig wrote:
> Pass a set of flags to iomap_dio_rw instead of the boolean
> wait_for_completion argument.  The IOMAP_DIO_FORCE_WAIT flag
> replaces the wait_for_completion argument.  To simplify the
> callers, it only needs to be passed when the iocb is not
> synchronous to start with.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/btrfs/file.c       |  7 +++----
>  fs/ext4/file.c        |  5 ++---
>  fs/gfs2/file.c        |  7 ++-----
>  fs/iomap/direct-io.c  | 11 +++++------
>  fs/xfs/xfs_file.c     |  7 +++----
>  fs/zonefs/super.c     |  4 ++--
>  include/linux/iomap.h | 10 ++++++++--
>  7 files changed, 25 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 0e41459b8de667..ddfd2e2adedf58 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1949,8 +1949,8 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
>  		goto buffered;
>  	}
>  
> -	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops,
> -			     &btrfs_dio_ops, is_sync_kiocb(iocb));
> +	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
> +			     0);
>  
>  	btrfs_inode_unlock(inode, ilock_flags);
>  
> @@ -3622,8 +3622,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
>  		return 0;
>  
>  	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
> -	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
> -			   is_sync_kiocb(iocb));
> +	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
>  	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
>  	return ret;
>  }
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 349b27f0dda0cb..194f5d00fa3267 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -74,8 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  		return generic_file_read_iter(iocb, to);
>  	}
>  
> -	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
> -			   is_sync_kiocb(iocb));
> +	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
>  	inode_unlock_shared(inode);
>  
>  	file_accessed(iocb->ki_filp);
> @@ -550,7 +549,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	if (ilock_shared)
>  		iomap_ops = &ext4_iomap_overwrite_ops;
>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
> -			   is_sync_kiocb(iocb) || unaligned_io || extend);
> +			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
>  	if (ret == -ENOTBLK)
>  		ret = 0;
>  
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index b39b339feddc93..89609c2997177a 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -797,9 +797,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
>  	if (ret)
>  		goto out_uninit;
>  
> -	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
> -			   is_sync_kiocb(iocb));
> -
> +	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
>  	gfs2_glock_dq(gh);
>  out_uninit:
>  	gfs2_holder_uninit(gh);
> @@ -833,8 +831,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
>  	if (offset + len > i_size_read(&ip->i_inode))
>  		goto out;
>  
> -	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
> -			   is_sync_kiocb(iocb));
> +	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
>  	if (ret == -ENOTBLK)
>  		ret = 0;
>  out:
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 604103ab76f9c5..32dbbf7dd4aadb 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -420,13 +420,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
>  struct iomap_dio *
>  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> -		bool wait_for_completion)
> +		unsigned int dio_flags)
>  {
>  	struct address_space *mapping = iocb->ki_filp->f_mapping;
>  	struct inode *inode = file_inode(iocb->ki_filp);
>  	size_t count = iov_iter_count(iter);
>  	loff_t pos = iocb->ki_pos;
>  	loff_t end = iocb->ki_pos + count - 1, ret = 0;
> +	bool wait_for_completion =
> +		is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
>  	unsigned int iomap_flags = IOMAP_DIRECT;
>  	struct blk_plug plug;
>  	struct iomap_dio *dio;
> @@ -434,9 +436,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	if (!count)
>  		return NULL;
>  
> -	if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
> -		return ERR_PTR(-EIO);
> -
>  	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
>  	if (!dio)
>  		return ERR_PTR(-ENOMEM);
> @@ -598,11 +597,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
>  ssize_t
>  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> -		bool wait_for_completion)
> +		unsigned int flags)
>  {
>  	struct iomap_dio *dio;
>  
> -	dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion);
> +	dio = __iomap_dio_rw(iocb, iter, ops, dops, flags);
>  	if (IS_ERR_OR_NULL(dio))
>  		return PTR_ERR_OR_ZERO(dio);
>  	return iomap_dio_complete(dio);
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index bffd7240cefb7f..b181db42f2f32f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -232,8 +232,7 @@ xfs_file_dio_read(
>  	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
>  	if (ret)
>  		return ret;
> -	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
> -			is_sync_kiocb(iocb));
> +	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
>  	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>  
>  	return ret;
> @@ -535,7 +534,7 @@ xfs_file_dio_write_aligned(
>  	}
>  	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> -			   &xfs_dio_write_ops, is_sync_kiocb(iocb));
> +			   &xfs_dio_write_ops, 0);
>  out_unlock:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);
> @@ -603,7 +602,7 @@ xfs_file_dio_write_unaligned(
>  	 */
>  	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> -			   &xfs_dio_write_ops, true);
> +			   &xfs_dio_write_ops, IOMAP_DIO_FORCE_WAIT);
>  out_unlock:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index bec47f2d074beb..0e7ab0bc00ae8e 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -780,7 +780,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
>  		ret = zonefs_file_dio_append(iocb, from);
>  	else
>  		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
> -				   &zonefs_write_dio_ops, sync);
> +				   &zonefs_write_dio_ops, 0);
>  	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
>  	    (ret > 0 || ret == -EIOCBQUEUED)) {
>  		if (ret > 0)
> @@ -917,7 +917,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  		}
>  		file_accessed(iocb->ki_filp);
>  		ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
> -				   &zonefs_read_dio_ops, is_sync_kiocb(iocb));
> +				   &zonefs_read_dio_ops, 0);
>  	} else {
>  		ret = generic_file_read_iter(iocb, to);
>  		if (ret == -EIO)
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 5bd3cac4df9cb4..b322598dc10ec0 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -256,12 +256,18 @@ struct iomap_dio_ops {
>  			struct bio *bio, loff_t file_offset);
>  };
>  
> +/*
> + * Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not
> + * synchronous.
> + */
> +#define IOMAP_DIO_FORCE_WAIT	(1 << 0)
> +
>  ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> -		bool wait_for_completion);
> +		unsigned int flags);
>  struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> -		bool wait_for_completion);
> +		unsigned int flags);
>  ssize_t iomap_dio_complete(struct iomap_dio *dio);
>  int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
>  
> -- 
> 2.29.2
> 


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 02/11] xfs: make xfs_file_aio_write_checks IOCB_NOWAIT-aware
       [not found]   ` <CACz=WeeaqMrGM53pJF0C_Wt2JuavTOnOV26-osPviYLUpqUmFw@mail.gmail.com>
@ 2021-01-20 16:28     ` Christoph Hellwig
  0 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-20 16:28 UTC (permalink / raw)
  To: Raphael Carvalho
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, avi, Dave Chinner,
	Brian Foster

On Tue, Jan 19, 2021 at 09:33:37AM -0300, Raphael Carvalho wrote:
> >          * No fallback to buffered IO after short writes for XFS, direct
> > I/O
> > @@ -632,7 +648,8 @@ xfs_file_dax_write(
> >                 error = xfs_setfilesize(ip, pos, ret);
> >         }
> >  out:
> > -       xfs_iunlock(ip, iolock);
> > +       if (iolock)
> > +               xfs_iunlock(ip, iolock);
> >
> 
> Not familiar with the code but looks like you're setting *iolock to zero on
> error and perhaps you want to dereference it here instead

In this function iolock is a scalar value, not a pointer.
xfs_file_aio_write_checks gets it passed by reference and clears it,
and here we check that the iolock is locked at all.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 11/11] xfs: reduce exclusive locking on unaligned dio
  2021-01-18 20:55   ` Dave Chinner
@ 2021-01-20 16:36     ` Christoph Hellwig
  0 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-20 16:36 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, avi, Dave Chinner

On Tue, Jan 19, 2021 at 07:55:21AM +1100, Dave Chinner wrote:
> > +			   &xfs_dio_write_ops, flags);
> > +	/*
> > +	 * Retry unaligned IO with exclusive blocking semantics if the DIO
> > +	 * layer rejected it for mapping or locking reasons. If we are doing
> > +	 * nonblocking user IO, propagate the error.
> > +	 */
> > +	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
> > +		ASSERT(flags & IOMAP_DIO_UNALIGNED);
> > +		xfs_iunlock(ip, iolock);
> > +		goto retry_exclusive;
> > +	}
> > +
> >  out_unlock:
> >  	if (iolock)
> >  		xfs_iunlock(ip, iolock);
> 
> Do we ever get here without holding the iolock anymore?

Yes, if xfs_ilock_iocb as called from xfs_file_write_checks fails.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/11] iomap: add a IOMAP_DIO_UNALIGNED flag
  2021-01-18 21:41   ` Matthew Wilcox
@ 2021-01-20 16:40     ` Christoph Hellwig
  0 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-20 16:40 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, avi

I've renamed it to IOMAP_DIO_SUBBLOCK

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 09/11] iomap: pass a flags argument to iomap_dio_rw
  2021-01-18 19:35 ` [PATCH 09/11] iomap: pass a flags argument to iomap_dio_rw Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
@ 2021-01-20 18:17   ` Darrick J. Wong
  2021-01-20 18:35     ` Christoph Hellwig
  1 sibling, 1 reply; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 18:17 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi

On Mon, Jan 18, 2021 at 08:35:14PM +0100, Christoph Hellwig wrote:
> Pass a set of flags to iomap_dio_rw instead of the boolean
> wait_for_completion argument.  The IOMAP_DIO_FORCE_WAIT flag
> replaces the wait_for_completion, but only needs to be passed
> when the iocb isn't synchronous to start with to simplify the
> callers.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/btrfs/file.c       |  7 +++----
>  fs/ext4/file.c        |  5 ++---
>  fs/gfs2/file.c        |  7 ++-----
>  fs/iomap/direct-io.c  | 11 +++++------
>  fs/xfs/xfs_file.c     |  7 +++----
>  fs/zonefs/super.c     |  4 ++--
>  include/linux/iomap.h | 10 ++++++++--
>  7 files changed, 25 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 0e41459b8de667..ddfd2e2adedf58 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1949,8 +1949,8 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
>  		goto buffered;
>  	}
>  
> -	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops,
> -			     &btrfs_dio_ops, is_sync_kiocb(iocb));
> +	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
> +			     0);
>  
>  	btrfs_inode_unlock(inode, ilock_flags);
>  
> @@ -3622,8 +3622,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
>  		return 0;
>  
>  	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
> -	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
> -			   is_sync_kiocb(iocb));
> +	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
>  	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
>  	return ret;
>  }
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 349b27f0dda0cb..194f5d00fa3267 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -74,8 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  		return generic_file_read_iter(iocb, to);
>  	}
>  
> -	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
> -			   is_sync_kiocb(iocb));
> +	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
>  	inode_unlock_shared(inode);
>  
>  	file_accessed(iocb->ki_filp);
> @@ -550,7 +549,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	if (ilock_shared)
>  		iomap_ops = &ext4_iomap_overwrite_ops;
>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
> -			   is_sync_kiocb(iocb) || unaligned_io || extend);
> +			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
>  	if (ret == -ENOTBLK)
>  		ret = 0;
>  
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index b39b339feddc93..89609c2997177a 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -797,9 +797,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
>  	if (ret)
>  		goto out_uninit;
>  
> -	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
> -			   is_sync_kiocb(iocb));
> -
> +	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
>  	gfs2_glock_dq(gh);
>  out_uninit:
>  	gfs2_holder_uninit(gh);
> @@ -833,8 +831,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
>  	if (offset + len > i_size_read(&ip->i_inode))
>  		goto out;
>  
> -	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
> -			   is_sync_kiocb(iocb));
> +	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
>  	if (ret == -ENOTBLK)
>  		ret = 0;
>  out:
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 604103ab76f9c5..32dbbf7dd4aadb 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -420,13 +420,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
>  struct iomap_dio *
>  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> -		bool wait_for_completion)
> +		unsigned int dio_flags)
>  {
>  	struct address_space *mapping = iocb->ki_filp->f_mapping;
>  	struct inode *inode = file_inode(iocb->ki_filp);
>  	size_t count = iov_iter_count(iter);
>  	loff_t pos = iocb->ki_pos;
>  	loff_t end = iocb->ki_pos + count - 1, ret = 0;
> +	bool wait_for_completion =
> +		is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
>  	unsigned int iomap_flags = IOMAP_DIRECT;
>  	struct blk_plug plug;
>  	struct iomap_dio *dio;
> @@ -434,9 +436,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	if (!count)
>  		return NULL;
>  
> -	if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
> -		return ERR_PTR(-EIO);
> -
>  	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
>  	if (!dio)
>  		return ERR_PTR(-ENOMEM);
> @@ -598,11 +597,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
>  ssize_t
>  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> -		bool wait_for_completion)
> +		unsigned int flags)

Can this be named "dio_flags", since it's passed directly into
__iomap_dio_rw?

>  {
>  	struct iomap_dio *dio;
>  
> -	dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion);
> +	dio = __iomap_dio_rw(iocb, iter, ops, dops, flags);
>  	if (IS_ERR_OR_NULL(dio))
>  		return PTR_ERR_OR_ZERO(dio);
>  	return iomap_dio_complete(dio);
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index bffd7240cefb7f..b181db42f2f32f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -232,8 +232,7 @@ xfs_file_dio_read(
>  	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
>  	if (ret)
>  		return ret;
> -	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
> -			is_sync_kiocb(iocb));
> +	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
>  	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>  
>  	return ret;
> @@ -535,7 +534,7 @@ xfs_file_dio_write_aligned(
>  	}
>  	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> -			   &xfs_dio_write_ops, is_sync_kiocb(iocb));
> +			   &xfs_dio_write_ops, 0);
>  out_unlock:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);
> @@ -603,7 +602,7 @@ xfs_file_dio_write_unaligned(
>  	 */
>  	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> -			   &xfs_dio_write_ops, true);
> +			   &xfs_dio_write_ops, IOMAP_DIO_FORCE_WAIT);
>  out_unlock:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index bec47f2d074beb..0e7ab0bc00ae8e 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -780,7 +780,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
>  		ret = zonefs_file_dio_append(iocb, from);
>  	else
>  		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
> -				   &zonefs_write_dio_ops, sync);
> +				   &zonefs_write_dio_ops, 0);
>  	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
>  	    (ret > 0 || ret == -EIOCBQUEUED)) {
>  		if (ret > 0)
> @@ -917,7 +917,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  		}
>  		file_accessed(iocb->ki_filp);
>  		ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
> -				   &zonefs_read_dio_ops, is_sync_kiocb(iocb));
> +				   &zonefs_read_dio_ops, 0);
>  	} else {
>  		ret = generic_file_read_iter(iocb, to);
>  		if (ret == -EIO)
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 5bd3cac4df9cb4..b322598dc10ec0 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -256,12 +256,18 @@ struct iomap_dio_ops {
>  			struct bio *bio, loff_t file_offset);
>  };
>  
> +/*
> + * Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not
> + * synchronous.
> + */
> +#define IOMAP_DIO_FORCE_WAIT	(1 << 0)
> +
>  ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> -		bool wait_for_completion);
> +		unsigned int flags);
>  struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> -		bool wait_for_completion);
> +		unsigned int flags);

...and please make the naming of that last parameter consistent with the
definitions. :)

Everything else here looks ok to me.

--D

>  ssize_t iomap_dio_complete(struct iomap_dio *dio);
>  int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
>  
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 09/11] iomap: pass a flags argument to iomap_dio_rw
  2021-01-20 18:17   ` Darrick J. Wong
@ 2021-01-20 18:35     ` Christoph Hellwig
  0 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-20 18:35 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, avi

On Wed, Jan 20, 2021 at 10:17:04AM -0800, Darrick J. Wong wrote:
> > @@ -598,11 +597,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
> >  ssize_t
> >  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> >  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> > -		bool wait_for_completion)
> > +		unsigned int flags)
> 
> Can this be named "dio_flags", since it's passed directly into
> __iomap_dio_rw?
> 
> >  struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> >  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> > -		bool wait_for_completion);
> > +		unsigned int flags);
> 
> ...and please make the naming of that last parameter consistent with the
> definitions. :)

Ok.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 11/11] xfs: reduce exclusive locking on unaligned dio
  2021-01-18 19:35 ` [PATCH 11/11] xfs: reduce exclusive locking on unaligned dio Christoph Hellwig
  2021-01-18 20:55   ` Dave Chinner
@ 2021-01-20 18:40   ` Darrick J. Wong
  2021-01-20 18:44     ` Christoph Hellwig
  1 sibling, 1 reply; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 18:40 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:16PM +0100, Christoph Hellwig wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Attempt shared locking for unaligned DIO, but only if the
> underlying extent is already allocated and in written state. On
> failure, retry with the existing exclusive locking.
> 
> Test case is fio randrw of 512 byte IOs using AIO and an iodepth of
> 32 IOs.
> 
> Vanilla:
> 
>   READ: bw=4560KiB/s (4670kB/s), 4560KiB/s-4560KiB/s (4670kB/s-4670kB/s), io=134MiB (140MB), run=30001-30001msec
>   WRITE: bw=4567KiB/s (4676kB/s), 4567KiB/s-4567KiB/s (4676kB/s-4676kB/s), io=134MiB (140MB), run=30001-30001msec
> 
> Patched:
>    READ: bw=37.6MiB/s (39.4MB/s), 37.6MiB/s-37.6MiB/s (39.4MB/s-39.4MB/s), io=1127MiB (1182MB), run=30002-30002msec
>   WRITE: bw=37.6MiB/s (39.4MB/s), 37.6MiB/s-37.6MiB/s (39.4MB/s-39.4MB/s), io=1128MiB (1183MB), run=30002-30002msec
> 
> That's an improvement from ~18k IOPS to a ~150k IOPS, which is
> about the IOPS limit of the VM block device setup I'm testing on.
> 
> 4kB block IO comparison:
> 
>    READ: bw=296MiB/s (310MB/s), 296MiB/s-296MiB/s (310MB/s-310MB/s), io=8868MiB (9299MB), run=30002-30002msec
>   WRITE: bw=296MiB/s (310MB/s), 296MiB/s-296MiB/s (310MB/s-310MB/s), io=8878MiB (9309MB), run=30002-30002msec
> 
> Which is ~150k IOPS, same as what the test gets for sub-block
> AIO+DIO writes with this patch.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> [hch: rebased, split unaligned from nowait]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_file.c  | 87 ++++++++++++++++++++++++++++++++--------------
>  fs/xfs/xfs_iomap.c | 31 ++++++++++++-----
>  2 files changed, 84 insertions(+), 34 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index b181db42f2f32f..4e475e750148db 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -544,22 +544,35 @@ xfs_file_dio_write_aligned(
>  /*
>   * Handle block unaligned direct IO writes
>   *
> - * In most cases direct IO writes will be done holding IOLOCK_SHARED, allowing
> - * them to be done in parallel with reads and other direct IO writes.  However,
> - * if the I/O is not aligned to filesystem blocks, the direct I/O layer may
> - * need to do sub-block zeroing and that requires serialisation against other
> - * direct I/Os to the same block. In this case we need to serialise the
> - * submission of the unaligned I/Os so that we don't get racing block zeroing in
> - * the dio layer.
> + * In most cases direct IO writes will be done holding IOLOCK_SHARED
> + * allowing them to be done in parallel with reads and other direct IO writes.
> + * However, if the IO is not aligned to filesystem blocks, the direct IO layer
> + * may need to do sub-block zeroing and that requires serialisation against other
> + * direct IOs to the same block. In the case where sub-block zeroing is not
> + * required, we can do concurrent sub-block dios to the same block successfully.
>   *
> - * To provide the same serialisation for AIO, we also need to wait for
> + * Hence we have two cases here - the shared, optimistic fast path for written
> + * extents, and everything else that needs exclusive IO path access across the
> + * entire IO.
> + *
> + * For the first case, we do all the checks we need at the mapping layer in the
> + * DIO code as part of the existing NOWAIT infrastructure. Hence all we need to
> + * do to support concurrent subblock dio is first try a non-blocking submission.
> + * If that returns -EAGAIN, then we simply repeat the IO submission with full
> + * IO exclusivity guaranteed so that we avoid racing sub-block zeroing.
> + *
> + * The only wrinkle in this case is that the iomap DIO code always does
> + * partial tail sub-block zeroing for post-EOF writes. Hence for any IO that
> + * _ends_ past the current EOF we need to run with full exclusivity. Note that
> + * we also check for the start of IO being beyond EOF because then zeroing
> + * between the old EOF and the start of the IO is required and that also
> + * requires exclusivity. Hence we avoid lock cycles and blocking under
> + * IOCB_NOWAIT for this situation, too.
> + *
> + * To provide the exclusivity required when using AIO, we also need to wait for
>   * outstanding IOs to complete so that unwritten extent conversion is completed
>   * before we try to map the overlapping block. This is currently implemented by
>   * hitting it with a big hammer (i.e. inode_dio_wait()).
> - *
> - * This means that unaligned dio writes always block. There is no "nowait" fast
> - * path in this code - if IOCB_NOWAIT is set we simply return -EAGAIN up front
> - * and we don't have to worry about that anymore.
>   */
>  static noinline ssize_t
>  xfs_file_dio_write_unaligned(
> @@ -567,13 +580,27 @@ xfs_file_dio_write_unaligned(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*from)
>  {
> -	int			iolock = XFS_IOLOCK_EXCL;
> +	size_t			isize = i_size_read(VFS_I(ip));
> +	size_t			count = iov_iter_count(from);
> +	int			iolock = XFS_IOLOCK_SHARED;
> +	unsigned int		flags = IOMAP_DIO_UNALIGNED;
>  	ssize_t			ret;
>  
> -	/* unaligned dio always waits, bail */
> -	if (iocb->ki_flags & IOCB_NOWAIT)
> -		return -EAGAIN;
> -	xfs_ilock(ip, iolock);
> +	/*
> +	 * Extending writes need exclusivity because of the sub-block zeroing
> +	 * that the DIO code always does for partial tail blocks beyond EOF.
> +	 */
> +	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
> +retry_exclusive:
> +		if (iocb->ki_flags & IOCB_NOWAIT)
> +			return -EAGAIN;
> +		iolock = XFS_IOLOCK_EXCL;
> +		flags = IOMAP_DIO_FORCE_WAIT;
> +	}
> +
> +	ret = xfs_ilock_iocb(iocb, iolock);
> +	if (ret)
> +		return ret;
>  
>  	/*
>  	 * We can't properly handle unaligned direct I/O to reflink files yet,
> @@ -590,19 +617,27 @@ xfs_file_dio_write_unaligned(
>  		goto out_unlock;
>  
>  	/*
> -	 * If we are doing unaligned I/O, we can't allow any other overlapping
> -	 * I/O in-flight at the same time or we risk data corruption. Wait for
> -	 * all other I/O to drain before we submit.
> +	 * If we are doing exclusive unaligned IO, we can't allow any other
> +	 * overlapping IO in-flight at the same time or we risk data corruption.
> +	 * Wait for all other IO to drain before we submit.
>  	 */
> -	inode_dio_wait(VFS_I(ip));
> +	if (!(flags & IOMAP_DIO_UNALIGNED))
> +		inode_dio_wait(VFS_I(ip));

Er... this really confused me when I read it -- my first thought was
"How can we be in the unaligned direct write function but DIO_UNALIGNED
isn't set?  Wouldn't we be in some other function if we're doing an
aligned direct write?"

Then I looked upthread to where Christoph said he'd renamed it
IOMAP_DIO_SUBBLOCK, but I didn't think that was sufficiently better:

	if (!(flags & IOMAP_DIO_SUBBLOCK))
		inode_dio_wait(...);

This flag doesn't have a 1:1 relationship with the iocb asking for an
(fsblock-)unaligned write or the iocb saying that the write involves
sub-block io -- this flag really means "I require a stable written
mapping, no post-processing (of the disk block) allowed".

Admittedly the comment above the definition of IOMAP_DIO_UNALIGNED
actually says this, but as we all know I sometimes like to review
patchsets backwards. :P

How about...

IOMAP_DIO_REQUIRE_OVERWRITE ?

IOMAP_DIO_REQUIRE_STABLE ?

--D

>  
> -	/*
> -	 * This must be the only I/O in-flight. Wait on it before we release the
> -	 * iolock to prevent subsequent overlapping I/O.
> -	 */
>  	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> -			   &xfs_dio_write_ops, IOMAP_DIO_FORCE_WAIT);
> +			   &xfs_dio_write_ops, flags);
> +	/*
> +	 * Retry unaligned IO with exclusive blocking semantics if the DIO
> +	 * layer rejected it for mapping or locking reasons. If we are doing
> +	 * nonblocking user IO, propagate the error.
> +	 */
> +	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
> +		ASSERT(flags & IOMAP_DIO_UNALIGNED);
> +		xfs_iunlock(ip, iolock);
> +		goto retry_exclusive;
> +	}
> +
>  out_unlock:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 7b9ff824e82d48..dc8c86e98b99bf 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -784,15 +784,30 @@ xfs_direct_write_iomap_begin(
>  		goto allocate_blocks;
>  
>  	/*
> -	 * NOWAIT IO needs to span the entire requested IO with a single map so
> -	 * that we avoid partial IO failures due to the rest of the IO range not
> -	 * covered by this map triggering an EAGAIN condition when it is
> -	 * subsequently mapped and aborting the IO.
> +	 * NOWAIT and unaligned IO needs to span the entire requested IO with a
> +	 * single map so that we avoid partial IO failures due to the rest of
> +	 * the IO range not covered by this map triggering an EAGAIN condition
> +	 * when it is subsequently mapped and aborting the IO.
>  	 */
> -	if ((flags & IOMAP_NOWAIT) &&
> -	    !imap_spans_range(&imap, offset_fsb, end_fsb)) {
> +	if (flags & (IOMAP_NOWAIT | IOMAP_UNALIGNED)) {
>  		error = -EAGAIN;
> -		goto out_unlock;
> +		if (!imap_spans_range(&imap, offset_fsb, end_fsb))
> +			goto out_unlock;
> +	}
> +
> +	/*
> +	 * For unaligned I/O we can't convert unwritten extents if the I/O is
> +	 * not block size aligned, as such a conversion would have to do
> +	 * sub-block zeroing, and that can only be done under an exclusive
> +	 * IOLOCK. Hence if this is not a written extent, return EAGAIN to tell
> +	 * the caller to try again.
> +	 */
> +	if (flags & IOMAP_UNALIGNED) {
> +		error = -EAGAIN;
> +		if (imap.br_state != XFS_EXT_NORM &&
> +		    ((offset & mp->m_blockmask) ||
> +		     ((offset + length) & mp->m_blockmask)))
> +			goto out_unlock;
>  	}
>  
>  	xfs_iunlock(ip, lockmode);
> @@ -801,7 +816,7 @@ xfs_direct_write_iomap_begin(
>  
>  allocate_blocks:
>  	error = -EAGAIN;
> -	if (flags & IOMAP_NOWAIT)
> +	if (flags & (IOMAP_NOWAIT | IOMAP_UNALIGNED))
>  		goto out_unlock;
>  
>  	/*
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 01/11] xfs: factor out a xfs_ilock_iocb helper
  2021-01-18 19:35 ` [PATCH 01/11] xfs: factor out a xfs_ilock_iocb helper Christoph Hellwig
@ 2021-01-20 18:41   ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 18:41 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner, Brian Foster

On Mon, Jan 18, 2021 at 08:35:06PM +0100, Christoph Hellwig wrote:
> Add a helper to factor out the nowait locking logic for the read/write
> helpers.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
> Reviewed-by: Brian Foster <bfoster@redhat.com>

Looks pretty straightforward,
Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_file.c | 55 +++++++++++++++++++++++++----------------------
>  1 file changed, 29 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 5b0f93f738372d..c441cddfa4acbc 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -197,6 +197,23 @@ xfs_file_fsync(
>  	return error;
>  }
>  
> +static int
> +xfs_ilock_iocb(
> +	struct kiocb		*iocb,
> +	unsigned int		lock_mode)
> +{
> +	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
> +
> +	if (iocb->ki_flags & IOCB_NOWAIT) {
> +		if (!xfs_ilock_nowait(ip, lock_mode))
> +			return -EAGAIN;
> +	} else {
> +		xfs_ilock(ip, lock_mode);
> +	}
> +
> +	return 0;
> +}
> +
>  STATIC ssize_t
>  xfs_file_dio_aio_read(
>  	struct kiocb		*iocb,
> @@ -213,12 +230,9 @@ xfs_file_dio_aio_read(
>  
>  	file_accessed(iocb->ki_filp);
>  
> -	if (iocb->ki_flags & IOCB_NOWAIT) {
> -		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
> -			return -EAGAIN;
> -	} else {
> -		xfs_ilock(ip, XFS_IOLOCK_SHARED);
> -	}
> +	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
> +	if (ret)
> +		return ret;
>  	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
>  			is_sync_kiocb(iocb));
>  	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
> @@ -240,13 +254,9 @@ xfs_file_dax_read(
>  	if (!count)
>  		return 0; /* skip atime */
>  
> -	if (iocb->ki_flags & IOCB_NOWAIT) {
> -		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
> -			return -EAGAIN;
> -	} else {
> -		xfs_ilock(ip, XFS_IOLOCK_SHARED);
> -	}
> -
> +	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
> +	if (ret)
> +		return ret;
>  	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
>  	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>  
> @@ -264,12 +274,9 @@ xfs_file_buffered_aio_read(
>  
>  	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
>  
> -	if (iocb->ki_flags & IOCB_NOWAIT) {
> -		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
> -			return -EAGAIN;
> -	} else {
> -		xfs_ilock(ip, XFS_IOLOCK_SHARED);
> -	}
> +	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
> +	if (ret)
> +		return ret;
>  	ret = generic_file_read_iter(iocb, to);
>  	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>  
> @@ -608,13 +615,9 @@ xfs_file_dax_write(
>  	size_t			count;
>  	loff_t			pos;
>  
> -	if (iocb->ki_flags & IOCB_NOWAIT) {
> -		if (!xfs_ilock_nowait(ip, iolock))
> -			return -EAGAIN;
> -	} else {
> -		xfs_ilock(ip, iolock);
> -	}
> -
> +	ret = xfs_ilock_iocb(iocb, iolock);
> +	if (ret)
> +		return ret;
>  	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
>  	if (ret)
>  		goto out;
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 02/11] xfs: make xfs_file_aio_write_checks IOCB_NOWAIT-aware
  2021-01-18 19:35 ` [PATCH 02/11] xfs: make xfs_file_aio_write_checks IOCB_NOWAIT-aware Christoph Hellwig
       [not found]   ` <CACz=WeeaqMrGM53pJF0C_Wt2JuavTOnOV26-osPviYLUpqUmFw@mail.gmail.com>
@ 2021-01-20 18:42   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 18:42 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner, Brian Foster

On Mon, Jan 18, 2021 at 08:35:07PM +0100, Christoph Hellwig wrote:
> Ensure we don't block on the iolock, or waiting for I/O in
> xfs_file_aio_write_checks if the caller asked to avoid that.
> 
> Fixes: 29a5d29ec181 ("xfs: nowait aio support")
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
> Reviewed-by: Brian Foster <bfoster@redhat.com>

Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_file.c | 25 +++++++++++++++++++++----
>  1 file changed, 21 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index c441cddfa4acbc..fb4e6f2852bb8b 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -335,7 +335,14 @@ xfs_file_aio_write_checks(
>  	if (error <= 0)
>  		return error;
>  
> -	error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
> +	if (iocb->ki_flags & IOCB_NOWAIT) {
> +		error = break_layout(inode, false);
> +		if (error == -EWOULDBLOCK)
> +			error = -EAGAIN;
> +	} else {
> +		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
> +	}
> +
>  	if (error)
>  		return error;
>  
> @@ -346,7 +353,11 @@ xfs_file_aio_write_checks(
>  	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
>  		xfs_iunlock(ip, *iolock);
>  		*iolock = XFS_IOLOCK_EXCL;
> -		xfs_ilock(ip, *iolock);
> +		error = xfs_ilock_iocb(iocb, *iolock);
> +		if (error) {
> +			*iolock = 0;
> +			return error;
> +		}
>  		goto restart;
>  	}
>  	/*
> @@ -368,6 +379,10 @@ xfs_file_aio_write_checks(
>  	isize = i_size_read(inode);
>  	if (iocb->ki_pos > isize) {
>  		spin_unlock(&ip->i_flags_lock);
> +
> +		if (iocb->ki_flags & IOCB_NOWAIT)
> +			return -EAGAIN;
> +
>  		if (!drained_dio) {
>  			if (*iolock == XFS_IOLOCK_SHARED) {
>  				xfs_iunlock(ip, *iolock);
> @@ -593,7 +608,8 @@ xfs_file_dio_aio_write(
>  			   &xfs_dio_write_ops,
>  			   is_sync_kiocb(iocb) || unaligned_io);
>  out:
> -	xfs_iunlock(ip, iolock);
> +	if (iolock)
> +		xfs_iunlock(ip, iolock);
>  
>  	/*
>  	 * No fallback to buffered IO after short writes for XFS, direct I/O
> @@ -632,7 +648,8 @@ xfs_file_dax_write(
>  		error = xfs_setfilesize(ip, pos, ret);
>  	}
>  out:
> -	xfs_iunlock(ip, iolock);
> +	if (iolock)
> +		xfs_iunlock(ip, iolock);
>  	if (error)
>  		return error;
>  
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 03/11] xfs: cleanup the read/write helper naming
  2021-01-18 19:35 ` [PATCH 03/11] xfs: cleanup the read/write helper naming Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
@ 2021-01-20 18:43   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 18:43 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:08PM +0100, Christoph Hellwig wrote:
> Drop a few pointless aio_ prefixes.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>

Mmmm shortening!
Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_file.c | 30 +++++++++++++++---------------
>  1 file changed, 15 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index fb4e6f2852bb8b..ae7313ccaa11ed 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -215,7 +215,7 @@ xfs_ilock_iocb(
>  }
>  
>  STATIC ssize_t
> -xfs_file_dio_aio_read(
> +xfs_file_dio_read(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*to)
>  {
> @@ -265,7 +265,7 @@ xfs_file_dax_read(
>  }
>  
>  STATIC ssize_t
> -xfs_file_buffered_aio_read(
> +xfs_file_buffered_read(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*to)
>  {
> @@ -300,9 +300,9 @@ xfs_file_read_iter(
>  	if (IS_DAX(inode))
>  		ret = xfs_file_dax_read(iocb, to);
>  	else if (iocb->ki_flags & IOCB_DIRECT)
> -		ret = xfs_file_dio_aio_read(iocb, to);
> +		ret = xfs_file_dio_read(iocb, to);
>  	else
> -		ret = xfs_file_buffered_aio_read(iocb, to);
> +		ret = xfs_file_buffered_read(iocb, to);
>  
>  	if (ret > 0)
>  		XFS_STATS_ADD(mp, xs_read_bytes, ret);
> @@ -317,7 +317,7 @@ xfs_file_read_iter(
>   * if called for a direct write beyond i_size.
>   */
>  STATIC ssize_t
> -xfs_file_aio_write_checks(
> +xfs_file_write_checks(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*from,
>  	int			*iolock)
> @@ -502,7 +502,7 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
>  };
>  
>  /*
> - * xfs_file_dio_aio_write - handle direct IO writes
> + * xfs_file_dio_write - handle direct IO writes
>   *
>   * Lock the inode appropriately to prepare for and issue a direct IO write.
>   * By separating it from the buffered write path we remove all the tricky to
> @@ -527,7 +527,7 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
>   * negative return values.
>   */
>  STATIC ssize_t
> -xfs_file_dio_aio_write(
> +xfs_file_dio_write(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*from)
>  {
> @@ -549,7 +549,7 @@ xfs_file_dio_aio_write(
>  	/*
>  	 * Don't take the exclusive iolock here unless the I/O is unaligned to
>  	 * the file system block size.  We don't need to consider the EOF
> -	 * extension case here because xfs_file_aio_write_checks() will relock
> +	 * extension case here because xfs_file_write_checks() will relock
>  	 * the inode as necessary for EOF zeroing cases and fill out the new
>  	 * inode size as appropriate.
>  	 */
> @@ -580,7 +580,7 @@ xfs_file_dio_aio_write(
>  		xfs_ilock(ip, iolock);
>  	}
>  
> -	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
> +	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
>  		goto out;
>  	count = iov_iter_count(from);
> @@ -590,7 +590,7 @@ xfs_file_dio_aio_write(
>  	 * in-flight at the same time or we risk data corruption. Wait for all
>  	 * other IO to drain before we submit. If the IO is aligned, demote the
>  	 * iolock if we had to take the exclusive lock in
> -	 * xfs_file_aio_write_checks() for other reasons.
> +	 * xfs_file_write_checks() for other reasons.
>  	 */
>  	if (unaligned_io) {
>  		inode_dio_wait(inode);
> @@ -634,7 +634,7 @@ xfs_file_dax_write(
>  	ret = xfs_ilock_iocb(iocb, iolock);
>  	if (ret)
>  		return ret;
> -	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
> +	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
>  		goto out;
>  
> @@ -663,7 +663,7 @@ xfs_file_dax_write(
>  }
>  
>  STATIC ssize_t
> -xfs_file_buffered_aio_write(
> +xfs_file_buffered_write(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*from)
>  {
> @@ -682,7 +682,7 @@ xfs_file_buffered_aio_write(
>  	iolock = XFS_IOLOCK_EXCL;
>  	xfs_ilock(ip, iolock);
>  
> -	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
> +	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
>  		goto out;
>  
> @@ -769,12 +769,12 @@ xfs_file_write_iter(
>  		 * CoW.  In all other directio scenarios we do not
>  		 * allow an operation to fall back to buffered mode.
>  		 */
> -		ret = xfs_file_dio_aio_write(iocb, from);
> +		ret = xfs_file_dio_write(iocb, from);
>  		if (ret != -ENOTBLK)
>  			return ret;
>  	}
>  
> -	return xfs_file_buffered_aio_write(iocb, from);
> +	return xfs_file_buffered_write(iocb, from);
>  }
>  
>  static void
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 04/11] xfs: remove the buffered I/O fallback assert
  2021-01-18 19:35 ` [PATCH 04/11] xfs: remove the buffered I/O fallback assert Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
@ 2021-01-20 18:43   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 18:43 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:09PM +0100, Christoph Hellwig wrote:
> The iomap code has been designed from the start not to do magic fallback,
> so remove the assert in preparation for further code cleanups.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>

Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_file.c | 6 ------
>  1 file changed, 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index ae7313ccaa11ed..97836ec53397d4 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -610,12 +610,6 @@ xfs_file_dio_write(
>  out:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);
> -
> -	/*
> -	 * No fallback to buffered IO after short writes for XFS, direct I/O
> -	 * will either complete fully or return an error.
> -	 */
> -	ASSERT(ret < 0 || ret == count);
>  	return ret;
>  }
>  
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 11/11] xfs: reduce exclusive locking on unaligned dio
  2021-01-20 18:40   ` Darrick J. Wong
@ 2021-01-20 18:44     ` Christoph Hellwig
  2021-01-20 19:58       ` Darrick J. Wong
  0 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-20 18:44 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, avi, Dave Chinner

[another full quote removed, guys please send properly formatted email]

On Wed, Jan 20, 2021 at 10:40:56AM -0800, Darrick J. Wong wrote:
> > +	if (!(flags & IOMAP_DIO_UNALIGNED))
> > +		inode_dio_wait(VFS_I(ip));
> 
> Er... this really confused me when I read it -- my first thought was
> "How can we be in the unaligned direct write function but DIO_UNALIGNED
> isn't set?  Wouldn't we be in some other function if we're doing an
> aligned direct write?"
> 
> Then I looked upthread to where Christoph said he'd renamed it
> IOMAP_DIO_SUBBLOCK, but I didn't think that was sufficiently better:
> 
> 	if (!(flags & IOMAP_DIO_SUBBLOCK))
> 		iomap_dio_wait(...);
> 
> This flag doesn't have a 1:1 relationship with the iocb asking for an
> (fsblock-)unaligned write or the iocb saying that the write involves
> sub-block io -- this flag really means "I require a stable written
> mapping, no post-processing (of the disk block) allowed".

Would:

	if (flags & IOMAP_DIO_FORCE_WAIT)
		inode_dio_wait(VFS_I(ip));

look any better to you?  Behavior would be the same.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 05/11] xfs: simplify the read/write tracepoints
  2021-01-18 19:35 ` [PATCH 05/11] xfs: simplify the read/write tracepoints Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
@ 2021-01-20 18:45   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 18:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:10PM +0100, Christoph Hellwig wrote:
> Pass the iocb and iov_iter to the tracepoints and leave decoding of
> actual arguments to the code only run when tracing is enabled.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>

I've been thinking for a while that we really should be pushing
structure decoding and whatnot to the tracepoint code to keep it out of
the callers, so I like this:

Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_file.c  | 20 ++++++++------------
>  fs/xfs/xfs_trace.h | 18 +++++++++---------
>  2 files changed, 17 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 97836ec53397d4..aa64e78fc3c467 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -220,12 +220,11 @@ xfs_file_dio_read(
>  	struct iov_iter		*to)
>  {
>  	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
> -	size_t			count = iov_iter_count(to);
>  	ssize_t			ret;
>  
> -	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
> +	trace_xfs_file_direct_read(iocb, to);
>  
> -	if (!count)
> +	if (!iov_iter_count(to))
>  		return 0; /* skip atime */
>  
>  	file_accessed(iocb->ki_filp);
> @@ -246,12 +245,11 @@ xfs_file_dax_read(
>  	struct iov_iter		*to)
>  {
>  	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
> -	size_t			count = iov_iter_count(to);
>  	ssize_t			ret = 0;
>  
> -	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
> +	trace_xfs_file_dax_read(iocb, to);
>  
> -	if (!count)
> +	if (!iov_iter_count(to))
>  		return 0; /* skip atime */
>  
>  	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
> @@ -272,7 +270,7 @@ xfs_file_buffered_read(
>  	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
>  	ssize_t			ret;
>  
> -	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
> +	trace_xfs_file_buffered_read(iocb, to);
>  
>  	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
>  	if (ret)
> @@ -599,7 +597,7 @@ xfs_file_dio_write(
>  		iolock = XFS_IOLOCK_SHARED;
>  	}
>  
> -	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
> +	trace_xfs_file_direct_write(iocb, from);
>  	/*
>  	 * If unaligned, this is the only IO in-flight. Wait on it before we
>  	 * release the iolock to prevent subsequent overlapping IO.
> @@ -622,7 +620,6 @@ xfs_file_dax_write(
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	int			iolock = XFS_IOLOCK_EXCL;
>  	ssize_t			ret, error = 0;
> -	size_t			count;
>  	loff_t			pos;
>  
>  	ret = xfs_ilock_iocb(iocb, iolock);
> @@ -633,9 +630,8 @@ xfs_file_dax_write(
>  		goto out;
>  
>  	pos = iocb->ki_pos;
> -	count = iov_iter_count(from);
>  
> -	trace_xfs_file_dax_write(ip, count, pos);
> +	trace_xfs_file_dax_write(iocb, from);
>  	ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
>  	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
>  		i_size_write(inode, iocb->ki_pos);
> @@ -683,7 +679,7 @@ xfs_file_buffered_write(
>  	/* We can write back this queue in page reclaim */
>  	current->backing_dev_info = inode_to_bdi(inode);
>  
> -	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
> +	trace_xfs_file_buffered_write(iocb, from);
>  	ret = iomap_file_buffered_write(iocb, from,
>  			&xfs_buffered_write_iomap_ops);
>  	if (likely(ret >= 0))
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 5a263ae3d4f008..a6d04d860a565e 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1287,8 +1287,8 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
>  )
>  
>  DECLARE_EVENT_CLASS(xfs_file_class,
> -	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
> -	TP_ARGS(ip, count, offset),
> +	TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),
> +	TP_ARGS(iocb, iter),
>  	TP_STRUCT__entry(
>  		__field(dev_t, dev)
>  		__field(xfs_ino_t, ino)
> @@ -1297,11 +1297,11 @@ DECLARE_EVENT_CLASS(xfs_file_class,
>  		__field(size_t, count)
>  	),
>  	TP_fast_assign(
> -		__entry->dev = VFS_I(ip)->i_sb->s_dev;
> -		__entry->ino = ip->i_ino;
> -		__entry->size = ip->i_d.di_size;
> -		__entry->offset = offset;
> -		__entry->count = count;
> +		__entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
> +		__entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino;
> +		__entry->size = XFS_I(file_inode(iocb->ki_filp))->i_d.di_size;
> +		__entry->offset = iocb->ki_pos;
> +		__entry->count = iov_iter_count(iter);
>  	),
>  	TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
> @@ -1313,8 +1313,8 @@ DECLARE_EVENT_CLASS(xfs_file_class,
>  
>  #define DEFINE_RW_EVENT(name)		\
>  DEFINE_EVENT(xfs_file_class, name,	\
> -	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),	\
> -	TP_ARGS(ip, count, offset))
> +	TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),		\
> +	TP_ARGS(iocb, iter))
>  DEFINE_RW_EVENT(xfs_file_buffered_read);
>  DEFINE_RW_EVENT(xfs_file_direct_read);
>  DEFINE_RW_EVENT(xfs_file_dax_read);
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 06/11] xfs: improve the reflink_bounce_dio_write tracepoint
  2021-01-18 19:35 ` [PATCH 06/11] xfs: improve the reflink_bounce_dio_write tracepoint Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
@ 2021-01-20 18:45   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 18:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:11PM +0100, Christoph Hellwig wrote:
> Use a more suitable event class.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>

Woot!
Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_file.c  | 2 +-
>  fs/xfs/xfs_trace.h | 4 ++--
>  2 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index aa64e78fc3c467..a696bd34f71d21 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -560,7 +560,7 @@ xfs_file_dio_write(
>  		 * files yet, as we can't unshare a partial block.
>  		 */
>  		if (xfs_is_cow_inode(ip)) {
> -			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
> +			trace_xfs_reflink_bounce_dio_write(iocb, from);
>  			return -ENOTBLK;
>  		}
>  		iolock = XFS_IOLOCK_EXCL;
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index a6d04d860a565e..0cfd65cd67c190 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1321,6 +1321,8 @@ DEFINE_RW_EVENT(xfs_file_dax_read);
>  DEFINE_RW_EVENT(xfs_file_buffered_write);
>  DEFINE_RW_EVENT(xfs_file_direct_write);
>  DEFINE_RW_EVENT(xfs_file_dax_write);
> +DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
> +
>  
>  DECLARE_EVENT_CLASS(xfs_imap_class,
>  	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
> @@ -3294,8 +3296,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
>  
> -DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
> -
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 07/11] xfs: split unaligned DIO write code out
  2021-01-18 19:35 ` [PATCH 07/11] xfs: split unaligned DIO write code out Christoph Hellwig
  2021-01-19 15:23   ` Brian Foster
@ 2021-01-20 18:46   ` Darrick J. Wong
  1 sibling, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 18:46 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Mon, Jan 18, 2021 at 08:35:12PM +0100, Christoph Hellwig wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> The unaligned DIO write path is more convoluted than the normal path,
> and we are about to make it more complex. Keep the block aligned
> fast path dio write code trim and simple by splitting out the
> unaligned DIO code from it.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> [hch: rebased, fixed a few minor nits]
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good,
Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_file.c | 168 +++++++++++++++++++++++++---------------------
>  1 file changed, 92 insertions(+), 76 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index a696bd34f71d21..bffd7240cefb7f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -500,117 +500,133 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
>  };
>  
>  /*
> - * xfs_file_dio_write - handle direct IO writes
> + * Handle block aligned direct IO writes
>   *
>   * Lock the inode appropriately to prepare for and issue a direct IO write.
> - * By separating it from the buffered write path we remove all the tricky to
> - * follow locking changes and looping.
>   *
>   * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
>   * until we're sure the bytes at the new EOF have been zeroed and/or the cached
>   * pages are flushed out.
> + */
> +static noinline ssize_t
> +xfs_file_dio_write_aligned(
> +	struct xfs_inode	*ip,
> +	struct kiocb		*iocb,
> +	struct iov_iter		*from)
> +{
> +	int			iolock = XFS_IOLOCK_SHARED;
> +	ssize_t			ret;
> +
> +	ret = xfs_ilock_iocb(iocb, iolock);
> +	if (ret)
> +		return ret;
> +	ret = xfs_file_write_checks(iocb, from, &iolock);
> +	if (ret)
> +		goto out_unlock;
> +
> +	/*
> +	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
> +	 * the iolock back to shared if we had to take the exclusive lock in
> +	 * xfs_file_write_checks() for other reasons.
> +	 */
> +	if (iolock == XFS_IOLOCK_EXCL) {
> +		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> +		iolock = XFS_IOLOCK_SHARED;
> +	}
> +	trace_xfs_file_direct_write(iocb, from);
> +	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> +			   &xfs_dio_write_ops, is_sync_kiocb(iocb));
> +out_unlock:
> +	if (iolock)
> +		xfs_iunlock(ip, iolock);
> +	return ret;
> +}
> +
> +/*
> + * Handle block unaligned direct IO writes
> + *
> + * In most cases direct IO writes will be done holding IOLOCK_SHARED, allowing
> + * them to be done in parallel with reads and other direct IO writes.  However,
> + * if the I/O is not aligned to filesystem blocks, the direct I/O layer may
> + * need to do sub-block zeroing and that requires serialisation against other
> + * direct I/Os to the same block. In this case we need to serialise the
> + * submission of the unaligned I/Os so that we don't get racing block zeroing in
> + * the dio layer.
>   *
> - * In most cases the direct IO writes will be done holding IOLOCK_SHARED
> - * allowing them to be done in parallel with reads and other direct IO writes.
> - * However, if the IO is not aligned to filesystem blocks, the direct IO layer
> - * needs to do sub-block zeroing and that requires serialisation against other
> - * direct IOs to the same block. In this case we need to serialise the
> - * submission of the unaligned IOs so that we don't get racing block zeroing in
> - * the dio layer.  To avoid the problem with aio, we also need to wait for
> + * To provide the same serialisation for AIO, we also need to wait for
>   * outstanding IOs to complete so that unwritten extent conversion is completed
>   * before we try to map the overlapping block. This is currently implemented by
>   * hitting it with a big hammer (i.e. inode_dio_wait()).
>   *
> - * Returns with locks held indicated by @iolock and errors indicated by
> - * negative return values.
> + * This means that unaligned dio writes always block. There is no "nowait" fast
> + * path in this code - if IOCB_NOWAIT is set we simply return -EAGAIN up front
> + * and we don't have to worry about that anymore.
>   */
> -STATIC ssize_t
> -xfs_file_dio_write(
> +static noinline ssize_t
> +xfs_file_dio_write_unaligned(
> +	struct xfs_inode	*ip,
>  	struct kiocb		*iocb,
>  	struct iov_iter		*from)
>  {
> -	struct file		*file = iocb->ki_filp;
> -	struct address_space	*mapping = file->f_mapping;
> -	struct inode		*inode = mapping->host;
> -	struct xfs_inode	*ip = XFS_I(inode);
> -	struct xfs_mount	*mp = ip->i_mount;
> -	ssize_t			ret = 0;
> -	int			unaligned_io = 0;
> -	int			iolock;
> -	size_t			count = iov_iter_count(from);
> -	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
> +	int			iolock = XFS_IOLOCK_EXCL;
> +	ssize_t			ret;
>  
> -	/* DIO must be aligned to device logical sector size */
> -	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> -		return -EINVAL;
> +	/* unaligned dio always waits, bail */
> +	if (iocb->ki_flags & IOCB_NOWAIT)
> +		return -EAGAIN;
> +	xfs_ilock(ip, iolock);
>  
>  	/*
> -	 * Don't take the exclusive iolock here unless the I/O is unaligned to
> -	 * the file system block size.  We don't need to consider the EOF
> -	 * extension case here because xfs_file_write_checks() will relock
> -	 * the inode as necessary for EOF zeroing cases and fill out the new
> -	 * inode size as appropriate.
> +	 * We can't properly handle unaligned direct I/O to reflink files yet,
> +	 * as we can't unshare a partial block.
>  	 */
> -	if ((iocb->ki_pos & mp->m_blockmask) ||
> -	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
> -		unaligned_io = 1;
> -
> -		/*
> -		 * We can't properly handle unaligned direct I/O to reflink
> -		 * files yet, as we can't unshare a partial block.
> -		 */
> -		if (xfs_is_cow_inode(ip)) {
> -			trace_xfs_reflink_bounce_dio_write(iocb, from);
> -			return -ENOTBLK;
> -		}
> -		iolock = XFS_IOLOCK_EXCL;
> -	} else {
> -		iolock = XFS_IOLOCK_SHARED;
> -	}
> -
> -	if (iocb->ki_flags & IOCB_NOWAIT) {
> -		/* unaligned dio always waits, bail */
> -		if (unaligned_io)
> -			return -EAGAIN;
> -		if (!xfs_ilock_nowait(ip, iolock))
> -			return -EAGAIN;
> -	} else {
> -		xfs_ilock(ip, iolock);
> +	if (xfs_is_cow_inode(ip)) {
> +		trace_xfs_reflink_bounce_dio_write(iocb, from);
> +		ret = -ENOTBLK;
> +		goto out_unlock;
>  	}
>  
>  	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
> -		goto out;
> -	count = iov_iter_count(from);
> +		goto out_unlock;
>  
>  	/*
> -	 * If we are doing unaligned IO, we can't allow any other overlapping IO
> -	 * in-flight at the same time or we risk data corruption. Wait for all
> -	 * other IO to drain before we submit. If the IO is aligned, demote the
> -	 * iolock if we had to take the exclusive lock in
> -	 * xfs_file_write_checks() for other reasons.
> +	 * If we are doing unaligned I/O, we can't allow any other overlapping
> +	 * I/O in-flight at the same time or we risk data corruption. Wait for
> +	 * all other I/O to drain before we submit.
>  	 */
> -	if (unaligned_io) {
> -		inode_dio_wait(inode);
> -	} else if (iolock == XFS_IOLOCK_EXCL) {
> -		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> -		iolock = XFS_IOLOCK_SHARED;
> -	}
> +	inode_dio_wait(VFS_I(ip));
>  
> -	trace_xfs_file_direct_write(iocb, from);
>  	/*
> -	 * If unaligned, this is the only IO in-flight. Wait on it before we
> -	 * release the iolock to prevent subsequent overlapping IO.
> +	 * This must be the only I/O in-flight. Wait on it before we release the
> +	 * iolock to prevent subsequent overlapping I/O.
>  	 */
> +	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> -			   &xfs_dio_write_ops,
> -			   is_sync_kiocb(iocb) || unaligned_io);
> -out:
> +			   &xfs_dio_write_ops, true);
> +out_unlock:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);
>  	return ret;
>  }
>  
> +static ssize_t
> +xfs_file_dio_write(
> +	struct kiocb		*iocb,
> +	struct iov_iter		*from)
> +{
> +	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
> +	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
> +	size_t			count = iov_iter_count(from);
> +
> +	/* DIO must be aligned to device logical sector size */
> +	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> +		return -EINVAL;
> +	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
> +		return xfs_file_dio_write_unaligned(ip, iocb, from);
> +	return xfs_file_dio_write_aligned(ip, iocb, from);
> +}
> +
>  static noinline ssize_t
>  xfs_file_dax_write(
>  	struct kiocb		*iocb,
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 08/11] iomap: rename the flags variable in __iomap_dio_rw
  2021-01-18 19:35 ` [PATCH 08/11] iomap: rename the flags variable in __iomap_dio_rw Christoph Hellwig
  2021-01-18 20:34   ` Dave Chinner
  2021-01-19 15:23   ` Brian Foster
@ 2021-01-20 18:46   ` Darrick J. Wong
  2 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 18:46 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi

On Mon, Jan 18, 2021 at 08:35:13PM +0100, Christoph Hellwig wrote:
> Rename flags to iomap_flags to make the usage a little more clear.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Pretty straightforward...
Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> ---
>  fs/iomap/direct-io.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 933f234d5becd0..604103ab76f9c5 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -427,7 +427,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	size_t count = iov_iter_count(iter);
>  	loff_t pos = iocb->ki_pos;
>  	loff_t end = iocb->ki_pos + count - 1, ret = 0;
> -	unsigned int flags = IOMAP_DIRECT;
> +	unsigned int iomap_flags = IOMAP_DIRECT;
>  	struct blk_plug plug;
>  	struct iomap_dio *dio;
>  
> @@ -461,7 +461,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		if (iter_is_iovec(iter))
>  			dio->flags |= IOMAP_DIO_DIRTY;
>  	} else {
> -		flags |= IOMAP_WRITE;
> +		iomap_flags |= IOMAP_WRITE;
>  		dio->flags |= IOMAP_DIO_WRITE;
>  
>  		/* for data sync or sync, we need sync completion processing */
> @@ -483,7 +483,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  			ret = -EAGAIN;
>  			goto out_free_dio;
>  		}
> -		flags |= IOMAP_NOWAIT;
> +		iomap_flags |= IOMAP_NOWAIT;
>  	}
>  
>  	ret = filemap_write_and_wait_range(mapping, pos, end);
> @@ -514,7 +514,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  
>  	blk_start_plug(&plug);
>  	do {
> -		ret = iomap_apply(inode, pos, count, flags, ops, dio,
> +		ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio,
>  				iomap_dio_actor);
>  		if (ret <= 0) {
>  			/* magic error code to fall back to buffered I/O */
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 10/11] iomap: add a IOMAP_DIO_UNALIGNED flag
  2021-01-18 19:35 ` [PATCH 10/11] iomap: add a IOMAP_DIO_UNALIGNED flag Christoph Hellwig
  2021-01-18 20:45   ` Dave Chinner
  2021-01-18 21:41   ` Matthew Wilcox
@ 2021-01-20 18:47   ` Darrick J. Wong
  2 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 18:47 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi

On Mon, Jan 18, 2021 at 08:35:15PM +0100, Christoph Hellwig wrote:
> Add a flag to signal an I/O that is not file system block aligned.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap/direct-io.c  | 7 +++++++
>  include/linux/iomap.h | 8 ++++++++
>  2 files changed, 15 insertions(+)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 32dbbf7dd4aadb..d93019ee4c9e3e 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -485,6 +485,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		iomap_flags |= IOMAP_NOWAIT;
>  	}
>  
> +	if (dio_flags & IOMAP_DIO_UNALIGNED) {
> +		ret = -EAGAIN;
> +		if (pos >= dio->i_size)
> +			goto out_free_dio;
> +		iomap_flags |= IOMAP_UNALIGNED;
> +	}
> +
>  	ret = filemap_write_and_wait_range(mapping, pos, end);
>  	if (ret)
>  		goto out_free_dio;
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index b322598dc10ec0..2fa94ec9583d0a 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -122,6 +122,7 @@ struct iomap_page_ops {
>  #define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
>  #define IOMAP_DIRECT		(1 << 4) /* direct I/O */
>  #define IOMAP_NOWAIT		(1 << 5) /* do not block */
> +#define IOMAP_UNALIGNED		(1 << 6) /* do not allocate blocks */
>  
>  struct iomap_ops {
>  	/*
> @@ -262,6 +263,13 @@ struct iomap_dio_ops {
>   */
>  #define IOMAP_DIO_FORCE_WAIT	(1 << 0)
>  
> +/*
> + * Direct I/O that is not aligned to the file system block.  Do not allocate
> + * blocks and do not zero partial blocks, fall back to the caller by returning
> + * -EAGAIN instead.
> + */
> +#define IOMAP_DIO_UNALIGNED	(1 << 1)

The code changes look fine, but as for the name, I found it a little
confusing even after changing it to IOMAP_DIO_SUBBLOCK.

See my reply to patch 11 for more details.

--D

> +
>  ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
>  		unsigned int flags);
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 11/11] xfs: reduce exclusive locking on unaligned dio
  2021-01-20 18:44     ` Christoph Hellwig
@ 2021-01-20 19:58       ` Darrick J. Wong
  0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2021-01-20 19:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-fsdevel, avi, Dave Chinner

On Wed, Jan 20, 2021 at 07:44:00PM +0100, Christoph Hellwig wrote:
> [another full quote removed, guys please send properly formatted email]
> 
> On Wed, Jan 20, 2021 at 10:40:56AM -0800, Darrick J. Wong wrote:
> > > +	if (!(flags & IOMAP_DIO_UNALIGNED))
> > > +		inode_dio_wait(VFS_I(ip));
> > 
> > Er... this really confused me when I read it -- my first thought was
> > "How can we be in the unaligned direct write function but DIO_UNALIGNED
> > isn't set?  Wouldn't we be in some other function if we're doing an
> > aligned direct write?"
> > 
> > > Then I looked upthread to where Christoph said he'd renamed it
> > IOMAP_DIO_SUBBLOCK, but I didn't think that was sufficiently better:
> > 
> > 	if (!(flags & IOMAP_DIO_SUBBLOCK))
> > 		iomap_dio_wait(...);
> > 
> > This flag doesn't have a 1:1 relationship with the iocb asking for an
> > (fsblock-)unaligned write or the iocb saying that the write involves
> > sub-block io -- this flag really means "I require a stable written
> > mapping, no post-processing (of the disk block) allowed".
> 
> Would:
> 
> 	if (flags & IOMAP_DIO_FORCE_WAIT)
> 		inode_dio_wait(VFS_I(ip));
> 
> look any better to you?  Behavior would be the same.

Looks fine to me.

--D

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH 01/11] xfs: factor out a xfs_ilock_iocb helper
  2021-01-22 16:20 reduce sub-block DIO serialisation v4 Christoph Hellwig
@ 2021-01-22 16:20 ` Christoph Hellwig
  0 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-22 16:20 UTC (permalink / raw)
  To: linux-xfs
  Cc: linux-fsdevel, avi, Dave Chinner, Brian Foster, Darrick J . Wong

Add a helper to factor out the nowait locking logic for the read/write
helpers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_file.c | 55 +++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5b0f93f738372d..c441cddfa4acbc 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -197,6 +197,23 @@ xfs_file_fsync(
 	return error;
 }
 
+static int
+xfs_ilock_iocb(
+	struct kiocb		*iocb,
+	unsigned int		lock_mode)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!xfs_ilock_nowait(ip, lock_mode))
+			return -EAGAIN;
+	} else {
+		xfs_ilock(ip, lock_mode);
+	}
+
+	return 0;
+}
+
 STATIC ssize_t
 xfs_file_dio_aio_read(
 	struct kiocb		*iocb,
@@ -213,12 +230,9 @@ xfs_file_dio_aio_read(
 
 	file_accessed(iocb->ki_filp);
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
 			is_sync_kiocb(iocb));
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -240,13 +254,9 @@ xfs_file_dax_read(
 	if (!count)
 		return 0; /* skip atime */
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
@@ -264,12 +274,9 @@ xfs_file_buffered_aio_read(
 
 	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
 	ret = generic_file_read_iter(iocb, to);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
@@ -608,13 +615,9 @@ xfs_file_dax_write(
 	size_t			count;
 	loff_t			pos;
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, iolock))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
-	}
-
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 01/11] xfs: factor out a xfs_ilock_iocb helper
  2021-01-21  8:58 reduce sub-block DIO serialisation v3 Christoph Hellwig
@ 2021-01-21  8:58 ` Christoph Hellwig
  0 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2021-01-21  8:58 UTC (permalink / raw)
  To: linux-xfs
  Cc: linux-fsdevel, avi, Dave Chinner, Brian Foster, Darrick J . Wong

Add a helper to factor out the nowait locking logic for the read/write
helpers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_file.c | 55 +++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5b0f93f738372d..c441cddfa4acbc 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -197,6 +197,23 @@ xfs_file_fsync(
 	return error;
 }
 
+static int
+xfs_ilock_iocb(
+	struct kiocb		*iocb,
+	unsigned int		lock_mode)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!xfs_ilock_nowait(ip, lock_mode))
+			return -EAGAIN;
+	} else {
+		xfs_ilock(ip, lock_mode);
+	}
+
+	return 0;
+}
+
 STATIC ssize_t
 xfs_file_dio_aio_read(
 	struct kiocb		*iocb,
@@ -213,12 +230,9 @@ xfs_file_dio_aio_read(
 
 	file_accessed(iocb->ki_filp);
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
 			is_sync_kiocb(iocb));
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -240,13 +254,9 @@ xfs_file_dax_read(
 	if (!count)
 		return 0; /* skip atime */
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
@@ -264,12 +274,9 @@ xfs_file_buffered_aio_read(
 
 	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
 	ret = generic_file_read_iter(iocb, to);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
@@ -608,13 +615,9 @@ xfs_file_dax_write(
 	size_t			count;
 	loff_t			pos;
 
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, iolock))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
-	}
-
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2021-01-22 16:27 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-18 19:35 reduce sub-block DIO serialisation v2 Christoph Hellwig
2021-01-18 19:35 ` [PATCH 01/11] xfs: factor out a xfs_ilock_iocb helper Christoph Hellwig
2021-01-20 18:41   ` Darrick J. Wong
2021-01-18 19:35 ` [PATCH 02/11] xfs: make xfs_file_aio_write_checks IOCB_NOWAIT-aware Christoph Hellwig
     [not found]   ` <CACz=WeeaqMrGM53pJF0C_Wt2JuavTOnOV26-osPviYLUpqUmFw@mail.gmail.com>
2021-01-20 16:28     ` Christoph Hellwig
2021-01-20 18:42   ` Darrick J. Wong
2021-01-18 19:35 ` [PATCH 03/11] xfs: cleanup the read/write helper naming Christoph Hellwig
2021-01-19 15:23   ` Brian Foster
2021-01-20 18:43   ` Darrick J. Wong
2021-01-18 19:35 ` [PATCH 04/11] xfs: remove the buffered I/O fallback assert Christoph Hellwig
2021-01-19 15:23   ` Brian Foster
2021-01-20 18:43   ` Darrick J. Wong
2021-01-18 19:35 ` [PATCH 05/11] xfs: simplify the read/write tracepoints Christoph Hellwig
2021-01-19 15:23   ` Brian Foster
2021-01-20 18:45   ` Darrick J. Wong
2021-01-18 19:35 ` [PATCH 06/11] xfs: improve the reflink_bounce_dio_write tracepoint Christoph Hellwig
2021-01-19 15:23   ` Brian Foster
2021-01-20 18:45   ` Darrick J. Wong
2021-01-18 19:35 ` [PATCH 07/11] xfs: split unaligned DIO write code out Christoph Hellwig
2021-01-19 15:23   ` Brian Foster
2021-01-20 18:46   ` Darrick J. Wong
2021-01-18 19:35 ` [PATCH 08/11] iomap: rename the flags variable in __iomap_dio_rw Christoph Hellwig
2021-01-18 20:34   ` Dave Chinner
2021-01-19 15:23   ` Brian Foster
2021-01-20 18:46   ` Darrick J. Wong
2021-01-18 19:35 ` [PATCH 09/11] iomap: pass a flags argument to iomap_dio_rw Christoph Hellwig
2021-01-19 15:23   ` Brian Foster
2021-01-20 18:17   ` Darrick J. Wong
2021-01-20 18:35     ` Christoph Hellwig
2021-01-18 19:35 ` [PATCH 10/11] iomap: add a IOMAP_DIO_UNALIGNED flag Christoph Hellwig
2021-01-18 20:45   ` Dave Chinner
2021-01-18 21:41   ` Matthew Wilcox
2021-01-20 16:40     ` Christoph Hellwig
2021-01-20 18:47   ` Darrick J. Wong
2021-01-18 19:35 ` [PATCH 11/11] xfs: reduce exclusive locking on unaligned dio Christoph Hellwig
2021-01-18 20:55   ` Dave Chinner
2021-01-20 16:36     ` Christoph Hellwig
2021-01-20 18:40   ` Darrick J. Wong
2021-01-20 18:44     ` Christoph Hellwig
2021-01-20 19:58       ` Darrick J. Wong
2021-01-21  8:58 reduce sub-block DIO serialisation v3 Christoph Hellwig
2021-01-21  8:58 ` [PATCH 01/11] xfs: factor out a xfs_ilock_iocb helper Christoph Hellwig
2021-01-22 16:20 reduce sub-block DIO serialisation v4 Christoph Hellwig
2021-01-22 16:20 ` [PATCH 01/11] xfs: factor out a xfs_ilock_iocb helper Christoph Hellwig

This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).