Linux-XFS Archive on lore.kernel.org
 help / color / Atom feed
From: Christoph Hellwig <hch@lst.de>
To: linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	Waiman Long <longman@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Will Deacon <will@kernel.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-ext4@vger.kernel.org, cluster-devel@redhat.com
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: [PATCH 07/12] iomap: allow holding i_rwsem until aio completion
Date: Tue, 14 Jan 2020 17:12:20 +0100
Message-ID: <20200114161225.309792-8-hch@lst.de> (raw)
In-Reply-To: <20200114161225.309792-1-hch@lst.de>

The direct I/O code currently uses a hand crafted i_dio_count that needs
to be incremented under i_rwsem and then is decremented when I/O
completes.  That scheme means file system code needs to be very careful
to wait for i_dio_count to reach zero under i_rwsem in various places
that are very cumbersome to get rid.  It also means we can't get the
effect of an exclusive i_rwsem for actually asynchronous I/O, forcing
pointless synchronous execution of sub-blocksize writes.

Replace the i_dio_count scheme with holding i_rwsem over the duration
of the whole I/O.  While this introduces a non-owner unlock that isn't
nice to RT workload, the open coded locking primitive using i_dio_count
isn't any better.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/direct-io.c  | 44 +++++++++++++++++++++++++++++++++++++------
 include/linux/iomap.h |  2 ++
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index e706329d71a0..0113ac33b0a0 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -70,7 +70,7 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
 	dio->submit.cookie = submit_bio(bio);
 }
 
-static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+static ssize_t iomap_dio_complete(struct iomap_dio *dio, bool unlock)
 {
 	const struct iomap_dio_ops *dops = dio->dops;
 	struct kiocb *iocb = dio->iocb;
@@ -112,6 +112,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 			dio_warn_stale_pagecache(iocb->ki_filp);
 	}
 
+	if (unlock) {
+		if (dio->flags & IOMAP_DIO_RWSEM_EXCL)
+			up_write(&inode->i_rwsem);
+		else if (dio->flags & IOMAP_DIO_RWSEM_SHARED)
+			up_read(&inode->i_rwsem);
+	}
+
 	/*
 	 * If this is a DSYNC write, make sure we push it to stable storage now
 	 * that we've written data.
@@ -129,8 +136,22 @@ static void iomap_dio_complete_work(struct work_struct *work)
 {
 	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
 	struct kiocb *iocb = dio->iocb;
+	struct inode *inode = file_inode(iocb->ki_filp);
 
-	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
+	/*
+	 * XXX: For reads this code is directly called from bio ->end_io, which
+	 * often is hard or softirq context.  In that case lockdep records the
+	 * below as lock acquisitions from irq context and causes warnings.
+	 */
+	if (dio->flags & IOMAP_DIO_RWSEM_EXCL) {
+		rwsem_acquire(&inode->i_rwsem.dep_map, 0, 0, _THIS_IP_);
+		if (IS_ENABLED(CONFIG_RWSEM_SPIN_ON_OWNER))
+			atomic_long_set(&inode->i_rwsem.owner, (long)current);
+	} else if (dio->flags & IOMAP_DIO_RWSEM_SHARED) {
+		rwsem_acquire_read(&inode->i_rwsem.dep_map, 0, 0, _THIS_IP_);
+	}
+
+	iocb->ki_complete(iocb, iomap_dio_complete(dio, true), 0);
 }
 
 /*
@@ -430,7 +451,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	dio->i_size = i_size_read(inode);
 	dio->dops = dops;
 	dio->error = 0;
-	dio->flags = 0;
+	dio->flags = dio_flags;
 
 	dio->submit.iter = iter;
 	dio->submit.waiter = current;
@@ -551,8 +572,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	dio->wait_for_completion = wait_for_completion;
 	if (!atomic_dec_and_test(&dio->ref)) {
 		if (!wait_for_completion)
-			return -EIOCBQUEUED;
-
+			goto async_completion;
 		for (;;) {
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			if (!READ_ONCE(dio->submit.waiter))
@@ -567,10 +587,22 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		__set_current_state(TASK_RUNNING);
 	}
 
-	return iomap_dio_complete(dio);
+	return iomap_dio_complete(dio, false);
 
 out_free_dio:
 	kfree(dio);
 	return ret;
+
+async_completion:
+	/*
+	 * We are returning to userspace now, but i_rwsem is still held until
+	 * the I/O completion comes back.
+	 */
+	if (dio_flags & (IOMAP_DIO_RWSEM_EXCL | IOMAP_DIO_RWSEM_SHARED))
+		rwsem_release(&inode->i_rwsem.dep_map, _THIS_IP_);
+	if ((dio_flags & IOMAP_DIO_RWSEM_EXCL) &&
+	    IS_ENABLED(CONFIG_RWSEM_SPIN_ON_OWNER))
+		atomic_long_set(&inode->i_rwsem.owner, RWSEM_OWNER_UNKNOWN);
+	return -EIOCBQUEUED;
 }
 EXPORT_SYMBOL_GPL(iomap_dio_rw);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 3faeb8fd0961..f259bb979d7f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -249,6 +249,8 @@ int iomap_writepages(struct address_space *mapping,
 #define IOMAP_DIO_UNWRITTEN	(1 << 0)	/* covers unwritten extent(s) */
 #define IOMAP_DIO_COW		(1 << 1)	/* covers COW extent(s) */
 #define IOMAP_DIO_SYNCHRONOUS	(1 << 2)	/* no async completion */
+#define IOMAP_DIO_RWSEM_EXCL	(1 << 3)	/* holds shared i_rwsem */
+#define IOMAP_DIO_RWSEM_SHARED	(1 << 4)	/* holds exclusive i_rwsem */
 
 struct iomap_dio_ops {
 	int (*end_io)(struct kiocb *iocb, ssize_t size, int error,
-- 
2.24.1


  parent reply index

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-14 16:12 RFC: hold i_rwsem until aio completes Christoph Hellwig
2020-01-14 16:12 ` [PATCH 01/12] mm: fix a comment in sys_swapon Christoph Hellwig
2020-02-10 23:29   ` Andrew Morton
2020-02-12  7:37     ` Christoph Hellwig
2020-01-14 16:12 ` [PATCH 02/12] locking/rwsem: Exit early when held by an anonymous owner Christoph Hellwig
2020-01-14 18:17   ` Waiman Long
2020-01-14 18:25     ` Christoph Hellwig
2020-01-14 18:33       ` Waiman Long
2020-01-14 18:55       ` Waiman Long
2020-01-14 16:12 ` [PATCH 03/12] xfs: fix IOCB_NOWAIT handling in xfs_file_dio_aio_read Christoph Hellwig
2020-01-14 16:12 ` [PATCH 04/12] gfs2: move setting current->backing_dev_info Christoph Hellwig
2020-01-14 16:12 ` [PATCH 05/12] gfs2: fix O_SYNC write handling Christoph Hellwig
2020-02-06 15:31   ` [Cluster-devel] " Andreas Gruenbacher
2020-01-14 16:12 ` [PATCH 06/12] iomap: pass a flags value to iomap_dio_rw Christoph Hellwig
2020-01-14 16:12 ` Christoph Hellwig [this message]
2020-01-14 16:12 ` [PATCH 08/12] ext4: hold i_rwsem until AIO completes Christoph Hellwig
2020-01-14 21:50   ` Theodore Y. Ts'o
2020-01-15  6:48     ` Christoph Hellwig
2020-01-14 16:12 ` [PATCH 09/12] gfs2: " Christoph Hellwig
2020-01-14 16:12 ` [PATCH 10/12] xfs: " Christoph Hellwig
2020-01-14 16:12 ` [PATCH 11/12] xfs: don't set IOMAP_DIO_SYNCHRONOUS for unaligned I/O Christoph Hellwig
2020-01-14 16:12 ` [PATCH 12/12] iomap: remove the inode_dio_begin/end calls Christoph Hellwig
2020-01-14 18:47 ` RFC: hold i_rwsem until aio completes Matthew Wilcox
2020-01-15  6:54   ` Christoph Hellwig
2020-01-14 19:27 ` Jason Gunthorpe
2020-01-15  6:56   ` Christoph Hellwig
2020-01-15 13:24     ` Jason Gunthorpe
2020-01-15 14:33       ` Peter Zijlstra
2020-01-15 14:49         ` Jason Gunthorpe
2020-01-15 19:03           ` Waiman Long
2020-01-15 19:07             ` Christoph Hellwig
2020-01-18 22:40         ` Matthew Wilcox
2020-01-15 15:36       ` Christoph Hellwig
2020-01-15 16:26         ` Jason Gunthorpe
2020-01-16 14:00 ` Jan Kara
2020-02-03 17:44   ` Christoph Hellwig
2020-01-18  9:28 ` Dave Chinner
2020-02-03 17:46   ` Christoph Hellwig
2020-02-03 23:02     ` Dave Chinner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200114161225.309792-8-hch@lst.de \
    --to=hch@lst.de \
    --cc=akpm@linux-foundation.org \
    --cc=cluster-devel@redhat.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=longman@redhat.com \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=tglx@linutronix.de \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-XFS Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-xfs/0 linux-xfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-xfs linux-xfs/ https://lore.kernel.org/linux-xfs \
		linux-xfs@vger.kernel.org
	public-inbox-index linux-xfs

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-xfs


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git