All of lore.kernel.org
 help / color / mirror / Atom feed
From: Anand Jain <anand.jain@oracle.com>
To: stable@vger.kernel.org
Cc: linux-btrfs@vger.kernel.org,
	Andreas Gruenbacher <agruenba@redhat.com>,
	Anand Jain <anand.jain@oracle.com>
Subject: [PATCH 15/17 stable-5.15.y] gfs2: Fix mmap + page fault deadlocks for direct I/O
Date: Sat,  2 Apr 2022 18:25:52 +0800	[thread overview]
Message-ID: <78e2ee8d1958c50a0fbadb15dc18df91b538bdf7.1648636044.git.anand.jain@oracle.com> (raw)
In-Reply-To: <cover.1648636044.git.anand.jain@oracle.com>

From: Andreas Gruenbacher <agruenba@redhat.com>

commit b01b2d72da25c000aeb124bc78daf3fb998be2b6 upstream

Also disable page faults during direct I/O requests and implement a
similar kind of retry logic as in the buffered I/O case.

The retry logic in the direct I/O case differs from the buffered I/O
case in the following way: direct I/O doesn't provide the kinds of
consistency guarantees between concurrent reads and writes that buffered
I/O provides, so once we lose the inode glock while faulting in user
pages, we always resume the operation.  We never need to return a
partial read or write.

This locking problem was originally reported by Jan Kara.  Linus came up
with the idea of disabling page faults.  Many thanks to Al Viro and
Matthew Wilcox for their feedback.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
 fs/gfs2/file.c | 99 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 87 insertions(+), 12 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index aaeb0e9bc04d..d7b55d0ca09c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -812,22 +812,64 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
 {
 	struct file *file = iocb->ki_filp;
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-	size_t count = iov_iter_count(to);
+	size_t prev_count = 0, window_size = 0;
+	size_t written = 0;
 	ssize_t ret;
 
-	if (!count)
+	/*
+	 * In this function, we disable page faults when we're holding the
+	 * inode glock while doing I/O.  If a page fault occurs, we indicate
+	 * that the inode glock may be dropped, fault in the pages manually,
+	 * and retry.
+	 *
+	 * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
+	 * physical as well as manual page faults, and we need to disable both
+	 * kinds.
+	 *
+	 * For direct I/O, gfs2 takes the inode glock in deferred mode.  This
+	 * locking mode is compatible with other deferred holders, so multiple
+	 * processes and nodes can do direct I/O to a file at the same time.
+	 * There's no guarantee that reads or writes will be atomic.  Any
+	 * coordination among readers and writers needs to happen externally.
+	 */
+
+	if (!iov_iter_count(to))
 		return 0; /* skip atime */
 
 	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+retry:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
+retry_under_glock:
+	pagefault_disable();
+	to->nofault = true;
+	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
+			   IOMAP_DIO_PARTIAL, written);
+	to->nofault = false;
+	pagefault_enable();
+	if (ret > 0)
+		written = ret;
 
-	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0, 0);
-	gfs2_glock_dq(gh);
+	if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
+		size_t leftover;
+
+		gfs2_holder_allow_demote(gh);
+		leftover = fault_in_iov_iter_writeable(to, window_size);
+		gfs2_holder_disallow_demote(gh);
+		if (leftover != window_size) {
+			if (!gfs2_holder_queued(gh))
+				goto retry;
+			goto retry_under_glock;
+		}
+	}
+	if (gfs2_holder_queued(gh))
+		gfs2_glock_dq(gh);
 out_uninit:
 	gfs2_holder_uninit(gh);
-	return ret;
+	if (ret < 0)
+		return ret;
+	return written;
 }
 
 static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
@@ -836,10 +878,20 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
-	size_t len = iov_iter_count(from);
-	loff_t offset = iocb->ki_pos;
+	size_t prev_count = 0, window_size = 0;
+	size_t read = 0;
 	ssize_t ret;
 
+	/*
+	 * In this function, we disable page faults when we're holding the
+	 * inode glock while doing I/O.  If a page fault occurs, we indicate
+	 * that the inode glock may be dropped, fault in the pages manually,
+	 * and retry.
+	 *
+	 * For writes, iomap_dio_rw only triggers manual page faults, so we
+	 * don't need to disable physical ones.
+	 */
+
 	/*
 	 * Deferred lock, even if its a write, since we do no allocation on
 	 * this path. All we need to change is the atime, and this lock mode
@@ -849,22 +901,45 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 	 * VFS does.
 	 */
 	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+retry:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
-
+retry_under_glock:
 	/* Silently fall back to buffered I/O when writing beyond EOF */
-	if (offset + len > i_size_read(&ip->i_inode))
+	if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
 		goto out;
 
-	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0, 0);
+	from->nofault = true;
+	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
+			   IOMAP_DIO_PARTIAL, read);
+	from->nofault = false;
+
 	if (ret == -ENOTBLK)
 		ret = 0;
+	if (ret > 0)
+		read = ret;
+
+	if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
+		size_t leftover;
+
+		gfs2_holder_allow_demote(gh);
+		leftover = fault_in_iov_iter_readable(from, window_size);
+		gfs2_holder_disallow_demote(gh);
+		if (leftover != window_size) {
+			if (!gfs2_holder_queued(gh))
+				goto retry;
+			goto retry_under_glock;
+		}
+	}
 out:
-	gfs2_glock_dq(gh);
+	if (gfs2_holder_queued(gh))
+		gfs2_glock_dq(gh);
 out_uninit:
 	gfs2_holder_uninit(gh);
-	return ret;
+	if (ret < 0)
+		return ret;
+	return read;
 }
 
 static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
-- 
2.33.1


  parent reply	other threads:[~2022-04-02 10:28 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-02 10:25 [PATCH 00/17 stable-5.15.y] Fix mmap + page fault deadlocks Anand Jain
2022-04-02 10:25 ` [PATCH 01/17 stable-5.15.y] powerpc/kvm: Fix kvm_use_magic_page Anand Jain
2022-04-02 10:25 ` [PATCH 02/17 stable-5.15.y] gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable} Anand Jain
2022-04-02 10:25 ` [PATCH 03/17 stable-5.15.y] iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable Anand Jain
2022-04-02 10:25 ` [PATCH 04/17 stable-5.15.y] iov_iter: Introduce fault_in_iov_iter_writeable Anand Jain
2022-04-02 10:25 ` [PATCH 05/17 stable-5.15.y] gfs2: Add wrapper for iomap_file_buffered_write Anand Jain
2022-04-02 10:25 ` [PATCH 06/17 stable-5.15.y] gfs2: Clean up function may_grant Anand Jain
2022-04-02 10:25 ` [PATCH 07/17 stable-5.15.y] gfs2: Move the inode glock locking to gfs2_file_buffered_write Anand Jain
2022-04-02 10:25 ` [PATCH 08/17 stable-5.15.y] gfs2: Eliminate ip->i_gh Anand Jain
2022-04-02 10:25 ` [PATCH 09/17 stable-5.15.y] gfs2: Fix mmap + page fault deadlocks for buffered I/O Anand Jain
2022-04-02 10:25 ` [PATCH 10/17 stable-5.15.y] iomap: Fix iomap_dio_rw return value for user copies Anand Jain
2022-04-02 10:25 ` [PATCH 11/17 stable-5.15.y] iomap: Support partial direct I/O on user copy failures Anand Jain
2022-04-02 10:25 ` [PATCH 12/17 stable-5.15.y] iomap: Add done_before argument to iomap_dio_rw Anand Jain
2022-04-02 10:25 ` [PATCH 13/17 stable-5.15.y] gup: Introduce FOLL_NOFAULT flag to disable page faults Anand Jain
2022-04-02 10:25 ` [PATCH 14/17 stable-5.15.y] iov_iter: Introduce nofault " Anand Jain
2022-04-02 10:25 ` Anand Jain [this message]
2022-04-02 10:25 ` [PATCH 16/17 stable-5.15.y] gfs2: Introduce flag for glock holder auto-demotion Anand Jain
2022-04-02 10:25 ` [PATCH 17/17 stable-5.15.y] btrfs: fix deadlock due to page faults during direct IO reads and writes Anand Jain
2022-04-04  9:41 ` [PATCH 00/17 stable-5.15.y] Fix mmap + page fault deadlocks Filipe Manana
2022-04-05 11:32   ` Anand Jain
2022-04-11  7:49     ` Greg KH
2022-04-12  5:14       ` Anand Jain

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=78e2ee8d1958c50a0fbadb15dc18df91b538bdf7.1648636044.git.anand.jain@oracle.com \
    --to=anand.jain@oracle.com \
    --cc=agruenba@redhat.com \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.