From: Andreas Gruenbacher <agruenba@redhat.com>
To: Alexander Viro <viro@zeniv.linux.org.uk>, cluster-devel@redhat.com
Cc: linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
Jan Kara <jack@suse.cz>,
Andreas Gruenbacher <agruenba@redhat.com>
Subject: [PATCH 6/6] gfs2: Fix mmap + page fault deadlocks (part 2)
Date: Thu, 20 May 2021 14:25:36 +0200 [thread overview]
Message-ID: <20210520122536.1596602-7-agruenba@redhat.com> (raw)
In-Reply-To: <20210520122536.1596602-1-agruenba@redhat.com>
Now that we handle self-recursion on the inode glock in gfs2_fault and
gfs2_page_mkwrite, we need to take care of more complex deadlock
scenarios like the following (example by Jan Kara):
Two independent processes P1, P2. Two files F1, F2, and two mappings M1,
M2 where M1 is a mapping of F1, M2 is a mapping of F2. Now P1 does DIO
to F1 with M2 as a buffer, P2 does DIO to F2 with M1 as a buffer. They
can race like:
P1 P2
read() read()
gfs2_file_read_iter() gfs2_file_read_iter()
gfs2_file_direct_read() gfs2_file_direct_read()
locks glock of F1 locks glock of F2
iomap_dio_rw() iomap_dio_rw()
bio_iov_iter_get_pages() bio_iov_iter_get_pages()
<fault in M2> <fault in M1>
gfs2_fault() gfs2_fault()
tries to grab glock of F2 tries to grab glock of F1
Those kinds of scenarios are much harder to reproduce than
self-recursion.
We deal with such situations by using the LM_FLAG_OUTER flag to mark
"outer" glock taking. Then, when taking an "inner" glock, we use the
LM_FLAG_TRY flag so that locking attempts that don't immediately succeed
will be aborted. In case of a failed locking attempt, we "unroll" to
where the "outer" glock was taken, drop the "outer" glock, and fault in
the first offending user page. This will re-trigger the "inner" locking
attempt but without the LM_FLAG_TRY flag. Once that has happened, we
re-acquire the "outer" glock and retry the original operation.
Reported-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
fs/gfs2/bmap.c | 3 ++-
fs/gfs2/file.c | 62 +++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 58 insertions(+), 7 deletions(-)
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 2ff501c413f4..82e4506984e3 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -967,7 +967,8 @@ static int gfs2_write_lock(struct inode *inode)
struct gfs2_sbd *sdp = GFS2_SB(inode);
int error;
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_OUTER,
+ &ip->i_gh);
error = gfs2_glock_nq(&ip->i_gh);
if (error)
goto out_uninit;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7d88abb4629b..8b26893f8dc6 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -431,21 +431,30 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
vm_fault_t ret = VM_FAULT_LOCKED;
struct gfs2_holder gh;
unsigned int length;
+ u16 flags = 0;
loff_t size;
int err;
sb_start_pagefault(inode->i_sb);
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (current_holds_glock())
+ flags |= LM_FLAG_TRY;
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, flags, &gh);
if (likely(!outer_gh)) {
err = gfs2_glock_nq(&gh);
if (err) {
ret = block_page_mkwrite_return(err);
+ if (err == GLR_TRYFAILED) {
+ set_current_needs_retry(true);
+ ret = VM_FAULT_SIGBUS;
+ }
goto out_uninit;
}
} else {
if (!gfs2_holder_is_compatible(outer_gh, LM_ST_EXCLUSIVE)) {
/* We could try to upgrade outer_gh here. */
+ set_current_needs_retry(true);
ret = VM_FAULT_SIGBUS;
goto out_uninit;
}
@@ -568,20 +577,28 @@ static vm_fault_t gfs2_fault(struct vm_fault *vmf)
struct gfs2_holder *outer_gh = gfs2_glock_is_locked_by_me(ip->i_gl);
struct gfs2_holder gh;
vm_fault_t ret;
- u16 state;
+ u16 state, flags = 0;
int err;
+ if (current_holds_glock())
+ flags |= LM_FLAG_TRY;
+
state = (vmf->flags & FAULT_FLAG_WRITE) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
- gfs2_holder_init(ip->i_gl, state, 0, &gh);
+ gfs2_holder_init(ip->i_gl, state, flags, &gh);
if (likely(!outer_gh)) {
err = gfs2_glock_nq(&gh);
if (err) {
ret = block_page_mkwrite_return(err);
+ if (err == GLR_TRYFAILED) {
+ set_current_needs_retry(true);
+ ret = VM_FAULT_SIGBUS;
+ }
goto out_uninit;
}
} else {
if (!gfs2_holder_is_compatible(outer_gh, state)) {
/* We could try to upgrade outer_gh here. */
+ set_current_needs_retry(true);
ret = VM_FAULT_SIGBUS;
goto out_uninit;
}
@@ -807,13 +824,21 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
if (!count)
return 0; /* skip atime */
- gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, LM_FLAG_OUTER, gh);
+retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
gfs2_glock_dq(gh);
+ if (unlikely(current_needs_retry())) {
+ set_current_needs_retry(false);
+ if (ret == -EFAULT) {
+ if (!iov_iter_fault_in_writeable(to, PAGE_SIZE))
+ goto retry;
+ }
+ }
out_uninit:
gfs2_holder_uninit(gh);
return ret;
@@ -837,7 +862,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
* unfortunately, have the option of only flushing a range like the
* VFS does.
*/
- gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, LM_FLAG_OUTER, gh);
+retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
@@ -851,6 +877,13 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
ret = 0;
out:
gfs2_glock_dq(gh);
+ if (unlikely(current_needs_retry())) {
+ set_current_needs_retry(false);
+ if (ret == -EFAULT) {
+ if (!iov_iter_fault_in_readable(from, PAGE_SIZE))
+ goto retry;
+ }
+ }
out_uninit:
gfs2_holder_uninit(gh);
return ret;
@@ -883,7 +916,8 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
}
ip = GFS2_I(iocb->ki_filp->f_mapping->host);
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_OUTER, &gh);
+retry:
ret = gfs2_glock_nq(&gh);
if (ret)
goto out_uninit;
@@ -891,6 +925,13 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (ret > 0)
written += ret;
gfs2_glock_dq(&gh);
+ if (unlikely(current_needs_retry())) {
+ set_current_needs_retry(false);
+ if (ret == -EFAULT) {
+ if (!iov_iter_fault_in_writeable(to, PAGE_SIZE))
+ goto retry;
+ }
+ }
out_uninit:
gfs2_holder_uninit(&gh);
return written ? written : ret;
@@ -902,9 +943,18 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct iov_iter *fro
struct inode *inode = file_inode(file);
ssize_t ret;
+retry:
current->backing_dev_info = inode_to_bdi(inode);
ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
current->backing_dev_info = NULL;
+ if (unlikely(current_needs_retry())) {
+ set_current_needs_retry(false);
+ if (ret == -EFAULT) {
+ if (!iov_iter_fault_in_readable(from, PAGE_SIZE))
+ goto retry;
+ }
+ }
+
return ret;
}
--
2.26.3
next prev parent reply other threads:[~2021-05-20 12:27 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-05-20 12:25 [PATCH 0/6] gfs2: Handle page faults during read and write Andreas Gruenbacher
2021-05-20 12:25 ` [PATCH 1/6] gfs2: Fix mmap + page fault deadlocks (part 1) Andreas Gruenbacher
2021-05-20 12:25 ` [PATCH 2/6] iov_iter: Add iov_iter_fault_in_writeable() Andreas Gruenbacher
2021-05-20 12:25 ` [PATCH 3/6] gfs2: Add wrappers for accessing journal_info Andreas Gruenbacher
2021-05-20 12:25 ` [PATCH 4/6] gfs2: Encode glock holding and retry flags in journal_info Andreas Gruenbacher
2021-05-20 12:25 ` [PATCH 5/6] gfs2: Add LM_FLAG_OUTER glock holder flag Andreas Gruenbacher
2021-05-20 12:25 ` Andreas Gruenbacher [this message]
2021-05-20 13:30 ` [PATCH 6/6] gfs2: Fix mmap + page fault deadlocks (part 2) Jan Kara
2021-05-20 14:07 ` Andreas Gruenbacher
2021-05-21 15:23 ` Jan Kara
2021-05-21 15:46 ` Andreas Gruenbacher
2021-05-24 8:39 ` Jan Kara
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210520122536.1596602-7-agruenba@redhat.com \
--to=agruenba@redhat.com \
--cc=cluster-devel@redhat.com \
--cc=jack@suse.cz \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).