From: Andreas Gruenbacher <agruenba@redhat.com>
To: Theodore Ts'o <tytso@mit.edu>
Cc: Andreas Gruenbacher <agruenba@redhat.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Catalin Marinas <catalin.marinas@arm.com>,
Linus Torvalds <torvalds@linux-foundation.org>,
Paul Mackerras <paulus@ozlabs.org>,
Alexander Viro <viro@zeniv.linux.org.uk>,
Christoph Hellwig <hch@infradead.org>,
"Darrick J. Wong" <djwong@kernel.org>, Jan Kara <jack@suse.cz>,
Matthew Wilcox <willy@infradead.org>,
cluster-devel@redhat.com, linux-fsdevel@vger.kernel.org,
linux-kernel@vger.kernel.org, ocfs2-devel@oss.oracle.com,
kvm-ppc@vger.kernel.org, linux-btrfs@vger.kernel.org
Subject: Re: [PATCH v8 00/17] gfs2: Fix mmap + page fault deadlocks
Date: Wed, 27 Oct 2021 23:21:38 +0200 [thread overview]
Message-ID: <20211027212138.3722977-1-agruenba@redhat.com> (raw)
In-Reply-To: <20211026094430.3669156-1-agruenba@redhat.com>
One of the arguments against Dave Hansen's patch that eliminates the
pre-faulting was that it doubles the number of page faults in the slow
case. This can be avoided by using get_user_pages() to do the
"post-faulting", though. For what it's worth, here's a patch for that
(on top of this series).
Andreas
--
fs: Avoid predictable page faults for sys_write() user buffer pages
Introduce a new fault_in_iov_iter_slow_readable() helper for faulting in
an iterator via get_user_pages() instead of triggering page faults.
This is slower than a simple memory read when the underlying pages are
resident, but avoids the page fault overhead when the underlying pages
need to be faulted in.
Use fault_in_iov_iter_slow_readable() in generic_perform_write() and
iomap_write_iter() when reading from the user buffer fails.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
fs/iomap/buffered-io.c | 2 +-
include/linux/pagemap.h | 3 ++-
include/linux/uio.h | 17 ++++++++++++++++-
lib/iov_iter.c | 10 ++++++----
mm/filemap.c | 2 +-
mm/gup.c | 10 ++++++----
6 files changed, 32 insertions(+), 12 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d8809cd9ab31..15a0b4bb9528 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -770,7 +770,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
bytes = copied;
goto again;
}
- if (fault_in_iov_iter_readable(i, bytes) != bytes)
+ if (fault_in_iov_iter_slow_readable(i, bytes) != bytes)
goto again;
status = -EFAULT;
break;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2f7dd14083d9..43844ed5675f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -736,8 +736,9 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
* Fault in userspace address range.
*/
size_t fault_in_writeable(char __user *uaddr, size_t size);
-size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
size_t fault_in_readable(const char __user *uaddr, size_t size);
+size_t __fault_in_slow(const char __user *uaddr, size_t size,
+ unsigned int flags);
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 6350354f97e9..b071f4445059 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/thread_info.h>
#include <uapi/linux/uio.h>
+#include <linux/mm.h>
struct page;
struct pipe_inode_info;
@@ -135,7 +136,21 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
-size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
+size_t __fault_in_iov_iter_slow(const struct iov_iter *i, size_t bytes,
+ unsigned int flags);
+
+static inline size_t fault_in_iov_iter_slow_readable(const struct iov_iter *i,
+ size_t bytes)
+{
+ return __fault_in_iov_iter_slow(i, bytes, 0);
+}
+
+static inline size_t fault_in_iov_iter_writeable(const struct iov_iter *i,
+ size_t bytes)
+{
+ return __fault_in_iov_iter_slow(i, bytes, FOLL_WRITE);
+}
+
size_t iov_iter_single_seg_count(const struct iov_iter *i);
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 66a740e6e153..73789a5409f6 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -468,9 +468,10 @@ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
EXPORT_SYMBOL(fault_in_iov_iter_readable);
/*
- * fault_in_iov_iter_writeable - fault in iov iterator for writing
+ * __fault_in_iov_iter_slow - fault in iov iterator for reading/writing
* @i: iterator
* @size: maximum length
+ * @flags: FOLL_* flags (FOLL_WRITE for writing)
*
* Faults in the iterator using get_user_pages(), i.e., without triggering
* hardware page faults. This is primarily useful when we already know that
@@ -481,7 +482,8 @@ EXPORT_SYMBOL(fault_in_iov_iter_readable);
*
* Always returns 0 for non-user-space iterators.
*/
-size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
+size_t __fault_in_iov_iter_slow(const struct iov_iter *i, size_t size,
+ unsigned int flags)
{
if (iter_is_iovec(i)) {
size_t count = min(size, iov_iter_count(i));
@@ -495,7 +497,7 @@ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
if (unlikely(!len))
continue;
- ret = fault_in_safe_writeable(p->iov_base + skip, len);
+ ret = __fault_in_slow(p->iov_base + skip, len, flags);
count -= len - ret;
if (ret)
break;
@@ -504,7 +506,7 @@ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
}
return 0;
}
-EXPORT_SYMBOL(fault_in_iov_iter_writeable);
+EXPORT_SYMBOL(__fault_in_iov_iter_slow);
void iov_iter_init(struct iov_iter *i, unsigned int direction,
const struct iovec *iov, unsigned long nr_segs,
diff --git a/mm/filemap.c b/mm/filemap.c
index 467cdb7d086d..7ca76f4aa974 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3787,7 +3787,7 @@ ssize_t generic_perform_write(struct file *file,
bytes = copied;
goto again;
}
- if (fault_in_iov_iter_readable(i, bytes) != bytes)
+ if (fault_in_iov_iter_slow_readable(i, bytes) != bytes)
goto again;
status = -EFAULT;
break;
diff --git a/mm/gup.c b/mm/gup.c
index e1c7e4bde11f..def9f478a621 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1694,9 +1694,10 @@ size_t fault_in_writeable(char __user *uaddr, size_t size)
EXPORT_SYMBOL(fault_in_writeable);
/*
- * fault_in_safe_writeable - fault in an address range for writing
+ * __fault_in_slow - fault in an address range for reading/writing
* @uaddr: start of address range
* @size: length of address range
+ * @flags: FOLL_* flags (FOLL_WRITE for writing)
*
* Faults in an address range using get_user_pages, i.e., without triggering
* hardware page faults. This is primarily useful when we already know that
@@ -1711,7 +1712,8 @@ EXPORT_SYMBOL(fault_in_writeable);
* Returns the number of bytes not faulted in, like copy_to_user() and
* copy_from_user().
*/
-size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
+size_t __fault_in_slow(const char __user *uaddr, size_t size,
+ unsigned int flags)
{
unsigned long start = (unsigned long)untagged_addr(uaddr);
unsigned long end, nstart, nend;
@@ -1743,7 +1745,7 @@ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
nr_pages = (nend - nstart) / PAGE_SIZE;
ret = __get_user_pages_locked(mm, nstart, nr_pages,
NULL, NULL, &locked,
- FOLL_TOUCH | FOLL_WRITE);
+ FOLL_TOUCH | flags);
if (ret <= 0)
break;
nend = nstart + ret * PAGE_SIZE;
@@ -1754,7 +1756,7 @@ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
return 0;
return size - min_t(size_t, nstart - start, size);
}
-EXPORT_SYMBOL(fault_in_safe_writeable);
+EXPORT_SYMBOL(__fault_in_slow);
/**
* fault_in_readable - fault in userspace address range for reading
--
2.26.3
prev parent reply other threads:[~2021-10-27 21:23 UTC|newest]
Thread overview: 47+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-10-19 13:41 [PATCH v8 00/17] gfs2: Fix mmap + page fault deadlocks Andreas Gruenbacher
2021-10-19 13:41 ` [PATCH v8 01/17] iov_iter: Fix iov_iter_get_pages{,_alloc} page fault return value Andreas Gruenbacher
2021-10-19 13:41 ` [PATCH v8 02/17] powerpc/kvm: Fix kvm_use_magic_page Andreas Gruenbacher
2021-10-19 13:41 ` [PATCH v8 03/17] gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable} Andreas Gruenbacher
2021-10-19 13:41 ` [PATCH v8 04/17] iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable Andreas Gruenbacher
2021-10-19 13:41 ` [PATCH v8 05/17] iov_iter: Introduce fault_in_iov_iter_writeable Andreas Gruenbacher
2021-10-20 16:25 ` Catalin Marinas
2021-10-19 13:41 ` [PATCH v8 06/17] gfs2: Add wrapper for iomap_file_buffered_write Andreas Gruenbacher
2021-10-19 13:41 ` [PATCH v8 07/17] gfs2: Clean up function may_grant Andreas Gruenbacher
2021-10-19 13:41 ` [PATCH v8 08/17] gfs2: Introduce flag for glock holder auto-demotion Andreas Gruenbacher
2021-10-19 13:41 ` [PATCH v8 09/17] gfs2: Move the inode glock locking to gfs2_file_buffered_write Andreas Gruenbacher
2021-10-19 13:41 ` [PATCH v8 10/17] gfs2: Eliminate ip->i_gh Andreas Gruenbacher
2021-10-19 13:41 ` [PATCH v8 11/17] gfs2: Fix mmap + page fault deadlocks for buffered I/O Andreas Gruenbacher
2021-10-19 13:41 ` [PATCH v8 12/17] iomap: Fix iomap_dio_rw return value for user copies Andreas Gruenbacher
2021-10-19 13:42 ` [PATCH v8 13/17] iomap: Support partial direct I/O on user copy failures Andreas Gruenbacher
2021-10-19 13:42 ` [PATCH v8 14/17] iomap: Add done_before argument to iomap_dio_rw Andreas Gruenbacher
2021-10-19 15:51 ` Darrick J. Wong
2021-10-19 19:30 ` Andreas Gruenbacher
2021-10-20 1:57 ` Darrick J. Wong
2021-10-19 13:42 ` [PATCH v8 15/17] gup: Introduce FOLL_NOFAULT flag to disable page faults Andreas Gruenbacher
2021-10-19 13:42 ` [PATCH v8 16/17] iov_iter: Introduce nofault " Andreas Gruenbacher
2021-10-19 13:42 ` [PATCH v8 17/17] gfs2: Fix mmap + page fault deadlocks for direct I/O Andreas Gruenbacher
2021-10-19 15:40 ` [PATCH v8 00/17] gfs2: Fix mmap + page fault deadlocks Linus Torvalds
2021-10-19 16:00 ` Bob Peterson
2021-10-20 16:36 ` Catalin Marinas
2021-10-20 20:11 ` Linus Torvalds
2021-10-20 22:44 ` Catalin Marinas
2021-10-21 6:19 ` Linus Torvalds
2021-10-22 18:06 ` Catalin Marinas
2021-10-22 19:22 ` Linus Torvalds
2021-10-25 19:00 ` Andreas Gruenbacher
2021-10-26 18:24 ` Catalin Marinas
2021-10-26 18:50 ` Linus Torvalds
2021-10-26 19:18 ` Linus Torvalds
2021-10-27 19:13 ` Catalin Marinas
2021-10-27 21:14 ` Linus Torvalds
2021-10-28 21:20 ` Catalin Marinas
2021-10-28 21:40 ` Catalin Marinas
2021-10-28 22:15 ` Andreas Grünbacher
2021-10-29 12:50 ` Catalin Marinas
2021-10-28 22:32 ` Linus Torvalds
2021-10-29 17:50 ` Catalin Marinas
2021-10-29 18:47 ` Linus Torvalds
2021-10-25 18:24 ` Andreas Gruenbacher
2021-10-26 5:12 ` Theodore Ts'o
2021-10-26 9:44 ` Andreas Gruenbacher
2021-10-27 21:21 ` Andreas Gruenbacher [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20211027212138.3722977-1-agruenba@redhat.com \
--to=agruenba@redhat.com \
--cc=catalin.marinas@arm.com \
--cc=cluster-devel@redhat.com \
--cc=dave.hansen@linux.intel.com \
--cc=djwong@kernel.org \
--cc=hch@infradead.org \
--cc=jack@suse.cz \
--cc=kvm-ppc@vger.kernel.org \
--cc=linux-btrfs@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=ocfs2-devel@oss.oracle.com \
--cc=paulus@ozlabs.org \
--cc=torvalds@linux-foundation.org \
--cc=tytso@mit.edu \
--cc=viro@zeniv.linux.org.uk \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).