[RFC PATCH 2/3] fs: Add FAULT_FLAG_CACHED flag for filemap_fault

From: Andreas Gruenbacher <agruenba@redhat.com>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>,
	Konstantin Khlebnikov <khlebnikov@yandex-team.ru>,
	"Kirill A. Shutemov" <kirill@shutemov.name>,
	linux-mm@kvack.org, Andrew Morton <akpm@linux-foundation.org>,
	linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Johannes Weiner <hannes@cmpxchg.org>,
	cluster-devel@redhat.com, Ronnie Sahlberg <lsahlber@redhat.com>,
	Steve French <sfrench@samba.org>,
	Bob Peterson <rpeterso@redhat.com>,
	Andreas Gruenbacher <agruenba@redhat.com>
Subject: [RFC PATCH 2/3] fs: Add FAULT_FLAG_CACHED flag for filemap_fault
Date: Sat, 23 Nov 2019 00:53:23 +0100	[thread overview]
Message-ID: <20191122235324.17245-3-agruenba@redhat.com> (raw)
In-Reply-To: <20191122235324.17245-1-agruenba@redhat.com>

Add a FAULT_FLAG_CACHED flag which tells filemap_fault() to look only at
the page cache, without triggering filesystem I/O for the faulting page
or for readahead.  When filesystem I/O would be needed, filemap_fault()
returns VM_FAULT_RETRY instead, without dropping the mmap_sem.

This allows the caller to tentatively satisfy a minor page fault out of
the page cache, and to retry the operation after taking the necessary
steps when that isn't possible.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 include/linux/mm.h |  4 +++-
 mm/filemap.c       | 43 ++++++++++++++++++++++++++++++-------------
 2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2adf95b3f9c..b3317e4b2607 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -392,6 +392,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_USER		0x40	/* The fault originated in userspace */
 #define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100	/* The fault was during an instruction fetch */
+#define FAULT_FLAG_CACHED		0x200	/* Only look at the page cache */
 
 #define FAULT_FLAG_TRACE \
 	{ FAULT_FLAG_WRITE,		"WRITE" }, \
@@ -402,7 +403,8 @@ extern pgprot_t protection_map[16];
 	{ FAULT_FLAG_TRIED,		"TRIED" }, \
 	{ FAULT_FLAG_USER,		"USER" }, \
 	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
-	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }
+	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }, \
+	{ FAULT_FLAG_CACHED,		"CACHED" }
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
diff --git a/mm/filemap.c b/mm/filemap.c
index 024ff0b5fcb6..2297fad3b03a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2383,7 +2383,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
 	 * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT
 	 * is supposed to work. We have way too many special cases..
 	 */
-	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+	if (vmf->flags & (FAULT_FLAG_RETRY_NOWAIT | FAULT_FLAG_CACHED))
 		return 0;
 
 	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
@@ -2460,26 +2460,28 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
  * so we want to possibly extend the readahead further.  We return the file that
  * was pinned if we have to drop the mmap_sem in order to do IO.
  */
-static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
-					    struct page *page)
+static vm_fault_t do_async_mmap_readahead(struct vm_fault *vmf,
+					  struct page *page,
+					  struct file **fpin)
 {
 	struct file *file = vmf->vma->vm_file;
 	struct file_ra_state *ra = &file->f_ra;
 	struct address_space *mapping = file->f_mapping;
-	struct file *fpin = NULL;
 	pgoff_t offset = vmf->pgoff;
 
 	/* If we don't want any read-ahead, don't bother */
 	if (vmf->vma->vm_flags & VM_RAND_READ)
-		return fpin;
+		return 0;
 	if (ra->mmap_miss > 0)
 		ra->mmap_miss--;
 	if (PageReadahead(page)) {
-		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+		if (vmf->flags & FAULT_FLAG_CACHED)
+			return VM_FAULT_RETRY;
+		*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
 		page_cache_async_readahead(mapping, ra, file,
 					   page, offset, ra->ra_pages);
 	}
-	return fpin;
+	return 0;
 }
 
 /**
@@ -2495,8 +2497,11 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
  *
  * vma->vm_mm->mmap_sem must be held on entry.
  *
- * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem
- * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
+ * This function may drop the mmap_sem before doing I/O or waiting for a page
+ * lock; this is indicated by the VM_FAULT_RETRY flag in our return value.
+ * Setting FAULT_FLAG_CACHED or FAULT_FLAG_RETRY_NOWAIT in vmf->flags will
+ * prevent dropping the mmap_sem; in that case, VM_FAULT_RETRY indicates that
+ * the mmap_sem would have been dropped.
  *
  * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
  * has not been released.
@@ -2518,9 +2523,15 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 	struct page *page;
 	vm_fault_t ret = 0;
 
-	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
-	if (unlikely(offset >= max_off))
-		return VM_FAULT_SIGBUS;
+	/*
+	 * With FAULT_FLAG_CACHED, the inode size is only guaranteed to be
+	 * valid if the page is in the page cache, so skip this early check.
+	 */
+	if (!(vmf->flags & FAULT_FLAG_CACHED)) {
+		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+		if (unlikely(offset >= max_off))
+			return VM_FAULT_SIGBUS;
+	}
 
 	/*
 	 * Do we have something in the page cache already?
@@ -2531,8 +2542,14 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
-		fpin = do_async_mmap_readahead(vmf, page);
+		ret = do_async_mmap_readahead(vmf, page, &fpin);
+		if (ret) {
+			put_page(page);
+			return ret;
+		}
 	} else if (!page) {
+		if (vmf->flags & FAULT_FLAG_CACHED)
+			goto out_retry;
 		/* No page in the page cache at all */
 		count_vm_event(PGMAJFAULT);
 		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
-- 
2.20.1