* [RFC] Bypass filesystems for reading cached pages
@ 2020-06-19 15:50 Matthew Wilcox
  2020-06-19 19:06 ` Chaitanya Kulkarni
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-19 15:50 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm, agruenba; +Cc: linux-kernel


This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
The advantage of this patch is that we can avoid taking any filesystem
lock, as long as the pages being accessed are in the cache (and we don't
need to readahead any pages into the cache).  We also avoid an indirect
function call in these cases.

I'm sure reusing the name call_read_iter() is the wrong way to go about
this, but renaming all the callers would make this a larger patch.
I'm happy to do it if something like this stands a chance of being
accepted.

Compared to Andreas' patch, I removed the -ECANCELED return value.
We can happily return 0 from generic_file_buffered_read() and it's less
code to handle that.  I bypass the attempt to read from the page cache
for O_DIRECT reads, and for inodes which have no cached pages.  Hopefully
this will avoid calling generic_file_buffered_read() for drivers which
implement read_iter() (although I haven't audited them all to check that).

This could go horribly wrong if filesystems rely on doing work in their
->read_iter implementation (eg checking i_size after acquiring their
lock) instead of keeping the page cache uptodate.  On the other hand,
the ->map_pages() method is already called without locks, so filesystems
should already be prepared for this.
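
For illustration, here is a minimal sketch of the kind of ->read_iter() that does work under its own lock and is bypassed here.  This is not code from any particular filesystem; inode_lock_shared() merely stands in for whatever lock the filesystem takes (the XFS iolock, a gfs2 glock, ...):

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t example_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	/* Whatever the filesystem checks or revalidates under this lock
	 * (i_size, remote cache coherence, ...) is skipped when the read
	 * is served straight from the page cache. */
	inode_lock_shared(inode);
	ret = generic_file_read_iter(iocb, to);
	inode_unlock_shared(inode);

	return ret;
}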

Arguably we could do something similar for writes.  I'm a little more
scared of that patch since filesystems are more likely to want to do
things to keep their files in sync for writes.

diff --git a/fs/read_write.c b/fs/read_write.c
index bbfa9b12b15e..7b899538d3c0 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -401,6 +401,41 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
 				read_write == READ ? MAY_READ : MAY_WRITE);
 }
 
+ssize_t call_read_iter(struct file *file, struct kiocb *iocb,
+				     struct iov_iter *iter)
+{
+	ssize_t written, ret = 0;
+
+	if (iocb->ki_flags & IOCB_DIRECT)
+		goto uncached;
+	if (!file->f_mapping->nrpages)
+		goto uncached;
+
+	iocb->ki_flags |= IOCB_CACHED;
+	ret = generic_file_buffered_read(iocb, iter, 0);
+	iocb->ki_flags &= ~IOCB_CACHED;
+
+	if (likely(!iov_iter_count(iter)))
+		return ret;
+
+	if (ret == -EAGAIN) {
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return ret;
+		ret = 0;
+	} else if (ret < 0) {
+		return ret;
+	}
+
+uncached:
+	written = ret;
+
+	ret = file->f_op->read_iter(iocb, iter);
+	if (ret > 0)
+		written += ret;
+
+	return written ? written : ret;
+}
+
 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 {
 	struct iovec iov = { .iov_base = buf, .iov_len = len };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c4ab4dc1cd7..0985773feffd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,7 @@ enum rw_hint {
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
+#define IOCB_CACHED		(1 << 8)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -1895,11 +1896,7 @@ struct inode_operations {
 	int (*set_acl)(struct inode *, struct posix_acl *, int);
 } ____cacheline_aligned;
 
-static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
-				     struct iov_iter *iter)
-{
-	return file->f_op->read_iter(kio, iter);
-}
+ssize_t call_read_iter(struct file *, struct kiocb *, struct iov_iter *);
 
 static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
 				      struct iov_iter *iter)
diff --git a/mm/filemap.c b/mm/filemap.c
index f0ae9a6308cb..4ee97941a1f2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2028,7 +2028,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
 		page = find_get_page(mapping, index);
 		if (!page) {
-			if (iocb->ki_flags & IOCB_NOWAIT)
+			if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED))
 				goto would_block;
 			page_cache_sync_readahead(mapping,
 					ra, filp,
@@ -2038,12 +2038,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 				goto no_cached_page;
 		}
 		if (PageReadahead(page)) {
+			if (iocb->ki_flags & IOCB_CACHED) {
+				put_page(page);
+				goto out;
+			}
 			page_cache_async_readahead(mapping,
 					ra, filp, page,
 					index, last_index - index);
 		}
 		if (!PageUptodate(page)) {
-			if (iocb->ki_flags & IOCB_NOWAIT) {
+			if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) {
 				put_page(page);
 				goto would_block;
 			}



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-19 15:50 [RFC] Bypass filesystems for reading cached pages Matthew Wilcox
@ 2020-06-19 19:06 ` Chaitanya Kulkarni
  2020-06-19 20:12   ` Matthew Wilcox
  2020-06-20  6:19 ` Amir Goldstein
  2020-06-22  0:32 ` Dave Chinner
  2 siblings, 1 reply; 18+ messages in thread
From: Chaitanya Kulkarni @ 2020-06-19 19:06 UTC (permalink / raw)
  To: Matthew Wilcox, linux-fsdevel, linux-mm, agruenba; +Cc: linux-kernel

On 6/19/20 8:50 AM, Matthew Wilcox wrote:
> This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> The advantage of this patch is that we can avoid taking any filesystem
> lock, as long as the pages being accessed are in the cache (and we don't
> need to readahead any pages into the cache).  We also avoid an indirect
> function call in these cases.
> 
> I'm sure reusing the name call_read_iter() is the wrong way to go about
> this, but renaming all the callers would make this a larger patch.
> I'm happy to do it if something like this stands a chance of being
> accepted.
> 
> Compared to Andreas' patch, I removed the -ECANCELED return value.
> We can happily return 0 from generic_file_buffered_read() and it's less
> code to handle that.  I bypass the attempt to read from the page cache
> for O_DIRECT reads, and for inodes which have no cached pages.  Hopefully
> this will avoid calling generic_file_buffered_read() for drivers which
> implement read_iter() (although I haven't audited them all to check that).
> 
> This could go horribly wrong if filesystems rely on doing work in their
> ->read_iter implementation (eg checking i_size after acquiring their
> lock) instead of keeping the page cache uptodate.  On the other hand,
> the ->map_pages() method is already called without locks, so filesystems
> should already be prepared for this.
> 
> Arguably we could do something similar for writes.  I'm a little more
> scared of that patch since filesystems are more likely to want to do
> things to keep their files in sync for writes.

I did some testing with the NVMeOF target file backend with buffered I/O
enabled, applying your patch and setting IOCB_CACHED, OR'ed ('|') with
IOCB_NOWAIT, for each I/O when calling call_read_iter_cached() [1].

The name was changed from call_read_iter() to call_read_iter_cached() [2].

For the file system I used XFS, and the device was null_blk with memory
backing, so the entire file was cached in DRAM.

Following are the performance numbers :-

IOPS/Bandwidth :-

default-page-cache:      read:  IOPS=1389k,  BW=5424MiB/s  (5688MB/s)
default-page-cache:      read:  IOPS=1381k,  BW=5395MiB/s  (5657MB/s)
default-page-cache:      read:  IOPS=1391k,  BW=5432MiB/s  (5696MB/s)
iocb-cached-page-cache:  read:  IOPS=1403k,  BW=5481MiB/s  (5747MB/s)
iocb-cached-page-cache:  read:  IOPS=1393k,  BW=5439MiB/s  (5704MB/s)
iocb-cached-page-cache:  read:  IOPS=1399k,  BW=5465MiB/s  (5731MB/s)

Submission lat :-

default-page-cache:      slat  (usec):  min=2,  max=1076,  avg=  3.71,
default-page-cache:      slat  (usec):  min=2,  max=489,   avg=  3.72,
default-page-cache:      slat  (usec):  min=2,  max=1078,  avg=  3.70,
iocb-cached-page-cache:  slat  (usec):  min=2,  max=1731,  avg=  3.70,
iocb-cached-page-cache:  slat  (usec):  min=2,  max=2115,  avg=  3.69,
iocb-cached-page-cache:  slat  (usec):  min=2,  max=3055,  avg=  3.70,

CPU :-

default-page-cache:      cpu  :  usr=10.36%,  sys=49.29%,  ctx=5207722,
default-page-cache:      cpu  :  usr=10.49%,  sys=49.15%,  ctx=5179714,
default-page-cache:      cpu  :  usr=10.56%,  sys=49.22%,  ctx=5215474,
iocb-cached-page-cache:  cpu  :  usr=10.26%,  sys=49.53%,  ctx=5262214,
iocb-cached-page-cache:  cpu  :  usr=10.43%,  sys=49.35%,  ctx=5222433,
iocb-cached-page-cache:  cpu  :  usr=10.47%,  sys=49.59%,  ctx=5247344,


Regards,
Chaitanya

[1]

diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 0abbefd9925e..02fa272399b6 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -120,6 +120,9 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
         iocb->ki_filp = req->ns->file;
         iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);

+       if (rw == READ)
+               return call_read_iter_cached(req->ns->file, iocb, &iter);
+
         return call_iter(iocb, &iter);
  }

@@ -264,7 +267,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)

         if (req->ns->buffered_io) {
                 if (likely(!req->f.mpool_alloc) &&
-                               nvmet_file_execute_io(req, IOCB_NOWAIT))
+                               nvmet_file_execute_io(req,
+                                       IOCB_NOWAIT |IOCB_CACHED))
                         return;
                 nvmet_file_submit_buffered_io(req);


[2]
diff --git a/fs/read_write.c b/fs/read_write.c
index bbfa9b12b15e..d4bf2581ff0b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -401,6 +401,41 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
                                 read_write == READ ? MAY_READ : MAY_WRITE);
  }

+ssize_t call_read_iter_cached(struct file *file, struct kiocb *iocb,
+                                    struct iov_iter *iter)
+{
+       ssize_t written, ret = 0;
+
+       if (iocb->ki_flags & IOCB_DIRECT)
+               goto uncached;
+       if (!file->f_mapping->nrpages)
+               goto uncached;
+
+       iocb->ki_flags |= IOCB_CACHED;
+       ret = generic_file_buffered_read(iocb, iter, 0);
+       iocb->ki_flags &= ~IOCB_CACHED;
+
+       if (likely(!iov_iter_count(iter)))
+               return ret;
+
+       if (ret == -EAGAIN) {
+               if (iocb->ki_flags & IOCB_NOWAIT)
+                       return ret;
+               ret = 0;
+       } else if (ret < 0) {
+               return ret;
+       }
+
+uncached:
+       written = ret;
+
+       ret = file->f_op->read_iter(iocb, iter);
+       if (ret > 0)
+               written += ret;
+
+       return written ? written : ret;
+}
+
  static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
  {
         struct iovec iov = { .iov_base = buf, .iov_len = len };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c4ab4dc1cd7..3fc8b00cd140 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,7 @@ enum rw_hint {
  #define IOCB_SYNC              (1 << 5)
  #define IOCB_WRITE             (1 << 6)
  #define IOCB_NOWAIT            (1 << 7)
+#define IOCB_CACHED            (1 << 8)

  struct kiocb {
         struct file             *ki_filp;
@@ -1901,6 +1902,8 @@ static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
         return file->f_op->read_iter(kio, iter);
  }

+ssize_t call_read_iter_cached(struct file *, struct kiocb *, struct iov_iter *);
+
  static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
                                       struct iov_iter *iter)
  {
diff --git a/mm/filemap.c b/mm/filemap.c
index f0ae9a6308cb..4ee97941a1f2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2028,7 +2028,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,

                 page = find_get_page(mapping, index);
                 if (!page) {
-                       if (iocb->ki_flags & IOCB_NOWAIT)
+                       if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED))
                                 goto would_block;
                         page_cache_sync_readahead(mapping,
                                         ra, filp,
@@ -2038,12 +2038,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
                                 goto no_cached_page;
                 }
                 if (PageReadahead(page)) {
+                       if (iocb->ki_flags & IOCB_CACHED) {
+                               put_page(page);
+                               goto out;
+                       }
                         page_cache_async_readahead(mapping,
                                         ra, filp, page,
                                         index, last_index - index);
                 }
                 if (!PageUptodate(page)) {
-                       if (iocb->ki_flags & IOCB_NOWAIT) {
+                       if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) {
                                 put_page(page);
                                 goto would_block;
                         }
-- 
2.26.0



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-19 19:06 ` Chaitanya Kulkarni
@ 2020-06-19 20:12   ` Matthew Wilcox
  2020-06-19 21:25     ` Chaitanya Kulkarni
  0 siblings, 1 reply; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-19 20:12 UTC (permalink / raw)
  To: Chaitanya Kulkarni; +Cc: linux-fsdevel, linux-mm, agruenba, linux-kernel

On Fri, Jun 19, 2020 at 07:06:19PM +0000, Chaitanya Kulkarni wrote:
> On 6/19/20 8:50 AM, Matthew Wilcox wrote:
> > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > The advantage of this patch is that we can avoid taking any filesystem
> > lock, as long as the pages being accessed are in the cache (and we don't
> > need to readahead any pages into the cache).  We also avoid an indirect
> > function call in these cases.
> 
> I did some testing with the NVMeOF target file backend with buffered I/O
> enabled, applying your patch and setting IOCB_CACHED, OR'ed ('|') with
> IOCB_NOWAIT, for each I/O when calling call_read_iter_cached() [1].
>
> The name was changed from call_read_iter() to call_read_iter_cached() [2].
> 
> For the file system I used XFS, and the device was null_blk with memory
> backing, so the entire file was cached in DRAM.

Thanks for testing!  Can you elaborate a little more on what the test does?
Are there many threads or tasks?  What is the I/O path?  XFS on an NVMEoF
device, talking over loopback to localhost with nullblk as the server?

The nullblk device will have all the data in its pagecache, but each XFS
file will have an empty pagecache initially.  Then it'll be populated by
the test, so does the I/O pattern revisit previously accessed data at all?

> Following are the performance numbers :-
> 
> IOPS/Bandwidth :-
> 
> default-page-cache:      read:  IOPS=1389k,  BW=5424MiB/s  (5688MB/s)
> default-page-cache:      read:  IOPS=1381k,  BW=5395MiB/s  (5657MB/s)
> default-page-cache:      read:  IOPS=1391k,  BW=5432MiB/s  (5696MB/s)
> iocb-cached-page-cache:  read:  IOPS=1403k,  BW=5481MiB/s  (5747MB/s)
> iocb-cached-page-cache:  read:  IOPS=1393k,  BW=5439MiB/s  (5704MB/s)
> iocb-cached-page-cache:  read:  IOPS=1399k,  BW=5465MiB/s  (5731MB/s)

That doesn't look bad at all ... about 0.7% increase in IOPS.

> Submission lat :-
> 
> default-page-cache:      slat  (usec):  min=2,  max=1076,  avg=  3.71,
> default-page-cache:      slat  (usec):  min=2,  max=489,   avg=  3.72,
> default-page-cache:      slat  (usec):  min=2,  max=1078,  avg=  3.70,
> iocb-cached-page-cache:  slat  (usec):  min=2,  max=1731,  avg=  3.70,
> iocb-cached-page-cache:  slat  (usec):  min=2,  max=2115,  avg=  3.69,
> iocb-cached-page-cache:  slat  (usec):  min=2,  max=3055,  avg=  3.70,

Average latency unchanged, max latency up a little ... makes sense,
since we'll do a little more work in the worst case.

> @@ -264,7 +267,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
> 
>          if (req->ns->buffered_io) {
>                  if (likely(!req->f.mpool_alloc) &&
> -                               nvmet_file_execute_io(req, IOCB_NOWAIT))
> +                               nvmet_file_execute_io(req,
> +                                       IOCB_NOWAIT |IOCB_CACHED))
>                          return;
>                  nvmet_file_submit_buffered_io(req);

You'll need a fallback path here, right?  IOCB_CACHED can get part-way
through doing a request, and then need to be finished off after taking
the mutex.



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-19 20:12   ` Matthew Wilcox
@ 2020-06-19 21:25     ` Chaitanya Kulkarni
  0 siblings, 0 replies; 18+ messages in thread
From: Chaitanya Kulkarni @ 2020-06-19 21:25 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, linux-mm, agruenba, linux-kernel

Matthew,

On 6/19/20 1:12 PM, Matthew Wilcox wrote:
> On Fri, Jun 19, 2020 at 07:06:19PM +0000, Chaitanya Kulkarni wrote:
>> On 6/19/20 8:50 AM, Matthew Wilcox wrote:
>>> This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
>>> The advantage of this patch is that we can avoid taking any filesystem
>>> lock, as long as the pages being accessed are in the cache (and we don't
>>> need to readahead any pages into the cache).  We also avoid an indirect
>>> function call in these cases.
>>
>> I did some testing with the NVMeOF target file backend with buffered I/O
>> enabled, applying your patch and setting IOCB_CACHED, OR'ed ('|') with
>> IOCB_NOWAIT, for each I/O when calling call_read_iter_cached() [1].
>>
>> The name was changed from call_read_iter() to call_read_iter_cached() [2].
>>
>> For the file system I used XFS, and the device was null_blk with memory
>> backing, so the entire file was cached in DRAM.
> 
> Thanks for testing!  Can you elaborate a little more on what the test does?
> Are there many threads or tasks?  What is the I/O path?  XFS on an NVMEoF
> device, talking over loopback to localhost with nullblk as the server?
> 
> The nullblk device will have all the data in its pagecache, but each XFS
> file will have an empty pagecache initially.  Then it'll be populated by
> the test, so does the I/O pattern revisit previously accessed data at all?
> 

Will share details shortly.

>> Following are the performance numbers :-
>>
>> IOPS/Bandwidth :-
>>
>> default-page-cache:      read:  IOPS=1389k,  BW=5424MiB/s  (5688MB/s)
>> default-page-cache:      read:  IOPS=1381k,  BW=5395MiB/s  (5657MB/s)
>> default-page-cache:      read:  IOPS=1391k,  BW=5432MiB/s  (5696MB/s)
>> iocb-cached-page-cache:  read:  IOPS=1403k,  BW=5481MiB/s  (5747MB/s)
>> iocb-cached-page-cache:  read:  IOPS=1393k,  BW=5439MiB/s  (5704MB/s)
>> iocb-cached-page-cache:  read:  IOPS=1399k,  BW=5465MiB/s  (5731MB/s)
> 
> That doesn't look bad at all ... about 0.7% increase in IOPS.
> 
>> Submission lat :-
>>
>> default-page-cache:      slat  (usec):  min=2,  max=1076,  avg=  3.71,
>> default-page-cache:      slat  (usec):  min=2,  max=489,   avg=  3.72,
>> default-page-cache:      slat  (usec):  min=2,  max=1078,  avg=  3.70,
>> iocb-cached-page-cache:  slat  (usec):  min=2,  max=1731,  avg=  3.70,
>> iocb-cached-page-cache:  slat  (usec):  min=2,  max=2115,  avg=  3.69,
>> iocb-cached-page-cache:  slat  (usec):  min=2,  max=3055,  avg=  3.70,
> 
> Average latency unchanged, max latency up a little ... makes sense,
> since we'll do a little more work in the worst case.
> 
>> @@ -264,7 +267,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
>>
>>           if (req->ns->buffered_io) {
>>                   if (likely(!req->f.mpool_alloc) &&
>> -                               nvmet_file_execute_io(req, IOCB_NOWAIT))
>> +                               nvmet_file_execute_io(req,
>> +                                       IOCB_NOWAIT |IOCB_CACHED))
>>                           return;
>>                   nvmet_file_submit_buffered_io(req);
> 
> You'll need a fallback path here, right?  IOCB_CACHED can get part-way
> through doing a request, and then need to be finished off after taking
> the mutex.
> 
> 

That is something that needs to be done in call_read_iter_cached() itself
(or controlled with a flag there); otherwise callers will have to duplicate
that fallback code all over the kernel. Correct me if I'm wrong.

For now I'm populating all the data in the cache before the fio runs, so
the 2nd and 3rd runs are served from the cache.


Also, looking at the code now, I don't think callers need to set the
IOCB_CACHED flag themselves as long as they call call_read_iter_cached(),
since it takes care of adding the flag. That is the correct way of
abstracting this, rather than having every caller duplicate the flag all
over the kernel.

Am I missing something?

I'll run the test again and share more details.


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-19 15:50 [RFC] Bypass filesystems for reading cached pages Matthew Wilcox
  2020-06-19 19:06 ` Chaitanya Kulkarni
@ 2020-06-20  6:19 ` Amir Goldstein
  2020-06-20 19:15   ` Matthew Wilcox
  2020-06-22  0:32 ` Dave Chinner
  2 siblings, 1 reply; 18+ messages in thread
From: Amir Goldstein @ 2020-06-20  6:19 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, Linux MM, Andreas Gruenbacher, linux-kernel

On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote:
>
>
> This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> The advantage of this patch is that we can avoid taking any filesystem
> lock, as long as the pages being accessed are in the cache (and we don't
> need to readahead any pages into the cache).  We also avoid an indirect
> function call in these cases.
>
> I'm sure reusing the name call_read_iter() is the wrong way to go about
> this, but renaming all the callers would make this a larger patch.
> I'm happy to do it if something like this stands a chance of being
> accepted.
>
> Compared to Andreas' patch, I removed the -ECANCELED return value.
> We can happily return 0 from generic_file_buffered_read() and it's less
> code to handle that.  I bypass the attempt to read from the page cache
> for O_DIRECT reads, and for inodes which have no cached pages.  Hopefully
> this will avoid calling generic_file_buffered_read() for drivers which
> implement read_iter() (although I haven't audited them all to check that).
>
> This could go horribly wrong if filesystems rely on doing work in their
> ->read_iter implementation (eg checking i_size after acquiring their
> lock) instead of keeping the page cache uptodate.  On the other hand,
> the ->map_pages() method is already called without locks, so filesystems
> should already be prepared for this.
>

XFS is taking i_rwsem lock in read_iter() for a surprising reason:
https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/
In that post I claim that ocfs2 and cifs also do some work in read_iter().
I didn't go back to check what, but it sounds like cache coherence among
nodes.

So filesystems will need to opt-in to this behavior.

I wonder if we should also make this behavior opt-in from userspace,
for example via an RWF_OPPORTUNISTIC_CACHED flag.

Because if I am not mistaken, even though this change has the potential
to improve many workloads, it may also degrade some workloads in cases
where readahead is not properly tuned. Imagine reading a large file
and getting only a few pages' worth of data read on every syscall.
Or did I misunderstand your patch's behavior in that case?

Another upside of a user opt-in flag: it can be used to mitigate the objection
of XFS developers against changing the "atomic write vs. read" behavior.
A new flag means no commitment to an XFS-specific behavior that nobody knows
whether any application out there relies on.
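
For illustration, such a userspace opt-in might look roughly like the following with preadv2().  RWF_OPPORTUNISTIC_CACHED does not exist; the flag value below is a made-up placeholder, and a real implementation would define it in the uapi headers:

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/uio.h>

/* Hypothetical flag, named after the suggestion above; the value is a
 * placeholder and does not correspond to any real RWF_* flag. */
#ifndef RWF_OPPORTUNISTIC_CACHED
#define RWF_OPPORTUNISTIC_CACHED	(1 << 10)
#endif

static ssize_t cached_read(int fd, void *buf, size_t len, off_t pos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };

	/* Ask the kernel to serve the read cheaply from the page cache;
	 * the caller would fall back to a plain pread() (possibly from a
	 * worker thread) if this fails or comes back short. */
	return preadv2(fd, &iov, 1, pos, RWF_OPPORTUNISTIC_CACHED);
}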

Thanks,
Amir.


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-20  6:19 ` Amir Goldstein
@ 2020-06-20 19:15   ` Matthew Wilcox
  2020-06-21  6:00     ` Amir Goldstein
  2020-06-22  1:02     ` Dave Chinner
  0 siblings, 2 replies; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-20 19:15 UTC (permalink / raw)
  To: Amir Goldstein; +Cc: linux-fsdevel, Linux MM, Andreas Gruenbacher, linux-kernel

On Sat, Jun 20, 2020 at 09:19:37AM +0300, Amir Goldstein wrote:
> On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote:
> > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > The advantage of this patch is that we can avoid taking any filesystem
> > lock, as long as the pages being accessed are in the cache (and we don't
> > need to readahead any pages into the cache).  We also avoid an indirect
> > function call in these cases.
> 
> XFS is taking i_rwsem lock in read_iter() for a surprising reason:
> https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/
> In that post I claim that ocfs2 and cifs also do some work in read_iter().
> I didn't go back to check what, but it sounds like cache coherence among
> nodes.

That's out of date.  Here's POSIX-2017:

https://pubs.opengroup.org/onlinepubs/9699919799/functions/read.html

  "I/O is intended to be atomic to ordinary files and pipes and
  FIFOs. Atomic means that all the bytes from a single operation that
  started out together end up together, without interleaving from other
  I/O operations. It is a known attribute of terminals that this is not
  honored, and terminals are explicitly (and implicitly permanently)
  excepted, making the behavior unspecified. The behavior for other
  device types is also left unspecified, but the wording is intended to
  imply that future standards might choose to specify atomicity (or not)."

That _doesn't_ say "a read cannot observe a write in progress".  It says
"Two writes cannot interleave".  Indeed, further down in that section, it says:

  "Earlier versions of this standard allowed two very different behaviors
  with regard to the handling of interrupts. In order to minimize the
  resulting confusion, it was decided that POSIX.1-2017 should support
  only one of these behaviors. Historical practice on AT&T-derived systems
  was to have read() and write() return -1 and set errno to [EINTR] when
  interrupted after some, but not all, of the data requested had been
  transferred. However, the US Department of Commerce FIPS 151-1 and FIPS
  151-2 require the historical BSD behavior, in which read() and write()
  return the number of bytes actually transferred before the interrupt. If
  -1 is returned when any data is transferred, it is difficult to recover
  from the error on a seekable device and impossible on a non-seekable
  device. Most new implementations support this behavior. The behavior
  required by POSIX.1-2017 is to return the number of bytes transferred."

That explicitly allows for a write to be interrupted by a signal and
later resumed, allowing a read to observe a half-complete write.

> Because if I am not mistaken, even though this change has the potential
> to improve many workloads, it may also degrade some workloads in cases
> where readahead is not properly tuned. Imagine reading a large file
> and getting only a few pages' worth of data read on every syscall.
> Or did I misunderstand your patch's behavior in that case?

I think you did.  If the IOCB_CACHED read hits a readahead page,
it returns early.  Then call_read_iter() notices the read is not yet
complete, and calls ->read_iter() to finish the read.  So it's two
calls to generic_file_buffered_read() rather than one, but it's still
one syscall.


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-20 19:15   ` Matthew Wilcox
@ 2020-06-21  6:00     ` Amir Goldstein
  2020-06-22  1:02     ` Dave Chinner
  1 sibling, 0 replies; 18+ messages in thread
From: Amir Goldstein @ 2020-06-21  6:00 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-fsdevel, Linux MM, Andreas Gruenbacher, linux-kernel,
	linux-xfs, Dave Chinner, Jan Kara

[CC: Dave Chinner, Jan Kara, xfs]

On Sat, Jun 20, 2020 at 10:15 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Sat, Jun 20, 2020 at 09:19:37AM +0300, Amir Goldstein wrote:
> > On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote:
> > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > > The advantage of this patch is that we can avoid taking any filesystem
> > > lock, as long as the pages being accessed are in the cache (and we don't
> > > need to readahead any pages into the cache).  We also avoid an indirect
> > > function call in these cases.
> >
> > XFS is taking i_rwsem lock in read_iter() for a surprising reason:
> > https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/
> > In that post I claim that ocfs2 and cifs also do some work in read_iter().
> > I didn't go back to check what, but it sounds like cache coherence among
> > nodes.
>
> That's out of date.  Here's POSIX-2017:
>
> https://pubs.opengroup.org/onlinepubs/9699919799/functions/read.html
>
>   "I/O is intended to be atomic to ordinary files and pipes and
>   FIFOs. Atomic means that all the bytes from a single operation that
>   started out together end up together, without interleaving from other
>   I/O operations. It is a known attribute of terminals that this is not
>   honored, and terminals are explicitly (and implicitly permanently)
>   excepted, making the behavior unspecified. The behavior for other
>   device types is also left unspecified, but the wording is intended to
>   imply that future standards might choose to specify atomicity (or not)."
>
> That _doesn't_ say "a read cannot observe a write in progress".  It says
> "Two writes cannot interleave".  Indeed, further down in that section, it says:
>
>   "Earlier versions of this standard allowed two very different behaviors
>   with regard to the handling of interrupts. In order to minimize the
>   resulting confusion, it was decided that POSIX.1-2017 should support
>   only one of these behaviors. Historical practice on AT&T-derived systems
>   was to have read() and write() return -1 and set errno to [EINTR] when
>   interrupted after some, but not all, of the data requested had been
>   transferred. However, the US Department of Commerce FIPS 151-1 and FIPS
>   151-2 require the historical BSD behavior, in which read() and write()
>   return the number of bytes actually transferred before the interrupt. If
>   -1 is returned when any data is transferred, it is difficult to recover
>   from the error on a seekable device and impossible on a non-seekable
>   device. Most new implementations support this behavior. The behavior
>   required by POSIX.1-2017 is to return the number of bytes transferred."
>
> That explicitly allows for a write to be interrupted by a signal and
> later resumed, allowing a read to observe a half-complete write.
>

Tell that to Dave Chinner (cc). I, too, find it surprising that XFS developers
chose to "not regress" a behavior that is XFS specific when there is no
proof, or even a clue, of any application that could rely on such behavior,
while the price being paid by all real-world applications that do
mixed random read/write workloads is very real and very significant.

The discussion on the original post quickly steered towards the
behavior change of rwsem [1], which you, Matthew, also participated in.
The reason for taking the rwsem lock in the first place was never seriously
challenged.

I posted a followup patch that fixes the performance issue without breaking
the "atomic rw" behavior [2] by calling generic_file_read_iter() once without
i_rwsem to pre-populate the page cache.
Dave had some technical concerns about this patch, regarding racing
with truncate_pagecache_range(), which later led to a fix by Jan Kara to
solve a readahead(2) vs. hole punch race [3].

At the time, Jan Kara wrote [3]:
"...other filesystems need similar protections but e.g. in case of ext4 it isn't
so simple without seriously regressing mixed rw workload performance so
I'm pushing just xfs fix at this moment which is simple."

And w.r.t solving the race without taking i_rwsem:
"...So I have an idea how it could be solved: Change calling convention for
->readpage() so that it gets called without page locked and take
i_mmap_sem there (and in ->readpages()) to protect from the race..."

My question to both Jan and Matthew is - does the new aops ->readahead()
API make things any better in that regard?
Will it make it easier for us to address the readahead vs. hole punch race
without having to take i_rwsem before readahead()?

Thanks,
Amir.

[1] https://lore.kernel.org/linux-xfs/20190325154731.GT1183@magnolia/
[2] https://lore.kernel.org/linux-xfs/20190404165737.30889-1-amir73il@gmail.com/
[3] https://lore.kernel.org/linux-xfs/20200120165830.GB28285@quack2.suse.cz/


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-19 15:50 [RFC] Bypass filesystems for reading cached pages Matthew Wilcox
  2020-06-19 19:06 ` Chaitanya Kulkarni
  2020-06-20  6:19 ` Amir Goldstein
@ 2020-06-22  0:32 ` Dave Chinner
  2020-06-22 14:35   ` Andreas Gruenbacher
  2020-06-22 19:18   ` Matthew Wilcox
  2 siblings, 2 replies; 18+ messages in thread
From: Dave Chinner @ 2020-06-22  0:32 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, linux-mm, agruenba, linux-kernel

On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> 
> This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> The advantage of this patch is that we can avoid taking any filesystem
> lock, as long as the pages being accessed are in the cache (and we don't
> need to readahead any pages into the cache).  We also avoid an indirect
> function call in these cases.

What does this micro-optimisation actually gain us except for more
complexity in the IO path?

i.e. if a filesystem lock has such massive overhead that it slows
down the cached readahead path in production workloads, then that's
something the filesystem needs to address, not unconditionally
bypass the filesystem before the IO gets anywhere near it.

> This could go horribly wrong if filesystems rely on doing work in their
> ->read_iter implementation (eg checking i_size after acquiring their
> lock) instead of keeping the page cache uptodate.  On the other hand,
> the ->map_pages() method is already called without locks, so filesystems
> should already be prepared for this.

Oh, gawd, we have *yet another* unlocked page cache read path that
can race with invalidations during fallocate() operations?

/me goes and looks at filemap_map_pages()

Yup, filemap_map_pages() is only safe against invalidations beyond
EOF (i.e. truncate) and can still race with invalidations within
EOF. So, yes, I'm right in that this path is not safe to run without
filesystem locking to serialise the IO against fallocate()...

Darrick, it looks like we need to wrap filemap_map_pages() with the
XFS_MMAPLOCK_SHARED like we do for all the other page fault paths
that can call into the IO path.

> Arguably we could do something similar for writes.  I'm a little more
> scared of that patch since filesystems are more likely to want to do
> things to keep their files in sync for writes.

Please, no.  We can have uptodate cached pages over holes, unwritten
extents, shared extents, etc but they all require filesystem level
serialisation and space/block allocation work *before* we copy data
into the page. i.e. if allocation/space reservation fails, we need
to abort before changing data.
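
For reference, a simplified sketch of the write path being described, loosely following generic_perform_write() as of this era (error handling, fault-in retries and dirty throttling elided, so treat it as a sketch of the ordering rather than the real code):

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/uio.h>

static ssize_t example_perform_write(struct file *file, struct iov_iter *i,
				     loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	ssize_t written = 0;
	int status = 0;

	while (iov_iter_count(i)) {
		unsigned int offset = offset_in_page(pos);
		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
				     iov_iter_count(i));
		size_t copied;
		struct page *page;
		void *fsdata;

		/* Filesystem hook: reserve space, allocate blocks, deal
		 * with holes/unwritten/shared extents.  This can fail,
		 * and it runs before any user data lands in the page. */
		status = a_ops->write_begin(file, mapping, pos, bytes, 0,
					    &page, &fsdata);
		if (status < 0)
			break;

		/* Only now is it safe to copy the data in. */
		copied = iov_iter_copy_from_user_atomic(page, i, offset,
							bytes);
		flush_dcache_page(page);

		/* Filesystem hook again: commit the range (dirty the page,
		 * update i_size, unlock, ...). */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
					  page, fsdata);
		if (status < 0)
			break;
		copied = status;
		if (!copied)
			break;	/* real code faults the page in and retries */

		iov_iter_advance(i, copied);
		pos += copied;
		written += copied;
	}

	return written ? written : status;
}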

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-20 19:15   ` Matthew Wilcox
  2020-06-21  6:00     ` Amir Goldstein
@ 2020-06-22  1:02     ` Dave Chinner
  1 sibling, 0 replies; 18+ messages in thread
From: Dave Chinner @ 2020-06-22  1:02 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Amir Goldstein, linux-fsdevel, Linux MM, Andreas Gruenbacher,
	linux-kernel

On Sat, Jun 20, 2020 at 12:15:21PM -0700, Matthew Wilcox wrote:
> On Sat, Jun 20, 2020 at 09:19:37AM +0300, Amir Goldstein wrote:
> > On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote:
> > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > > The advantage of this patch is that we can avoid taking any filesystem
> > > lock, as long as the pages being accessed are in the cache (and we don't
> > > need to readahead any pages into the cache).  We also avoid an indirect
> > > function call in these cases.
> > 
> > XFS is taking i_rwsem lock in read_iter() for a surprising reason:
> > https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/
> > In that post I claim that ocfs2 and cifs also do some work in read_iter().
> > I didn't go back to check what, but it sounds like cache coherence among
> > nodes.
> 
> That's out of date.  Here's POSIX-2017:
> 
> https://pubs.opengroup.org/onlinepubs/9699919799/functions/read.html
> 
>   "I/O is intended to be atomic to ordinary files and pipes and
>   FIFOs. Atomic means that all the bytes from a single operation that
>   started out together end up together, without interleaving from other
>   I/O operations. It is a known attribute of terminals that this is not
>   honored, and terminals are explicitly (and implicitly permanently)
>   excepted, making the behavior unspecified. The behavior for other
>   device types is also left unspecified, but the wording is intended to
>   imply that future standards might choose to specify atomicity (or not)."
> 
> That _doesn't_ say "a read cannot observe a write in progress".  It says
> "Two writes cannot interleave".  Indeed, further down in that section, it says:

Nope, it says "... without interleaving from other I/O operations".

That means read() needs to be atomic w.r.t truncate, hole punching,
extent zeroing, etc, not just other write() syscalls.

Really, though, I'm not going to get drawn into a language lawyering
argument here. We've discussed this before, and it's pretty clear
the language supports both arguments in one way or another.

And that means we are not going to change behaviour that XFS has
provided for 27 years now. Last time this came up, I said:

"XFS was designed with the intent that buffered writes are
atomic w.r.t. to all other file accesses."

Christoph said:

"Downgrading these long standing guarantees is simply not an option"

Darrick:

"I don't like the idea of adding a O_BROKENLOCKINGPONIES flag"

Nothing has changed since this was last discussed. 

Well, except for the fact that since then I've seen the source code
to some 20+ year old enterprise applications that have been ported
to Linux and that has made me even more certain that we need to
maintain XFS's existing behaviour....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22  0:32 ` Dave Chinner
@ 2020-06-22 14:35   ` Andreas Gruenbacher
  2020-06-22 18:13     ` Matthew Wilcox
  2020-06-23  0:52     ` Dave Chinner
  2020-06-22 19:18   ` Matthew Wilcox
  1 sibling, 2 replies; 18+ messages in thread
From: Andreas Gruenbacher @ 2020-06-22 14:35 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Matthew Wilcox, linux-fsdevel, Linux-MM, LKML

On Mon, Jun 22, 2020 at 2:32 AM Dave Chinner <david@fromorbit.com> wrote:
> On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> >
> > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > The advantage of this patch is that we can avoid taking any filesystem
> > lock, as long as the pages being accessed are in the cache (and we don't
> > need to readahead any pages into the cache).  We also avoid an indirect
> > function call in these cases.
>
> What does this micro-optimisation actually gain us except for more
> complexity in the IO path?
>
> i.e. if a filesystem lock has such massive overhead that it slows
> down the cached readahead path in production workloads, then that's
> something the filesystem needs to address, not unconditionally
> bypass the filesystem before the IO gets anywhere near it.

I'm fine with not moving that functionality into the VFS. The problem
I have in gfs2 is that taking glocks is really expensive. Part of that
overhead is accidental, but we definitely won't be able to fix it in
the short term. So something like the IOCB_CACHED flag that prevents
generic_file_read_iter from issuing readahead I/O would save the day
for us. Does that idea stand a chance?

We could theoretically also create a copy of
generic_file_buffered_read in gfs2 and strip out the I/O parts, but
that would be very messy.
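
For illustration, a minimal sketch of the kind of ->read_iter() this would enable.  This is not actual gfs2 code; the lock/unlock steps are placeholders for glock acquisition, and IOCB_CACHED is the flag from the patch at the top of the thread:

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t example_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t cached, ret;

	/* Fast path: no filesystem lock, serve only what is already in
	 * the page cache and start no readahead I/O. */
	iocb->ki_flags |= IOCB_CACHED;
	cached = generic_file_read_iter(iocb, to);
	iocb->ki_flags &= ~IOCB_CACHED;

	if (cached < 0 && cached != -EAGAIN)
		return cached;
	if (cached >= 0 && !iov_iter_count(to))
		return cached;			/* fully satisfied from cache */
	if (cached < 0)
		cached = 0;

	/* Slow path: take the expensive lock (the glock, in gfs2's case)
	 * and let readahead and real I/O happen for the remainder. */
	/* ... acquire filesystem lock ... */
	ret = generic_file_read_iter(iocb, to);
	/* ... release filesystem lock ... */

	if (ret < 0)
		return cached ? cached : ret;
	return cached + ret;
}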

Thanks,
Andreas



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22 14:35   ` Andreas Gruenbacher
@ 2020-06-22 18:13     ` Matthew Wilcox
  2020-06-24 12:35       ` Andreas Gruenbacher
  2020-06-23  0:52     ` Dave Chinner
  1 sibling, 1 reply; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-22 18:13 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: Dave Chinner, linux-fsdevel, Linux-MM, LKML

On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> I'm fine with not moving that functionality into the VFS. The problem
> I have in gfs2 is that taking glocks is really expensive. Part of that
> overhead is accidental, but we definitely won't be able to fix it in
> the short term. So something like the IOCB_CACHED flag that prevents
> generic_file_read_iter from issuing readahead I/O would save the day
> for us. Does that idea stand a chance?

For the short-term fix, is switching to a trylock in gfs2_readahead()
acceptable?

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 72c9560f4467..6ccd478c81ff 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -600,7 +600,7 @@ static void gfs2_readahead(struct readahead_control *rac)
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_holder gh;
 
-       gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+       gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_TRY, &gh);
        if (gfs2_glock_nq(&gh))
                goto out_uninit;
        if (!gfs2_is_stuffed(ip))



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22  0:32 ` Dave Chinner
  2020-06-22 14:35   ` Andreas Gruenbacher
@ 2020-06-22 19:18   ` Matthew Wilcox
  2020-06-23  2:35     ` Dave Chinner
  1 sibling, 1 reply; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-22 19:18 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-mm, agruenba, linux-kernel

On Mon, Jun 22, 2020 at 10:32:15AM +1000, Dave Chinner wrote:
> On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> > 
> > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > The advantage of this patch is that we can avoid taking any filesystem
> > lock, as long as the pages being accessed are in the cache (and we don't
> > need to readahead any pages into the cache).  We also avoid an indirect
> > function call in these cases.
> 
> What does this micro-optimisation actually gain us except for more
> complexity in the IO path?
> 
> i.e. if a filesystem lock has such massive overhead that it slows
> down the cached readahead path in production workloads, then that's
> something the filesystem needs to address, not unconditionally
> bypass the filesystem before the IO gets anywhere near it.

You've been talking about adding a range lock to XFS for a while now.
I remain quite sceptical that range locks are a good idea; they have not
worked out well as a replacement for the mmap_sem, although the workload
for the mmap_sem is quite different and they may yet show promise for
the XFS iolock.

There are production workloads that do not work well on top of a single
file on an XFS filesystem.  For example, using an XFS file in a host as
the backing store for a guest block device.  People tend to work around
that kind of performance bug rather than report it.

Do you agree that the guarantees that XFS currently supplies regarding
locked operation will be maintained if the I/O is contained within a
single page and the mutex is not taken?  ie add this check to the original
patch:

        if (iocb->ki_pos / PAGE_SIZE !=
            (iocb->ki_pos + iov_iter_count(iter) - 1) / PAGE_SIZE)
                goto uncached;

I think that gets me almost everything I want.  Small I/Os are going to
notice the pain of the mutex more than large I/Os.


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22 14:35   ` Andreas Gruenbacher
  2020-06-22 18:13     ` Matthew Wilcox
@ 2020-06-23  0:52     ` Dave Chinner
  2020-06-23  7:41       ` Andreas Gruenbacher
  1 sibling, 1 reply; 18+ messages in thread
From: Dave Chinner @ 2020-06-23  0:52 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: Matthew Wilcox, linux-fsdevel, Linux-MM, LKML

On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> On Mon, Jun 22, 2020 at 2:32 AM Dave Chinner <david@fromorbit.com> wrote:
> > On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> > >
> > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > > The advantage of this patch is that we can avoid taking any filesystem
> > > lock, as long as the pages being accessed are in the cache (and we don't
> > > need to readahead any pages into the cache).  We also avoid an indirect
> > > function call in these cases.
> >
> > What does this micro-optimisation actually gain us except for more
> > complexity in the IO path?
> >
> > i.e. if a filesystem lock has such massive overhead that it slows
> > down the cached readahead path in production workloads, then that's
> > something the filesystem needs to address, not unconditionally
> > bypass the filesystem before the IO gets anywhere near it.
> 
> I'm fine with not moving that functionality into the VFS. The problem
> I have in gfs2 is that taking glocks is really expensive. Part of that
> overhead is accidental, but we definitely won't be able to fix it in
> the short term. So something like the IOCB_CACHED flag that prevents
> generic_file_read_iter from issuing readahead I/O would save the day
> for us. Does that idea stand a chance?

I have no problem with a "NOREADAHEAD" flag being passed to
generic_file_read_iter(). It's not a "already cached" flag though,
it's a "don't start any IO" directive, just like the NOWAIT flag is
a "don't block on locks or IO in progress" directive and not an
"already cached" flag. Readahead is something we should be doing,
unless a filesystem has a very good reason not to, such as the gfs2
locking case here...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22 19:18   ` Matthew Wilcox
@ 2020-06-23  2:35     ` Dave Chinner
  0 siblings, 0 replies; 18+ messages in thread
From: Dave Chinner @ 2020-06-23  2:35 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, linux-mm, agruenba, linux-kernel

On Mon, Jun 22, 2020 at 08:18:57PM +0100, Matthew Wilcox wrote:
> On Mon, Jun 22, 2020 at 10:32:15AM +1000, Dave Chinner wrote:
> > On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> > > 
> > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > > The advantage of this patch is that we can avoid taking any filesystem
> > > lock, as long as the pages being accessed are in the cache (and we don't
> > > need to readahead any pages into the cache).  We also avoid an indirect
> > > function call in these cases.
> > 
> > What does this micro-optimisation actually gain us except for more
> > complexity in the IO path?
> > 
> > i.e. if a filesystem lock has such massive overhead that it slows
> > down the cached readahead path in production workloads, then that's
> > something the filesystem needs to address, not unconditionally
> > bypass the filesystem before the IO gets anywhere near it.
> 
> You've been talking about adding a range lock to XFS for a while now.

I don't see what that has to do with this patch.

> I remain quite sceptical that range locks are a good idea; they have not
> worked out well as a replacement for the mmap_sem, although the workload
> for the mmap_sem is quite different and they may yet show promise for
> the XFS iolock.

<shrug>

That was a really poor implementation of a range lock. It had no
concurrency to speak of, because the tracking tree required a
spinlock to be taken for every lock or unlock the range lock
performed. Hence it had an expensive critical section that could not
scale past the number of ops a single CPU could perform on that
tree. IOWs, it topped out at about 150k lock cycles a second with
2-3 concurrent AIO+DIO threads, and only went slower as the number
of concurrent IO submitters went up.

So, yeah, if you are going to talk about range locks, you need to
forget about the what was tried on the mmap_sem because nobody
actually scalability tested the lock implementation by itself and it
turned out to be total crap....

> There are production workloads that do not work well on top of a single
> file on an XFS filesystem.  For example, using an XFS file in a host as
> the backing store for a guest block device.  People tend to work around
> that kind of performance bug rather than report it.

*cough* AIO+DIO *cough*

You may not like that answer, but anyone who cares about IO
performance, especially single file IO performance, is using
AIO+DIO. Buffered IO for VM image files in production environments
tends to be the exception, not the norm, because caching is done in
the guest by the guest page cache. Double caching IO data is
generally considered a waste of resources that could otherwise be
sold to customers.

> Do you agree that the guarantees that XFS currently supplies regarding
> locked operation will be maintained if the I/O is contained within a
> single page and the mutex is not taken?

Not at first glance because block size < page size configurations
exist and hence filesystems might be punching out extents from a
sub-page range....

> ie add this check to the original
> patch:
> 
>         if (iocb->ki_pos / PAGE_SIZE !=
>             (iocb->ki_pos + iov_iter_count(iter) - 1) / PAGE_SIZE)
>                 goto uncached;
> 
> I think that gets me almost everything I want.  Small I/Os are going to
> notice the pain of the mutex more than large I/Os.

Exactly what are you trying to optimise, Willy? You haven't
explained to anyone what workload needs these micro-optimisations,
and without understanding why you want to cut the filesystems out of
the readahead path, I can't suggest alternative solutions...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-23  0:52     ` Dave Chinner
@ 2020-06-23  7:41       ` Andreas Gruenbacher
  0 siblings, 0 replies; 18+ messages in thread
From: Andreas Gruenbacher @ 2020-06-23  7:41 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Matthew Wilcox, linux-fsdevel, Linux-MM, LKML

On Tue, Jun 23, 2020 at 2:52 AM Dave Chinner <david@fromorbit.com> wrote:
> On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> > On Mon, Jun 22, 2020 at 2:32 AM Dave Chinner <david@fromorbit.com> wrote:
> > > On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> > > >
> > > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > > > The advantage of this patch is that we can avoid taking any filesystem
> > > > lock, as long as the pages being accessed are in the cache (and we don't
> > > > need to readahead any pages into the cache).  We also avoid an indirect
> > > > function call in these cases.
> > >
> > > What does this micro-optimisation actually gain us except for more
> > > complexity in the IO path?
> > >
> > > i.e. if a filesystem lock has such massive overhead that it slows
> > > down the cached readahead path in production workloads, then that's
> > > something the filesystem needs to address, not unconditionally
> > > bypass the filesystem before the IO gets anywhere near it.
> >
> > I'm fine with not moving that functionality into the VFS. The problem
> > I have in gfs2 is that taking glocks is really expensive. Part of that
> > overhead is accidental, but we definitely won't be able to fix it in
> > the short term. So something like the IOCB_CACHED flag that prevents
> > generic_file_read_iter from issuing readahead I/O would save the day
> > for us. Does that idea stand a chance?
>
> I have no problem with a "NOREADAHEAD" flag being passed to
> generic_file_read_iter(). It's not a "already cached" flag though,
> it's a "don't start any IO" directive, just like the NOWAIT flag is
> a "don't block on locks or IO in progress" directive and not an
> "already cached" flag. Readahead is something we should be doing,
> unless a filesystem has a very good reason not to, such as the gfs2
> locking case here...

The requests coming in can have the IOCB_NOWAIT flag set or cleared.
The idea was to have an additional flag that implies IOCB_NOWAIT so
that you can do:

    iocb->ki_flags |= IOCB_NOIO;
    generic_file_read_iter()
    if ("failed because of IOCB_NOIO") {
        if ("failed because of IOCB_NOWAIT")
            return -EAGAIN;
        iocb->ki_flags &= ~IOCB_NOIO;
        "locking"
         generic_file_read_iter()
        "unlocking"
    }

without having to save iocb->ki_flags. The alternative would be:

    int flags = iocb->ki_flags;
    iocb->ki_flags |= IOCB_NOIO | IOCB_NOWAIT;
    ret = generic_file_read_iter()
    if ("failed because of IOCB_NOIO or IOCB_NOWAIT") {
        if ("failed because of IOCB_NOWAIT" && (flags & IOCB_NOWAIT))
            return -EAGAIN;
        iocb->ki_flags &= ~IOCB_NOIO;
        "locking"
         generic_file_read_iter()
        "unlocking"
    }

Andreas



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22 18:13     ` Matthew Wilcox
@ 2020-06-24 12:35       ` Andreas Gruenbacher
  2020-07-02 15:16         ` Andreas Gruenbacher
  0 siblings, 1 reply; 18+ messages in thread
From: Andreas Gruenbacher @ 2020-06-24 12:35 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Dave Chinner, linux-fsdevel, Linux-MM, LKML

On Mon, Jun 22, 2020 at 8:13 PM Matthew Wilcox <willy@infradead.org> wrote:
> On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> > I'm fine with not moving that functionality into the VFS. The problem
> > I have in gfs2 is that taking glocks is really expensive. Part of that
> > overhead is accidental, but we definitely won't be able to fix it in
> > the short term. So something like the IOCB_CACHED flag that prevents
> > generic_file_read_iter from issuing readahead I/O would save the day
> > for us. Does that idea stand a chance?
>
> For the short-term fix, is switching to a trylock in gfs2_readahead()
> acceptable?

Well, it's the only thing we can do for now, right?

> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
> index 72c9560f4467..6ccd478c81ff 100644
> --- a/fs/gfs2/aops.c
> +++ b/fs/gfs2/aops.c
> @@ -600,7 +600,7 @@ static void gfs2_readahead(struct readahead_control *rac)
>         struct gfs2_inode *ip = GFS2_I(inode);
>         struct gfs2_holder gh;
>
> -       gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
> +       gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_TRY, &gh);
>         if (gfs2_glock_nq(&gh))
>                 goto out_uninit;
>         if (!gfs2_is_stuffed(ip))

Thanks,
Andreas



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-24 12:35       ` Andreas Gruenbacher
@ 2020-07-02 15:16         ` Andreas Gruenbacher
  2020-07-02 17:30           ` Matthew Wilcox
  0 siblings, 1 reply; 18+ messages in thread
From: Andreas Gruenbacher @ 2020-07-02 15:16 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Dave Chinner, linux-fsdevel, Linux-MM, LKML

On Wed, Jun 24, 2020 at 2:35 PM Andreas Gruenbacher <agruenba@redhat.com> wrote:
> On Mon, Jun 22, 2020 at 8:13 PM Matthew Wilcox <willy@infradead.org> wrote:
> > On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> > > I'm fine with not moving that functionality into the VFS. The problem
> > > I have in gfs2 is that taking glocks is really expensive. Part of that
> > > overhead is accidental, but we definitely won't be able to fix it in
> > > the short term. So something like the IOCB_CACHED flag that prevents
> > > generic_file_read_iter from issuing readahead I/O would save the day
> > > for us. Does that idea stand a chance?
> >
> > For the short-term fix, is switching to a trylock in gfs2_readahead()
> > acceptable?
>
> Well, it's the only thing we can do for now, right?

It turns out that gfs2 can still deadlock with a trylock in
gfs2_readahead, just differently: in this instance, gfs2_glock_nq will
call inode_dio_wait. When there is pending direct I/O, we'll end up
waiting for iomap_dio_complete, which will call
invalidate_inode_pages2_range, which will try to lock the pages
already locked for gfs2_readahead.
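
Spelled out as a call chain (a restatement of the description above, for orientation only):

/*
 * gfs2_readahead()                       pages in the range are locked
 *   gfs2_glock_nq()
 *     inode_dio_wait()                   waits for the pending direct I/O
 *
 * ...while the pending direct I/O is completing:
 *
 * iomap_dio_complete()
 *   invalidate_inode_pages2_range()
 *     lock_page()                        blocks on the pages locked above
 */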

This late in the 5.8 release cycle, I'd like to propose converting
gfs2 back to use mpage_readpages. This requires reinstating
mpage_readpages, but it's otherwise relatively trivial.
We can then introduce an IOCB_CACHED or equivalent flag, fix the
locking order in gfs2, convert gfs2 to mpage_readahead, and finally
remove mpage_readpages in 5.9.

I'll post a patch queue that does this for comment.

Thanks,
Andreas



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-07-02 15:16         ` Andreas Gruenbacher
@ 2020-07-02 17:30           ` Matthew Wilcox
  0 siblings, 0 replies; 18+ messages in thread
From: Matthew Wilcox @ 2020-07-02 17:30 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: Dave Chinner, linux-fsdevel, Linux-MM, LKML

On Thu, Jul 02, 2020 at 05:16:43PM +0200, Andreas Gruenbacher wrote:
> On Wed, Jun 24, 2020 at 2:35 PM Andreas Gruenbacher <agruenba@redhat.com> wrote:
> > On Mon, Jun 22, 2020 at 8:13 PM Matthew Wilcox <willy@infradead.org> wrote:
> > > On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> > > > I'm fine with not moving that functionality into the VFS. The problem
> > > > I have in gfs2 is that taking glocks is really expensive. Part of that
> > > > overhead is accidental, but we definitely won't be able to fix it in
> > > > the short term. So something like the IOCB_CACHED flag that prevents
> > > > generic_file_read_iter from issuing readahead I/O would save the day
> > > > for us. Does that idea stand a chance?
> > >
> > > For the short-term fix, is switching to a trylock in gfs2_readahead()
> > > acceptable?
> >
> > Well, it's the only thing we can do for now, right?
> 
> It turns out that gfs2 can still deadlock with a trylock in
> gfs2_readahead, just differently: in this instance, gfs2_glock_nq will
> call inode_dio_wait. When there is pending direct I/O, we'll end up
> waiting for iomap_dio_complete, which will call
> invalidate_inode_pages2_range, which will try to lock the pages
> already locked for gfs2_readahead.

That seems like a bug in trylock.  If I trylock something I'm not
expecting it to wait; I'm expecting it to fail because it would have to wait.
ie something like this:

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c84887769b5a..97ca8f5ed22b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -470,7 +470,10 @@ static int inode_go_lock(struct gfs2_holder *gh)
                        return error;
        }
 
-       if (gh->gh_state != LM_ST_DEFERRED)
+       if (gh->gh_flags & LM_FLAG_TRY) {
+               if (atomic_read(&ip->i_inode.i_dio_count))
+                       return -EWOULDBLOCK;
+       } else if (gh->gh_state != LM_ST_DEFERRED)
                inode_dio_wait(&ip->i_inode);
 
        if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&

... but probably not exactly that because I didn't try to figure out the
calling conventions or whether I should set some state in the gfs2_holder.

> This late in the 5.8 release cycle, I'd like to propose converting
> gfs2 back to use mpage_readpages. This requires reinstating
> mpage_readpages, but it's otherwise relatively trivial.
> We can then introduce an IOCB_CACHED or equivalent flag, fix the
> locking order in gfs2, convert gfs2 to mpage_readahead, and finally
> remove mage_readpages in 5.9.

I would rather not do that.


Thread overview: 18+ messages
2020-06-19 15:50 [RFC] Bypass filesystems for reading cached pages Matthew Wilcox
2020-06-19 19:06 ` Chaitanya Kulkarni
2020-06-19 20:12   ` Matthew Wilcox
2020-06-19 21:25     ` Chaitanya Kulkarni
2020-06-20  6:19 ` Amir Goldstein
2020-06-20 19:15   ` Matthew Wilcox
2020-06-21  6:00     ` Amir Goldstein
2020-06-22  1:02     ` Dave Chinner
2020-06-22  0:32 ` Dave Chinner
2020-06-22 14:35   ` Andreas Gruenbacher
2020-06-22 18:13     ` Matthew Wilcox
2020-06-24 12:35       ` Andreas Gruenbacher
2020-07-02 15:16         ` Andreas Gruenbacher
2020-07-02 17:30           ` Matthew Wilcox
2020-06-23  0:52     ` Dave Chinner
2020-06-23  7:41       ` Andreas Gruenbacher
2020-06-22 19:18   ` Matthew Wilcox
2020-06-23  2:35     ` Dave Chinner
