Re: [PATCHSET v3 0/5] Support for RWF_UNCACHED

From: Jens Axboe <axboe@kernel.dk>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linux-MM <linux-mm@kvack.org>,
	linux-fsdevel <linux-fsdevel@vger.kernel.org>,
	linux-block <linux-block@vger.kernel.org>,
	Matthew Wilcox <willy@infradead.org>, Chris Mason <clm@fb.com>,
	Dave Chinner <david@fromorbit.com>,
	Johannes Weiner <hannes@cmpxchg.org>
Subject: Re: [PATCHSET v3 0/5] Support for RWF_UNCACHED
Date: Wed, 11 Dec 2019 18:09:14 -0700	[thread overview]
Message-ID: <e7fc6b37-8106-4fe2-479c-05c3f2b1c1f1@kernel.dk> (raw)
In-Reply-To: <fef996ca-a4ed-9633-1f79-91292a984a20@kernel.dk>

On 12/11/19 4:41 PM, Jens Axboe wrote:
> On 12/11/19 1:18 PM, Linus Torvalds wrote:
>> On Wed, Dec 11, 2019 at 12:08 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>> $ cat /proc/meminfo | grep -i active
>>> Active:           134136 kB
>>> Inactive:       28683916 kB
>>> Active(anon):      97064 kB
>>> Inactive(anon):        4 kB
>>> Active(file):      37072 kB
>>> Inactive(file): 28683912 kB
>>
>> Yeah, that should not put pressure on some swap activity. We have 28
>> GB of basically free inactive file data, and the VM is doing something
>> very very bad if it then doesn't just quickly free it with no real
>> drama.
>>
>> In fact, I don't think it should even trigger kswapd at all, it should
>> all be direct reclaim. Of course, some of the mm people hate that with
>> a passion, but this does look like a prime example of why it should
>> just be done.
> 
> For giggles, I ran just a single thread on the file set. We're only
> doing about 100K IOPS at that point, yet when the page cache fills,
> kswapd still eats 10% cpu. That seems like a lot for something that
> slow.

Warning, the below is from the really crazy department...

Anyway, I took a closer look at the profiles for the uncached case.
We're spending a lot of time doing memsets (this is the xa_node init,
from the radix tree constructor), and call_rcu for the node free later
on. All wasted time, and something that meant we weren't as close to the
performance of O_DIRECT as we could be.

So Chris and I started talking about this, and pondered "what would
happen if we simply bypassed the page cache completely?". Case in point,
see below incremental patch. We still do the page cache lookup, and use
that page to copy from if it's there. If the page isn't there, allocate
one and do IO to it, but DON'T add it to the page cache. With that,
we're almost at O_DIRECT levels of performance for the 4k read case,
within 1-2%. I think 512b would look awesome, but we're reading full
pages, so that won't really help us much. Compared to the previous
uncached method, this is 30% faster on this device. That's substantial.

Obviously this has issues with truncate that would need to be resolved,
and it's definitely dirtier. But the performance is very enticing...

diff --git a/mm/filemap.c b/mm/filemap.c
index c37b0e221a8a..9834c394fffd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -933,8 +933,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 }
 EXPORT_SYMBOL(add_to_page_cache_locked);
 
-static int __add_to_page_cache(struct page *page, struct address_space *mapping,
-			       pgoff_t offset, gfp_t gfp_mask, bool lru)
+int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+				pgoff_t offset, gfp_t gfp_mask)
 {
 	void *shadow = NULL;
 	int ret;
@@ -956,18 +956,11 @@ static int __add_to_page_cache(struct page *page, struct address_space *mapping,
 		WARN_ON_ONCE(PageActive(page));
 		if (!(gfp_mask & __GFP_WRITE) && shadow)
 			workingset_refault(page, shadow);
-		if (lru)
-			lru_cache_add(page);
+		lru_cache_add(page);
 	}
 	return ret;
 
 }
-
-int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
-				pgoff_t offset, gfp_t gfp_mask)
-{
-	return __add_to_page_cache(page, mapping, offset, gfp_mask, true);
-}
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
 
 #ifdef CONFIG_NUMA
@@ -1998,6 +1991,13 @@ static void shrink_readahead_size_eio(struct file *filp,
 	ra->ra_pages /= 4;
 }
 
+static void buffered_put_page(struct page *page, bool clear_mapping)
+{
+	if (clear_mapping)
+		page->mapping = NULL;
+	put_page(page);
+}
+
 /**
  * generic_file_buffered_read - generic file read routine
  * @iocb:	the iocb to read
@@ -2040,7 +2040,7 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
 	offset = *ppos & ~PAGE_MASK;
 
 	for (;;) {
-		bool drop_page = false;
+		bool clear_mapping = false;
 		struct page *page;
 		pgoff_t end_index;
 		loff_t isize;
@@ -2118,7 +2118,7 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		isize = i_size_read(inode);
 		end_index = (isize - 1) >> PAGE_SHIFT;
 		if (unlikely(!isize || index > end_index)) {
-			put_page(page);
+			buffered_put_page(page, clear_mapping);
 			goto out;
 		}
 
@@ -2127,7 +2127,7 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		if (index == end_index) {
 			nr = ((isize - 1) & ~PAGE_MASK) + 1;
 			if (nr <= offset) {
-				put_page(page);
+				buffered_put_page(page, clear_mapping);
 				goto out;
 			}
 		}
@@ -2160,27 +2160,7 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		offset &= ~PAGE_MASK;
 		prev_offset = offset;
 
-		/*
-		 * If we're dropping this page due to drop-behind, then
-		 * lock it first. Ignore errors here, we can just leave it
-		 * in the page cache. Note that we didn't add this page to
-		 * the LRU when we added it to the page cache. So if we
-		 * fail removing it, or lock it, add to the LRU.
-		 */
-		if (drop_page) {
-			bool addlru = true;
-
-			if (!lock_page_killable(page)) {
-				if (page->mapping == mapping)
-					addlru = !remove_mapping(mapping, page);
-				else
-					addlru = false;
-				unlock_page(page);
-			}
-			if (addlru)
-				lru_cache_add(page);
-		}
-		put_page(page);
+		buffered_put_page(page, clear_mapping);
 		written += ret;
 		if (!iov_iter_count(iter))
 			goto out;
@@ -2222,7 +2202,7 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
 		if (unlikely(error)) {
 			if (error == AOP_TRUNCATED_PAGE) {
-				put_page(page);
+				buffered_put_page(page, clear_mapping);
 				error = 0;
 				goto find_page;
 			}
@@ -2239,7 +2219,7 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
 					 * invalidate_mapping_pages got it
 					 */
 					unlock_page(page);
-					put_page(page);
+					buffered_put_page(page, clear_mapping);
 					goto find_page;
 				}
 				unlock_page(page);
@@ -2254,7 +2234,7 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
 readpage_error:
 		/* UHHUH! A synchronous read error occurred. Report it */
-		put_page(page);
+		buffered_put_page(page, clear_mapping);
 		goto out;
 
 no_cached_page:
@@ -2267,12 +2247,16 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
 			error = -ENOMEM;
 			goto out;
 		}
-		if (iocb->ki_flags & IOCB_UNCACHED)
-			drop_page = true;
+		if (iocb->ki_flags & IOCB_UNCACHED) {
+			__SetPageLocked(page);
+			page->mapping = mapping;
+			page->index = index;
+			clear_mapping = true;
+			goto readpage;
+		}
 
-		error = __add_to_page_cache(page, mapping, index,
-				mapping_gfp_constraint(mapping, GFP_KERNEL),
-				!drop_page);
+		error = add_to_page_cache(page, mapping, index,
+				mapping_gfp_constraint(mapping, GFP_KERNEL));
 		if (error) {
 			put_page(page);
 			if (error == -EEXIST) {

-- 
Jens Axboe