All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] readahead improvements
@ 2021-04-07 20:18 Matthew Wilcox (Oracle)
  2021-04-07 20:18 ` [PATCH 1/3] mm/filemap: Pass the file_ra_state in the ractl Matthew Wilcox (Oracle)
                   ` (5 more replies)
  0 siblings, 6 replies; 9+ messages in thread
From: Matthew Wilcox (Oracle) @ 2021-04-07 20:18 UTC (permalink / raw)
  To: David Howells; +Cc: Matthew Wilcox (Oracle), linux-fsdevel

As requested, fix up readahead_expand() so as to not confuse the ondemand
algorithm.  Also make the documentation slightly better.  Dave, could you
put in some debug and check this actually works?  I don't generally test
with any filesystems that use readahead_expand(), but printing (index,
nr_to_read, lookahead_size) in page_cache_ra_unbounded() would let a human
(such as your good self) determine whether it's working approximately
as designed.

This is against linux-next 20210407.

Matthew Wilcox (Oracle) (3):
  mm/filemap: Pass the file_ra_state in the ractl
  fs: Document file_ra_state
  mm/readahead: Adjust file_ra in readahead_expand

 fs/ext4/verity.c        |  2 +-
 fs/f2fs/file.c          |  2 +-
 fs/f2fs/verity.c        |  2 +-
 include/linux/fs.h      | 24 ++++++++++++++----------
 include/linux/pagemap.h | 20 +++++++++++---------
 mm/filemap.c            |  4 ++--
 mm/internal.h           |  7 +++----
 mm/readahead.c          | 25 ++++++++++++++-----------
 8 files changed, 47 insertions(+), 39 deletions(-)

-- 
2.30.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/3] mm/filemap: Pass the file_ra_state in the ractl
  2021-04-07 20:18 [PATCH 0/3] readahead improvements Matthew Wilcox (Oracle)
@ 2021-04-07 20:18 ` Matthew Wilcox (Oracle)
  2021-04-07 20:18 ` [PATCH 2/3] fs: Document file_ra_state Matthew Wilcox (Oracle)
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 9+ messages in thread
From: Matthew Wilcox (Oracle) @ 2021-04-07 20:18 UTC (permalink / raw)
  To: David Howells; +Cc: Matthew Wilcox (Oracle), linux-fsdevel

For readahead_expand(), we need to modify the file ra_state, so pass it
down by adding it to the ractl.  We have to do this because it's not always
the same as f_ra in the struct file that is already being passed.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 fs/ext4/verity.c        |  2 +-
 fs/f2fs/file.c          |  2 +-
 fs/f2fs/verity.c        |  2 +-
 include/linux/pagemap.h | 20 +++++++++++---------
 mm/filemap.c            |  4 ++--
 mm/internal.h           |  7 +++----
 mm/readahead.c          | 22 +++++++++++-----------
 7 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 00e3cbde472e..07438f46b558 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -370,7 +370,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
 					       pgoff_t index,
 					       unsigned long num_ra_pages)
 {
-	DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
+	DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
 	struct page *page;
 
 	index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f3ca63b55843..e6cd08559375 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4051,7 +4051,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
 
 static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
 {
-	DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, page_idx);
+	DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, page_idx);
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	pgoff_t redirty_idx = page_idx;
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 15ba36926fad..03549b5ba204 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -261,7 +261,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
 					       pgoff_t index,
 					       unsigned long num_ra_pages)
 {
-	DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
+	DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
 	struct page *page;
 
 	index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 72bca08f96a2..350e1c9726f2 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -826,20 +826,23 @@ static inline int add_to_page_cache(struct page *page,
  * @file: The file, used primarily by network filesystems for authentication.
  *	  May be NULL if invoked internally by the filesystem.
  * @mapping: Readahead this filesystem object.
+ * @ra: File readahead state.  May be NULL.
  */
 struct readahead_control {
 	struct file *file;
 	struct address_space *mapping;
+	struct file_ra_state *ra;
 /* private: use the readahead_* accessors instead */
 	pgoff_t _index;
 	unsigned int _nr_pages;
 	unsigned int _batch_count;
 };
 
-#define DEFINE_READAHEAD(rac, f, m, i)					\
-	struct readahead_control rac = {				\
+#define DEFINE_READAHEAD(ractl, f, r, m, i)				\
+	struct readahead_control ractl = {				\
 		.file = f,						\
 		.mapping = m,						\
+		.ra = r,						\
 		._index = i,						\
 	}
 
@@ -847,10 +850,9 @@ struct readahead_control {
 
 void page_cache_ra_unbounded(struct readahead_control *,
 		unsigned long nr_to_read, unsigned long lookahead_count);
-void page_cache_sync_ra(struct readahead_control *, struct file_ra_state *,
+void page_cache_sync_ra(struct readahead_control *, unsigned long req_count);
+void page_cache_async_ra(struct readahead_control *, struct page *,
 		unsigned long req_count);
-void page_cache_async_ra(struct readahead_control *, struct file_ra_state *,
-		struct page *, unsigned long req_count);
 void readahead_expand(struct readahead_control *ractl,
 		      loff_t new_start, size_t new_len);
 
@@ -872,8 +874,8 @@ void page_cache_sync_readahead(struct address_space *mapping,
 		struct file_ra_state *ra, struct file *file, pgoff_t index,
 		unsigned long req_count)
 {
-	DEFINE_READAHEAD(ractl, file, mapping, index);
-	page_cache_sync_ra(&ractl, ra, req_count);
+	DEFINE_READAHEAD(ractl, file, ra, mapping, index);
+	page_cache_sync_ra(&ractl, req_count);
 }
 
 /**
@@ -895,8 +897,8 @@ void page_cache_async_readahead(struct address_space *mapping,
 		struct file_ra_state *ra, struct file *file,
 		struct page *page, pgoff_t index, unsigned long req_count)
 {
-	DEFINE_READAHEAD(ractl, file, mapping, index);
-	page_cache_async_ra(&ractl, ra, page, req_count);
+	DEFINE_READAHEAD(ractl, file, ra, mapping, index);
+	page_cache_async_ra(&ractl, page, req_count);
 }
 
 /**
diff --git a/mm/filemap.c b/mm/filemap.c
index 5b4afb35c861..7aacec5684c7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2855,7 +2855,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	struct file *file = vmf->vma->vm_file;
 	struct file_ra_state *ra = &file->f_ra;
 	struct address_space *mapping = file->f_mapping;
-	DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
+	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
 	struct file *fpin = NULL;
 	unsigned int mmap_miss;
 
@@ -2867,7 +2867,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 
 	if (vmf->vma->vm_flags & VM_SEQ_READ) {
 		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-		page_cache_sync_ra(&ractl, ra, ra->ra_pages);
+		page_cache_sync_ra(&ractl, ra->ra_pages);
 		return fpin;
 	}
 
diff --git a/mm/internal.h b/mm/internal.h
index 213762665d1d..46eb82eaa195 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -51,13 +51,12 @@ void unmap_page_range(struct mmu_gather *tlb,
 
 void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
 		unsigned long lookahead_size);
-void force_page_cache_ra(struct readahead_control *, struct file_ra_state *,
-		unsigned long nr);
+void force_page_cache_ra(struct readahead_control *, unsigned long nr);
 static inline void force_page_cache_readahead(struct address_space *mapping,
 		struct file *file, pgoff_t index, unsigned long nr_to_read)
 {
-	DEFINE_READAHEAD(ractl, file, mapping, index);
-	force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
+	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
+	force_page_cache_ra(&ractl, nr_to_read);
 }
 
 unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
diff --git a/mm/readahead.c b/mm/readahead.c
index 4446dada0bc2..e115b1174981 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -272,9 +272,10 @@ void do_page_cache_ra(struct readahead_control *ractl,
  * memory at once.
  */
 void force_page_cache_ra(struct readahead_control *ractl,
-		struct file_ra_state *ra, unsigned long nr_to_read)
+		unsigned long nr_to_read)
 {
 	struct address_space *mapping = ractl->mapping;
+	struct file_ra_state *ra = ractl->ra;
 	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
 	unsigned long max_pages, index;
 
@@ -433,10 +434,10 @@ static int try_context_readahead(struct address_space *mapping,
  * A minimal readahead algorithm for trivial sequential/random reads.
  */
 static void ondemand_readahead(struct readahead_control *ractl,
-		struct file_ra_state *ra, bool hit_readahead_marker,
-		unsigned long req_size)
+		bool hit_readahead_marker, unsigned long req_size)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
+	struct file_ra_state *ra = ractl->ra;
 	unsigned long max_pages = ra->ra_pages;
 	unsigned long add_pages;
 	unsigned long index = readahead_index(ractl);
@@ -550,7 +551,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
 }
 
 void page_cache_sync_ra(struct readahead_control *ractl,
-		struct file_ra_state *ra, unsigned long req_count)
+		unsigned long req_count)
 {
 	bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
 
@@ -560,7 +561,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
 	 * read-ahead will do the right thing and limit the read to just the
 	 * requested range, which we'll set to 1 page for this case.
 	 */
-	if (!ra->ra_pages || blk_cgroup_congested()) {
+	if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
 		if (!ractl->file)
 			return;
 		req_count = 1;
@@ -569,21 +570,20 @@ void page_cache_sync_ra(struct readahead_control *ractl,
 
 	/* be dumb */
 	if (do_forced_ra) {
-		force_page_cache_ra(ractl, ra, req_count);
+		force_page_cache_ra(ractl, req_count);
 		return;
 	}
 
 	/* do read-ahead */
-	ondemand_readahead(ractl, ra, false, req_count);
+	ondemand_readahead(ractl, false, req_count);
 }
 EXPORT_SYMBOL_GPL(page_cache_sync_ra);
 
 void page_cache_async_ra(struct readahead_control *ractl,
-		struct file_ra_state *ra, struct page *page,
-		unsigned long req_count)
+		struct page *page, unsigned long req_count)
 {
 	/* no read-ahead */
-	if (!ra->ra_pages)
+	if (!ractl->ra->ra_pages)
 		return;
 
 	/*
@@ -604,7 +604,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
 		return;
 
 	/* do read-ahead */
-	ondemand_readahead(ractl, ra, true, req_count);
+	ondemand_readahead(ractl, true, req_count);
 }
 EXPORT_SYMBOL_GPL(page_cache_async_ra);
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/3] fs: Document file_ra_state
  2021-04-07 20:18 [PATCH 0/3] readahead improvements Matthew Wilcox (Oracle)
  2021-04-07 20:18 ` [PATCH 1/3] mm/filemap: Pass the file_ra_state in the ractl Matthew Wilcox (Oracle)
@ 2021-04-07 20:18 ` Matthew Wilcox (Oracle)
  2021-04-07 20:18 ` [PATCH 3/3] mm/readahead: Adjust file_ra in readahead_expand Matthew Wilcox (Oracle)
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 9+ messages in thread
From: Matthew Wilcox (Oracle) @ 2021-04-07 20:18 UTC (permalink / raw)
  To: David Howells; +Cc: Matthew Wilcox (Oracle), linux-fsdevel

Turn the comments into kernel-doc and improve the wording slightly.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/fs.h | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3fbb98126248..8c347fe9783d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -889,18 +889,22 @@ struct fown_struct {
 	int signum;		/* posix.1b rt signal to be delivered on IO */
 };
 
-/*
- * Track a single file's readahead state
+/**
+ * struct file_ra_state - Track a file's readahead state.
+ * @start: Where the most recent readahead started.
+ * @size: Number of pages read in the most recent readahead.
+ * @async_size: Start next readahead when this many pages are left.
+ * @ra_pages: Maximum size of a readahead request.
+ * @mmap_miss: How many mmap accesses missed in the page cache.
+ * @prev_pos: The last byte in the most recent read request.
  */
 struct file_ra_state {
-	pgoff_t start;			/* where readahead started */
-	unsigned int size;		/* # of readahead pages */
-	unsigned int async_size;	/* do asynchronous readahead when
-					   there are only # of pages ahead */
-
-	unsigned int ra_pages;		/* Maximum readahead window */
-	unsigned int mmap_miss;		/* Cache miss stat for mmap accesses */
-	loff_t prev_pos;		/* Cache last read() position */
+	pgoff_t start;
+	unsigned int size;
+	unsigned int async_size;
+	unsigned int ra_pages;
+	unsigned int mmap_miss;
+	loff_t prev_pos;
 };
 
 /*
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 3/3] mm/readahead: Adjust file_ra in readahead_expand
  2021-04-07 20:18 [PATCH 0/3] readahead improvements Matthew Wilcox (Oracle)
  2021-04-07 20:18 ` [PATCH 1/3] mm/filemap: Pass the file_ra_state in the ractl Matthew Wilcox (Oracle)
  2021-04-07 20:18 ` [PATCH 2/3] fs: Document file_ra_state Matthew Wilcox (Oracle)
@ 2021-04-07 20:18 ` Matthew Wilcox (Oracle)
  2021-04-07 22:43 ` [PATCH 0/3] readahead improvements David Howells
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 9+ messages in thread
From: Matthew Wilcox (Oracle) @ 2021-04-07 20:18 UTC (permalink / raw)
  To: David Howells; +Cc: Matthew Wilcox (Oracle), linux-fsdevel

If we grow the readahead window, we should tell the ondemand algorithm
that we've grown it, so that the next readahead starts at the right place.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/readahead.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/readahead.c b/mm/readahead.c
index e115b1174981..65215c48f25c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -661,6 +661,7 @@ void readahead_expand(struct readahead_control *ractl,
 		      loff_t new_start, size_t new_len)
 {
 	struct address_space *mapping = ractl->mapping;
+	struct file_ra_state *ra = ractl->ra;
 	pgoff_t new_index, new_nr_pages;
 	gfp_t gfp_mask = readahead_gfp_mask(mapping);
 
@@ -705,6 +706,8 @@ void readahead_expand(struct readahead_control *ractl,
 			return;
 		}
 		ractl->_nr_pages++;
+		ra->size++;
+		ra->async_size++;
 	}
 }
 EXPORT_SYMBOL(readahead_expand);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 0/3] readahead improvements
  2021-04-07 20:18 [PATCH 0/3] readahead improvements Matthew Wilcox (Oracle)
                   ` (2 preceding siblings ...)
  2021-04-07 20:18 ` [PATCH 3/3] mm/readahead: Adjust file_ra in readahead_expand Matthew Wilcox (Oracle)
@ 2021-04-07 22:43 ` David Howells
  2021-04-08 10:57 ` riteshh
  2021-04-08 12:51 ` [PATCH] mm, netfs: Fix readahead bits David Howells
  5 siblings, 0 replies; 9+ messages in thread
From: David Howells @ 2021-04-07 22:43 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: dhowells, linux-fsdevel

Matthew Wilcox (Oracle) <willy@infradead.org> wrote:

> As requested, fix up readahead_expand() so as to not confuse the ondemand
> algorithm.  Also make the documentation slightly better.  Dave, could you
> put in some debug and check this actually works?  I don't generally test
> with any filesystems that use readahead_expand(), but printing (index,
> nr_to_read, lookahead_size) in page_cache_ra_unbounded() would let a human
> (such as your good self) determine whether it's working approximately
> as designed.

I added the patches to my fscache-netfs-lib branch and ran the quick group of
xfstests with afs and got the same results.

David


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 0/3] readahead improvements
  2021-04-07 20:18 [PATCH 0/3] readahead improvements Matthew Wilcox (Oracle)
                   ` (3 preceding siblings ...)
  2021-04-07 22:43 ` [PATCH 0/3] readahead improvements David Howells
@ 2021-04-08 10:57 ` riteshh
  2021-04-08 13:15   ` Brendan Gregg
  2021-04-08 12:51 ` [PATCH] mm, netfs: Fix readahead bits David Howells
  5 siblings, 1 reply; 9+ messages in thread
From: riteshh @ 2021-04-08 10:57 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: David Howells, linux-fsdevel, Brendan Gregg

On 21/04/07 09:18PM, Matthew Wilcox (Oracle) wrote:
> As requested, fix up readahead_expand() so as to not confuse the ondemand
> algorithm.  Also make the documentation slightly better.  Dave, could you
> put in some debug and check this actually works?  I don't generally test
> with any filesystems that use readahead_expand(), but printing (index,
> nr_to_read, lookahead_size) in page_cache_ra_unbounded() would let a human
> (such as your good self) determine whether it's working approximately
> as designed.

Hello,

Sorry about the silly question here, since I don't have much details of how
readahead algorithm code path.

1. Do we know of a way to measure efficiency of readahead in Linux?
2. And if there is any way to validate readahead is working correctly and as
   intended in Linux?
Like is there anything designed already to measure above two things?
If not, are there any stats which can be collected and later should be parsed to
say how efficient readahead is working in different use cases and also can
verify if it's working correctly?

I guess, we can already do point 1 from below. What about point 2 & 3?
1. Turn on/off the readahead and measure file reads timings for different
   patterns. - I guess this is already doable.

2. Collecting runtime histogram showing how readahead window is
   increasing/decreasing based on changing read patterns. And collecting how
   much IOs it takes to increase/decrease the readahead size.
   Are there any tracepoints needed to be enabled for this?

3. I guess it won't be possible w/o a way to also measure page cache
   efficiency. Like in case of a memory pressure, if the page which was read
   using readahead is thrown out only to re-read it again.
   So a way to measure page cache efficiency also will be required.

Any idea from others on this?

I do see below page[1] by Brendan showing some ways to measure page cache
efficiency using cachestat. But there are also some problems mentioned in the
conclusion section, which I am not sure of what is the latest state of that.
Also it doesn't discusses much on the readahead efficiency measurement.

[1]: http://www.brendangregg.com/blog/2014-12-31/linux-page-cache-hit-ratio.html


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] mm, netfs: Fix readahead bits
  2021-04-07 20:18 [PATCH 0/3] readahead improvements Matthew Wilcox (Oracle)
                   ` (4 preceding siblings ...)
  2021-04-08 10:57 ` riteshh
@ 2021-04-08 12:51 ` David Howells
  2021-04-08 13:09   ` Matthew Wilcox
  5 siblings, 1 reply; 9+ messages in thread
From: David Howells @ 2021-04-08 12:51 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle), jlayton; +Cc: dhowells, linux-fsdevel

Hi Willy, Jeff,

I think we need the attached change to readahead_expand() to fix the oops seen
when it tries to dereference ractl->ra when called indirectly from
netfs_write_begin().

netfs_write_begin() should also be using DEFINE_READAHEAD() rather than
manually initialising the ractl variable so that Willy can find it;-).

David
---
commit 9c0931285131c3b65ab4b58b614c30c7feb1dbd4
Author: David Howells <dhowells@redhat.com>
Date:   Thu Apr 8 13:39:54 2021 +0100

    netfs: readahead fixes

diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 762a15350242..1d3b50c5db6d 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -1065,12 +1065,7 @@ int netfs_write_begin(struct file *file, struct address_space *mapping,
 	loff_t size;
 	int ret;
 
-	struct readahead_control ractl = {
-		.file		= file,
-		.mapping	= mapping,
-		._index		= index,
-		._nr_pages	= 0,
-	};
+	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
 
 retry:
 	page = grab_cache_page_write_begin(mapping, index, 0);
diff --git a/mm/readahead.c b/mm/readahead.c
index 65215c48f25c..f02dbebf1cef 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -706,8 +706,10 @@ void readahead_expand(struct readahead_control *ractl,
 			return;
 		}
 		ractl->_nr_pages++;
-		ra->size++;
-		ra->async_size++;
+		if (ra) {
+			ra->size++;
+			ra->async_size++;
+		}
 	}
 }
 EXPORT_SYMBOL(readahead_expand);


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm, netfs: Fix readahead bits
  2021-04-08 12:51 ` [PATCH] mm, netfs: Fix readahead bits David Howells
@ 2021-04-08 13:09   ` Matthew Wilcox
  0 siblings, 0 replies; 9+ messages in thread
From: Matthew Wilcox @ 2021-04-08 13:09 UTC (permalink / raw)
  To: David Howells; +Cc: jlayton, linux-fsdevel

On Thu, Apr 08, 2021 at 01:51:11PM +0100, David Howells wrote:
> Hi Willy, Jeff,
> 
> I think we need the attached change to readahead_expand() to fix the oops seen
> when it tries to dereference ractl->ra when called indirectly from
> netfs_write_begin().
> 
> netfs_write_begin() should also be using DEFINE_READAHEAD() rather than
> manually initialising the ractl variable so that Willy can find it;-).

ACK.  Please fold into the appropriate patches.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 0/3] readahead improvements
  2021-04-08 10:57 ` riteshh
@ 2021-04-08 13:15   ` Brendan Gregg
  0 siblings, 0 replies; 9+ messages in thread
From: Brendan Gregg @ 2021-04-08 13:15 UTC (permalink / raw)
  To: riteshh; +Cc: Matthew Wilcox (Oracle), David Howells, linux-fsdevel

On Thu, Apr 8, 2021 at 8:57 PM riteshh <riteshh@linux.ibm.com> wrote:
>
> On 21/04/07 09:18PM, Matthew Wilcox (Oracle) wrote:
> > As requested, fix up readahead_expand() so as to not confuse the ondemand
> > algorithm.  Also make the documentation slightly better.  Dave, could you
> > put in some debug and check this actually works?  I don't generally test
> > with any filesystems that use readahead_expand(), but printing (index,
> > nr_to_read, lookahead_size) in page_cache_ra_unbounded() would let a human
> > (such as your good self) determine whether it's working approximately
> > as designed.
>
> Hello,
>
> Sorry about the silly question here, since I don't have much details of how
> readahead algorithm code path.
>
> 1. Do we know of a way to measure efficiency of readahead in Linux?
> 2. And if there is any way to validate readahead is working correctly and as
>    intended in Linux?

I created a bpftrace tool for measuring readahead efficiency for my
LSFMM 2019 keynote, where it showed the age of readahead pages when
they were finally used:

https://www.slideshare.net/brendangregg/lsfmm-2019-bpf-observability-143092820/29

If they were mostly of a young age, one might conclude that readahead
is not only working, but could be tuned higher. Mostly of an old age,
and one might conclude readahead was tuned too high, and was reading
too many pages (that were later used unrelated to the original
workload).

I think my tool is just the start. What else should we measure for
understanding readahead efficiency? Code it today as a bpftrace tool
(and share it)!

>
> Like is there anything designed already to measure above two things?
> If not, are there any stats which can be collected and later should be parsed to
> say how efficient readahead is working in different use cases and also can
> verify if it's working correctly?
>
> I guess, we can already do point 1 from below. What about point 2 & 3?
> 1. Turn on/off the readahead and measure file reads timings for different
>    patterns. - I guess this is already doable.
>
> 2. Collecting runtime histogram showing how readahead window is
>    increasing/decreasing based on changing read patterns. And collecting how
>    much IOs it takes to increase/decrease the readahead size.
>    Are there any tracepoints needed to be enabled for this?
>
> 3. I guess it won't be possible w/o a way to also measure page cache
>    efficiency. Like in case of a memory pressure, if the page which was read
>    using readahead is thrown out only to re-read it again.
>    So a way to measure page cache efficiency also will be required.
>
> Any idea from others on this?
>
> I do see below page[1] by Brendan showing some ways to measure page cache
> efficiency using cachestat. But there are also some problems mentioned in the
> conclusion section, which I am not sure of what is the latest state of that.
> Also it doesn't discusses much on the readahead efficiency measurement.
>
> [1]: http://www.brendangregg.com/blog/2014-12-31/linux-page-cache-hit-ratio.html

Coincidentally, during the same LSFMMBPF keynote I showed cachestat
and described it as a "sandcastle," as kernel changes easily wash it
away. The MM folk discussed the various issues in measuring this
accurately: while cachestat worked for my workloads, I think there's a
lot more work to do to make it a robust tool for all workloads. I
still think it should be /proc metrics instead, as I commonly want a
page cache hit ratio metric (whereas many of my other tracing tools
are more niche, and can stay as tracing tools).

I don't think there's a video of the talk, but there was a writeup:
https://lwn.net/Articles/787131/

People keep porting my cachestat tool and building other things upon
it, but aren't updating the code, which is getting a bit annoying.
You're all assuming I solved it. But in my original Ftrace cachestat
code I thought I made it clear that it was a proof of concept for
3.13!:

#!/bin/bash
#
# cachestat - show Linux page cache hit/miss statistics.
#             Uses Linux ftrace.
#
# This is a proof of concept using Linux ftrace capabilities on older kernels,
# and works by using function profiling for in-kernel counters. Specifically,
# four kernel functions are traced:
#
# mark_page_accessed() for measuring cache accesses
# mark_buffer_dirty() for measuring cache writes
# add_to_page_cache_lru() for measuring page additions
# account_page_dirtied() for measuring page dirties
#
# It is possible that these functions have been renamed (or are different
# logically) for your kernel version, and this script will not work as-is.
# This script was written on Linux 3.13. This script is a sandcastle: the
# kernel may wash some away, and you'll need to rebuild.
[...]


Brendan

--
Brendan Gregg, Senior Performance Architect, Netflix

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-04-08 13:16 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-07 20:18 [PATCH 0/3] readahead improvements Matthew Wilcox (Oracle)
2021-04-07 20:18 ` [PATCH 1/3] mm/filemap: Pass the file_ra_state in the ractl Matthew Wilcox (Oracle)
2021-04-07 20:18 ` [PATCH 2/3] fs: Document file_ra_state Matthew Wilcox (Oracle)
2021-04-07 20:18 ` [PATCH 3/3] mm/readahead: Adjust file_ra in readahead_expand Matthew Wilcox (Oracle)
2021-04-07 22:43 ` [PATCH 0/3] readahead improvements David Howells
2021-04-08 10:57 ` riteshh
2021-04-08 13:15   ` Brendan Gregg
2021-04-08 12:51 ` [PATCH] mm, netfs: Fix readahead bits David Howells
2021-04-08 13:09   ` Matthew Wilcox

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.