* [PATCH 03/12] readahead: Put pages in cache earlier
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
@ 2020-01-25 1:35 ` Matthew Wilcox
2020-01-25 19:44 ` Matthew Wilcox
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
` (2 subsequent siblings)
3 siblings, 1 reply; 14+ messages in thread
From: Matthew Wilcox @ 2020-01-25 1:35 UTC (permalink / raw)
To: linux-fsdevel
Cc: Matthew Wilcox (Oracle),
linux-mm, linux-kernel, linux-btrfs, linux-erofs, linux-ext4,
linux-f2fs-devel, linux-xfs, cluster-devel, ocfs2-devel
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
At allocation time, put the pages in the cache unless we're using
->readpages.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-btrfs@vger.kernel.org
Cc: linux-erofs@lists.ozlabs.org
Cc: linux-ext4@vger.kernel.org
Cc: linux-f2fs-devel@lists.sourceforge.net
Cc: linux-xfs@vger.kernel.org
Cc: cluster-devel@redhat.com
Cc: ocfs2-devel@oss.oracle.com
---
mm/readahead.c | 51 +++++++++++++++++++++++++++++++-------------------
1 file changed, 32 insertions(+), 19 deletions(-)
diff --git a/mm/readahead.c b/mm/readahead.c
index fc77d13af556..5a6676640f20 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -114,10 +114,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
EXPORT_SYMBOL(read_cache_pages);
static void read_pages(struct address_space *mapping, struct file *filp,
- struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
+ struct list_head *pages, pgoff_t start,
+ unsigned int nr_pages)
{
struct blk_plug plug;
- unsigned page_idx;
blk_start_plug(&plug);
@@ -125,18 +125,17 @@ static void read_pages(struct address_space *mapping, struct file *filp,
mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
- goto out;
- }
+ } else {
+ struct page *page;
+ unsigned long index;
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = lru_to_page(pages);
- list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
+ xa_for_each_range(&mapping->i_pages, index, page, start,
+ start + nr_pages - 1) {
mapping->a_ops->readpage(filp, page);
- put_page(page);
+ put_page(page);
+ }
}
-out:
blk_finish_plug(&plug);
}
@@ -157,9 +156,11 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
int page_idx;
+ pgoff_t page_offset;
unsigned long nr_pages = 0;
loff_t isize = i_size_read(inode);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
+ bool use_list = mapping->a_ops->readpages;
if (isize == 0)
goto out;
@@ -170,7 +171,7 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
* Preallocate as many pages as we will need.
*/
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
- pgoff_t page_offset = offset + page_idx;
+ page_offset = offset + page_idx;
if (page_offset > end_index)
break;
@@ -178,13 +179,14 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
page = xa_load(&mapping->i_pages, page_offset);
if (page && !xa_is_value(page)) {
/*
- * Page already present? Kick off the current batch of
- * contiguous pages before continuing with the next
- * batch.
+ * Page already present? Kick off the current batch
+ * of contiguous pages before continuing with the
+ * next batch.
*/
if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages,
- gfp_mask);
+ read_pages(mapping, filp, &page_pool,
+ page_offset - nr_pages,
+ nr_pages);
nr_pages = 0;
continue;
}
@@ -192,8 +194,18 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
- page->index = page_offset;
- list_add(&page->lru, &page_pool);
+ if (use_list) {
+ page->index = page_offset;
+ list_add(&page->lru, &page_pool);
+ } else if (!add_to_page_cache_lru(page, mapping, page_offset,
+ gfp_mask)) {
+ if (nr_pages)
+ read_pages(mapping, filp, &page_pool,
+ page_offset - nr_pages,
+ nr_pages);
+ nr_pages = 0;
+ continue;
+ }
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page);
nr_pages++;
@@ -205,7 +217,8 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
* will then handle the error.
*/
if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
+ read_pages(mapping, filp, &page_pool, page_offset - nr_pages,
+ nr_pages);
BUG_ON(!list_empty(&page_pool));
out:
return nr_pages;
--
2.24.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 03/12] readahead: Put pages in cache earlier
2020-01-25 1:35 ` [PATCH 03/12] readahead: Put pages in cache earlier Matthew Wilcox
@ 2020-01-25 19:44 ` Matthew Wilcox
0 siblings, 0 replies; 14+ messages in thread
From: Matthew Wilcox @ 2020-01-25 19:44 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-mm, linux-kernel, linux-btrfs, linux-erofs, linux-ext4,
linux-f2fs-devel, linux-xfs, cluster-devel, ocfs2-devel
On Fri, Jan 24, 2020 at 05:35:44PM -0800, Matthew Wilcox wrote:
> @@ -192,8 +194,18 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
> page = __page_cache_alloc(gfp_mask);
> if (!page)
> break;
> - page->index = page_offset;
> - list_add(&page->lru, &page_pool);
> + if (use_list) {
> + page->index = page_offset;
> + list_add(&page->lru, &page_pool);
> + } else if (!add_to_page_cache_lru(page, mapping, page_offset,
> + gfp_mask)) {
> + if (nr_pages)
> + read_pages(mapping, filp, &page_pool,
> + page_offset - nr_pages,
> + nr_pages);
> + nr_pages = 0;
This is missing a call to put_page().
> + continue;
> + }
> if (page_idx == nr_to_read - lookahead_size)
> SetPageReadahead(page);
> nr_pages++;
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
2020-01-25 1:35 ` [PATCH 03/12] readahead: Put pages in cache earlier Matthew Wilcox
@ 2020-01-25 1:35 ` Matthew Wilcox
2020-01-25 3:57 ` Randy Dunlap
2020-01-29 0:24 ` Dave Chinner
2020-01-25 1:35 ` [PATCH 06/12] btrfs: Convert from readpages to readahead Matthew Wilcox
2020-02-13 4:38 ` [PATCH 00/12] Change readahead API Andrew Morton
3 siblings, 2 replies; 14+ messages in thread
From: Matthew Wilcox @ 2020-01-25 1:35 UTC (permalink / raw)
To: linux-fsdevel
Cc: Matthew Wilcox (Oracle),
linux-mm, linux-kernel, linux-btrfs, linux-erofs, linux-ext4,
linux-f2fs-devel, linux-xfs, cluster-devel, ocfs2-devel
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
This replaces ->readpages with a saner interface:
- Return the number of pages not read instead of an ignored error code.
- Pages are already in the page cache when ->readahead is called.
- Implementation looks up the pages in the page cache instead of
having them passed in a linked list.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-btrfs@vger.kernel.org
Cc: linux-erofs@lists.ozlabs.org
Cc: linux-ext4@vger.kernel.org
Cc: linux-f2fs-devel@lists.sourceforge.net
Cc: linux-xfs@vger.kernel.org
Cc: cluster-devel@redhat.com
Cc: ocfs2-devel@oss.oracle.com
---
Documentation/filesystems/locking.rst | 7 ++++++-
Documentation/filesystems/vfs.rst | 11 +++++++++++
include/linux/fs.h | 2 ++
include/linux/pagemap.h | 12 ++++++++++++
mm/readahead.c | 13 ++++++++++++-
5 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 5057e4d9dcd1..d8a5dde914b5 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -239,6 +239,8 @@ prototypes::
int (*readpage)(struct file *, struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
+ unsigned (*readahead)(struct file *, struct address_space *,
+ pgoff_t start, unsigned nr_pages);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
@@ -271,7 +273,8 @@ writepage: yes, unlocks (see below)
readpage: yes, unlocks
writepages:
set_page_dirty no
-readpages:
+readahead: yes, unlocks
+readpages: no
write_begin: locks the page exclusive
write_end: yes, unlocks exclusive
bmap:
@@ -295,6 +298,8 @@ the request handler (/dev/loop).
->readpage() unlocks the page, either synchronously or via I/O
completion.
+->readahead() unlocks the page like ->readpage().
+
->readpages() populates the pagecache with the passed pages and starts
I/O against them. They come unlocked upon I/O completion.
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 7d4d09dd5e6d..bb06fb7b120b 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -706,6 +706,8 @@ cache in your filesystem. The following members are defined:
int (*readpage)(struct file *, struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
+ unsigned (*readahead)(struct file *filp, struct address_space *mapping,
+ pgoff_t start, unsigned nr_pages);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
@@ -781,6 +783,15 @@ cache in your filesystem. The following members are defined:
If defined, it should set the PageDirty flag, and the
PAGECACHE_TAG_DIRTY tag in the radix tree.
+``readahead``
+ called by the VM to read pages associated with the address_space
+ object. The pages are consecutive in the page cache and are
+ locked. The implementation should decrement the page refcount after
+ attempting I/O on each page. Usually the page will be unlocked by
+ the I/O completion handler. If the function does not attempt I/O on
+ some pages, return the number of pages which were not read so the
+ common code can unlock the pages for you.
+
``readpages``
called by the VM to read pages associated with the address_space
object. This is essentially just a vector version of readpage.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98e0349adb52..a10f3a72e5ac 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -375,6 +375,8 @@ struct address_space_operations {
*/
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
+ unsigned (*readahead)(struct file *, struct address_space *,
+ pgoff_t start, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 37a4d9e32cd3..2baafd236a82 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -630,6 +630,18 @@ static inline int add_to_page_cache(struct page *page,
return error;
}
+/*
+ * Only call this from a ->readahead implementation.
+ */
+static inline
+struct page *readahead_page(struct address_space *mapping, pgoff_t index)
+{
+ struct page *page = xa_load(&mapping->i_pages, index);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ return page;
+}
+
static inline unsigned long dir_pages(struct inode *inode)
{
return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>
diff --git a/mm/readahead.c b/mm/readahead.c
index 5a6676640f20..6d65dae6dad0 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -121,7 +121,18 @@ static void read_pages(struct address_space *mapping, struct file *filp,
blk_start_plug(&plug);
- if (mapping->a_ops->readpages) {
+ if (mapping->a_ops->readahead) {
+ unsigned left = mapping->a_ops->readahead(filp, mapping,
+ start, nr_pages);
+
+ while (left) {
+ struct page *page = readahead_page(mapping,
+ start + nr_pages - left - 1);
+ unlock_page(page);
+ put_page(page);
+ left--;
+ }
+ } else if (mapping->a_ops->readpages) {
mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
--
2.24.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
@ 2020-01-25 3:57 ` Randy Dunlap
2020-02-01 0:25 ` Matthew Wilcox
2020-01-29 0:24 ` Dave Chinner
1 sibling, 1 reply; 14+ messages in thread
From: Randy Dunlap @ 2020-01-25 3:57 UTC (permalink / raw)
To: Matthew Wilcox, linux-fsdevel
Cc: linux-mm, linux-kernel, linux-btrfs, linux-erofs, linux-ext4,
linux-f2fs-devel, linux-xfs, cluster-devel, ocfs2-devel
On 1/24/20 5:35 PM, Matthew Wilcox wrote:
> diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
> index 7d4d09dd5e6d..bb06fb7b120b 100644
> --- a/Documentation/filesystems/vfs.rst
> +++ b/Documentation/filesystems/vfs.rst
> @@ -706,6 +706,8 @@ cache in your filesystem. The following members are defined:
> int (*readpage)(struct file *, struct page *);
> int (*writepages)(struct address_space *, struct writeback_control *);
> int (*set_page_dirty)(struct page *page);
> + unsigned (*readahead)(struct file *filp, struct address_space *mapping,
> + pgoff_t start, unsigned nr_pages);
> int (*readpages)(struct file *filp, struct address_space *mapping,
> struct list_head *pages, unsigned nr_pages);
> int (*write_begin)(struct file *, struct address_space *mapping,
> @@ -781,6 +783,15 @@ cache in your filesystem. The following members are defined:
> If defined, it should set the PageDirty flag, and the
> PAGECACHE_TAG_DIRTY tag in the radix tree.
>
> +``readahead``
> + called by the VM to read pages associated with the address_space
> + object. The pages are consecutive in the page cache and are
> + locked. The implementation should decrement the page refcount after
> + attempting I/O on each page. Usually the page will be unlocked by
> + the I/O completion handler. If the function does not attempt I/O on
> + some pages, return the number of pages which were not read so the
> + common code can unlock the pages for you.
> +
Please use consistent indentation (tabs).
> ``readpages``
> called by the VM to read pages associated with the address_space
> object. This is essentially just a vector version of readpage.
cheers.
--
~Randy
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 3:57 ` Randy Dunlap
@ 2020-02-01 0:25 ` Matthew Wilcox
0 siblings, 0 replies; 14+ messages in thread
From: Matthew Wilcox @ 2020-02-01 0:25 UTC (permalink / raw)
To: Randy Dunlap
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Fri, Jan 24, 2020 at 07:57:40PM -0800, Randy Dunlap wrote:
> > +``readahead``
> > + called by the VM to read pages associated with the address_space
> > + object. The pages are consecutive in the page cache and are
> > + locked. The implementation should decrement the page refcount after
> > + attempting I/O on each page. Usually the page will be unlocked by
> > + the I/O completion handler. If the function does not attempt I/O on
> > + some pages, return the number of pages which were not read so the
> > + common code can unlock the pages for you.
> > +
>
> Please use consistent indentation (tabs).
This turned out not to be my fault. The vim rst ... mode? plugin?
Whatever it is, it's converting tabs to spaces! To fix it, I had to
rename the file to .txt, make the edits, then rename it back. This is
very poor behaviour.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
2020-01-25 3:57 ` Randy Dunlap
@ 2020-01-29 0:24 ` Dave Chinner
2020-01-30 8:00 ` Matthew Wilcox
1 sibling, 1 reply; 14+ messages in thread
From: Dave Chinner @ 2020-01-29 0:24 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Fri, Jan 24, 2020 at 05:35:45PM -0800, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
>
> This replaces ->readpages with a saner interface:
> - Return the number of pages not read instead of an ignored error code.
> - Pages are already in the page cache when ->readahead is called.
> - Implementation looks up the pages in the page cache instead of
> having them passed in a linked list.
....
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 5a6676640f20..6d65dae6dad0 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -121,7 +121,18 @@ static void read_pages(struct address_space *mapping, struct file *filp,
>
> blk_start_plug(&plug);
>
> - if (mapping->a_ops->readpages) {
> + if (mapping->a_ops->readahead) {
> + unsigned left = mapping->a_ops->readahead(filp, mapping,
> + start, nr_pages);
> +
> + while (left) {
> + struct page *page = readahead_page(mapping,
> + start + nr_pages - left - 1);
Off by one? start = 2, nr_pages = 2, left = 1, this looks up the
page at index 2, which is the one we issued IO on, not the one we
"left behind" which is at index 3.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-29 0:24 ` Dave Chinner
@ 2020-01-30 8:00 ` Matthew Wilcox
0 siblings, 0 replies; 14+ messages in thread
From: Matthew Wilcox @ 2020-01-30 8:00 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Wed, Jan 29, 2020 at 11:24:56AM +1100, Dave Chinner wrote:
> On Fri, Jan 24, 2020 at 05:35:45PM -0800, Matthew Wilcox wrote:
> > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> >
> > This replaces ->readpages with a saner interface:
> > - Return the number of pages not read instead of an ignored error code.
> > - Pages are already in the page cache when ->readahead is called.
> > - Implementation looks up the pages in the page cache instead of
> > having them passed in a linked list.
> ....
> > diff --git a/mm/readahead.c b/mm/readahead.c
> > index 5a6676640f20..6d65dae6dad0 100644
> > --- a/mm/readahead.c
> > +++ b/mm/readahead.c
> > @@ -121,7 +121,18 @@ static void read_pages(struct address_space *mapping, struct file *filp,
> >
> > blk_start_plug(&plug);
> >
> > - if (mapping->a_ops->readpages) {
> > + if (mapping->a_ops->readahead) {
> > + unsigned left = mapping->a_ops->readahead(filp, mapping,
> > + start, nr_pages);
> > +
> > + while (left) {
> > + struct page *page = readahead_page(mapping,
> > + start + nr_pages - left - 1);
>
> Off by one? start = 2, nr_pages = 2, left = 1, this looks up the
> page at index 2, which is the one we issued IO on, not the one we
> "left behind" which is at index 3.
Yup. I originally had:
while (left--) ...
but decided that was too confusing and didn't quite complete that thought.
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 06/12] btrfs: Convert from readpages to readahead
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
2020-01-25 1:35 ` [PATCH 03/12] readahead: Put pages in cache earlier Matthew Wilcox
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
@ 2020-01-25 1:35 ` Matthew Wilcox
2020-01-29 0:46 ` Dave Chinner
2020-02-13 4:38 ` [PATCH 00/12] Change readahead API Andrew Morton
3 siblings, 1 reply; 14+ messages in thread
From: Matthew Wilcox @ 2020-01-25 1:35 UTC (permalink / raw)
To: linux-fsdevel
Cc: Matthew Wilcox (Oracle), linux-mm, linux-kernel, linux-btrfs
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Use the new readahead operation in btrfs
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-btrfs@vger.kernel.org
---
fs/btrfs/extent_io.c | 15 ++++-----------
fs/btrfs/extent_io.h | 2 +-
fs/btrfs/inode.c | 18 +++++++++---------
3 files changed, 14 insertions(+), 21 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2f4802f405a2..b1e2acbec165 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4283,7 +4283,7 @@ int extent_writepages(struct address_space *mapping,
return ret;
}
-int extent_readpages(struct address_space *mapping, struct list_head *pages,
+unsigned extent_readahead(struct address_space *mapping, pgoff_t start,
unsigned nr_pages)
{
struct bio *bio = NULL;
@@ -4294,20 +4294,13 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
int nr = 0;
u64 prev_em_start = (u64)-1;
- while (!list_empty(pages)) {
+ while (nr_pages) {
u64 contig_end = 0;
- for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
- struct page *page = lru_to_page(pages);
+ for (nr = 0; nr < ARRAY_SIZE(pagepool) && nr_pages--;) {
+ struct page *page = readahead_page(mapping, start++);
prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping))) {
- put_page(page);
- break;
- }
-
pagepool[nr++] = page;
contig_end = page_offset(page) + PAGE_SIZE - 1;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a8551a1f56e2..d0f154766a02 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -200,7 +200,7 @@ int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
-int extent_readpages(struct address_space *mapping, struct list_head *pages,
+unsigned extent_readahead(struct address_space *mapping, pgoff_t start,
unsigned nr_pages);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c70baafb2a39..4f223b4f7dff 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5395,8 +5395,8 @@ static void evict_inode_truncate_pages(struct inode *inode)
/*
* Keep looping until we have no more ranges in the io tree.
- * We can have ongoing bios started by readpages (called from readahead)
- * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+ * We can have ongoing bios started by readahead that have
+ * their endio callback (extent_io.c:end_bio_extent_readpage)
* still in progress (unlocked the pages in the bio but did not yet
* unlocked the ranges in the io tree). Therefore this means some
* ranges can still be locked and eviction started because before
@@ -7586,11 +7586,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
* for it to complete) and then invalidate the pages for
* this range (through invalidate_inode_pages2_range()),
* but that can lead us to a deadlock with a concurrent
- * call to readpages() (a buffered read or a defrag call
+ * call to readahead (a buffered read or a defrag call
* triggered a readahead) on a page lock due to an
* ordered dio extent we created before but did not have
* yet a corresponding bio submitted (whence it can not
- * complete), which makes readpages() wait for that
+ * complete), which makes readahead wait for that
* ordered extent to complete while holding a lock on
* that page.
*/
@@ -8829,11 +8829,11 @@ static int btrfs_writepages(struct address_space *mapping,
return extent_writepages(mapping, wbc);
}
-static int
-btrfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static unsigned
+btrfs_readahead(struct file *file, struct address_space *mapping,
+ pgoff_t start, unsigned nr_pages)
{
- return extent_readpages(mapping, pages, nr_pages);
+ return extent_readahead(mapping, start, nr_pages);
}
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
@@ -11045,7 +11045,7 @@ static const struct address_space_operations btrfs_aops = {
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
- .readpages = btrfs_readpages,
+ .readahead = btrfs_readahead,
.direct_IO = btrfs_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
--
2.24.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 06/12] btrfs: Convert from readpages to readahead
2020-01-25 1:35 ` [PATCH 06/12] btrfs: Convert from readpages to readahead Matthew Wilcox
@ 2020-01-29 0:46 ` Dave Chinner
2020-01-30 8:09 ` Matthew Wilcox
0 siblings, 1 reply; 14+ messages in thread
From: Dave Chinner @ 2020-01-29 0:46 UTC (permalink / raw)
To: Matthew Wilcox; +Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs
On Fri, Jan 24, 2020 at 05:35:47PM -0800, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
>
> Use the new readahead operation in btrfs
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: linux-btrfs@vger.kernel.org
> ---
> fs/btrfs/extent_io.c | 15 ++++-----------
> fs/btrfs/extent_io.h | 2 +-
> fs/btrfs/inode.c | 18 +++++++++---------
> 3 files changed, 14 insertions(+), 21 deletions(-)
>
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 2f4802f405a2..b1e2acbec165 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -4283,7 +4283,7 @@ int extent_writepages(struct address_space *mapping,
> return ret;
> }
>
> -int extent_readpages(struct address_space *mapping, struct list_head *pages,
> +unsigned extent_readahead(struct address_space *mapping, pgoff_t start,
> unsigned nr_pages)
> {
> struct bio *bio = NULL;
> @@ -4294,20 +4294,13 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
> int nr = 0;
> u64 prev_em_start = (u64)-1;
>
> - while (!list_empty(pages)) {
> + while (nr_pages) {
> u64 contig_end = 0;
>
> - for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
> - struct page *page = lru_to_page(pages);
> + for (nr = 0; nr < ARRAY_SIZE(pagepool) && nr_pages--;) {
What is stopping nr_pages from going negative here, and then looping
forever on the outer nr_pages loop? Perhaps "while(nr_pages > 0) {"
would be better there?
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 06/12] btrfs: Convert from readpages to readahead
2020-01-29 0:46 ` Dave Chinner
@ 2020-01-30 8:09 ` Matthew Wilcox
2020-01-31 2:17 ` Dave Chinner
0 siblings, 1 reply; 14+ messages in thread
From: Matthew Wilcox @ 2020-01-30 8:09 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs
On Wed, Jan 29, 2020 at 11:46:09AM +1100, Dave Chinner wrote:
> On Fri, Jan 24, 2020 at 05:35:47PM -0800, Matthew Wilcox wrote:
> > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> >
> > Use the new readahead operation in btrfs
> >
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > Cc: linux-btrfs@vger.kernel.org
> > ---
> > fs/btrfs/extent_io.c | 15 ++++-----------
> > fs/btrfs/extent_io.h | 2 +-
> > fs/btrfs/inode.c | 18 +++++++++---------
> > 3 files changed, 14 insertions(+), 21 deletions(-)
> >
> > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> > index 2f4802f405a2..b1e2acbec165 100644
> > --- a/fs/btrfs/extent_io.c
> > +++ b/fs/btrfs/extent_io.c
> > @@ -4283,7 +4283,7 @@ int extent_writepages(struct address_space *mapping,
> > return ret;
> > }
> >
> > -int extent_readpages(struct address_space *mapping, struct list_head *pages,
> > +unsigned extent_readahead(struct address_space *mapping, pgoff_t start,
> > unsigned nr_pages)
> > {
> > struct bio *bio = NULL;
> > @@ -4294,20 +4294,13 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
> > int nr = 0;
> > u64 prev_em_start = (u64)-1;
> >
> > - while (!list_empty(pages)) {
> > + while (nr_pages) {
> > u64 contig_end = 0;
> >
> > - for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
> > - struct page *page = lru_to_page(pages);
> > + for (nr = 0; nr < ARRAY_SIZE(pagepool) && nr_pages--;) {
>
> What is stopping nr_pages from going negative here, and then looping
> forever on the outer nr_pages loop? Perhaps "while(nr_pages > 0) {"
> would be better there?
Ugh, nr_pages is unsigned, so that's no good. Maybe make this a more
conventional loop ...
while (nr_pages) {
u64 contig_end = 0;
for (nr = 0; nr < ARRAY_SIZE(pagepool); nr++) {
struct page *page = readahead_page(mapping, start++);
prefetchw(&page->flags);
pagepool[nr] = page;
contig_end = page_offset(page) + PAGE_SIZE - 1;
if (--nr_pages == 0)
break;
}
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 06/12] btrfs: Convert from readpages to readahead
2020-01-30 8:09 ` Matthew Wilcox
@ 2020-01-31 2:17 ` Dave Chinner
0 siblings, 0 replies; 14+ messages in thread
From: Dave Chinner @ 2020-01-31 2:17 UTC (permalink / raw)
To: Matthew Wilcox; +Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs
On Thu, Jan 30, 2020 at 12:09:39AM -0800, Matthew Wilcox wrote:
> On Wed, Jan 29, 2020 at 11:46:09AM +1100, Dave Chinner wrote:
> > On Fri, Jan 24, 2020 at 05:35:47PM -0800, Matthew Wilcox wrote:
> > > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> > >
> > > Use the new readahead operation in btrfs
> > >
> > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > Cc: linux-btrfs@vger.kernel.org
> > > ---
> > > fs/btrfs/extent_io.c | 15 ++++-----------
> > > fs/btrfs/extent_io.h | 2 +-
> > > fs/btrfs/inode.c | 18 +++++++++---------
> > > 3 files changed, 14 insertions(+), 21 deletions(-)
> > >
> > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> > > index 2f4802f405a2..b1e2acbec165 100644
> > > --- a/fs/btrfs/extent_io.c
> > > +++ b/fs/btrfs/extent_io.c
> > > @@ -4283,7 +4283,7 @@ int extent_writepages(struct address_space *mapping,
> > > return ret;
> > > }
> > >
> > > -int extent_readpages(struct address_space *mapping, struct list_head *pages,
> > > +unsigned extent_readahead(struct address_space *mapping, pgoff_t start,
> > > unsigned nr_pages)
> > > {
> > > struct bio *bio = NULL;
> > > @@ -4294,20 +4294,13 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
> > > int nr = 0;
> > > u64 prev_em_start = (u64)-1;
> > >
> > > - while (!list_empty(pages)) {
> > > + while (nr_pages) {
> > > u64 contig_end = 0;
> > >
> > > - for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
> > > - struct page *page = lru_to_page(pages);
> > > + for (nr = 0; nr < ARRAY_SIZE(pagepool) && nr_pages--;) {
> >
> > What is stopping nr_pages from going negative here, and then looping
> > forever on the outer nr_pages loop? Perhaps "while(nr_pages > 0) {"
> > would be better there?
>
> Ugh, nr_pages is unsigned, so that's no good. Maybe make this a more
> conventional loop ...
>
> while (nr_pages) {
> u64 contig_end = 0;
>
> for (nr = 0; nr < ARRAY_SIZE(pagepool); nr++) {
> struct page *page = readahead_page(mapping, start++);
>
> prefetchw(&page->flags);
> pagepool[nr] = page;
> contig_end = page_offset(page) + PAGE_SIZE - 1;
> if (--nr_pages == 0)
> break;
> }
Looks like it solves the problem :)
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 00/12] Change readahead API
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
` (2 preceding siblings ...)
2020-01-25 1:35 ` [PATCH 06/12] btrfs: Convert from readpages to readahead Matthew Wilcox
@ 2020-02-13 4:38 ` Andrew Morton
2020-02-13 13:43 ` Matthew Wilcox
3 siblings, 1 reply; 14+ messages in thread
From: Andrew Morton @ 2020-02-13 4:38 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Fri, 24 Jan 2020 17:35:41 -0800 Matthew Wilcox <willy@infradead.org> wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
>
> This series adds a readahead address_space operation to eventually
> replace the readpages operation. The key difference is that
> pages are added to the page cache as they are allocated (and
> then looked up by the filesystem) instead of passing them on a
> list to the readpages operation and having the filesystem add
> them to the page cache. It's a net reduction in code for each
> implementation, more efficient than walking a list, and solves
> the direct-write vs buffered-read problem reported by yu kuai at
> https://lore.kernel.org/linux-fsdevel/20200116063601.39201-1-yukuai3@huawei.com/
It is unclear which patch fixes this, and how it does so?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 00/12] Change readahead API
2020-02-13 4:38 ` [PATCH 00/12] Change readahead API Andrew Morton
@ 2020-02-13 13:43 ` Matthew Wilcox
0 siblings, 0 replies; 14+ messages in thread
From: Matthew Wilcox @ 2020-02-13 13:43 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel, Mark Fasheh, Joel Becker, Joseph Qi, Bob Peterson,
Andreas Gruenbacher
On Wed, Feb 12, 2020 at 08:38:52PM -0800, Andrew Morton wrote:
> On Fri, 24 Jan 2020 17:35:41 -0800 Matthew Wilcox <willy@infradead.org> wrote:
>
> > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> >
> > This series adds a readahead address_space operation to eventually
> > replace the readpages operation. The key difference is that
> > pages are added to the page cache as they are allocated (and
> > then looked up by the filesystem) instead of passing them on a
> > list to the readpages operation and having the filesystem add
> > them to the page cache. It's a net reduction in code for each
> > implementation, more efficient than walking a list, and solves
> > the direct-write vs buffered-read problem reported by yu kuai at
> > https://lore.kernel.org/linux-fsdevel/20200116063601.39201-1-yukuai3@huawei.com/
>
> It is unclear which patch fixes this, and how it does so?
I suppose the problem isn't fixed until patch 13/13 is applied.
What yu kuai is seeing is a race where readahead allocates a page,
then passes it to iomap_readpages, which calls xfs_read_iomap_begin()
which looks up the extent. Then thread 2 does DIO which modifies the
extent, because there's nothing to say that thread 1 is still using it.
With this patch series, the readpages code puts the locked pages in the
cache before calling iomap_readpages, so any racing write will block on
the locked page until readahead is completed.
If you're tempted to put this into -mm, I have a couple of new changes;
one to fix a kernel-doc warning for mpage_readahead() and one to add
kernel-doc for iomap_readahead():
+++ b/fs/mpage.c
@@ -339,9 +339,7 @@
/**
* mpage_readahead - start reads against pages
- * @mapping: the address_space
- * @start: The number of the first page to read.
- * @nr_pages: The number of consecutive pages to read.
+ * @rac: Describes which pages to read.
* @get_block: The filesystem's block mapper function.
*
* This function walks the pages and the blocks within each page, building and
+++ b/fs/iomap/buffered-io.c
@@ -395,6 +395,21 @@
return done;
}
+/**
+ * iomap_readahead - Attempt to read pages from a file.
+ * @rac: Describes the pages to be read.
+ * @ops: The operations vector for the filesystem.
+ *
+ * This function is for filesystems to call to implement their readahead
+ * address_space operation.
+ *
+ * Context: The file is pinned by the caller, and the pages to be read are
+ * all locked and have an elevated refcount. This function will unlock
+ * the pages (once I/O has completed on them, or I/O has been determined to
+ * not be necessary). It will also decrease the refcount once the pages
+ * have been submitted for I/O. After this point, the page may be removed
+ * from the page cache, and should not be referenced.
+ */
void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
{
struct inode *inode = rac->mapping->host;
I'll do a v6 with those changes soon, but I would really like a bit more
review from filesystem people, particularly ocfs2 and gfs2.
^ permalink raw reply [flat|nested] 14+ messages in thread