* [PATCH 03/12] readahead: Put pages in cache earlier
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
@ 2020-01-25 1:35 ` Matthew Wilcox
2020-01-25 19:44 ` Matthew Wilcox
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
` (2 subsequent siblings)
3 siblings, 1 reply; 13+ messages in thread
From: Matthew Wilcox @ 2020-01-25 1:35 UTC (permalink / raw)
To: linux-fsdevel
Cc: Matthew Wilcox (Oracle),
linux-mm, linux-kernel, linux-btrfs, linux-erofs, linux-ext4,
linux-f2fs-devel, linux-xfs, cluster-devel, ocfs2-devel
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
At allocation time, put the pages in the cache unless we're using
->readpages.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-btrfs@vger.kernel.org
Cc: linux-erofs@lists.ozlabs.org
Cc: linux-ext4@vger.kernel.org
Cc: linux-f2fs-devel@lists.sourceforge.net
Cc: linux-xfs@vger.kernel.org
Cc: cluster-devel@redhat.com
Cc: ocfs2-devel@oss.oracle.com
---
mm/readahead.c | 51 +++++++++++++++++++++++++++++++-------------------
1 file changed, 32 insertions(+), 19 deletions(-)
diff --git a/mm/readahead.c b/mm/readahead.c
index fc77d13af556..5a6676640f20 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -114,10 +114,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
EXPORT_SYMBOL(read_cache_pages);
static void read_pages(struct address_space *mapping, struct file *filp,
- struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
+ struct list_head *pages, pgoff_t start,
+ unsigned int nr_pages)
{
struct blk_plug plug;
- unsigned page_idx;
blk_start_plug(&plug);
@@ -125,18 +125,17 @@ static void read_pages(struct address_space *mapping, struct file *filp,
mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
- goto out;
- }
+ } else {
+ struct page *page;
+ unsigned long index;
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = lru_to_page(pages);
- list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
+ xa_for_each_range(&mapping->i_pages, index, page, start,
+ start + nr_pages - 1) {
mapping->a_ops->readpage(filp, page);
- put_page(page);
+ put_page(page);
+ }
}
-out:
blk_finish_plug(&plug);
}
@@ -157,9 +156,11 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
int page_idx;
+ pgoff_t page_offset;
unsigned long nr_pages = 0;
loff_t isize = i_size_read(inode);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
+ bool use_list = mapping->a_ops->readpages;
if (isize == 0)
goto out;
@@ -170,7 +171,7 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
* Preallocate as many pages as we will need.
*/
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
- pgoff_t page_offset = offset + page_idx;
+ page_offset = offset + page_idx;
if (page_offset > end_index)
break;
@@ -178,13 +179,14 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
page = xa_load(&mapping->i_pages, page_offset);
if (page && !xa_is_value(page)) {
/*
- * Page already present? Kick off the current batch of
- * contiguous pages before continuing with the next
- * batch.
+ * Page already present? Kick off the current batch
+ * of contiguous pages before continuing with the
+ * next batch.
*/
if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages,
- gfp_mask);
+ read_pages(mapping, filp, &page_pool,
+ page_offset - nr_pages,
+ nr_pages);
nr_pages = 0;
continue;
}
@@ -192,8 +194,18 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
- page->index = page_offset;
- list_add(&page->lru, &page_pool);
+ if (use_list) {
+ page->index = page_offset;
+ list_add(&page->lru, &page_pool);
+ } else if (!add_to_page_cache_lru(page, mapping, page_offset,
+ gfp_mask)) {
+ if (nr_pages)
+ read_pages(mapping, filp, &page_pool,
+ page_offset - nr_pages,
+ nr_pages);
+ nr_pages = 0;
+ continue;
+ }
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page);
nr_pages++;
@@ -205,7 +217,8 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
* will then handle the error.
*/
if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
+ read_pages(mapping, filp, &page_pool, page_offset - nr_pages,
+ nr_pages);
BUG_ON(!list_empty(&page_pool));
out:
return nr_pages;
--
2.24.1
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH 03/12] readahead: Put pages in cache earlier
2020-01-25 1:35 ` [PATCH 03/12] readahead: Put pages in cache earlier Matthew Wilcox
@ 2020-01-25 19:44 ` Matthew Wilcox
0 siblings, 0 replies; 13+ messages in thread
From: Matthew Wilcox @ 2020-01-25 19:44 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-mm, linux-kernel, linux-btrfs, linux-erofs, linux-ext4,
linux-f2fs-devel, linux-xfs, cluster-devel, ocfs2-devel
On Fri, Jan 24, 2020 at 05:35:44PM -0800, Matthew Wilcox wrote:
> @@ -192,8 +194,18 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
> page = __page_cache_alloc(gfp_mask);
> if (!page)
> break;
> - page->index = page_offset;
> - list_add(&page->lru, &page_pool);
> + if (use_list) {
> + page->index = page_offset;
> + list_add(&page->lru, &page_pool);
> + } else if (!add_to_page_cache_lru(page, mapping, page_offset,
> + gfp_mask)) {
> + if (nr_pages)
> + read_pages(mapping, filp, &page_pool,
> + page_offset - nr_pages,
> + nr_pages);
> + nr_pages = 0;
This is missing a call to put_page().
> + continue;
> + }
> if (page_idx == nr_to_read - lookahead_size)
> SetPageReadahead(page);
> nr_pages++;
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
2020-01-25 1:35 ` [PATCH 03/12] readahead: Put pages in cache earlier Matthew Wilcox
@ 2020-01-25 1:35 ` Matthew Wilcox
2020-01-25 3:57 ` Randy Dunlap
2020-01-29 0:24 ` Dave Chinner
2020-01-25 1:35 ` [PATCH 12/12] iomap: Convert from readpages to readahead Matthew Wilcox
2020-02-13 4:38 ` [PATCH 00/12] Change readahead API Andrew Morton
3 siblings, 2 replies; 13+ messages in thread
From: Matthew Wilcox @ 2020-01-25 1:35 UTC (permalink / raw)
To: linux-fsdevel
Cc: Matthew Wilcox (Oracle),
linux-mm, linux-kernel, linux-btrfs, linux-erofs, linux-ext4,
linux-f2fs-devel, linux-xfs, cluster-devel, ocfs2-devel
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
This replaces ->readpages with a saner interface:
- Return the number of pages not read instead of an ignored error code.
- Pages are already in the page cache when ->readahead is called.
- Implementation looks up the pages in the page cache instead of
having them passed in a linked list.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-btrfs@vger.kernel.org
Cc: linux-erofs@lists.ozlabs.org
Cc: linux-ext4@vger.kernel.org
Cc: linux-f2fs-devel@lists.sourceforge.net
Cc: linux-xfs@vger.kernel.org
Cc: cluster-devel@redhat.com
Cc: ocfs2-devel@oss.oracle.com
---
Documentation/filesystems/locking.rst | 7 ++++++-
Documentation/filesystems/vfs.rst | 11 +++++++++++
include/linux/fs.h | 2 ++
include/linux/pagemap.h | 12 ++++++++++++
mm/readahead.c | 13 ++++++++++++-
5 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 5057e4d9dcd1..d8a5dde914b5 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -239,6 +239,8 @@ prototypes::
int (*readpage)(struct file *, struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
+ unsigned (*readahead)(struct file *, struct address_space *,
+ pgoff_t start, unsigned nr_pages);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
@@ -271,7 +273,8 @@ writepage: yes, unlocks (see below)
readpage: yes, unlocks
writepages:
set_page_dirty no
-readpages:
+readahead: yes, unlocks
+readpages: no
write_begin: locks the page exclusive
write_end: yes, unlocks exclusive
bmap:
@@ -295,6 +298,8 @@ the request handler (/dev/loop).
->readpage() unlocks the page, either synchronously or via I/O
completion.
+->readahead() unlocks the page like ->readpage().
+
->readpages() populates the pagecache with the passed pages and starts
I/O against them. They come unlocked upon I/O completion.
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 7d4d09dd5e6d..bb06fb7b120b 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -706,6 +706,8 @@ cache in your filesystem. The following members are defined:
int (*readpage)(struct file *, struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
+ unsigned (*readahead)(struct file *filp, struct address_space *mapping,
+ pgoff_t start, unsigned nr_pages);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
@@ -781,6 +783,15 @@ cache in your filesystem. The following members are defined:
If defined, it should set the PageDirty flag, and the
PAGECACHE_TAG_DIRTY tag in the radix tree.
+``readahead``
+ called by the VM to read pages associated with the address_space
+ object. The pages are consecutive in the page cache and are
+ locked. The implementation should decrement the page refcount after
+ attempting I/O on each page. Usually the page will be unlocked by
+ the I/O completion handler. If the function does not attempt I/O on
+ some pages, return the number of pages which were not read so the
+ common code can unlock the pages for you.
+
``readpages``
called by the VM to read pages associated with the address_space
object. This is essentially just a vector version of readpage.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98e0349adb52..a10f3a72e5ac 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -375,6 +375,8 @@ struct address_space_operations {
*/
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
+ unsigned (*readahead)(struct file *, struct address_space *,
+ pgoff_t start, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 37a4d9e32cd3..2baafd236a82 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -630,6 +630,18 @@ static inline int add_to_page_cache(struct page *page,
return error;
}
+/*
+ * Only call this from a ->readahead implementation.
+ */
+static inline
+struct page *readahead_page(struct address_space *mapping, pgoff_t index)
+{
+ struct page *page = xa_load(&mapping->i_pages, index);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ return page;
+}
+
static inline unsigned long dir_pages(struct inode *inode)
{
return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>
diff --git a/mm/readahead.c b/mm/readahead.c
index 5a6676640f20..6d65dae6dad0 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -121,7 +121,18 @@ static void read_pages(struct address_space *mapping, struct file *filp,
blk_start_plug(&plug);
- if (mapping->a_ops->readpages) {
+ if (mapping->a_ops->readahead) {
+ unsigned left = mapping->a_ops->readahead(filp, mapping,
+ start, nr_pages);
+
+ while (left) {
+ struct page *page = readahead_page(mapping,
+ start + nr_pages - left - 1);
+ unlock_page(page);
+ put_page(page);
+ left--;
+ }
+ } else if (mapping->a_ops->readpages) {
mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
--
2.24.1
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
@ 2020-01-25 3:57 ` Randy Dunlap
2020-02-01 0:25 ` Matthew Wilcox
2020-01-29 0:24 ` Dave Chinner
1 sibling, 1 reply; 13+ messages in thread
From: Randy Dunlap @ 2020-01-25 3:57 UTC (permalink / raw)
To: Matthew Wilcox, linux-fsdevel
Cc: linux-mm, linux-kernel, linux-btrfs, linux-erofs, linux-ext4,
linux-f2fs-devel, linux-xfs, cluster-devel, ocfs2-devel
On 1/24/20 5:35 PM, Matthew Wilcox wrote:
> diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
> index 7d4d09dd5e6d..bb06fb7b120b 100644
> --- a/Documentation/filesystems/vfs.rst
> +++ b/Documentation/filesystems/vfs.rst
> @@ -706,6 +706,8 @@ cache in your filesystem. The following members are defined:
> int (*readpage)(struct file *, struct page *);
> int (*writepages)(struct address_space *, struct writeback_control *);
> int (*set_page_dirty)(struct page *page);
> + unsigned (*readahead)(struct file *filp, struct address_space *mapping,
> + pgoff_t start, unsigned nr_pages);
> int (*readpages)(struct file *filp, struct address_space *mapping,
> struct list_head *pages, unsigned nr_pages);
> int (*write_begin)(struct file *, struct address_space *mapping,
> @@ -781,6 +783,15 @@ cache in your filesystem. The following members are defined:
> If defined, it should set the PageDirty flag, and the
> PAGECACHE_TAG_DIRTY tag in the radix tree.
>
> +``readahead``
> + called by the VM to read pages associated with the address_space
> + object. The pages are consecutive in the page cache and are
> + locked. The implementation should decrement the page refcount after
> + attempting I/O on each page. Usually the page will be unlocked by
> + the I/O completion handler. If the function does not attempt I/O on
> + some pages, return the number of pages which were not read so the
> + common code can unlock the pages for you.
> +
Please use consistent indentation (tabs).
> ``readpages``
> called by the VM to read pages associated with the address_space
> object. This is essentially just a vector version of readpage.
cheers.
--
~Randy
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 3:57 ` Randy Dunlap
@ 2020-02-01 0:25 ` Matthew Wilcox
0 siblings, 0 replies; 13+ messages in thread
From: Matthew Wilcox @ 2020-02-01 0:25 UTC (permalink / raw)
To: Randy Dunlap
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Fri, Jan 24, 2020 at 07:57:40PM -0800, Randy Dunlap wrote:
> > +``readahead``
> > + called by the VM to read pages associated with the address_space
> > + object. The pages are consecutive in the page cache and are
> > + locked. The implementation should decrement the page refcount after
> > + attempting I/O on each page. Usually the page will be unlocked by
> > + the I/O completion handler. If the function does not attempt I/O on
> > + some pages, return the number of pages which were not read so the
> > + common code can unlock the pages for you.
> > +
>
> Please use consistent indentation (tabs).
This turned out not to be my fault. The vim rst mode (or plugin, or
whatever it is) converts tabs to spaces! To work around it, I had to
rename the file to .txt, make the edits, then rename it back. This is
very poor behaviour.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
2020-01-25 3:57 ` Randy Dunlap
@ 2020-01-29 0:24 ` Dave Chinner
2020-01-30 8:00 ` Matthew Wilcox
1 sibling, 1 reply; 13+ messages in thread
From: Dave Chinner @ 2020-01-29 0:24 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Fri, Jan 24, 2020 at 05:35:45PM -0800, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
>
> This replaces ->readpages with a saner interface:
> - Return the number of pages not read instead of an ignored error code.
> - Pages are already in the page cache when ->readahead is called.
> - Implementation looks up the pages in the page cache instead of
> having them passed in a linked list.
....
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 5a6676640f20..6d65dae6dad0 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -121,7 +121,18 @@ static void read_pages(struct address_space *mapping, struct file *filp,
>
> blk_start_plug(&plug);
>
> - if (mapping->a_ops->readpages) {
> + if (mapping->a_ops->readahead) {
> + unsigned left = mapping->a_ops->readahead(filp, mapping,
> + start, nr_pages);
> +
> + while (left) {
> + struct page *page = readahead_page(mapping,
> + start + nr_pages - left - 1);
Off by one? start = 2, nr_pages = 2, left = 1, this looks up the
page at index 2, which is the one we issued IO on, not the one we
"left behind" which is at index 3.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-29 0:24 ` Dave Chinner
@ 2020-01-30 8:00 ` Matthew Wilcox
0 siblings, 0 replies; 13+ messages in thread
From: Matthew Wilcox @ 2020-01-30 8:00 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Wed, Jan 29, 2020 at 11:24:56AM +1100, Dave Chinner wrote:
> On Fri, Jan 24, 2020 at 05:35:45PM -0800, Matthew Wilcox wrote:
> > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> >
> > This replaces ->readpages with a saner interface:
> > - Return the number of pages not read instead of an ignored error code.
> > - Pages are already in the page cache when ->readahead is called.
> > - Implementation looks up the pages in the page cache instead of
> > having them passed in a linked list.
> ....
> > diff --git a/mm/readahead.c b/mm/readahead.c
> > index 5a6676640f20..6d65dae6dad0 100644
> > --- a/mm/readahead.c
> > +++ b/mm/readahead.c
> > @@ -121,7 +121,18 @@ static void read_pages(struct address_space *mapping, struct file *filp,
> >
> > blk_start_plug(&plug);
> >
> > - if (mapping->a_ops->readpages) {
> > + if (mapping->a_ops->readahead) {
> > + unsigned left = mapping->a_ops->readahead(filp, mapping,
> > + start, nr_pages);
> > +
> > + while (left) {
> > + struct page *page = readahead_page(mapping,
> > + start + nr_pages - left - 1);
>
> Off by one? start = 2, nr_pages = 2, left = 1, this looks up the
> page at index 2, which is the one we issued IO on, not the one we
> "left behind" which is at index 3.
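Yup. I originally had:
while (left--) ...
then decided that was too confusing and moved the decrement to the end of
the loop body, but didn't quite complete that thought: the "- 1" in the
index calculation is a leftover from the old form and needs to be removed.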
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH 12/12] iomap: Convert from readpages to readahead
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
2020-01-25 1:35 ` [PATCH 03/12] readahead: Put pages in cache earlier Matthew Wilcox
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
@ 2020-01-25 1:35 ` Matthew Wilcox
2020-01-29 1:38 ` Dave Chinner
2020-02-13 4:38 ` [PATCH 00/12] Change readahead API Andrew Morton
3 siblings, 1 reply; 13+ messages in thread
From: Matthew Wilcox @ 2020-01-25 1:35 UTC (permalink / raw)
To: linux-fsdevel; +Cc: Matthew Wilcox (Oracle), linux-mm, linux-kernel, linux-xfs
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Use the new readahead operation in XFS and iomap.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-xfs@vger.kernel.org
---
fs/iomap/buffered-io.c | 72 +++++++++---------------------------------
fs/iomap/trace.h | 2 +-
fs/xfs/xfs_aops.c | 10 +++---
include/linux/iomap.h | 2 +-
4 files changed, 22 insertions(+), 64 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 1e2f3cc4579b..07aedd359c4b 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -216,7 +216,6 @@ struct iomap_readpage_ctx {
bool cur_page_in_bio;
bool is_readahead;
struct bio *bio;
- struct list_head *pages;
};
static void
@@ -367,36 +366,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_readpage);
-static struct page *
-iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
- loff_t length, loff_t *done)
-{
- while (!list_empty(pages)) {
- struct page *page = lru_to_page(pages);
-
- if (page_offset(page) >= (u64)pos + length)
- break;
-
- list_del(&page->lru);
- if (!add_to_page_cache_lru(page, inode->i_mapping, page->index,
- GFP_NOFS))
- return page;
-
- /*
- * If we already have a page in the page cache at index we are
- * done. Upper layers don't care if it is uptodate after the
- * readpages call itself as every page gets checked again once
- * actually needed.
- */
- *done += PAGE_SIZE;
- put_page(page);
- }
-
- return NULL;
-}
-
static loff_t
-iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
+iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length,
void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_readpage_ctx *ctx = data;
@@ -410,10 +381,8 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
ctx->cur_page = NULL;
}
if (!ctx->cur_page) {
- ctx->cur_page = iomap_next_page(inode, ctx->pages,
- pos, length, &done);
- if (!ctx->cur_page)
- break;
+ ctx->cur_page = readahead_page(inode->i_mapping,
+ pos / PAGE_SIZE);
ctx->cur_page_in_bio = false;
}
ret = iomap_readpage_actor(inode, pos + done, length - done,
@@ -423,48 +392,37 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
return done;
}
-int
-iomap_readpages(struct address_space *mapping, struct list_head *pages,
+unsigned
+iomap_readahead(struct address_space *mapping, pgoff_t start,
unsigned nr_pages, const struct iomap_ops *ops)
{
struct iomap_readpage_ctx ctx = {
- .pages = pages,
.is_readahead = true,
};
- loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
- loff_t last = page_offset(list_entry(pages->next, struct page, lru));
- loff_t length = last - pos + PAGE_SIZE, ret = 0;
+ loff_t pos = start * PAGE_SIZE;
+ loff_t length = nr_pages * PAGE_SIZE;
- trace_iomap_readpages(mapping->host, nr_pages);
+ trace_iomap_readahead(mapping->host, nr_pages);
while (length > 0) {
- ret = iomap_apply(mapping->host, pos, length, 0, ops,
- &ctx, iomap_readpages_actor);
+ loff_t ret = iomap_apply(mapping->host, pos, length, 0, ops,
+ &ctx, iomap_readahead_actor);
if (ret <= 0) {
WARN_ON_ONCE(ret == 0);
- goto done;
+ break;
}
pos += ret;
length -= ret;
}
- ret = 0;
-done:
+
if (ctx.bio)
submit_bio(ctx.bio);
- if (ctx.cur_page) {
- if (!ctx.cur_page_in_bio)
- unlock_page(ctx.cur_page);
+ if (ctx.cur_page && ctx.cur_page_in_bio)
put_page(ctx.cur_page);
- }
- /*
- * Check that we didn't lose a page due to the arcance calling
- * conventions..
- */
- WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
- return ret;
+ return length / PAGE_SIZE;
}
-EXPORT_SYMBOL_GPL(iomap_readpages);
+EXPORT_SYMBOL_GPL(iomap_readahead);
/*
* iomap_is_partially_uptodate checks whether blocks within a page are
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 6dc227b8c47e..d6ba705f938a 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -39,7 +39,7 @@ DEFINE_EVENT(iomap_readpage_class, name, \
TP_PROTO(struct inode *inode, int nr_pages), \
TP_ARGS(inode, nr_pages))
DEFINE_READPAGE_EVENT(iomap_readpage);
-DEFINE_READPAGE_EVENT(iomap_readpages);
+DEFINE_READPAGE_EVENT(iomap_readahead);
DECLARE_EVENT_CLASS(iomap_page_class,
TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3a688eb5c5ae..4d9da34e759b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -621,14 +621,14 @@ xfs_vm_readpage(
return iomap_readpage(page, &xfs_read_iomap_ops);
}
-STATIC int
-xfs_vm_readpages(
+STATIC unsigned
+xfs_vm_readahead(
struct file *unused,
struct address_space *mapping,
- struct list_head *pages,
+ pgoff_t start,
unsigned nr_pages)
{
- return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops);
+ return iomap_readahead(mapping, start, nr_pages, &xfs_read_iomap_ops);
}
static int
@@ -644,7 +644,7 @@ xfs_iomap_swapfile_activate(
const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
- .readpages = xfs_vm_readpages,
+ .readahead = xfs_vm_readahead,
.writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
.set_page_dirty = iomap_set_page_dirty,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 8b09463dae0d..81c6067e9b61 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -155,7 +155,7 @@ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops);
int iomap_readpage(struct page *page, const struct iomap_ops *ops);
-int iomap_readpages(struct address_space *mapping, struct list_head *pages,
+unsigned iomap_readahead(struct address_space *, pgoff_t start,
unsigned nr_pages, const struct iomap_ops *ops);
int iomap_set_page_dirty(struct page *page);
int iomap_is_partially_uptodate(struct page *page, unsigned long from,
--
2.24.1
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH 12/12] iomap: Convert from readpages to readahead
2020-01-25 1:35 ` [PATCH 12/12] iomap: Convert from readpages to readahead Matthew Wilcox
@ 2020-01-29 1:38 ` Dave Chinner
2020-01-31 9:44 ` Matthew Wilcox
0 siblings, 1 reply; 13+ messages in thread
From: Dave Chinner @ 2020-01-29 1:38 UTC (permalink / raw)
To: Matthew Wilcox; +Cc: linux-fsdevel, linux-mm, linux-kernel, linux-xfs
On Fri, Jan 24, 2020 at 05:35:53PM -0800, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
>
> Use the new readahead operation in XFS and iomap.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: linux-xfs@vger.kernel.org
....
> +unsigned
> +iomap_readahead(struct address_space *mapping, pgoff_t start,
> unsigned nr_pages, const struct iomap_ops *ops)
> {
> struct iomap_readpage_ctx ctx = {
> - .pages = pages,
> .is_readahead = true,
> };
> - loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
> - loff_t last = page_offset(list_entry(pages->next, struct page, lru));
> - loff_t length = last - pos + PAGE_SIZE, ret = 0;
> + loff_t pos = start * PAGE_SIZE;
> + loff_t length = nr_pages * PAGE_SIZE;
>
> - trace_iomap_readpages(mapping->host, nr_pages);
> + trace_iomap_readahead(mapping->host, nr_pages);
>
> while (length > 0) {
> - ret = iomap_apply(mapping->host, pos, length, 0, ops,
> - &ctx, iomap_readpages_actor);
> + loff_t ret = iomap_apply(mapping->host, pos, length, 0, ops,
> + &ctx, iomap_readahead_actor);
> if (ret <= 0) {
> WARN_ON_ONCE(ret == 0);
> - goto done;
> + break;
> }
> pos += ret;
> length -= ret;
> }
> - ret = 0;
> -done:
> +
> if (ctx.bio)
> submit_bio(ctx.bio);
> - if (ctx.cur_page) {
> - if (!ctx.cur_page_in_bio)
> - unlock_page(ctx.cur_page);
> + if (ctx.cur_page && ctx.cur_page_in_bio)
> put_page(ctx.cur_page);
> - }
>
> - /*
> - * Check that we didn't lose a page due to the arcance calling
> - * conventions..
> - */
> - WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
> - return ret;
> + return length / PAGE_SIZE;
Took me quite some time to get my head around whether this was
correct or not.
I'm still not certain in the cases where block size != page size and
we've got an extent boundary in the middle of the page and had a
read error on the second extent in the page. In this case,
ctx.cur_page_in_bio is true so we drop the readahead reference to
the page. Also, length is not a multiple of page size, and so the
nr_pages value returned includes the partial page that we have IO
underway on.
That, I think, leads to both a double unlock and a double put_page()
of the partial page in question.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 12/12] iomap: Convert from readpages to readahead
2020-01-29 1:38 ` Dave Chinner
@ 2020-01-31 9:44 ` Matthew Wilcox
0 siblings, 0 replies; 13+ messages in thread
From: Matthew Wilcox @ 2020-01-31 9:44 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-fsdevel, linux-mm, linux-kernel, linux-xfs
On Wed, Jan 29, 2020 at 12:38:39PM +1100, Dave Chinner wrote:
> On Fri, Jan 24, 2020 at 05:35:53PM -0800, Matthew Wilcox wrote:
> > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> > Use the new readahead operation in XFS and iomap.
> > + if (ctx.cur_page && ctx.cur_page_in_bio)
> > put_page(ctx.cur_page);
> > - }
> >
> > - /*
> > - * Check that we didn't lose a page due to the arcance calling
> > - * conventions..
> > - */
> > - WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
> > - return ret;
> > + return length / PAGE_SIZE;
>
> Took me quite some time to get my head around whether this was
> correct or not.
Yes. Unfortunately, this is the most complex of the conversions ;-(
> I'm still not certain in the cases where block size != page size and
> we've got an extent boundary in the middle of the page and had a
> read error on the second extent in the page. In this case,
> ctx.cur_page_in_bio is true so we drop the readahead reference to
> the page. Also, length is not a multiple of page size, and so the
> nr_pages value returned includes the partial page that we have IO
> underway on.
>
> That, I think, leads to both a double unlock and a double put_page()
> of the partial page in question.
But C division rounds down. So we neither unlock, nor put_page() the
page which was in the bio ... do we?
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 00/12] Change readahead API
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
` (2 preceding siblings ...)
2020-01-25 1:35 ` [PATCH 12/12] iomap: Convert from readpages to readahead Matthew Wilcox
@ 2020-02-13 4:38 ` Andrew Morton
2020-02-13 13:43 ` Matthew Wilcox
3 siblings, 1 reply; 13+ messages in thread
From: Andrew Morton @ 2020-02-13 4:38 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Fri, 24 Jan 2020 17:35:41 -0800 Matthew Wilcox <willy@infradead.org> wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
>
> This series adds a readahead address_space operation to eventually
> replace the readpages operation. The key difference is that
> pages are added to the page cache as they are allocated (and
> then looked up by the filesystem) instead of passing them on a
> list to the readpages operation and having the filesystem add
> them to the page cache. It's a net reduction in code for each
> implementation, more efficient than walking a list, and solves
> the direct-write vs buffered-read problem reported by yu kuai at
> https://lore.kernel.org/linux-fsdevel/20200116063601.39201-1-yukuai3@huawei.com/
Unclear which patch fixes this and how it did it?
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 00/12] Change readahead API
2020-02-13 4:38 ` [PATCH 00/12] Change readahead API Andrew Morton
@ 2020-02-13 13:43 ` Matthew Wilcox
0 siblings, 0 replies; 13+ messages in thread
From: Matthew Wilcox @ 2020-02-13 13:43 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel, Mark Fasheh, Joel Becker, Joseph Qi, Bob Peterson,
Andreas Gruenbacher
On Wed, Feb 12, 2020 at 08:38:52PM -0800, Andrew Morton wrote:
> On Fri, 24 Jan 2020 17:35:41 -0800 Matthew Wilcox <willy@infradead.org> wrote:
>
> > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> >
> > This series adds a readahead address_space operation to eventually
> > replace the readpages operation. The key difference is that
> > pages are added to the page cache as they are allocated (and
> > then looked up by the filesystem) instead of passing them on a
> > list to the readpages operation and having the filesystem add
> > them to the page cache. It's a net reduction in code for each
> > implementation, more efficient than walking a list, and solves
> > the direct-write vs buffered-read problem reported by yu kuai at
> > https://lore.kernel.org/linux-fsdevel/20200116063601.39201-1-yukuai3@huawei.com/
>
> Unclear which patch fixes this and how it did it?
I suppose the problem isn't fixed until patch 13/13 is applied.
What yu kuai is seeing is a race where readahead allocates a page,
then passes it to iomap_readpages, which calls xfs_read_iomap_begin()
which looks up the extent. Then thread 2 does DIO which modifies the
extent, because there's nothing to say that thread 1 is still using it.
With this patch series, the readpages code puts the locked pages in the
cache before calling iomap_readpages, so any racing write will block on
the locked page until readahead is completed.
If you're tempted to put this into -mm, I have a couple of new changes;
one to fix a kernel-doc warning for mpage_readahead() and one to add
kernel-doc for iomap_readahead():
+++ b/fs/mpage.c
@@ -339,9 +339,7 @@
/**
* mpage_readahead - start reads against pages
- * @mapping: the address_space
- * @start: The number of the first page to read.
- * @nr_pages: The number of consecutive pages to read.
+ * @rac: Describes which pages to read.
* @get_block: The filesystem's block mapper function.
*
* This function walks the pages and the blocks within each page, building and
+++ b/fs/iomap/buffered-io.c
@@ -395,6 +395,21 @@
return done;
}
+/**
+ * iomap_readahead - Attempt to read pages from a file.
+ * @rac: Describes the pages to be read.
+ * @ops: The operations vector for the filesystem.
+ *
+ * This function is for filesystems to call to implement their readahead
+ * address_space operation.
+ *
+ * Context: The file is pinned by the caller, and the pages to be read are
+ * all locked and have an elevated refcount. This function will unlock
+ * the pages (once I/O has completed on them, or I/O has been determined to
+ * not be necessary). It will also decrease the refcount once the pages
+ * have been submitted for I/O. After this point, the page may be removed
+ * from the page cache, and should not be referenced.
+ */
void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
{
struct inode *inode = rac->mapping->host;
I'll do a v6 with those changes soon, but I would really like a bit more
review from filesystem people, particularly ocfs2 and gfs2.
^ permalink raw reply [flat|nested] 13+ messages in thread