* [PATCH] mm: implement POSIX_FADV_NOREUSE
@ 2014-03-11 10:25 ` Matthias Wirth
  0 siblings, 0 replies; 26+ messages in thread
From: Matthias Wirth @ 2014-03-11 10:25 UTC (permalink / raw)
  To: Matthias Wirth
  Cc: Lukas Senger, Matthew Wilcox, Jeff Layton, J. Bruce Fields,
	Andrew Morton, Johannes Weiner, Michal Hocko, Rik van Riel,
	Lisa Du, Paul Mackerras, Sasha Levin, Benjamin Herrenschmidt,
	Fengguang Wu, Shaohua Li, Alexey Kardashevskiy, Minchan Kim,
	Kirill A. Shutemov, Al Viro, Steven Whitehouse, Mel Gorman,
	Cody P Schafer, Jiang Liu, David Rientjes, Srivatsa S. Bhat

Backups, log rotation and indexers don't need the files they read to
remain in the page cache. Their pages can be reclaimed early and should
not displace useful pages. POSIX specifies the POSIX_FADV_NOREUSE flag
for these use cases, but it is currently a no-op.
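
For illustration, a tool that wants this behaviour would issue the hint
right after open(2); a minimal userspace sketch (example only, not part
of this patch):

#define _POSIX_C_SOURCE 200112L
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[65536];
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	/* Hint: pages read from this fd are not going to be reused. */
	posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
	while (read(fd, buf, sizeof(buf)) > 0)
		;	/* consume the file once, e.g. back it up */
	close(fd);
	return 0;
}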

In our implementation pages marked with the NoReuse flag are added to
the tail of the LRU list the first time they are read. Therefore they
are the first to be reclaimed.

We needed to add flags to the file and page structs in order to pass
down the hint to the actual call to list_add.

Signed-off-by: Matthias Wirth <matthias.wirth@gmail.com>
Signed-off-by: Lukas Senger <lukas@fridolin.com>
---
 include/linux/fs.h         | 3 +++
 include/linux/mm_inline.h  | 7 ++++++-
 include/linux/page-flags.h | 2 ++
 mm/fadvise.c               | 4 ++++
 mm/filemap.c               | 3 +++
 mm/page_alloc.c            | 1 +
 mm/readahead.c             | 2 ++
 7 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 881accf..3e80149 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File is opened with O_PATH; almost nothing can be done with it */
 #define FMODE_PATH		((__force fmode_t)0x4000)
 
+/* Expect one read only (effect on page cache behavior) */
+#define FMODE_NOREUSE		((__force fmode_t)0x8000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
 
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index cf55945..1bed771 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -27,7 +27,12 @@ static __always_inline void add_page_to_lru_list(struct page *page,
 {
 	int nr_pages = hpage_nr_pages(page);
 	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
-	list_add(&page->lru, &lruvec->lists[lru]);
+	if (unlikely(PageNoReuse(page))) {
+		ClearPageNoReuse(page);
+		list_add_tail(&page->lru, &lruvec->lists[lru]);
+	} else {
+		list_add(&page->lru, &lruvec->lists[lru]);
+	}
 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
 }
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d1fe1a7..ee5af4c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -109,6 +109,7 @@ enum pageflags {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	PG_compound_lock,
 #endif
+	PG_noreuse,		/* page is added to tail of LRU list */
 	__NR_PAGEFLAGS,
 
 	/* Filesystems */
@@ -206,6 +207,7 @@ __PAGEFLAG(Slab, slab)
 PAGEFLAG(Checked, checked)		/* Used by some filesystems */
 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)	/* Xen */
 PAGEFLAG(SavePinned, savepinned);			/* Xen */
+PAGEFLAG(NoReuse, noreuse);
 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
 PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
 
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3bcfd81..387d10a 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -80,6 +80,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		f.file->f_ra.ra_pages = bdi->ra_pages;
 		spin_lock(&f.file->f_lock);
 		f.file->f_mode &= ~FMODE_RANDOM;
+		f.file->f_mode &= ~FMODE_NOREUSE;
 		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_RANDOM:
@@ -111,6 +112,9 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 					   nrpages);
 		break;
 	case POSIX_FADV_NOREUSE:
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode |= FMODE_NOREUSE;
+		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_DONTNEED:
 		if (!bdi_write_congested(mapping->backing_dev_info))
diff --git a/mm/filemap.c b/mm/filemap.c
index 97474c1..8f57ca8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1630,6 +1630,9 @@ no_cached_page:
 			desc->error = -ENOMEM;
 			goto out;
 		}
+		if (filp->f_mode & FMODE_NOREUSE)
+			SetPageNoReuse(page);
+
 		error = add_to_page_cache_lru(page, mapping,
 						index, GFP_KERNEL);
 		if (error) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 336ee92..a756165 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6512,6 +6512,7 @@ static const struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	{1UL << PG_compound_lock,	"compound_lock"	},
 #endif
+	{1UL << PG_noreuse,		"noreuse"	},
 };
 
 static void dump_page_flags(unsigned long flags)
diff --git a/mm/readahead.c b/mm/readahead.c
index 29c5e1a..e8d9221 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -189,6 +189,8 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 		list_add(&page->lru, &page_pool);
 		if (page_idx == nr_to_read - lookahead_size)
 			SetPageReadahead(page);
+		if (filp->f_mode & FMODE_NOREUSE)
+			SetPageNoReuse(page);
 		ret++;
 	}
 
-- 
1.8.3.2

* Re: [PATCH] mm: implement POSIX_FADV_NOREUSE
  2014-03-11 10:25 ` Matthias Wirth
@ 2014-03-11 14:06   ` Michal Hocko
  -1 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2014-03-11 14:06 UTC (permalink / raw)
  To: Matthias Wirth
  Cc: Lukas Senger, Matthew Wilcox, Jeff Layton, J. Bruce Fields,
	Andrew Morton, Johannes Weiner, Rik van Riel, Lisa Du,
	Paul Mackerras, Sasha Levin, Benjamin Herrenschmidt,
	Fengguang Wu, Shaohua Li, Alexey Kardashevskiy, Minchan Kim,
	Kirill A. Shutemov, Al Viro, Steven Whitehouse, Mel Gorman,
	Cody P Schafer, Jiang Liu, David Rientjes, Srivatsa S. Bhat

On Tue 11-03-14 11:25:41, Matthias Wirth wrote:
> Backups, log rotation and indexers don't need the files they read to
> remain in the page cache. Their pages can be reclaimed early and should
> not displace useful pages. POSIX specifies the POSIX_FADV_NOREUSE flag
> for these use cases, but it is currently a no-op.

Why don't you use POSIX_FADV_DONTNEED when you no longer use those
pages? E.g. on close()?

> In our implementation pages marked with the NoReuse flag are added to
> the tail of the LRU list the first time they are read. Therefore they
> are the first to be reclaimed.

page flags are really scarce and I am not sure this is the best usage of
the few remaining slots.

> We needed to add flags to the file and page structs in order to pass
> down the hint to the actual call to list_add.
> 
> Signed-off-by: Matthias Wirth <matthias.wirth@gmail.com>
> Signed-off-by: Lukas Senger <lukas@fridolin.com>
[...]
-- 
Michal Hocko
SUSE Labs

* Re: [PATCH] mm: implement POSIX_FADV_NOREUSE
  2014-03-11 14:06   ` Michal Hocko
@ 2014-03-11 15:24     ` Dave Hansen
  -1 siblings, 0 replies; 26+ messages in thread
From: Dave Hansen @ 2014-03-11 15:24 UTC (permalink / raw)
  To: Michal Hocko, Matthias Wirth
  Cc: Lukas Senger, Matthew Wilcox, Jeff Layton, J. Bruce Fields,
	Andrew Morton, Johannes Weiner, Rik van Riel, Lisa Du,
	Paul Mackerras, Sasha Levin, Benjamin Herrenschmidt,
	Fengguang Wu, Shaohua Li, Alexey Kardashevskiy, Minchan Kim,
	Kirill A. Shutemov, Al Viro, Steven Whitehouse, Mel Gorman,
	Cody P Schafer, Jiang Liu, David Rientjes, Srivatsa S. Bhat

On 03/11/2014 07:06 AM, Michal Hocko wrote:
>> > In our implementation pages marked with the NoReuse flag are added to
>> > the tail of the LRU list the first time they are read. Therefore they
>> > are the first to be reclaimed.
> page flags are really scarce and I am not sure this is the best usage of
> the few remaining slots.

Yeah, especially since the use is so transient.  I can see why using a
flag is nice for a quick prototype, but this is a far cry from needing
one. :)  You might be able to reuse a bit like PageReadahead.  You could
probably also use a bit in the page pointer of the lruvec, or even have
a percpu variable that stores a pointer to the 'struct page' you want to
mark as NOREUSE.

This also looks to ignore the reuse flag for existing pages.  Have you
thought about what the semantics should be there?

Also, *should* readahead pages really have this flag set?  If a very
important page gets brought in via readahead, doesn't this put it at a
disadvantage for getting aged out?

* Re: [PATCH] mm: implement POSIX_FADV_NOREUSE
  2014-03-11 15:24     ` Dave Hansen
@ 2014-03-11 21:27       ` Andrew Morton
  -1 siblings, 0 replies; 26+ messages in thread
From: Andrew Morton @ 2014-03-11 21:27 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Michal Hocko, Matthias Wirth, Lukas Senger, Matthew Wilcox,
	Jeff Layton, J. Bruce Fields, Johannes Weiner, Rik van Riel,
	Lisa Du, Paul Mackerras, Sasha Levin, Benjamin Herrenschmidt,
	Fengguang Wu, Shaohua Li, Alexey Kardashevskiy, Minchan Kim,
	Kirill A. Shutemov, Al Viro, Steven Whitehouse, Mel Gorman,
	Cody P Schafer, Jiang Liu, David Rientjes, Srivatsa S. Bhat

On Tue, 11 Mar 2014 08:24:42 -0700 Dave Hansen <dave.hansen@linux.intel.com> wrote:

> On 03/11/2014 07:06 AM, Michal Hocko wrote:
> >> > In our implementation pages marked with the NoReuse flag are added to
> >> > the tail of the LRU list the first time they are read. Therefore they
> >> > are the first to be reclaimed.
> > page flags are really scarce and I am not sure this is the best usage of
> > the few remaining slots.
> 
> Yeah, especially since the use is so transient.

Yes, we're short on page flags.

> This also looks to ignore the reuse flag for existing pages. 

And it sets PG_noreuse on new pages whether or not they were within the
fadvise range (offset...offset+len).  It's not really an fadvise
operation at all.

A practical implementation might go through the indicated pages, clear
any referenced bits and move them to the tail of the inactive LRU?

* Re: [PATCH] mm: implement POSIX_FADV_NOREUSE
  2014-03-11 21:27       ` Andrew Morton
@ 2014-03-12 11:59         ` Lukas Senger
  -1 siblings, 0 replies; 26+ messages in thread
From: Lukas Senger @ 2014-03-12 11:59 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Dave Hansen, Michal Hocko, Matthias Wirth, Matthew Wilcox,
	Jeff Layton, J. Bruce Fields, Johannes Weiner, Rik van Riel,
	Lisa Du, Paul Mackerras, Sasha Levin, Benjamin Herrenschmidt,
	Fengguang Wu, Shaohua Li, Alexey Kardashevskiy, Minchan Kim,
	Kirill A. Shutemov, Al Viro, Steven Whitehouse, Mel Gorman,
	Cody P Schafer, Jiang Liu, David Rientjes, Srivatsa S. Bhat

> Why don't you use POSIX_FADV_DONTNEED when you no longer use those
> pages? E.g. on close()?

Because DONTNEED throws out the pages even if other processes use them
as well, so additional hacks are needed to prevent that (see for
example [1]).
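
For example, even the usual workaround of dropping only the byte range
you just read, chunk by chunk, still evicts pages that were already
cached for someone else; a rough userspace sketch (illustration only,
not taken from [1]):

#define _POSIX_C_SOURCE 200112L
#include <fcntl.h>
#include <unistd.h>

/*
 * Read a file once and POSIX_FADV_DONTNEED each chunk behind us.  This
 * still drops pages that other processes had cached before we read
 * them, which is why the additional bookkeeping hacks are needed.
 */
void read_once_and_drop(int fd)
{
	char buf[1 << 20];
	off_t done = 0;
	ssize_t n;

	while ((n = read(fd, buf, sizeof(buf))) > 0) {
		posix_fadvise(fd, done, n, POSIX_FADV_DONTNEED);
		done += n;
	}
}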

> This also looks to ignore the reuse flag for existing pages.  Have you
> thought about what the semantics should be there?

The idea is to only treat the pages special when they are first read
from disk. This way we achieve the main goal of not displacing useful
cache content.

> Also, *should* readahead pages really have this flag set?  If a very
> important page gets brought in via readahead, doesn't this put it at a
> disadvantage for getting aged out?

If the flag is not set on readahead pages, the advice barely has any
effect at all, since most of the file gets read through readahead. Of
course that very important page has a disadvantage at the beginning, but
as soon as it has been moved to the active list, NOREUSE no longer
affects it. Worst case, it gets read once more without the flag.

On Tue, 2014-03-11 at 14:27 -0700, Andrew Morton wrote:
> And it sets PG_noreuse on new pages whether or not they were within the
> fadvise range (offset...offset+len).  It's not really an fadvise
> operation at all.

NORMAL, SEQUENTIAL and RANDOM don't honor the range either, so we
figured it would be OK to ignore it as well, for the sake of keeping
the implementation simple.

> > page flags are really scarce and I am not sure this is the best
> usage of
> > the few remaining slots.
> 
> Yeah, especially since the use so so transient.  I can see why using a
> flag is nice for a quick prototype, but this is a far cry from needing
> one. :)  You might be able to reuse a bit like PageReadahead.  You
> could
> probably also use a bit in the page pointer of the lruvec, or even
> have
> a percpu variable that stores a pointer to the 'struct page' you want
> to
> mark as NOREUSE.

Ok, we understand that we can't add a page flag. We tried to find a flag
to recycle but did not succeed. lruvec doesn't have page pointers and we
don't have access to a pagevec and the file struct at the same time. We
don't really understand the last suggestion, as we need to save this
information for more than one page and going over a list every time we
add something to an lru list doesn't seem like a good idea.

Would it be acceptable to add a member to struct page for our purpose?

---

[1] http://insights.oetiker.ch/linux/fadvise.html

* Re: [PATCH] mm: implement POSIX_FADV_NOREUSE
  2014-03-12 11:59         ` Lukas Senger
@ 2014-03-12 14:46           ` Michal Hocko
  -1 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2014-03-12 14:46 UTC (permalink / raw)
  To: Lukas Senger
  Cc: Andrew Morton, Dave Hansen, Matthias Wirth, Matthew Wilcox,
	Jeff Layton, J. Bruce Fields, Johannes Weiner, Rik van Riel,
	Lisa Du, Paul Mackerras, Sasha Levin, Benjamin Herrenschmidt,
	Fengguang Wu, Shaohua Li, Alexey Kardashevskiy, Minchan Kim,
	Kirill A. Shutemov, Al Viro, Steven Whitehouse, Mel Gorman,
	Cody P Schafer, Jiang Liu, David Rientjes,

On Wed 12-03-14 12:59:52, Lukas Senger wrote:
> > Why don't you use POSIX_FADV_DONTNEED when you no longer use those
> > pages? E.g. on close()?
> 
> Because DONTNEED throws out the pages even if other processes use them
> as well, so additional hacks are needed to prevent that (see for
> example[1]).

OK, that might indeed be too harsh.

[...]
> Ok, we understand that we can't add a page flag. We tried to find a flag
> to recycle but did not succeed. lruvec doesn't have page pointers and we
> don't have access to a pagevec and the file struct at the same time. We
> don't really understand the last suggestion, as we need to save this
> information for more than one page and going over a list every time we
> add something to an lru list doesn't seem like a good idea.
> 
> Would it be acceptable to add a member to struct page for our purpose?

No, it won't be that easy ;).

I think Andrew's proposal makes sense. Why not simply move the pages
to the tail of the inactive LRUs?

Or another approach might be to drop only those pages from the range
which are not mapped by other processes (something like a lite
DONTNEED).

-- 
Michal Hocko
SUSE Labs

* Re: [PATCH] mm: implement POSIX_FADV_NOREUSE
  2014-03-12 11:59         ` Lukas Senger
@ 2014-03-12 16:05           ` Dave Hansen
  -1 siblings, 0 replies; 26+ messages in thread
From: Dave Hansen @ 2014-03-12 16:05 UTC (permalink / raw)
  To: Lukas Senger, Andrew Morton
  Cc: Michal Hocko, Matthias Wirth, Matthew Wilcox, Jeff Layton,
	J. Bruce Fields, Johannes Weiner, Rik van Riel, Lisa Du,
	Paul Mackerras, Sasha Levin, Benjamin Herrenschmidt,
	Fengguang Wu, Shaohua Li, Alexey Kardashevskiy, Minchan Kim,
	Kirill A. Shutemov, Al Viro, Steven Whitehouse, Mel Gorman,
	Cody P Schafer, Jiang Liu, David Rientjes, Srivatsa S. Bhat

On 03/12/2014 04:59 AM, Lukas Senger wrote:
>> This also looks to ignore the reuse flag for existing pages.  Have you
>> thought about what the semantics should be there?
> 
> The idea is to only treat the pages special when they are first read
> from disk. This way we achieve the main goal of not displacing useful
> cache content.
> 
>> Also, *should* readahead pages really have this flag set?  If a very
>> important page gets brought in via readahead, doesn't this put it at a
>> disadvantage for getting aged out?
> 
> If the flag is not set on readahead pages, the advice barely has any
> effect at all, since most of the file gets read through readahead. Of
> course that very important page has a disadvantage at the beginning, but
> as soon as it has been moved to the active list, NOREUSE no longer
> affects it. Worst case, it gets read once more without the flag.

That's a good point, and it's a much more important change to the
existing code than the fadvise bits are.  Probably best to make a bigger
deal about it in the patch description.

> On Tue, 2014-03-11 at 14:27 -0700, Andrew Morton wrote:
>> And it sets PG_noreuse on new pages whether or not they were within the
>> fadvise range (offset...offset+len).  It's not really an fadvise
>> operation at all.
> 
> NORMAL, SEQUENTIAL and RANDOM don't honor the range either. So we
> figured it would be ok to do so for the sake of keeping the
> implementation simple.
> 
>>> page flags are really scarce and I am not sure this is the best
>> usage of
>>> the few remaining slots.
>>
>> Yeah, especially since the use is so transient.  I can see why using a
>> flag is nice for a quick prototype, but this is a far cry from needing
>> one. :)  You might be able to reuse a bit like PageReadahead.  You
>> could
>> probably also use a bit in the page pointer of the lruvec, or even
>> have
>> a percpu variable that stores a pointer to the 'struct page' you want
>> to
>> mark as NOREUSE.
> 
> Ok, we understand that we can't add a page flag. We tried to find a flag
> to recycle but did not succeed. lruvec doesn't have page pointers and we
> don't have access to a pagevec and the file struct at the same time. We
> don't really understand the last suggestion, as we need to save this
> information for more than one page and going over a list every time we
> add something to an lru list doesn't seem like a good idea.

Yeah, you're right.  I was ignoring the readahead code here.

But, why wouldn't this work there?  Define a percpu variable, and assign
it to the target page in readahead's read_pages() and in
do_generic_file_read() which deal with pages one at a time and not in lists.

struct page *read_me_once;
void hint_page_read_once(struct page *page)
{
	read_me_once = page;
}

Then check for (read_me_once == page) in add_page_to_lru_list() instead
of the page flag.  Then, make read_me_once per-cpu.  This won't be
preempt safe, but we're talking about readahead and hints here, so we
can probably just bail in the cases where we race.
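
Roughly, the per-cpu variant of the snippet above might look like this
(page_hinted_read_once() is a made-up name; untested sketch):

#include <linux/mm_types.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct page *, read_me_once);

void hint_page_read_once(struct page *page)
{
	this_cpu_write(read_me_once, page);
}

/* In add_page_to_lru_list(), instead of testing a page flag: */
static inline bool page_hinted_read_once(struct page *page)
{
	return this_cpu_read(read_me_once) == page;
}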

> Would it be acceptable to add a member to struct page for our purpose?

'struct page' must be aligned to two pointers due to constraints from
the slub allocator.  Adding a single byte to it would bloat it by 16
bytes for me, which translates into 2GB of lost space on my 1TB system.
 There are 6TB systems out there today which would lose 12GB.

* Re: [PATCH] mm: implement POSIX_FADV_NOREUSE
  2014-03-12 16:05           ` Dave Hansen
@ 2014-03-13 12:40             ` Lukas Senger
  -1 siblings, 0 replies; 26+ messages in thread
From: Lukas Senger @ 2014-03-13 12:40 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Andrew Morton, Michal Hocko, Matthias Wirth, Matthew Wilcox,
	Jeff Layton, J. Bruce Fields, Johannes Weiner, Rik van Riel,
	Lisa Du, Paul Mackerras, Sasha Levin, Benjamin Herrenschmidt,
	Fengguang Wu, Shaohua Li, Alexey Kardashevskiy, Minchan Kim,
	Kirill A. Shutemov, Al Viro, Steven Whitehouse, Mel Gorman,
	Cody P Schafer, Jiang Liu, David Rientjes, Srivatsa S. Bhat


> But, why wouldn't this work there?  Define a percpu variable, and assign
> it to the target page in readahead's read_pages() and in
> do_generic_file_read() which deal with pages one at a time and not in lists.
> 
> struct page *read_me_once;
> void hint_page_read_once(struct page *page)
> {
> 	read_me_once = page;
> }
> 
> Then check for (read_me_once == page) in add_page_to_lru_list() instead
> of the page flag.  Then, make read_me_once per-cpu.  This won't be
> preempt safe, but we're talking about readahead and hints here, so we
> can probably just bail in the cases where we race.

Thanks for clarifying that. The problem now is that by the time we get
to add_page_to_lru_list we're dealing with multiple pages again, because
of the buffering in pagevecs. We could do the (read_me_once == page)
check in __lru_cache_add and then add it to a (new) lru_add_tail_pvec
that adds its pages to the tail of the lru_lists.

If this way isn't feasible, we'll take a look at Andrew and Michal's
DONTNEED lite idea. However, with a DONTNEED lite implemented in
posix_fadvise, the syscall would be more cumbersome for application
programmers to use: they would need to call it after every read. The
tail-pvec approach only needs a single syscall after open, as do NORMAL,
SEQUENTIAL and RANDOM. Furthermore, we suspect that implementing it in a
way that respects other processes (unlike DONTNEED) won't be much
simpler than the tail-pvec approach.

* [PATCHv2] mm: implement POSIX_FADV_NOREUSE
  2014-03-11 10:25 ` Matthias Wirth
@ 2014-03-13 18:43   ` Matthias Wirth
  -1 siblings, 0 replies; 26+ messages in thread
From: Matthias Wirth @ 2014-03-13 18:43 UTC (permalink / raw)
  To: Matthias Wirth
  Cc: Lukas Senger, i4passt, Dave Hansen, Matthew Wilcox, Jeff Layton,
	J. Bruce Fields, Andrew Morton, Johannes Weiner, Michal Hocko,
	Rik van Riel, Lisa Du, Jan Kara, Mel Gorman, Minchan Kim,
	Kirill A. Shutemov, Sasha Levin, Al Viro, Steven Whitehouse,
	Fengguang Wu, Raghavendra K T, Lukas Czerner, Damien Ramonda,
	Mark Rutland, Andrea Arcangeli

Backups, log rotation and indexers don't need the files they read to
remain in the page cache. Their pages can be reclaimed early and should
not displace useful pages. POSIX specifies the POSIX_FADV_NOREUSE flag
for these use cases, but it is currently a no-op.

Pages coming from files with FMODE_NOREUSE that are about to be added
to the page cache via add_to_page_cache_lru get their page struct
pointer saved in a per-cpu variable, which is checked further along the
way in __lru_cache_add. If the variable matches the page, it is added to
the new lru_add_tail_pvec, which as a whole is later added to the tail
of the LRU list. These pages are therefore the first to be reclaimed.

It might happen that a page is brought in via readahead for a file that
has NOREUSE set and is then requested by another process. This can lead
to the page being dropped from the page cache earlier even though the
competing process still needs it. The impact of this, however, is small:
the likelihood of the page being dropped is reduced because it probably
moves to the active list once the second process accesses it.

Signed-off-by: Matthias Wirth <matthias.wirth@gmail.com>
Signed-off-by: Lukas Senger <lukas@fridolin.com>
---
 include/linux/fs.h        |  3 +++
 include/linux/mm_inline.h |  9 ++++++++
 include/linux/pagevec.h   |  1 +
 mm/fadvise.c              |  4 ++++
 mm/filemap.c              |  7 +++++++
 mm/readahead.c            | 14 ++++++++++++-
 mm/swap.c                 | 52 ++++++++++++++++++++++++++++++++++++++++++-----
 7 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 42b70bc..68ccf93 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -126,6 +126,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File needs atomic accesses to f_pos */
 #define FMODE_ATOMIC_POS	((__force fmode_t)0x8000)
 
+/* Expect one read only (effect on page cache behavior) */
+#define FMODE_NOREUSE		((__force fmode_t)0x10000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
 
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index cf55945..11347f7 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -31,6 +31,15 @@ static __always_inline void add_page_to_lru_list(struct page *page,
 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
 }
 
+static __always_inline void add_page_to_lru_list_tail(struct page *page,
+				struct lruvec *lruvec, enum lru_list lru)
+{
+	int nr_pages = hpage_nr_pages(page);
+	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+	list_add_tail(&page->lru, &lruvec->lists[lru]);
+	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+}
+
 static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 3c6b8b1..d1d3223 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -22,6 +22,7 @@ struct pagevec {
 
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
+void __pagevec_lru_add_tail(struct pagevec *pvec);
 unsigned __pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 			  pgoff_t start, unsigned nr_pages, pgoff_t *indices);
 void pagevec_remove_exceptionals(struct pagevec *pvec);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3bcfd81..387d10a 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -80,6 +80,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		f.file->f_ra.ra_pages = bdi->ra_pages;
 		spin_lock(&f.file->f_lock);
 		f.file->f_mode &= ~FMODE_RANDOM;
+		f.file->f_mode &= ~FMODE_NOREUSE;
 		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_RANDOM:
@@ -111,6 +112,9 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 					   nrpages);
 		break;
 	case POSIX_FADV_NOREUSE:
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode |= FMODE_NOREUSE;
+		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_DONTNEED:
 		if (!bdi_write_congested(mapping->backing_dev_info))
diff --git a/mm/filemap.c b/mm/filemap.c
index 97474c1..54d1aaa 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,6 +39,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/filemap.h>
 
+DECLARE_PER_CPU(struct page*, noreuse_page);
+
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
@@ -1630,6 +1632,11 @@ no_cached_page:
 			desc->error = -ENOMEM;
 			goto out;
 		}
+		if (filp->f_mode & FMODE_NOREUSE) {
+			get_cpu_var(noreuse_page) = page;
+			put_cpu_var(noreuse_page);
+		}
+
 		error = add_to_page_cache_lru(page, mapping,
 						index, GFP_KERNEL);
 		if (error) {
diff --git a/mm/readahead.c b/mm/readahead.c
index 29c5e1a..61fd79e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -20,6 +20,8 @@
 #include <linux/syscalls.h>
 #include <linux/file.h>
 
+DECLARE_PER_CPU(struct page*, noreuse_page);
+
 /*
  * Initialise a struct file's readahead state.  Assumes that the caller has
  * memset *ra to zero.
@@ -117,7 +119,13 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 
 	blk_start_plug(&plug);
 
-	if (mapping->a_ops->readpages) {
+	/*
+	 * If the file was marked NOREUSE we need to save a page in
+	 * noreuse_page before calling add_to_page_cache_lru on it so that it's
+	 * added to the tail of the LRU further along the way. This is not
+	 * possible in mpage_readpages as there is no filp there.
+	 */
+	if (mapping->a_ops->readpages && !(filp->f_mode & FMODE_NOREUSE)) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
 		/* Clean up the remaining pages */
 		put_pages_list(pages);
@@ -127,6 +135,10 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_to_page(pages);
 		list_del(&page->lru);
+		if (filp->f_mode & FMODE_NOREUSE) {
+			get_cpu_var(noreuse_page) = page;
+			put_cpu_var(noreuse_page);
+		}
 		if (!add_to_page_cache_lru(page, mapping,
 					page->index, GFP_KERNEL)) {
 			mapping->a_ops->readpage(filp, page);
diff --git a/mm/swap.c b/mm/swap.c
index f4d5f59..8cef7ac 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -41,8 +41,10 @@
 int page_cluster;
 
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
+static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+DEFINE_PER_CPU(struct page*, noreuse_page);
 
 /*
  * This path almost never happens for VM activity - pages are normally
@@ -587,16 +589,32 @@ EXPORT_SYMBOL(mark_page_accessed);
  * to add the page to the [in]active [file|anon] list is deferred until the
  * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
  * have the page added to the active list using mark_page_accessed().
+ *
+ * If the page was marked noreuse by posix_fadvise, it is added to the tail
+ * of the LRU via the lru_add_tail_pvec.
  */
 void __lru_cache_add(struct page *page)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+	struct pagevec *pvec;
+	struct page *noreuse = get_cpu_var(noreuse_page);
 
 	page_cache_get(page);
-	if (!pagevec_space(pvec))
-		__pagevec_lru_add(pvec);
-	pagevec_add(pvec, page);
-	put_cpu_var(lru_add_pvec);
+
+	if (noreuse == page) {
+		pvec = &get_cpu_var(lru_add_tail_pvec);
+		if (!pagevec_space(pvec))
+			__pagevec_lru_add_tail(pvec);
+		pagevec_add(pvec, page);
+		put_cpu_var(lru_add_tail_pvec);
+	} else {
+		pvec = &get_cpu_var(lru_add_pvec);
+		if (!pagevec_space(pvec))
+			__pagevec_lru_add(pvec);
+		pagevec_add(pvec, page);
+		put_cpu_var(lru_add_pvec);
+	}
+	this_cpu_write(noreuse_page, NULL);
+	put_cpu_var(noreuse_page);
 }
 EXPORT_SYMBOL(__lru_cache_add);
 
@@ -939,6 +957,21 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 	trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
 }
 
+static void __pagevec_lru_add_tail_fn(struct page *page, struct lruvec *lruvec,
+				 void *arg)
+{
+	int file = page_is_file_cache(page);
+	int active = PageActive(page);
+	enum lru_list lru = page_lru(page);
+
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+
+	SetPageLRU(page);
+	add_page_to_lru_list_tail(page, lruvec, lru);
+	update_page_reclaim_stat(lruvec, file, active);
+	trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
+}
+
 /*
  * Add the passed pages to the LRU, then drop the caller's refcount
  * on them.  Reinitialises the caller's pagevec.
@@ -949,6 +982,15 @@ void __pagevec_lru_add(struct pagevec *pvec)
 }
 EXPORT_SYMBOL(__pagevec_lru_add);
 
+/*
+ * Same as __pagevec_lru_add, but pages are added to the tail of the LRU.
+ */
+void __pagevec_lru_add_tail(struct pagevec *pvec)
+{
+	pagevec_lru_move_fn(pvec, __pagevec_lru_add_tail_fn, NULL);
+}
+EXPORT_SYMBOL(__pagevec_lru_add_tail);
+
 /**
  * __pagevec_lookup - gang pagecache lookup
  * @pvec:	Where the resulting entries are placed
-- 
1.8.3.2

* Re: [PATCHv2] mm: implement POSIX_FADV_NOREUSE
  2014-03-13 18:43   ` Matthias Wirth
@ 2014-03-13 20:01     ` Andrew Morton
  -1 siblings, 0 replies; 26+ messages in thread
From: Andrew Morton @ 2014-03-13 20:01 UTC (permalink / raw)
  To: Matthias Wirth
  Cc: Lukas Senger, i4passt, Dave Hansen, Matthew Wilcox, Jeff Layton,
	J. Bruce Fields, Johannes Weiner, Michal Hocko, Rik van Riel,
	Lisa Du, Jan Kara, Mel Gorman, Minchan Kim, Kirill A. Shutemov,
	Sasha Levin, Al Viro, Steven Whitehouse, Fengguang Wu,
	Raghavendra K T, Lukas Czerner, Damien Ramonda, Mark Rutland,
	Andrea Arcangeli, David Rientjes

On Thu, 13 Mar 2014 19:43:41 +0100 Matthias Wirth <matthias.wirth@gmail.com> wrote:

> Backups, logrotation and indexers don't need files they read to remain
> in the page cache. Their pages can be reclaimed early and should not
> displace useful pages. POSIX specifies the POSIX_FADV_NOREUSE flag for
> these use cases but it's currently a noop.

As far as I can tell, POSIX_FADV_DONTNEED suits these applications
quite well.  Why is this patch happening?

> Pages coming from files with FMODE_NOREUSE that are to be added to the
> page cache via add_to_page_cache_lru get their page struct pointer saved
> in a per_cpu variable which gets checked further along the way in
> __lru_cache_add. If the variable is set they get added to the new
> lru_add_tail_pvec which as a whole later gets added to the tail of the
> LRU list. Therefore these pages are the first to be reclaimed.
> 
> It might happen that a page is brought in via readahead for a file that
> has NOREUSE set and is then requested by another process. This can lead
> to the page being dropped from the page cache earlier even though the
> competing process still needs it. The impact of this however, is small
> as the likelihood of the page getting dropped is reduced because it
> probably moves to the active list when the page is accessed by the
> second process.

opengroup.org sayeth:

: The posix_fadvise() function shall advise the implementation on the
: expected behavior of the application with respect to the data in the
: file associated with the open file descriptor, fd, starting at offset
: and continuing for len bytes.  The specified range need not currently
: exist in the file.  If len is zero, all data following offset is
: specified.  The implementation may use this information to optimize
: handling of the specified data.  The posix_fadvise() function shall
: have no effect on the semantics of other operations on the specified
: data, although it may affect the performance of other operations.
:
: ...
:
: POSIX_FADV_NOREUSE
:   Specifies that the application expects to access the specified data
:   once and then not reuse it thereafter.

My proposal to deactivate the pages within the fadvise() call violates
that, because the spec wants us to act *after* the app has touched the
pages.

Your proposed implementation violates it because it affects data
outside the specified range.

It would be interesting to know what the *bsd guys chose to do, but I
don't understand it from the amount of context in
http://lists.freebsd.org/pipermail/svn-src-stable-9/2012-August/002608.html

Ignoring the range and impacting the entire file (for this fd) is a
bit lame.  Alternatives include:

a) Implement a per-fd tree of (start,len) ranges and maintain and
   search that.  blah.

b) violate the spec in a different fashion and implement NOREUSE
   synchronously within fadvise.

From a practical point of view, I'm currently inclining toward b).
Yes, we require NOREUSE be run *after* the read() instead of before it,
but what's wrong with that?  It's just as easy to implement from
userspace.  Perhaps we should call it POSIX_FADV_NOREUSE_LINUX to make
it clear that we went our own way.
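
Purely as illustration (not something the kernel honors today), the
userspace side of b) would look roughly like the sketch below;
POSIX_FADV_NOREUSE is the existing constant, while a renamed
POSIX_FADV_NOREUSE_LINUX is hypothetical:

#include <fcntl.h>
#include <unistd.h>

/* sketch: read a chunk, then advise that the just-read range
 * will not be reused */
static ssize_t read_once(int fd, void *buf, size_t len, off_t off)
{
	ssize_t n = pread(fd, buf, len, off);

	if (n > 0)
		posix_fadvise(fd, off, n, POSIX_FADV_NOREUSE);
	return n;
}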

It's difficult.  The spec's a-priori aspect makes implementation much
more difficult.


Your patch doesn't apply to current mainline, btw.  Minor rejects.

I don't think that per-cpu page thing is suitable, really.  If this
task context-switches to a different CPU then we get the wrong page. 
This will happen pretty often as the task is performing physical IO. 
This can be fixed by putting the page* into the task_struct instead,
but passing function args via current-> is a bit of a hack.  Why not
create add_to_page_cache_lru_tail()?


* Re: [PATCHv2] mm: implement POSIX_FADV_NOREUSE
  2014-03-13 20:01     ` Andrew Morton
@ 2014-03-14 12:34       ` Lukas Senger
  -1 siblings, 0 replies; 26+ messages in thread
From: Lukas Senger @ 2014-03-14 12:34 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthias Wirth, i4passt, Dave Hansen, Matthew Wilcox,
	Jeff Layton, J. Bruce Fields, Johannes Weiner, Michal Hocko,
	Rik van Riel, Lisa Du, Jan Kara, Mel Gorman, Minchan Kim,
	Kirill A. Shutemov, Sasha Levin, Al Viro, Steven Whitehouse,
	Fengguang Wu, Raghavendra K T, Lukas Czerner, Damien Ramonda,
	Mark Rutland, Andrea Arcangeli, David Rientjes

On Thu, 2014-03-13 at 13:01 -0700, Andrew Morton wrote:
> On Thu, 13 Mar 2014 19:43:41 +0100 Matthias Wirth <matthias.wirth@gmail.com> wrote:
> 
> > Backups, logrotation and indexers don't need files they read to remain
> > in the page cache. Their pages can be reclaimed early and should not
> > displace useful pages. POSIX specifies the POSIX_FADV_NOREUSE flag for
> > these use cases but it's currently a noop.
> 
> As far as I can tell, POSIX_FADV_DONTNEED suits these applications
> quite well.  Why is this patch happening?

Using DONTNEED means the application will throw out its pages even if
they are used by other processes. If the application wants to be more
polite it needs a way to find out whether that's the case. One way is to
use mincore to get a snapshot of pages before mmapping the file and then
keep the pages that were already cached before we accessed them. This of
course ignores all accesses by other processes occurring while we use
the file and doesn't work with read. Apart from those flaws, does that
kind of page cache management belong in userspace?
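
For reference, such a workaround might look roughly like the sketch
below (no error handling, and the DONTNEED calls are issued per page
rather than coalesced):

#include <fcntl.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

/* sketch: snapshot page cache residency with mincore before touching
 * the file, then DONTNEED only the pages we brought in ourselves */
static void polite_read_and_drop(int fd, size_t len)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t pages = (len + psz - 1) / psz;
	unsigned char *vec = malloc(pages);
	char *map = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	size_t i;

	if (!vec || map == MAP_FAILED)
		return;
	mincore(map, len, vec);		/* residency before we read */

	for (i = 0; i < len; i++)	/* stands in for the real work */
		(void)map[i];

	for (i = 0; i < pages; i++)
		if (!(vec[i] & 1))	/* not cached before us: drop it */
			posix_fadvise(fd, (off_t)i * psz, psz,
				      POSIX_FADV_DONTNEED);
	munmap(map, len);
	free(vec);
}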

> My proposal to deactivate the pages within the fadvise() call violates
> that, because the spec wants us to act *after* the app has touched the
> pages.
> 
> Your proposed implementation violates it because it affects data
> outside the specified range.
> 
> It would be interesting to know what the *bsd guys chose to do, but I
> don't understand it from the amount of context in
> http://lists.freebsd.org/pipermail/svn-src-stable-9/2012-August/002608.html
> 
> Ignoring the range and impacting the entire file (for this fd) is a
> bit lame.  Alternatives include:
> 
> a) Implement a per-fd tree of (start,len) ranges and maintain and
>    search that.  blah.
> 
> b) violate the spec in a different fashion and implement NOREUSE
>    synchronously within fadvise.
> 
> From a practical point of view, I'm currently inclining toward b). 
> Yes, we require NOREUSE be run *after* the read() instead of before it,
> but what's wrong with that?  It's just as easy to implement from
> userspace.  Perhaps we should call it POSIX_FADV_NOREUSE_LINUX to make
> it clear that we went our own way.

The problem with calling fadvise with NOREUSE_LINUX after read is that
it makes writing portable applications harder. As you point out, our
version doesn't adhere to the spec perfectly either, but I'd wager it
covers the most common use case. And a) would at least allow a
spec-faithful implementation in the future.

> I don't think that per-cpu page thing is suitable, really.  If this
> task context-switches to a different CPU then we get the wrong page. 
> This will happen pretty often as the task is performing physical IO. 
> This can be fixed by putting the page* into the task_struct instead,
> but passing function args via current-> is a bit of a hack.  Why not
> create add_to_page_cache_lru_tail()?

We agree and will send a new version with add_to_page_cache_lru_tail.

* [PATCHv3] mm: implement POSIX_FADV_NOREUSE
  2014-03-11 10:25 ` Matthias Wirth
@ 2014-03-14 15:52   ` Matthias Wirth
  -1 siblings, 0 replies; 26+ messages in thread
From: Matthias Wirth @ 2014-03-14 15:52 UTC (permalink / raw)
  To: Matthias Wirth
  Cc: Lukas Senger, i4passt, Dave Hansen, Matthew Wilcox, Jeff Layton,
	J. Bruce Fields, Andrew Morton, Johannes Weiner, Michal Hocko,
	Rik van Riel, Lisa Du, Minchan Kim, Naoya Horiguchi, Sasha Levin,
	Paul E. McKenney, Jan Kara, Mel Gorman, Shaohua Li, Bob Liu,
	Seth Jennings, Joonsoo Kim, Rafael Aquini, Kirill A. Shutemov,
	Al Viro

Backups, logrotation and indexers don't need files they read to remain
in the page cache. Their pages can be reclaimed early and should not
displace useful pages. POSIX specifies the POSIX_FADV_NOREUSE flag for
these use cases but it's currently a noop.

Using DONTNEED is not a good solution: it means the application will
throw out its pages even if they are used by other processes. If the
application wants to be more polite it needs a way to find out whether
that's the case. One way is to use mincore to get a snapshot of pages
before mmapping the file and then keep the pages that were already
cached before we accessed them. This of course ignores all accesses by
other processes occurring while we use the file and doesn't work with
read.

The idea of the patch is to add pages from files with FMODE_NOREUSE at
the tail of the lru list. Therefore these pages are the first to be
reclaimed. We added add_to_page_cache_lru_tail and corresponding
functions, complementing add_to_page_cache_lru.

Our implementation, on the other hand, is a lot easier to use from
userspace as you only need to call posix_fadvise once after each open.
We currently ignore ranges and apply the hint to the complete file,
which should cover most use cases. Range functionality can be added
later with a list/tree of ranges in the filp.
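
For illustration, the intended usage from a backup or indexer then
boils down to a sketch like this (error handling trimmed):

#include <fcntl.h>
#include <unistd.h>

/* sketch: hint once right after open(), then read as usual */
static void scan_file(const char *path,
		      void (*process)(const char *buf, ssize_t n))
{
	char buf[65536];
	ssize_t n;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return;
	posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);	/* whole file */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		process(buf, n);
	close(fd);
}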

It might happen that a page is brought in via readahead for a file that
has NOREUSE set and is then requested by another process. This can lead
to the page being dropped from the page cache earlier even though the
competing process still needs it. The impact of this, however, is small
as the likelihood of the page getting dropped is reduced because it
probably moves to the active list when the page is accessed by the
second process.

Signed-off-by: Matthias Wirth <matthias.wirth@gmail.com>
Signed-off-by: Lukas Senger <lukas@fridolin.com>
---
 include/linux/fs.h        |  3 +++
 include/linux/mm_inline.h |  9 ++++++++
 include/linux/pagemap.h   |  2 ++
 include/linux/pagevec.h   |  1 +
 include/linux/swap.h      |  8 ++++++++
 mm/fadvise.c              |  4 ++++
 mm/filemap.c              | 27 ++++++++++++++++++++++--
 mm/readahead.c            | 20 ++++++++++++++----
 mm/swap.c                 | 52 +++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 120 insertions(+), 6 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4f59e18..0c1b031 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -126,6 +126,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File needs atomic accesses to f_pos */
 #define FMODE_ATOMIC_POS	((__force fmode_t)0x8000)
 
+/* Expect one read only (effect on page cache behavior) */
+#define FMODE_NOREUSE		((__force fmode_t)0x10000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
 
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index cf55945..11347f7 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -31,6 +31,15 @@ static __always_inline void add_page_to_lru_list(struct page *page,
 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
 }
 
+static __always_inline void add_page_to_lru_list_tail(struct page *page,
+				struct lruvec *lruvec, enum lru_list lru)
+{
+	int nr_pages = hpage_nr_pages(page);
+	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+	list_add_tail(&page->lru, &lruvec->lists[lru]);
+	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+}
+
 static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 532cedc..0191357 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -575,6 +575,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
+int add_to_page_cache_lru_tail(struct page *page, struct address_space *mapping,
+				pgoff_t index, gfp_t gfp_mask);
 extern void delete_from_page_cache(struct page *page);
 extern void __delete_from_page_cache(struct page *page, void *shadow);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 3c6b8b1..d1d3223 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -22,6 +22,7 @@ struct pagevec {
 
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
+void __pagevec_lru_add_tail(struct pagevec *pvec);
 unsigned __pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 			  pgoff_t start, unsigned nr_pages, pgoff_t *indices);
 void pagevec_remove_exceptionals(struct pagevec *pvec);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3507115..c6bb26f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -309,7 +309,9 @@ extern unsigned long nr_free_pagecache_pages(void);
 
 /* linux/mm/swap.c */
 extern void __lru_cache_add(struct page *);
+extern void __lru_cache_add_tail(struct page *);
 extern void lru_cache_add(struct page *);
+extern void lru_cache_add_tail(struct page *);
 extern void lru_add_page_tail(struct page *page, struct page *page_tail,
 			 struct lruvec *lruvec, struct list_head *head);
 extern void activate_page(struct page *);
@@ -339,6 +341,12 @@ static inline void lru_cache_add_file(struct page *page)
 	__lru_cache_add(page);
 }
 
+static inline void lru_cache_add_tail_file(struct page *page)
+{
+	ClearPageActive(page);
+	__lru_cache_add_tail(page);
+}
+
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3bcfd81..387d10a 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -80,6 +80,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		f.file->f_ra.ra_pages = bdi->ra_pages;
 		spin_lock(&f.file->f_lock);
 		f.file->f_mode &= ~FMODE_RANDOM;
+		f.file->f_mode &= ~FMODE_NOREUSE;
 		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_RANDOM:
@@ -111,6 +112,9 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 					   nrpages);
 		break;
 	case POSIX_FADV_NOREUSE:
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode |= FMODE_NOREUSE;
+		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_DONTNEED:
 		if (!bdi_write_congested(mapping->backing_dev_info))
diff --git a/mm/filemap.c b/mm/filemap.c
index 97474c1..49b488a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -642,6 +642,23 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
 
+/*
+ * Pages added to the tail are less important and should not be activated even
+ * if they were recently accessed. Therefore we behave differently from
+ * add_to_page_cache_lru.
+ */
+int add_to_page_cache_lru_tail(struct page *page, struct address_space *mapping,
+				pgoff_t offset, gfp_t gfp_mask)
+{
+	int ret;
+
+	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+	if (ret == 0)
+		lru_cache_add_tail_file(page);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(add_to_page_cache_lru_tail);
+
 #ifdef CONFIG_NUMA
 struct page *__page_cache_alloc(gfp_t gfp)
 {
@@ -1630,8 +1647,14 @@ no_cached_page:
 			desc->error = -ENOMEM;
 			goto out;
 		}
-		error = add_to_page_cache_lru(page, mapping,
-						index, GFP_KERNEL);
+
+		if (unlikely(filp->f_mode & FMODE_NOREUSE)) {
+			error = add_to_page_cache_lru_tail(page, mapping,
+							index, GFP_KERNEL);
+		} else {
+			error = add_to_page_cache_lru(page, mapping,
+							index, GFP_KERNEL);
+		}
 		if (error) {
 			page_cache_release(page);
 			if (error == -EEXIST)
diff --git a/mm/readahead.c b/mm/readahead.c
index 29c5e1a..749df01 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -117,7 +117,13 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 
 	blk_start_plug(&plug);
 
-	if (mapping->a_ops->readpages) {
+	/*
+	 * If the file was marked POSIX_FADV_NOREUSE we need to call
+	 * add_to_page_cache_lru_tail on it so that it's added to the tail of
+	 * the LRU further along the way. This is not possible in
+	 * mpage_readpages as there is no filp there.
+	 */
+	if (mapping->a_ops->readpages && !(filp->f_mode & FMODE_NOREUSE)) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
 		/* Clean up the remaining pages */
 		put_pages_list(pages);
@@ -127,10 +133,16 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_to_page(pages);
 		list_del(&page->lru);
-		if (!add_to_page_cache_lru(page, mapping,
-					page->index, GFP_KERNEL)) {
-			mapping->a_ops->readpage(filp, page);
+
+		if (unlikely(filp->f_mode & FMODE_NOREUSE)) {
+			ret = add_to_page_cache_lru_tail(page, mapping,
+					page->index, GFP_KERNEL);
+		} else {
+			ret = add_to_page_cache_lru(page, mapping,
+					page->index, GFP_KERNEL);
 		}
+		if (!ret)
+			mapping->a_ops->readpage(filp, page);
 		page_cache_release(page);
 	}
 	ret = 0;
diff --git a/mm/swap.c b/mm/swap.c
index f4d5f59..ebf2d2c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -41,6 +41,7 @@
 int page_cluster;
 
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
+static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
 
@@ -600,6 +601,22 @@ void __lru_cache_add(struct page *page)
 }
 EXPORT_SYMBOL(__lru_cache_add);
 
+/*
+ * Same as __lru_cache_add but add to tail.
+ */
+
+void __lru_cache_add_tail(struct page *page)
+{
+	struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvec);
+
+	page_cache_get(page);
+	if (!pagevec_space(pvec))
+		__pagevec_lru_add_tail(pvec);
+	pagevec_add(pvec, page);
+	put_cpu_var(lru_add_tail_pvec);
+}
+EXPORT_SYMBOL(__lru_cache_add_tail);
+
 /**
  * lru_cache_add - add a page to a page list
  * @page: the page to be added to the LRU.
@@ -612,6 +629,17 @@ void lru_cache_add(struct page *page)
 }
 
 /**
+ * lru_cache_add_tail - add a page to a page list at the tail
+ * @page: the page to be added to the tail of the LRU.
+ */
+void lru_cache_add_tail(struct page *page)
+{
+	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+	__lru_cache_add_tail(page);
+}
+
+/**
  * add_page_to_unevictable_list - add a page to the unevictable list
  * @page:  the page to be added to the unevictable list
  *
@@ -939,6 +967,21 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 	trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
 }
 
+static void __pagevec_lru_add_tail_fn(struct page *page, struct lruvec *lruvec,
+				 void *arg)
+{
+	int file = page_is_file_cache(page);
+	int active = PageActive(page);
+	enum lru_list lru = page_lru(page);
+
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+
+	SetPageLRU(page);
+	add_page_to_lru_list_tail(page, lruvec, lru);
+	update_page_reclaim_stat(lruvec, file, active);
+	trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
+}
+
 /*
  * Add the passed pages to the LRU, then drop the caller's refcount
  * on them.  Reinitialises the caller's pagevec.
@@ -949,6 +992,15 @@ void __pagevec_lru_add(struct pagevec *pvec)
 }
 EXPORT_SYMBOL(__pagevec_lru_add);
 
+/*
+ * Same as __pagevec_lru_add, but pages are added to the tail of the LRU.
+ */
+void __pagevec_lru_add_tail(struct pagevec *pvec)
+{
+	pagevec_lru_move_fn(pvec, __pagevec_lru_add_tail_fn, NULL);
+}
+EXPORT_SYMBOL(__pagevec_lru_add_tail);
+
 /**
  * __pagevec_lookup - gang pagecache lookup
  * @pvec:	Where the resulting entries are placed
-- 
1.8.3.2


* Re: [PATCHv3] mm: implement POSIX_FADV_NOREUSE
  2014-03-14 15:52   ` Matthias Wirth
@ 2014-03-18 15:14     ` Michal Hocko
  -1 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2014-03-18 15:14 UTC (permalink / raw)
  To: Matthias Wirth
  Cc: Lukas Senger, i4passt, Dave Hansen, Matthew Wilcox, Jeff Layton,
	J. Bruce Fields, Andrew Morton, Johannes Weiner, Rik van Riel,
	Lisa Du, Minchan Kim, Naoya Horiguchi, Sasha Levin,
	Paul E. McKenney, Jan Kara, Mel Gorman, Shaohua Li, Bob Liu,
	Seth Jennings, Joonsoo Kim, Rafael Aquini, Kirill A. Shutemov,
	Al Viro, Steven Whitehouse

On Fri 14-03-14 16:52:38, Matthias Wirth wrote:
[...]
> The idea of the patch is to add pages from files with FMODE_NOREUSE at
> the tail of the lru list. Therefore these pages are the first to be
> reclaimed. We added add_to_page_cache_lru_tail and corresponding
> functions, complementing add_to_page_cache_lru.

If this is set before the read then you can end up thrashing on those
pages during heavy memory pressure, I am afraid. The page would get
reclaimed before the read gets to it.

What you could do instead, I think, is to reclaim pages belonging to
a FMODE_NOREUSE file right away when they would normally be activated
during reclaim. That would require tweaking page_check_references,
which currently implements the used-once logic.
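
Very roughly, and only as pseudo-code of the idea
(page_from_noreuse_file() is a hypothetical helper, and the NOREUSE
hint would first have to be made visible on the page or its mapping,
which the current patch does not do):

	/* inside the used-once logic of page_check_references():
	 * a referenced page that would normally be activated is
	 * instead sent straight to reclaim if it came from a
	 * NOREUSE file */
	if (referenced_ptes || referenced_page) {
		if (page_from_noreuse_file(page))	/* hypothetical */
			return PAGEREF_RECLAIM;
		return PAGEREF_ACTIVATE;
	}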

-- 
Michal Hocko
SUSE Labs



Thread overview (26+ messages):
2014-03-11 10:25 [PATCH] mm: implement POSIX_FADV_NOREUSE Matthias Wirth
2014-03-11 10:25 ` Matthias Wirth
2014-03-11 14:06 ` Michal Hocko
2014-03-11 14:06   ` Michal Hocko
2014-03-11 15:24   ` Dave Hansen
2014-03-11 15:24     ` Dave Hansen
2014-03-11 21:27     ` Andrew Morton
2014-03-11 21:27       ` Andrew Morton
2014-03-12 11:59       ` Lukas Senger
2014-03-12 11:59         ` Lukas Senger
2014-03-12 14:46         ` Michal Hocko
2014-03-12 14:46           ` Michal Hocko
2014-03-12 16:05         ` Dave Hansen
2014-03-12 16:05           ` Dave Hansen
2014-03-13 12:40           ` Lukas Senger
2014-03-13 12:40             ` Lukas Senger
2014-03-13 18:43 ` [PATCHv2] " Matthias Wirth
2014-03-13 18:43   ` Matthias Wirth
2014-03-13 20:01   ` Andrew Morton
2014-03-13 20:01     ` Andrew Morton
2014-03-14 12:34     ` Lukas Senger
2014-03-14 12:34       ` Lukas Senger
2014-03-14 15:52 ` [PATCHv3] " Matthias Wirth
2014-03-14 15:52   ` Matthias Wirth
2014-03-18 15:14   ` Michal Hocko
2014-03-18 15:14     ` Michal Hocko
