linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] O_STREAMING - flag for optimal streaming I/O
@ 2002-10-08  2:38 Robert Love
  2002-10-08 10:42 ` J.A. Magallon
                   ` (3 more replies)
  0 siblings, 4 replies; 68+ messages in thread
From: Robert Love @ 2002-10-08  2:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, riel

Attached patch implements an O_STREAMING file I/O flag which enables
manual drop-behind of pages.

If the file has O_STREAMING set then the user has explicitly said "this
is streaming data, I know I will not revisit this, do not cache
anything".  So we drop pages from the pagecache before our current
index.  We have to fiddle a bit to get writes working since we do
write-behind but the logic is there and it works.

Some numbers.  A simple streaming read to verify the pagecache effects:

        Streaming 1GB Read (avg of many runs, mem=2GB):
        O_STREAMING     Wall time       Change in Page Cache
        Yes             25.58s          0
        No              25.55s          +835MB

Another read with much more VM pressure:

        Streaming 1GB Read (avg of many runs, mem=8M)
        O_STREAMING     Wall time       Change in Page Cache
        Yes             25.76s          0
        No              29.01s          +1MB

And now the kicker:

	Kernel compile (make -j2) and concurrent streaming I/O
        (avg of two runs, mem=128M):
        O_STREAMING     Time to complete Kernel Compile
        Yes             3m27.863s
        No              4m15.818s

This is c/o Andrew Morton.

Patch is against 2.4.20-pre9.  Why not 2.5?  Because Andrew says we can
do better, perhaps with a real drop-behind heuristic.  As 20 Oct looms
quite close, we shall see.

	Robert Love

Implement O_STREAMING for streaming I/O for manual drop-behind of pages.

 include/asm-arm/fcntl.h  |    1
 include/asm-i386/fcntl.h |    1
 include/asm-mips/fcntl.h |    1
 include/asm-ppc/fcntl.h  |    1
 include/asm-sh/fcntl.h   |    1
 mm/filemap.c             |   89 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 94 insertions(+)

diff -urN linux-2.4.20-pre9/include/asm-arm/fcntl.h linux/include/asm-arm/fcntl.h
--- linux-2.4.20-pre9/include/asm-arm/fcntl.h	2002-10-06 14:57:26.000000000 -0400
+++ linux/include/asm-arm/fcntl.h	2002-10-07 18:45:51.000000000 -0400
@@ -20,6 +20,7 @@
 #define O_NOFOLLOW	0100000	/* don't follow links */
 #define O_DIRECT	0200000	/* direct disk access hint - currently ignored */
 #define O_LARGEFILE	0400000
+#define O_STREAMING    04000000	/* streaming access */
 
 #define F_DUPFD		0	/* dup */
 #define F_GETFD		1	/* get close_on_exec */
diff -urN linux-2.4.20-pre9/include/asm-i386/fcntl.h linux/include/asm-i386/fcntl.h
--- linux-2.4.20-pre9/include/asm-i386/fcntl.h	2002-10-06 14:57:21.000000000 -0400
+++ linux/include/asm-i386/fcntl.h	2002-10-07 18:45:51.000000000 -0400
@@ -20,6 +20,7 @@
 #define O_LARGEFILE	0100000
 #define O_DIRECTORY	0200000	/* must be a directory */
 #define O_NOFOLLOW	0400000 /* don't follow links */
+#define O_STREAMING    04000000	/* streaming access */
 
 #define F_DUPFD		0	/* dup */
 #define F_GETFD		1	/* get close_on_exec */
diff -urN linux-2.4.20-pre9/include/asm-mips/fcntl.h linux/include/asm-mips/fcntl.h
--- linux-2.4.20-pre9/include/asm-mips/fcntl.h	2002-10-06 14:57:21.000000000 -0400
+++ linux/include/asm-mips/fcntl.h	2002-10-07 18:45:51.000000000 -0400
@@ -26,6 +26,7 @@
 #define O_DIRECT	0x8000	/* direct disk access hint */
 #define O_DIRECTORY	0x10000	/* must be a directory */
 #define O_NOFOLLOW	0x20000	/* don't follow links */
+#define O_STREAMING    0x400000	/* streaming access */
 
 #define O_NDELAY	O_NONBLOCK
 
diff -urN linux-2.4.20-pre9/include/asm-ppc/fcntl.h linux/include/asm-ppc/fcntl.h
--- linux-2.4.20-pre9/include/asm-ppc/fcntl.h	2002-10-06 14:57:22.000000000 -0400
+++ linux/include/asm-ppc/fcntl.h	2002-10-07 18:45:51.000000000 -0400
@@ -23,6 +23,7 @@
 #define O_NOFOLLOW      0100000	/* don't follow links */
 #define O_LARGEFILE     0200000
 #define O_DIRECT	0400000	/* direct disk access hint */
+#define O_STREAMING    04000000	/* streaming access */
 
 #define F_DUPFD		0	/* dup */
 #define F_GETFD		1	/* get close_on_exec */
diff -urN linux-2.4.20-pre9/include/asm-sh/fcntl.h linux/include/asm-sh/fcntl.h
--- linux-2.4.20-pre9/include/asm-sh/fcntl.h	2002-10-06 14:57:27.000000000 -0400
+++ linux/include/asm-sh/fcntl.h	2002-10-07 18:45:51.000000000 -0400
@@ -20,6 +20,7 @@
 #define O_LARGEFILE	0100000
 #define O_DIRECTORY	0200000	/* must be a directory */
 #define O_NOFOLLOW	0400000 /* don't follow links */
+#define O_STREAMING    04000000	/* streaming access */
 
 #define F_DUPFD		0	/* dup */
 #define F_GETFD		1	/* get close_on_exec */
diff -urN linux-2.4.20-pre9/mm/filemap.c linux/mm/filemap.c
--- linux-2.4.20-pre9/mm/filemap.c	2002-10-06 14:57:20.000000000 -0400
+++ linux/mm/filemap.c	2002-10-07 18:45:51.000000000 -0400
@@ -1322,6 +1322,90 @@
 		SetPageReferenced(page);
 }
 
+/**
+ * shrink_list - drop pages from one list of a mapping without blocking
+ * @mapping: the mapping from which we want to drop pages
+ * @list: which list to walk, tail first (e.g. locked, dirty, clean)
+ * @max_index: greatest index from which we will drop pages
+ *
+ * Returns the number of pages removed from the pagecache.
+ */
+static unsigned long shrink_list(struct address_space *mapping,
+				 struct list_head *list,
+				 unsigned long max_index)
+{
+	struct list_head *curr;
+	unsigned long nr_shrunk = 0;
+
+	spin_lock(&pagemap_lru_lock);
+	spin_lock(&pagecache_lock);
+
+	/* read the list tail only after taking the locks; it may change before */
+	for (curr = list->prev; curr != list; ) {
+		struct page *page = list_entry(curr, struct page, list);
+
+		curr = curr->prev;
+
+		if (page->index > max_index || PageDirty(page))
+			continue;
+
+		if (TryLockPage(page))
+			break;
+
+		if (page->buffers && !try_to_release_page(page, 0)) {
+			/* probably dirty buffers */
+			unlock_page(page);
+			break;
+		}
+
+		if (page_count(page) != 1) {
+			unlock_page(page);
+			continue;
+		}
+
+		__lru_cache_del(page);
+		__remove_inode_page(page);
+		unlock_page(page);
+		page_cache_release(page);
+		nr_shrunk++;
+	}
+
+	spin_unlock(&pagecache_lock);
+	spin_unlock(&pagemap_lru_lock);
+
+	return nr_shrunk;
+}
+
+/**
+ * shrink_pagecache - drop pages from a file's mapping without blocking
+ * @file: the file we are doing I/O on
+ * @max_index: the maximum index from which we are willing to drop pages
+ *
+ * This is for O_STREAMING, which says "I am streaming data, I know I will not
+ * revisit this; do not cache anything".
+ *
+ * max_index allows us to only drop pages which are behind `index', to avoid
+ * trashing readahead.  Returns the total number of pages dropped.
+ */
+static unsigned long shrink_pagecache(struct file *file,
+				      unsigned long max_index)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	unsigned long nr_dropped;
+
+	/*
+	 * bail out unless the mapping holds a decent amount of pages to work on
+	 */
+	if (mapping->nrpages < 256)
+		return 0;
+
+	nr_dropped = shrink_list(mapping, &mapping->locked_pages, max_index);
+	nr_dropped += shrink_list(mapping, &mapping->clean_pages, max_index);
+	nr_dropped += shrink_list(mapping, &mapping->dirty_pages, max_index);
+
+	return nr_dropped;
+}
+
 /*
  * This is a generic file read routine, and uses the
  * inode->i_op->readpage() function for the actual low-level
@@ -1538,6 +1622,8 @@
 	filp->f_reada = 1;
 	if (cached_page)
 		page_cache_release(cached_page);
+	if (filp->f_flags & O_STREAMING)
+		shrink_pagecache(filp, index);
 	UPDATE_ATIME(inode);
 }
 
@@ -3047,6 +3133,9 @@
 	if (file->f_flags & O_DIRECT)
 		goto o_direct;
 
+	if (file->f_flags & O_STREAMING)
+		shrink_pagecache(file, pos >> PAGE_CACHE_SHIFT);
+
 	do {
 		unsigned long index, offset;
 		long page_fault;


^ permalink raw reply	[flat|nested] 68+ messages in thread
* Re: [PATCH] O_STREAMING - flag for optimal streaming I/O
@ 2002-10-11  4:16 Hank Leininger
  0 siblings, 0 replies; 68+ messages in thread
From: Hank Leininger @ 2002-10-11  4:16 UTC (permalink / raw)
  To: linux-kernel

On 2002-10-10, Mike Fedyk <mfedyk@matchmail.com> wrote: 
 
> On Thu, Oct 10, 2002 at 03:17:35PM +0200, Giuliano Pochini wrote: 
> > Think again about backing up a large database. I don't 
> > want to use tar because it kills the caches. I would 
> > like a way to read the db so that the cached part of 
> > the db (the 20% which gets 80% of accesses) is not 
> > expunged. 
 
> Unless you are pausing the database (causing the files on disk to be in 
> a useful state) and then reading the file you will have trouble.  
> Anything else will have to synchronize with the database itself, and 
> thus can't use O_STREAMING. 
 
Pausing the database != putting the database into readonly mode, which is 
all that would really be required.  If your writer-processes are distinct 
from your reader-processes, you could suspend them (and/or batch up writes 
to temp tables to shrink your externally-felt maintenance window), tell 
the DB to flush pending writes, then dump with O_STREAMING-aware tar (or 
db-specific tools that still must pass through all tables/files) while 
read performance is only somewhat impacted, and cache isn't completely 
killed. 
 
Or, consider the case where the database isn't anywhere near all that the 
system does.  Think static content + DB-driven webserver, where the DB 
*can* be completely shut down (and those parts unavailable) during 
backups, while static content serving still goes on efficiently. 
 
-- 
Hank Leininger <hlein@progressive-comp.com>  
   

^ permalink raw reply	[flat|nested] 68+ messages in thread

end of thread, other threads:[~2002-10-11  8:26 UTC | newest]

Thread overview: 68+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-10-08  2:38 [PATCH] O_STREAMING - flag for optimal streaming I/O Robert Love
2002-10-08 10:42 ` J.A. Magallon
2002-10-08 18:08   ` Robert Love
2002-10-08 18:38 ` Chris Wedgwood
2002-10-08 18:49   ` Robert Love
2002-10-08 19:05     ` Chris Wedgwood
2002-10-08 19:17       ` Robert Love
2002-10-08 19:30         ` Andrew Morton
2002-10-09 14:14           ` Marco Colombo
2002-10-09 16:30           ` kernel
2002-10-08 19:52         ` Chris Wedgwood
2002-10-08 19:59           ` Robert Love
2002-10-08 20:01             ` Chris Wedgwood
2002-10-09  8:33         ` Giuliano Pochini
2002-10-09  8:43           ` Andrew Morton
2002-10-09 10:55             ` Giuliano Pochini
2002-10-09 17:05           ` Mark Mielke
2002-10-09 19:36             ` Giuliano Pochini
2002-10-09 22:24               ` Mark Mielke
2002-10-09 23:20                 ` Jamie Lokier
2002-10-10  3:07                   ` Mark Mielke
2002-10-10 10:55                     ` Helge Hafting
2002-10-10 17:50                       ` Mark Mielke
2002-10-10  3:29                   ` Erik Andersen
2002-10-10  3:37                     ` Robert Love
2002-10-10 13:39                       ` Giuliano Pochini
2002-10-10 22:50                         ` Mike Fedyk
2002-10-10 22:58                           ` Erik Andersen
2002-10-11  8:26                           ` Giuliano Pochini
2002-10-11  8:32                           ` Helge Hafting
2002-10-10  8:33                     ` Giuliano Pochini
2002-10-10  9:10                       ` Erik Andersen
2002-10-10  9:38                         ` Giuliano Pochini
2002-10-10 10:40                           ` Miquel van Smoorenburg
2002-10-10 11:01                           ` Helge Hafting
2002-10-10 12:29                             ` Xavier Bestel
2002-10-10 13:17                             ` Giuliano Pochini
2002-10-10 22:44                               ` Mike Fedyk
2002-10-11  8:13                                 ` Giuliano Pochini
2002-10-10 11:38                     ` O_STREAMING has insufficient info - how about fadvise() ? Alan Cox
2002-10-10 11:47                       ` William Lee Irwin III
2002-10-10 15:34                       ` Andrew Morton
2002-10-10 16:08                         ` Alan Cox
2002-10-10 16:49                         ` Oliver Xymoron
2002-10-10 15:37                     ` [PATCH] O_STREAMING - flag for optimal streaming I/O Gerhard Mack
2002-10-10 22:47                       ` Mike Fedyk
2002-10-11  2:14                         ` Gerhard Mack
2002-10-11  8:10                           ` Chris Wedgwood
2002-10-10  9:14                 ` David Lang
2002-10-10 14:51               ` Denis Vlasenko
2002-10-08 19:53       ` Matthias Schniedermeyer
2002-10-08 19:59         ` Chris Wedgwood
2002-10-08 20:03         ` Andrew Morton
2002-10-08 20:34           ` Matthias Schniedermeyer
2002-10-08 20:42             ` Andrew Morton
2002-10-08 20:37           ` Larry McVoy
2002-10-09 11:53 ` Roy Sigurd Karlsbakk
2002-10-09 14:10 ` Marco Colombo
2002-10-09 14:14   ` Robert Love
2002-10-09 14:33     ` Richard B. Johnson
2002-10-09 15:27     ` Andreas Dilger
2002-10-09 23:17       ` Jamie Lokier
2002-10-09 23:46       ` Rik van Riel
2002-10-10  0:16         ` Jamie Lokier
2002-10-10  2:39           ` Erik Andersen
2002-10-10 10:33         ` Marco Colombo
2002-10-10 20:00           ` Erik Andersen
2002-10-11  4:16 Hank Leininger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).