* [PATCH 0/3] context readahead for concurrent IO take 2
@ 2009-04-12  7:19 Wu Fengguang
  2009-04-12  7:19 ` [PATCH 1/3] radix-tree: add radix_tree_prev_hole() Wu Fengguang
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Wu Fengguang @ 2009-04-12  7:19 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Vladislav Bolkhovitin, LKML

Andrew,

The context readahead patches have been floating around for a while.  Now that
Vladislav confirmed that it helps SCST IO performance, I'd like to push it out
for more -mm testing. I believe this will help many concurrent IO workloads,
to name a few: NFS, SCST and even lighttpd.

The patchset is against linux-next, after the earlier filemap and readahead
cleanup/fix patches.

Thanks,
Fengguang
-- 



* [PATCH 1/3] radix-tree: add radix_tree_prev_hole()
  2009-04-12  7:19 [PATCH 0/3] context readahead for concurrent IO take 2 Wu Fengguang
@ 2009-04-12  7:19 ` Wu Fengguang
  2009-04-12 17:29   ` Andrew Morton
  2009-04-12  7:19 ` [PATCH 2/3] readahead: move the random read case to bottom Wu Fengguang
  2009-04-12  7:19 ` [PATCH 3/3] readahead: introduce context readahead algorithm Wu Fengguang
  2 siblings, 1 reply; 16+ messages in thread
From: Wu Fengguang @ 2009-04-12  7:19 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Vladislav Bolkhovitin, Wu Fengguang, LKML

[-- Attachment #1: radixtree-prev-hole.patch --]
[-- Type: text/plain, Size: 2364 bytes --]

The counterpart of radix_tree_next_hole(). To be used by context readahead.
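
For reference, the intended caller (count_history_pages() in patch 3) uses it
like this (a sketch of that code):

	rcu_read_lock();
	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
	rcu_read_unlock();

	return offset - 1 - head;	/* number of contiguously cached history pages */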

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/radix-tree.h |    2 +
 lib/radix-tree.c           |   37 +++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

--- mm.orig/lib/radix-tree.c
+++ mm/lib/radix-tree.c
@@ -666,6 +666,43 @@ unsigned long radix_tree_next_hole(struc
 }
 EXPORT_SYMBOL(radix_tree_next_hole);
 
+/**
+ *	radix_tree_prev_hole    -    find the prev hole (not-present entry)
+ *	@root:		tree root
+ *	@index:		index key
+ *	@max_scan:	maximum range to search
+ *
+ *	Search backwards in the range [max(index-max_scan+1, 0), index]
+ *	for the first hole.
+ *
+ *	Returns: the index of the hole if found, otherwise returns an index
+ *	outside of the set specified (in which case 'index - return >= max_scan'
+ *	will be true). In rare cases of wrap-around, ULONG_MAX will be returned.
+ *
+ *	radix_tree_prev_hole may be called under rcu_read_lock. However, like
+ *	radix_tree_gang_lookup, this will not atomically search a snapshot of
+ *	the tree at a single point in time. For example, if a hole is created
+ *	at index 10, then subsequently a hole is created at index 5,
+ *	radix_tree_prev_hole covering both indexes may return 5 if called under
+ *	rcu_read_lock.
+ */
+unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
+				   unsigned long index, unsigned long max_scan)
+{
+	unsigned long i;
+
+	for (i = 0; i < max_scan; i++) {
+		if (!radix_tree_lookup(root, index))
+			break;
+		index--;
+		if (index == ULONG_MAX)
+			break;
+	}
+
+	return index;
+}
+EXPORT_SYMBOL(radix_tree_prev_hole);
+
 static unsigned int
 __lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index)
--- mm.orig/include/linux/radix-tree.h
+++ mm/include/linux/radix-tree.h
@@ -167,6 +167,8 @@ radix_tree_gang_lookup_slot(struct radix
 			unsigned long first_index, unsigned int max_items);
 unsigned long radix_tree_next_hole(struct radix_tree_root *root,
 				unsigned long index, unsigned long max_scan);
+unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
+				unsigned long index, unsigned long max_scan);
 int radix_tree_preload(gfp_t gfp_mask);
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,

-- 



* [PATCH 2/3] readahead: move the random read case to bottom
  2009-04-12  7:19 [PATCH 0/3] context readahead for concurrent IO take 2 Wu Fengguang
  2009-04-12  7:19 ` [PATCH 1/3] radix-tree: add radix_tree_prev_hole() Wu Fengguang
@ 2009-04-12  7:19 ` Wu Fengguang
  2009-04-12  7:19 ` [PATCH 3/3] readahead: introduce context readahead algorithm Wu Fengguang
  2 siblings, 0 replies; 16+ messages in thread
From: Wu Fengguang @ 2009-04-12  7:19 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Vladislav Bolkhovitin, Wu Fengguang, LKML

[-- Attachment #1: readahead-reorder-cases.patch --]
[-- Type: text/plain, Size: 2383 bytes --]

Split all readahead cases, and move the random one to bottom.

No behavior changes.

This is to prepare for the introduction of context readahead,
and to make it easy to insert accounting/tracing points for each case.
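
For reference, the resulting case order in ondemand_readahead() can be
summarized by the following sketch (a hypothetical helper with made-up names;
the real code, shown in the diff below, uses gotos rather than a classification
function):

	enum ra_case {
		RA_INITIAL,	/* start of file, oversize read, sequential cache miss */
		RA_RAMP_UP,	/* read at the expected readahead offset */
		RA_INTERLEAVED,	/* hit a readahead marker without valid state (unchanged) */
		RA_RANDOM,	/* standalone small read: bypass the readahead state */
	};

	static enum ra_case classify(struct file_ra_state *ra, pgoff_t offset,
				     unsigned long req_size, unsigned long max,
				     bool hit_readahead_marker)
	{
		if (!offset)
			return RA_INITIAL;
		if (offset == ra->start + ra->size - ra->async_size ||
		    offset == ra->start + ra->size)
			return RA_RAMP_UP;
		if (hit_readahead_marker)
			return RA_INTERLEAVED;
		if (req_size > max)
			return RA_INITIAL;
		if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
			return RA_INITIAL;
		return RA_RANDOM;
	}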

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/readahead.c |   46 +++++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 21 deletions(-)

--- mm.orig/mm/readahead.c
+++ mm/mm/readahead.c
@@ -339,33 +339,25 @@ ondemand_readahead(struct address_space 
 		   unsigned long req_size)
 {
 	unsigned long max = max_sane_readahead(ra->ra_pages);
-	pgoff_t prev_offset;
-	int	sequential;
+
+	/*
+	 * start of file
+	 */
+	if (!offset)
+		goto initial_readahead;
 
 	/*
 	 * It's the expected callback offset, assume sequential access.
 	 * Ramp up sizes, and push forward the readahead window.
 	 */
-	if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
-			offset == (ra->start + ra->size))) {
+	if ((offset == (ra->start + ra->size - ra->async_size) ||
+	     offset == (ra->start + ra->size))) {
 		ra->start += ra->size;
 		ra->size = get_next_ra_size(ra, max);
 		ra->async_size = ra->size;
 		goto readit;
 	}
 
-	prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
-	sequential = offset - prev_offset <= 1UL || req_size > max;
-
-	/*
-	 * Standalone, small read.
-	 * Read as is, and do not pollute the readahead state.
-	 */
-	if (!hit_readahead_marker && !sequential) {
-		return __do_page_cache_readahead(mapping, filp,
-						offset, req_size, 0);
-	}
-
 	/*
 	 * Hit a marked page without valid readahead state.
 	 * E.g. interleaved reads.
@@ -391,12 +383,24 @@ ondemand_readahead(struct address_space 
 	}
 
 	/*
-	 * It may be one of
-	 * 	- first read on start of file
-	 * 	- sequential cache miss
-	 * 	- oversize random read
-	 * Start readahead for it.
+	 * oversize read
 	 */
+	if (req_size > max)
+		goto initial_readahead;
+
+	/*
+	 * sequential cache miss
+	 */
+	if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
+		goto initial_readahead;
+
+	/*
+	 * standalone, small random read
+	 * Read as is, and do not pollute the readahead state.
+	 */
+	return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
+
+initial_readahead:
 	ra->start = offset;
 	ra->size = get_init_ra_size(req_size, max);
 	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

-- 



* [PATCH 3/3] readahead: introduce context readahead algorithm
  2009-04-12  7:19 [PATCH 0/3] context readahead for concurrent IO take 2 Wu Fengguang
  2009-04-12  7:19 ` [PATCH 1/3] radix-tree: add radix_tree_prev_hole() Wu Fengguang
  2009-04-12  7:19 ` [PATCH 2/3] readahead: move the random read case to bottom Wu Fengguang
@ 2009-04-12  7:19 ` Wu Fengguang
  2009-04-12  8:48   ` Ingo Molnar
  2009-04-15  3:43   ` Jeff Moyer
  2 siblings, 2 replies; 16+ messages in thread
From: Wu Fengguang @ 2009-04-12  7:19 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vladislav Bolkhovitin, Jens Axboe, Jeff Moyer, Wu Fengguang, LKML

[-- Attachment #1: readahead-context.patch --]
[-- Type: text/plain, Size: 4980 bytes --]

Introduce page cache context based readahead algorithm.
This is to better support concurrent read streams in general.

RATIONALE
---------
The current readahead algorithm detects interleaved reads in a _passive_ way.
Given a sequence of interleaved streams 1,1001,2,1002,3,4,1003,5,1004,1005,6,...
By checking for (offset == prev_offset + 1), it will discover the sequentialness
between 3,4 and between 1004,1005, and start doing sequential readahead for the
individual streams since page 4 and page 1005.

The context readahead algorithm guarantees to discover the sequentialness no
matter how the streams are interleaved. For the above example, it will start
sequential readahead since page 2 and 1002.

The trick is to poke for page @offset-1 in the page cache when it has no other
clues on the sequentialness of request @offset: if the current request belongs
to a sequential stream, that stream must have accessed page @offset-1 recently,
and the page will still be cached now. So if page @offset-1 is there, we can
take request @offset as a sequential access.
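
In pseudo-C, the probe amounts to roughly the following (an illustrative sketch
only; the patch below goes through radix_tree_prev_hole() via
count_history_pages() so that it can also count how many history pages are
present):

	struct page *prev;

	rcu_read_lock();
	prev = radix_tree_lookup(&mapping->page_tree, offset - 1);
	rcu_read_unlock();

	if (prev) {
		/* history page found: take @offset as a sequential access and
		 * set up a readahead window starting at @offset */
	} else {
		/* no trace of a predecessor: treat it as a random read */
	}

For the interleaved example above, when page 2 is requested page 1 is still
cached while page 0 is a hole, so count_history_pages() returns 1 and
try_context_readahead() starts a new window at page 2; the same happens for
page 1002 of the second stream.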

BENEFICIARIES
-------------
- strictly interleaved reads  i.e. 1,1001,2,1002,3,1003,...
  the current readahead will take them as silly random reads;
  the context readahead will take them as two sequential streams.

- cooperative IO processes   i.e. NFS and SCST
  They create a thread pool, farming off (sequential) IO requests to different
  threads which will be performing interleaved IO.

  It was not easy (or possible) to reliably tell from file->f_ra all those
  cooperative processes working on the same sequential stream, since they will
  have different file->f_ra instances. And NFSD's file->f_ra is particularly
  unusable, since their file objects are dynamically created for each request.
  The nfsd does have code trying to restore the f_ra bits, but it is not satisfactory.

  The new scheme is to detect the sequential pattern via looking up the page
  cache, which provides one single and consistent view of the pages recently
  accessed. That makes sequential detection for cooperative processes possible.

USER REPORT
-----------
Vladislav recommends the addition of context readahead as a result of his SCST
benchmarks. It leads to 6%~40% performance gains in various cases and achieves
equal performance in others.                http://lkml.org/lkml/2009/3/19/239

OVERHEADS
---------
In theory, it introduces one extra page cache lookup per random read.  However,
the benchmark below shows context readahead to be slightly faster, which is a
bit puzzling.

Randomly reading 200MB of data from a sparse file, repeated 20 times for
each block size. The average throughputs are:

                       	original ra	context ra	gain
 4K random reads:	 65.561MB/s	 65.648MB/s	+0.1%
16K random reads:	124.767MB/s	124.951MB/s	+0.1%
64K random reads: 	162.123MB/s	162.278MB/s	+0.1%
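
The benchmark itself is nothing special; a hypothetical sketch of it (not the
actual test program) would look like this, run once per block size:

	#include <fcntl.h>
	#include <stdlib.h>
	#include <sys/types.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		const size_t bs = 4096;			/* block size under test */
		const long nr_blocks = (1L << 30) / bs;	/* offsets within a 1GB sparse file */
		long long total = 200LL << 20;		/* 200MB of data in total */
		char *buf = malloc(bs);
		int fd;

		if (argc < 2 || !buf || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		srand(1);				/* fixed seed: repeatable runs */
		for (; total > 0; total -= bs) {
			off_t offset = (off_t)(rand() % nr_blocks) * bs;
			if (pread(fd, buf, bs, offset) < 0)
				return 1;
		}
		return 0;
	}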

Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Tested-by: Vladislav Bolkhovitin <vst@vlnb.net>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/readahead.c |   60 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

--- mm.orig/mm/readahead.c
+++ mm/mm/readahead.c
@@ -330,6 +330,59 @@ static unsigned long get_next_ra_size(st
  */
 
 /*
+ * Count contiguously cached pages from @offset-1 to @offset-@max,
+ * this count is a conservative estimation of
+ * 	- length of the sequential read sequence, or
+ * 	- thrashing threshold in memory tight systems
+ */
+static pgoff_t count_history_pages(struct address_space *mapping,
+				   struct file_ra_state *ra,
+				   pgoff_t offset, unsigned long max)
+{
+	pgoff_t head;
+
+	rcu_read_lock();
+	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
+	rcu_read_unlock();
+
+	return offset - 1 - head;
+}
+
+/*
+ * page cache context based read-ahead
+ */
+static int try_context_readahead(struct address_space *mapping,
+				 struct file_ra_state *ra,
+				 pgoff_t offset,
+				 unsigned long req_size,
+				 unsigned long max)
+{
+	pgoff_t size;
+
+	size = count_history_pages(mapping, ra, offset, max);
+
+	/*
+	 * no history pages:
+	 * it could be a random read
+	 */
+	if (!size)
+		return 0;
+
+	/*
+	 * starts from beginning of file:
+	 * it is a strong indication of long-run stream (or whole-file-read)
+	 */
+	if (size >= offset)
+		size *= 2;
+
+	ra->start = offset;
+	ra->size = get_init_ra_size(size + req_size, max);
+	ra->async_size = ra->size;
+
+	return 1;
+}
+
+/*
  * A minimal readahead algorithm for trivial sequential/random reads.
  */
 static unsigned long
@@ -395,6 +448,13 @@ ondemand_readahead(struct address_space 
 		goto initial_readahead;
 
 	/*
+	 * Query the page cache and look for the traces(cached history pages)
+	 * that a sequential stream would leave behind.
+	 */
+	if (try_context_readahead(mapping, ra, offset, req_size, max))
+		goto readit;
+
+	/*
 	 * standalone, small random read
 	 * Read as is, and do not pollute the readahead state.
 	 */

-- 



* Re: [PATCH 3/3] readahead: introduce context readahead algorithm
  2009-04-12  7:19 ` [PATCH 3/3] readahead: introduce context readahead algorithm Wu Fengguang
@ 2009-04-12  8:48   ` Ingo Molnar
  2009-04-12 12:35     ` Wu Fengguang
  2009-04-15  3:43   ` Jeff Moyer
  1 sibling, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2009-04-12  8:48 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Andrew Morton, Vladislav Bolkhovitin, Jens Axboe, Jeff Moyer,
	LKML, Peter Zijlstra


* Wu Fengguang <fengguang.wu@intel.com> wrote:

> Introduce page cache context based readahead algorithm.
> This is to better support concurrent read streams in general.

>  /*
> + * Count contiguously cached pages from @offset-1 to @offset-@max,
> + * this count is a conservative estimation of
> + * 	- length of the sequential read sequence, or
> + * 	- thrashing threshold in memory tight systems
> + */
> +static pgoff_t count_history_pages(struct address_space *mapping,
> +				   struct file_ra_state *ra,
> +				   pgoff_t offset, unsigned long max)
> +{
> +	pgoff_t head;
> +
> +	rcu_read_lock();
> +	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
> +	rcu_read_unlock();
> +
> +	return offset - 1 - head;
> +}

Very elegant method! I suspect this will work far better 
than adding various increasingly more complex heuristics.

Emphatically-Acked-by: Ingo Molnar <mingo@elte.hu>

	Ingo


* Re: [PATCH 3/3] readahead: introduce context readahead algorithm
  2009-04-12  8:48   ` Ingo Molnar
@ 2009-04-12 12:35     ` Wu Fengguang
  2009-04-16 17:12       ` Vladislav Bolkhovitin
  0 siblings, 1 reply; 16+ messages in thread
From: Wu Fengguang @ 2009-04-12 12:35 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Vladislav Bolkhovitin, Jens Axboe, Jeff Moyer,
	LKML, Peter Zijlstra, Nick Piggin, Rik van Riel, Linus Torvalds,
	Chenfeng Xu, linux-mm

On Sun, Apr 12, 2009 at 04:48:19PM +0800, Ingo Molnar wrote:
> 
> * Wu Fengguang <fengguang.wu@intel.com> wrote:
> 
> > Introduce page cache context based readahead algorithm.
> > This is to better support concurrent read streams in general.
> 
> >  /*
> > + * Count contiguously cached pages from @offset-1 to @offset-@max,
> > + * this count is a conservative estimation of
> > + * 	- length of the sequential read sequence, or
> > + * 	- thrashing threshold in memory tight systems
> > + */
> > +static pgoff_t count_history_pages(struct address_space *mapping,
> > +				   struct file_ra_state *ra,
> > +				   pgoff_t offset, unsigned long max)
> > +{
> > +	pgoff_t head;
> > +
> > +	rcu_read_lock();
> > +	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
> > +	rcu_read_unlock();
> > +
> > +	return offset - 1 - head;
> > +}
> 
> Very elegant method! I suspect this will work far better 
> than adding various increasingly more complex heuristics.
> 
> Emphatically-Acked-by: Ingo Molnar <mingo@elte.hu>

Thank you Ingo!

The only pity is that this heuristic can be defeated by some user
space program that tries to do aggressive drop-behind via
fadvise(DONTNEED) calls. But as long as the drop-behind algorithm
is a bit lazy and does not try to squeeze out the last page at @offset-1,
this patch will work just fine.
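
For illustration, the drop-behind pattern meant here would be something like the
following user-space sketch (hypothetical code, not any particular program):

	#include <fcntl.h>
	#include <unistd.h>

	static void stream_with_aggressive_drop_behind(int fd)
	{
		char buf[64 * 1024];
		off_t done = 0;
		ssize_t n;

		while ((n = read(fd, buf, sizeof(buf))) > 0) {
			done += n;
			/* evict everything consumed so far, including the page
			 * at @offset-1 that the context readahead probe needs */
			posix_fadvise(fd, 0, done, POSIX_FADV_DONTNEED);
		}
	}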

The context readahead idea is so fundamental, that a slightly modified
algorithm can be used for all kinds of sequential patterns, and it can
automatically adapt to the thrashing threshold.

        1 if (probe_page(index - 1)) {
        2          begin = next_hole(index, max);
        3          H     = index - prev_hole(index, 2*max);
        4          end   = index + H;
        5          update_window(begin, end);
        6          submit_io();
        7 }

            [=] history [#] current [_] readahead [.] new readahead
            ==========================#____________..............
        1                            ^index-1
        2                             |----------->[begin
        3  |<----------- H -----------|
        4                             |----------- H ----------->]end
        5                                          [ new window ]


We didn't do that because we want to
- avoid unnecessary page cache lookups for normal cases
- be more aggressive when thrashings are not a concern

However, readahead thrashing is far more prevalent than one would
expect in an FTP/HTTP file streaming server. It can happen in a modern
server with 16GB memory, 1Gbps outbound bandwidth and 1MB readahead
size, due to the existence of slow streams.

Let's do a coarse calculation. The 8GB inactive_list pages will be
cycled in 64Gb/1Gbps=64 seconds. This means an async readahead window
must be consumed in 64s, or it will be thrashed.  That's a speed of
2048KB/64s=32KB/s. Any client below this speed will cause thrashing
in the server. In practice, those poor slow clients could amount to
half of the total connections (partly because it takes them more
time to download anything). The frequent thrashing will in turn
speed up the LRU cycling/aging...

We need a thrashing-safe mode which does
- the above modified context readahead algorithm
- conservative ramp up of readahead size
- conservative async readahead size

The main problem is: when shall we switch into the mode?

We can start with aggressive readahead, try to detect readahead
thrashing, and switch into the thrashing-safe mode automatically. This
will work for non-interleaved reads.  However, the big file streamer -
lighttpd - does interleaved reads.  The current data structure is not
able to detect most readahead thrashing for lighttpd.

Luckily, the non-resident page tracking facility could help this case.
There the thrashed pages with their timing info can be found, based on
which we can have some extended context readahead algorithm that could
even overcome the drop-behind problem :)

Thanks,
Fengguang


* Re: [PATCH 1/3] radix-tree: add radix_tree_prev_hole()
  2009-04-12  7:19 ` [PATCH 1/3] radix-tree: add radix_tree_prev_hole() Wu Fengguang
@ 2009-04-12 17:29   ` Andrew Morton
  2009-04-13 13:44     ` Wu Fengguang
  0 siblings, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2009-04-12 17:29 UTC (permalink / raw)
  To: Wu Fengguang; +Cc: Vladislav Bolkhovitin, LKML

On Sun, 12 Apr 2009 15:19:51 +0800 Wu Fengguang <fengguang.wu@intel.com> wrote:

> +unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
> +				   unsigned long index, unsigned long max_scan)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < max_scan; i++) {
> +		if (!radix_tree_lookup(root, index))
> +			break;
> +		index--;
> +		if (index == ULONG_MAX)
> +			break;
> +	}
> +
> +	return index;
> +}

argh.  This is about as inefficient as we could possibly make it :(

Really, this function should dive into radix-tree internals and walk
individual radix_tree_node.slots[] arrays.  And heck, it can peek at
radix_tree_node.count and _bypass_ entire nodes, too.



* Re: [PATCH 1/3] radix-tree: add radix_tree_prev_hole()
  2009-04-12 17:29   ` Andrew Morton
@ 2009-04-13 13:44     ` Wu Fengguang
  0 siblings, 0 replies; 16+ messages in thread
From: Wu Fengguang @ 2009-04-13 13:44 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Vladislav Bolkhovitin, LKML

On Sun, Apr 12, 2009 at 10:29:52AM -0700, Andrew Morton wrote:
> On Sun, 12 Apr 2009 15:19:51 +0800 Wu Fengguang <fengguang.wu@intel.com> wrote:
> 
> > +unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
> > +				   unsigned long index, unsigned long max_scan)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < max_scan; i++) {
> > +		if (!radix_tree_lookup(root, index))
> > +			break;
> > +		index--;
> > +		if (index == ULONG_MAX)
> > +			break;
> > +	}
> > +
> > +	return index;
> > +}
> 
> argh.  This is about as inefficient as we could possibly make it :(

Right, a dumb loop!

> Really, this function should dive into radix-tree internals and walk
> individual radix_tree_node.slots[] arrays.  And heck, it can peek at
> radix_tree_node.count and _bypass_ entire nodes, too.

Good idea! In fact I'm planning such a smart version. It will be using
radix_tree_lookup_slot() to access the ->count member, in order to
check if the whole slot can be bypassed.

radix_tree_next_hole() is another optimization candidate.
But that will be a post 2.6.30 stuff.

The current dumb-but-obvious-right version is OK for 2.6.30, because
in most radix_tree_prev_hole() invocations the actual loop count will
be merely 1.

Thanks,
Fengguang


* Re: [PATCH 3/3] readahead: introduce context readahead algorithm
  2009-04-12  7:19 ` [PATCH 3/3] readahead: introduce context readahead algorithm Wu Fengguang
  2009-04-12  8:48   ` Ingo Molnar
@ 2009-04-15  3:43   ` Jeff Moyer
  2009-04-15  4:43     ` Wu Fengguang
  1 sibling, 1 reply; 16+ messages in thread
From: Jeff Moyer @ 2009-04-15  3:43 UTC (permalink / raw)
  To: Wu Fengguang; +Cc: Andrew Morton, Vladislav Bolkhovitin, Jens Axboe, LKML

Wu Fengguang <fengguang.wu@intel.com> writes:

> Introduce page cache context based readahead algorithm.
> This is to better support concurrent read streams in general.
>
> RATIONALE
> ---------
> The current readahead algorithm detects interleaved reads in a _passive_ way.
> Given a sequence of interleaved streams 1,1001,2,1002,3,4,1003,5,1004,1005,6,...
> By checking for (offset == prev_offset + 1), it will discover the sequentialness
> between 3,4 and between 1004,1005, and start doing sequential readahead for the
> individual streams since page 4 and page 1005.
>
> The context readahead algorithm guarantees to discover the sequentialness no
> matter how the streams are interleaved. For the above example, it will start
> sequential readahead since page 2 and 1002.
>
> The trick is to poke for page @offset-1 in the page cache when it has no other
> clues on the sequentialness of request @offset: if the current request belongs
> to a sequential stream, that stream must have accessed page @offset-1 recently,
> and the page will still be cached now. So if page @offset-1 is there, we can
> take request @offset as a sequential access.
>
> BENEFICIARIES
> -------------
> - strictly interleaved reads  i.e. 1,1001,2,1002,3,1003,...
>   the current readahead will take them as silly random reads;
>   the context readahead will take them as two sequential streams.
>
> - cooperative IO processes   i.e. NFS and SCST
>   They create a thread pool, farming off (sequential) IO requests to different
>   threads which will be performing interleaved IO.
>
>   It was not easy(or possible) to reliably tell from file->f_ra all those
>   cooperative processes working on the same sequential stream, since they will
>   have different file->f_ra instances. And NFSD's file->f_ra is particularly
>   unusable, since their file objects are dynamically created for each request.
>   The nfsd does have code trying to restore the f_ra bits, but not satisfactory.

Hi, Wu,

I tested out your patches.  Below are some basic iozone numbers for a
single NFS client reading a file.  The iozone command line is:

  iozone -s 2000000 -r 64 -f /mnt/test/testfile -i 1 -w

The file system is unmounted after each run to flush the cache.  The
numbers below reflect only a single run each.  The file system was also
unmounted on the NFS client after each run.

KEY
---
vanilla:	   2.6.30-rc1
readahead:	   2.6.30-rc1 + your 10 readahead patches
context readahead: 2.6.30-rc1 + your 10 readahead patches + the 3
		   context readahead patches.
nfsd's:		   number of NFSD threads on the server

I'll note that the cfq in 2.6.30-rc1 is crippled, and that Jens has a
patch posted that makes the numbers look at least a little better, but
that's immaterial to this discussion, I think.

                vanilla

nfsd's  |   1   |   2   |   4   |   8
--------+---------------+-------+------
cfq     | 43127 | 22354 | 20858 | 21179
deadline| 43732 | 68059 | 76659 | 83231

                readahead

nfsd's  |   1   |   2   |   4   |   8
--------+---------------+-------+------
cfq     | 42471 | 21913 | 21252 | 20979
deadline| 42801 | 70158 | 82068 | 82406

           context readahead

nfsd's  |   1   |   2   |   4   |   8
--------+---------------+-------+------
cfq     | 42827 | 21882 | 20678 | 21508
deadline| 43040 | 71173 | 82407 | 86583

Cheers,
Jeff


* Re: [PATCH 3/3] readahead: introduce context readahead algorithm
  2009-04-15  3:43   ` Jeff Moyer
@ 2009-04-15  4:43     ` Wu Fengguang
  2009-04-15 17:55       ` Jeff Moyer
  0 siblings, 1 reply; 16+ messages in thread
From: Wu Fengguang @ 2009-04-15  4:43 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Andrew Morton, Vladislav Bolkhovitin, Jens Axboe, LKML

On Wed, Apr 15, 2009 at 11:43:32AM +0800, Jeff Moyer wrote:
> Wu Fengguang <fengguang.wu@intel.com> writes:
> 
> > Introduce page cache context based readahead algorithm.
> > This is to better support concurrent read streams in general.
> >
> > RATIONALE
> > ---------
> > The current readahead algorithm detects interleaved reads in a _passive_ way.
> > Given a sequence of interleaved streams 1,1001,2,1002,3,4,1003,5,1004,1005,6,...
> > By checking for (offset == prev_offset + 1), it will discover the sequentialness
> > between 3,4 and between 1004,1005, and start doing sequential readahead for the
> > individual streams since page 4 and page 1005.
> >
> > The context readahead algorithm guarantees to discover the sequentialness no
> > matter how the streams are interleaved. For the above example, it will start
> > sequential readahead since page 2 and 1002.
> >
> > The trick is to poke for page @offset-1 in the page cache when it has no other
> > clues on the sequentialness of request @offset: if the current request belongs
> > to a sequential stream, that stream must have accessed page @offset-1 recently,
> > and the page will still be cached now. So if page @offset-1 is there, we can
> > take request @offset as a sequential access.
> >
> > BENEFICIARIES
> > -------------
> > - strictly interleaved reads  i.e. 1,1001,2,1002,3,1003,...
> >   the current readahead will take them as silly random reads;
> >   the context readahead will take them as two sequential streams.
> >
> > - cooperative IO processes   i.e. NFS and SCST
> >   They create a thread pool, farming off (sequential) IO requests to different
> >   threads which will be performing interleaved IO.
> >
> >   It was not easy(or possible) to reliably tell from file->f_ra all those
> >   cooperative processes working on the same sequential stream, since they will
> >   have different file->f_ra instances. And NFSD's file->f_ra is particularly
> >   unusable, since their file objects are dynamically created for each request.
> >   The nfsd does have code trying to restore the f_ra bits, but not satisfactory.
> 
> Hi, Wu,
> 
> I tested out your patches.  Below are some basic iozone numbers for a
> single NFS client reading a file.  The iozone command line is:
> 
>   iozone -s 2000000 -r 64 -f /mnt/test/testfile -i 1 -w

Jeff, thank you very much for the testing out!

> The file system is unmounted after each run to flush the cache.  The
> numbers below reflect only a single run each.  The file system was also
> unmounted on the NFS client after each run.
> 
> KEY
> ---
> vanilla:	   2.6.30-rc1
> readahead:	   2.6.30-rc1 + your 10 readahead patches
> context readahead: 2.6.30-rc1 + your 10 readahead patches + the 3
> 		   context readahead patches.
> nfsd's:		   number of NFSD threads on the server

I guess you are applying the readahead patches to the server side?

What's the NFS mount options and client/server side readahead size?
The context readahead is pretty sensitive to these parameters.

> I'll note that the cfq in 2.6.30-rc1 is crippled, and that Jens has a
> patch posted that makes the numbers look at least a little better, but
> that's immaterial to this discussion, I think.
> 
>                 vanilla
> 
> nfsd's  |   1   |   2   |   4   |   8
> --------+---------------+-------+------
> cfq     | 43127 | 22354 | 20858 | 21179
> deadline| 43732 | 68059 | 76659 | 83231
> 
>                 readahead
> 
> nfsd's  |   1   |   2   |   4   |   8
> --------+---------------+-------+------
> cfq     | 42471 | 21913 | 21252 | 20979
> deadline| 42801 | 70158 | 82068 | 82406
> 
>            context readahead
> 
> nfsd's  |   1   |   2   |   4   |   8
> --------+---------------+-------+------
> cfq     | 42827 | 21882 | 20678 | 21508
> deadline| 43040 | 71173 | 82407 | 86583

Let me transform them into relative numbers:

             A     B     C      A..B      A..C         
cfq-1      43127 42471 42827    -1.5%     -0.7%         
cfq-2      22354 21913 21882    -2.0%     -2.1%         
cfq-4      20858 21252 20678    +1.9%     -0.9%         
cfq-8      21179 20979 21508    -0.9%     +1.6%         
           
deadline-1 43732 42801 43040    -2.1%     -1.6%         
deadline-2 68059 70158 71173    +3.1%     +4.6%         
deadline-4 76659 82068 82407    +7.1%     +7.5%         
deadline-8 83231 82406 86583    -1.0%     +4.0%         

Summaries:
1) the overall numbers are slightly negative for CFQ and look better
   with deadline.

Anyway we have the io context problem for CFQ.  And I'm planning to
dive into the CFQ code and your patch on that :-)

2) the single thread case performance consistently dropped by 1-2%.

It seems not related to the behavior changes introduced by the mmap
readahead patches and context readahead patches. And looks more like
some overheads created by the code reorganization and the patch
"readahead: apply max_sane_readahead() limit in ondemand_readahead()"
which adds a bit overhead with the call max_sane_readahead().

I'll try to root cause it.

Thanks again for the numbers!

Regards,
Fengguang


* Re: [PATCH 3/3] readahead: introduce context readahead algorithm
  2009-04-15  4:43     ` Wu Fengguang
@ 2009-04-15 17:55       ` Jeff Moyer
  2009-04-27  4:48         ` Wu Fengguang
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Moyer @ 2009-04-15 17:55 UTC (permalink / raw)
  To: Wu Fengguang; +Cc: Andrew Morton, Vladislav Bolkhovitin, Jens Axboe, LKML

Hi, Fengguang,

Wu Fengguang <fengguang.wu@intel.com> writes:

>> I tested out your patches.  Below are some basic iozone numbers for a
>> single NFS client reading a file.  The iozone command line is:
>> 
>>   iozone -s 2000000 -r 64 -f /mnt/test/testfile -i 1 -w
>
> Jeff, thank you very much for the testing out!
>
>> The file system is unmounted after each run to flush the cache.  The
>> numbers below reflect only a single run each.  The file system was also
>> unmounted on the NFS client after each run.
>> 
>> KEY
>> ---
>> vanilla:	   2.6.30-rc1
>> readahead:	   2.6.30-rc1 + your 10 readahead patches
>> context readahead: 2.6.30-rc1 + your 10 readahead patches + the 3
>> 		   context readahead patches.
>> nfsd's:		   number of NFSD threads on the server
>
> I guess you are applying the readahead patches to the server side?

That's right.

> What's the NFS mount options and client/server side readahead size?
> The context readahead is pretty sensitive to these parameters.

Default options everywhere.

>> I'll note that the cfq in 2.6.30-rc1 is crippled, and that Jens has a
>> patch posted that makes the numbers look at least a little better, but
>> that's immaterial to this discussion, I think.
>> 
>>                 vanilla
>> 
>> nfsd's  |   1   |   2   |   4   |   8
>> --------+---------------+-------+------
>> cfq     | 43127 | 22354 | 20858 | 21179
>> deadline| 43732 | 68059 | 76659 | 83231
>> 
>>                 readahead
>> 
>> nfsd's  |   1   |   2   |   4   |   8
>> --------+---------------+-------+------
>> cfq     | 42471 | 21913 | 21252 | 20979
>> deadline| 42801 | 70158 | 82068 | 82406
>> 
>>            context readahead
>> 
>> nfsd's  |   1   |   2   |   4   |   8
>> --------+---------------+-------+------
>> cfq     | 42827 | 21882 | 20678 | 21508
>> deadline| 43040 | 71173 | 82407 | 86583
>
> Let me transform them into relative numbers:
>
>              A     B     C      A..B      A..C         
> cfq-1      43127 42471 42827    -1.5%     -0.7%         
> cfq-2      22354 21913 21882    -2.0%     -2.1%         
> cfq-4      20858 21252 20678    +1.9%     -0.9%         
> cfq-8      21179 20979 21508    -0.9%     +1.6%         
>            
> deadline-1 43732 42801 43040    -2.1%     -1.6%         
> deadline-2 68059 70158 71173    +3.1%     +4.6%         
> deadline-4 76659 82068 82407    +7.1%     +7.5%         
> deadline-8 83231 82406 86583    -1.0%     +4.0%         
>
> Summaries:
> 1) the overall numbers are slightly negative for CFQ and looks better
>    with deadline.

The variance is probably 1-2%.  I'll try to quantify that for you.

> Anyway we have the io context problem for CFQ.  And I'm planning to
> dive into the CFQ code and your patch on that :-)

Jens already reworked the patch and included it in his for-linus branch
of the block tree.  So, you can start there.  ;-)

> 2) the single thread case performance consistently dropped by 1-2%.

> It seems not related to the behavior changes introduced by the mmap
> readahead patches and context readahead patches. And looks more like
> some overheads created by the code reorganization and the patch
> "readahead: apply max_sane_readahead() limit in ondemand_readahead()"
> which adds a bit overhead with the call max_sane_readahead().
>
> I'll try to root cause it.
>
> Thanks again for the numbers!

No problem.

Cheers,
Jeff


* Re: [PATCH 3/3] readahead: introduce context readahead algorithm
  2009-04-12 12:35     ` Wu Fengguang
@ 2009-04-16 17:12       ` Vladislav Bolkhovitin
  0 siblings, 0 replies; 16+ messages in thread
From: Vladislav Bolkhovitin @ 2009-04-16 17:12 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Ingo Molnar, Andrew Morton, Jens Axboe, Jeff Moyer, LKML,
	Peter Zijlstra, Nick Piggin, Rik van Riel, Linus Torvalds,
	Chenfeng Xu, linux-mm

Wu Fengguang, on 04/12/2009 04:35 PM wrote:
> On Sun, Apr 12, 2009 at 04:48:19PM +0800, Ingo Molnar wrote:
>> * Wu Fengguang <fengguang.wu@intel.com> wrote:
>>
>>> Introduce page cache context based readahead algorithm.
>>> This is to better support concurrent read streams in general.
>>>  /*
>>> + * Count contiguously cached pages from @offset-1 to @offset-@max,
>>> + * this count is a conservative estimation of
>>> + * 	- length of the sequential read sequence, or
>>> + * 	- thrashing threshold in memory tight systems
>>> + */
>>> +static pgoff_t count_history_pages(struct address_space *mapping,
>>> +				   struct file_ra_state *ra,
>>> +				   pgoff_t offset, unsigned long max)
>>> +{
>>> +	pgoff_t head;
>>> +
>>> +	rcu_read_lock();
>>> +	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
>>> +	rcu_read_unlock();
>>> +
>>> +	return offset - 1 - head;
>>> +}
>> Very elegant method! I suspect this will work far better 
>> than adding various increasingly more complex heuristics.
>>
>> Emphatically-Acked-by: Ingo Molnar <mingo@elte.hu>
> 
> Thank you Ingo!
> 
> The only pity is that this heuristic can be defeated by some user
> space program that tries to do aggressive drop-behind via
> fadvise(DONTNEED) calls. But as long as the drop-behind algorithm
> be a bit lazy and does not try to squeeze the last page at @offset-1,
> this patch will work just OK.
> 
> The context readahead idea is so fundamental, that a slightly modified
> algorithm can be used for all kinds of sequential patterns, and it can
> automatically adapt to the thrashing threshold.
> 
>         1 if (probe_page(index - 1)) {
>         2          begin = next_hole(index, max);
>         3          H     = index - prev_hole(index, 2*max);
>         4          end   = index + H;
>         5          update_window(begin, end);
>         6          submit_io();
>         7 }
> 
>             [=] history [#] current [_] readahead [.] new readahead
>             ==========================#____________..............
>         1                            ^index-1
>         2                             |----------->[begin
>         3  |<----------- H -----------|
>         4                             |----------- H ----------->]end
>         5                                          [ new window ]
> 
> 
> We didn't do that because we want to
> - avoid unnecessary page cache lookups for normal cases
> - be more aggressive when thrashings are not a concern
> 
> However, readahead thrashings are far more prevalent than one would
> expect in a FTP/HTTP file streaming server. It can happen in a modern
> server with 16GB memory, 1Gbps outbound bandwidth and 1MB readahead
> size, due to the existences of slow streams.
> 
> Let's do a coarse calculation. The 8GB inactive_list pages will be
> cycled in 64Gb/1Gbps=64 seconds. This means an async readahead window
> must be consumed in 64s, or it will be thrashed.  That's a speed of
> 2048KB/64s=32KB/s. Any client below this speed will create thrashings
> in the server. In practice, those poor slow clients could amount to
> half of the total connections(partly because it will take them more
> time to download anything). The frequent thrashings will in return
> speedup the LRU cycling/aging speed...
> 
> We need a thrashing safe mode which do
> - the above modified context readahead algorithm
> - conservative ramp up of readahead size
> - conservative async readahead size
> 
> The main problem is: when shall we switch into the mode?

The more I think about an ideal readahead, the more it looks like the page
cache should also keep fairness between its users, similar to what is done
for CPU (CFS) and disk (CFQ). A slow user should have a chance to use its
chunk of the cache in the face of a too-fast thrasher.

The main problems with it are to define what "user" is and how to 
implement the fairness in an acceptably simple way.

Maybe something like this: a "user" is an IO context (or IO stream?) and, if
there is a need to get some pages for "user" A, pages belonging to
other "users" would be evicted with an additional weight W, so A's pages
would be preferred during eviction.

Just my 0.05c.

> We can start with aggressive readahead and try to detect readahead
> thrashings and switch into thrashing safe mode automatically. This
> will work for non-interleaved reads.  However the big file streamer -
> lighttpd - does interleaved reads.  The current data structure is not
> able to detect most readahead thrashings for lighttpd.
> 
> Luckily, the non-resident page tracking facility could help this case.
> There the thrashed pages with their timing info can be found, based on
> which we can have some extended context readahead algorithm that could
> even overcome the drop-behind problem :)
> 
> Thanks,
> Fengguang
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 


* Re: [PATCH 3/3] readahead: introduce context readahead algorithm
  2009-04-15 17:55       ` Jeff Moyer
@ 2009-04-27  4:48         ` Wu Fengguang
  0 siblings, 0 replies; 16+ messages in thread
From: Wu Fengguang @ 2009-04-27  4:48 UTC (permalink / raw)
  To: Jeff Moyer
  Cc: Andrew Morton, Vladislav Bolkhovitin, Jens Axboe, LKML,
	linux-nfs, Trond Myklebust, Neil Brown

Hi Jeff,

I did some more NFS readahead tests. Judging from your tests and mine, I can
say that the context readahead is safe for trivial NFS workloads :-) It is
behaving in the expected way, and the overheads, if any, are within the
fluctuation margin.

On Thu, Apr 16, 2009 at 01:55:48AM +0800, Jeff Moyer wrote:
> Hi, Fengguang,
>
> Wu Fengguang <fengguang.wu@intel.com> writes:
>
> >> I tested out your patches.  Below are some basic iozone numbers for a
> >> single NFS client reading a file.  The iozone command line is:
> >>
> >>   iozone -s 2000000 -r 64 -f /mnt/test/testfile -i 1 -w
> >
> > Jeff, thank you very much for the testing out!
> >
> >> The file system is unmounted after each run to flush the cache.  The
> >> numbers below reflect only a single run each.  The file system was also
> >> unmounted on the NFS client after each run.
> >>
> >> KEY
> >> ---
> >> vanilla:	   2.6.30-rc1
> >> readahead:	   2.6.30-rc1 + your 10 readahead patches
> >> context readahead: 2.6.30-rc1 + your 10 readahead patches + the 3
> >> 		   context readahead patches.
> >> nfsd's:		   number of NFSD threads on the server
> >
> > I guess you are applying the readahead patches to the server side?
>
> That's right.
>
> > What's the NFS mount options and client/server side readahead size?
> > The context readahead is pretty sensitive to these parameters.
>
> Default options everywhere.

The default options observed in my test platforms:
        - client: CFQ, kernel 2.6.30-rc3 + linux-2.6-block.git for linus
        - server: CFQ, kernel 2.6.30-rc2-next-20090417
is
        - rsize=256k
        - NFS readahead size=3840k (= 256k * 15)
        - sda readahead size=128k

> >> I'll note that the cfq in 2.6.30-rc1 is crippled, and that Jens has a
> >> patch posted that makes the numbers look at least a little better, but
> >> that's immaterial to this discussion, I think.
[snip]
> > Let me transform them into relative numbers:
> >
> >              A     B     C      A..B      A..C
> > cfq-1      43127 42471 42827    -1.5%     -0.7%
> > cfq-2      22354 21913 21882    -2.0%     -2.1%
> > cfq-4      20858 21252 20678    +1.9%     -0.9%
> > cfq-8      21179 20979 21508    -0.9%     +1.6%
> >
> > deadline-1 43732 42801 43040    -2.1%     -1.6%
> > deadline-2 68059 70158 71173    +3.1%     +4.6%
> > deadline-4 76659 82068 82407    +7.1%     +7.5%
> > deadline-8 83231 82406 86583    -1.0%     +4.0%
> >
> > Summaries:
> > 1) the overall numbers are slightly negative for CFQ and looks better
> >    with deadline.
>
> The variance is probably 1-2%.  I'll try to quantify that for you.

I tried to measure the overheads, here is the approach:
- random read(4K) syscalls on a huge sparse file over NFS
- server side readahead size=1M, otherwise all default options

The -0.1%, +0.5% differences in time are close enough to the variance.

                  vanilla    +max_sane_readahead()      +mmap readahead
        run-1     77.01s      77.18                     77.96s
        run-2     77.18s      77.53                     77.76s
        run-3     77.93s      77.57                     77.84s
        run-4     77.76s                                78.16s
        run-5     77.55s                                77.76s
        run-6                                           77.90s
        avg       77.486      77.427                    77.897
        diff%                 -0.1%                     +0.5%

> > Anyway we have the io context problem for CFQ.  And I'm planning to
> > dive into the CFQ code and your patch on that :-)
>
> Jens already reworked the patch and included it in his for-linus branch
> of the block tree.  So, you can start there.  ;-)

Good news. I'm running with it :-)

> > 2) the single thread case performance consistently dropped by 1-2%.
>
> > It seems not related to the behavior changes introduced by the mmap
> > readahead patches and context readahead patches. And looks more like
> > some overheads created by the code reorganization and the patch
> > "readahead: apply max_sane_readahead() limit in ondemand_readahead()"
> > which adds a bit overhead with the call max_sane_readahead().
> >
> > I'll try to root cause it.

Then I went on to test sequential reads on real files over NFS.

Again the differences are small enough.

        vanilla        +mmap&context readahead   diff%
nfsd=1  28.875s        28.770s                   -0.4%
nfsd=8  42.533s        42.255s                   -0.7%

For the single nfsd case, the readahead sequence is perfect and exactly the
same before/after the context readahead patch:

[   60.542986] readahead-initial0(pid=3124(nfsd), dev=08:02(sda2), ino=129(vmlinux-2.6.29), req=0+64, ra=0+128-64, async=0) = 128
[   60.573652] readahead-subsequent(pid=3124(nfsd), dev=08:02(sda2), ino=129(vmlinux-2.6.29), req=64+32, ra=128+256-256, async=1) = 256
[   60.590312] readahead-subsequent(pid=3124(nfsd), dev=08:02(sda2), ino=129(vmlinux-2.6.29), req=128+32, ra=384+256-256, async=1) = 256
[   60.652863] readahead-subsequent(pid=3124(nfsd), dev=08:02(sda2), ino=129(vmlinux-2.6.29), req=384+32, ra=640+256-256, async=1) = 256
[   60.713916] readahead-subsequent(pid=3124(nfsd), dev=08:02(sda2), ino=129(vmlinux-2.6.29), req=640+32, ra=896+256-256, async=1) = 256
[   60.776168] readahead-subsequent(pid=3124(nfsd), dev=08:02(sda2), ino=129(vmlinux-2.6.29), req=896+32, ra=1152+256-256, async=1) = 256
[   60.837423] readahead-subsequent(pid=3124(nfsd), dev=08:02(sda2), ino=129(vmlinux-2.6.29), req=1152+32, ra=1408+256-256, async=1) = 256
[   60.899360] readahead-subsequent(pid=3124(nfsd), dev=08:02(sda2), ino=129(vmlinux-2.6.29), req=1408+32, ra=1664+256-256, async=1) = 256


Thanks,
Fengguang


* Re: [PATCH 3/3] readahead: introduce context readahead algorithm
  2009-04-11  0:16   ` Andrew Morton
@ 2009-04-12  7:11     ` Wu Fengguang
  0 siblings, 0 replies; 16+ messages in thread
From: Wu Fengguang @ 2009-04-12  7:11 UTC (permalink / raw)
  To: Andrew Morton; +Cc: vst, jens.axboe, jmoyer, linux-kernel

On Sat, Apr 11, 2009 at 08:16:52AM +0800, Andrew Morton wrote:
> On Fri, 10 Apr 2009 21:12:50 +0800
> Wu Fengguang <fengguang.wu@intel.com> wrote:
> 
> > Introduce page cache context based readahead algorithm.
> > This is to better support concurrent read streams in general.
> > 
> > RATIONALE
> > ---------
> > The current readahead algorithm detects interleaved reads in a _passive_ way.
> > Given a sequence of interleaved streams 1,1001,2,1002,3,4,1003,5,1004,1005,6,...
> > By checking for (offset == prev_offset + 1), it will discover the sequentialness
> > between 3,4 and between 1004,1005, and start doing sequential readahead for the
> > individual streams since page 4 and page 1005.
> > 
> > The context readahead algorithm guarantees to discover the sequentialness no
> > matter how the streams are interleaved. For the above example, it will start
> > sequential readahead since page 2 and 1002.
> > 
> > The trick is to poke for page @offset-1 in the page cache when it has no other
> > clues on the sequentialness of request @offset: if the current request belongs
> > to a sequential stream, that stream must have accessed page @offset-1 recently,
> > and the page will still be cached now. So if page @offset-1 is there, we can
> > take request @offset as a sequential access.
> > 
> > BENEFICIARIES
> > -------------
> > - strictly interleaved reads  i.e. 1,1001,2,1002,3,1003,...
> >   the current readahead will take them as silly random reads;
> >   the context readahead will take them as two sequential streams.
> > 
> > - seeky _column_ iterations on a huge matrix
> >   Yes it can be regarded as _massively_ interleaved streams!
> >   Context readahead could transform the 1-page IOs (@offset+@size):
> > 	0+1, 1000+1, 2000+1, 3000+1, ...,
> > 	1+1, 1001+1, 2001+1, 3001+1, ...,
> > 	2+1, 1002+1, 2002+1, 3002+1, ...
> >   into larger sized IOs:
> > 	0+1, 1000+1, 2000+1, 3000+1, ...,
> > 	1+4, 1001+4, 2001+4, 3001+4, ...,
> > 	5+8, 1005+8, 2005+8, 3005+8, ...
> > 
> > - cooperative IO processes   i.e. NFS and SCST
> >   They create a thread pool, farming off (sequential) IO requests to different
> >   threads which will be performing interleaved IO.
> > 
> >   It was not easy(or possible) to reliably tell from file->f_ra all those
> >   cooperative processes working on the same sequential stream, since they will
> >   have different file->f_ra instances. And NFSD's file->f_ra is particularly
> >   unusable, since their file objects are dynamically created for each request.
> >   The nfsd does have code trying to restore the f_ra bits, but not satisfactory.
> > 
> >   The new scheme is to detect the sequential pattern via looking up the page
> >   cache, which provides one single and consistent view of the pages recently
> >   accessed. That makes sequential detection for cooperative processes possible.
> > 
> > USER REPORT
> > -----------
> > Vladislav recommends the addition of context readahead as a result of his SCST
> > benchmarks. It leads to 6%~40% performance gains in various cases and achieves
> > equal performance in others.                http://lkml.org/lkml/2009/3/19/239
> > 
> > OVERHEADS
> > ---------
> > In theory, it introduces one extra page cache lookup per random read.  However
> > the below benchmark shows context readahead to be slightly faster, wondering..
> > 
> > Randomly reading 200MB amount of data on a sparse file, repeat 20 times for
> > each block size. The average throughputs are:
> > 
> >                        	original ra	context ra	gain
> >  4K random reads:	 65.561MB/s	 65.648MB/s	+0.1%
> > 16K random reads:	124.767MB/s	124.951MB/s	+0.1%
> > 64K random reads: 	162.123MB/s	162.278MB/s	+0.1%
> > 
> > Cc: Jens Axboe <jens.axboe@oracle.com>
> > Cc: Jeff Moyer <jmoyer@redhat.com>
> > Tested-by: Vladislav Bolkhovitin <vst@vlnb.net>
> > Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> 
> > ---
> >  mm/readahead.c |   60 +++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 60 insertions(+)
> > 
> > --- mm.orig/mm/readahead.c
> > +++ mm/mm/readahead.c
> > @@ -330,6 +330,59 @@ static unsigned long get_next_ra_size(st
> >   */
> >  
> >  /*
> > + * Count continuously cached pages from @offset-1 to @offset-@max,
> 
> You meant "contiguously" here, yes?

Ah yes, continuously for time and contiguously for space?

> > + * this count is a conservative estimation of
> > + * 	- length of the sequential read sequence, or
> > + * 	- thrashing threshold in memory tight systems
> > + */
> > +static unsigned long count_history_pages(struct address_space *mapping,
> > +					 struct file_ra_state *ra,
> > +					 pgoff_t offset, unsigned long max)
> > +{
> > +	pgoff_t head;
> > +
> > +	rcu_read_lock();
> > +	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
> > +	rcu_read_unlock();
> > +
> > +	return offset - 1 - head;
> > +}
> 
> Doesn't matter much, but perhaps this should return pgoff_t.

Do you mean I should use pgoff_t for the size?

> > +/*
> > + * page cache context based read-ahead
> > + */
> > +static int try_context_readahead(struct address_space *mapping,
> > +				 struct file_ra_state *ra,
> > +				 pgoff_t offset,
> > +				 unsigned long req_size,
> > +				 unsigned long max)
> > +{
> > +	unsigned long size;
> 
> And this could be pgoff_t too.

OK. I'll repost the whole series.

Thanks,
Fengguang


* Re: [PATCH 3/3] readahead: introduce context readahead algorithm
  2009-04-10 13:12 ` [PATCH 3/3] readahead: introduce context readahead algorithm Wu Fengguang
@ 2009-04-11  0:16   ` Andrew Morton
  2009-04-12  7:11     ` Wu Fengguang
  0 siblings, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2009-04-11  0:16 UTC (permalink / raw)
  To: Wu Fengguang; +Cc: vst, jens.axboe, jmoyer, fengguang.wu, linux-kernel

On Fri, 10 Apr 2009 21:12:50 +0800
Wu Fengguang <fengguang.wu@intel.com> wrote:

> Introduce page cache context based readahead algorithm.
> This is to better support concurrent read streams in general.
> 
> RATIONALE
> ---------
> The current readahead algorithm detects interleaved reads in a _passive_ way.
> Given a sequence of interleaved streams 1,1001,2,1002,3,4,1003,5,1004,1005,6,...
> By checking for (offset == prev_offset + 1), it will discover the sequentialness
> between 3,4 and between 1004,1005, and start doing sequential readahead for the
> individual streams since page 4 and page 1005.
> 
> The context readahead algorithm guarantees to discover the sequentialness no
> matter how the streams are interleaved. For the above example, it will start
> sequential readahead since page 2 and 1002.
> 
> The trick is to poke for page @offset-1 in the page cache when it has no other
> clues on the sequentialness of request @offset: if the current request belongs
> to a sequential stream, that stream must have accessed page @offset-1 recently,
> and the page will still be cached now. So if page @offset-1 is there, we can
> take request @offset as a sequential access.
> 
> BENEFICIARIES
> -------------
> - strictly interleaved reads  i.e. 1,1001,2,1002,3,1003,...
>   the current readahead will take them as silly random reads;
>   the context readahead will take them as two sequential streams.
> 
> - seeky _column_ iterations on a huge matrix
>   Yes it can be regarded as _massively_ interleaved streams!
>   Context readahead could transform the 1-page IOs (@offset+@size):
> 	0+1, 1000+1, 2000+1, 3000+1, ...,
> 	1+1, 1001+1, 2001+1, 3001+1, ...,
> 	2+1, 1002+1, 2002+1, 3002+1, ...
>   into larger sized IOs:
> 	0+1, 1000+1, 2000+1, 3000+1, ...,
> 	1+4, 1001+4, 2001+4, 3001+4, ...,
> 	5+8, 1005+8, 2005+8, 3005+8, ...
> 
> - cooperative IO processes   i.e. NFS and SCST
>   They create a thread pool, farming off (sequential) IO requests to different
>   threads which will be performing interleaved IO.
> 
>   It was not easy(or possible) to reliably tell from file->f_ra all those
>   cooperative processes working on the same sequential stream, since they will
>   have different file->f_ra instances. And NFSD's file->f_ra is particularly
>   unusable, since their file objects are dynamically created for each request.
>   The nfsd does have code trying to restore the f_ra bits, but not satisfactory.
> 
>   The new scheme is to detect the sequential pattern via looking up the page
>   cache, which provides one single and consistent view of the pages recently
>   accessed. That makes sequential detection for cooperative processes possible.
> 
> USER REPORT
> -----------
> Vladislav recommends the addition of context readahead as a result of his SCST
> benchmarks. It leads to 6%~40% performance gains in various cases and achieves
> equal performance in others.                http://lkml.org/lkml/2009/3/19/239
> 
> OVERHEADS
> ---------
> In theory, it introduces one extra page cache lookup per random read.  However
> the below benchmark shows context readahead to be slightly faster, wondering..
> 
> Randomly reading 200MB amount of data on a sparse file, repeat 20 times for
> each block size. The average throughputs are:
> 
>                        	original ra	context ra	gain
>  4K random reads:	 65.561MB/s	 65.648MB/s	+0.1%
> 16K random reads:	124.767MB/s	124.951MB/s	+0.1%
> 64K random reads: 	162.123MB/s	162.278MB/s	+0.1%
> 
> Cc: Jens Axboe <jens.axboe@oracle.com>
> Cc: Jeff Moyer <jmoyer@redhat.com>
> Tested-by: Vladislav Bolkhovitin <vst@vlnb.net>
> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>

> ---
>  mm/readahead.c |   60 +++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 60 insertions(+)
> 
> --- mm.orig/mm/readahead.c
> +++ mm/mm/readahead.c
> @@ -330,6 +330,59 @@ static unsigned long get_next_ra_size(st
>   */
>  
>  /*
> + * Count continuously cached pages from @offset-1 to @offset-@max,

You meant "contiguously" here, yes?

> + * this count is a conservative estimation of
> + * 	- length of the sequential read sequence, or
> + * 	- thrashing threshold in memory tight systems
> + */
> +static unsigned long count_history_pages(struct address_space *mapping,
> +					 struct file_ra_state *ra,
> +					 pgoff_t offset, unsigned long max)
> +{
> +	pgoff_t head;
> +
> +	rcu_read_lock();
> +	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
> +	rcu_read_unlock();
> +
> +	return offset - 1 - head;
> +}

Doesn't matter much, but perhaps this should return pgoff_t.
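
Something along these lines, as a sketch (pgoff_t is an unsigned long under the
hood, so the arithmetic and the callers should not need any change):

	static pgoff_t count_history_pages(struct address_space *mapping,
					   struct file_ra_state *ra,
					   pgoff_t offset, unsigned long max)
	{
		pgoff_t head;

		rcu_read_lock();
		head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
		rcu_read_unlock();

		return offset - 1 - head;
	}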

> +/*
> + * page cache context based read-ahead
> + */
> +static int try_context_readahead(struct address_space *mapping,
> +				 struct file_ra_state *ra,
> +				 pgoff_t offset,
> +				 unsigned long req_size,
> +				 unsigned long max)
> +{
> +	unsigned long size;

And this could be pgoff_t too.

> +	size = count_history_pages(mapping, ra, offset, max);
> +
> +	/*
> +	 * no history pages:
> +	 * it could be a random read
> +	 */
> +	if (!size)
> +		return 0;
> +
> +	/*
> +	 * starts from beginning of file:
> +	 * it is a strong indication of long-run stream (or whole-file-read)
> +	 */
> +	if (size >= offset)
> +		size *= 2;
> +
> +	ra->start = offset;
> +	ra->size = get_init_ra_size(size + req_size, max);
> +	ra->async_size = ra->size;
> +
> +	return 1;
> +}
> +
> +/*
>   * A minimal readahead algorithm for trivial sequential/random reads.
>   */
>  static unsigned long
> @@ -395,6 +448,13 @@ ondemand_readahead(struct address_space 
>  		goto initial_readahead;
>  
>  	/*
> +	 * Query the page cache and look for the traces (cached history pages)
> +	 * that a sequential stream would leave behind.
> +	 */
> +	if (try_context_readahead(mapping, ra, offset, req_size, max))
> +		goto readit;
> +
> +	/*
>  	 * standalone, small random read
>  	 * Read as is, and do not pollute the readahead state.
>  	 */
> 
> -- 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 3/3] readahead: introduce context readahead algorithm
  2009-04-10 13:12 [PATCH 0/3] context readahead for concurrent IO Wu Fengguang
@ 2009-04-10 13:12 ` Wu Fengguang
  2009-04-11  0:16   ` Andrew Morton
  0 siblings, 1 reply; 16+ messages in thread
From: Wu Fengguang @ 2009-04-10 13:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vladislav Bolkhovitin, Jens Axboe, Jeff Moyer, Wu Fengguang, LKML

[-- Attachment #1: readahead-context.patch --]
[-- Type: text/plain, Size: 5396 bytes --]

Introduce page cache context based readahead algorithm.
This is to better support concurrent read streams in general.

RATIONALE
---------
The current readahead algorithm detects interleaved reads in a _passive_ way.
Given a sequence of interleaved streams 1,1001,2,1002,3,4,1003,5,1004,1005,6,...,
its check for (offset == prev_offset + 1) will only discover the sequentialness
between 3,4 and between 1004,1005, and will start sequential readahead for the
individual streams from page 4 and page 1005 onwards.

The context readahead algorithm is guaranteed to discover the sequentialness no
matter how the streams are interleaved. For the above example, it will start
sequential readahead from page 2 and page 1002.

The trick is to poke for page @offset-1 in the page cache when there are no
other clues about the sequentialness of request @offset: if the current request
belongs to a sequential stream, that stream must have accessed page @offset-1
recently, and the page will still be cached now. So if page @offset-1 is there,
we can take request @offset as a sequential access.
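
As an illustration only, the probe boils down to something like the sketch below
(hypothetical helper name; the real patch uses radix_tree_prev_hole() instead,
so that it can also measure how long the cached run is):

	/*
	 * Sketch: take request @offset as sequential iff the page right
	 * before it is still present in the page cache.  The offset == 0
	 * case and the run length measurement are left out here.
	 */
	static int probe_page_cache_context(struct address_space *mapping,
					    pgoff_t offset)
	{
		void *page;

		rcu_read_lock();
		page = radix_tree_lookup(&mapping->page_tree, offset - 1);
		rcu_read_unlock();

		return page != NULL;
	}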

BENEFICIARIES
-------------
- strictly interleaved reads  i.e. 1,1001,2,1002,3,1003,...
  the current readahead will take them as silly random reads;
  the context readahead will take them as two sequential streams.

- seeky _column_ iterations on a huge matrix
  Yes, it can be regarded as _massively_ interleaved streams!
  Context readahead could transform the 1-page IOs (@offset+@size):
	0+1, 1000+1, 2000+1, 3000+1, ...,
	1+1, 1001+1, 2001+1, 3001+1, ...,
	2+1, 1002+1, 2002+1, 3002+1, ...
  into larger sized IOs:
	0+1, 1000+1, 2000+1, 3000+1, ...,
	1+4, 1001+4, 2001+4, 3001+4, ...,
	5+8, 1005+8, 2005+8, 3005+8, ...

- cooperative IO processes   i.e. NFS and SCST
  They create a thread pool, farming off (sequential) IO requests to different
  threads, which end up performing interleaved IO.

  It was not easy (or even possible) to reliably tell from file->f_ra that all
  those cooperative processes are working on the same sequential stream, since
  they each have their own file->f_ra instance. And NFSD's file->f_ra is
  particularly unusable, since its file objects are dynamically created for
  each request. The nfsd does have code that tries to restore the f_ra bits,
  but it is not satisfactory.

  The new scheme is to detect the sequential pattern by looking up the page
  cache, which provides a single, consistent view of the pages recently
  accessed. That makes sequential detection for cooperative processes possible.

USER REPORT
-----------
Vladislav recommends the addition of context readahead as a result of his SCST
benchmarks. It leads to 6%~40% performance gains in various cases and achieves
equal performance in others.                http://lkml.org/lkml/2009/3/19/239

OVERHEADS
---------
In theory, it introduces one extra page cache lookup per random read.  However,
the benchmark below shows context readahead to be slightly faster; the reason
for that is not yet clear.

Randomly read 200MB of data from a sparse file, repeated 20 times for each
block size (a sketch of the measurement loop follows the table). The average
throughputs are:

                       	original ra	context ra	gain
 4K random reads:	 65.561MB/s	 65.648MB/s	+0.1%
16K random reads:	124.767MB/s	124.951MB/s	+0.1%
64K random reads: 	162.123MB/s	162.278MB/s	+0.1%
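
A sketch of the measurement loop in the spirit of the above numbers (hypothetical
stand-alone program and file name; not the exact harness that produced the table):

	/* buffered random reads: 200MB in total, bs bytes at a time */
	#define _XOPEN_SOURCE 500
	#include <fcntl.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(void)
	{
		const size_t bs = 4096;			/* block size under test */
		const off_t fsize = 1L << 30;		/* 1GB sparse test file */
		size_t total = 200L << 20;		/* read 200MB per run */
		char *buf = malloc(bs);
		int fd = open("sparse.file", O_RDONLY);	/* hypothetical file */

		if (fd < 0 || !buf)
			return 1;
		srandom(getpid());
		while (total) {
			off_t pos = (random() % (fsize / bs)) * bs;
			if (pread(fd, buf, bs, pos) < 0)
				return 1;
			total -= bs;
		}
		close(fd);
		free(buf);
		return 0;
	}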

Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Tested-by: Vladislav Bolkhovitin <vst@vlnb.net>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/readahead.c |   60 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

--- mm.orig/mm/readahead.c
+++ mm/mm/readahead.c
@@ -330,6 +330,59 @@ static unsigned long get_next_ra_size(st
  */
 
 /*
+ * Count continuously cached pages from @offset-1 to @offset-@max,
+ * this count is a conservative estimation of
+ * 	- length of the sequential read sequence, or
+ * 	- thrashing threshold in memory tight systems
+ */
+static unsigned long count_history_pages(struct address_space *mapping,
+					 struct file_ra_state *ra,
+					 pgoff_t offset, unsigned long max)
+{
+	pgoff_t head;
+
+	rcu_read_lock();
+	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
+	rcu_read_unlock();
+
+	return offset - 1 - head;
+}
+
+/*
+ * page cache context based read-ahead
+ */
+static int try_context_readahead(struct address_space *mapping,
+				 struct file_ra_state *ra,
+				 pgoff_t offset,
+				 unsigned long req_size,
+				 unsigned long max)
+{
+	unsigned long size;
+
+	size = count_history_pages(mapping, ra, offset, max);
+
+	/*
+	 * no history pages:
+	 * it could be a random read
+	 */
+	if (!size)
+		return 0;
+
+	/*
+	 * starts from beginning of file:
+	 * it is a strong indication of long-run stream (or whole-file-read)
+	 */
+	if (size >= offset)
+		size *= 2;
+
+	ra->start = offset;
+	ra->size = get_init_ra_size(size + req_size, max);
+	ra->async_size = ra->size;
+
+	return 1;
+}
+
+/*
  * A minimal readahead algorithm for trivial sequential/random reads.
  */
 static unsigned long
@@ -395,6 +448,13 @@ ondemand_readahead(struct address_space 
 		goto initial_readahead;
 
 	/*
+	 * Query the page cache and look for the traces (cached history pages)
+	 * that a sequential stream would leave behind.
+	 */
+	if (try_context_readahead(mapping, ra, offset, req_size, max))
+		goto readit;
+
+	/*
 	 * standalone, small random read
 	 * Read as is, and do not pollute the readahead state.
 	 */

-- 


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2009-04-27  4:48 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-12  7:19 [PATCH 0/3] context readahead for concurrent IO take 2 Wu Fengguang
2009-04-12  7:19 ` [PATCH 1/3] radix-tree: add radix_tree_prev_hole() Wu Fengguang
2009-04-12 17:29   ` Andrew Morton
2009-04-13 13:44     ` Wu Fengguang
2009-04-12  7:19 ` [PATCH 2/3] readahead: move the random read case to bottom Wu Fengguang
2009-04-12  7:19 ` [PATCH 3/3] readahead: introduce context readahead algorithm Wu Fengguang
2009-04-12  8:48   ` Ingo Molnar
2009-04-12 12:35     ` Wu Fengguang
2009-04-16 17:12       ` Vladislav Bolkhovitin
2009-04-15  3:43   ` Jeff Moyer
2009-04-15  4:43     ` Wu Fengguang
2009-04-15 17:55       ` Jeff Moyer
2009-04-27  4:48         ` Wu Fengguang
  -- strict thread matches above, loose matches on Subject: below --
2009-04-10 13:12 [PATCH 0/3] context readahead for concurrent IO Wu Fengguang
2009-04-10 13:12 ` [PATCH 3/3] readahead: introduce context readahead algorithm Wu Fengguang
2009-04-11  0:16   ` Andrew Morton
2009-04-12  7:11     ` Wu Fengguang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).