linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/7] readahead cleanups and interleaved readahead take 3
       [not found] <20070721044300.909424569@mail.ustc.edu.cn>
@ 2007-07-21  4:43 ` Fengguang Wu
       [not found] ` <20070721044346.554186594@mail.ustc.edu.cn>
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-21  4:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Andrew,

The following patches are based on yesterday's discussions, compiled and
tested OK:

smaller file_ra_state:
	[PATCH 1/7] readahead: compacting file_ra_state
	[PATCH 2/7] readahead: mmap read-around simplification
	[PATCH 3/7] readahead: combine file_ra_state.prev_index/prev_offset into prev_

code cleanups:
	[PATCH 4/7] readahead: remove several readahead macros
	[PATCH 5/7] readahead: remove the limit max_sectors_kb imposed on max_readahead_kb

support of interleaved reads:
	[PATCH 6/7] radixtree: introduce radix_tree_scan_hole()
	[PATCH 7/7] readahead: basic support of interleaved reads

The diffstat is

 block/ll_rw_blk.c          |    9 -----
 fs/ext3/dir.c              |    2 -
 fs/ext4/dir.c              |    2 -
 fs/splice.c                |    2 -
 include/linux/fs.h         |   14 +++-----
 include/linux/mm.h         |    2 -
 include/linux/radix-tree.h |    2 +
 lib/radix-tree.c           |   34 ++++++++++++++++++++
 mm/filemap.c               |   17 +++++-----
 mm/readahead.c             |   58 +++++++++++++++++++----------------
 10 files changed, 86 insertions(+), 56 deletions(-)

Regards,
Fengguang Wu
--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/7] readahead: compacting file_ra_state
       [not found] ` <20070721044346.554186594@mail.ustc.edu.cn>
@ 2007-07-21  4:43   ` Fengguang Wu
  0 siblings, 0 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-21  4:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Andi Kleen

[-- Attachment #1: short-rasize.patch --]
[-- Type: text/plain, Size: 2010 bytes --]

Use 'unsigned int' instead of 'unsigned long' for readahead sizes.

This helps reduce memory consumption on 64bit CPU when
a lot of files are opened.

CC: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 include/linux/fs.h |    8 ++++----
 mm/filemap.c       |    2 +-
 mm/readahead.c     |    2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

--- linux-2.6.22-rc6-mm1.orig/include/linux/fs.h
+++ linux-2.6.22-rc6-mm1/include/linux/fs.h
@@ -771,12 +771,12 @@ struct fown_struct {
  * Track a single file's readahead state
  */
 struct file_ra_state {
-	pgoff_t start;                  /* where readahead started */
-	unsigned long size;             /* # of readahead pages */
-	unsigned long async_size;       /* do asynchronous readahead when
+	pgoff_t start;			/* where readahead started */
+	unsigned int size;		/* # of readahead pages */
+	unsigned int async_size;	/* do asynchronous readahead when
 					   there are only # of pages ahead */
 
-	unsigned long ra_pages;		/* Maximum readahead window */
+	unsigned int ra_pages;		/* Maximum readahead window */
 	unsigned long mmap_hit;		/* Cache hit stat for mmap accesses */
 	unsigned long mmap_miss;	/* Cache miss stat for mmap accesses */
 	unsigned long prev_index;	/* Cache last read() position */
--- linux-2.6.22-rc6-mm1.orig/mm/filemap.c
+++ linux-2.6.22-rc6-mm1/mm/filemap.c
@@ -840,7 +840,7 @@ static void shrink_readahead_size_eio(st
 	if (count > 5)
 		return;
 	count++;
-	printk(KERN_WARNING "Reducing readahead size to %luK\n",
+	printk(KERN_WARNING "Reducing readahead size to %dK\n",
 		ra->ra_pages << (PAGE_CACHE_SHIFT - 10));
 }
 
--- linux-2.6.22-rc6-mm1.orig/mm/readahead.c
+++ linux-2.6.22-rc6-mm1/mm/readahead.c
@@ -342,7 +342,7 @@ ondemand_readahead(struct address_space 
 		   bool hit_readahead_marker, pgoff_t offset,
 		   unsigned long req_size)
 {
-	unsigned long max;	/* max readahead pages */
+	int max;	/* max readahead pages */
 	int sequential;
 
 	max = ra->ra_pages;

--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 2/7] readahead: mmap read-around simplification
       [not found] ` <20070721044346.687587063@mail.ustc.edu.cn>
@ 2007-07-21  4:43   ` Fengguang Wu
  0 siblings, 0 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-21  4:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

[-- Attachment #1: remove-mmap-hit.patch --]
[-- Type: text/plain, Size: 1360 bytes --]

Fold file_ra_state.mmap_hit into file_ra_state.mmap_miss
and make it an int.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 include/linux/fs.h |    3 +--
 mm/filemap.c       |    4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

--- linux-2.6.22-rc6-mm1.orig/include/linux/fs.h
+++ linux-2.6.22-rc6-mm1/include/linux/fs.h
@@ -777,8 +777,7 @@ struct file_ra_state {
 					   there are only # of pages ahead */
 
 	unsigned int ra_pages;		/* Maximum readahead window */
-	unsigned long mmap_hit;		/* Cache hit stat for mmap accesses */
-	unsigned long mmap_miss;	/* Cache miss stat for mmap accesses */
+	int mmap_miss;			/* Cache miss stat for mmap accesses */
 	unsigned long prev_index;	/* Cache last read() position */
 	unsigned int prev_offset;	/* Offset where last read() ended in a page */
 };
--- linux-2.6.22-rc6-mm1.orig/mm/filemap.c
+++ linux-2.6.22-rc6-mm1/mm/filemap.c
@@ -1389,7 +1389,7 @@ retry_find:
 		 * Do we miss much more than hit in this file? If so,
 		 * stop bothering with read-ahead. It will only hurt.
 		 */
-		if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
+		if (ra->mmap_miss > MMAP_LOTSAMISS)
 			goto no_cached_page;
 
 		/*
@@ -1415,7 +1415,7 @@ retry_find:
 	}
 
 	if (!did_readaround)
-		ra->mmap_hit++;
+		ra->mmap_miss--;
 
 	/*
 	 * We have a locked page in the page cache, now we need to check

--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 3/7] readahead: combine file_ra_state.prev_index/prev_offset into prev_pos
       [not found] ` <20070721044346.824927556@mail.ustc.edu.cn>
@ 2007-07-21  4:43   ` Fengguang Wu
  0 siblings, 0 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-21  4:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: merge-start-prev_index.patch --]
[-- Type: text/plain, Size: 4936 bytes --]

Combine the file_ra_state members
				unsigned long prev_index
				unsigned int prev_offset
into
				loff_t prev_pos

It is more consistent and better supports huge files.

Thanks to Peter for the nice proposal!

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 fs/ext3/dir.c      |    2 +-
 fs/ext4/dir.c      |    2 +-
 fs/splice.c        |    2 +-
 include/linux/fs.h |    3 +--
 mm/filemap.c       |   11 ++++++-----
 mm/readahead.c     |   15 ++++++++-------
 6 files changed, 18 insertions(+), 17 deletions(-)

--- linux-2.6.22-rc6-mm1.orig/include/linux/fs.h
+++ linux-2.6.22-rc6-mm1/include/linux/fs.h
@@ -778,8 +778,7 @@ struct file_ra_state {
 
 	unsigned int ra_pages;		/* Maximum readahead window */
 	int mmap_miss;			/* Cache miss stat for mmap accesses */
-	unsigned long prev_index;	/* Cache last read() position */
-	unsigned int prev_offset;	/* Offset where last read() ended in a page */
+	loff_t prev_pos;		/* Cache last read() position */
 };
 
 /*
--- linux-2.6.22-rc6-mm1.orig/mm/filemap.c
+++ linux-2.6.22-rc6-mm1/mm/filemap.c
@@ -881,8 +881,8 @@ void do_generic_mapping_read(struct addr
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	next_index = index;
-	prev_index = ra.prev_index;
-	prev_offset = ra.prev_offset;
+	prev_index = ra.prev_pos >> PAGE_CACHE_SHIFT;
+	prev_offset = ra.prev_pos & (PAGE_CACHE_SIZE-1);
 	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 
@@ -968,7 +968,6 @@ page_ok:
 		index += offset >> PAGE_CACHE_SHIFT;
 		offset &= ~PAGE_CACHE_MASK;
 		prev_offset = offset;
-		ra.prev_offset = offset;
 
 		page_cache_release(page);
 		if (ret == nr && desc->count)
@@ -1055,7 +1054,9 @@ no_cached_page:
 
 out:
 	*_ra = ra;
-	_ra->prev_index = prev_index;
+	_ra->prev_pos = prev_index;
+	_ra->prev_pos <<= PAGE_CACHE_SHIFT;
+	_ra->prev_pos |= prev_offset;
 
 	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
 	if (filp)
@@ -1435,7 +1436,7 @@ retry_find:
 	 * Found the page and have a reference on it.
 	 */
 	mark_page_accessed(page);
-	ra->prev_index = page->index;
+	ra->prev_pos = page->index << PAGE_CACHE_SHIFT;
 	return page;
 
 outside_data_content:
--- linux-2.6.22-rc6-mm1.orig/mm/readahead.c
+++ linux-2.6.22-rc6-mm1/mm/readahead.c
@@ -45,7 +45,7 @@ void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 {
 	ra->ra_pages = mapping->backing_dev_info->ra_pages;
-	ra->prev_index = -1;
+	ra->prev_pos = -1;
 }
 EXPORT_SYMBOL_GPL(file_ra_state_init);
 
@@ -318,7 +318,7 @@ static unsigned long get_next_ra_size(st
  * indicator. The flag won't be set on already cached pages, to avoid the
  * readahead-for-nothing fuss, saving pointless page cache lookups.
  *
- * prev_index tracks the last visited page in the _previous_ read request.
+ * prev_pos tracks the last visited byte in the _previous_ read request.
  * It should be maintained by the caller, and will be used for detecting
  * small random reads. Note that the readahead algorithm checks loosely
  * for sequential patterns. Hence interleaved reads might be served as
@@ -342,11 +342,9 @@ ondemand_readahead(struct address_space 
 		   bool hit_readahead_marker, pgoff_t offset,
 		   unsigned long req_size)
 {
-	int max;	/* max readahead pages */
-	int sequential;
-
-	max = ra->ra_pages;
-	sequential = (offset - ra->prev_index <= 1UL) || (req_size > max);
+	int	max = ra->ra_pages;	/* max readahead pages */
+	pgoff_t prev_offset;
+	int	sequential;
 
 	/*
 	 * It's the expected callback offset, assume sequential access.
@@ -360,6 +358,9 @@ ondemand_readahead(struct address_space 
 		goto readit;
 	}
 
+	prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
+	sequential = offset - prev_offset <= 1UL || req_size > max;
+
 	/*
 	 * Standalone, small read.
 	 * Read as is, and do not pollute the readahead state.
--- linux-2.6.22-rc6-mm1.orig/fs/ext3/dir.c
+++ linux-2.6.22-rc6-mm1/fs/ext3/dir.c
@@ -143,7 +143,7 @@ static int ext3_readdir(struct file * fi
 					sb->s_bdev->bd_inode->i_mapping,
 					&filp->f_ra, filp,
 					index, 1);
-			filp->f_ra.prev_index = index;
+			filp->f_ra.prev_pos = index << PAGE_CACHE_SHIFT;
 			bh = ext3_bread(NULL, inode, blk, 0, &err);
 		}
 
--- linux-2.6.22-rc6-mm1.orig/fs/ext4/dir.c
+++ linux-2.6.22-rc6-mm1/fs/ext4/dir.c
@@ -142,7 +142,7 @@ static int ext4_readdir(struct file * fi
 					sb->s_bdev->bd_inode->i_mapping,
 					&filp->f_ra, filp,
 					index, 1);
-			filp->f_ra.prev_index = index;
+			filp->f_ra.prev_pos = index << PAGE_CACHE_SHIFT;
 			bh = ext4_bread(NULL, inode, blk, 0, &err);
 		}
 
--- linux-2.6.22-rc6-mm1.orig/fs/splice.c
+++ linux-2.6.22-rc6-mm1/fs/splice.c
@@ -455,7 +455,7 @@ fill_it:
 	 */
 	while (page_nr < nr_pages)
 		page_cache_release(pages[page_nr++]);
-	in->f_ra.prev_index = index;
+	in->f_ra.prev_pos = index << PAGE_CACHE_SHIFT;
 
 	if (spd.nr_pages)
 		return splice_to_pipe(pipe, &spd);

--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 4/7] readahead: remove several readahead macros
       [not found] ` <20070721044347.008643456@mail.ustc.edu.cn>
@ 2007-07-21  4:43   ` Fengguang Wu
  0 siblings, 0 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-21  4:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

[-- Attachment #1: readahead-macros-cleanup.patch --]
[-- Type: text/plain, Size: 1523 bytes --]

Remove VM_MAX_CACHE_HIT, MAX_RA_PAGES and MIN_RA_PAGES.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 include/linux/mm.h |    2 --
 mm/readahead.c     |   10 +---------
 2 files changed, 1 insertion(+), 11 deletions(-)

--- linux-2.6.22-rc6-mm1.orig/include/linux/mm.h
+++ linux-2.6.22-rc6-mm1/include/linux/mm.h
@@ -1148,8 +1148,6 @@ int write_one_page(struct page *page, in
 /* readahead.c */
 #define VM_MAX_READAHEAD	128	/* kbytes */
 #define VM_MIN_READAHEAD	16	/* kbytes (includes current page) */
-#define VM_MAX_CACHE_HIT    	256	/* max pages in a row in cache before
-					 * turning readahead off */
 
 int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			pgoff_t offset, unsigned long nr_to_read);
--- linux-2.6.22-rc6-mm1.orig/mm/readahead.c
+++ linux-2.6.22-rc6-mm1/mm/readahead.c
@@ -21,16 +21,8 @@ void default_unplug_io_fn(struct backing
 }
 EXPORT_SYMBOL(default_unplug_io_fn);
 
-/*
- * Convienent macros for min/max read-ahead pages.
- * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up.
- * The latter is necessary for systems with large page size(i.e. 64k).
- */
-#define MAX_RA_PAGES	(VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE)
-#define MIN_RA_PAGES	DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE)
-
 struct backing_dev_info default_backing_dev_info = {
-	.ra_pages	= MAX_RA_PAGES,
+	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
 	.state		= 0,
 	.capabilities	= BDI_CAP_MAP_COPY,
 	.unplug_io_fn	= default_unplug_io_fn,

--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 5/7] readahead: remove the limit max_sectors_kb imposed on max_readahead_kb
       [not found] ` <20070721044347.111061630@mail.ustc.edu.cn>
@ 2007-07-21  4:43   ` Fengguang Wu
  0 siblings, 0 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-21  4:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Jens Axboe

[-- Attachment #1: remove-readahead-size-limit.patch --]
[-- Type: text/plain, Size: 1300 bytes --]

Remove the size limit max_sectors_kb imposed on max_readahead_kb.

The size restriction is unreasonable. Especially when max_sectors_kb cannot
grow larger than max_hw_sectors_kb, which can be rather small for some disk
drives.

Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/ll_rw_blk.c |    9 ---------
 1 file changed, 9 deletions(-)

--- linux-2.6.22-rc6-mm1.orig/block/ll_rw_blk.c
+++ linux-2.6.22-rc6-mm1/block/ll_rw_blk.c
@@ -3945,7 +3945,6 @@ queue_max_sectors_store(struct request_q
 			max_hw_sectors_kb = q->max_hw_sectors >> 1,
 			page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
 	ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
-	int ra_kb;
 
 	if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
 		return -EINVAL;
@@ -3954,14 +3953,6 @@ queue_max_sectors_store(struct request_q
 	 * values synchronously:
 	 */
 	spin_lock_irq(q->queue_lock);
-	/*
-	 * Trim readahead window as well, if necessary:
-	 */
-	ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
-	if (ra_kb > max_sectors_kb)
-		q->backing_dev_info.ra_pages =
-				max_sectors_kb >> (PAGE_CACHE_SHIFT - 10);
-
 	q->max_sectors = max_sectors_kb << 1;
 	spin_unlock_irq(q->queue_lock);
 

--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 6/7] radixtree: introduce radix_tree_scan_hole()
       [not found] ` <20070721044347.250839328@mail.ustc.edu.cn>
@ 2007-07-21  4:43   ` Fengguang Wu
  2007-07-21  5:48     ` Andrew Morton
  2007-07-23  7:58     ` Nick Piggin
  0 siblings, 2 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-21  4:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Nick Piggin

[-- Attachment #1: radixtree-introduce-scan-hole-data-functions.patch --]
[-- Type: text/plain, Size: 2271 bytes --]

Introduce radix_tree_scan_hole(root, index, max_scan) to scan radix tree
for the first hole. It will be used in interleaved readahead.

The implementation is dumb and obviously correct.
It can help debug(and document) the possible smart one in future.

Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---

 include/linux/radix-tree.h |    2 ++
 lib/radix-tree.c           |   34 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

--- linux-2.6.22-rc6-mm1.orig/include/linux/radix-tree.h
+++ linux-2.6.22-rc6-mm1/include/linux/radix-tree.h
@@ -155,6 +155,8 @@ void *radix_tree_delete(struct radix_tre
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items);
+unsigned long radix_tree_scan_hole(struct radix_tree_root *root,
+				unsigned long index, unsigned long max_scan);
 int radix_tree_preload(gfp_t gfp_mask);
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
--- linux-2.6.22-rc6-mm1.orig/lib/radix-tree.c
+++ linux-2.6.22-rc6-mm1/lib/radix-tree.c
@@ -601,6 +601,40 @@ int radix_tree_tag_get(struct radix_tree
 EXPORT_SYMBOL(radix_tree_tag_get);
 #endif
 
+static unsigned long
+radix_tree_scan_hole_dumb(struct radix_tree_root *root,
+				unsigned long index, unsigned long max_scan)
+{
+	unsigned long i;
+
+	for (i = 0; i < max_scan; i++) {
+		if (!radix_tree_lookup(root, index))
+			break;
+		if (++index == 0)
+			break;
+	}
+
+	return index;
+}
+
+/**
+ *	radix_tree_scan_hole    -    scan for hole
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@max_scan:      advice on max items to scan (it may scan a little more)
+ *
+ *      Scan forward from @index for a hole/empty item, stop when
+ *      - hit hole
+ *      - wrap-around to index 0
+ *      - @max_scan or more items scanned
+ */
+unsigned long radix_tree_scan_hole(struct radix_tree_root *root,
+				unsigned long index, unsigned long max_scan)
+{
+	return radix_tree_scan_hole_dumb(root, index, max_scan);
+}
+EXPORT_SYMBOL(radix_tree_scan_hole);
+
 static unsigned int
 __lookup(struct radix_tree_node *slot, void **results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index)

--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 7/7] readahead: basic support of interleaved reads
       [not found] ` <20070721044347.388744012@mail.ustc.edu.cn>
@ 2007-07-21  4:43   ` Fengguang Wu
  2007-07-21  7:24   ` Rusty Russell
  1 sibling, 0 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-21  4:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Rusty Russell

[-- Attachment #1: readahead-interleaved-reads.patch --]
[-- Type: text/plain, Size: 3282 bytes --]

This is a simplified version of the pagecache context based readahead.
It handles the case of multiple threads reading on the same fd and invalidating
each others' readahead state. It does the trick by scanning the pagecache and
recovering the current read stream's readahead status.

The algorithm works in a opportunistic way, in that it do not try to detect
interleaved reads _actively_, which requires a probe into the page cache(which
means a little more overheads for random reads). It only tries to handle a
previously started sequential readahead whose state was overwritten by
another concurrent stream, and it can do this job pretty well.

Negative and positive examples(or what you can expect from it):

1) it cannot detect and serve perfect request-by-request interleaved reads
   right:
	time	stream 1  stream 2
	0 	1         
	1 	          1001
	2 	2
	3 	          1002
	4 	3
	5 	          1003
	6 	4
	7 	          1004
	8 	5
	9	          1005
Here no single readahead will be carried out.

2) However, if it's two concurrent reads by two threads, the chance of the
   initial sequential readahead be started is huge. Once the first sequential
   readahead is started for a stream, this patch will ensure that the readahead
   window continues to rampup and won't be disturbed by other streams.

	time	stream 1  stream 2
	0 	1         
	1 	2
	2 	          1001
	3 	3
	4 	          1002
	5 	          1003
	6 	4
	7 	5
	8 	          1004
	9 	6
	10	          1005
	11	7
	12	          1006
	13	          1007
Here steam 1 will start a readahead at page 2, and stream 2 will start its
first readahead at page 1003. From then on the two streams will be served right.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 mm/readahead.c |   33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

--- linux-2.6.22-rc6-mm1.orig/mm/readahead.c
+++ linux-2.6.22-rc6-mm1/mm/readahead.c
@@ -363,6 +363,29 @@ ondemand_readahead(struct address_space 
 	}
 
 	/*
+	 * Hit a marked page without valid readahead state.
+	 * E.g. interleaved reads.
+	 * Query the pagecache for async_size, which normally equals to
+	 * readahead size. Ramp it up and use it as the new readahead size.
+	 */
+	if (hit_readahead_marker) {
+		pgoff_t start;
+
+		read_lock_irq(&mapping->tree_lock);
+		start = radix_tree_scan_hole(&mapping->page_tree, offset, max+1);
+		read_unlock_irq(&mapping->tree_lock);
+
+		if (!start || start - offset > max)
+			return 0;
+
+		ra->start = start;
+		ra->size = start - offset;	/* old async_size */
+		ra->size = get_next_ra_size(ra, max);
+		ra->async_size = ra->size;
+		goto readit;
+	}
+
+	/*
 	 * It may be one of
 	 * 	- first read on start of file
 	 * 	- sequential cache miss
@@ -373,16 +396,6 @@ ondemand_readahead(struct address_space 
 	ra->size = get_init_ra_size(req_size, max);
 	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
 
-	/*
-	 * Hit on a marked page without valid readahead state.
-	 * E.g. interleaved reads.
-	 * Not knowing its readahead pos/size, bet on the minimal possible one.
-	 */
-	if (hit_readahead_marker) {
-		ra->start++;
-		ra->size = get_next_ra_size(ra, max);
-	}
-
 readit:
 	return ra_submit(ra, mapping, filp);
 }

--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/7] radixtree: introduce radix_tree_scan_hole()
  2007-07-21  4:43   ` [PATCH 6/7] radixtree: introduce radix_tree_scan_hole() Fengguang Wu
@ 2007-07-21  5:48     ` Andrew Morton
       [not found]       ` <20070721063629.GA7013@mail.ustc.edu.cn>
  2007-07-23  7:58     ` Nick Piggin
  1 sibling, 1 reply; 15+ messages in thread
From: Andrew Morton @ 2007-07-21  5:48 UTC (permalink / raw)
  To: Fengguang Wu; +Cc: linux-kernel, Nick Piggin

On Sat, 21 Jul 2007 12:43:06 +0800 Fengguang Wu <wfg@mail.ustc.edu.cn> wrote:

> Introduce radix_tree_scan_hole(root, index, max_scan) to scan radix tree
> for the first hole. It will be used in interleaved readahead.

If you're ever feeling fantastically bored, please consider updating the
userspace radix-tree test harness for this?  Cook up a couple of testcases
for the new functionality?

Thanks.

http://www.zip.com.au/~akpm/linux/patches/stuff/rtth.tar.gz is the latest.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/7] radixtree: introduce radix_tree_scan_hole()
       [not found]       ` <20070721063629.GA7013@mail.ustc.edu.cn>
@ 2007-07-21  6:36         ` Fengguang Wu
  0 siblings, 0 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-21  6:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Nick Piggin

On Fri, Jul 20, 2007 at 10:48:59PM -0700, Andrew Morton wrote:
> On Sat, 21 Jul 2007 12:43:06 +0800 Fengguang Wu <wfg@mail.ustc.edu.cn> wrote:
> 
> > Introduce radix_tree_scan_hole(root, index, max_scan) to scan radix tree
> > for the first hole. It will be used in interleaved readahead.
> 
> If you're ever feeling fantastically bored, please consider updating the
> userspace radix-tree test harness for this?  Cook up a couple of testcases
> for the new functionality?

Thanks for the reminding.  I'd add some test cases if I'm to optimize it.

But this _dumb_ scan function is obviously correct(and won't be too slow on
1M readahead).  In fact it is pretty suitable for testing the optimized one ;)


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 7/7] readahead: basic support of interleaved reads
       [not found] ` <20070721044347.388744012@mail.ustc.edu.cn>
  2007-07-21  4:43   ` [PATCH 7/7] readahead: basic support of interleaved reads Fengguang Wu
@ 2007-07-21  7:24   ` Rusty Russell
       [not found]     ` <20070721075744.GA10790@mail.ustc.edu.cn>
  1 sibling, 1 reply; 15+ messages in thread
From: Rusty Russell @ 2007-07-21  7:24 UTC (permalink / raw)
  To: Fengguang Wu; +Cc: Andrew Morton, linux-kernel

On Sat, 2007-07-21 at 12:43 +0800, Fengguang Wu wrote:
> plain text document attachment (readahead-interleaved-reads.patch)
> This is a simplified version of the pagecache context based readahead.
> It handles the case of multiple threads reading on the same fd and invalidating
> each others' readahead state. It does the trick by scanning the pagecache and
> recovering the current read stream's readahead status.
> 
> The algorithm works in a opportunistic way, in that it do not try to detect
> interleaved reads _actively_, which requires a probe into the page cache(which
> means a little more overheads for random reads). It only tries to handle a
> previously started sequential readahead whose state was overwritten by
> another concurrent stream, and it can do this job pretty well.

Hi Fengguang,

	This is really clever!

	Only one slight complaint: I wonder if "radix_tree_scan_hole" could be
expressed as "radix_tree_extent_size" which return the number of
populated indices up to "max_scan".  Returning a length seems more
intuitive to me (and I think gets rid of the wraparound error case?)

Cheers,
Rusty.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 7/7] readahead: basic support of interleaved reads
       [not found]     ` <20070721075744.GA10790@mail.ustc.edu.cn>
@ 2007-07-21  7:57       ` Fengguang Wu
  0 siblings, 0 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-21  7:57 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Andrew Morton, linux-kernel, Nick Piggin, Andi Kleen

On Sat, Jul 21, 2007 at 05:24:39PM +1000, Rusty Russell wrote:
> 	This is really clever!

Thank you :)

> 	Only one slight complaint: I wonder if "radix_tree_scan_hole" could be
> expressed as "radix_tree_extent_size" which return the number of

In fact the full context based readahead requires three functions:

        - radix_tree_scan_hole(root, index, max_scan)                                                                   
        - radix_tree_scan_hole_backward(root, index, max_scan)                                                          
        - radix_tree_scan_data_backward(root, index, max_scan)                                                          

I would be very glad if you can work up a consistent naming scheme
for them all ;)

> populated indices up to "max_scan".  Returning a length seems more
> intuitive to me (and I think gets rid of the wraparound error case?)

No, changing the return value won't eliminate the wraparound case.

But I guess we can safely assume that it won't wrap around at all?
The last page in the pagecache address space will not be
granted/allocated thanks to the not-too-aggressive MAX_LFS_FILESIZE?


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/7] radixtree: introduce radix_tree_scan_hole()
  2007-07-21  4:43   ` [PATCH 6/7] radixtree: introduce radix_tree_scan_hole() Fengguang Wu
  2007-07-21  5:48     ` Andrew Morton
@ 2007-07-23  7:58     ` Nick Piggin
       [not found]       ` <20070723080405.GA7420@mail.ustc.edu.cn>
  1 sibling, 1 reply; 15+ messages in thread
From: Nick Piggin @ 2007-07-23  7:58 UTC (permalink / raw)
  To: Fengguang Wu; +Cc: Andrew Morton, linux-kernel

Fengguang Wu wrote:
> Introduce radix_tree_scan_hole(root, index, max_scan) to scan radix tree
> for the first hole. It will be used in interleaved readahead.
> 
> The implementation is dumb and obviously correct.
> It can help debug(and document) the possible smart one in future.

Reasonable function to want. Is radix_tree_scan_hole the best name?
What about radix_tree_next_hole or _find_next_hole? (Andrew, any
suggestions?)

> 
> Cc: Nick Piggin <nickpiggin@yahoo.com.au>
> Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
> ---
> 
>  include/linux/radix-tree.h |    2 ++
>  lib/radix-tree.c           |   34 ++++++++++++++++++++++++++++++++++
>  2 files changed, 36 insertions(+)
> 
> --- linux-2.6.22-rc6-mm1.orig/include/linux/radix-tree.h
> +++ linux-2.6.22-rc6-mm1/include/linux/radix-tree.h
> @@ -155,6 +155,8 @@ void *radix_tree_delete(struct radix_tre
>  unsigned int
>  radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
>  			unsigned long first_index, unsigned int max_items);
> +unsigned long radix_tree_scan_hole(struct radix_tree_root *root,
> +				unsigned long index, unsigned long max_scan);
>  int radix_tree_preload(gfp_t gfp_mask);
>  void radix_tree_init(void);
>  void *radix_tree_tag_set(struct radix_tree_root *root,
> --- linux-2.6.22-rc6-mm1.orig/lib/radix-tree.c
> +++ linux-2.6.22-rc6-mm1/lib/radix-tree.c
> @@ -601,6 +601,40 @@ int radix_tree_tag_get(struct radix_tree
>  EXPORT_SYMBOL(radix_tree_tag_get);
>  #endif
>  
> +static unsigned long
> +radix_tree_scan_hole_dumb(struct radix_tree_root *root,
> +				unsigned long index, unsigned long max_scan)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < max_scan; i++) {
> +		if (!radix_tree_lookup(root, index))
> +			break;
> +		if (++index == 0)
> +			break;

Minor nit: I think it is preferred to have index++; on its own line.

> +	}
> +
> +	return index;
> +}
> +
> +/**
> + *	radix_tree_scan_hole    -    scan for hole
> + *	@root:		radix tree root
> + *	@index:		index key
> + *	@max_scan:      advice on max items to scan (it may scan a little more)
> + *
> + *      Scan forward from @index for a hole/empty item, stop when
> + *      - hit hole
> + *      - wrap-around to index 0
> + *      - @max_scan or more items scanned
> + */

Some suggestions on the comment:

/**
  *	radix_tree_next_hole    -    find the next hole (not-present entry)
  *	@root:		radix tree root
  *	@index:		index key
  *	@max_scan:      maximum range to search
  *
  *      Search the set [index,  min(index+max_scan-1, MAX_INDEX)] for the lowest
  *      indexed hole.
  *
  *      Returns: the index of the hole if found, otherwise returns an index
  *      outside of the set specified (in which case 'return - index >= max_scan'
  *      will be true).
  *
  *      radix_tree_next_hole may be called under rcu_read_lock. However, like
  *      radix_tree_gang_lookup, this will not atomically search a snapshot of the
  *      tree at a single point in time. For example, if a hole is created at index
  *      5, then subsequently a hole is created at index 10, radix_tree_next_hole
  *      covering both indexes may return 10 if called under rcu_read_lock.
  */


> +unsigned long radix_tree_scan_hole(struct radix_tree_root *root,
> +				unsigned long index, unsigned long max_scan)
> +{
> +	return radix_tree_scan_hole_dumb(root, index, max_scan);
> +}
> +EXPORT_SYMBOL(radix_tree_scan_hole);

Don't do this scan_hole_dumb thing. Just implement it in place.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/7] radixtree: introduce radix_tree_scan_hole()
       [not found]       ` <20070723080405.GA7420@mail.ustc.edu.cn>
@ 2007-07-23  8:04         ` Fengguang Wu
       [not found]         ` <20070723081209.GA12393@mail.ustc.edu.cn>
  1 sibling, 0 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-23  8:04 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel

On Mon, Jul 23, 2007 at 05:58:02PM +1000, Nick Piggin wrote:
> Fengguang Wu wrote:
> >Introduce radix_tree_scan_hole(root, index, max_scan) to scan radix tree
> >for the first hole. It will be used in interleaved readahead.
> >
> >The implementation is dumb and obviously correct.
> >It can help debug(and document) the possible smart one in future.
> 
> Reasonable function to want. Is radix_tree_scan_hole the best name?
> What about radix_tree_next_hole or _find_next_hole? (Andrew, any
> suggestions?)

Thank you!

All comments seems reasonable, so I simply attach the updated patch.

Fengguang
---
Subject: radixtree: introduce radix_tree_next_hole()
Cc: Nick Piggin <nickpiggin@yahoo.com.au>

Introduce radix_tree_next_hole(root, index, max_scan) to scan radix tree
for the first hole. It will be used in interleaved readahead.

Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---

 include/linux/radix-tree.h |    2 +
 lib/radix-tree.c           |   36 +++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

--- linux-2.6.22-rc6-mm1.orig/include/linux/radix-tree.h
+++ linux-2.6.22-rc6-mm1/include/linux/radix-tree.h
@@ -155,6 +155,8 @@ void *radix_tree_delete(struct radix_tre
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items);
+unsigned long radix_tree_next_hole(struct radix_tree_root *root,
+				unsigned long index, unsigned long max_scan);
 int radix_tree_preload(gfp_t gfp_mask);
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
--- linux-2.6.22-rc6-mm1.orig/lib/radix-tree.c
+++ linux-2.6.22-rc6-mm1/lib/radix-tree.c
@@ -601,6 +601,42 @@ int radix_tree_tag_get(struct radix_tree
 EXPORT_SYMBOL(radix_tree_tag_get);
 #endif
 
+/**
+ *	radix_tree_next_hole    -    find the next hole (not-present entry)
+ *	@root:		tree root
+ *	@index:		index key
+ *	@max_scan:	maximum range to search
+ *
+ *	Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
+ *	indexed hole.
+ *
+ *	Returns: the index of the hole if found, otherwise returns an index
+ *	outside of the set specified (in which case 'return - index >= max_scan'
+ *	will be true).
+ *
+ *	radix_tree_next_hole may be called under rcu_read_lock. However, like
+ *	radix_tree_gang_lookup, this will not atomically search a snapshot of the
+ *	tree at a single point in time. For example, if a hole is created at index
+ *	5, then subsequently a hole is created at index 10, radix_tree_next_hole
+ *	covering both indexes may return 10 if called under rcu_read_lock.
+ */
+unsigned long radix_tree_next_hole(struct radix_tree_root *root,
+				unsigned long index, unsigned long max_scan)
+{
+	unsigned long i;
+
+	for (i = 0; i < max_scan; i++) {
+		if (!radix_tree_lookup(root, index))
+			break;
+		index++;
+		if (index == 0)
+			break;
+	}
+
+	return index;
+}
+EXPORT_SYMBOL(radix_tree_next_hole);
+
 static unsigned int
 __lookup(struct radix_tree_node *slot, void **results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index)


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/7] radixtree: introduce radix_tree_scan_hole()
       [not found]         ` <20070723081209.GA12393@mail.ustc.edu.cn>
@ 2007-07-23  8:12           ` Fengguang Wu
  0 siblings, 0 replies; 15+ messages in thread
From: Fengguang Wu @ 2007-07-23  8:12 UTC (permalink / raw)
  To: Nick Piggin, Andrew Morton, linux-kernel

On Mon, Jul 23, 2007 at 04:04:05PM +0800, Fengguang Wu wrote:
> On Mon, Jul 23, 2007 at 05:58:02PM +1000, Nick Piggin wrote:
> > Fengguang Wu wrote:
> > >Introduce radix_tree_scan_hole(root, index, max_scan) to scan radix tree
> > >for the first hole. It will be used in interleaved readahead.
> > >
> > >The implementation is dumb and obviously correct.
> > >It can help debug(and document) the possible smart one in future.
> > 
> > Reasonable function to want. Is radix_tree_scan_hole the best name?
> > What about radix_tree_next_hole or _find_next_hole? (Andrew, any
> > suggestions?)
> 
> Thank you!
> 
> All comments seems reasonable, so I simply attach the updated patch.
> 
> Fengguang
> ---
> Subject: radixtree: introduce radix_tree_next_hole()
> Cc: Nick Piggin <nickpiggin@yahoo.com.au>
> 
> Introduce radix_tree_next_hole(root, index, max_scan) to scan radix tree
> for the first hole. It will be used in interleaved readahead.
> 
> Cc: Nick Piggin <nickpiggin@yahoo.com.au>
> Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>

And another rename fix for the interleaved readahead patch.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 mm/readahead.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- linux-2.6.22-rc6-mm1.orig/mm/readahead.c
+++ linux-2.6.22-rc6-mm1/mm/readahead.c
@@ -372,7 +372,7 @@ ondemand_readahead(struct address_space 
 		pgoff_t start;
 
 		read_lock_irq(&mapping->tree_lock);
-		start = radix_tree_scan_hole(&mapping->page_tree, offset, max+1);
+		start = radix_tree_next_hole(&mapping->page_tree, offset, max+1);
 		read_unlock_irq(&mapping->tree_lock);
 
 		if (!start || start - offset > max)


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2007-07-23  8:23 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20070721044300.909424569@mail.ustc.edu.cn>
2007-07-21  4:43 ` [PATCH 0/7] readahead cleanups and interleaved readahead take 3 Fengguang Wu
     [not found] ` <20070721044346.554186594@mail.ustc.edu.cn>
2007-07-21  4:43   ` [PATCH 1/7] readahead: compacting file_ra_state Fengguang Wu
     [not found] ` <20070721044346.687587063@mail.ustc.edu.cn>
2007-07-21  4:43   ` [PATCH 2/7] readahead: mmap read-around simplification Fengguang Wu
     [not found] ` <20070721044346.824927556@mail.ustc.edu.cn>
2007-07-21  4:43   ` [PATCH 3/7] readahead: combine file_ra_state.prev_index/prev_offset into prev_pos Fengguang Wu
     [not found] ` <20070721044347.008643456@mail.ustc.edu.cn>
2007-07-21  4:43   ` [PATCH 4/7] readahead: remove several readahead macros Fengguang Wu
     [not found] ` <20070721044347.111061630@mail.ustc.edu.cn>
2007-07-21  4:43   ` [PATCH 5/7] readahead: remove the limit max_sectors_kb imposed on max_readahead_kb Fengguang Wu
     [not found] ` <20070721044347.250839328@mail.ustc.edu.cn>
2007-07-21  4:43   ` [PATCH 6/7] radixtree: introduce radix_tree_scan_hole() Fengguang Wu
2007-07-21  5:48     ` Andrew Morton
     [not found]       ` <20070721063629.GA7013@mail.ustc.edu.cn>
2007-07-21  6:36         ` Fengguang Wu
2007-07-23  7:58     ` Nick Piggin
     [not found]       ` <20070723080405.GA7420@mail.ustc.edu.cn>
2007-07-23  8:04         ` Fengguang Wu
     [not found]         ` <20070723081209.GA12393@mail.ustc.edu.cn>
2007-07-23  8:12           ` Fengguang Wu
     [not found] ` <20070721044347.388744012@mail.ustc.edu.cn>
2007-07-21  4:43   ` [PATCH 7/7] readahead: basic support of interleaved reads Fengguang Wu
2007-07-21  7:24   ` Rusty Russell
     [not found]     ` <20070721075744.GA10790@mail.ustc.edu.cn>
2007-07-21  7:57       ` Fengguang Wu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).