linux-kernel.vger.kernel.org archive mirror
* [PATCH][CFT] new IO scheduler for 2.4.20
@ 2003-04-17 17:28 Neil Schemenauer
  2003-04-17 20:41 ` Andrew Morton
  2003-04-21 11:33 ` Andrea Arcangeli
  0 siblings, 2 replies; 11+ messages in thread
From: Neil Schemenauer @ 2003-04-17 17:28 UTC (permalink / raw)
  To: linux-kernel; +Cc: axboe, akpm, conman

[-- Attachment #1: Type: text/plain, Size: 1113 bytes --]

Hi all,

Recently I was bitten badly by bad IO scheduler behavior on an important
Linux server.  An easy way to trigger this problem is to start a
streaming write process:

    while :
    do
            dd if=/dev/zero of=foo bs=1M count=512 conv=notrunc
    done

and then try doing a bunch of small reads:

    time (find kernel-tree -type f | xargs cat > /dev/null)

With Linux 2.4.20, the reading takes a very long time to finish.  I
looked at backporting Jens' deadline scheduler to 2.4 but decided it
would be too much work (especially for someone like me who doesn't know
much about kernel hacking).  

I've come up with a fairly simple patch that seems to solve the
starvation problem.  Perhaps it is simple enough to get merged into the
stable branch (after a little more polish).  The basic idea is to give
reads a boost if there are lots of write requests in the queue.  I think
my solution also prevents reads and writes from being starved
indefinitely.

Throughput also seems good although I have not done a lot of testing
yet.  I would greatly appreciate it if people could give it a test.

  Neil

[-- Attachment #2: neil-iosched-1.diff --]
[-- Type: text/plain, Size: 4186 bytes --]

diff -u -ur linux-2.4.20/drivers/block/elevator.c linux-iosched-1d/drivers/block/elevator.c
--- linux-2.4.20/drivers/block/elevator.c	2003-04-14 14:47:22.000000000 -0400
+++ linux-iosched-1d/drivers/block/elevator.c	2003-04-17 12:15:10.000000000 -0400
@@ -74,6 +74,86 @@
 	return 0;
 }
 
+int elevator_neil_merge(request_queue_t *q, struct request **req,
+			 struct list_head * head,
+			 struct buffer_head *bh, int rw,
+			 int max_sectors)
+{
+	struct list_head *entry = &q->queue_head;
+	unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE;
+	unsigned long read_sectors = 0, write_sectors = 0;
+	unsigned int expire_time = jiffies - 1*HZ;
+	struct request *__rq;
+
+	while ((entry = entry->prev) != head) {
+		__rq = blkdev_entry_to_request(entry);
+
+		/*
+		 * we can't insert beyond a zero sequence point
+		 */
+		if (__rq->elevator_sequence == 0) {
+			goto append;
+		}
+
+		if (__rq->cmd == READ)
+			read_sectors += __rq->nr_sectors;
+		else
+			write_sectors += __rq->nr_sectors;
+
+		if (__rq->rq_dev != bh->b_rdev)
+			continue;
+
+		if (__rq->waiting)
+			continue;
+
+		if (__rq->cmd != rw)
+			continue;
+
+		if (time_before(__rq->start_time, expire_time)) {
+			break;
+		}
+
+		if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
+			if (__rq->nr_sectors + count <= max_sectors)
+				ret = ELEVATOR_BACK_MERGE;
+			*req = __rq;
+			break;
+		} else if (__rq->sector - count == bh->b_rsector) {
+			if (__rq->nr_sectors + count <= max_sectors)
+				ret = ELEVATOR_FRONT_MERGE;
+			*req = __rq;
+			break;
+		}
+	}
+
+	if (!*req && rw == READ) {
+		long extra_writes = write_sectors - read_sectors;
+		/*
+		 * If there are more writes than reads in the queue then put
+		 * read requests ahead of the extra writes.  This prevents
+		 * writes from starving reads.
+		 */
+		entry = q->queue_head.prev;
+		while (extra_writes > 0 && entry != head) {
+			__rq = blkdev_entry_to_request(entry);
+			if (__rq->cmd == WRITE)
+				extra_writes -= __rq->nr_sectors;
+			entry = entry->prev;
+		}
+		*req = blkdev_entry_to_request(entry);
+	}
+
+append:
+
+	return ret;
+}
+
+void elevator_neil_merge_req(struct request *req, struct request *next)
+{
+	if (time_before(next->start_time, req->start_time))
+		req->start_time = next->start_time;
+}
+
 
 int elevator_linus_merge(request_queue_t *q, struct request **req,
 			 struct list_head * head,
diff -u -ur linux-2.4.20/drivers/block/ll_rw_blk.c linux-iosched-1d/drivers/block/ll_rw_blk.c
--- linux-2.4.20/drivers/block/ll_rw_blk.c	2003-04-14 14:47:22.000000000 -0400
+++ linux-iosched-1d/drivers/block/ll_rw_blk.c	2003-04-17 12:21:59.000000000 -0400
@@ -480,7 +480,7 @@
 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
 {
 	INIT_LIST_HEAD(&q->queue_head);
-	elevator_init(&q->elevator, ELEVATOR_LINUS);
+	elevator_init(&q->elevator, ELEVATOR_NEIL);
 	blk_init_free_list(q);
 	q->request_fn     	= rfn;
 	q->back_merge_fn       	= ll_back_merge_fn;
@@ -922,7 +922,8 @@
 			rw = READ;	/* drop into READ */
 		case READ:
 		case WRITE:
-			latency = elevator_request_latency(elevator, rw);
+			/* latency = elevator_request_latency(elevator, rw); */
+			latency = 1;
 			break;
 		default:
 			BUG();
diff -u -ur linux-2.4.20/include/linux/elevator.h linux-iosched-1d/include/linux/elevator.h
--- linux-2.4.20/include/linux/elevator.h	2003-04-14 14:47:24.000000000 -0400
+++ linux-iosched-1d/include/linux/elevator.h	2003-04-15 10:17:23.000000000 -0400
@@ -31,6 +31,9 @@
 void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int);
 void elevator_linus_merge_req(struct request *, struct request *);
 
+int elevator_neil_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int);
+void elevator_neil_merge_req(struct request *, struct request *);
+
 typedef struct blkelv_ioctl_arg_s {
 	int queue_ID;
 	int read_latency;
@@ -101,3 +104,12 @@
 	})
 
 #endif
+
+#define ELEVATOR_NEIL							\
+((elevator_t) {								\
+	0,				/* read_latency */		\
+	0,				/* write_latency */		\
+									\
+	elevator_neil_merge,		/* elevator_merge_fn */		\
+	elevator_neil_merge_req,	/* elevator_merge_req_fn */	\
+	})
Only in linux-iosched-1d: inst.sh


* Re: [PATCH][CFT] new IO scheduler for 2.4.20
  2003-04-17 17:28 [PATCH][CFT] new IO scheduler for 2.4.20 Neil Schemenauer
@ 2003-04-17 20:41 ` Andrew Morton
  2003-04-20 18:26   ` Neil Schemenauer
  2003-04-21 11:33 ` Andrea Arcangeli
  1 sibling, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2003-04-17 20:41 UTC (permalink / raw)
  To: Neil Schemenauer; +Cc: linux-kernel, axboe, conman

Neil Schemenauer <nas@arctrix.com> wrote:
>
> Hi all,
> 
> Recently I was bitten badly by bad IO scheduler behavior on an important
> Linux server.  An easy way to trigger this problem is to start a
> streaming write process:
> 
>     while :
>     do
>             dd if=/dev/zero of=foo bs=1M count=512 conv=notrunc
>     done

That's a local DoS.

> and then try doing a bunch of small reads:
> 
>     time (find kernel-tree -type f | xargs cat > /dev/null)

Awful, isn't it?

> +int elevator_neil_merge(request_queue_t *q, struct request **req,

This is a nice looking patch!

> ...
> +	unsigned int expire_time = jiffies - 1*HZ;
> ...
> +		if (time_before(__rq->start_time, expire_time)) {
> +			break;
> +		}

It has a deadline component.  One second is probably too long.
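Shortening it would just mean changing the expiry calculation, e.g. (an
untested sketch; the quarter second is an arbitrary choice, not something
from the posted patch):

    /* hypothetical: expire requests after ~250 ms instead of 1 s */
    unsigned int expire_time = jiffies - HZ/4;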

> +	if (!*req && rw == READ) {
> +		long extra_writes = write_sectors - read_sectors;
> +		/*
> +		 * If there are more writes than reads in the queue then put
> +		 * read requests ahead of the extra writes.  This prevents
> +		 * writes from starving reads.
> +		 */
> +		entry = q->queue_head.prev;
> +		while (extra_writes > 0 && entry != head) {
> +			__rq = blkdev_entry_to_request(entry);
> +			if (__rq->cmd == WRITE)
> +				extra_writes -= __rq->nr_sectors;
> +			entry = entry->prev;
> +		}
> +		*req = blkdev_entry_to_request(entry);

One suggestion I'd make here is to not count "sectors" at all.  Just count
requests.

See, the code at present is assuming that 100 discrete requests are
equivalent to a single 100 sector request.  And that is most definitely not
the case.  A 100 sector request is worth (guess) just two discontiguous
requests.
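Concretely, that would just mean replacing the sector accounting in the
posted patch with plain counters, roughly (an untested sketch of the
suggestion, not a drop-in change):

    unsigned int reads = 0, writes = 0;  /* instead of read/write_sectors */
    ...
    if (__rq->cmd == READ)
            reads++;
    else
            writes++;
    ...
    long extra_writes = writes - reads;
    ...
    if (__rq->cmd == WRITE)
            extra_writes--;              /* one request, whatever its size */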


I'd be interested in seeing some comparative benchmark results with the above
DoS attack.  And contest numbers too...



* Re: [PATCH][CFT] new IO scheduler for 2.4.20
  2003-04-17 20:41 ` Andrew Morton
@ 2003-04-20 18:26   ` Neil Schemenauer
  2003-04-20 22:06     ` Marc-Christian Petersen
  0 siblings, 1 reply; 11+ messages in thread
From: Neil Schemenauer @ 2003-04-20 18:26 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Andrew Morton wrote:
> One suggestion I'd make here is to not count "sectors" at all.  Just
> count requests.
[...]
> I'd be interested in seeing some comparative benchmark results with
> the above DoS attack.  And contest numbers too...

Okay, nas1 is the patch I originally posted.  With nas2, I am counting
requests instead of sectors.  The times below are for the find/cat
command to complete while the dd command is running.

            time
    2.4.20  ????????? (I gave up after about 15 minutes)
    -nas1   1m56.746s
    -nas2   2m36.928s

Here's the contest results:

    no_load:
    Kernel      [runs]	Time	CPU%	Loads	LCPU%	Ratio
    2.4.20           1	50	90.0	0.0	0.0	1.00
    2.4.20-nas1      1	50	90.0	0.0	0.0	1.00
    2.4.20-nas2      1	50	92.0	0.0	0.0	1.00
    cacherun:
    Kernel      [runs]	Time	CPU%	Loads	LCPU%	Ratio
    2.4.20           1	46	97.8	0.0	0.0	0.92
    2.4.20-nas1      1	46	97.8	0.0	0.0	0.92
    2.4.20-nas2      1	46	97.8	0.0	0.0	0.92
    process_load:
    Kernel      [runs]	Time	CPU%	Loads	LCPU%	Ratio
    2.4.20           1	74	58.1	70.0	40.5	1.48
    2.4.20-nas1      1	75	57.3	73.0	40.0	1.50
    2.4.20-nas2      1	76	56.6	76.0	40.8	1.52
    ctar_load:
    Kernel      [runs]	Time	CPU%	Loads	LCPU%	Ratio
    2.4.20           1	60	78.3	9.0	16.7	1.20
    2.4.20-nas1      1	58	81.0	8.0	13.8	1.16
    2.4.20-nas2      1	60	78.3	9.0	16.7	1.20
    xtar_load:
    Kernel      [runs]	Time	CPU%	Loads	LCPU%	Ratio
    2.4.20           1	139	34.5	6.0	7.9	2.78
    2.4.20-nas1      1	71	64.8	2.0	4.2	1.42
    2.4.20-nas2      1	71	66.2	1.0	4.2	1.42
    io_load:
    Kernel      [runs]	Time	CPU%	Loads	LCPU%	Ratio
    2.4.20           1	191	27.7	42.1	13.1	3.82
    2.4.20-nas1      1	71	71.8	12.1	8.5	1.42
    2.4.20-nas2      1	76	69.7	12.3	9.1	1.52
    io_other:
    Kernel      [runs]	Time	CPU%	Loads	LCPU%	Ratio
    2.4.20           1	97	55.7	21.5	12.4	1.94
    2.4.20-nas1      1	65	80.0	12.3	9.0	1.30
    2.4.20-nas2      1	68	79.4	15.5	11.8	1.36
    read_load:
    Kernel      [runs]	Time	CPU%	Loads	LCPU%	Ratio
    2.4.20           1	75	73.3	3.6	9.3	1.50
    2.4.20-nas1      1	75	70.7	3.4	9.3	1.50
    2.4.20-nas2      1	79	69.6	3.7	8.9	1.58
    list_load:
    Kernel      [runs]	Time	CPU%	Loads	LCPU%	Ratio
    2.4.20           1	66	89.4	0.0	7.5	1.32
    2.4.20-nas1      1	66	87.9	0.0	7.6	1.32
    2.4.20-nas2      1	66	87.9	0.0	7.6	1.32
    mem_load:
    Kernel      [runs]	Time	CPU%	Loads	LCPU%	Ratio
    2.4.20           1	78	62.8	77.0	3.8	1.56
    2.4.20-nas1      1	70	68.6	77.0	5.7	1.40
    2.4.20-nas2      1	68	72.1	75.0	5.9	1.36
    dbench_load:
    Kernel      [runs]	Time	CPU%	Loads	LCPU%	Ratio
    2.4.20           1	202	23.8	6.0	60.4	4.04
    2.4.20-nas1      1	194	24.2	6.0	63.4	3.88
    2.4.20-nas2      1	194	24.2	6.0	63.4	3.88

It looks like nas1 works a little better.  I think the reason is that
when requests are counted, each read already in the queue, however
small, cancels out one write, however large, so the next non-contiguous
read request ends up inserted behind a large write request.
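A middle ground might be to weight each request with a fixed per-request
cost plus a small per-sector cost, e.g. (untested sketch; the shift is an
arbitrary guess in the spirit of Andrew's "100 sectors is worth about two
requests" estimate):

    if (__rq->cmd == READ)
            reads += 1 + (__rq->nr_sectors >> 6);   /* 1 + sectors/64 */
    else
            writes += 1 + (__rq->nr_sectors >> 6);

with extra_writes = writes - reads used as before.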

  Neil


* Re: [PATCH][CFT] new IO scheduler for 2.4.20
  2003-04-20 18:26   ` Neil Schemenauer
@ 2003-04-20 22:06     ` Marc-Christian Petersen
  2003-04-21  1:46       ` Neil Schemenauer
  0 siblings, 1 reply; 11+ messages in thread
From: Marc-Christian Petersen @ 2003-04-20 22:06 UTC (permalink / raw)
  To: Neil Schemenauer, Andrew Morton; +Cc: linux-kernel

On Sunday 20 April 2003 20:26, Neil Schemenauer wrote:

Hi Neil,

> Okay, nas1 is the patch I originally posted.  With nas2, I am counting
> requests instead of sectors.  The times below are for the find/cat
> command to complete while the dd command is running.
>     2.4.20  ????????? (I gave up after about 15 minutes)
>     -nas1   1m56.746s
>     -nas2   2m36.928s
> Here's the contest results:
what about i/o throughput, bonnie or such?

ciao, Marc




* Re: [PATCH][CFT] new IO scheduler for 2.4.20
  2003-04-20 22:06     ` Marc-Christian Petersen
@ 2003-04-21  1:46       ` Neil Schemenauer
  0 siblings, 0 replies; 11+ messages in thread
From: Neil Schemenauer @ 2003-04-21  1:46 UTC (permalink / raw)
  To: Marc-Christian Petersen; +Cc: Andrew Morton, linux-kernel

Marc-Christian Petersen wrote:
> what about i/o throughput, bonnie or such?

bonnie++
Version  1.03       ------Sequential Output------ --Sequential Input- --Random-
                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
               Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
2.4.20           1G           36236  18 13423   7           29948  11 210.1   0
2.4.20-nas1      1G           35998  18 12076   7           19056   9 206.7   0
2.4.20-nas2      1G           37450  18 12101   7           19363   9 210.7   0

Read throughput has definitely taken a hit.  I don't know why.  Perhaps
the elevator needs to have a little more relaxed definition of
contiguous requests.  Right now it only puts requests together if they
are perfectly contiguous (the first sector of one request immediately
follows the last sector of another).  I'll try to test that.
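One way to relax it (just a sketch, nothing I have tested yet; the
16-sector window is an arbitrary number) would be to accept a nearby
request as the insertion point even when a real merge is impossible:

    /* hypothetical: if the new buffer starts within a small window after
     * an existing request, insert right behind that request so the head
     * stays in the neighbourhood, even though the two cannot be merged */
    if (bh->b_rsector >= __rq->sector + __rq->nr_sectors &&
        bh->b_rsector <  __rq->sector + __rq->nr_sectors + 16) {
            *req = __rq;
            break;
    }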

Any other theories?

  Neil


* Re: [PATCH][CFT] new IO scheduler for 2.4.20
  2003-04-17 17:28 [PATCH][CFT] new IO scheduler for 2.4.20 Neil Schemenauer
  2003-04-17 20:41 ` Andrew Morton
@ 2003-04-21 11:33 ` Andrea Arcangeli
  1 sibling, 0 replies; 11+ messages in thread
From: Andrea Arcangeli @ 2003-04-21 11:33 UTC (permalink / raw)
  To: Neil Schemenauer; +Cc: linux-kernel, axboe, akpm, conman

On Thu, Apr 17, 2003 at 10:28:19AM -0700, Neil Schemenauer wrote:
> Hi all,
> 
> Recently I was bitten badly by bad IO scheduler behavior on an important
> Linux server.  An easy way to trigger this problem is to start a
> streaming write process:
> 
>     while :
>     do
>             dd if=/dev/zero of=foo bs=1M count=512 conv=notrunc
>     done
> 
> and then try doing a bunch of small reads:
> 
>     time (find kernel-tree -type f | xargs cat > /dev/null)

can you try the above on 2.4.21pre5aa2? I also spent some effort fixing
it a few months ago.

The interesting patch is this:

	http://www.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.4/2.4.21pre5aa2/9981_elevator-lowlatency-4

the reason 2.4 mainline stalls so much is that the size of the queue is
overkill for no good reason, so no matter what the elvtune numbers are,
you're going to wait for several dozen megabytes to be read or written
before you can read or write the next 1k from another task.
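(To put a rough number on it: assuming something like 128 requests per
queue at up to 255 sectors each -- indicative values only, not measured --
that is 128 * 255 * 512 bytes, around 16 MB, that can sit in front of the
next 1k request.)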

the above patch is fairly old and it basically fixes the showstopper
problem for me; contest looks fine now and throughput is still the best.

I don't like special "read hacks" in generic kernels, which also have to
care about O_SYNC and journaling responsiveness.

So I recommend applying the above 2.4 patch if you suffer from any I/O
latency issues.

Andrea


* Re: [PATCH][CFT] new IO scheduler for 2.4.20
@ 2003-06-02 17:21 Andreas Dilger
  0 siblings, 0 replies; 11+ messages in thread
From: Andreas Dilger @ 2003-06-02 17:21 UTC (permalink / raw)
  To: Neil Schemenauer; +Cc: linux-kernel

On Sat, 31 May 2003 08:09, Neil Schemenauer wrote:
> The major benefit of this patch is that read latency is much lower while
> lots of writes are occurring.  On my machine, running:
>
>  while :; do dd if=/dev/zero of=foo bs=1M count=1000 conv=notrunc; done
>
> makes 2.4.20 unusable.  With this patch the "write bomb" causes no
> particular problems.
>
> With this version of the patch I've improved the bulk read performance
> of the elevator.  The bonnie++ results are now:
>
>                     -Per Chr- --Block-- -Rewrite- -Per Chr- --Block--
>                Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP
> 2.4.20           1G 13001  97 34939  18 13034   7 12175  92 34112  14
> 2.4.20-nas       1G 12923  98 36471  17 13340   8 10809  83 35569  13
>
> Note that the "rewrite" and "per-char read" stats are slightly bogus for
> 2.4.20-nas.  Reads get a boost in priority over writes.  When the
> "per-char read" test has started there is still some writing happening
> from the rewrite test.  I think the net effect is that the "rewrite"
> number is too high and the "per-char read" number is too low.
>
> I would be very pleased if someone could run some tests using bonnie,
> contest, or their other favorite benchmarks and post the results.

In local testing we hit the following problem at boot:

SCSI device sda: 35565080 512-byte hdwr sectors (18209 MB)
Partition check:
  sda:elevator returned crap (-1069467520)
------------[ cut here ]------------
kernel BUG at ll_rw_blk.c:1043!
invalid operand: 0000

CPU:    0
EIP:    0060:[<c01d40ab>]    Not tainted
EFLAGS: 00010086

EIP is at __make_request [kernel] 0x3bb 
(2.4.20-9smp-l18-b_devel-30-05-2003-fercher-nas)
eax: 00000025   ebx: c466a2e0   ecx: 00000001   edx: 00000001
esi: 00000000   edi: c464b638   ebp: c44c9d30   esp: c44c9c90
ds: 0068   es: 0068   ss: 0068
Process swapper (pid: 1, stackpage=c44c9000)
Stack: c02c7ea6 c0413880 c464b640 c463b880 00000400 00000000 00000002 
00000002
        00000000 c44c8000 00000000 c0413880 c03e1fc4 c011ac62 c44c9d10 
c0413880
        00000000 c44c9cec c020707b c4623a38 c4623a38 c4695980 c4652400 
c4623a00
Call Trace:   [<c011ac62>] schedule [kernel] 0x362 (0xc44c9cc4))
[<c020707b>] scsi_insert_special_req [kernel] 0x1b (0xc44c9cd8))
[<c020723a>] scsi_queue_next_request [kernel] 0x4a (0xc44c9d04))
[<c01d43f9>] generic_make_request [kernel] 0x119 (0xc44c9d34))
[<c01d445f>] submit_bh [kernel] 0x4f (0xc44c9d68))
[<c014f140>] block_read_full_page [kernel] 0x2e0 (0xc44c9d84))
[<c012411d>] bh_action [kernel] 0x4d (0xc44c9dc8))
[<c0123fc4>] tasklet_hi_action [kernel] 0x74 (0xc44c9dd4))
[<c0135981>] add_to_page_cache_unique [kernel] 0x81 (0xc44c9e14))
[<c0138459>] read_cache_page [kernel] 0x89 (0xc44c9e30))
[<c0152a20>] blkdev_get_block [kernel] 0x0 (0xc44c9e38))
[<c014367c>] __alloc_pages [kernel] 0x7c (0xc44c9e44))
[<c0174d46>] read_dev_sector [kernel] 0x36 (0xc44c9e5c))
[<c0152ab0>] blkdev_readpage [kernel] 0x0 (0xc44c9e68))
[<c01756e9>] handle_ide_mess [kernel] 0x29 (0xc44c9e90))
[<c01758c5>] msdos_partition [kernel] 0x65 (0xc44c9ebc))
[<c0163cfc>] new_inode [kernel] 0xc (0xc44c9ed0))
[<c0174b4f>] check_partition [kernel] 0xff (0xc44c9ef0))
[<c02236fa>] sd_init_onedisk [kernel] 0x7ba (0xc44c9f24))
[<c0174cd2>] grok_partitions [kernel] 0xc2 (0xc44c9f5c))
[<c0223d76>] sd_finish [kernel] 0x106 (0xc44c9f80))
[<c0201985>] scsi_register_device_module [kernel] 0x155 (0xc44c9fa8))
[<c0105094>] init [kernel] 0x34 (0xc44c9fdc))
[<c0105060>] init [kernel] 0x0 (0xc44c9fe0))
[<c0107459>] kernel_thread_helper [kernel] 0x5 (0xc44c9ff0))


Code: 0f 0b 13 04 9a 7e 2c c0 59 5f 8b 95 74 ff ff ff 85 d2 74 15
  <0>Kernel panic: Attempted to kill init!

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/



* Re: [PATCH][CFT] new IO scheduler for 2.4.20
  2003-05-30 23:40 ` Con Kolivas
@ 2003-05-31  0:52   ` Neil Schemenauer
  2003-05-30 17:58     ` Robert Love
  0 siblings, 1 reply; 11+ messages in thread
From: Neil Schemenauer @ 2003-05-31  0:52 UTC (permalink / raw)
  To: Con Kolivas; +Cc: linux-kernel, akpm

Con Kolivas wrote:
> How does this compare to akpm's read-latency2 patch that he posted some
> time ago? That seems to make a massive difference but was knocked back
> for style or approach.

It looks like they do fairly similar things.  Andrew's patch puts
unmergeable read requests at a fixed distance from the front of the
queue.  My patch lets unmerged reads skip max((writes - reads), 0)
requests.  That's probably fairer when lots of reads and writes are
in the queue.

Andrew's idea of always allowing a merge is probably a good idea and
could be adopted.
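In this elevator that could look something like the following (untested
sketch; the point is just to do the contiguity test before the
barrier/deadline checks get a chance to bail out):

    /* hypothetical: always take a contiguous read merge */
    if (rw == READ && __rq->cmd == READ && !__rq->waiting &&
        __rq->rq_dev == bh->b_rdev &&
        __rq->nr_sectors + count <= max_sectors) {
            if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
                    *req = __rq;
                    return ELEVATOR_BACK_MERGE;
            }
            if (__rq->sector - count == bh->b_rsector) {
                    *req = __rq;
                    return ELEVATOR_FRONT_MERGE;
            }
    }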

My patch uses a fixed deadline for requests (similar to Jens' deadline
scheduler).  I'm not sure if that's an advantage or not.  Note that
write deadlines are ignored when inserting a read.

I didn't change the size of the request queue.  I can't find where that
gets set in 2.4.20. :-(

Sorry for the hand-waving.  I didn't know about Andrew's patch and I
obviously haven't done enough testing yet.

  Neil


* Re: [PATCH][CFT] new IO scheduler for 2.4.20
  2003-05-30 22:09 Neil Schemenauer
@ 2003-05-30 23:40 ` Con Kolivas
  2003-05-31  0:52   ` Neil Schemenauer
  0 siblings, 1 reply; 11+ messages in thread
From: Con Kolivas @ 2003-05-30 23:40 UTC (permalink / raw)
  To: Neil Schemenauer, linux-kernel; +Cc: Marc-Christian Petersen, Matt

On Sat, 31 May 2003 08:09, Neil Schemenauer wrote:
> The major benefit of this patch is that read latency is much lower while
> lots of writes are occurring.  On my machine, running:
>
>  while :; do dd if=/dev/zero of=foo bs=1M count=1000 conv=notrunc; done
>
> makes 2.4.20 unusable.  With this patch the "write bomb" causes no
> particular problems.
>
> With this version of the patch I've improved the bulk read performance
> of the elevator.  The bonnie++ results are now:
>
>                     -Per Chr- --Block-- -Rewrite- -Per Chr- --Block--
>                Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP
> 2.4.20           1G 13001  97 34939  18 13034   7 12175  92 34112  14
> 2.4.20-nas       1G 12923  98 36471  17 13340   8 10809  83 35569  13
>
> Note that the "rewrite" and "per-char read" stats are slightly bogus for
> 2.4.20-nas.  Reads get a boost in priority over writes.  When the
> "per-char read" test has started there is still some writing happening
> from the rewrite test.  I think the net effect is that the "rewrite"
> number is too high and the "per-char read" number is too low.
>
> I would be very pleased if someone could run some tests using bonnie,
> contest, or their other favorite benchmarks and post the results.

Nice to see 2.4 getting some attention. I'll try and get around to contesting 
it.

How does this compare to akpm's read-latency2 patch that he posted some time 
ago? That seems to make a massive difference but was knocked back for style 
or approach.

Con


* [PATCH][CFT] new IO scheduler for 2.4.20
@ 2003-05-30 22:09 Neil Schemenauer
  2003-05-30 23:40 ` Con Kolivas
  0 siblings, 1 reply; 11+ messages in thread
From: Neil Schemenauer @ 2003-05-30 22:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: conman, Marc-Christian Petersen, Matt

The major benefit of this patch is that read latency is much lower while
lots of writes are occurring.  On my machine, running:

 while :; do dd if=/dev/zero of=foo bs=1M count=1000 conv=notrunc; done

makes 2.4.20 unusable.  With this patch the "write bomb" causes no
particular problems.

With this version of the patch I've improved the bulk read performance
of the elevator.  The bonnie++ results are now:

                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block--
               Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP
2.4.20           1G 13001  97 34939  18 13034   7 12175  92 34112  14
2.4.20-nas       1G 12923  98 36471  17 13340   8 10809  83 35569  13

Note that the "rewrite" and "per-char read" stats are slightly bogus for
2.4.20-nas.  Reads get a boost in priority over writes.  When the
"per-char read" test has started there is still some writing happening
from the rewrite test.  I think the net effect is that the "rewrite"
number is too high and the "per-char read" number is too low.

I would be very pleased if someone could run some tests using bonnie,
contest, or their other favorite benchmarks and post the results.

  Neil


diff -u -ur linux-2.4.20/Makefile linux-iosched-2/Makefile
--- linux-2.4.20/Makefile	2003-04-14 14:47:20.000000000 -0400
+++ linux-iosched-2/Makefile	2003-05-30 17:27:16.000000000 -0400
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 20
-EXTRAVERSION =
+EXTRAVERSION = -nas
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
diff -u -ur linux-2.4.20/drivers/block/elevator.c linux-iosched-2/drivers/block/elevator.c
--- linux-2.4.20/drivers/block/elevator.c	2003-04-14 14:47:22.000000000 -0400
+++ linux-iosched-2/drivers/block/elevator.c	2003-05-30 17:28:57.000000000 -0400
@@ -74,6 +74,81 @@
 	return 0;
 }
 
+int elevator_neil_merge(request_queue_t *q, struct request **req,
+			 struct list_head * head,
+			 struct buffer_head *bh, int rw,
+			 int max_sectors)
+{
+	struct list_head *entry = &q->queue_head;
+	struct request *__rq;
+	unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE;
+	unsigned int reads = 0, writes = 0;
+	/* XXX make tunable? */
+	unsigned int expire_time = jiffies - 1*HZ;
+
+	if (list_empty(&q->queue_head))
+		goto out;
+
+	/* try to merge requests, fall back to ordering them by sector */
+	while ((entry = entry->prev) != head) {
+		__rq = blkdev_entry_to_request(entry);
+
+		if (__rq->elevator_sequence <= 0)
+			break;
+		if (__rq->cmd == READ)
+			++reads;
+		else
+			++writes;
+		if (__rq->rq_dev != bh->b_rdev)
+			continue;
+		if (__rq->waiting)
+			continue;
+		if (__rq->cmd != rw)
+			continue;
+		if (time_before(__rq->start_time, expire_time))
+			break;
+		if (bh->b_rsector > __rq->sector)
+			*req = __rq;
+		if (__rq->nr_sectors + count > max_sectors)
+			continue;
+		if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
+			ret = ELEVATOR_BACK_MERGE;
+			*req = __rq;
+			goto out;
+		} else if (__rq->sector - count == bh->b_rsector) {
+			ret = ELEVATOR_FRONT_MERGE;
+			*req = __rq;
+			goto out;
+		}
+	}
+	if (!*req && rw == READ) {
+		int extra_writes = writes - reads;
+		/*
+		 * If there are more writes than reads in the queue then put
+		 * read requests ahead of the extra writes.  This prevents
+		 * writes from starving reads.
+		 */
+		entry = q->queue_head.prev;
+		while (extra_writes > 0 && entry != head) {
+			__rq = blkdev_entry_to_request(entry);
+			if (__rq->cmd == WRITE)
+				--extra_writes;
+			else if (time_before(__rq->start_time, expire_time))
+				break;
+			entry = entry->prev;
+		}
+		*req = blkdev_entry_to_request(entry);
+	}
+out:
+	return ret;
+}
+
+void elevator_neil_merge_req(struct request *req, struct request *next)
+{
+	if (time_before(next->start_time, req->start_time))
+		req->start_time = next->start_time;
+}
+
 
 int elevator_linus_merge(request_queue_t *q, struct request **req,
 			 struct list_head * head,
diff -u -ur linux-2.4.20/drivers/block/ll_rw_blk.c linux-iosched-2/drivers/block/ll_rw_blk.c
--- linux-2.4.20/drivers/block/ll_rw_blk.c	2003-04-14 14:47:22.000000000 -0400
+++ linux-iosched-2/drivers/block/ll_rw_blk.c	2003-05-30 17:27:16.000000000 -0400
@@ -480,7 +480,7 @@
 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
 {
 	INIT_LIST_HEAD(&q->queue_head);
-	elevator_init(&q->elevator, ELEVATOR_LINUS);
+	elevator_init(&q->elevator, ELEVATOR_NEIL);
 	blk_init_free_list(q);
 	q->request_fn     	= rfn;
 	q->back_merge_fn       	= ll_back_merge_fn;
@@ -922,7 +922,8 @@
 			rw = READ;	/* drop into READ */
 		case READ:
 		case WRITE:
-			latency = elevator_request_latency(elevator, rw);
+			/* latency = elevator_request_latency(elevator, rw); */
+			latency = 1;
 			break;
 		default:
 			BUG();
diff -u -ur linux-2.4.20/include/linux/elevator.h linux-iosched-2/include/linux/elevator.h
--- linux-2.4.20/include/linux/elevator.h	2003-04-14 14:47:24.000000000 -0400
+++ linux-iosched-2/include/linux/elevator.h	2003-05-30 17:27:16.000000000 -0400
@@ -31,6 +31,9 @@
 void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int);
 void elevator_linus_merge_req(struct request *, struct request *);
 
+int elevator_neil_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int);
+void elevator_neil_merge_req(struct request *, struct request *);
+
 typedef struct blkelv_ioctl_arg_s {
 	int queue_ID;
 	int read_latency;
@@ -101,3 +104,12 @@
 	})
 
 #endif
+
+#define ELEVATOR_NEIL							\
+((elevator_t) {								\
+	0,				/* read_latency */		\
+	0,				/* write_latency */		\
+									\
+	elevator_neil_merge,		/* elevator_merge_fn */		\
+	elevator_neil_merge_req,	/* elevator_merge_req_fn */	\
+	})


* Re: [PATCH][CFT] new IO scheduler for 2.4.20
  2003-05-31  0:52   ` Neil Schemenauer
@ 2003-05-30 17:58     ` Robert Love
  0 siblings, 0 replies; 11+ messages in thread
From: Robert Love @ 2003-05-30 17:58 UTC (permalink / raw)
  To: Neil Schemenauer; +Cc: Con Kolivas, linux-kernel, akpm

On Sat, 2003-05-31 at 00:52, Neil Schemenauer wrote:

> I didn't change the size of the request queue.  I can't find where that
> gets set in 2.4.20. :-(

in drivers/block/ll_rw_blk.c :: blk_init_free_list()
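If I remember the 2.4.20 code right, the count comes from the
queue_nr_requests global, picked at boot from the RAM size and then split
between the READ and WRITE free lists -- roughly this (quoted from memory,
so the exact numbers may be off):

    /* in blk_dev_init(), approximately */
    queue_nr_requests = 64;
    if (total_ram > MB(32))
            queue_nr_requests = 128;

    /* blk_init_free_list() then allocates queue_nr_requests struct
     * requests per queue, half on the READ and half on the WRITE list */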

I think the main reason (aside from memory consumption) 2.4 has such
small queues is to minimize read latency. If your patch fixes that,
you can probably get away with larger queues, which will help in
merging. But then you definitely want to look into always merging reads,
like Andrew does.

A maximum of ~1000 request entries seems nice, as long as you really do
give reads a boost.

	Robert Love


