From: Corrado Zoccolo <czoccolo@gmail.com>
To: Jens Axboe <jens.axboe@oracle.com>
Cc: Aaron Carroll <aaronc@cse.unsw.edu.au>,
	Linux-Kernel <linux-kernel@vger.kernel.org>
Subject: Re: Reduce latencies for synchronous writes and high I/O priority requests in deadline IO scheduler
Date: Fri, 1 May 2009 21:30:59 +0200
Message-ID: <4e5e476b0905011230x32ffcbffq388354327710b1a8@mail.gmail.com>
In-Reply-To: <4e5e476b0904260543r589be3a4k96884cd079641a7@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1775 bytes --]

On Sun, Apr 26, 2009 at 2:43 PM, Corrado Zoccolo <czoccolo@gmail.com> wrote:
> * on my machine, there is a regression on sequential write

I found that the regression was just an artifact of my testing: the
test partition was almost full, and the written files were re-created
for each test, resulting in non-uniform fragmentation across tests.
Changing the test to also preallocate the write file made it
repeatable, and with that change the patched and original deadline
perform equally.

Here is the last patch of the series, which adds I/O priority support
to deadline. All requests are sorted into 3 priority levels (restated
as code right after this list):
* 0: async reads/writes, and all Idle class requests
* 1: sync Best Effort reads/writes, and sync Real Time writes
* 2: sync Real Time reads.
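
To make the mapping concrete, here is a condensed restatement in C
(the helper name req_priority_level is mine, not from the patch; the
patch below implements the same classification arithmetically, in
deadline_compute_req_priority()):

static int req_priority_level(struct request *req)
{
	unsigned short class = IOPRIO_PRIO_CLASS(req_get_ioprio(req));

	if (class == IOPRIO_CLASS_IDLE)
		return 0; /* Idle class always goes to the lowest level */
	if (class == IOPRIO_CLASS_RT && rq_data_dir(req) == READ)
		return 2; /* RT reads (reads are always sync) */
	return rq_is_sync(req) ? 1 : 0; /* sync BE r/w, sync RT writes */
}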

Aaron, I found your previous attempt at modifying deadline to use
sync/async instead of read/write. My approach is slightly different:
I changed only the fifos to respect the new scheme, while the RB
trees are still partitioned as reads vs. writes. Since the RB trees
are used for merging and for batch formation, keeping them as in
original deadline should guarantee the same merge success rate, and
it allows batches to span priority levels when a given level has too
few requests to fully utilize the disk bandwidth (usually the case
for writes, where a few sync writes to the journal are mixed with
lots of async writes to the data). The resulting split is sketched
below.
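
The resulting queue layout looks like this (field names as in the
patch below; comments are mine):

struct deadline_data {
	struct rb_root sort_list[2];	/* READ, WRITE: sector order,
					   used for merging/batching */
	struct list_head fifo_list[3];	/* 0 = async/Idle, 1 = sync BE +
					   sync RT writes, 2 = sync RT reads */
	int fifo_expire[3];		/* one deadline per priority level */
	/* ... remaining fields as in stock deadline ... */
};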

Corrado

-- 
__________________________________________________________________________

dott. Corrado Zoccolo                          mailto:czoccolo@gmail.com
PhD - Department of Computer Science - University of Pisa, Italy
--------------------------------------------------------------------------

[-- Attachment #2: deadline-patch-rt --]
[-- Type: application/octet-stream, Size: 7149 bytes --]

Deadline IO scheduler RT patch

This is the third (and last) patch of the series. It contains the changes
to propagate I/O priority from processes to requests, and to use this
information when scheduling requests.

Requests are classified into 3 priority levels, from lowest to highest:
* 0: async reads/writes, and all Idle class requests
* 1: sync Best Effort reads/writes, and sync Real Time writes
* 2: sync Real Time reads.

Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>

diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 57e67c8..1b9fd51 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -17,9 +17,10 @@
 /*
  * See Documentation/block/deadline-iosched.txt
  */
+static const int rt_sync_expire = HZ / 8;  /* max time before a real-time sync operation is submitted. */
 static const int sync_expire = HZ / 2;     /* max time before a sync operation is submitted. */
 static const int async_expire = 5 * HZ;    /* ditto for async operations, these limits are SOFT! */
-static const int async_starved = 2;        /* max times SYNC can starve ASYNC requests */
+static const int async_starved = 3;        /* max times SYNC can starve ASYNC requests */
 static const int fifo_batch = 16;       /* # of sequential requests treated as one
 				     by the above parameters. For throughput. */
 
@@ -32,7 +33,7 @@ struct deadline_data {
 	 * requests (deadline_rq s) are present on both sort_list and fifo_list
 	 */
 	struct rb_root sort_list[2]; /* READ, WRITE */
-	struct list_head fifo_list[2]; /* 0=ASYNC, 1=SYNC */
+	struct list_head fifo_list[3]; /* 0=ASYNC (or IDLE), 1=SYNC (or RT ASYNC), 2=RT SYNC */
 
 	/*
 	 * next in sort order.
@@ -44,7 +45,7 @@ struct deadline_data {
 	/*
 	 * settings that change how the i/o scheduler behaves
 	 */
-	int fifo_expire[2];
+	int fifo_expire[3];
 	int fifo_batch;
 	int async_starved;
 	int front_merges;
@@ -96,10 +97,65 @@ deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
 	elv_rb_del(deadline_rb_root(dd, rq), rq);
 }
 
+static int ioprio_lub(unsigned short aprio, unsigned short bprio)
+{
+	unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
+	unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
+
+	if (aclass == IOPRIO_CLASS_NONE)
+		return bprio;
+	if (bclass == IOPRIO_CLASS_NONE)
+		return aprio;
+
+	if (aclass == bclass)
+		return min(aprio, bprio);
+	if (aclass > bclass)
+		return bprio;
+	else
+		return aprio;
+}
+
+static void
+deadline_merge_prio_data(struct request_queue *q, struct request *rq)
+{
+	struct task_struct *tsk = current;
+	struct io_context *ioc = get_io_context(GFP_ATOMIC, q->node);
+	int ioprio_class = IOPRIO_CLASS_NONE;
+	int ioprio = IOPRIO_NORM;
+
+	if (ioc) {
+		ioprio_class = task_ioprio_class(ioc);
+	}
+
+	switch (ioprio_class) {
+	default:
+		printk(KERN_ERR "deadline: bad prio %x\n", ioprio_class);
+	case IOPRIO_CLASS_NONE:
+		/*
+		 * no prio set, inherit CPU scheduling settings
+		 */
+		ioprio = task_nice_ioprio(tsk);
+		ioprio_class = task_nice_ioclass(tsk);
+		break;
+	case IOPRIO_CLASS_RT:
+	case IOPRIO_CLASS_BE:
+		ioprio = task_ioprio(ioc);
+		break;
+	case IOPRIO_CLASS_IDLE:
+		ioprio = 7;
+		break;
+	}
+
+	ioprio = IOPRIO_PRIO_VALUE(ioprio_class, ioprio);
+	rq->ioprio = ioprio_lub(rq->ioprio, ioprio);
+}
+
 static int
 deadline_compute_req_priority(struct request *req)
 {
-	return !!rq_is_sync(req);
+	unsigned short ioprio_class = IOPRIO_PRIO_CLASS(req_get_ioprio(req));
+	return (ioprio_class != IOPRIO_CLASS_IDLE) *
+		(!!rq_is_sync(req) + (rq_data_dir(req) == READ) * (ioprio_class == IOPRIO_CLASS_RT));
 }
 
 /*
@@ -110,6 +166,7 @@ deadline_add_request(struct request_queue *q, struct request *rq)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
 
+	deadline_merge_prio_data(q, rq);
 	deadline_add_rq_rb(dd, rq);
 
 	/*
@@ -173,6 +230,8 @@ static void deadline_merged_request(struct request_queue *q,
 		elv_rb_del(deadline_rb_root(dd, req), req);
 		deadline_add_rq_rb(dd, req);
 	}
+
+	deadline_merge_prio_data(q, req);
 }
 
 static void
@@ -262,6 +321,7 @@ static inline int deadline_check_fifo(struct deadline_data *dd, unsigned prio)
 static int deadline_dispatch_requests(struct request_queue *q, int force)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
+	const int rt_reqs = !list_empty(&dd->fifo_list[2]);
 	const int sync_reqs = !list_empty(&dd->fifo_list[1]);
 	const int async_reqs = !list_empty(&dd->fifo_list[0]);
 	struct request *rq = dd->next_rq;
@@ -277,6 +337,11 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
 	 * data direction (read / write)
 	 */
 
+	if (rt_reqs) {
+		request_prio = 2;
+		goto dispatch_find_request;
+	}
+
 	if (sync_reqs) {
 		if (async_reqs && (dd->starved++ >= dd->async_starved))
 			goto dispatch_async;
@@ -338,7 +403,8 @@ static int deadline_queue_empty(struct request_queue *q)
 	struct deadline_data *dd = q->elevator->elevator_data;
 
 	return list_empty(&dd->fifo_list[0])
-		&& list_empty(&dd->fifo_list[1]);
+		&& list_empty(&dd->fifo_list[1])
+		&& list_empty(&dd->fifo_list[2]);
 }
 
 static void deadline_exit_queue(struct elevator_queue *e)
@@ -347,6 +413,7 @@ static void deadline_exit_queue(struct elevator_queue *e)
 
 	BUG_ON(!list_empty(&dd->fifo_list[0]));
 	BUG_ON(!list_empty(&dd->fifo_list[1]));
+	BUG_ON(!list_empty(&dd->fifo_list[2]));
 
 	kfree(dd);
 }
@@ -364,10 +431,12 @@ static void *deadline_init_queue(struct request_queue *q)
 
 	INIT_LIST_HEAD(&dd->fifo_list[0]);
 	INIT_LIST_HEAD(&dd->fifo_list[1]);
+	INIT_LIST_HEAD(&dd->fifo_list[2]);
 	dd->sort_list[READ] = RB_ROOT;
 	dd->sort_list[WRITE] = RB_ROOT;
 	dd->fifo_expire[0] = async_expire;
 	dd->fifo_expire[1] = sync_expire;
+	dd->fifo_expire[2] = rt_sync_expire;
 	dd->async_starved = async_starved;
 	dd->front_merges = 1;
 	dd->fifo_batch = fifo_batch;
@@ -404,6 +473,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
 }
 SHOW_FUNCTION(deadline_async_expire_show, dd->fifo_expire[0], 1);
 SHOW_FUNCTION(deadline_sync_expire_show, dd->fifo_expire[1], 1);
+SHOW_FUNCTION(deadline_rt_sync_expire_show, dd->fifo_expire[2], 1);
 SHOW_FUNCTION(deadline_async_starved_show, dd->async_starved, 0);
 SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
 SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
@@ -427,6 +497,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
 }
 STORE_FUNCTION(deadline_async_expire_store, &dd->fifo_expire[0], 0, INT_MAX, 1);
 STORE_FUNCTION(deadline_sync_expire_store, &dd->fifo_expire[1], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_rt_sync_expire_store, &dd->fifo_expire[2], 0, INT_MAX, 1);
 STORE_FUNCTION(deadline_async_starved_store, &dd->async_starved, INT_MIN, INT_MAX, 0);
 STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
 STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
@@ -439,6 +510,7 @@ STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
 static struct elv_fs_entry deadline_attrs[] = {
 	DD_ATTR(async_expire),
 	DD_ATTR(sync_expire),
+	DD_ATTR(rt_sync_expire),
 	DD_ATTR(async_starved),
 	DD_ATTR(front_merges),
 	DD_ATTR(fifo_batch),

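With the patch applied, the new knob sits next to the existing
deadline tunables under the usual /sys/block/<dev>/queue/iosched/
directory, and takes a value in milliseconds like sync_expire. A
minimal userspace sketch (the device name and the 100 ms value are
just examples):

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/sda/queue/iosched/rt_sync_expire";
	char buf[32];
	FILE *f = fopen(path, "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("rt_sync_expire = %s", buf); /* default HZ/8, ~125 ms */
	fclose(f);

	f = fopen(path, "w"); /* tighten the RT sync deadline to 100 ms */
	if (!f)
		return 1;
	fprintf(f, "100\n");
	fclose(f);
	return 0;
}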