linux-kernel.vger.kernel.org archive mirror
* [PATCH 0/6] IO context sharing
@ 2008-01-22  9:49 Jens Axboe
  2008-01-22  9:49 ` [PATCH 1/6] ioprio: move io priority from task_struct to io_context Jens Axboe
                   ` (7 more replies)
  0 siblings, 8 replies; 18+ messages in thread
From: Jens Axboe @ 2008-01-22  9:49 UTC (permalink / raw)
  To: linux-kernel; +Cc: knikanth

Hi,

Today io contexts are per-process and define the (surprise) io context
of that process. In some situations it would be handy if several
processes share an IO context. With syslets it's especially crucial that
the threads working on behalf of the parent process are seen as the same
IO context. That goes from both a fairness and performance point of
view, since IO schedulers like AS and CFQ base decisions on what the
IO context state is. This patchset adds support for processes sharing a
single io context and also adds a CLONE_IO flag to denote sharing of
IO context.
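
For illustration, here is a minimal (untested) userspace sketch of how a
worker could be spawned with a shared IO context; the CLONE_IO value
matches what patch 6/6 adds, and the io_worker() body is only a
placeholder:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdlib.h>
#include <sys/wait.h>

#ifndef CLONE_IO
#define CLONE_IO	0x80000000	/* from patch 6/6 */
#endif

static int io_worker(void *arg)
{
	/* IO issued here is accounted to the parent's io_context,
	 * so AS/CFQ see parent and worker as a single stream of IO */
	return 0;
}

int main(void)
{
	const size_t stack_size = 64 * 1024;
	char *stack = malloc(stack_size);
	pid_t pid;

	if (!stack)
		return 1;

	/* usual thread-style flags, plus CLONE_IO to share the ioc */
	pid = clone(io_worker, stack + stack_size,
		    CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_IO | SIGCHLD,
		    NULL);
	if (pid == -1)
		return 1;

	waitpid(pid, NULL, 0);
	return 0;
}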

In the block git repo there's a syslet-share branch that merges these
patches with the latest syslet patches posted by Zach and sets
CLONE_IO for syslet threads. The result is that performance of AS
and CFQ is as good as it would have been had a single process issued
the IO async through libaio.

This work was originally started by Nikanth Karthikesan
<knikanth@novell.com>, but then later completely reworked by me.

The patches are against latest -git.

-- 
Jens Axboe




* [PATCH 1/6] ioprio: move io priority from task_struct to io_context
  2008-01-22  9:49 [PATCH 0/6] IO context sharing Jens Axboe
@ 2008-01-22  9:49 ` Jens Axboe
  2008-01-23 22:07   ` Andrew Morton
  2008-01-22  9:49 ` [PATCH 2/6] io context sharing: preliminary support Jens Axboe
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2008-01-22  9:49 UTC (permalink / raw)
  To: linux-kernel; +Cc: knikanth, Jens Axboe

This is where it belongs and then it doesn't take up space for a
process that doesn't do IO.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c       |   34 +++++++++----------
 block/ll_rw_blk.c         |   42 ++++++++++++++++-------
 fs/ioprio.c               |   29 ++++++++++++----
 include/linux/blkdev.h    |   75 +------------------------------------------
 include/linux/init_task.h |    1 -
 include/linux/iocontext.h |   79 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/ioprio.h    |   13 ++++---
 include/linux/sched.h     |    1 -
 kernel/fork.c             |   31 +++++++++++++++---
 9 files changed, 179 insertions(+), 126 deletions(-)
 create mode 100644 include/linux/iocontext.h

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 13553e0..533af75 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -199,7 +199,7 @@ CFQ_CFQQ_FNS(sync);
 
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
-				       struct task_struct *, gfp_t);
+				       struct io_context *, gfp_t);
 static struct cfq_io_context *cfq_cic_rb_lookup(struct cfq_data *,
 						struct io_context *);
 
@@ -1273,7 +1273,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 	return cic;
 }
 
-static void cfq_init_prio_data(struct cfq_queue *cfqq)
+static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 {
 	struct task_struct *tsk = current;
 	int ioprio_class;
@@ -1281,7 +1281,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq)
 	if (!cfq_cfqq_prio_changed(cfqq))
 		return;
 
-	ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
+	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
 	switch (ioprio_class) {
 		default:
 			printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
@@ -1293,11 +1293,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq)
 			cfqq->ioprio_class = IOPRIO_CLASS_BE;
 			break;
 		case IOPRIO_CLASS_RT:
-			cfqq->ioprio = task_ioprio(tsk);
+			cfqq->ioprio = task_ioprio(ioc);
 			cfqq->ioprio_class = IOPRIO_CLASS_RT;
 			break;
 		case IOPRIO_CLASS_BE:
-			cfqq->ioprio = task_ioprio(tsk);
+			cfqq->ioprio = task_ioprio(ioc);
 			cfqq->ioprio_class = IOPRIO_CLASS_BE;
 			break;
 		case IOPRIO_CLASS_IDLE:
@@ -1330,8 +1330,7 @@ static inline void changed_ioprio(struct cfq_io_context *cic)
 	cfqq = cic->cfqq[ASYNC];
 	if (cfqq) {
 		struct cfq_queue *new_cfqq;
-		new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc->task,
-					 GFP_ATOMIC);
+		new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc, GFP_ATOMIC);
 		if (new_cfqq) {
 			cic->cfqq[ASYNC] = new_cfqq;
 			cfq_put_queue(cfqq);
@@ -1363,13 +1362,13 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc)
 
 static struct cfq_queue *
 cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync,
-		     struct task_struct *tsk, gfp_t gfp_mask)
+		     struct io_context *ioc, gfp_t gfp_mask)
 {
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
 	struct cfq_io_context *cic;
 
 retry:
-	cic = cfq_cic_rb_lookup(cfqd, tsk->io_context);
+	cic = cfq_cic_rb_lookup(cfqd, ioc);
 	/* cic always exists here */
 	cfqq = cic_to_cfqq(cic, is_sync);
 
@@ -1412,7 +1411,7 @@ retry:
 		cfq_mark_cfqq_prio_changed(cfqq);
 		cfq_mark_cfqq_queue_new(cfqq);
 
-		cfq_init_prio_data(cfqq);
+		cfq_init_prio_data(cfqq, ioc);
 	}
 
 	if (new_cfqq)
@@ -1439,11 +1438,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
 }
 
 static struct cfq_queue *
-cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct task_struct *tsk,
+cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc,
 	      gfp_t gfp_mask)
 {
-	const int ioprio = task_ioprio(tsk);
-	const int ioprio_class = task_ioprio_class(tsk);
+	const int ioprio = task_ioprio(ioc);
+	const int ioprio_class = task_ioprio_class(ioc);
 	struct cfq_queue **async_cfqq = NULL;
 	struct cfq_queue *cfqq = NULL;
 
@@ -1453,7 +1452,7 @@ cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct task_struct *tsk,
 	}
 
 	if (!cfqq) {
-		cfqq = cfq_find_alloc_queue(cfqd, is_sync, tsk, gfp_mask);
+		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
 		if (!cfqq)
 			return NULL;
 	}
@@ -1793,7 +1792,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
-	cfq_init_prio_data(cfqq);
+	cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
 
 	cfq_add_rq_rb(rq);
 
@@ -1900,7 +1899,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
 
 	cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC);
 	if (cfqq) {
-		cfq_init_prio_data(cfqq);
+		cfq_init_prio_data(cfqq, cic->ioc);
 		cfq_prio_boost(cfqq);
 
 		return __cfq_may_queue(cfqq);
@@ -1938,7 +1937,6 @@ static int
 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct task_struct *tsk = current;
 	struct cfq_io_context *cic;
 	const int rw = rq_data_dir(rq);
 	const int is_sync = rq_is_sync(rq);
@@ -1956,7 +1954,7 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 
 	cfqq = cic_to_cfqq(cic, is_sync);
 	if (!cfqq) {
-		cfqq = cfq_get_queue(cfqd, is_sync, tsk, gfp_mask);
+		cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
 
 		if (!cfqq)
 			goto queue_fail;
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 8b91994..bcd0354 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3882,6 +3882,26 @@ void exit_io_context(void)
 	put_io_context(ioc);
 }
 
+struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
+{
+	struct io_context *ret;
+
+	ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
+	if (ret) {
+		atomic_set(&ret->refcount, 1);
+		ret->task = current;
+		ret->ioprio_changed = 0;
+		ret->ioprio = 0;
+		ret->last_waited = jiffies; /* doesn't matter... */
+		ret->nr_batch_requests = 0; /* because this is 0 */
+		ret->aic = NULL;
+		ret->cic_root.rb_node = NULL;
+		ret->ioc_data = NULL;
+	}
+
+	return ret;
+}
+
 /*
  * If the current task has no IO context then create one and initialise it.
  * Otherwise, return its existing IO context.
@@ -3899,16 +3919,8 @@ static struct io_context *current_io_context(gfp_t gfp_flags, int node)
 	if (likely(ret))
 		return ret;
 
-	ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
+	ret = alloc_io_context(gfp_flags, node);
 	if (ret) {
-		atomic_set(&ret->refcount, 1);
-		ret->task = current;
-		ret->ioprio_changed = 0;
-		ret->last_waited = jiffies; /* doesn't matter... */
-		ret->nr_batch_requests = 0; /* because this is 0 */
-		ret->aic = NULL;
-		ret->cic_root.rb_node = NULL;
-		ret->ioc_data = NULL;
 		/* make sure set_task_ioprio() sees the settings above */
 		smp_wmb();
 		tsk->io_context = ret;
@@ -3925,10 +3937,14 @@ static struct io_context *current_io_context(gfp_t gfp_flags, int node)
  */
 struct io_context *get_io_context(gfp_t gfp_flags, int node)
 {
-	struct io_context *ret;
-	ret = current_io_context(gfp_flags, node);
-	if (likely(ret))
-		atomic_inc(&ret->refcount);
+	struct io_context *ret = NULL;
+
+	do {
+		ret = current_io_context(gfp_flags, node);
+		if (unlikely(!ret))
+			break;
+	} while (!atomic_inc_not_zero(&ret->refcount));
+
 	return ret;
 }
 EXPORT_SYMBOL(get_io_context);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index e4e01bc..a760040 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -41,18 +41,29 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
 		return err;
 
 	task_lock(task);
+	do {
+		ioc = task->io_context;
+		/* see wmb() in current_io_context() */
+		smp_read_barrier_depends();
+		if (ioc)
+			break;
 
-	task->ioprio = ioprio;
-
-	ioc = task->io_context;
-	/* see wmb() in current_io_context() */
-	smp_read_barrier_depends();
+		ioc = alloc_io_context(GFP_ATOMIC, -1);
+		if (!ioc) {
+			err = -ENOMEM;
+			break;
+		}
+		task->io_context = ioc;
+		ioc->task = task;
+	} while (1);
 
-	if (ioc)
+	if (!err) {
+		ioc->ioprio = ioprio;
 		ioc->ioprio_changed = 1;
+	}
 
 	task_unlock(task);
-	return 0;
+	return err;
 }
 
 asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
@@ -148,7 +159,9 @@ static int get_task_ioprio(struct task_struct *p)
 	ret = security_task_getioprio(p);
 	if (ret)
 		goto out;
-	ret = p->ioprio;
+	ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
+	if (p->io_context)
+		ret = p->io_context->ioprio;
 out:
 	return ret;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d18ee67..d61c6f5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -34,83 +34,10 @@ struct sg_io_hdr;
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
 
-/*
- * This is the per-process anticipatory I/O scheduler state.
- */
-struct as_io_context {
-	spinlock_t lock;
-
-	void (*dtor)(struct as_io_context *aic); /* destructor */
-	void (*exit)(struct as_io_context *aic); /* called on task exit */
-
-	unsigned long state;
-	atomic_t nr_queued; /* queued reads & sync writes */
-	atomic_t nr_dispatched; /* number of requests gone to the drivers */
-
-	/* IO History tracking */
-	/* Thinktime */
-	unsigned long last_end_request;
-	unsigned long ttime_total;
-	unsigned long ttime_samples;
-	unsigned long ttime_mean;
-	/* Layout pattern */
-	unsigned int seek_samples;
-	sector_t last_request_pos;
-	u64 seek_total;
-	sector_t seek_mean;
-};
-
-struct cfq_queue;
-struct cfq_io_context {
-	struct rb_node rb_node;
-	void *key;
-
-	struct cfq_queue *cfqq[2];
-
-	struct io_context *ioc;
-
-	unsigned long last_end_request;
-	sector_t last_request_pos;
-
-	unsigned long ttime_total;
-	unsigned long ttime_samples;
-	unsigned long ttime_mean;
-
-	unsigned int seek_samples;
-	u64 seek_total;
-	sector_t seek_mean;
-
-	struct list_head queue_list;
-
-	void (*dtor)(struct io_context *); /* destructor */
-	void (*exit)(struct io_context *); /* called on task exit */
-};
-
-/*
- * This is the per-process I/O subsystem state.  It is refcounted and
- * kmalloc'ed. Currently all fields are modified in process io context
- * (apart from the atomic refcount), so require no locking.
- */
-struct io_context {
-	atomic_t refcount;
-	struct task_struct *task;
-
-	unsigned int ioprio_changed;
-
-	/*
-	 * For request batching
-	 */
-	unsigned long last_waited; /* Time last woken after wait for request */
-	int nr_batch_requests;     /* Number of requests left in the batch */
-
-	struct as_io_context *aic;
-	struct rb_root cic_root;
-	void *ioc_data;
-};
-
 void put_io_context(struct io_context *ioc);
 void exit_io_context(void);
 struct io_context *get_io_context(gfp_t gfp_flags, int node);
+struct io_context *alloc_io_context(gfp_t, int);
 void copy_io_context(struct io_context **pdst, struct io_context **psrc);
 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index cae35b6..7011480 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -133,7 +133,6 @@ extern struct group_info init_groups;
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
-	.ioprio		= 0,						\
 	.time_slice	= HZ,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
new file mode 100644
index 0000000..186807e
--- /dev/null
+++ b/include/linux/iocontext.h
@@ -0,0 +1,79 @@
+#ifndef IOCONTEXT_H
+#define IOCONTEXT_H
+
+/*
+ * This is the per-process anticipatory I/O scheduler state.
+ */
+struct as_io_context {
+	spinlock_t lock;
+
+	void (*dtor)(struct as_io_context *aic); /* destructor */
+	void (*exit)(struct as_io_context *aic); /* called on task exit */
+
+	unsigned long state;
+	atomic_t nr_queued; /* queued reads & sync writes */
+	atomic_t nr_dispatched; /* number of requests gone to the drivers */
+
+	/* IO History tracking */
+	/* Thinktime */
+	unsigned long last_end_request;
+	unsigned long ttime_total;
+	unsigned long ttime_samples;
+	unsigned long ttime_mean;
+	/* Layout pattern */
+	unsigned int seek_samples;
+	sector_t last_request_pos;
+	u64 seek_total;
+	sector_t seek_mean;
+};
+
+struct cfq_queue;
+struct cfq_io_context {
+	struct rb_node rb_node;
+	void *key;
+
+	struct cfq_queue *cfqq[2];
+
+	struct io_context *ioc;
+
+	unsigned long last_end_request;
+	sector_t last_request_pos;
+
+	unsigned long ttime_total;
+	unsigned long ttime_samples;
+	unsigned long ttime_mean;
+
+	unsigned int seek_samples;
+	u64 seek_total;
+	sector_t seek_mean;
+
+	struct list_head queue_list;
+
+	void (*dtor)(struct io_context *); /* destructor */
+	void (*exit)(struct io_context *); /* called on task exit */
+};
+
+/*
+ * This is the per-process I/O subsystem state.  It is refcounted and
+ * kmalloc'ed. Currently all fields are modified in process io context
+ * (apart from the atomic refcount), so require no locking.
+ */
+struct io_context {
+	atomic_t refcount;
+	struct task_struct *task;
+
+	unsigned short ioprio;
+	unsigned short ioprio_changed;
+
+	/*
+	 * For request batching
+	 */
+	unsigned long last_waited; /* Time last woken after wait for request */
+	int nr_batch_requests;     /* Number of requests left in the batch */
+
+	struct as_io_context *aic;
+	struct rb_root cic_root;
+	void *ioc_data;
+};
+
+#endif
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index baf2938..2a3bb1b 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -2,6 +2,7 @@
 #define IOPRIO_H
 
 #include <linux/sched.h>
+#include <linux/iocontext.h>
 
 /*
  * Gives us 8 prio classes with 13-bits of data for each class
@@ -45,18 +46,18 @@ enum {
  * the cpu scheduler nice value to an io priority
  */
 #define IOPRIO_NORM	(4)
-static inline int task_ioprio(struct task_struct *task)
+static inline int task_ioprio(struct io_context *ioc)
 {
-	if (ioprio_valid(task->ioprio))
-		return IOPRIO_PRIO_DATA(task->ioprio);
+	if (ioprio_valid(ioc->ioprio))
+		return IOPRIO_PRIO_DATA(ioc->ioprio);
 
 	return IOPRIO_NORM;
 }
 
-static inline int task_ioprio_class(struct task_struct *task)
+static inline int task_ioprio_class(struct io_context *ioc)
 {
-	if (ioprio_valid(task->ioprio))
-		return IOPRIO_PRIO_CLASS(task->ioprio);
+	if (ioprio_valid(ioc->ioprio))
+		return IOPRIO_PRIO_CLASS(ioc->ioprio);
 
 	return IOPRIO_CLASS_BE;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cc14656..638250c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -939,7 +939,6 @@ struct task_struct {
 	struct hlist_head preempt_notifiers;
 #endif
 
-	unsigned short ioprio;
 	/*
 	 * fpu_counter contains the number of consecutive context switches
 	 * that the FPU is used. If this is over a threshold, the lazy fpu
diff --git a/kernel/fork.c b/kernel/fork.c
index 8dd8ff2..9961fb7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -51,6 +51,7 @@
 #include <linux/random.h>
 #include <linux/tty.h>
 #include <linux/proc_fs.h>
+#include <linux/blkdev.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -791,6 +792,25 @@ out:
 	return error;
 }
 
+static int copy_io(struct task_struct *tsk)
+{
+	struct io_context *ioc = current->io_context;
+
+	if (!ioc)
+		return 0;
+
+	if (ioprio_valid(ioc->ioprio)) {
+		tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
+		if (unlikely(!tsk->io_context))
+			return -ENOMEM;
+
+		tsk->io_context->task = tsk;
+		tsk->io_context->ioprio = ioc->ioprio;
+	}
+
+	return 0;
+}
+
 /*
  *	Helper to unshare the files of the current task.
  *	We don't want to expose copy_files internals to
@@ -1147,15 +1167,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_mm;
 	if ((retval = copy_namespaces(clone_flags, p)))
 		goto bad_fork_cleanup_keys;
+	if ((retval = copy_io(p)))
+		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
-		goto bad_fork_cleanup_namespaces;
+		goto bad_fork_cleanup_io;
 
 	if (pid != &init_struct_pid) {
 		retval = -ENOMEM;
 		pid = alloc_pid(task_active_pid_ns(p));
 		if (!pid)
-			goto bad_fork_cleanup_namespaces;
+			goto bad_fork_cleanup_io;
 
 		if (clone_flags & CLONE_NEWPID) {
 			retval = pid_ns_prepare_proc(task_active_pid_ns(p));
@@ -1224,9 +1246,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
-	/* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
-	p->ioprio = current->ioprio;
-
 	/*
 	 * The task hasn't been attached yet, so its cpus_allowed mask will
 	 * not be changed, nor will its assigned CPU.
@@ -1317,6 +1336,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
+bad_fork_cleanup_io:
+	put_io_context(p->io_context);
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_keys:
-- 
1.5.4.rc2.84.gf85fd



* [PATCH 2/6] io context sharing: preliminary support
  2008-01-22  9:49 [PATCH 0/6] IO context sharing Jens Axboe
  2008-01-22  9:49 ` [PATCH 1/6] ioprio: move io priority from task_struct to io_context Jens Axboe
@ 2008-01-22  9:49 ` Jens Axboe
  2008-01-23 22:08   ` Andrew Morton
  2008-01-22  9:49 ` [PATCH 3/6] io_context sharing - cfq changes Jens Axboe
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2008-01-22  9:49 UTC (permalink / raw)
  To: linux-kernel; +Cc: knikanth, Jens Axboe

Detach task state from the ioc; instead, keep track of how many
processes are accessing the ioc.
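
Roughly, the split being introduced (an illustrative sketch only, not
part of the patch; the names below just mirror the counters added here):
->refcount keeps the io_context memory alive, while ->nr_tasks tracks
how many tasks are attached, so scheduler state is only torn down when
the last attached task exits.

#include <asm/atomic.h>

/* illustrative only -- mirrors the two counters this patch adds */
struct ioc_sketch {
	atomic_t refcount;	/* references: tasks, requests, cics */
	atomic_t nr_tasks;	/* tasks currently sharing this ioc */
};

static void ioc_exit_sketch(struct ioc_sketch *ioc)
{
	/* as in exit_io_context() after this patch: only the last
	 * attached task tears down the IO scheduler state */
	if (atomic_dec_and_test(&ioc->nr_tasks)) {
		/* aic->exit() / cic->exit() hooks would run here */

		if (atomic_dec_and_test(&ioc->refcount)) {
			/* kmem_cache_free(iocontext_cachep, ioc) */
		}
	}
}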

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/ll_rw_blk.c         |   27 ++++++++++++++++-----------
 fs/ioprio.c               |    1 -
 include/linux/blkdev.h    |    2 +-
 include/linux/iocontext.h |   22 ++++++++++++++++++----
 kernel/fork.c             |    1 -
 5 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index bcd0354..ca8b228 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3834,10 +3834,10 @@ int __init blk_dev_init(void)
 /*
  * IO Context helper functions
  */
-void put_io_context(struct io_context *ioc)
+int put_io_context(struct io_context *ioc)
 {
 	if (ioc == NULL)
-		return;
+		return 1;
 
 	BUG_ON(atomic_read(&ioc->refcount) == 0);
 
@@ -3856,7 +3856,9 @@ void put_io_context(struct io_context *ioc)
 		rcu_read_unlock();
 
 		kmem_cache_free(iocontext_cachep, ioc);
+		return 1;
 	}
+	return 0;
 }
 EXPORT_SYMBOL(put_io_context);
 
@@ -3871,15 +3873,17 @@ void exit_io_context(void)
 	current->io_context = NULL;
 	task_unlock(current);
 
-	ioc->task = NULL;
-	if (ioc->aic && ioc->aic->exit)
-		ioc->aic->exit(ioc->aic);
-	if (ioc->cic_root.rb_node != NULL) {
-		cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
-		cic->exit(ioc);
-	}
+	if (atomic_dec_and_test(&ioc->nr_tasks)) {
+		if (ioc->aic && ioc->aic->exit)
+			ioc->aic->exit(ioc->aic);
+		if (ioc->cic_root.rb_node != NULL) {
+			cic = rb_entry(rb_first(&ioc->cic_root),
+				struct cfq_io_context, rb_node);
+			cic->exit(ioc);
+		}
 
-	put_io_context(ioc);
+		put_io_context(ioc);
+	}
 }
 
 struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
@@ -3889,7 +3893,8 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 	ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
 	if (ret) {
 		atomic_set(&ret->refcount, 1);
-		ret->task = current;
+		atomic_set(&ret->nr_tasks, 1);
+		spin_lock_init(&ret->lock);
 		ret->ioprio_changed = 0;
 		ret->ioprio = 0;
 		ret->last_waited = jiffies; /* doesn't matter... */
diff --git a/fs/ioprio.c b/fs/ioprio.c
index a760040..06b5d97 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -54,7 +54,6 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
 			break;
 		}
 		task->io_context = ioc;
-		ioc->task = task;
 	} while (1);
 
 	if (!err) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d61c6f5..1633523 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -34,7 +34,7 @@ struct sg_io_hdr;
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
 
-void put_io_context(struct io_context *ioc);
+int put_io_context(struct io_context *ioc);
 void exit_io_context(void);
 struct io_context *get_io_context(gfp_t gfp_flags, int node);
 struct io_context *alloc_io_context(gfp_t, int);
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 186807e..cd44d45 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -54,13 +54,15 @@ struct cfq_io_context {
 };
 
 /*
- * This is the per-process I/O subsystem state.  It is refcounted and
- * kmalloc'ed. Currently all fields are modified in process io context
- * (apart from the atomic refcount), so require no locking.
+ * I/O subsystem state of the associated processes.  It is refcounted
+ * and kmalloc'ed. These could be shared between processes.
  */
 struct io_context {
 	atomic_t refcount;
-	struct task_struct *task;
+	atomic_t nr_tasks;
+
+	/* all the fields below are protected by this lock */
+	spinlock_t lock;
 
 	unsigned short ioprio;
 	unsigned short ioprio_changed;
@@ -76,4 +78,16 @@ struct io_context {
 	void *ioc_data;
 };
 
+static inline struct io_context *ioc_task_link(struct io_context *ioc)
+{
+	/*
+	 * if ref count is zero, don't allow sharing (ioc is going away, it's
+	 * a race).
+	 */
+	if (ioc && atomic_inc_not_zero(&ioc->refcount))
+		return ioc;
+
+	return NULL;
+}
+
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 9961fb7..4cfb0f4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -804,7 +804,6 @@ static int copy_io(struct task_struct *tsk)
 		if (unlikely(!tsk->io_context))
 			return -ENOMEM;
 
-		tsk->io_context->task = tsk;
 		tsk->io_context->ioprio = ioc->ioprio;
 	}
 
-- 
1.5.4.rc2.84.gf85fd



* [PATCH 3/6] io_context sharing - cfq changes
  2008-01-22  9:49 [PATCH 0/6] IO context sharing Jens Axboe
  2008-01-22  9:49 ` [PATCH 1/6] ioprio: move io priority from task_struct to io_context Jens Axboe
  2008-01-22  9:49 ` [PATCH 2/6] io context sharing: preliminary support Jens Axboe
@ 2008-01-22  9:49 ` Jens Axboe
  2008-01-22  9:49 ` [PATCH 4/6] block: cfq: make the io context sharing lockless Jens Axboe
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2008-01-22  9:49 UTC (permalink / raw)
  To: linux-kernel; +Cc: knikanth, Nikanth Karthikesan

From: Nikanth Karthikesan <KNikanth@novell.com>

Changes in CFQ for io_context sharing.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c |   20 ++++++++++++++++++--
 1 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 533af75..dba52b6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -895,7 +895,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	 * task has exited, don't wait
 	 */
 	cic = cfqd->active_cic;
-	if (!cic || !cic->ioc->task)
+	if (!cic || !atomic_read(&cic->ioc->nr_tasks))
 		return;
 
 	/*
@@ -1178,6 +1178,8 @@ static void cfq_free_io_context(struct io_context *ioc)
 
 	ioc->ioc_data = NULL;
 
+	spin_lock(&ioc->lock);
+
 	while ((n = rb_first(&ioc->cic_root)) != NULL) {
 		__cic = rb_entry(n, struct cfq_io_context, rb_node);
 		rb_erase(&__cic->rb_node, &ioc->cic_root);
@@ -1189,6 +1191,8 @@ static void cfq_free_io_context(struct io_context *ioc)
 
 	if (ioc_gone && !elv_ioc_count_read(ioc_count))
 		complete(ioc_gone);
+
+	spin_unlock(&ioc->lock);
 }
 
 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
@@ -1243,6 +1247,7 @@ static void cfq_exit_io_context(struct io_context *ioc)
 
 	ioc->ioc_data = NULL;
 
+	spin_lock(&ioc->lock);
 	/*
 	 * put the reference this task is holding to the various queues
 	 */
@@ -1253,6 +1258,8 @@ static void cfq_exit_io_context(struct io_context *ioc)
 		cfq_exit_single_io_context(__cic);
 		n = rb_next(n);
 	}
+
+	spin_unlock(&ioc->lock);
 }
 
 static struct cfq_io_context *
@@ -1349,6 +1356,8 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc)
 	struct cfq_io_context *cic;
 	struct rb_node *n;
 
+	spin_lock(&ioc->lock);
+
 	ioc->ioprio_changed = 0;
 
 	n = rb_first(&ioc->cic_root);
@@ -1358,6 +1367,8 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc)
 		changed_ioprio(cic);
 		n = rb_next(n);
 	}
+
+	spin_unlock(&ioc->lock);
 }
 
 static struct cfq_queue *
@@ -1502,6 +1513,7 @@ cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 	if (cic && cic->key == cfqd)
 		return cic;
 
+	spin_lock(&ioc->lock);
 restart:
 	n = ioc->cic_root.rb_node;
 	while (n) {
@@ -1519,10 +1531,12 @@ restart:
 			n = n->rb_right;
 		else {
 			ioc->ioc_data = cic;
+			spin_unlock(&ioc->lock);
 			return cic;
 		}
 	}
 
+	spin_unlock(&ioc->lock);
 	return NULL;
 }
 
@@ -1536,6 +1550,7 @@ cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
 	unsigned long flags;
 	void *k;
 
+	spin_lock(&ioc->lock);
 	cic->ioc = ioc;
 	cic->key = cfqd;
 
@@ -1566,6 +1581,7 @@ restart:
 	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
 	list_add(&cic->queue_list, &cfqd->cic_list);
 	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
+	spin_unlock(&ioc->lock);
 }
 
 /*
@@ -1659,7 +1675,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	enable_idle = cfq_cfqq_idle_window(cfqq);
 
-	if (!cic->ioc->task || !cfqd->cfq_slice_idle ||
+	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
 	    (cfqd->hw_tag && CIC_SEEKY(cic)))
 		enable_idle = 0;
 	else if (sample_valid(cic->ttime_samples)) {
-- 
1.5.4.rc2.84.gf85fd



* [PATCH 4/6] block: cfq: make the io context sharing lockless
  2008-01-22  9:49 [PATCH 0/6] IO context sharing Jens Axboe
                   ` (2 preceding siblings ...)
  2008-01-22  9:49 ` [PATCH 3/6] io_context sharing - cfq changes Jens Axboe
@ 2008-01-22  9:49 ` Jens Axboe
  2008-01-23 22:08   ` Andrew Morton
  2008-01-22  9:49 ` [PATCH 5/6] io_context sharing - anticipatory changes Jens Axboe
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2008-01-22  9:49 UTC (permalink / raw)
  To: linux-kernel; +Cc: knikanth, Jens Axboe

The io context sharing introduced a per-ioc spinlock that would protect
the cfq io context lookup. That is a regression from the original, since
we never needed any locking there because the ioc/cic were process private.

The cic lookup is changed from an rbtree construct to a radix tree, which
we can then use RCU to make the reader side lockless. That is the performance
critical path, modifying the radix tree is only done on process creation
(when that process first does IO, actually) and on process exit (if that
process has done IO).

As it so happens, radix trees are also much faster for this type of
lookup where the key is a pointer. It's a very sparse tree.
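
Stripped of the last-hit ioc_data cache and the dead-cic cleanup that
the real code below has, the reader side boils down to this sketch
(illustrative only; cic_lookup_sketch is a made-up name):

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

static struct cfq_io_context *
cic_lookup_sketch(struct io_context *ioc, struct cfq_data *cfqd)
{
	struct cfq_io_context *cic;

	/* no per-ioc spinlock on the lookup path; the tree is only
	 * modified (under ioc->lock) when a cic is added or removed */
	rcu_read_lock();
	cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
	rcu_read_unlock();

	return cic;
}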

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c       |  267 +++++++++++++++++++++++++--------------------
 block/ll_rw_blk.c         |   41 +++++---
 include/linux/iocontext.h |    6 +-
 3 files changed, 177 insertions(+), 137 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index dba52b6..8830893 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -200,7 +200,7 @@ CFQ_CFQQ_FNS(sync);
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
 				       struct io_context *, gfp_t);
-static struct cfq_io_context *cfq_cic_rb_lookup(struct cfq_data *,
+static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
 						struct io_context *);
 
 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
@@ -609,7 +609,7 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
 	struct cfq_io_context *cic;
 	struct cfq_queue *cfqq;
 
-	cic = cfq_cic_rb_lookup(cfqd, tsk->io_context);
+	cic = cfq_cic_lookup(cfqd, tsk->io_context);
 	if (!cic)
 		return NULL;
 
@@ -721,7 +721,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 	 * Lookup the cfqq that this bio will be queued with. Allow
 	 * merge only if rq is queued there.
 	 */
-	cic = cfq_cic_rb_lookup(cfqd, current->io_context);
+	cic = cfq_cic_lookup(cfqd, current->io_context);
 	if (!cic)
 		return 0;
 
@@ -1170,29 +1170,74 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	kmem_cache_free(cfq_pool, cfqq);
 }
 
-static void cfq_free_io_context(struct io_context *ioc)
+/*
+ * Call func for each cic attached to this ioc. Returns number of cic's seen.
+ */
+#define CIC_GANG_NR	16
+static unsigned int
+call_for_each_cic(struct io_context *ioc,
+		  void (*func)(struct io_context *, struct cfq_io_context *))
 {
-	struct cfq_io_context *__cic;
-	struct rb_node *n;
-	int freed = 0;
+	struct cfq_io_context *cics[CIC_GANG_NR];
+	unsigned long index = 0;
+	unsigned int called = 0;
+	int nr;
 
-	ioc->ioc_data = NULL;
+	rcu_read_lock();
 
-	spin_lock(&ioc->lock);
+	do {
+		int i;
 
-	while ((n = rb_first(&ioc->cic_root)) != NULL) {
-		__cic = rb_entry(n, struct cfq_io_context, rb_node);
-		rb_erase(&__cic->rb_node, &ioc->cic_root);
-		kmem_cache_free(cfq_ioc_pool, __cic);
-		freed++;
-	}
+		/*
+		 * Perhaps there's a better way - this just gang lookups from
+		 * 0 to the end, restarting after each CIC_GANG_NR from the
+		 * last key + 1.
+		 */
+		nr = radix_tree_gang_lookup(&ioc->radix_root, (void **) cics,
+						index, CIC_GANG_NR);
+		if (!nr)
+			break;
+
+		called += nr;
+		index = 1 + (unsigned long) cics[nr - 1]->key;
+
+		for (i = 0; i < nr; i++)
+			func(ioc, cics[i]);
+	} while (nr == CIC_GANG_NR);
+
+	rcu_read_unlock();
+
+	return called;
+}
+
+static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
+{
+	unsigned long flags;
+
+	BUG_ON(!cic->dead_key);
+
+	spin_lock_irqsave(&ioc->lock, flags);
+	radix_tree_delete(&ioc->radix_root, cic->dead_key);
+	spin_unlock_irqrestore(&ioc->lock, flags);
+
+	kmem_cache_free(cfq_ioc_pool, cic);
+}
+
+static void cfq_free_io_context(struct io_context *ioc)
+{
+	int freed;
+
+	/*
+	 * ioc->refcount is zero here, so no more cic's are allowed to be
+	 * linked into this ioc. So it should be ok to iterate over the known
+	 * list, we will see all cic's since no new ones are added.
+	 */
+	freed = call_for_each_cic(ioc, cic_free_func);
 
 	elv_ioc_count_mod(ioc_count, -freed);
 
 	if (ioc_gone && !elv_ioc_count_read(ioc_count))
 		complete(ioc_gone);
-
-	spin_unlock(&ioc->lock);
 }
 
 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
@@ -1209,7 +1254,12 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
 					 struct cfq_io_context *cic)
 {
 	list_del_init(&cic->queue_list);
+
+	/*
+	 * Make sure key == NULL is seen for dead queues
+	 */
 	smp_wmb();
+	cic->dead_key = (unsigned long) cic->key;
 	cic->key = NULL;
 
 	if (cic->cfqq[ASYNC]) {
@@ -1223,16 +1273,18 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
 	}
 }
 
-static void cfq_exit_single_io_context(struct cfq_io_context *cic)
+static void cfq_exit_single_io_context(struct io_context *ioc,
+				       struct cfq_io_context *cic)
 {
 	struct cfq_data *cfqd = cic->key;
 
 	if (cfqd) {
 		struct request_queue *q = cfqd->queue;
+		unsigned long flags;
 
-		spin_lock_irq(q->queue_lock);
+		spin_lock_irqsave(q->queue_lock, flags);
 		__cfq_exit_single_io_context(cfqd, cic);
-		spin_unlock_irq(q->queue_lock);
+		spin_unlock_irqrestore(q->queue_lock, flags);
 	}
 }
 
@@ -1242,24 +1294,8 @@ static void cfq_exit_single_io_context(struct cfq_io_context *cic)
  */
 static void cfq_exit_io_context(struct io_context *ioc)
 {
-	struct cfq_io_context *__cic;
-	struct rb_node *n;
-
-	ioc->ioc_data = NULL;
-
-	spin_lock(&ioc->lock);
-	/*
-	 * put the reference this task is holding to the various queues
-	 */
-	n = rb_first(&ioc->cic_root);
-	while (n != NULL) {
-		__cic = rb_entry(n, struct cfq_io_context, rb_node);
-
-		cfq_exit_single_io_context(__cic);
-		n = rb_next(n);
-	}
-
-	spin_unlock(&ioc->lock);
+	rcu_assign_pointer(ioc->ioc_data, NULL);
+	call_for_each_cic(ioc, cfq_exit_single_io_context);
 }
 
 static struct cfq_io_context *
@@ -1323,7 +1359,8 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
-static inline void changed_ioprio(struct cfq_io_context *cic)
+static inline void changed_ioprio(struct io_context *ioc,
+				  struct cfq_io_context *cic)
 {
 	struct cfq_data *cfqd = cic->key;
 	struct cfq_queue *cfqq;
@@ -1353,22 +1390,8 @@ static inline void changed_ioprio(struct cfq_io_context *cic)
 
 static void cfq_ioc_set_ioprio(struct io_context *ioc)
 {
-	struct cfq_io_context *cic;
-	struct rb_node *n;
-
-	spin_lock(&ioc->lock);
-
+	call_for_each_cic(ioc, changed_ioprio);
 	ioc->ioprio_changed = 0;
-
-	n = rb_first(&ioc->cic_root);
-	while (n != NULL) {
-		cic = rb_entry(n, struct cfq_io_context, rb_node);
-
-		changed_ioprio(cic);
-		n = rb_next(n);
-	}
-
-	spin_unlock(&ioc->lock);
 }
 
 static struct cfq_queue *
@@ -1379,7 +1402,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync,
 	struct cfq_io_context *cic;
 
 retry:
-	cic = cfq_cic_rb_lookup(cfqd, ioc);
+	cic = cfq_cic_lookup(cfqd, ioc);
 	/* cic always exists here */
 	cfqq = cic_to_cfqq(cic, is_sync);
 
@@ -1480,28 +1503,42 @@ cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc,
 	return cfqq;
 }
 
+static void cfq_cic_free(struct cfq_io_context *cic)
+{
+	kmem_cache_free(cfq_ioc_pool, cic);
+	elv_ioc_count_dec(ioc_count);
+
+	if (ioc_gone && !elv_ioc_count_read(ioc_count))
+		complete(ioc_gone);
+}
+
 /*
  * We drop cfq io contexts lazily, so we may find a dead one.
  */
 static void
-cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
+cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
+		  struct cfq_io_context *cic)
 {
+	unsigned long flags;
+
 	WARN_ON(!list_empty(&cic->queue_list));
 
+	spin_lock_irqsave(&ioc->lock, flags);
+
 	if (ioc->ioc_data == cic)
-		ioc->ioc_data = NULL;
+		rcu_assign_pointer(ioc->ioc_data, NULL);
 
-	rb_erase(&cic->rb_node, &ioc->cic_root);
-	kmem_cache_free(cfq_ioc_pool, cic);
-	elv_ioc_count_dec(ioc_count);
+	radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
+	spin_unlock_irqrestore(&ioc->lock, flags);
+
+	cfq_cic_free(cic);
 }
 
 static struct cfq_io_context *
-cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
+cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 {
-	struct rb_node *n;
 	struct cfq_io_context *cic;
-	void *k, *key = cfqd;
+	void *k;
 
 	if (unlikely(!ioc))
 		return NULL;
@@ -1509,79 +1546,65 @@ cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 	/*
 	 * we maintain a last-hit cache, to avoid browsing over the tree
 	 */
-	cic = ioc->ioc_data;
+	cic = rcu_dereference(ioc->ioc_data);
 	if (cic && cic->key == cfqd)
 		return cic;
 
-	spin_lock(&ioc->lock);
-restart:
-	n = ioc->cic_root.rb_node;
-	while (n) {
-		cic = rb_entry(n, struct cfq_io_context, rb_node);
+	do {
+		rcu_read_lock();
+		cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
+		rcu_read_unlock();
+		if (!cic)
+			break;
 		/* ->key must be copied to avoid race with cfq_exit_queue() */
 		k = cic->key;
 		if (unlikely(!k)) {
-			cfq_drop_dead_cic(ioc, cic);
-			goto restart;
+			cfq_drop_dead_cic(cfqd, ioc, cic);
+			continue;
 		}
 
-		if (key < k)
-			n = n->rb_left;
-		else if (key > k)
-			n = n->rb_right;
-		else {
-			ioc->ioc_data = cic;
-			spin_unlock(&ioc->lock);
-			return cic;
-		}
-	}
+		rcu_assign_pointer(ioc->ioc_data, cic);
+		break;
+	} while (1);
 
-	spin_unlock(&ioc->lock);
-	return NULL;
+	return cic;
 }
 
-static inline void
+/*
+ * Add cic into ioc, using cfqd as the search key. This enables us to lookup
+ * the process specific cfq io context when entered from the block layer.
+ * Also adds the cic to a per-cfqd list, used when this queue is removed.
+ */
+static inline int
 cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
-	     struct cfq_io_context *cic)
+	     struct cfq_io_context *cic, gfp_t gfp_mask)
 {
-	struct rb_node **p;
-	struct rb_node *parent;
-	struct cfq_io_context *__cic;
 	unsigned long flags;
-	void *k;
+	int ret;
 
-	spin_lock(&ioc->lock);
-	cic->ioc = ioc;
-	cic->key = cfqd;
+	ret = radix_tree_preload(gfp_mask);
+	if (!ret) {
+		cic->ioc = ioc;
+		cic->key = cfqd;
 
-restart:
-	parent = NULL;
-	p = &ioc->cic_root.rb_node;
-	while (*p) {
-		parent = *p;
-		__cic = rb_entry(parent, struct cfq_io_context, rb_node);
-		/* ->key must be copied to avoid race with cfq_exit_queue() */
-		k = __cic->key;
-		if (unlikely(!k)) {
-			cfq_drop_dead_cic(ioc, __cic);
-			goto restart;
-		}
+		spin_lock_irqsave(&ioc->lock, flags);
+		ret = radix_tree_insert(&ioc->radix_root,
+						(unsigned long) cfqd, cic);
+		spin_unlock_irqrestore(&ioc->lock, flags);
 
-		if (cic->key < k)
-			p = &(*p)->rb_left;
-		else if (cic->key > k)
-			p = &(*p)->rb_right;
-		else
-			BUG();
+		radix_tree_preload_end();
+
+		if (!ret) {
+			spin_lock_irqsave(cfqd->queue->queue_lock, flags);
+			list_add(&cic->queue_list, &cfqd->cic_list);
+			spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
+		}
 	}
 
-	rb_link_node(&cic->rb_node, parent, p);
-	rb_insert_color(&cic->rb_node, &ioc->cic_root);
+	if (ret)
+		printk(KERN_ERR "cfq: cic link failed!\n");
 
-	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
-	list_add(&cic->queue_list, &cfqd->cic_list);
-	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
-	spin_unlock(&ioc->lock);
+	return ret;
 }
 
 /*
@@ -1601,7 +1624,7 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 	if (!ioc)
 		return NULL;
 
-	cic = cfq_cic_rb_lookup(cfqd, ioc);
+	cic = cfq_cic_lookup(cfqd, ioc);
 	if (cic)
 		goto out;
 
@@ -1609,13 +1632,17 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 	if (cic == NULL)
 		goto err;
 
-	cfq_cic_link(cfqd, ioc, cic);
+	if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
+		goto err_free;
+
 out:
 	smp_read_barrier_depends();
 	if (unlikely(ioc->ioprio_changed))
 		cfq_ioc_set_ioprio(ioc);
 
 	return cic;
+err_free:
+	cfq_cic_free(cic);
 err:
 	put_io_context(ioc);
 	return NULL;
@@ -1909,7 +1936,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
 	 * so just lookup a possibly existing queue, or return 'may queue'
 	 * if that fails
 	 */
-	cic = cfq_cic_rb_lookup(cfqd, tsk->io_context);
+	cic = cfq_cic_lookup(cfqd, tsk->io_context);
 	if (!cic)
 		return ELV_MQUEUE_MAY;
 
@@ -2174,7 +2201,7 @@ static int __init cfq_slab_setup(void)
 	if (!cfq_pool)
 		goto fail;
 
-	cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
+	cfq_ioc_pool = KMEM_CACHE(cfq_io_context, SLAB_DESTROY_BY_RCU);
 	if (!cfq_ioc_pool)
 		goto fail;
 
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index ca8b228..2a84a09 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3831,6 +3831,16 @@ int __init blk_dev_init(void)
 	return 0;
 }
 
+static void cfq_dtor(struct io_context *ioc)
+{
+	struct cfq_io_context *cic[1];
+	int r;
+
+	r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
+	if (r > 0)
+		cic[0]->dtor(ioc);
+}
+
 /*
  * IO Context helper functions
  */
@@ -3842,18 +3852,11 @@ int put_io_context(struct io_context *ioc)
 	BUG_ON(atomic_read(&ioc->refcount) == 0);
 
 	if (atomic_dec_and_test(&ioc->refcount)) {
-		struct cfq_io_context *cic;
-
 		rcu_read_lock();
 		if (ioc->aic && ioc->aic->dtor)
 			ioc->aic->dtor(ioc->aic);
-		if (ioc->cic_root.rb_node != NULL) {
-			struct rb_node *n = rb_first(&ioc->cic_root);
-
-			cic = rb_entry(n, struct cfq_io_context, rb_node);
-			cic->dtor(ioc);
-		}
 		rcu_read_unlock();
+		cfq_dtor(ioc);
 
 		kmem_cache_free(iocontext_cachep, ioc);
 		return 1;
@@ -3862,11 +3865,23 @@ int put_io_context(struct io_context *ioc)
 }
 EXPORT_SYMBOL(put_io_context);
 
+static void cfq_exit(struct io_context *ioc)
+{
+	struct cfq_io_context *cic[1];
+	int r;
+
+	rcu_read_lock();
+	r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
+	rcu_read_unlock();
+
+	if (r > 0)
+		cic[0]->exit(ioc);
+}
+
 /* Called by the exitting task */
 void exit_io_context(void)
 {
 	struct io_context *ioc;
-	struct cfq_io_context *cic;
 
 	task_lock(current);
 	ioc = current->io_context;
@@ -3876,11 +3891,7 @@ void exit_io_context(void)
 	if (atomic_dec_and_test(&ioc->nr_tasks)) {
 		if (ioc->aic && ioc->aic->exit)
 			ioc->aic->exit(ioc->aic);
-		if (ioc->cic_root.rb_node != NULL) {
-			cic = rb_entry(rb_first(&ioc->cic_root),
-				struct cfq_io_context, rb_node);
-			cic->exit(ioc);
-		}
+		cfq_exit(ioc);
 
 		put_io_context(ioc);
 	}
@@ -3900,7 +3911,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 		ret->last_waited = jiffies; /* doesn't matter... */
 		ret->nr_batch_requests = 0; /* because this is 0 */
 		ret->aic = NULL;
-		ret->cic_root.rb_node = NULL;
+		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
 		ret->ioc_data = NULL;
 	}
 
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index cd44d45..593b222 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -1,6 +1,8 @@
 #ifndef IOCONTEXT_H
 #define IOCONTEXT_H
 
+#include <linux/radix-tree.h>
+
 /*
  * This is the per-process anticipatory I/O scheduler state.
  */
@@ -29,8 +31,8 @@ struct as_io_context {
 
 struct cfq_queue;
 struct cfq_io_context {
-	struct rb_node rb_node;
 	void *key;
+	unsigned long dead_key;
 
 	struct cfq_queue *cfqq[2];
 
@@ -74,7 +76,7 @@ struct io_context {
 	int nr_batch_requests;     /* Number of requests left in the batch */
 
 	struct as_io_context *aic;
-	struct rb_root cic_root;
+	struct radix_tree_root radix_root;
 	void *ioc_data;
 };
 
-- 
1.5.4.rc2.84.gf85fd



* [PATCH 5/6] io_context sharing - anticipatory changes
  2008-01-22  9:49 [PATCH 0/6] IO context sharing Jens Axboe
                   ` (3 preceding siblings ...)
  2008-01-22  9:49 ` [PATCH 4/6] block: cfq: make the io context sharing lockless Jens Axboe
@ 2008-01-22  9:49 ` Jens Axboe
  2008-01-22  9:49 ` [PATCH 6/6] kernel: add CLONE_IO to specifically request sharing of IO contexts Jens Axboe
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2008-01-22  9:49 UTC (permalink / raw)
  To: linux-kernel; +Cc: knikanth, Jens Axboe

Changes to the anticipatory IO scheduler for io_context sharing.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/as-iosched.c |   34 ++++++++++++++++++++++++++++------
 1 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/block/as-iosched.c b/block/as-iosched.c
index cb5e53b..b201d16 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -170,9 +170,11 @@ static void free_as_io_context(struct as_io_context *aic)
 
 static void as_trim(struct io_context *ioc)
 {
+	spin_lock(&ioc->lock);
 	if (ioc->aic)
 		free_as_io_context(ioc->aic);
 	ioc->aic = NULL;
+	spin_unlock(&ioc->lock);
 }
 
 /* Called when the task exits */
@@ -462,7 +464,9 @@ static void as_antic_timeout(unsigned long data)
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (ad->antic_status == ANTIC_WAIT_REQ
 			|| ad->antic_status == ANTIC_WAIT_NEXT) {
-		struct as_io_context *aic = ad->io_context->aic;
+		struct as_io_context *aic;
+		spin_lock(&ad->io_context->lock);
+		aic = ad->io_context->aic;
 
 		ad->antic_status = ANTIC_FINISHED;
 		kblockd_schedule_work(&ad->antic_work);
@@ -475,6 +479,7 @@ static void as_antic_timeout(unsigned long data)
 			/* process not "saved" by a cooperating request */
 			ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
 		}
+		spin_unlock(&ad->io_context->lock);
 	}
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
@@ -635,9 +640,11 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
 
 	ioc = ad->io_context;
 	BUG_ON(!ioc);
+	spin_lock(&ioc->lock);
 
 	if (rq && ioc == RQ_IOC(rq)) {
 		/* request from same process */
+		spin_unlock(&ioc->lock);
 		return 1;
 	}
 
@@ -646,20 +653,25 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
 		 * In this situation status should really be FINISHED,
 		 * however the timer hasn't had the chance to run yet.
 		 */
+		spin_unlock(&ioc->lock);
 		return 1;
 	}
 
 	aic = ioc->aic;
-	if (!aic)
+	if (!aic) {
+		spin_unlock(&ioc->lock);
 		return 0;
+	}
 
 	if (atomic_read(&aic->nr_queued) > 0) {
 		/* process has more requests queued */
+		spin_unlock(&ioc->lock);
 		return 1;
 	}
 
 	if (atomic_read(&aic->nr_dispatched) > 0) {
 		/* process has more requests dispatched */
+		spin_unlock(&ioc->lock);
 		return 1;
 	}
 
@@ -680,6 +692,7 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
 		}
 
 		as_update_iohist(ad, aic, rq);
+		spin_unlock(&ioc->lock);
 		return 1;
 	}
 
@@ -688,20 +701,27 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
 		if (aic->ttime_samples == 0)
 			ad->exit_prob = (7*ad->exit_prob + 256)/8;
 
-		if (ad->exit_no_coop > 128)
+		if (ad->exit_no_coop > 128) {
+			spin_unlock(&ioc->lock);
 			return 1;
+		}
 	}
 
 	if (aic->ttime_samples == 0) {
-		if (ad->new_ttime_mean > ad->antic_expire)
+		if (ad->new_ttime_mean > ad->antic_expire) {
+			spin_unlock(&ioc->lock);
 			return 1;
-		if (ad->exit_prob * ad->exit_no_coop > 128*256)
+		}
+		if (ad->exit_prob * ad->exit_no_coop > 128*256) {
+			spin_unlock(&ioc->lock);
 			return 1;
+		}
 	} else if (aic->ttime_mean > ad->antic_expire) {
 		/* the process thinks too much between requests */
+		spin_unlock(&ioc->lock);
 		return 1;
 	}
-
+	spin_unlock(&ioc->lock);
 	return 0;
 }
 
@@ -1255,7 +1275,9 @@ static void as_merged_requests(struct request_queue *q, struct request *req,
 			 * Don't copy here but swap, because when anext is
 			 * removed below, it must contain the unused context
 			 */
+			double_spin_lock(&rioc->lock, &nioc->lock, rioc < nioc);
 			swap_io_context(&rioc, &nioc);
+			double_spin_unlock(&rioc->lock, &nioc->lock, rioc < nioc);
 		}
 	}
 
-- 
1.5.4.rc2.84.gf85fd



* [PATCH 6/6] kernel: add CLONE_IO to specifically request sharing of IO contexts
  2008-01-22  9:49 [PATCH 0/6] IO context sharing Jens Axboe
                   ` (4 preceding siblings ...)
  2008-01-22  9:49 ` [PATCH 5/6] io_context sharing - anticipatory changes Jens Axboe
@ 2008-01-22  9:49 ` Jens Axboe
  2008-01-22 14:49 ` [PATCH 0/6] IO context sharing Peter Zijlstra
  2008-01-23  3:50 ` David Chinner
  7 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2008-01-22  9:49 UTC (permalink / raw)
  To: linux-kernel; +Cc: knikanth, Jens Axboe

syslets can set this to enforce sharing of io context.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/sched.h |    1 +
 kernel/fork.c         |   14 ++++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 638250c..b6dbcd6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -27,6 +27,7 @@
 #define CLONE_NEWUSER		0x10000000	/* New user namespace */
 #define CLONE_NEWPID		0x20000000	/* New pid namespace */
 #define CLONE_NEWNET		0x40000000	/* New network namespace */
+#define CLONE_IO		0x80000000	/* Clone io context */
 
 /*
  * Scheduling policies
diff --git a/kernel/fork.c b/kernel/fork.c
index 4cfb0f4..e7eec39 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -792,14 +792,20 @@ out:
 	return error;
 }
 
-static int copy_io(struct task_struct *tsk)
+static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 {
 	struct io_context *ioc = current->io_context;
 
 	if (!ioc)
 		return 0;
-
-	if (ioprio_valid(ioc->ioprio)) {
+	/*
+	 * Share io context with parent, if CLONE_IO is set
+	 */
+	if (clone_flags & CLONE_IO) {
+		tsk->io_context = ioc_task_link(ioc);
+		if (unlikely(!tsk->io_context))
+			return -ENOMEM;
+	} else if (ioprio_valid(ioc->ioprio)) {
 		tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
 		if (unlikely(!tsk->io_context))
 			return -ENOMEM;
@@ -1166,7 +1172,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_mm;
 	if ((retval = copy_namespaces(clone_flags, p)))
 		goto bad_fork_cleanup_keys;
-	if ((retval = copy_io(p)))
+	if ((retval = copy_io(clone_flags, p)))
 		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
-- 
1.5.4.rc2.84.gf85fd



* Re: [PATCH 0/6] IO context sharing
  2008-01-22  9:49 [PATCH 0/6] IO context sharing Jens Axboe
                   ` (5 preceding siblings ...)
  2008-01-22  9:49 ` [PATCH 6/6] kernel: add CLONE_IO to specifically request sharing of IO contexts Jens Axboe
@ 2008-01-22 14:49 ` Peter Zijlstra
  2008-01-22 18:21   ` Jens Axboe
  2008-01-23  3:50 ` David Chinner
  7 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2008-01-22 14:49 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, knikanth


On Tue, 2008-01-22 at 10:49 +0100, Jens Axboe wrote:
> Hi,
> 
> Today io contexts are per-process and define the (surprise) io context
> of that process. In some situations it would be handy if several
> processes share an IO context. With syslets it's especially crucial that
> the threads working on behalf of the parent process are seen as the same
> IO context. That goes from both a fairness and performance point of
> view, since IO schedulers like AS and CFQ base decisions on what the
> IO context state is. This patchset adds support for processes sharing a
> single io context and also adds a CLONE_IO flag to denote sharing of
> IO context.

Are you planning on extending this to a hierarchical CFQ for cgroups, or
just giving the IO context to a cgroup?





* Re: [PATCH 0/6] IO context sharing
  2008-01-22 14:49 ` [PATCH 0/6] IO context sharing Peter Zijlstra
@ 2008-01-22 18:21   ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2008-01-22 18:21 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, knikanth

On Tue, Jan 22 2008, Peter Zijlstra wrote:
> 
> On Tue, 2008-01-22 at 10:49 +0100, Jens Axboe wrote:
> > Hi,
> > 
> > Today io contexts are per-process and define the (surprise) io context
> > of that process. In some situations it would be handy if several
> > processes share an IO context. With syslets it's especially crucial that
> > the threads working on behalf of the parent process are seen as the same
> > IO context. That goes from both a fairness and performance point of
> > view, since IO schedulers like AS and CFQ base decisions on what the
> > IO context state is. This patchset adds support for processes sharing a
> > single io context and also adds a CLONE_IO flag to denote sharing of
> > IO context.
> 
> Are you planning on extending this to a hierarchical CFQ for cgroups, or
> just giving the IO context to a cgroup?

Not sure what extra extensions it would need on the IO side. As long as
the other side makes sure to group its processes by sharing IO context,
then CFQ will treat that as a single entity.

-- 
Jens Axboe



* Re: [PATCH 0/6] IO context sharing
  2008-01-22  9:49 [PATCH 0/6] IO context sharing Jens Axboe
                   ` (6 preceding siblings ...)
  2008-01-22 14:49 ` [PATCH 0/6] IO context sharing Peter Zijlstra
@ 2008-01-23  3:50 ` David Chinner
  2008-01-23  8:44   ` Andi Kleen
  7 siblings, 1 reply; 18+ messages in thread
From: David Chinner @ 2008-01-23  3:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, knikanth

On Tue, Jan 22, 2008 at 10:49:15AM +0100, Jens Axboe wrote:
> Hi,
> 
> Today io contexts are per-process and define the (surprise) io context
> of that process. In some situations it would be handy if several
> processes share an IO context.

I think that the nfsd threads should probably share as
well. It should probably provide an io context per thread
pool....

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group


* Re: [PATCH 0/6] IO context sharing
  2008-01-23  3:50 ` David Chinner
@ 2008-01-23  8:44   ` Andi Kleen
  0 siblings, 0 replies; 18+ messages in thread
From: Andi Kleen @ 2008-01-23  8:44 UTC (permalink / raw)
  To: David Chinner; +Cc: Jens Axboe, linux-kernel, knikanth

David Chinner <dgc@sgi.com> writes:

> On Tue, Jan 22, 2008 at 10:49:15AM +0100, Jens Axboe wrote:
> > Hi,
> > 
> > Today io contexts are per-process and define the (surprise) io context
> > of that process. In some situations it would be handy if several
> > processes share an IO context.
> 
> I think that the nfsd threads should probably share as
> well. It should probably provide an io context per thread
> pool....

Wouldn't it make more sense to have a separate io context for each
NFS client in this case? OK, that does mean some state per remote
client, but nfsd already keeps that anyway, so it would probably not
be too much work to tuck an io context in there too.

-Andi


* Re: [PATCH 1/6] ioprio: move io priority from task_struct to io_context
  2008-01-22  9:49 ` [PATCH 1/6] ioprio: move io priority from task_struct to io_context Jens Axboe
@ 2008-01-23 22:07   ` Andrew Morton
  2008-01-24  7:30     ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2008-01-23 22:07 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, knikanth, jens.axboe

> On Tue, 22 Jan 2008 10:49:16 +0100 Jens Axboe <jens.axboe@oracle.com> wrote:
> This is where it belongs and then it doesn't take up space for a
> process that doesn't do IO.
> 
> ...
>
>  struct io_context *get_io_context(gfp_t gfp_flags, int node)
>  {
> -	struct io_context *ret;
> -	ret = current_io_context(gfp_flags, node);
> -	if (likely(ret))
> -		atomic_inc(&ret->refcount);
> +	struct io_context *ret = NULL;
> +
> +	do {
> +		ret = current_io_context(gfp_flags, node);
> +		if (unlikely(!ret))
> +			break;
> +	} while (!atomic_inc_not_zero(&ret->refcount));

Looks weird.  Could do with a comment.  Or unweirding ;)

What's going on here?

>  	return ret;
>  }
>  EXPORT_SYMBOL(get_io_context);
> diff --git a/fs/ioprio.c b/fs/ioprio.c
> index e4e01bc..a760040 100644
> --- a/fs/ioprio.c
> +++ b/fs/ioprio.c
> @@ -41,18 +41,29 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
>  		return err;
>  
>  	task_lock(task);
> +	do {
> +		ioc = task->io_context;
> +		/* see wmb() in current_io_context() */
> +		smp_read_barrier_depends();
> +		if (ioc)
> +			break;
>  
> -	task->ioprio = ioprio;
> -
> -	ioc = task->io_context;
> -	/* see wmb() in current_io_context() */
> -	smp_read_barrier_depends();
> +		ioc = alloc_io_context(GFP_ATOMIC, -1);
> +		if (!ioc) {
> +			err = -ENOMEM;
> +			break;
> +		}
> +		task->io_context = ioc;
> +		ioc->task = task;
> +	} while (1);

argh.  Can't sit there in a loop retrying GFP_ATOMIC!

> -	if (ioc)
> +	if (!err) {
> +		ioc->ioprio = ioprio;
>  		ioc->ioprio_changed = 1;
> +	}
>  
>  	task_unlock(task);
> -	return 0;
> +	return err;
>  }
>  
>  asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
>
> ...
>
>  void put_io_context(struct io_context *ioc);
>  void exit_io_context(void);
>  struct io_context *get_io_context(gfp_t gfp_flags, int node);
> +struct io_context *alloc_io_context(gfp_t, int);
>  void copy_io_context(struct io_context **pdst, struct io_context **psrc);
>  void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);

The rest of the declarations around here nicely name their args.

> +static int copy_io(struct task_struct *tsk)
> +{
> +	struct io_context *ioc = current->io_context;
> +
> +	if (!ioc)
> +		return 0;
> +
> +	if (ioprio_valid(ioc->ioprio)) {
> +		tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
> +		if (unlikely(!tsk->io_context))
> +			return -ENOMEM;
> +
> +		tsk->io_context->task = tsk;
> +		tsk->io_context->ioprio = ioc->ioprio;
> +	}
> +
> +	return 0;
> +}

Should this depend on CONFIG_BLOCK?



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/6] io context sharing: preliminary support
  2008-01-22  9:49 ` [PATCH 2/6] io context sharing: preliminary support Jens Axboe
@ 2008-01-23 22:08   ` Andrew Morton
  2008-01-24  7:36     ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2008-01-23 22:08 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, knikanth, jens.axboe

> On Tue, 22 Jan 2008 10:49:17 +0100 Jens Axboe <jens.axboe@oracle.com> wrote:
> -void put_io_context(struct io_context *ioc)
> +int put_io_context(struct io_context *ioc)
>  {
>  	if (ioc == NULL)
> -		return;
> +		return 1;
>  
>  	BUG_ON(atomic_read(&ioc->refcount) == 0);
>  
> @@ -3856,7 +3856,9 @@ void put_io_context(struct io_context *ioc)
>  		rcu_read_unlock();
>  
>  		kmem_cache_free(iocontext_cachep, ioc);
> +		return 1;
>  	}
> +	return 0;
>  }

Document the return value?  (and the function)

I assume this return value gets used in some other patch.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/6] block: cfq: make the io context sharing lockless
  2008-01-22  9:49 ` [PATCH 4/6] block: cfq: make the io context sharing lockless Jens Axboe
@ 2008-01-23 22:08   ` Andrew Morton
  2008-01-24  7:36     ` Jens Axboe
  2008-01-24 16:42     ` Paul E. McKenney
  0 siblings, 2 replies; 18+ messages in thread
From: Andrew Morton @ 2008-01-23 22:08 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, knikanth, jens.axboe, Paul E. McKenney

> On Tue, 22 Jan 2008 10:49:19 +0100 Jens Axboe <jens.axboe@oracle.com> wrote:
> The io context sharing introduced a per-ioc spinlock, that would protect
> the cfq io context lookup. That is a regression from the original, since
> we never needed any locking there because the ioc/cic were process private.
> 
> The cic lookup is changed from an rbtree construct to a radix tree, which
> we can then use RCU to make the reader side lockless. That is the performance
> critical path, modifying the radix tree is only done on process creation
> (when that process first does IO, actually) and on process exit (if that
> process has done IO).

Perhaps Paul would review the rcu usage here sometime?

> +/*
> + * Add cic into ioc, using cfqd as the search key. This enables us to lookup
> + * the process specific cfq io context when entered from the block layer.
> + * Also adds the cic to a per-cfqd list, used when this queue is removed.
> + */
> +static inline int

There's a lot of pointless inlining in there.

> +++ b/block/ll_rw_blk.c
> @@ -3831,6 +3831,16 @@ int __init blk_dev_init(void)
>  	return 0;
>  }
>  
> +static void cfq_dtor(struct io_context *ioc)
> +{
> +	struct cfq_io_context *cic[1];
> +	int r;
> +
> +	r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
> +	if (r > 0)
> +		cic[0]->dtor(ioc);
> +}

Some comments here might help others who are wondering why we can't just
use radix_tree_lookup().

> +static void cfq_exit(struct io_context *ioc)
> +{
> +	struct cfq_io_context *cic[1];
> +	int r;
> +
> +	rcu_read_lock();
> +	r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
> +	rcu_read_unlock();
> +
> +	if (r > 0)
> +		cic[0]->exit(ioc);
> +}

ditto.

>  /* Called by the exitting task */
>  void exit_io_context(void)
>  {
>  	struct io_context *ioc;
> -	struct cfq_io_context *cic;
>  
>  	task_lock(current);
>  	ioc = current->io_context;
> @@ -3876,11 +3891,7 @@ void exit_io_context(void)
>  	if (atomic_dec_and_test(&ioc->nr_tasks)) {
>  		if (ioc->aic && ioc->aic->exit)
>  			ioc->aic->exit(ioc->aic);
> -		if (ioc->cic_root.rb_node != NULL) {
> -			cic = rb_entry(rb_first(&ioc->cic_root),
> -				struct cfq_io_context, rb_node);
> -			cic->exit(ioc);
> -		}
> +		cfq_exit(ioc);
>  
>  		put_io_context(ioc);
>  	}
> @@ -3900,7 +3911,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
>  		ret->last_waited = jiffies; /* doesn't matter... */
>  		ret->nr_batch_requests = 0; /* because this is 0 */
>  		ret->aic = NULL;
> -		ret->cic_root.rb_node = NULL;
> +		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);

Did this need to be atomic?



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/6] ioprio: move io priority from task_struct to io_context
  2008-01-23 22:07   ` Andrew Morton
@ 2008-01-24  7:30     ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2008-01-24  7:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, knikanth

On Wed, Jan 23 2008, Andrew Morton wrote:
> > On Tue, 22 Jan 2008 10:49:16 +0100 Jens Axboe <jens.axboe@oracle.com> wrote:
> > This is where it belongs and then it doesn't take up space for a
> > process that doesn't do IO.
> > 
> > ...
> >
> >  struct io_context *get_io_context(gfp_t gfp_flags, int node)
> >  {
> > -	struct io_context *ret;
> > -	ret = current_io_context(gfp_flags, node);
> > -	if (likely(ret))
> > -		atomic_inc(&ret->refcount);
> > +	struct io_context *ret = NULL;
> > +
> > +	do {
> > +		ret = current_io_context(gfp_flags, node);
> > +		if (unlikely(!ret))
> > +			break;
> > +	} while (!atomic_inc_not_zero(&ret->refcount));
> 
> Looks weird.  Could do with a comment.  Or unweirding ;)
> 
> What's going on here?

It covers the unlikely event that we find a task that is on its way to
exiting. This hunk should actually be part of the cfq lockless stuff...
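
Spelling that out, an annotated version of the quoted hunk could read
as follows (the comments are mine, not part of the patch):

struct io_context *get_io_context(gfp_t gfp_flags, int node)
{
	struct io_context *ret = NULL;

	do {
		ret = current_io_context(gfp_flags, node);
		if (unlikely(!ret))
			break;
		/*
		 * atomic_inc_not_zero() refuses to take a reference on an
		 * io_context whose refcount has already hit zero, i.e. one
		 * that is being torn down by an exiting task.  In that case
		 * we loop and look the context up again instead of handing
		 * out a pointer to a dying object.
		 */
	} while (!atomic_inc_not_zero(&ret->refcount));

	return ret;
}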

> >  	return ret;
> >  }
> >  EXPORT_SYMBOL(get_io_context);
> > diff --git a/fs/ioprio.c b/fs/ioprio.c
> > index e4e01bc..a760040 100644
> > --- a/fs/ioprio.c
> > +++ b/fs/ioprio.c
> > @@ -41,18 +41,29 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
> >  		return err;
> >  
> >  	task_lock(task);
> > +	do {
> > +		ioc = task->io_context;
> > +		/* see wmb() in current_io_context() */
> > +		smp_read_barrier_depends();
> > +		if (ioc)
> > +			break;
> >  
> > -	task->ioprio = ioprio;
> > -
> > -	ioc = task->io_context;
> > -	/* see wmb() in current_io_context() */
> > -	smp_read_barrier_depends();
> > +		ioc = alloc_io_context(GFP_ATOMIC, -1);
> > +		if (!ioc) {
> > +			err = -ENOMEM;
> > +			break;
> > +		}
> > +		task->io_context = ioc;
> > +		ioc->task = task;
> > +	} while (1);
> 
> argh.  Can't sit there in a loop retrying GFP_ATOMIC!

It's not, read the loop again!
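
For anyone tripping over the same thing: the loop makes at most one
GFP_ATOMIC attempt and at most two passes. Annotating the control flow
of the quoted hunk (comments are mine):

	task_lock(task);
	do {
		ioc = task->io_context;
		/* see wmb() in current_io_context() */
		smp_read_barrier_depends();
		if (ioc)
			break;			/* context present: done */

		ioc = alloc_io_context(GFP_ATOMIC, -1);
		if (!ioc) {
			err = -ENOMEM;		/* single attempt failed, bail */
			break;
		}
		task->io_context = ioc;		/* install it... */
		ioc->task = task;
	} while (1);				/* ...so the next pass breaks */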

> > -	if (ioc)
> > +	if (!err) {
> > +		ioc->ioprio = ioprio;
> >  		ioc->ioprio_changed = 1;
> > +	}
> >  
> >  	task_unlock(task);
> > -	return 0;
> > +	return err;
> >  }
> >  
> >  asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
> >
> > ...
> >
> >  void put_io_context(struct io_context *ioc);
> >  void exit_io_context(void);
> >  struct io_context *get_io_context(gfp_t gfp_flags, int node);
> > +struct io_context *alloc_io_context(gfp_t, int);
> >  void copy_io_context(struct io_context **pdst, struct io_context **psrc);
> >  void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
> 
> The rest of the declarations around here nicely name their args.

A clear sign I didn't put those declarations there, but the inconsistent
style is surely not a good thing. Will fix that up.

> > +static int copy_io(struct task_struct *tsk)
> > +{
> > +	struct io_context *ioc = current->io_context;
> > +
> > +	if (!ioc)
> > +		return 0;
> > +
> > +	if (ioprio_valid(ioc->ioprio)) {
> > +		tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
> > +		if (unlikely(!tsk->io_context))
> > +			return -ENOMEM;
> > +
> > +		tsk->io_context->task = tsk;
> > +		tsk->io_context->ioprio = ioc->ioprio;
> > +	}
> > +
> > +	return 0;
> > +}
> 
> Should this depend on CONFIG_BLOCK?

Good question... checking... It looks like it would break, so I'll do a
!CONFIG_BLOCK fixup round.
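
One plausible shape for that fixup, guarding copy_io() in kernel/fork.c
(a sketch of what the follow-up could look like, not the actual patch):

#ifdef CONFIG_BLOCK
static int copy_io(struct task_struct *tsk)
{
	struct io_context *ioc = current->io_context;

	if (!ioc)
		return 0;

	if (ioprio_valid(ioc->ioprio)) {
		tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
		if (unlikely(!tsk->io_context))
			return -ENOMEM;

		tsk->io_context->task = tsk;
		tsk->io_context->ioprio = ioc->ioprio;
	}

	return 0;
}
#else
static inline int copy_io(struct task_struct *tsk)
{
	/* without the block layer there are no io contexts to copy */
	return 0;
}
#endif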

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/6] block: cfq: make the io context sharing lockless
  2008-01-23 22:08   ` Andrew Morton
@ 2008-01-24  7:36     ` Jens Axboe
  2008-01-24 16:42     ` Paul E. McKenney
  1 sibling, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2008-01-24  7:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, knikanth, Paul E. McKenney

On Wed, Jan 23 2008, Andrew Morton wrote:
> > On Tue, 22 Jan 2008 10:49:19 +0100 Jens Axboe <jens.axboe@oracle.com> wrote:
> > The io context sharing introduced a per-ioc spinlock, that would protect
> > the cfq io context lookup. That is a regression from the original, since
> > we never needed any locking there because the ioc/cic were process private.
> > 
> > The cic lookup is changed from an rbtree construct to a radix tree, which
> > we can then use RCU to make the reader side lockless. That is the performance
> > critical path, modifying the radix tree is only done on process creation
> > (when that process first does IO, actually) and on process exit (if that
> > process has done IO).
> 
> Perhaps Paul would review the rcu usage here sometime?

That would indeed be awesome :-)
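
For reference while reviewing: the hot path that goes lockless amounts
to the classic RCU reader pattern over the radix tree, keyed by the
cfqd pointer. A simplified paraphrase (function name indicative; the
real lookup also revalidates the cic it finds):

static struct cfq_io_context *
cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
{
	struct cfq_io_context *cic;

	rcu_read_lock();
	cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
	rcu_read_unlock();

	/*
	 * Writers (a process doing its first IO, or exiting) still
	 * serialize updates to the tree and defer freeing of cic entries
	 * via RCU, so this lookup never chases a pointer into freed memory.
	 */
	return cic;
}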

> > +/*
> > + * Add cic into ioc, using cfqd as the search key. This enables us to lookup
> > + * the process specific cfq io context when entered from the block layer.
> > + * Also adds the cic to a per-cfqd list, used when this queue is removed.
> > + */
> > +static inline int
> 
> There's a lot of pointless inlining in there.

Will kill.

> > +++ b/block/ll_rw_blk.c
> > @@ -3831,6 +3831,16 @@ int __init blk_dev_init(void)
> >  	return 0;
> >  }
> >  
> > +static void cfq_dtor(struct io_context *ioc)
> > +{
> > +	struct cfq_io_context *cic[1];
> > +	int r;
> > +
> > +	r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
> > +	if (r > 0)
> > +		cic[0]->dtor(ioc);
> > +}
> 
> Some comments here might help others who are wondering why we can't just
> use radix_tree_lookup().

Sure, will add a comment.
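
Something along these lines, perhaps, based on the "cfqd as the search
key" comment earlier in the patch (suggested wording only):

static void cfq_dtor(struct io_context *ioc)
{
	struct cfq_io_context *cic[1];
	int r;

	/*
	 * The cic entries are indexed by their cfqd (per-queue) pointer,
	 * which this generic code has no way of knowing, so a plain
	 * radix_tree_lookup() with a specific key is not possible here.
	 * Any entry will do to reach the CFQ destructor, so fetch the
	 * first one with a gang lookup starting at index 0.
	 */
	r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
	if (r > 0)
		cic[0]->dtor(ioc);
}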

> > @@ -3900,7 +3911,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
> >  		ret->last_waited = jiffies; /* doesn't matter... */
> >  		ret->nr_batch_requests = 0; /* because this is 0 */
> >  		ret->aic = NULL;
> > -		ret->cic_root.rb_node = NULL;
> > +		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
> 
> Did this need to be atomic?

It's actually only ever used together with radix_tree_preload(), where
the proper gfp mask is passed; the actual radix_tree_insert() is done
under a lock protecting the tree.
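
For the record, the insertion pattern being described is the standard
preload idiom, roughly like the below (a sketch of the idiom rather
than the patch itself; function and lock names are illustrative):

static int cfq_radix_insert(struct cfq_data *cfqd, struct io_context *ioc,
			    struct cfq_io_context *cic, gfp_t gfp_mask)
{
	int ret;

	/* preallocate radix tree nodes with the caller's gfp mask */
	ret = radix_tree_preload(gfp_mask);
	if (ret)
		return ret;

	/*
	 * The insert runs under the lock protecting the tree and consumes
	 * the preloaded nodes, so the GFP_ATOMIC | __GFP_HIGH mask in the
	 * tree root is only a rarely-hit fallback.
	 */
	spin_lock(&ioc->lock);
	ret = radix_tree_insert(&ioc->radix_root, (unsigned long) cfqd, cic);
	spin_unlock(&ioc->lock);

	radix_tree_preload_end();
	return ret;
}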

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/6] io context sharing: preliminary support
  2008-01-23 22:08   ` Andrew Morton
@ 2008-01-24  7:36     ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2008-01-24  7:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, knikanth

On Wed, Jan 23 2008, Andrew Morton wrote:
> > On Tue, 22 Jan 2008 10:49:17 +0100 Jens Axboe <jens.axboe@oracle.com> wrote:
> > -void put_io_context(struct io_context *ioc)
> > +int put_io_context(struct io_context *ioc)
> >  {
> >  	if (ioc == NULL)
> > -		return;
> > +		return 1;
> >  
> >  	BUG_ON(atomic_read(&ioc->refcount) == 0);
> >  
> > @@ -3856,7 +3856,9 @@ void put_io_context(struct io_context *ioc)
> >  		rcu_read_unlock();
> >  
> >  		kmem_cache_free(iocontext_cachep, ioc);
> > +		return 1;
> >  	}
> > +	return 0;
> >  }
> 
> Document the return value?  (and the function)

Will do.
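
A documenting comment along the lines asked for might read like this
(suggested wording, not the eventual patch; the teardown body is
elided):

/*
 * Drop a reference to an io_context.
 *
 * Returns 1 if this was the last reference and the context has been
 * freed (or if @ioc was NULL to begin with), 0 if other users still
 * hold a reference.
 */
int put_io_context(struct io_context *ioc)
{
	if (ioc == NULL)
		return 1;

	BUG_ON(atomic_read(&ioc->refcount) == 0);

	if (atomic_dec_and_test(&ioc->refcount)) {
		/* ... teardown and kmem_cache_free() as in the quoted hunk ... */
		return 1;
	}

	return 0;
}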

> I assume this return value gets used in some other patch.

Yeah, it is.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/6] block: cfq: make the io context sharing lockless
  2008-01-23 22:08   ` Andrew Morton
  2008-01-24  7:36     ` Jens Axboe
@ 2008-01-24 16:42     ` Paul E. McKenney
  1 sibling, 0 replies; 18+ messages in thread
From: Paul E. McKenney @ 2008-01-24 16:42 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jens Axboe, linux-kernel, knikanth

On Wed, Jan 23, 2008 at 02:08:19PM -0800, Andrew Morton wrote:
> > On Tue, 22 Jan 2008 10:49:19 +0100 Jens Axboe <jens.axboe@oracle.com> wrote:
> > The io context sharing introduced a per-ioc spinlock, that would protect
> > the cfq io context lookup. That is a regression from the original, since
> > we never needed any locking there because the ioc/cic were process private.
> > 
> > The cic lookup is changed from an rbtree construct to a radix tree, which
> > we can then use RCU to make the reader side lockless. That is the performance
> > critical path, modifying the radix tree is only done on process creation
> > (when that process first does IO, actually) and on process exit (if that
> > process has done IO).
> 
> Perhaps Paul would review the rcu usage here sometime?

I will take a look -- long plane trip about 36 hours from now.  ;-)

							Thanx, Paul

> > +/*
> > + * Add cic into ioc, using cfqd as the search key. This enables us to lookup
> > + * the process specific cfq io context when entered from the block layer.
> > + * Also adds the cic to a per-cfqd list, used when this queue is removed.
> > + */
> > +static inline int
> 
> There's a lot of pointless inlining in there.
> 
> > +++ b/block/ll_rw_blk.c
> > @@ -3831,6 +3831,16 @@ int __init blk_dev_init(void)
> >  	return 0;
> >  }
> >  
> > +static void cfq_dtor(struct io_context *ioc)
> > +{
> > +	struct cfq_io_context *cic[1];
> > +	int r;
> > +
> > +	r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
> > +	if (r > 0)
> > +		cic[0]->dtor(ioc);
> > +}
> 
> Some comments here might help others who are wondering why we can't just
> use radix_tree_lookup().
> 
> > +static void cfq_exit(struct io_context *ioc)
> > +{
> > +	struct cfq_io_context *cic[1];
> > +	int r;
> > +
> > +	rcu_read_lock();
> > +	r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
> > +	rcu_read_unlock();
> > +
> > +	if (r > 0)
> > +		cic[0]->exit(ioc);
> > +}
> 
> ditto.
> 
> >  /* Called by the exitting task */
> >  void exit_io_context(void)
> >  {
> >  	struct io_context *ioc;
> > -	struct cfq_io_context *cic;
> >  
> >  	task_lock(current);
> >  	ioc = current->io_context;
> > @@ -3876,11 +3891,7 @@ void exit_io_context(void)
> >  	if (atomic_dec_and_test(&ioc->nr_tasks)) {
> >  		if (ioc->aic && ioc->aic->exit)
> >  			ioc->aic->exit(ioc->aic);
> > -		if (ioc->cic_root.rb_node != NULL) {
> > -			cic = rb_entry(rb_first(&ioc->cic_root),
> > -				struct cfq_io_context, rb_node);
> > -			cic->exit(ioc);
> > -		}
> > +		cfq_exit(ioc);
> >  
> >  		put_io_context(ioc);
> >  	}
> > @@ -3900,7 +3911,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
> >  		ret->last_waited = jiffies; /* doesn't matter... */
> >  		ret->nr_batch_requests = 0; /* because this is 0 */
> >  		ret->aic = NULL;
> > -		ret->cic_root.rb_node = NULL;
> > +		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
> 
> Did this need to be atomic?
> 
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2008-01-24 16:43 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-01-22  9:49 [PATCH 0/6] IO context sharing Jens Axboe
2008-01-22  9:49 ` [PATCH 1/6] ioprio: move io priority from task_struct to io_context Jens Axboe
2008-01-23 22:07   ` Andrew Morton
2008-01-24  7:30     ` Jens Axboe
2008-01-22  9:49 ` [PATCH 2/6] io context sharing: preliminary support Jens Axboe
2008-01-23 22:08   ` Andrew Morton
2008-01-24  7:36     ` Jens Axboe
2008-01-22  9:49 ` [PATCH 3/6] io_context sharing - cfq changes Jens Axboe
2008-01-22  9:49 ` [PATCH 4/6] block: cfq: make the io context sharing lockless Jens Axboe
2008-01-23 22:08   ` Andrew Morton
2008-01-24  7:36     ` Jens Axboe
2008-01-24 16:42     ` Paul E. McKenney
2008-01-22  9:49 ` [PATCH 5/6] io_context sharing - anticipatory changes Jens Axboe
2008-01-22  9:49 ` [PATCH 6/6] kernel: add CLONE_IO to specifically request sharing of IO contexts Jens Axboe
2008-01-22 14:49 ` [PATCH 0/6] IO context sharing Peter Zijlstra
2008-01-22 18:21   ` Jens Axboe
2008-01-23  3:50 ` David Chinner
2008-01-23  8:44   ` Andi Kleen
