From mboxrd@z Thu Jan 1 00:00:00 1970
From: Vivek Goyal <vgoyal@redhat.com>
Subject: [PATCH 05/23] io-controller: Core scheduler changes to support hierarchical scheduling
Date: Fri, 28 Aug 2009 17:30:54 -0400
Message-ID: <1251495072-7780-6-git-send-email-vgoyal@redhat.com>
References: <1251495072-7780-1-git-send-email-vgoyal@redhat.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
In-Reply-To: <1251495072-7780-1-git-send-email-vgoyal@redhat.com>
To: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
 jens.axboe-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
Cc: dhaval-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org,
 dm-devel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
 agk-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
 balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org,
 paolo.valente-rcYM44yAMweonA0d6jMUrA@public.gmane.org,
 jmarchan-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
 fernando-gVGce1chcLdL9jVzuh4AOg@public.gmane.org,
 jmoyer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
 mingo-X9Un+BFzKDI@public.gmane.org,
 riel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
 fchecconi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org,
 containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
 akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org,
 righi.andrea-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org,
 torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org
List-Id: containers.vger.kernel.org

o This patch introduces core changes in the fair queuing scheduler to support
  hierarchical/group scheduling. It is enabled by CONFIG_GROUP_IOSCHED.

Signed-off-by: Fabio Checconi
Signed-off-by: Paolo Valente
Signed-off-by: Nauman Rafique
Signed-off-by: Vivek Goyal
---
 block/elevator-fq.c |  158 ++++++++++++++++++++++++++++++++++++++++++++++++---
 block/elevator-fq.h |   19 ++++++
 init/Kconfig        |    8 +++
 3 files changed, 177 insertions(+), 8 deletions(-)

diff --git a/block/elevator-fq.c b/block/elevator-fq.c
index 1ca7b4a..6546df0 100644
--- a/block/elevator-fq.c
+++ b/block/elevator-fq.c
@@ -137,6 +137,88 @@ static inline struct io_group *iog_of(struct io_entity *entity)
 	return NULL;
 }
 
+#ifdef CONFIG_GROUP_IOSCHED
+/* check for entity->parent so that the loop is not executed for the root entity. */
+#define for_each_entity(entity) \
+	for (; entity && entity->parent; entity = entity->parent)
+
+/* Do the two (enqueued) entities belong to the same group? */
+static inline int
+is_same_group(struct io_entity *entity, struct io_entity *new_entity)
+{
+	if (parent_entity(entity) == parent_entity(new_entity))
+		return 1;
+
+	return 0;
+}
+
+/* return the depth at which an io entity is present in the hierarchy */
+static inline int depth_entity(struct io_entity *entity)
+{
+	int depth = 0;
+
+	for_each_entity(entity)
+		depth++;
+
+	return depth;
+}
+
+static void find_matching_io_entity(struct io_entity **entity,
+					struct io_entity **new_entity)
+{
+	int entity_depth, new_entity_depth;
+
+	/*
+	 * The preemption test can be made only between sibling entities
+	 * that are in the same group, i.e. that have a common parent. Walk
+	 * up the hierarchy of both entities until we find their ancestors
+	 * that are children of a common parent.
+	 */
+
+	/* First walk up until both entities are at same depth */
+	entity_depth = depth_entity(*entity);
+	new_entity_depth = depth_entity(*new_entity);
+
+	while (entity_depth > new_entity_depth) {
+		entity_depth--;
+		*entity = parent_entity(*entity);
+	}
+
+	while (new_entity_depth > entity_depth) {
+		new_entity_depth--;
+		*new_entity = parent_entity(*new_entity);
+	}
+
+	while (!is_same_group(*entity, *new_entity)) {
+		*entity = parent_entity(*entity);
+		*new_entity = parent_entity(*new_entity);
+	}
+}
+
+struct io_group *ioq_to_io_group(struct io_queue *ioq)
+{
+	return iog_of(parent_entity(&ioq->entity));
+}
+EXPORT_SYMBOL(ioq_to_io_group);
+
+static inline struct io_sched_data *
+io_entity_sched_data(struct io_entity *entity)
+{
+	return &iog_of(parent_entity(entity))->sched_data;
+}
+
+#else /* GROUP_IOSCHED */
+#define for_each_entity(entity) \
+	for (; entity != NULL; entity = NULL)
+
+static void find_matching_io_entity(struct io_entity **entity,
+					struct io_entity **new_entity) { }
+
+static inline int
+is_same_group(struct io_entity *entity, struct io_entity *new_entity)
+{
+	return 1;
+}
+
 static inline struct elv_fq_data *efqd_of(struct io_entity *entity)
 {
 	return ioq_of(entity)->efqd;
@@ -155,6 +237,7 @@ io_entity_sched_data(struct io_entity *entity)
 
 	return &efqd->root_group->sched_data;
 }
+#endif /* GROUP_IOSCHED */
 
 static inline void
 init_io_entity_service_tree(struct io_entity *entity, struct io_entity *parent)
@@ -171,8 +254,10 @@ static void entity_served(struct io_entity *entity, unsigned long served,
 				unsigned long nr_sectors)
 {
-	entity->vdisktime += elv_delta_fair(served, entity);
-	update_min_vdisktime(entity->st);
+	for_each_entity(entity) {
+		entity->vdisktime += elv_delta_fair(served, entity);
+		update_min_vdisktime(entity->st);
+	}
 }
 
 static void place_entity(struct io_service_tree *st, struct io_entity *entity,
@@ -388,14 +473,23 @@ static void put_prev_ioq(struct io_queue *ioq)
 {
 	struct io_entity *entity = &ioq->entity;
 
-	put_prev_io_entity(entity);
+	for_each_entity(entity) {
+		put_prev_io_entity(entity);
+	}
 }
 
 static void dequeue_ioq(struct io_queue *ioq)
 {
 	struct io_entity *entity = &ioq->entity;
 
-	dequeue_io_entity(entity);
+	for_each_entity(entity) {
+		struct io_sched_data *sd = io_entity_sched_data(entity);
+
+		dequeue_io_entity(entity);
+		/* Don't dequeue parent if it has other entities besides us */
+		if (sd->nr_active)
+			break;
+	}
 	elv_put_ioq(ioq);
 	return;
 }
@@ -406,7 +500,12 @@ static void enqueue_ioq(struct io_queue *ioq)
 	struct io_entity *entity = &ioq->entity;
 
 	elv_get_ioq(ioq);
-	enqueue_io_entity(entity);
+
+	for_each_entity(entity) {
+		if (entity->on_st)
+			break;
+		enqueue_io_entity(entity);
+	}
 }
 
 static inline void
@@ -638,6 +737,38 @@ void elv_io_group_set_async_queue(struct io_group *iog, int ioprio_class,
 }
 EXPORT_SYMBOL(elv_io_group_set_async_queue);
 
+#ifdef CONFIG_GROUP_IOSCHED
+
+static void io_free_root_group(struct elevator_queue *e)
+{
+	struct io_group *iog = e->efqd->root_group;
+
+	put_io_group_queues(e, iog);
+	kfree(iog);
+}
+
+static struct io_group *io_alloc_root_group(struct request_queue *q,
+					struct elevator_queue *e, void *key)
+{
+	struct io_group *iog;
+	int i;
+
+	iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
+	if (iog == NULL)
+		return NULL;
+
+	iog->entity.parent = NULL;
+	iog->entity.my_sd = &iog->sched_data;
+	iog->key = key;
+
+	for (i = 0; i < IO_IOPRIO_CLASSES; i++)
+		iog->sched_data.service_tree[i] = ELV_SERVICE_TREE_INIT;
+
+	return iog;
+}
+
+#else /* CONFIG_GROUP_IOSCHED */
+
 static struct io_group *io_alloc_root_group(struct request_queue *q,
 					struct elevator_queue *e, void *key)
 {
@@ -666,6 +797,8 @@ static void io_free_root_group(struct elevator_queue *e)
 	kfree(iog);
 }
 
+#endif /* CONFIG_GROUP_IOSCHED */
+
 /*
  * Should be called after ioq prio and class has been initialized as prio
  * class data will be used to determine which service tree in the group
@@ -691,9 +824,11 @@ static struct io_queue *elv_get_next_ioq(struct request_queue *q)
 		return NULL;
 
 	sd = &efqd->root_group->sched_data;
-	entity = lookup_next_io_entity(sd);
-	if (!entity)
-		return NULL;
+	for (; sd != NULL; sd = entity->my_sd) {
+		entity = lookup_next_io_entity(sd);
+		if (!entity)
+			return NULL;
+	}
 
 	ioq = ioq_of(entity);
 	return ioq;
@@ -894,6 +1029,13 @@ static int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
 	new_entity = &new_ioq->entity;
 
 	/*
+	 * In a hierarchical setup, one needs to traverse up the hierarchy
+	 * until both queues are children of the same parent to decide
+	 * whether preemption should be done or not.
+	 */
+	find_matching_io_entity(&entity, &new_entity);
+
+	/*
 	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
 	 */
 
diff --git a/block/elevator-fq.h b/block/elevator-fq.h
index 6d3809f..776f429 100644
--- a/block/elevator-fq.h
+++ b/block/elevator-fq.h
@@ -86,6 +86,23 @@ struct io_queue {
 	void *sched_queue;
 };
 
+#ifdef CONFIG_GROUP_IOSCHED
+struct io_group {
+	struct io_entity entity;
+	atomic_t ref;
+	struct io_sched_data sched_data;
+	/*
+	 * async queues for each priority case, for the RT and BE classes.
+	 * Used only for cfq.
+	 */
+
+	struct io_queue *async_queue[2][IOPRIO_BE_NR];
+	struct io_queue *async_idle_queue;
+	void *key;
+};
+
+#else /* CONFIG_GROUP_IOSCHED */
+
 struct io_group {
 	struct io_entity entity;
 	struct io_sched_data sched_data;
@@ -99,6 +116,8 @@ struct io_group {
 	void *key;
 };
 
+#endif /* CONFIG_GROUP_IOSCHED */
+
 struct elv_fq_data {
 	struct io_group *root_group;
 
diff --git a/init/Kconfig b/init/Kconfig
index 3f7e609..29f701d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -612,6 +612,14 @@ config CGROUP_MEM_RES_CTLR_SWAP
 	  Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
 	  size is 4096bytes, 512k per 1Gbytes of swap.
 
+config GROUP_IOSCHED
+	bool "Group IO Scheduler"
+	depends on CGROUPS && ELV_FAIR_QUEUING
+	default n
+	---help---
+	  This feature lets the IO scheduler recognize task groups and control
+	  disk bandwidth allocation to such task groups.
+
 endif # CGROUPS
 
 config MM_OWNER
-- 
1.6.0.6
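
P.S. As a footnote for reviewers: the ancestor walk that find_matching_io_entity()
performs above is easy to exercise on its own. Below is a minimal user-space sketch
of it; the io_entity here is a simplified stand-in (only a parent pointer plus a name
for printing, not the kernel structure), and is_same_group() is folded into a direct
parent-pointer comparison, so treat it as an illustration of the walk rather than
kernel code.

#include <stdio.h>

/*
 * Simplified stand-in for the kernel's io_entity: just a parent pointer
 * and a name for printing.
 */
struct io_entity {
	struct io_entity *parent;
	const char *name;
};

static struct io_entity *parent_entity(struct io_entity *entity)
{
	return entity->parent;
}

/* Depth in the hierarchy; the walk stops before the parent-less root. */
static int depth_entity(struct io_entity *entity)
{
	int depth = 0;

	for (; entity && entity->parent; entity = entity->parent)
		depth++;
	return depth;
}

/*
 * Same logic as the patch: first bring both entities to the same depth,
 * then climb in lockstep until the two ancestors share a parent.
 */
static void find_matching_io_entity(struct io_entity **entity,
				    struct io_entity **new_entity)
{
	int entity_depth = depth_entity(*entity);
	int new_entity_depth = depth_entity(*new_entity);

	while (entity_depth > new_entity_depth) {
		entity_depth--;
		*entity = parent_entity(*entity);
	}

	while (new_entity_depth > entity_depth) {
		new_entity_depth--;
		*new_entity = parent_entity(*new_entity);
	}

	while (parent_entity(*entity) != parent_entity(*new_entity)) {
		*entity = parent_entity(*entity);
		*new_entity = parent_entity(*new_entity);
	}
}

int main(void)
{
	/* Hierarchy: root -> g1 -> { g2 -> q1, q2 } */
	struct io_entity root = { NULL,  "root" };
	struct io_entity g1   = { &root, "g1" };
	struct io_entity g2   = { &g1,   "g2" };
	struct io_entity q1   = { &g2,   "q1" };
	struct io_entity q2   = { &g1,   "q2" };

	struct io_entity *a = &q1, *b = &q2;

	find_matching_io_entity(&a, &b);

	/* Prints "g2 q2": q1 was replaced by its ancestor g2, so both
	 * entities are now children of g1 and directly comparable. */
	printf("%s %s\n", a->name, b->name);
	return 0;
}

Compiled and run, this prints "g2 q2": q1 sits one level deeper than q2, so it is
replaced by its ancestor g2, which is a sibling of q2 under g1. That is exactly the
state elv_should_preempt() needs, since the preemption test is only meaningful
between entities that share a parent and hence the same service tree.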