* [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
@ 2010-08-30  6:50 Gui Jianfeng
  2010-08-30 18:20 ` Chad Talbott
  2010-08-30 20:36 ` Vivek Goyal
  0 siblings, 2 replies; 19+ messages in thread
From: Gui Jianfeng @ 2010-08-30  6:50 UTC (permalink / raw)
  To: Vivek Goyal, Jens Axboe
  Cc: Jeff Moyer, Divyesh Shah, Corrado Zoccolo, Nauman Rafique,
	Gui Jianfeng, linux kernel mailing list

Hi All,

This patch enables cfq group hierarchical scheduling.

With this patch, you can create cgroup directories deeper than level 1.
I/O bandwidth is now distributed hierarchically. For example, suppose
we create cgroup directories as follows (the numbers represent weights):

            Root grp
           /       \
       grp_1(100) grp_2(400)
       /    \ 
  grp_3(200) grp_4(300)

If grp_2, grp_3 and grp_4 are contending for I/O bandwidth,
grp_2 will get 80% of the total bandwidth.
Of the 20% that goes to grp_1's subtree, grp_3 gets 8% (20% * 40%)
and grp_4 gets 12% (20% * 60%).
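
In general, each group's share works out to its weight divided by the
total weight of the backlogged entities on its parent's service tree,
scaled by the parent's share:

  share(grp)   = share(parent) * weight(grp) / total_weight(parent's tree)

  share(grp_2) = 100% * 400/500 = 80%
  share(grp_1) = 100% * 100/500 = 20%
  share(grp_3) =  20% * 200/500 =  8%
  share(grp_4) =  20% * 300/500 = 12%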

Design:
  o Each cfq group has its own group service tree.
  o Each cfq group contains a "group schedule entity" (gse) that
    schedules on the parent cfq group's service tree.
  o Each cfq group contains a "queue schedule entity" (qse) that
    represents all cfqqs located in this cfq group. It schedules
    on this group's own service tree. For the time being, the root
    group's qse has weight 1000, and a subgroup's qse has weight 500.
  o All gses and the qse that belong to the same cfq group schedule
    on that group's service tree (see the example below).
  o cfq groups are allocated recursively: when a cfq group needs to
    be allocated, its upper-level cfq groups are allocated as well.
  o When a cfq group is served, not only that group but also its
    ancestors are charged.
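
For the example hierarchy above, this means the root group's service
tree holds the root group's qse plus the gses of grp_1 and grp_2,
while grp_1's service tree holds grp_1's own qse plus the gses of
grp_3 and grp_4.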

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/blk-cgroup.c  |    4 -
 block/cfq-iosched.c |  477 +++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 353 insertions(+), 128 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2fef1ef..aa06cb3 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -964,10 +964,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		goto done;
 	}
 
-	/* Currently we do not support hierarchy deeper than two level (0,1) */
-	if (parent != cgroup->top_cgroup)
-		return ERR_PTR(-EPERM);
-
 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 	if (!blkcg)
 		return ERR_PTR(-ENOMEM);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f65c6f0..4660d89 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida);
 #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
 
 #define sample_valid(samples)	((samples) > 80)
-#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
+#define rb_entry_se(node)	\
+	rb_entry((node), struct io_sched_entity, rb_node)
 
 /*
  * Most of our rbtree usage is for sorting with min extraction, so
@@ -171,19 +172,40 @@ enum wl_type_t {
 	SYNC_WORKLOAD = 2
 };
 
-/* This is per cgroup per device grouping structure */
-struct cfq_group {
+/*
+ * This is the schedule entity that is scheduled on a group service tree.
+ * It represents either a cfq group, or the bundle of cfq queues that
+ * live in a cfq group.
+ */
+struct io_sched_entity {
+	struct cfq_rb_root *st;
+
 	/* group service_tree member */
 	struct rb_node rb_node;
-
-	/* group service_tree key */
 	u64 vdisktime;
 	unsigned int weight;
 	bool on_st;
+	bool is_group_se;
+	struct io_sched_entity *parent;
+};
+
+/* This is per cgroup per device grouping structure */
+struct cfq_group {
+	/* cfq group sched entity */
+	struct io_sched_entity group_se;
+
+	/* sched entity for cfqqs */
+	struct io_sched_entity queue_se;
+
+	/* Service tree for child cfq groups and this group's cfqq set */
+	struct cfq_rb_root grp_service_tree;
 
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
 
+	/* number of sub cfq groups */
+	int nr_subgp;
+
 	/* Per group busy queus average. Useful for workload slice calc. */
 	unsigned int busy_queues_avg[2];
 	/*
@@ -210,8 +232,6 @@ struct cfq_group {
  */
 struct cfq_data {
 	struct request_queue *queue;
-	/* Root service tree for cfq_groups */
-	struct cfq_rb_root grp_service_tree;
 	struct cfq_group root_group;
 
 	/*
@@ -399,6 +419,22 @@ static inline bool iops_mode(struct cfq_data *cfqd)
 		return false;
 }
 
+static inline struct cfq_group *cfqg_of_gse(struct io_sched_entity *se)
+{
+	if (se->is_group_se)
+		return container_of(se, struct cfq_group, group_se);
+	else
+		return NULL;
+}
+
+static inline struct cfq_group *cfqg_of_qse(struct io_sched_entity *se)
+{
+	if (!se->is_group_se)
+		return container_of(se, struct cfq_group, queue_se);
+	else
+		return NULL;
+}
+
 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
 {
 	if (cfq_class_idle(cfqq))
@@ -522,12 +558,13 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 
-static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+static inline u64 cfq_scale_slice(unsigned long delta,
+				  struct io_sched_entity *se)
 {
 	u64 d = delta << CFQ_SERVICE_SHIFT;
 
 	d = d * BLKIO_WEIGHT_DEFAULT;
-	do_div(d, cfqg->weight);
+	do_div(d, se->weight);
 	return d;
 }
 
@@ -552,16 +589,16 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
 static void update_min_vdisktime(struct cfq_rb_root *st)
 {
 	u64 vdisktime = st->min_vdisktime;
-	struct cfq_group *cfqg;
+	struct io_sched_entity *se;
 
 	if (st->active) {
-		cfqg = rb_entry_cfqg(st->active);
-		vdisktime = cfqg->vdisktime;
+		se = rb_entry_se(st->active);
+		vdisktime = se->vdisktime;
 	}
 
 	if (st->left) {
-		cfqg = rb_entry_cfqg(st->left);
-		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
+		se = rb_entry_se(st->left);
+		vdisktime = min_vdisktime(vdisktime, se->vdisktime);
 	}
 
 	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
@@ -591,9 +628,10 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
 static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct io_sched_entity *qse = &cfqg->queue_se;
+	struct cfq_rb_root *st = qse->st;
 
-	return cfq_target_latency * cfqg->weight / st->total_weight;
+	return cfq_target_latency * qse->weight / st->total_weight;
 }
 
 static inline void
@@ -756,13 +794,13 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
 	return NULL;
 }
 
-static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
+static struct io_sched_entity *cfq_rb_first_se(struct cfq_rb_root *root)
 {
 	if (!root->left)
 		root->left = rb_first(&root->rb);
 
 	if (root->left)
-		return rb_entry_cfqg(root->left);
+		return rb_entry_se(root->left);
 
 	return NULL;
 }
@@ -819,25 +857,25 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
 }
 
 static inline s64
-cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
+se_key(struct cfq_rb_root *st, struct io_sched_entity *se)
 {
-	return cfqg->vdisktime - st->min_vdisktime;
+	return se->vdisktime - st->min_vdisktime;
 }
 
 static void
-__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
+__io_sched_entity_add(struct cfq_rb_root *st, struct io_sched_entity *se)
 {
 	struct rb_node **node = &st->rb.rb_node;
 	struct rb_node *parent = NULL;
-	struct cfq_group *__cfqg;
-	s64 key = cfqg_key(st, cfqg);
+	struct io_sched_entity *__se;
+	s64 key = se_key(st, se);
 	int left = 1;
 
 	while (*node != NULL) {
 		parent = *node;
-		__cfqg = rb_entry_cfqg(parent);
+		__se = rb_entry_se(parent);
 
-		if (key < cfqg_key(st, __cfqg))
+		if (key < se_key(st, __se))
 			node = &parent->rb_left;
 		else {
 			node = &parent->rb_right;
@@ -846,47 +884,80 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	}
 
 	if (left)
-		st->left = &cfqg->rb_node;
+		st->left = &se->rb_node;
 
-	rb_link_node(&cfqg->rb_node, parent, node);
-	rb_insert_color(&cfqg->rb_node, &st->rb);
+	rb_link_node(&se->rb_node, parent, node);
+	rb_insert_color(&se->rb_node, &st->rb);
 }
 
-static void
-cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static void io_sched_entity_add(struct cfq_rb_root *st,
+				struct io_sched_entity *se)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-	struct cfq_group *__cfqg;
 	struct rb_node *n;
+	struct io_sched_entity *__se;
 
-	cfqg->nr_cfqq++;
-	if (cfqg->on_st)
-		return;
 
+	if (se->on_st)
+		return;
 	/*
 	 * Currently put the group at the end. Later implement something
 	 * so that groups get lesser vtime based on their weights, so that
 	 * if group does not loose all if it was not continously backlogged.
 	 */
-	n = rb_last(&st->rb);
+	n = rb_last(&se->st->rb);
 	if (n) {
-		__cfqg = rb_entry_cfqg(n);
-		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
+		__se = rb_entry_se(n);
+		se->vdisktime = __se->vdisktime + CFQ_IDLE_DELAY;
 	} else
-		cfqg->vdisktime = st->min_vdisktime;
+		se->vdisktime = st->min_vdisktime;
 
-	__cfq_group_service_tree_add(st, cfqg);
-	cfqg->on_st = true;
-	st->total_weight += cfqg->weight;
+	__io_sched_entity_add(se->st, se);
+	se->on_st = true;
+	st->total_weight += se->weight;
+}
+
+static void
+cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+	struct io_sched_entity *gse = &cfqg->group_se;
+	struct io_sched_entity *qse = &cfqg->queue_se;
+	struct cfq_group *__cfqg;
+
+	cfqg->nr_cfqq++;
+
+	io_sched_entity_add(qse->st, qse);
+
+	while (gse && gse->parent) {
+		if (gse->on_st)
+			return;
+		io_sched_entity_add(gse->st, gse);
+		gse = gse->parent;
+		__cfqg = cfqg_of_gse(gse);
+		__cfqg->nr_subgp++;
+	}
+}
+
+static void io_sched_entity_del(struct io_sched_entity *se)
+{
+	if (!RB_EMPTY_NODE(&se->rb_node))
+		cfq_rb_erase(&se->rb_node, se->st);
+
+	se->on_st = false;
+	se->st->total_weight -= se->weight;
 }
 
 static void
 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct io_sched_entity *gse = &cfqg->group_se;
+	struct io_sched_entity *qse = &cfqg->queue_se;
+	struct cfq_group *__cfqg, *p_cfqg;
+
+	if (gse->st && gse->st->active == &gse->rb_node)
+		gse->st->active = NULL;
 
-	if (st->active == &cfqg->rb_node)
-		st->active = NULL;
+	if (qse->st && qse->st->active == &qse->rb_node)
+		qse->st->active = NULL;
 
 	BUG_ON(cfqg->nr_cfqq < 1);
 	cfqg->nr_cfqq--;
@@ -895,13 +966,25 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	if (cfqg->nr_cfqq)
 		return;
 
-	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	cfqg->on_st = false;
-	st->total_weight -= cfqg->weight;
-	if (!RB_EMPTY_NODE(&cfqg->rb_node))
-		cfq_rb_erase(&cfqg->rb_node, st);
-	cfqg->saved_workload_slice = 0;
-	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+	/* dequeue queue se from group */
+	io_sched_entity_del(qse);
+
+	if (cfqg->nr_subgp)
+		return;
+
+	/* stop before dequeuing the root group */
+	while (gse && gse->parent) {
+		__cfqg = cfqg_of_gse(gse);
+		p_cfqg = cfqg_of_gse(gse->parent);
+		io_sched_entity_del(gse);
+		cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+		cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+		__cfqg->saved_workload_slice = 0;
+		gse = gse->parent;
+		p_cfqg->nr_subgp--;
+		if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+			return;
+	}
 }
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -933,7 +1016,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 				struct cfq_queue *cfqq)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct io_sched_entity *gse = &cfqg->group_se;
+	struct io_sched_entity *qse = &cfqg->queue_se;
 	unsigned int used_sl, charge;
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
@@ -946,10 +1030,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
 		charge = cfqq->allocated_slice;
 
-	/* Can't update vdisktime while group is on service tree */
-	cfq_rb_erase(&cfqg->rb_node, st);
-	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
-	__cfq_group_service_tree_add(st, cfqg);
+	/*
+	 * Update the queue se's vdisktime. We can't update the
+	 * vdisktime while the entity is on a service tree.
+	 */
+
+	cfq_rb_erase(&qse->rb_node, qse->st);
+	qse->vdisktime += cfq_scale_slice(charge, qse);
+	__io_sched_entity_add(qse->st, qse);
+	if (&qse->rb_node == qse->st->active)
+		qse->st->active = NULL;
+
+	while (gse && gse->parent) {
+		cfq_rb_erase(&gse->rb_node, gse->st);
+		gse->vdisktime += cfq_scale_slice(charge, gse);
+		__io_sched_entity_add(gse->st, gse);
+		if (&gse->rb_node == gse->st->active)
+			gse->st->active = NULL;
+		gse = gse->parent;
+	}
 
 	/* This group is being expired. Save the context */
 	if (time_after(cfqd->workload_expires, jiffies)) {
@@ -960,8 +1059,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	} else
 		cfqg->saved_workload_slice = 0;
 
-	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
-					st->min_vdisktime);
+	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", gse->vdisktime,
+		     gse->st->min_vdisktime);
+
 	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
 			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
 			iops_mode(cfqd), cfqq->nr_sectors);
@@ -977,39 +1077,84 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
 	return NULL;
 }
 
+static void cfq_put_cfqg(struct cfq_group *cfqg)
+{
+	struct cfq_rb_root *st;
+	int i, j;
+	struct io_sched_entity *gse;
+	struct cfq_group *p_cfqg;
+
+	BUG_ON(atomic_read(&cfqg->ref) <= 0);
+	if (!atomic_dec_and_test(&cfqg->ref))
+		return;
+	for_each_cfqg_st(cfqg, i, j, st)
+		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
+
+	gse = &cfqg->group_se;
+	if (gse->parent) {
+		p_cfqg = cfqg_of_gse(gse->parent);
+		/* Drop the reference taken by children */
+		atomic_dec(&p_cfqg->ref);
+	}
+
+	kfree(cfqg);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+	/* Something wrong if we are trying to remove same group twice */
+	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
+
+	hlist_del_init(&cfqg->cfqd_node);
+
+	/*
+	 * Put the reference taken at the time of creation so that when all
+	 * queues are gone, group can be destroyed.
+	 */
+	cfq_put_cfqg(cfqg);
+}
+
 void
 cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
 {
-	cfqg_of_blkg(blkg)->weight = weight;
+	struct cfq_group *cfqg;
+	struct io_sched_entity *gse;
+
+	cfqg = cfqg_of_blkg(blkg);
+	gse = &cfqg->group_se;
+	gse->weight = weight;
 }
 
-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void init_group_queue_entity(struct blkio_cgroup *blkcg,
+				   struct cfq_group *cfqg)
+{
+	struct io_sched_entity *gse = &cfqg->group_se;
+	struct io_sched_entity *qse = &cfqg->queue_se;
+
+	gse->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	RB_CLEAR_NODE(&gse->rb_node);
+	gse->is_group_se = true;
+	gse->on_st = false;
+
+	/* Set to 500 for the time being */
+	qse->weight = 500;
+	RB_CLEAR_NODE(&qse->rb_node);
+	qse->is_group_se = false;
+	qse->on_st = false;
+}
+
+static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
+		      struct cfq_group *cfqg)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-	struct cfq_group *cfqg = NULL;
-	void *key = cfqd;
 	int i, j;
 	struct cfq_rb_root *st;
-	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 	unsigned int major, minor;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 
-	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
-	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfqg->blkg.dev = MKDEV(major, minor);
-		goto done;
-	}
-	if (cfqg || !create)
-		goto done;
-
-	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
-	if (!cfqg)
-		goto done;
+	cfqg->grp_service_tree = CFQ_RB_ROOT;
 
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->rb_node);
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1023,11 +1168,102 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
 	cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
 					MKDEV(major, minor));
-	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
-
+	init_group_queue_entity(blkcg, cfqg);
 	/* Add group on cfqd list */
 	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+	if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+		cfq_destroy_cfqg(cfqd, cfqg);
+}
+
+static void cfqg_set_parent(struct cfq_group *cfqg, struct cfq_group *p_cfqg)
+{
+	struct io_sched_entity *gse = &cfqg->group_se;
+	struct io_sched_entity *qse = &cfqg->queue_se;
+	struct io_sched_entity *p_gse = &p_cfqg->group_se;
+
+	gse->parent = p_gse;
+	gse->st = &p_cfqg->grp_service_tree;
+
+	qse->parent = gse;
+	qse->st = &cfqg->grp_service_tree;
+
+	/* child reference */
+	atomic_inc(&p_cfqg->ref);
+}
+
+int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct blkio_cgroup *p_blkcg;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+	struct cfq_group *cfqg, *p_cfqg;
+	void *key = cfqd;
+	int ret;
+
+	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+	if (cfqg) {
+		if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+			sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+			cfqg->blkg.dev = MKDEV(major, minor);
+		}
+		/* chain has already been built */
+		return 0;
+	}
+
+	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
+	if (!cfqg)
+		return -1;
+
+	init_cfqg(cfqd, blkcg, cfqg);
+
+	/* Already at the top group */
+	if (!cgroup->parent)
+		return 0;
+
+	ret = cfqg_chain_alloc(cfqd, cgroup->parent);
+	if (ret == -1) {
+		uninit_cfqg(cfqd, cfqg);
+		return -1;
+	}
+
+	p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
+	p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
+	BUG_ON(p_cfqg == NULL);
+
+	cfqg_set_parent(cfqg, p_cfqg);
+	return 0;
+}
+
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct cfq_group *cfqg = NULL;
+	void *key = cfqd;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+	int ret;
 
+	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+		cfqg->blkg.dev = MKDEV(major, minor);
+		goto done;
+	}
+	if (cfqg || !create)
+		goto done;
+
+	ret = cfqg_chain_alloc(cfqd, cgroup);
+	if (!ret) {
+		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+		BUG_ON(cfqg == NULL);
+		goto done;
+	}
 done:
 	return cfqg;
 }
@@ -1067,33 +1303,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 	atomic_inc(&cfqq->cfqg->ref);
 }
 
-static void cfq_put_cfqg(struct cfq_group *cfqg)
-{
-	struct cfq_rb_root *st;
-	int i, j;
-
-	BUG_ON(atomic_read(&cfqg->ref) <= 0);
-	if (!atomic_dec_and_test(&cfqg->ref))
-		return;
-	for_each_cfqg_st(cfqg, i, j, st)
-		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
-	kfree(cfqg);
-}
-
-static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
-{
-	/* Something wrong if we are trying to remove same group twice */
-	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
-
-	hlist_del_init(&cfqg->cfqd_node);
-
-	/*
-	 * Put the reference taken at the time of creation so that when all
-	 * queues are gone, group can be destroyed.
-	 */
-	cfq_put_cfqg(cfqg);
-}
-
 static void cfq_release_cfq_groups(struct cfq_data *cfqd)
 {
 	struct hlist_node *pos, *n;
@@ -1668,9 +1877,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (cfqq == cfqd->active_queue)
 		cfqd->active_queue = NULL;
 
-	if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
-		cfqd->grp_service_tree.active = NULL;
-
 	if (cfqd->active_cic) {
 		put_io_context(cfqd->active_cic->ioc);
 		cfqd->active_cic = NULL;
@@ -2173,17 +2379,26 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	cfqd->noidle_tree_requires_idle = false;
 }
 
+
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_group *root_group = &cfqd->root_group;
+	struct cfq_rb_root *st = &root_group->grp_service_tree;
 	struct cfq_group *cfqg;
+	struct io_sched_entity *se;
 
-	if (RB_EMPTY_ROOT(&st->rb))
-		return NULL;
-	cfqg = cfq_rb_first_group(st);
-	st->active = &cfqg->rb_node;
-	update_min_vdisktime(st);
-	return cfqg;
+	do {
+		se = cfq_rb_first_se(st);
+		if (!se)
+			return NULL;
+		st->active = &se->rb_node;
+		update_min_vdisktime(st);
+		cfqg = cfqg_of_qse(se);
+		if (cfqg)
+			return cfqg;
+		cfqg = cfqg_of_gse(se);
+		st = &cfqg->grp_service_tree;
+	} while (1);
 }
 
 static void cfq_choose_cfqg(struct cfq_data *cfqd)
@@ -2215,15 +2430,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 	if (!cfqq)
 		goto new_queue;
 
+
 	if (!cfqd->rq_queued)
 		return NULL;
 
+
 	/*
 	 * We were waiting for group to get backlogged. Expire the queue
 	 */
 	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
 		goto expire;
 
+
 	/*
 	 * The active queue has run out of time, expire it and select new.
 	 */
@@ -2245,6 +2463,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 			goto check_group_idle;
 	}
 
+
 	/*
 	 * The active queue has requests and isn't expired, allow it to
 	 * dispatch.
@@ -2252,6 +2471,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
 		goto keep_queue;
 
+
 	/*
 	 * If another queue has a request waiting within our mean seek
 	 * distance, let it run.  The expire code will check for close
@@ -2505,6 +2725,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 	}
 
 	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
+
 	return 1;
 }
 
@@ -3854,17 +4075,25 @@ static void *cfq_init_queue(struct request_queue *q)
 
 	cfqd->cic_index = i;
 
-	/* Init root service tree */
-	cfqd->grp_service_tree = CFQ_RB_ROOT;
-
 	/* Init root group */
 	cfqg = &cfqd->root_group;
+	cfqg->grp_service_tree = CFQ_RB_ROOT;
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->rb_node);
-
+	cfqg->group_se.is_group_se = true;
+	RB_CLEAR_NODE(&cfqg->group_se.rb_node);
+	cfqg->group_se.on_st = false;
 	/* Give preference to root group over other groups */
-	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
+	cfqg->group_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
+	cfqg->group_se.parent = NULL;
+	cfqg->group_se.st = NULL;
+
+	cfqg->queue_se.is_group_se = false;
+	RB_CLEAR_NODE(&cfqg->queue_se.rb_node);
+	cfqg->queue_se.on_st = false;
+	cfqg->queue_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
+	cfqg->queue_se.parent = &cfqg->group_se;
+	cfqg->queue_se.st = &cfqg->grp_service_tree;
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	/*
-- 
1.6.5.2

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-08-30  6:50 [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support Gui Jianfeng
@ 2010-08-30 18:20 ` Chad Talbott
  2010-08-31  0:35   ` Gui Jianfeng
  2010-08-30 20:36 ` Vivek Goyal
  1 sibling, 1 reply; 19+ messages in thread
From: Chad Talbott @ 2010-08-30 18:20 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Vivek Goyal, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, Nauman Rafique, linux kernel mailing list

On Sun, Aug 29, 2010 at 11:50 PM, Gui Jianfeng
<guijianfeng@cn.fujitsu.com> wrote:
>   o Each cfq group has its own group service tree.
>   o Each cfq group contains a "group schedule entity" (gse) that
>     schedules on the parent cfq group's service tree.
>   o Each cfq group contains a "queue schedule entity" (qse) that
>     represents all cfqqs located in this cfq group. It schedules
>     on this group's own service tree. For the time being, the root
>     group's qse has weight 1000, and a subgroup's qse has weight 500.
>   o All gses and the qse that belong to the same cfq group schedule
>     on that group's service tree.

Overall, I like this approach.  It's similar to what we are using in
earlier hierarchical patches.  Can you improve the naming a little
bit?  "qse" and "gse" look almost identical and break my brain.
Maybe "queue_entity" and "group_entity"?  If entity is too much to
type, then "item"?

Chad

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-08-30  6:50 [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support Gui Jianfeng
  2010-08-30 18:20 ` Chad Talbott
@ 2010-08-30 20:36 ` Vivek Goyal
  2010-08-31  0:29   ` Gui Jianfeng
  1 sibling, 1 reply; 19+ messages in thread
From: Vivek Goyal @ 2010-08-30 20:36 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo,
	Nauman Rafique, linux kernel mailing list

On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
> Hi All,
> 
> This patch enables cfq group hierarchical scheduling.
> 
> With this patch, you can create cgroup directories deeper than level 1.
> I/O bandwidth is now distributed hierarchically. For example, suppose
> we create cgroup directories as follows (the numbers represent weights):
> 
>             Root grp
>            /       \
>        grp_1(100) grp_2(400)
>        /    \ 
>   grp_3(200) grp_4(300)
> 
> If grp_2, grp_3 and grp_4 are contending for I/O bandwidth,
> grp_2 will get 80% of the total bandwidth.
> Of the 20% that goes to grp_1's subtree, grp_3 gets 8% (20% * 40%)
> and grp_4 gets 12% (20% * 60%).
> 
> Design:
>   o Each cfq group has its own group service tree.
>   o Each cfq group contains a "group schedule entity" (gse) that
>     schedules on the parent cfq group's service tree.
>   o Each cfq group contains a "queue schedule entity" (qse) that
>     represents all cfqqs located in this cfq group. It schedules
>     on this group's own service tree. For the time being, the root
>     group's qse has weight 1000, and a subgroup's qse has weight 500.
>   o All gses and the qse that belong to the same cfq group schedule
>     on that group's service tree.

Hi Gui,

Thanks for the patch. I have a few questions.

- So what does the hierarchy look like w.r.t. the root group? Something
  as follows?

			root
		       / | \
		     q1  q2 G1

Assume there are two processes doing IO in the root group; q1 and q2 are
the cfqq queues for those processes, and G1 is a cgroup created by the user.

If yes, then what algorithm do you use to schedule between q1, q2
and G1? IOW, currently we have two algorithms operating in CFQ: one for
cfqqs and the other for groups. The group algorithm does not use the
cfq_slice_offset() logic.

They are different because CFQ wanted to provide ioprio service
differentiation on SSDs, where slice idle is effectively zero. But that
slice_offset() logic is approximate at best and is not suitable for
group scheduling.

I have yet to go through the patch in detail, so I thought I would ask
how you have handled the above.

Vivek

> [full patch snipped]

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-08-30 20:36 ` Vivek Goyal
@ 2010-08-31  0:29   ` Gui Jianfeng
  2010-08-31 12:57     ` Vivek Goyal
  0 siblings, 1 reply; 19+ messages in thread
From: Gui Jianfeng @ 2010-08-31  0:29 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo,
	Nauman Rafique, linux kernel mailing list

Vivek Goyal wrote:
> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
>> Hi All,
>>
>> This patch enables cfq group hierarchical scheduling.
>>
>> With this patch, you can create cgroup directories deeper than level 1.
>> I/O bandwidth is now distributed hierarchically. For example, suppose
>> we create cgroup directories as follows (the numbers represent weights):
>>
>>             Root grp
>>            /       \
>>        grp_1(100) grp_2(400)
>>        /    \ 
>>   grp_3(200) grp_4(300)
>>
>> If grp_2, grp_3 and grp_4 are contending for I/O bandwidth,
>> grp_2 will get 80% of the total bandwidth.
>> Of the 20% that goes to grp_1's subtree, grp_3 gets 8% (20% * 40%)
>> and grp_4 gets 12% (20% * 60%).
>>
>> Design:
>>   o Each cfq group has its own group service tree.
>>   o Each cfq group contains a "group schedule entity" (gse) that
>>     schedules on the parent cfq group's service tree.
>>   o Each cfq group contains a "queue schedule entity" (qse) that
>>     represents all cfqqs located in this cfq group. It schedules
>>     on this group's own service tree. For the time being, the root
>>     group's qse has weight 1000, and a subgroup's qse has weight 500.
>>   o All gses and the qse that belong to the same cfq group schedule
>>     on that group's service tree.
> 
> Hi Gui,
> 
> Thanks for the patch. I have a few questions.
> 
> - So what does the hierarchy look like w.r.t. the root group? Something
>   as follows?
> 
> 			root
> 		       / | \
> 		     q1  q2 G1
> 
> Assume there are two processes doing IO in the root group; q1 and q2 are
> the cfqq queues for those processes, and G1 is a cgroup created by the user.
> 
> If yes, then what algorithm do you use to schedule between q1, q2
> and G1? IOW, currently we have two algorithms operating in CFQ: one for
> cfqqs and the other for groups. The group algorithm does not use the
> cfq_slice_offset() logic.

Hi Vivek,

This patch doesn't break the original scheduling logic, that is,
cfqg => st => cfqq. If q1 and q2 are in the root group, I treat the
q1/q2 bundle as a queue schedule entity, and it schedules on the root
group's service tree alongside G1's group entity, as follows:

                         root group
                        /         \
                    qse(q1,q2)    gse(G1)
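
cfq_get_next_cfqg() then just picks the leftmost entity on a group's
service tree: if it is a qse, the cfqqs of that group are served; if
it is a gse, it descends into that child group's service tree and
repeats.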

Thanks,
Gui

> 
> They are different because CFQ wanted to provide ioprio service
> differentiation on SSDs, where slice idle is effectively zero. But that
> slice_offset() logic is approximate at best and is not suitable for
> group scheduling.
> 
> I have yet to go through the patch in detail, so I thought I would ask
> how you have handled the above.
> 
> Vivek
> 
>> [full patch snipped]
>>  {
>>  	if (!root->left)
>>  		root->left = rb_first(&root->rb);
>>  
>>  	if (root->left)
>> -		return rb_entry_cfqg(root->left);
>> +		return rb_entry_se(root->left);
>>  
>>  	return NULL;
>>  }
>> @@ -819,25 +857,25 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
>>  }
>>  
>>  static inline s64
>> -cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
>> +se_key(struct cfq_rb_root *st, struct io_sched_entity *se)
>>  {
>> -	return cfqg->vdisktime - st->min_vdisktime;
>> +	return se->vdisktime - st->min_vdisktime;
>>  }
>>  
>>  static void
>> -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
>> +__io_sched_entity_add(struct cfq_rb_root *st, struct io_sched_entity *se)
>>  {
>>  	struct rb_node **node = &st->rb.rb_node;
>>  	struct rb_node *parent = NULL;
>> -	struct cfq_group *__cfqg;
>> -	s64 key = cfqg_key(st, cfqg);
>> +	struct io_sched_entity *__se;
>> +	s64 key = se_key(st, se);
>>  	int left = 1;
>>  
>>  	while (*node != NULL) {
>>  		parent = *node;
>> -		__cfqg = rb_entry_cfqg(parent);
>> +		__se = rb_entry_se(parent);
>>  
>> -		if (key < cfqg_key(st, __cfqg))
>> +		if (key < se_key(st, __se))
>>  			node = &parent->rb_left;
>>  		else {
>>  			node = &parent->rb_right;
>> @@ -846,47 +884,80 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
>>  	}
>>  
>>  	if (left)
>> -		st->left = &cfqg->rb_node;
>> +		st->left = &se->rb_node;
>>  
>> -	rb_link_node(&cfqg->rb_node, parent, node);
>> -	rb_insert_color(&cfqg->rb_node, &st->rb);
>> +	rb_link_node(&se->rb_node, parent, node);
>> +	rb_insert_color(&se->rb_node, &st->rb);
>>  }
>>  
>> -static void
>> -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +static void io_sched_entity_add(struct cfq_rb_root *st,
>> +				struct io_sched_entity *se)
>>  {
>> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> -	struct cfq_group *__cfqg;
>>  	struct rb_node *n;
>> +	struct io_sched_entity *__se;
>>  
>> -	cfqg->nr_cfqq++;
>> -	if (cfqg->on_st)
>> -		return;
>>  
>> +	if (se->on_st)
>> +		return;
>>  	/*
>>  	 * Currently put the group at the end. Later implement something
>>  	 * so that groups get lesser vtime based on their weights, so that
>>  	 * if group does not loose all if it was not continously backlogged.
>>  	 */
>> -	n = rb_last(&st->rb);
>> +	n = rb_last(&se->st->rb);
>>  	if (n) {
>> -		__cfqg = rb_entry_cfqg(n);
>> -		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
>> +		__se = rb_entry_se(n);
>> +		se->vdisktime = __se->vdisktime + CFQ_IDLE_DELAY;
>>  	} else
>> -		cfqg->vdisktime = st->min_vdisktime;
>> +		se->vdisktime = st->min_vdisktime;
>>  
>> -	__cfq_group_service_tree_add(st, cfqg);
>> -	cfqg->on_st = true;
>> -	st->total_weight += cfqg->weight;
>> +	__io_sched_entity_add(se->st, se);
>> +	se->on_st = true;
>> +	st->total_weight += se->weight;
>> +}
>> +
>> +static void
>> +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +{
>> +	struct io_sched_entity *gse = &cfqg->group_se;
>> +	struct io_sched_entity *qse = &cfqg->queue_se;
>> +	struct cfq_group *__cfqg;
>> +
>> +	cfqg->nr_cfqq++;
>> +
>> +	io_sched_entity_add(qse->st, qse);
>> +
>> +	while (gse && gse->parent) {
>> +		if (gse->on_st)
>> +			return;
>> +		io_sched_entity_add(gse->st, gse);
>> +		gse = gse->parent;
>> +		__cfqg = cfqg_of_gse(gse);
>> +		__cfqg->nr_subgp++;
>> +	}
>> +}
>> +
>> +static void io_sched_entity_del(struct io_sched_entity *se)
>> +{
>> +	if (!RB_EMPTY_NODE(&se->rb_node))
>> +		cfq_rb_erase(&se->rb_node, se->st);
>> +
>> +	se->on_st = false;
>> +	se->st->total_weight -= se->weight;
>>  }
>>  
>>  static void
>>  cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> +	struct io_sched_entity *gse = &cfqg->group_se;
>> +	struct io_sched_entity *qse = &cfqg->queue_se;
>> +	struct cfq_group *__cfqg, *p_cfqg;
>> +
>> +	if (gse->st && gse->st->active == &gse->rb_node)
>> +		gse->st->active = NULL;
>>  
>> -	if (st->active == &cfqg->rb_node)
>> -		st->active = NULL;
>> +	if (qse->st && qse->st->active == &qse->rb_node)
>> +		qse->st->active = NULL;
>>  
>>  	BUG_ON(cfqg->nr_cfqq < 1);
>>  	cfqg->nr_cfqq--;
>> @@ -895,13 +966,25 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  	if (cfqg->nr_cfqq)
>>  		return;
>>  
>> -	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>> -	cfqg->on_st = false;
>> -	st->total_weight -= cfqg->weight;
>> -	if (!RB_EMPTY_NODE(&cfqg->rb_node))
>> -		cfq_rb_erase(&cfqg->rb_node, st);
>> -	cfqg->saved_workload_slice = 0;
>> -	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>> +	/* dequeue queue se from group */
>> +	io_sched_entity_del(qse);
>> +
>> +	if (cfqg->nr_subgp)
>> +		return;
>> +
>> +	/* prevent from dequeuing root group */
>> +	while (gse && gse->parent) {
>> +		__cfqg = cfqg_of_gse(gse);
>> +		p_cfqg = cfqg_of_gse(gse->parent);
>> +		io_sched_entity_del(gse);
>> +		cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>> +		cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>> +		__cfqg->saved_workload_slice = 0;
>> +		gse = gse->parent;
>> +		p_cfqg->nr_subgp--;
>> +		if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>> +			return;
>> +	}
>>  }
>>  
>>  static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>> @@ -933,7 +1016,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>>  static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>  				struct cfq_queue *cfqq)
>>  {
>> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> +	struct io_sched_entity *gse = &cfqg->group_se;
>> +	struct io_sched_entity *qse = &cfqg->queue_se;
>>  	unsigned int used_sl, charge;
>>  	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
>>  			- cfqg->service_tree_idle.count;
>> @@ -946,10 +1030,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>  	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
>>  		charge = cfqq->allocated_slice;
>>  
>> -	/* Can't update vdisktime while group is on service tree */
>> -	cfq_rb_erase(&cfqg->rb_node, st);
>> -	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
>> -	__cfq_group_service_tree_add(st, cfqg);
>> +	/*
>> +	 *  update queue se's vdisktime.
>> +	 *  Can't update vdisktime while group is on service tree.
>> +	 */
>> +
>> +	cfq_rb_erase(&qse->rb_node, qse->st);
>> +	qse->vdisktime += cfq_scale_slice(charge, qse);
>> +	__io_sched_entity_add(qse->st, qse);
>> +	if (&qse->rb_node == qse->st->active)
>> +		qse->st->active = NULL;
>> +
>> +	while (gse && gse->parent) {
>> +		cfq_rb_erase(&gse->rb_node, gse->st);
>> +		gse->vdisktime += cfq_scale_slice(charge, gse);
>> +		__io_sched_entity_add(gse->st, gse);
>> +		if (&gse->rb_node == gse->st->active)
>> +			gse->st->active = NULL;
>> +		gse = gse->parent;
>> +	}
>>  
>>  	/* This group is being expired. Save the context */
>>  	if (time_after(cfqd->workload_expires, jiffies)) {
>> @@ -960,8 +1059,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>  	} else
>>  		cfqg->saved_workload_slice = 0;
>>  
>> -	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
>> -					st->min_vdisktime);
>> +	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", gse->vdisktime,
>> +		     gse->st->min_vdisktime);
>> +
>>  	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
>>  			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
>>  			iops_mode(cfqd), cfqq->nr_sectors);
>> @@ -977,39 +1077,84 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
>>  	return NULL;
>>  }
>>  
>> +static void cfq_put_cfqg(struct cfq_group *cfqg)
>> +{
>> +	struct cfq_rb_root *st;
>> +	int i, j;
>> +	struct io_sched_entity *gse;
>> +	struct cfq_group *p_cfqg;
>> +
>> +	BUG_ON(atomic_read(&cfqg->ref) <= 0);
>> +	if (!atomic_dec_and_test(&cfqg->ref))
>> +		return;
>> +	for_each_cfqg_st(cfqg, i, j, st)
>> +		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
>> +
>> +	gse = &cfqg->group_se;
>> +	if (gse->parent) {
>> +		p_cfqg = cfqg_of_gse(gse->parent);
>> +		/* Drop the reference taken by children */
>> +		atomic_dec(&p_cfqg->ref);
>> +	}
>> +
>> +	kfree(cfqg);
>> +}
>> +
>> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +{
>> +	/* Something wrong if we are trying to remove same group twice */
>> +	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
>> +
>> +	hlist_del_init(&cfqg->cfqd_node);
>> +
>> +	/*
>> +	 * Put the reference taken at the time of creation so that when all
>> +	 * queues are gone, group can be destroyed.
>> +	 */
>> +	cfq_put_cfqg(cfqg);
>> +}
>> +
>>  void
>>  cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
>>  {
>> -	cfqg_of_blkg(blkg)->weight = weight;
>> +	struct cfq_group *cfqg;
>> +	struct io_sched_entity *gse;
>> +
>> +	cfqg = cfqg_of_blkg(blkg);
>> +	gse = &cfqg->group_se;
>> +	gse->weight = weight;
>>  }
>>  
>> -static struct cfq_group *
>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +static void init_group_queue_entity(struct blkio_cgroup *blkcg,
>> +				   struct cfq_group *cfqg)
>> +{
>> +	struct io_sched_entity *gse = &cfqg->group_se;
>> +	struct io_sched_entity *qse = &cfqg->queue_se;
>> +
>> +	gse->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> +	RB_CLEAR_NODE(&gse->rb_node);
>> +	gse->is_group_se = true;
>> +	gse->on_st = false;
>> +
>> +	/* Set to 500 for the time being */
>> +	qse->weight = 500;
>> +	RB_CLEAR_NODE(&qse->rb_node);
>> +	qse->is_group_se = false;
>> +	qse->on_st = false;
>> +}
>> +
>> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
>> +		      struct cfq_group *cfqg)
>>  {
>> -	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> -	struct cfq_group *cfqg = NULL;
>> -	void *key = cfqd;
>>  	int i, j;
>>  	struct cfq_rb_root *st;
>> -	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>  	unsigned int major, minor;
>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>  
>> -	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> -	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> -		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> -		cfqg->blkg.dev = MKDEV(major, minor);
>> -		goto done;
>> -	}
>> -	if (cfqg || !create)
>> -		goto done;
>> -
>> -	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> -	if (!cfqg)
>> -		goto done;
>> +	cfqg->grp_service_tree = CFQ_RB_ROOT;
>>  
>>  	for_each_cfqg_st(cfqg, i, j, st)
>>  		*st = CFQ_RB_ROOT;
>> -	RB_CLEAR_NODE(&cfqg->rb_node);
>>  
>>  	/*
>>  	 * Take the initial reference that will be released on destroy
>> @@ -1023,11 +1168,102 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>  	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>  	cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>>  					MKDEV(major, minor));
>> -	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> -
>> +	init_group_queue_entity(blkcg, cfqg);
>>  	/* Add group on cfqd list */
>>  	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> +}
>> +
>> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +{
>> +	if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
>> +		cfq_destroy_cfqg(cfqd, cfqg);
>> +}
>> +
>> +static void cfqg_set_parent(struct cfq_group *cfqg, struct cfq_group *p_cfqg)
>> +{
>> +	struct io_sched_entity *gse = &cfqg->group_se;
>> +	struct io_sched_entity *qse = &cfqg->queue_se;
>> +	struct io_sched_entity *p_gse = &p_cfqg->group_se;
>> +
>> +	gse->parent = p_gse;
>> +	gse->st = &p_cfqg->grp_service_tree;
>> +
>> +	qse->parent = gse;
>> +	qse->st = &cfqg->grp_service_tree;
>> +
>> +	/* child reference */
>> +	atomic_inc(&p_cfqg->ref);
>> +}
>> +
>> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>> +{
>> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> +	struct blkio_cgroup *p_blkcg;
>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> +	unsigned int major, minor;
>> +	struct cfq_group *cfqg, *p_cfqg;
>> +	void *key = cfqd;
>> +	int ret;
>> +
>> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> +	if (cfqg) {
>> +		if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> +			sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> +			cfqg->blkg.dev = MKDEV(major, minor);
>> +		}
>> +		/* chain has already been built */
>> +		return 0;
>> +	}
>> +
>> +	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> +	if (!cfqg)
>> +		return -1;
>> +
>> +	init_cfqg(cfqd, blkcg, cfqg);
>> +
>> +	/* Already to the top group */
>> +	if (!cgroup->parent)
>> +		return 0;
>> +
>> +	ret = cfqg_chain_alloc(cfqd, cgroup->parent);
>> +	if (ret == -1) {
>> +		uninit_cfqg(cfqd, cfqg);
>> +		return -1;
>> +	}
>> +
>> +	p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
>> +	p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
>> +	BUG_ON(p_cfqg == NULL);
>> +
>> +	cfqg_set_parent(cfqg, p_cfqg);
>> +	return 0;
>> +}
>> +
>> +static struct cfq_group *
>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +{
>> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> +	struct cfq_group *cfqg = NULL;
>> +	void *key = cfqd;
>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> +	unsigned int major, minor;
>> +	int ret;
>>  
>> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> +		cfqg->blkg.dev = MKDEV(major, minor);
>> +		goto done;
>> +	}
>> +	if (cfqg || !create)
>> +		goto done;
>> +
>> +	ret = cfqg_chain_alloc(cfqd, cgroup);
>> +	if (!ret) {
>> +		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> +		BUG_ON(cfqg == NULL);
>> +		goto done;
>> +	}
>>  done:
>>  	return cfqg;
>>  }
>> @@ -1067,33 +1303,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
>>  	atomic_inc(&cfqq->cfqg->ref);
>>  }
>>  
>> -static void cfq_put_cfqg(struct cfq_group *cfqg)
>> -{
>> -	struct cfq_rb_root *st;
>> -	int i, j;
>> -
>> -	BUG_ON(atomic_read(&cfqg->ref) <= 0);
>> -	if (!atomic_dec_and_test(&cfqg->ref))
>> -		return;
>> -	for_each_cfqg_st(cfqg, i, j, st)
>> -		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
>> -	kfree(cfqg);
>> -}
>> -
>> -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> -{
>> -	/* Something wrong if we are trying to remove same group twice */
>> -	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
>> -
>> -	hlist_del_init(&cfqg->cfqd_node);
>> -
>> -	/*
>> -	 * Put the reference taken at the time of creation so that when all
>> -	 * queues are gone, group can be destroyed.
>> -	 */
>> -	cfq_put_cfqg(cfqg);
>> -}
>> -
>>  static void cfq_release_cfq_groups(struct cfq_data *cfqd)
>>  {
>>  	struct hlist_node *pos, *n;
>> @@ -1668,9 +1877,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>  	if (cfqq == cfqd->active_queue)
>>  		cfqd->active_queue = NULL;
>>  
>> -	if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
>> -		cfqd->grp_service_tree.active = NULL;
>> -
>>  	if (cfqd->active_cic) {
>>  		put_io_context(cfqd->active_cic->ioc);
>>  		cfqd->active_cic = NULL;
>> @@ -2173,17 +2379,26 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  	cfqd->noidle_tree_requires_idle = false;
>>  }
>>  
>> +
>>  static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
>>  {
>> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> +	struct cfq_group *root_group = &cfqd->root_group;
>> +	struct cfq_rb_root *st = &root_group->grp_service_tree;
>>  	struct cfq_group *cfqg;
>> +	struct io_sched_entity *se;
>>  
>> -	if (RB_EMPTY_ROOT(&st->rb))
>> -		return NULL;
>> -	cfqg = cfq_rb_first_group(st);
>> -	st->active = &cfqg->rb_node;
>> -	update_min_vdisktime(st);
>> -	return cfqg;
>> +	do {
>> +		se = cfq_rb_first_se(st);
>> +		if (!se)
>> +			return NULL;
>> +		st->active = &se->rb_node;
>> +		update_min_vdisktime(st);
>> +		cfqg = cfqg_of_qse(se);
>> +		if (cfqg)
>> +			return cfqg;
>> +		cfqg = cfqg_of_gse(se);
>> +		st = &cfqg->grp_service_tree;
>> +	} while (1);
>>  }
>>  
>>  static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> @@ -2215,15 +2430,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>>  	if (!cfqq)
>>  		goto new_queue;
>>  
>> +
>>  	if (!cfqd->rq_queued)
>>  		return NULL;
>>  
>> +
>>  	/*
>>  	 * We were waiting for group to get backlogged. Expire the queue
>>  	 */
>>  	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
>>  		goto expire;
>>  
>> +
>>  	/*
>>  	 * The active queue has run out of time, expire it and select new.
>>  	 */
>> @@ -2245,6 +2463,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>>  			goto check_group_idle;
>>  	}
>>  
>> +
>>  	/*
>>  	 * The active queue has requests and isn't expired, allow it to
>>  	 * dispatch.
>> @@ -2252,6 +2471,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>>  	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
>>  		goto keep_queue;
>>  
>> +
>>  	/*
>>  	 * If another queue has a request waiting within our mean seek
>>  	 * distance, let it run.  The expire code will check for close
>> @@ -2505,6 +2725,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
>>  	}
>>  
>>  	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
>> +
>>  	return 1;
>>  }
>>  
>> @@ -3854,17 +4075,25 @@ static void *cfq_init_queue(struct request_queue *q)
>>  
>>  	cfqd->cic_index = i;
>>  
>> -	/* Init root service tree */
>> -	cfqd->grp_service_tree = CFQ_RB_ROOT;
>> -
>>  	/* Init root group */
>>  	cfqg = &cfqd->root_group;
>> +	cfqg->grp_service_tree = CFQ_RB_ROOT;
>>  	for_each_cfqg_st(cfqg, i, j, st)
>>  		*st = CFQ_RB_ROOT;
>> -	RB_CLEAR_NODE(&cfqg->rb_node);
>> -
>> +	cfqg->group_se.is_group_se = true;
>> +	RB_CLEAR_NODE(&cfqg->group_se.rb_node);
>> +	cfqg->group_se.on_st = false;
>>  	/* Give preference to root group over other groups */
>> -	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
>> +	cfqg->group_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
>> +	cfqg->group_se.parent = NULL;
>> +	cfqg->group_se.st = NULL;
>> +
>> +	cfqg->queue_se.is_group_se = false;
>> +	RB_CLEAR_NODE(&cfqg->queue_se.rb_node);
>> +	cfqg->queue_se.on_st = false;
>> +	cfqg->queue_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
>> +	cfqg->queue_se.parent = &cfqg->group_se;
>> +	cfqg->queue_se.st = &cfqg->grp_service_tree;
>>  
>>  #ifdef CONFIG_CFQ_GROUP_IOSCHED
>>  	/*
>> -- 
>> 1.6.5.2
>>
>>
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-08-30 18:20 ` Chad Talbott
@ 2010-08-31  0:35   ` Gui Jianfeng
  0 siblings, 0 replies; 19+ messages in thread
From: Gui Jianfeng @ 2010-08-31  0:35 UTC (permalink / raw)
  To: Chad Talbott
  Cc: Vivek Goyal, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, Nauman Rafique, linux kernel mailing list

Chad Talbott wrote:
> On Sun, Aug 29, 2010 at 11:50 PM, Gui Jianfeng
> <guijianfeng@cn.fujitsu.com> wrote:
>>  o Each cfq group has its own group service tree.
>>  o Each cfq group contains a "group schedule entity" (gse) that
>>    schedules on parent cfq group's service tree.
>>  o Each cfq group contains a "queue schedule entity"(qse), it
>>    represents all cfqqs located on this cfq group. It schedules
>>    on this group's service tree. For the time being, root group
>>    qse's weight is 1000, and subgroup qse's weight is 500.
>>  o All gses and the qse which belong to the same cfq group schedule
>>    on the same group service tree.
> 
> Overall, I like this approach.  It's similar to what we are using with
> earlier hierarchical patches.  Can you improve the naming a little
> bit?  "qse" and "gse" look almost identical and break my brain.
> Maybe "queue_entity" and "group_entity"?  If entity is too much to
> type, then "item"?

Thanks for reviewing, Chad.
I will modify the naming.

Thanks,
Gui

> 
> Chad
> 
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-08-31  0:29   ` Gui Jianfeng
@ 2010-08-31 12:57     ` Vivek Goyal
  2010-08-31 15:40       ` Nauman Rafique
  2010-09-01  8:48       ` Gui Jianfeng
  0 siblings, 2 replies; 19+ messages in thread
From: Vivek Goyal @ 2010-08-31 12:57 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo,
	Nauman Rafique, linux kernel mailing list

On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
> >> Hi All,
> >>
> >> This patch enables cfq group hierarchical scheduling.
> >>
> >> With this patch, you can create a cgroup directory deeper than level 1.
> >> Now, I/O bandwidth is distributed in a hierarchical way. For example,
> >> we create cgroup directories as follows (the number represents weight):
> >>
> >>             Root grp
> >>            /       \
> >>        grp_1(100) grp_2(400)
> >>        /    \ 
> >>   grp_3(200) grp_4(300)
> >>
> >> If grp_2, grp_3 and grp_4 are contending for I/O bandwidth,
> >> grp_2 will share 80% of the total bandwidth.
> >> For the sub-groups, grp_3 shares 8% (20% * 40%) and grp_4 shares
> >> 12% (20% * 60%).
> >>
> >> Design:
> >>   o Each cfq group has its own group service tree. 
> >>   o Each cfq group contains a "group schedule entity" (gse) that 
> >>     schedules on parent cfq group's service tree.
> >>   o Each cfq group contains a "queue schedule entity"(qse), it
> >>     represents all cfqqs located on this cfq group. It schedules
> >>     on this group's service tree. For the time being, root group
> >>     qse's weight is 1000, and subgroup qse's weight is 500.
> >>   o All gses and the qse which belong to the same cfq group schedule
> >>     on the same group service tree.
> > 
> > Hi Gui,
> > 
> > Thanks for the patch. I have few questions.
> > 
> > - So what does the hierarchy look like w.r.t. the root group? Something
> >   as follows?
> > 
> > 
> > 			root
> > 		       / | \
> > 		     q1  q2 G1
> > 
> > Assume there are two processes doing IO in the root group; q1 and q2 are
> > cfqq queues for those processes and G1 is the cgroup created by the user.
> > 
> > If yes, then what algorithm do you use to do scheduling between q1, q2
> > and G1? IOW, currently we have two algorithms operating in CFQ: one for
> > cfqqs and the other for groups. The group algorithm does not use the
> > cfq_slice_offset() logic.
> 
> Hi Vivek,
> 
> This patch doesn't break the original scheduling logic, that is, cfqg => st => cfqq.
> If q1 and q2 are in the root group, I treat the q1 and q2 bundle as a queue sched
> entity, and it schedules on the root group's service tree along with G1, as follows:
> 
>                          root group
>                         /         \
>                     qse(q1,q2)    gse(G1)
> 

Ok, that's interesting. That raises another question: how should the
hierarchy look? IOW, how should queues and groups be treated in the
hierarchy?

The CFS cpu scheduler treats queues and groups at the same level, as
follows:

			root
			/ | \
		       q1 q2 G1

In the past I had raised this question, and Jens and Corrado liked
treating queues and groups at the same level.

Logically, q1, q2 and G1 are all children of root, so it makes sense to
treat them at the same level and not bundle q1 and q2 into a single
entity.
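
To make that concrete, one sketch of what it could look like (purely
hypothetical, this is not what the current patch does, and
cfq_queue_entity is a made-up structure) is to give each cfqq its own
schedule entity, so q1, q2 and G1 are all keyed by vdisktime on the
same service tree:

/* Hypothetical: queues and groups scheduled on one service tree */
struct cfq_queue_entity {
	struct io_sched_entity se;	/* lives on parent group's st */
	struct cfq_queue *cfqq;
};

static void cfq_queue_entity_add(struct cfq_group *parent,
				 struct cfq_queue_entity *qe)
{
	qe->se.is_group_se = false;
	qe->se.parent = &parent->group_se;
	qe->se.st = &parent->grp_service_tree;
	io_sched_entity_add(qe->se.st, &qe->se);
}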

One possible way forward could be this.

- Treat queues and groups at the same level (like CFS).

- Get rid of the cfq_slice_offset() logic. That means that with idling
  disabled, there will be no ioprio difference between cfq queues. As of
  today that logic helps in so few situations that I would not mind
  getting rid of it, provided Jens agrees.

- This new scheme will break the existing semantics of the root group
  being at the same level as child groups. To avoid that, we can probably
  implement two modes (flat and hierarchical), similar to what the memory
  cgroup controller has done: maybe one "use_hierarchy" tunable in the
  blkio root cgroup. By default everything would be in flat mode, and if
  the user wants hierarchical control, he sets use_hierarchy in the root
  group.

  I think the memory controller provides the "use_hierarchy" tunable in
  each cgroup. I am not sure why we need it in each cgroup and not just
  in the root cgroup.
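
Just to illustrate the idea (completely hypothetical, nothing in this
patch implements it, and blkcg_use_hierarchy() is a made-up helper),
the tunable could simply gate how a newly created group gets linked:

/* Hypothetical: flat vs hierarchical linkage of a new cfq group */
static void cfqg_link(struct cfq_data *cfqd, struct cfq_group *cfqg,
		      struct cfq_group *p_cfqg)
{
	if (blkcg_use_hierarchy())
		/* hierarchical: link under the cgroup parent */
		cfqg_set_parent(cfqg, p_cfqg);
	else
		/* flat: everything hangs off the root group */
		cfqg_set_parent(cfqg, &cfqd->root_group);
}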

Thanks
Vivek 

> Thanks,
> Gui
> 
> > 
> > They are different because CFQ wanted to provide ioprio service
> > differentiation on SSDs, where slice idle is effectively zero. But
> > that slice_offset() logic is approximate at best and is not suitable
> > for group scheduling.
> > 
> > I have yet to go through the patches in detail, so I thought I would
> > ask how you have handled the above.
> > 
> > Vivek
> > 
> >>   o cfq groups are allocated recursively: when a cfq group needs to
> >>     be allocated, the upper-level cfq groups are also allocated.
> >>   o When a cfq group is served, charge not only this cfq group but
> >>     also its ancestors.
> >>  block/cfq-iosched.c |  477 +++++++++++++++++++++++++++++++++++++-------------
> >>  2 files changed, 353 insertions(+), 128 deletions(-)
> >>
> >> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> >> index 2fef1ef..aa06cb3 100644
> >> --- a/block/blk-cgroup.c
> >> +++ b/block/blk-cgroup.c
> >> @@ -964,10 +964,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
> >>  		goto done;
> >>  	}
> >>  
> >> -	/* Currently we do not support hierarchy deeper than two level (0,1) */
> >> -	if (parent != cgroup->top_cgroup)
> >> -		return ERR_PTR(-EPERM);
> >> -
> >>  	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
> >>  	if (!blkcg)
> >>  		return ERR_PTR(-ENOMEM);
> >> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> >> index f65c6f0..4660d89 100644
> >> --- a/block/cfq-iosched.c
> >> +++ b/block/cfq-iosched.c
> >> @@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida);
> >>  #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
> >>  
> >>  #define sample_valid(samples)	((samples) > 80)
> >> -#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
> >> +#define rb_entry_se(node)	\
> >> +	rb_entry((node), struct io_sched_entity, rb_node)
> >>  
> >>  /*
> >>   * Most of our rbtree usage is for sorting with min extraction, so
> >> @@ -171,19 +172,40 @@ enum wl_type_t {
> >>  	SYNC_WORKLOAD = 2
> >>  };
> >>  
> >> -/* This is per cgroup per device grouping structure */
> >> -struct cfq_group {
> >> +/* 
> >> + * This's the schedule entity which is scheduled on group service tree.
> >> + * It represents shedule structure of a cfq group, or a bundle of cfq
> >> + * queues that locate in a cfq group.
> >> + */
> >> +struct io_sched_entity {
> >> +	struct cfq_rb_root *st;
> >> +
> >>  	/* group service_tree member */
> >>  	struct rb_node rb_node;
> >> -
> >> -	/* group service_tree key */
> >>  	u64 vdisktime;
> >>  	unsigned int weight;
> >>  	bool on_st;
> >> +	bool is_group_se;
> >> +	struct io_sched_entity *parent;
> >> +};
> >> +
> >> +/* This is per cgroup per device grouping structure */
> >> +struct cfq_group {
> >> +	/* cfq group sched entity */
> >> +	struct io_sched_entity group_se;
> >> +
> >> +	/* sched entity for cfqqs */
> >> +	struct io_sched_entity queue_se;
> >> +
> >> +	/* Service tree for cfq_groups and cfqqs set*/
> >> +	struct cfq_rb_root grp_service_tree;
> >>  
> >>  	/* number of cfqq currently on this group */
> >>  	int nr_cfqq;
> >>  
> >> +	/* number of sub cfq groups */
> >> +	int nr_subgp;
> >> +
> >>  	/* Per group busy queus average. Useful for workload slice calc. */
> >>  	unsigned int busy_queues_avg[2];
> >>  	/*
> >> @@ -210,8 +232,6 @@ struct cfq_group {
> >>   */
> >>  struct cfq_data {
> >>  	struct request_queue *queue;
> >> -	/* Root service tree for cfq_groups */
> >> -	struct cfq_rb_root grp_service_tree;
> >>  	struct cfq_group root_group;
> >>  
> >>  	/*
> >> @@ -399,6 +419,22 @@ static inline bool iops_mode(struct cfq_data *cfqd)
> >>  		return false;
> >>  }
> >>  
> >> +static inline struct cfq_group *cfqg_of_gse(struct io_sched_entity *se)
> >> +{
> >> +	if (se->is_group_se)
> >> +		return container_of(se, struct cfq_group, group_se);
> >> +	else
> >> +		return NULL;
> >> +}
> >> +
> >> +static inline struct cfq_group *cfqg_of_qse(struct io_sched_entity *se)
> >> +{
> >> +	if (!se->is_group_se)
> >> +		return container_of(se, struct cfq_group, queue_se);
> >> +	else
> >> +		return NULL;
> >> +}
> >> +
> >>  static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
> >>  {
> >>  	if (cfq_class_idle(cfqq))
> >> @@ -522,12 +558,13 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> >>  	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
> >>  }
> >>  
> >> -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
> >> +static inline u64 cfq_scale_slice(unsigned long delta,
> >> +				  struct io_sched_entity *se)
> >>  {
> >>  	u64 d = delta << CFQ_SERVICE_SHIFT;
> >>  
> >>  	d = d * BLKIO_WEIGHT_DEFAULT;
> >> -	do_div(d, cfqg->weight);
> >> +	do_div(d, se->weight);
> >>  	return d;
> >>  }
> >>  
> >> @@ -552,16 +589,16 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
> >>  static void update_min_vdisktime(struct cfq_rb_root *st)
> >>  {
> >>  	u64 vdisktime = st->min_vdisktime;
> >> -	struct cfq_group *cfqg;
> >> +	struct io_sched_entity *se;
> >>  
> >>  	if (st->active) {
> >> -		cfqg = rb_entry_cfqg(st->active);
> >> -		vdisktime = cfqg->vdisktime;
> >> +		se = rb_entry_se(st->active);
> >> +		vdisktime = se->vdisktime;
> >>  	}
> >>  
> >>  	if (st->left) {
> >> -		cfqg = rb_entry_cfqg(st->left);
> >> -		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
> >> +		se = rb_entry_se(st->left);
> >> +		vdisktime = min_vdisktime(vdisktime, se->vdisktime);
> >>  	}
> >>  
> >>  	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
> >> @@ -591,9 +628,10 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
> >>  static inline unsigned
> >>  cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  {
> >> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >> +	struct io_sched_entity *gse = &cfqg->queue_se;
> >> +	struct cfq_rb_root *st = gse->st;
> >>  
> >> -	return cfq_target_latency * cfqg->weight / st->total_weight;
> >> +	return cfq_target_latency * gse->weight / st->total_weight;
> >>  }
> >>  
> >>  static inline void
> >> @@ -756,13 +794,13 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
> >>  	return NULL;
> >>  }
> >>  
> >> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
> >> +static struct io_sched_entity *cfq_rb_first_se(struct cfq_rb_root *root)
> >>  {
> >>  	if (!root->left)
> >>  		root->left = rb_first(&root->rb);
> >>  
> >>  	if (root->left)
> >> -		return rb_entry_cfqg(root->left);
> >> +		return rb_entry_se(root->left);
> >>  
> >>  	return NULL;
> >>  }
> >> @@ -819,25 +857,25 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
> >>  }
> >>  
> >>  static inline s64
> >> -cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
> >> +se_key(struct cfq_rb_root *st, struct io_sched_entity *se)
> >>  {
> >> -	return cfqg->vdisktime - st->min_vdisktime;
> >> +	return se->vdisktime - st->min_vdisktime;
> >>  }
> >>  
> >>  static void
> >> -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
> >> +__io_sched_entity_add(struct cfq_rb_root *st, struct io_sched_entity *se)
> >>  {
> >>  	struct rb_node **node = &st->rb.rb_node;
> >>  	struct rb_node *parent = NULL;
> >> -	struct cfq_group *__cfqg;
> >> -	s64 key = cfqg_key(st, cfqg);
> >> +	struct io_sched_entity *__se;
> >> +	s64 key = se_key(st, se);
> >>  	int left = 1;
> >>  
> >>  	while (*node != NULL) {
> >>  		parent = *node;
> >> -		__cfqg = rb_entry_cfqg(parent);
> >> +		__se = rb_entry_se(parent);
> >>  
> >> -		if (key < cfqg_key(st, __cfqg))
> >> +		if (key < se_key(st, __se))
> >>  			node = &parent->rb_left;
> >>  		else {
> >>  			node = &parent->rb_right;
> >> @@ -846,47 +884,80 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
> >>  	}
> >>  
> >>  	if (left)
> >> -		st->left = &cfqg->rb_node;
> >> +		st->left = &se->rb_node;
> >>  
> >> -	rb_link_node(&cfqg->rb_node, parent, node);
> >> -	rb_insert_color(&cfqg->rb_node, &st->rb);
> >> +	rb_link_node(&se->rb_node, parent, node);
> >> +	rb_insert_color(&se->rb_node, &st->rb);
> >>  }
> >>  
> >> -static void
> >> -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> +static void io_sched_entity_add(struct cfq_rb_root *st,
> >> +				struct io_sched_entity *se)
> >>  {
> >> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >> -	struct cfq_group *__cfqg;
> >>  	struct rb_node *n;
> >> +	struct io_sched_entity *__se;
> >>  
> >> -	cfqg->nr_cfqq++;
> >> -	if (cfqg->on_st)
> >> -		return;
> >>  
> >> +	if (se->on_st)
> >> +		return;
> >>  	/*
> >>  	 * Currently put the group at the end. Later implement something
> >>  	 * so that groups get lesser vtime based on their weights, so that
> >>  	 * if group does not loose all if it was not continously backlogged.
> >>  	 */
> >> -	n = rb_last(&st->rb);
> >> +	n = rb_last(&se->st->rb);
> >>  	if (n) {
> >> -		__cfqg = rb_entry_cfqg(n);
> >> -		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
> >> +		__se = rb_entry_se(n);
> >> +		se->vdisktime = __se->vdisktime + CFQ_IDLE_DELAY;
> >>  	} else
> >> -		cfqg->vdisktime = st->min_vdisktime;
> >> +		se->vdisktime = st->min_vdisktime;
> >>  
> >> -	__cfq_group_service_tree_add(st, cfqg);
> >> -	cfqg->on_st = true;
> >> -	st->total_weight += cfqg->weight;
> >> +	__io_sched_entity_add(se->st, se);
> >> +	se->on_st = true;
> >> +	st->total_weight += se->weight;
> >> +}
> >> +
> >> +static void
> >> +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> +{
> >> +	struct io_sched_entity *gse = &cfqg->group_se;
> >> +	struct io_sched_entity *qse = &cfqg->queue_se;
> >> +	struct cfq_group *__cfqg;
> >> +
> >> +	cfqg->nr_cfqq++;
> >> +
> >> +	io_sched_entity_add(qse->st, qse);
> >> +
> >> +	while (gse && gse->parent) {
> >> +		if (gse->on_st)
> >> +			return;
> >> +		io_sched_entity_add(gse->st, gse);
> >> +		gse = gse->parent;
> >> +		__cfqg = cfqg_of_gse(gse);
> >> +		__cfqg->nr_subgp++;
> >> +	}
> >> +}
> >> +
> >> +static void io_sched_entity_del(struct io_sched_entity *se)
> >> +{
> >> +	if (!RB_EMPTY_NODE(&se->rb_node))
> >> +		cfq_rb_erase(&se->rb_node, se->st);
> >> +
> >> +	se->on_st = false;
> >> +	se->st->total_weight -= se->weight;
> >>  }
> >>  
> >>  static void
> >>  cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  {
> >> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >> +	struct io_sched_entity *gse = &cfqg->group_se;
> >> +	struct io_sched_entity *qse = &cfqg->queue_se;
> >> +	struct cfq_group *__cfqg, *p_cfqg;
> >> +
> >> +	if (gse->st && gse->st->active == &gse->rb_node)
> >> +		gse->st->active = NULL;
> >>  
> >> -	if (st->active == &cfqg->rb_node)
> >> -		st->active = NULL;
> >> +	if (qse->st && qse->st->active == &qse->rb_node)
> >> +		qse->st->active = NULL;
> >>  
> >>  	BUG_ON(cfqg->nr_cfqq < 1);
> >>  	cfqg->nr_cfqq--;
> >> @@ -895,13 +966,25 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  	if (cfqg->nr_cfqq)
> >>  		return;
> >>  
> >> -	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
> >> -	cfqg->on_st = false;
> >> -	st->total_weight -= cfqg->weight;
> >> -	if (!RB_EMPTY_NODE(&cfqg->rb_node))
> >> -		cfq_rb_erase(&cfqg->rb_node, st);
> >> -	cfqg->saved_workload_slice = 0;
> >> -	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
> >> +	/* dequeue queue se from group */
> >> +	io_sched_entity_del(qse);
> >> +
> >> +	if (cfqg->nr_subgp)
> >> +		return;
> >> +
> >> +	/* prevent from dequeuing root group */
> >> +	while (gse && gse->parent) {
> >> +		__cfqg = cfqg_of_gse(gse);
> >> +		p_cfqg = cfqg_of_gse(gse->parent);
> >> +		io_sched_entity_del(gse);
> >> +		cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
> >> +		cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
> >> +		__cfqg->saved_workload_slice = 0;
> >> +		gse = gse->parent;
> >> +		p_cfqg->nr_subgp--;
> >> +		if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
> >> +			return;
> >> +	}
> >>  }
> >>  
> >>  static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
> >> @@ -933,7 +1016,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
> >>  static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> >>  				struct cfq_queue *cfqq)
> >>  {
> >> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >> +	struct io_sched_entity *gse = &cfqg->group_se;
> >> +	struct io_sched_entity *qse = &cfqg->queue_se;
> >>  	unsigned int used_sl, charge;
> >>  	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
> >>  			- cfqg->service_tree_idle.count;
> >> @@ -946,10 +1030,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> >>  	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
> >>  		charge = cfqq->allocated_slice;
> >>  
> >> -	/* Can't update vdisktime while group is on service tree */
> >> -	cfq_rb_erase(&cfqg->rb_node, st);
> >> -	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
> >> -	__cfq_group_service_tree_add(st, cfqg);
> >> +	/*
> >> +	 *  update queue se's vdisktime.
> >> +	 *  Can't update vdisktime while group is on service tree.
> >> +	 */
> >> +
> >> +	cfq_rb_erase(&qse->rb_node, qse->st);
> >> +	qse->vdisktime += cfq_scale_slice(charge, qse);
> >> +	__io_sched_entity_add(qse->st, qse);
> >> +	if (&qse->rb_node == qse->st->active)
> >> +		qse->st->active = NULL;
> >> +
> >> +	while (gse && gse->parent) {
> >> +		cfq_rb_erase(&gse->rb_node, gse->st);
> >> +		gse->vdisktime += cfq_scale_slice(charge, gse);
> >> +		__io_sched_entity_add(gse->st, gse);
> >> +		if (&gse->rb_node == gse->st->active)
> >> +			gse->st->active = NULL;
> >> +		gse = gse->parent;
> >> +	}
> >>  
> >>  	/* This group is being expired. Save the context */
> >>  	if (time_after(cfqd->workload_expires, jiffies)) {
> >> @@ -960,8 +1059,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> >>  	} else
> >>  		cfqg->saved_workload_slice = 0;
> >>  
> >> -	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
> >> -					st->min_vdisktime);
> >> +	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", gse->vdisktime,
> >> +		     gse->st->min_vdisktime);
> >> +
> >>  	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
> >>  			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
> >>  			iops_mode(cfqd), cfqq->nr_sectors);
> >> @@ -977,39 +1077,84 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
> >>  	return NULL;
> >>  }
> >>  
> >> +static void cfq_put_cfqg(struct cfq_group *cfqg)
> >> +{
> >> +	struct cfq_rb_root *st;
> >> +	int i, j;
> >> +	struct io_sched_entity *gse;
> >> +	struct cfq_group *p_cfqg;
> >> +
> >> +	BUG_ON(atomic_read(&cfqg->ref) <= 0);
> >> +	if (!atomic_dec_and_test(&cfqg->ref))
> >> +		return;
> >> +	for_each_cfqg_st(cfqg, i, j, st)
> >> +		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
> >> +
> >> +	gse = &cfqg->group_se;
> >> +	if (gse->parent) {
> >> +		p_cfqg = cfqg_of_gse(gse->parent);
> >> +		/* Drop the reference taken by children */
> >> +		atomic_dec(&p_cfqg->ref);
> >> +	}
> >> +
> >> +	kfree(cfqg);
> >> +}
> >> +
> >> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> +{
> >> +	/* Something wrong if we are trying to remove same group twice */
> >> +	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
> >> +
> >> +	hlist_del_init(&cfqg->cfqd_node);
> >> +
> >> +	/*
> >> +	 * Put the reference taken at the time of creation so that when all
> >> +	 * queues are gone, group can be destroyed.
> >> +	 */
> >> +	cfq_put_cfqg(cfqg);
> >> +}
> >> +
> >>  void
> >>  cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
> >>  {
> >> -	cfqg_of_blkg(blkg)->weight = weight;
> >> +	struct cfq_group *cfqg;
> >> +	struct io_sched_entity *gse;
> >> +
> >> +	cfqg = cfqg_of_blkg(blkg);
> >> +	gse = &cfqg->group_se;
> >> +	gse->weight = weight;
> >>  }
> >>  
> >> -static struct cfq_group *
> >> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >> +static void init_group_queue_entity(struct blkio_cgroup *blkcg,
> >> +				   struct cfq_group *cfqg)
> >> +{
> >> +	struct io_sched_entity *gse = &cfqg->group_se;
> >> +	struct io_sched_entity *qse = &cfqg->queue_se;
> >> +
> >> +	gse->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> >> +	RB_CLEAR_NODE(&gse->rb_node);
> >> +	gse->is_group_se = true;
> >> +	gse->on_st = false;
> >> +
> >> +	/* Set to 500 for the time being */
> >> +	qse->weight = 500;
> >> +	RB_CLEAR_NODE(&qse->rb_node);
> >> +	qse->is_group_se = false;
> >> +	qse->on_st = false;
> >> +}
> >> +
> >> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
> >> +		      struct cfq_group *cfqg)
> >>  {
> >> -	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >> -	struct cfq_group *cfqg = NULL;
> >> -	void *key = cfqd;
> >>  	int i, j;
> >>  	struct cfq_rb_root *st;
> >> -	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >>  	unsigned int major, minor;
> >> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >>  
> >> -	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >> -	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >> -		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >> -		cfqg->blkg.dev = MKDEV(major, minor);
> >> -		goto done;
> >> -	}
> >> -	if (cfqg || !create)
> >> -		goto done;
> >> -
> >> -	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
> >> -	if (!cfqg)
> >> -		goto done;
> >> +	cfqg->grp_service_tree = CFQ_RB_ROOT;
> >>  
> >>  	for_each_cfqg_st(cfqg, i, j, st)
> >>  		*st = CFQ_RB_ROOT;
> >> -	RB_CLEAR_NODE(&cfqg->rb_node);
> >>  
> >>  	/*
> >>  	 * Take the initial reference that will be released on destroy
> >> @@ -1023,11 +1168,102 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >>  	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >>  	cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> >>  					MKDEV(major, minor));
> >> -	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> >> -
> >> +	init_group_queue_entity(blkcg, cfqg);
> >>  	/* Add group on cfqd list */
> >>  	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
> >> +}
> >> +
> >> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> +{
> >> +	if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
> >> +		cfq_destroy_cfqg(cfqd, cfqg);
> >> +}
> >> +
> >> +static void cfqg_set_parent(struct cfq_group *cfqg, struct cfq_group *p_cfqg)
> >> +{
> >> +	struct io_sched_entity *gse = &cfqg->group_se;
> >> +	struct io_sched_entity *qse = &cfqg->queue_se;
> >> +	struct io_sched_entity *p_gse = &p_cfqg->group_se;
> >> +
> >> +	gse->parent = p_gse;
> >> +	gse->st = &p_cfqg->grp_service_tree;
> >> +
> >> +	qse->parent = gse;
> >> +	qse->st = &cfqg->grp_service_tree;
> >> +
> >> +	/* child reference */
> >> +	atomic_inc(&p_cfqg->ref);
> >> +}
> >> +
> >> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
> >> +{
> >> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >> +	struct blkio_cgroup *p_blkcg;
> >> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >> +	unsigned int major, minor;
> >> +	struct cfq_group *cfqg, *p_cfqg;
> >> +	void *key = cfqd;
> >> +	int ret;
> >> +
> >> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >> +	if (cfqg) {
> >> +		if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >> +			sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >> +			cfqg->blkg.dev = MKDEV(major, minor);
> >> +		}
> >> +		/* chain has already been built */
> >> +		return 0;
> >> +	}
> >> +
> >> +	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
> >> +	if (!cfqg)
> >> +		return -1;
> >> +
> >> +	init_cfqg(cfqd, blkcg, cfqg);
> >> +
> >> +	/* Already to the top group */
> >> +	if (!cgroup->parent)
> >> +		return 0;
> >> +
> >> +	ret = cfqg_chain_alloc(cfqd, cgroup->parent);
> >> +	if (ret == -1) {
> >> +		uninit_cfqg(cfqd, cfqg);
> >> +		return -1;
> >> +	}
> >> +
> >> +	p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
> >> +	p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
> >> +	BUG_ON(p_cfqg == NULL);
> >> +
> >> +	cfqg_set_parent(cfqg, p_cfqg);
> >> +	return 0;
> >> +}
> >> +
> >> +static struct cfq_group *
> >> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >> +{
> >> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >> +	struct cfq_group *cfqg = NULL;
> >> +	void *key = cfqd;
> >> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >> +	unsigned int major, minor;
> >> +	int ret;
> >>  
> >> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >> +		cfqg->blkg.dev = MKDEV(major, minor);
> >> +		goto done;
> >> +	}
> >> +	if (cfqg || !create)
> >> +		goto done;
> >> +
> >> +	ret = cfqg_chain_alloc(cfqd, cgroup);
> >> +	if (!ret) {
> >> +		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >> +		BUG_ON(cfqg == NULL);
> >> +		goto done;
> >> +	}
> >>  done:
> >>  	return cfqg;
> >>  }
> >> @@ -1067,33 +1303,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
> >>  	atomic_inc(&cfqq->cfqg->ref);
> >>  }
> >>  
> >> -static void cfq_put_cfqg(struct cfq_group *cfqg)
> >> -{
> >> -	struct cfq_rb_root *st;
> >> -	int i, j;
> >> -
> >> -	BUG_ON(atomic_read(&cfqg->ref) <= 0);
> >> -	if (!atomic_dec_and_test(&cfqg->ref))
> >> -		return;
> >> -	for_each_cfqg_st(cfqg, i, j, st)
> >> -		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
> >> -	kfree(cfqg);
> >> -}
> >> -
> >> -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> -{
> >> -	/* Something wrong if we are trying to remove same group twice */
> >> -	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
> >> -
> >> -	hlist_del_init(&cfqg->cfqd_node);
> >> -
> >> -	/*
> >> -	 * Put the reference taken at the time of creation so that when all
> >> -	 * queues are gone, group can be destroyed.
> >> -	 */
> >> -	cfq_put_cfqg(cfqg);
> >> -}
> >> -
> >>  static void cfq_release_cfq_groups(struct cfq_data *cfqd)
> >>  {
> >>  	struct hlist_node *pos, *n;
> >> @@ -1668,9 +1877,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> >>  	if (cfqq == cfqd->active_queue)
> >>  		cfqd->active_queue = NULL;
> >>  
> >> -	if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
> >> -		cfqd->grp_service_tree.active = NULL;
> >> -
> >>  	if (cfqd->active_cic) {
> >>  		put_io_context(cfqd->active_cic->ioc);
> >>  		cfqd->active_cic = NULL;
> >> @@ -2173,17 +2379,26 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  	cfqd->noidle_tree_requires_idle = false;
> >>  }
> >>  
> >> +
> >>  static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
> >>  {
> >> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >> +	struct cfq_group *root_group = &cfqd->root_group;
> >> +	struct cfq_rb_root *st = &root_group->grp_service_tree;
> >>  	struct cfq_group *cfqg;
> >> +	struct io_sched_entity *se;
> >>  
> >> -	if (RB_EMPTY_ROOT(&st->rb))
> >> -		return NULL;
> >> -	cfqg = cfq_rb_first_group(st);
> >> -	st->active = &cfqg->rb_node;
> >> -	update_min_vdisktime(st);
> >> -	return cfqg;
> >> +	do {
> >> +		se = cfq_rb_first_se(st);
> >> +		if (!se)
> >> +			return NULL;
> >> +		st->active = &se->rb_node;
> >> +		update_min_vdisktime(st);
> >> +		cfqg = cfqg_of_qse(se);
> >> +		if (cfqg)
> >> +			return cfqg;
> >> +		cfqg = cfqg_of_gse(se);
> >> +		st = &cfqg->grp_service_tree;
> >> +	} while (1);
> >>  }
> >>  
> >>  static void cfq_choose_cfqg(struct cfq_data *cfqd)
> >> @@ -2215,15 +2430,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
> >>  	if (!cfqq)
> >>  		goto new_queue;
> >>  
> >> +
> >>  	if (!cfqd->rq_queued)
> >>  		return NULL;
> >>  
> >> +
> >>  	/*
> >>  	 * We were waiting for group to get backlogged. Expire the queue
> >>  	 */
> >>  	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
> >>  		goto expire;
> >>  
> >> +
> >>  	/*
> >>  	 * The active queue has run out of time, expire it and select new.
> >>  	 */
> >> @@ -2245,6 +2463,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
> >>  			goto check_group_idle;
> >>  	}
> >>  
> >> +
> >>  	/*
> >>  	 * The active queue has requests and isn't expired, allow it to
> >>  	 * dispatch.
> >> @@ -2252,6 +2471,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
> >>  	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
> >>  		goto keep_queue;
> >>  
> >> +
> >>  	/*
> >>  	 * If another queue has a request waiting within our mean seek
> >>  	 * distance, let it run.  The expire code will check for close
> >> @@ -2505,6 +2725,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
> >>  	}
> >>  
> >>  	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
> >> +
> >>  	return 1;
> >>  }
> >>  
> >> @@ -3854,17 +4075,25 @@ static void *cfq_init_queue(struct request_queue *q)
> >>  
> >>  	cfqd->cic_index = i;
> >>  
> >> -	/* Init root service tree */
> >> -	cfqd->grp_service_tree = CFQ_RB_ROOT;
> >> -
> >>  	/* Init root group */
> >>  	cfqg = &cfqd->root_group;
> >> +	cfqg->grp_service_tree = CFQ_RB_ROOT;
> >>  	for_each_cfqg_st(cfqg, i, j, st)
> >>  		*st = CFQ_RB_ROOT;
> >> -	RB_CLEAR_NODE(&cfqg->rb_node);
> >> -
> >> +	cfqg->group_se.is_group_se = true;
> >> +	RB_CLEAR_NODE(&cfqg->group_se.rb_node);
> >> +	cfqg->group_se.on_st = false;
> >>  	/* Give preference to root group over other groups */
> >> -	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
> >> +	cfqg->group_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
> >> +	cfqg->group_se.parent = NULL;
> >> +	cfqg->group_se.st = NULL;
> >> +
> >> +	cfqg->queue_se.is_group_se = false;
> >> +	RB_CLEAR_NODE(&cfqg->queue_se.rb_node);
> >> +	cfqg->queue_se.on_st = false;
> >> +	cfqg->queue_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
> >> +	cfqg->queue_se.parent = &cfqg->group_se;
> >> +	cfqg->queue_se.st = &cfqg->grp_service_tree;
> >>  
> >>  #ifdef CONFIG_CFQ_GROUP_IOSCHED
> >>  	/*
> >> -- 
> >> 1.6.5.2
> >>
> >>
> > 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-08-31 12:57     ` Vivek Goyal
@ 2010-08-31 15:40       ` Nauman Rafique
  2010-08-31 19:25         ` Vivek Goyal
  2010-09-01  8:48       ` Gui Jianfeng
  1 sibling, 1 reply; 19+ messages in thread
From: Nauman Rafique @ 2010-08-31 15:40 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, linux kernel mailing list

On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>> > On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
>> >> Hi All,
>> >>
>> >> This patch enables cfq group hierarchical scheduling.
>> >>
>> >> With this patch, you can create a cgroup directory deeper than level 1.
>> >> Now, I/O bandwidth is distributed in a hierarchical way. For example,
>> >> we create cgroup directories as follows (the number represents weight):
>> >>
>> >>             Root grp
>> >>            /       \
>> >>        grp_1(100) grp_2(400)
>> >>        /    \
>> >>   grp_3(200) grp_4(300)
>> >>
>> >> If grp_2, grp_3 and grp_4 are contending for I/O bandwidth,
>> >> grp_2 will share 80% of the total bandwidth.
>> >> For the sub-groups, grp_3 shares 8% (20% * 40%) and grp_4 shares
>> >> 12% (20% * 60%).
>> >>
>> >> Design:
>> >>   o Each cfq group has its own group service tree.
>> >>   o Each cfq group contains a "group schedule entity" (gse) that
>> >>     schedules on parent cfq group's service tree.
>> >>   o Each cfq group contains a "queue schedule entity"(qse), it
>> >>     represents all cfqqs located on this cfq group. It schedules
>> >>     on this group's service tree. For the time being, root group
>> >>     qse's weight is 1000, and subgroup qse's weight is 500.
>> >>   o All gses and the qse which belong to the same cfq group schedule
>> >>     on the same group service tree.
>> >
>> > Hi Gui,
>> >
>> > Thanks for the patch. I have few questions.
>> >
>> > - So what does the hierarchy look like w.r.t. the root group? Something
>> >   as follows?
>> >
>> >
>> >                     root
>> >                    / | \
>> >                  q1  q2 G1
>> >
>> > Assume there are two processes doing IO in the root group; q1 and q2 are
>> > cfqq queues for those processes and G1 is the cgroup created by the user.
>> >
>> > If yes, then what algorithm do you use to do scheduling between q1, q2
>> > and G1? IOW, currently we have two algorithms operating in CFQ: one for
>> > cfqqs and the other for groups. The group algorithm does not use the logic
>> > of cfq_slice_offset().
>>
>> Hi Vivek,
>>
>> This patch doesn't break the original scheduling logic, that is cfqg => st => cfqq.
>> If q1 and q2 are in the root group, I treat the q1 and q2 bundle as a queue sched
>> entity, and it will schedule on the root group service tree with G1, as follows:
>>
>>                          root group
>>                         /         \
>>                     qse(q1,q2)    gse(G1)
>>
>
> Ok. That's interesting. That raises another question of how the hierarchy
> should look. IOW, how queues and groups should be treated in the
> hierarchy.
>
> The CFS cpu scheduler treats queues and groups at the same level. That is as
> follows.
>
>                        root
>                        / | \
>                       q1 q2 G1
>
> In the past I had raised this question, and Jens and Corrado liked treating
> queues and groups at the same level.
>
> Logically, q1, q2 and G1 are all children of root, so it makes sense to
> treat them at the same level and not bundle q1 and q2 into a single entity
> and group.
>
> One of the possible way forward could be this.
>
> - Treat queue and group at same level (like CFS)
>
> - Get rid of the cfq_slice_offset() logic. That means that without idling, there
>  will be no ioprio difference between cfq queues. I think that, as of
>  today, that logic helps in so few situations anyway that I would not mind
>  getting rid of it. Just that Jens should agree to it.
>
> - With this new scheme, it will break the existing semantics of the root group
>  being at the same level as child groups. To avoid that, we can probably
>  implement two modes (flat and hierarchical), something similar to what the
>  memory cgroup controller has done. Maybe one tunable in the root cgroup of
>  blkio, "use_hierarchy". By default everything will be in flat mode, and
>  if the user wants hierarchical control, he needs to set use_hierarchy in the
>  root group.

Vivek, maybe I am reading you wrong here. But you are first
suggesting adding more complexity to treat queues and groups at the
same level. Then you are suggesting adding even more complexity to fix
the problems caused by that approach.

Why do we need to treat queues and groups at the same level? "CFS does
it" is not a good argument. Please suggest cases in which it would be
useful.

>
>  I think the memory controller provides the "use_hierarchy" tunable in each
>  cgroup. I am not sure why we need it in each cgroup and not just
>  in the root cgroup.
>
> Thanks
> Vivek
>
>> Thanks,
>> Gui
>>
>> >
>> > The reason being that they are different because CFQ wanted to provide ioprio
>> > service differentiation on SSDs, where slice idle is effectively zero. But
>> > that slice_offset() logic is approximate at best and is not suitable
>> > for group scheduling.
>> >
>> > I have yet to go through the patches in detail, so I thought I would ask
>> > you how you have handled the above.
>> >
>> > Vivek
>> >
>> >>   o cfq groups are allocated in a recursive manner; that means when a cfq
>> >>     group needs to be allocated, the upper level cfq groups are also
>> >>     allocated.
>> >>   o When a cfq group is served, not only is this cfq group charged but
>> >>     also its ancestors.
>> >>  block/cfq-iosched.c |  477 +++++++++++++++++++++++++++++++++++++-------------
>> >>  2 files changed, 353 insertions(+), 128 deletions(-)
>> >>
>> >> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> >> index 2fef1ef..aa06cb3 100644
>> >> --- a/block/blk-cgroup.c
>> >> +++ b/block/blk-cgroup.c
>> >> @@ -964,10 +964,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>> >>            goto done;
>> >>    }
>> >>
>> >> -  /* Currently we do not support hierarchy deeper than two level (0,1) */
>> >> -  if (parent != cgroup->top_cgroup)
>> >> -          return ERR_PTR(-EPERM);
>> >> -
>> >>    blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
>> >>    if (!blkcg)
>> >>            return ERR_PTR(-ENOMEM);
>> >> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> >> index f65c6f0..4660d89 100644
>> >> --- a/block/cfq-iosched.c
>> >> +++ b/block/cfq-iosched.c
>> >> @@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida);
>> >>  #define cfq_class_rt(cfqq)        ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
>> >>
>> >>  #define sample_valid(samples)     ((samples) > 80)
>> >> -#define rb_entry_cfqg(node)       rb_entry((node), struct cfq_group, rb_node)
>> >> +#define rb_entry_se(node) \
>> >> +  rb_entry((node), struct io_sched_entity, rb_node)
>> >>
>> >>  /*
>> >>   * Most of our rbtree usage is for sorting with min extraction, so
>> >> @@ -171,19 +172,40 @@ enum wl_type_t {
>> >>    SYNC_WORKLOAD = 2
>> >>  };
>> >>
>> >> -/* This is per cgroup per device grouping structure */
>> >> -struct cfq_group {
>> >> +/*
>> >> + * This is the schedule entity which is scheduled on a group service tree.
>> >> + * It represents the schedule structure of a cfq group, or a bundle of cfq
>> >> + * queues located in a cfq group.
>> >> + */
>> >> +struct io_sched_entity {
>> >> +  struct cfq_rb_root *st;
>> >> +
>> >>    /* group service_tree member */
>> >>    struct rb_node rb_node;
>> >> -
>> >> -  /* group service_tree key */
>> >>    u64 vdisktime;
>> >>    unsigned int weight;
>> >>    bool on_st;
>> >> +  bool is_group_se;
>> >> +  struct io_sched_entity *parent;
>> >> +};
>> >> +
>> >> +/* This is per cgroup per device grouping structure */
>> >> +struct cfq_group {
>> >> +  /* cfq group sched entity */
>> >> +  struct io_sched_entity group_se;
>> >> +
>> >> +  /* sched entity for cfqqs */
>> >> +  struct io_sched_entity queue_se;
>> >> +
>> >> +  /* Service tree for the cfq_groups and cfqqs set */
>> >> +  struct cfq_rb_root grp_service_tree;
>> >>
>> >>    /* number of cfqq currently on this group */
>> >>    int nr_cfqq;
>> >>
>> >> +  /* number of sub cfq groups */
>> >> +  int nr_subgp;
>> >> +
>> >>    /* Per group busy queues average. Useful for workload slice calc. */
>> >>    unsigned int busy_queues_avg[2];
>> >>    /*
>> >> @@ -210,8 +232,6 @@ struct cfq_group {
>> >>   */
>> >>  struct cfq_data {
>> >>    struct request_queue *queue;
>> >> -  /* Root service tree for cfq_groups */
>> >> -  struct cfq_rb_root grp_service_tree;
>> >>    struct cfq_group root_group;
>> >>
>> >>    /*
>> >> @@ -399,6 +419,22 @@ static inline bool iops_mode(struct cfq_data *cfqd)
>> >>            return false;
>> >>  }
>> >>
>> >> +static inline struct cfq_group *cfqg_of_gse(struct io_sched_entity *se)
>> >> +{
>> >> +  if (se->is_group_se)
>> >> +          return container_of(se, struct cfq_group, group_se);
>> >> +  else
>> >> +          return NULL;
>> >> +}
>> >> +
>> >> +static inline struct cfq_group *cfqg_of_qse(struct io_sched_entity *se)
>> >> +{
>> >> +  if (!se->is_group_se)
>> >> +          return container_of(se, struct cfq_group, queue_se);
>> >> +  else
>> >> +          return NULL;
>> >> +}
>> >> +
>> >>  static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
>> >>  {
>> >>    if (cfq_class_idle(cfqq))
>> >> @@ -522,12 +558,13 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> >>    return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
>> >>  }
>> >>
>> >> -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
>> >> +static inline u64 cfq_scale_slice(unsigned long delta,
>> >> +                            struct io_sched_entity *se)
>> >>  {
>> >>    u64 d = delta << CFQ_SERVICE_SHIFT;
>> >>
>> >>    d = d * BLKIO_WEIGHT_DEFAULT;
>> >> -  do_div(d, cfqg->weight);
>> >> +  do_div(d, se->weight);
>> >>    return d;
>> >>  }
>> >>
>> >> @@ -552,16 +589,16 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
>> >>  static void update_min_vdisktime(struct cfq_rb_root *st)
>> >>  {
>> >>    u64 vdisktime = st->min_vdisktime;
>> >> -  struct cfq_group *cfqg;
>> >> +  struct io_sched_entity *se;
>> >>
>> >>    if (st->active) {
>> >> -          cfqg = rb_entry_cfqg(st->active);
>> >> -          vdisktime = cfqg->vdisktime;
>> >> +          se = rb_entry_se(st->active);
>> >> +          vdisktime = se->vdisktime;
>> >>    }
>> >>
>> >>    if (st->left) {
>> >> -          cfqg = rb_entry_cfqg(st->left);
>> >> -          vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
>> >> +          se = rb_entry_se(st->left);
>> >> +          vdisktime = min_vdisktime(vdisktime, se->vdisktime);
>> >>    }
>> >>
>> >>    st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
>> >> @@ -591,9 +628,10 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>> >>  static inline unsigned
>> >>  cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> >>  {
>> >> -  struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> >> +  struct io_sched_entity *gse = &cfqg->queue_se;
>> >> +  struct cfq_rb_root *st = gse->st;
>> >>
>> >> -  return cfq_target_latency * cfqg->weight / st->total_weight;
>> >> +  return cfq_target_latency * gse->weight / st->total_weight;
>> >>  }
>> >>
>> >>  static inline void
>> >> @@ -756,13 +794,13 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
>> >>    return NULL;
>> >>  }
>> >>
>> >> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
>> >> +static struct io_sched_entity *cfq_rb_first_se(struct cfq_rb_root *root)
>> >>  {
>> >>    if (!root->left)
>> >>            root->left = rb_first(&root->rb);
>> >>
>> >>    if (root->left)
>> >> -          return rb_entry_cfqg(root->left);
>> >> +          return rb_entry_se(root->left);
>> >>
>> >>    return NULL;
>> >>  }
>> >> @@ -819,25 +857,25 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
>> >>  }
>> >>
>> >>  static inline s64
>> >> -cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
>> >> +se_key(struct cfq_rb_root *st, struct io_sched_entity *se)
>> >>  {
>> >> -  return cfqg->vdisktime - st->min_vdisktime;
>> >> +  return se->vdisktime - st->min_vdisktime;
>> >>  }
>> >>
>> >>  static void
>> >> -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
>> >> +__io_sched_entity_add(struct cfq_rb_root *st, struct io_sched_entity *se)
>> >>  {
>> >>    struct rb_node **node = &st->rb.rb_node;
>> >>    struct rb_node *parent = NULL;
>> >> -  struct cfq_group *__cfqg;
>> >> -  s64 key = cfqg_key(st, cfqg);
>> >> +  struct io_sched_entity *__se;
>> >> +  s64 key = se_key(st, se);
>> >>    int left = 1;
>> >>
>> >>    while (*node != NULL) {
>> >>            parent = *node;
>> >> -          __cfqg = rb_entry_cfqg(parent);
>> >> +          __se = rb_entry_se(parent);
>> >>
>> >> -          if (key < cfqg_key(st, __cfqg))
>> >> +          if (key < se_key(st, __se))
>> >>                    node = &parent->rb_left;
>> >>            else {
>> >>                    node = &parent->rb_right;
>> >> @@ -846,47 +884,80 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
>> >>    }
>> >>
>> >>    if (left)
>> >> -          st->left = &cfqg->rb_node;
>> >> +          st->left = &se->rb_node;
>> >>
>> >> -  rb_link_node(&cfqg->rb_node, parent, node);
>> >> -  rb_insert_color(&cfqg->rb_node, &st->rb);
>> >> +  rb_link_node(&se->rb_node, parent, node);
>> >> +  rb_insert_color(&se->rb_node, &st->rb);
>> >>  }
>> >>
>> >> -static void
>> >> -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> >> +static void io_sched_entity_add(struct cfq_rb_root *st,
>> >> +                          struct io_sched_entity *se)
>> >>  {
>> >> -  struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> >> -  struct cfq_group *__cfqg;
>> >>    struct rb_node *n;
>> >> +  struct io_sched_entity *__se;
>> >>
>> >> -  cfqg->nr_cfqq++;
>> >> -  if (cfqg->on_st)
>> >> -          return;
>> >>
>> >> +  if (se->on_st)
>> >> +          return;
>> >>    /*
>> >>     * Currently put the group at the end. Later implement something
>> >>     * so that groups get lesser vtime based on their weights, so that
>> >>     * a group does not lose everything if it was not continuously backlogged.
>> >>     */
>> >> -  n = rb_last(&st->rb);
>> >> +  n = rb_last(&se->st->rb);
>> >>    if (n) {
>> >> -          __cfqg = rb_entry_cfqg(n);
>> >> -          cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
>> >> +          __se = rb_entry_se(n);
>> >> +          se->vdisktime = __se->vdisktime + CFQ_IDLE_DELAY;
>> >>    } else
>> >> -          cfqg->vdisktime = st->min_vdisktime;
>> >> +          se->vdisktime = st->min_vdisktime;
>> >>
>> >> -  __cfq_group_service_tree_add(st, cfqg);
>> >> -  cfqg->on_st = true;
>> >> -  st->total_weight += cfqg->weight;
>> >> +  __io_sched_entity_add(se->st, se);
>> >> +  se->on_st = true;
>> >> +  st->total_weight += se->weight;
>> >> +}
>> >> +
>> >> +static void
>> >> +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> >> +{
>> >> +  struct io_sched_entity *gse = &cfqg->group_se;
>> >> +  struct io_sched_entity *qse = &cfqg->queue_se;
>> >> +  struct cfq_group *__cfqg;
>> >> +
>> >> +  cfqg->nr_cfqq++;
>> >> +
>> >> +  io_sched_entity_add(qse->st, qse);
>> >> +
>> >> +  while (gse && gse->parent) {
>> >> +          if (gse->on_st)
>> >> +                  return;
>> >> +          io_sched_entity_add(gse->st, gse);
>> >> +          gse = gse->parent;
>> >> +          __cfqg = cfqg_of_gse(gse);
>> >> +          __cfqg->nr_subgp++;
>> >> +  }
>> >> +}
>> >> +
>> >> +static void io_sched_entity_del(struct io_sched_entity *se)
>> >> +{
>> >> +  if (!RB_EMPTY_NODE(&se->rb_node))
>> >> +          cfq_rb_erase(&se->rb_node, se->st);
>> >> +
>> >> +  se->on_st = false;
>> >> +  se->st->total_weight -= se->weight;
>> >>  }
>> >>
>> >>  static void
>> >>  cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> >>  {
>> >> -  struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> >> +  struct io_sched_entity *gse = &cfqg->group_se;
>> >> +  struct io_sched_entity *qse = &cfqg->queue_se;
>> >> +  struct cfq_group *__cfqg, *p_cfqg;
>> >> +
>> >> +  if (gse->st && gse->st->active == &gse->rb_node)
>> >> +          gse->st->active = NULL;
>> >>
>> >> -  if (st->active == &cfqg->rb_node)
>> >> -          st->active = NULL;
>> >> +  if (qse->st && qse->st->active == &qse->rb_node)
>> >> +          qse->st->active = NULL;
>> >>
>> >>    BUG_ON(cfqg->nr_cfqq < 1);
>> >>    cfqg->nr_cfqq--;
>> >> @@ -895,13 +966,25 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> >>    if (cfqg->nr_cfqq)
>> >>            return;
>> >>
>> >> -  cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>> >> -  cfqg->on_st = false;
>> >> -  st->total_weight -= cfqg->weight;
>> >> -  if (!RB_EMPTY_NODE(&cfqg->rb_node))
>> >> -          cfq_rb_erase(&cfqg->rb_node, st);
>> >> -  cfqg->saved_workload_slice = 0;
>> >> -  cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>> >> +  /* dequeue queue se from group */
>> >> +  io_sched_entity_del(qse);
>> >> +
>> >> +  if (cfqg->nr_subgp)
>> >> +          return;
>> >> +
>> >> +  /* never dequeue the root group */
>> >> +  while (gse && gse->parent) {
>> >> +          __cfqg = cfqg_of_gse(gse);
>> >> +          p_cfqg = cfqg_of_gse(gse->parent);
>> >> +          io_sched_entity_del(gse);
>> >> +          cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>> >> +          cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>> >> +          __cfqg->saved_workload_slice = 0;
>> >> +          gse = gse->parent;
>> >> +          p_cfqg->nr_subgp--;
>> >> +          if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>> >> +                  return;
>> >> +  }
>> >>  }
>> >>
>> >>  static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>> >> @@ -933,7 +1016,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>> >>  static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> >>                            struct cfq_queue *cfqq)
>> >>  {
>> >> -  struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> >> +  struct io_sched_entity *gse = &cfqg->group_se;
>> >> +  struct io_sched_entity *qse = &cfqg->queue_se;
>> >>    unsigned int used_sl, charge;
>> >>    int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
>> >>                    - cfqg->service_tree_idle.count;
>> >> @@ -946,10 +1030,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> >>    else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
>> >>            charge = cfqq->allocated_slice;
>> >>
>> >> -  /* Can't update vdisktime while group is on service tree */
>> >> -  cfq_rb_erase(&cfqg->rb_node, st);
>> >> -  cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
>> >> -  __cfq_group_service_tree_add(st, cfqg);
>> >> +  /*
>> >> +   *  update queue se's vdisktime.
>> >> +   *  Can't update vdisktime while group is on service tree.
>> >> +   */
>> >> +
>> >> +  cfq_rb_erase(&qse->rb_node, qse->st);
>> >> +  qse->vdisktime += cfq_scale_slice(charge, qse);
>> >> +  __io_sched_entity_add(qse->st, qse);
>> >> +  if (&qse->rb_node == qse->st->active)
>> >> +          qse->st->active = NULL;
>> >> +
>> >> +  while (gse && gse->parent) {
>> >> +          cfq_rb_erase(&gse->rb_node, gse->st);
>> >> +          gse->vdisktime += cfq_scale_slice(charge, gse);
>> >> +          __io_sched_entity_add(gse->st, gse);
>> >> +          if (&gse->rb_node == gse->st->active)
>> >> +                  gse->st->active = NULL;
>> >> +          gse = gse->parent;
>> >> +  }
>> >>
>> >>    /* This group is being expired. Save the context */
>> >>    if (time_after(cfqd->workload_expires, jiffies)) {
>> >> @@ -960,8 +1059,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> >>    } else
>> >>            cfqg->saved_workload_slice = 0;
>> >>
>> >> -  cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
>> >> -                                  st->min_vdisktime);
>> >> +  cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", gse->vdisktime,
>> >> +               gse->st->min_vdisktime);
>> >> +
>> >>    cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
>> >>                    " sect=%u", used_sl, cfqq->slice_dispatch, charge,
>> >>                    iops_mode(cfqd), cfqq->nr_sectors);
>> >> @@ -977,39 +1077,84 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
>> >>    return NULL;
>> >>  }
>> >>
>> >> +static void cfq_put_cfqg(struct cfq_group *cfqg)
>> >> +{
>> >> +  struct cfq_rb_root *st;
>> >> +  int i, j;
>> >> +  struct io_sched_entity *gse;
>> >> +  struct cfq_group *p_cfqg;
>> >> +
>> >> +  BUG_ON(atomic_read(&cfqg->ref) <= 0);
>> >> +  if (!atomic_dec_and_test(&cfqg->ref))
>> >> +          return;
>> >> +  for_each_cfqg_st(cfqg, i, j, st)
>> >> +          BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
>> >> +
>> >> +  gse = &cfqg->group_se;
>> >> +  if (gse->parent) {
>> >> +          p_cfqg = cfqg_of_gse(gse->parent);
>> >> +          /* Drop the reference taken by children */
>> >> +          atomic_dec(&p_cfqg->ref);
>> >> +  }
>> >> +
>> >> +  kfree(cfqg);
>> >> +}
>> >> +
>> >> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> >> +{
>> >> +  /* Something wrong if we are trying to remove same group twice */
>> >> +  BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
>> >> +
>> >> +  hlist_del_init(&cfqg->cfqd_node);
>> >> +
>> >> +  /*
>> >> +   * Put the reference taken at the time of creation so that when all
>> >> +   * queues are gone, group can be destroyed.
>> >> +   */
>> >> +  cfq_put_cfqg(cfqg);
>> >> +}
>> >> +
>> >>  void
>> >>  cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
>> >>  {
>> >> -  cfqg_of_blkg(blkg)->weight = weight;
>> >> +  struct cfq_group *cfqg;
>> >> +  struct io_sched_entity *gse;
>> >> +
>> >> +  cfqg = cfqg_of_blkg(blkg);
>> >> +  gse = &cfqg->group_se;
>> >> +  gse->weight = weight;
>> >>  }
>> >>
>> >> -static struct cfq_group *
>> >> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> >> +static void init_group_queue_entity(struct blkio_cgroup *blkcg,
>> >> +                             struct cfq_group *cfqg)
>> >> +{
>> >> +  struct io_sched_entity *gse = &cfqg->group_se;
>> >> +  struct io_sched_entity *qse = &cfqg->queue_se;
>> >> +
>> >> +  gse->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> >> +  RB_CLEAR_NODE(&gse->rb_node);
>> >> +  gse->is_group_se = true;
>> >> +  gse->on_st = false;
>> >> +
>> >> +  /* Set to 500 for the time being */
>> >> +  qse->weight = 500;
>> >> +  RB_CLEAR_NODE(&qse->rb_node);
>> >> +  qse->is_group_se = false;
>> >> +  qse->on_st = false;
>> >> +}
>> >> +
>> >> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
>> >> +                struct cfq_group *cfqg)
>> >>  {
>> >> -  struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> >> -  struct cfq_group *cfqg = NULL;
>> >> -  void *key = cfqd;
>> >>    int i, j;
>> >>    struct cfq_rb_root *st;
>> >> -  struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> >>    unsigned int major, minor;
>> >> +  struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> >>
>> >> -  cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> >> -  if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> >> -          sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> >> -          cfqg->blkg.dev = MKDEV(major, minor);
>> >> -          goto done;
>> >> -  }
>> >> -  if (cfqg || !create)
>> >> -          goto done;
>> >> -
>> >> -  cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> >> -  if (!cfqg)
>> >> -          goto done;
>> >> +  cfqg->grp_service_tree = CFQ_RB_ROOT;
>> >>
>> >>    for_each_cfqg_st(cfqg, i, j, st)
>> >>            *st = CFQ_RB_ROOT;
>> >> -  RB_CLEAR_NODE(&cfqg->rb_node);
>> >>
>> >>    /*
>> >>     * Take the initial reference that will be released on destroy
>> >> @@ -1023,11 +1168,102 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> >>    sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> >>    cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> >>                                    MKDEV(major, minor));
>> >> -  cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> >> -
>> >> +  init_group_queue_entity(blkcg, cfqg);
>> >>    /* Add group on cfqd list */
>> >>    hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> >> +}
>> >> +
>> >> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> >> +{
>> >> +  if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
>> >> +          cfq_destroy_cfqg(cfqd, cfqg);
>> >> +}
>> >> +
>> >> +static void cfqg_set_parent(struct cfq_group *cfqg, struct cfq_group *p_cfqg)
>> >> +{
>> >> +  struct io_sched_entity *gse = &cfqg->group_se;
>> >> +  struct io_sched_entity *qse = &cfqg->queue_se;
>> >> +  struct io_sched_entity *p_gse = &p_cfqg->group_se;
>> >> +
>> >> +  gse->parent = p_gse;
>> >> +  gse->st = &p_cfqg->grp_service_tree;
>> >> +
>> >> +  qse->parent = gse;
>> >> +  qse->st = &cfqg->grp_service_tree;
>> >> +
>> >> +  /* child reference */
>> >> +  atomic_inc(&p_cfqg->ref);
>> >> +}
>> >> +
>> >> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>> >> +{
>> >> +  struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> >> +  struct blkio_cgroup *p_blkcg;
>> >> +  struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> >> +  unsigned int major, minor;
>> >> +  struct cfq_group *cfqg, *p_cfqg;
>> >> +  void *key = cfqd;
>> >> +  int ret;
>> >> +
>> >> +  cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> >> +  if (cfqg) {
>> >> +          if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> >> +                  sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> >> +                  cfqg->blkg.dev = MKDEV(major, minor);
>> >> +          }
>> >> +          /* chain has already been built */
>> >> +          return 0;
>> >> +  }
>> >> +
>> >> +  cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> >> +  if (!cfqg)
>> >> +          return -1;
>> >> +
>> >> +  init_cfqg(cfqd, blkcg, cfqg);
>> >> +
>> >> +  /* Already at the top group */
>> >> +  if (!cgroup->parent)
>> >> +          return 0;
>> >> +
>> >> +  ret = cfqg_chain_alloc(cfqd, cgroup->parent);
>> >> +  if (ret == -1) {
>> >> +          uninit_cfqg(cfqd, cfqg);
>> >> +          return -1;
>> >> +  }
>> >> +
>> >> +  p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
>> >> +  p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
>> >> +  BUG_ON(p_cfqg == NULL);
>> >> +
>> >> +  cfqg_set_parent(cfqg, p_cfqg);
>> >> +  return 0;
>> >> +}
>> >> +
>> >> +static struct cfq_group *
>> >> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> >> +{
>> >> +  struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> >> +  struct cfq_group *cfqg = NULL;
>> >> +  void *key = cfqd;
>> >> +  struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> >> +  unsigned int major, minor;
>> >> +  int ret;
>> >>
>> >> +  cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> >> +  if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> >> +          sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> >> +          cfqg->blkg.dev = MKDEV(major, minor);
>> >> +          goto done;
>> >> +  }
>> >> +  if (cfqg || !create)
>> >> +          goto done;
>> >> +
>> >> +  ret = cfqg_chain_alloc(cfqd, cgroup);
>> >> +  if (!ret) {
>> >> +          cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> >> +          BUG_ON(cfqg == NULL);
>> >> +          goto done;
>> >> +  }
>> >>  done:
>> >>    return cfqg;
>> >>  }
>> >> @@ -1067,33 +1303,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
>> >>    atomic_inc(&cfqq->cfqg->ref);
>> >>  }
>> >>
>> >> -static void cfq_put_cfqg(struct cfq_group *cfqg)
>> >> -{
>> >> -  struct cfq_rb_root *st;
>> >> -  int i, j;
>> >> -
>> >> -  BUG_ON(atomic_read(&cfqg->ref) <= 0);
>> >> -  if (!atomic_dec_and_test(&cfqg->ref))
>> >> -          return;
>> >> -  for_each_cfqg_st(cfqg, i, j, st)
>> >> -          BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
>> >> -  kfree(cfqg);
>> >> -}
>> >> -
>> >> -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> >> -{
>> >> -  /* Something wrong if we are trying to remove same group twice */
>> >> -  BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
>> >> -
>> >> -  hlist_del_init(&cfqg->cfqd_node);
>> >> -
>> >> -  /*
>> >> -   * Put the reference taken at the time of creation so that when all
>> >> -   * queues are gone, group can be destroyed.
>> >> -   */
>> >> -  cfq_put_cfqg(cfqg);
>> >> -}
>> >> -
>> >>  static void cfq_release_cfq_groups(struct cfq_data *cfqd)
>> >>  {
>> >>    struct hlist_node *pos, *n;
>> >> @@ -1668,9 +1877,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> >>    if (cfqq == cfqd->active_queue)
>> >>            cfqd->active_queue = NULL;
>> >>
>> >> -  if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
>> >> -          cfqd->grp_service_tree.active = NULL;
>> >> -
>> >>    if (cfqd->active_cic) {
>> >>            put_io_context(cfqd->active_cic->ioc);
>> >>            cfqd->active_cic = NULL;
>> >> @@ -2173,17 +2379,26 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> >>    cfqd->noidle_tree_requires_idle = false;
>> >>  }
>> >>
>> >> +
>> >>  static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
>> >>  {
>> >> -  struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> >> +  struct cfq_group *root_group = &cfqd->root_group;
>> >> +  struct cfq_rb_root *st = &root_group->grp_service_tree;
>> >>    struct cfq_group *cfqg;
>> >> +  struct io_sched_entity *se;
>> >>
>> >> -  if (RB_EMPTY_ROOT(&st->rb))
>> >> -          return NULL;
>> >> -  cfqg = cfq_rb_first_group(st);
>> >> -  st->active = &cfqg->rb_node;
>> >> -  update_min_vdisktime(st);
>> >> -  return cfqg;
>> >> +  do {
>> >> +          se = cfq_rb_first_se(st);
>> >> +          if (!se)
>> >> +                  return NULL;
>> >> +          st->active = &se->rb_node;
>> >> +          update_min_vdisktime(st);
>> >> +          cfqg = cfqg_of_qse(se);
>> >> +          if (cfqg)
>> >> +                  return cfqg;
>> >> +          cfqg = cfqg_of_gse(se);
>> >> +          st = &cfqg->grp_service_tree;
>> >> +  } while (1);
>> >>  }
>> >>
>> >>  static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> >> @@ -2215,15 +2430,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>> >>    if (!cfqq)
>> >>            goto new_queue;
>> >>
>> >> +
>> >>    if (!cfqd->rq_queued)
>> >>            return NULL;
>> >>
>> >> +
>> >>    /*
>> >>     * We were waiting for group to get backlogged. Expire the queue
>> >>     */
>> >>    if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
>> >>            goto expire;
>> >>
>> >> +
>> >>    /*
>> >>     * The active queue has run out of time, expire it and select new.
>> >>     */
>> >> @@ -2245,6 +2463,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>> >>                    goto check_group_idle;
>> >>    }
>> >>
>> >> +
>> >>    /*
>> >>     * The active queue has requests and isn't expired, allow it to
>> >>     * dispatch.
>> >> @@ -2252,6 +2471,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>> >>    if (!RB_EMPTY_ROOT(&cfqq->sort_list))
>> >>            goto keep_queue;
>> >>
>> >> +
>> >>    /*
>> >>     * If another queue has a request waiting within our mean seek
>> >>     * distance, let it run.  The expire code will check for close
>> >> @@ -2505,6 +2725,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
>> >>    }
>> >>
>> >>    cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
>> >> +
>> >>    return 1;
>> >>  }
>> >>
>> >> @@ -3854,17 +4075,25 @@ static void *cfq_init_queue(struct request_queue *q)
>> >>
>> >>    cfqd->cic_index = i;
>> >>
>> >> -  /* Init root service tree */
>> >> -  cfqd->grp_service_tree = CFQ_RB_ROOT;
>> >> -
>> >>    /* Init root group */
>> >>    cfqg = &cfqd->root_group;
>> >> +  cfqg->grp_service_tree = CFQ_RB_ROOT;
>> >>    for_each_cfqg_st(cfqg, i, j, st)
>> >>            *st = CFQ_RB_ROOT;
>> >> -  RB_CLEAR_NODE(&cfqg->rb_node);
>> >> -
>> >> +  cfqg->group_se.is_group_se = true;
>> >> +  RB_CLEAR_NODE(&cfqg->group_se.rb_node);
>> >> +  cfqg->group_se.on_st = false;
>> >>    /* Give preference to root group over other groups */
>> >> -  cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
>> >> +  cfqg->group_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
>> >> +  cfqg->group_se.parent = NULL;
>> >> +  cfqg->group_se.st = NULL;
>> >> +
>> >> +  cfqg->queue_se.is_group_se = false;
>> >> +  RB_CLEAR_NODE(&cfqg->queue_se.rb_node);
>> >> +  cfqg->queue_se.on_st = false;
>> >> +  cfqg->queue_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
>> >> +  cfqg->queue_se.parent = &cfqg->group_se;
>> >> +  cfqg->queue_se.st = &cfqg->grp_service_tree;
>> >>
>> >>  #ifdef CONFIG_CFQ_GROUP_IOSCHED
>> >>    /*
>> >> --
>> >> 1.6.5.2
>> >>
>> >>
>> >
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-08-31 15:40       ` Nauman Rafique
@ 2010-08-31 19:25         ` Vivek Goyal
  2010-09-01  8:50           ` Gui Jianfeng
  0 siblings, 1 reply; 19+ messages in thread
From: Vivek Goyal @ 2010-08-31 19:25 UTC (permalink / raw)
  To: Nauman Rafique
  Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, linux kernel mailing list

On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote:
> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
> > On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote:
> >> Vivek Goyal wrote:
> >> > On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
> >> >> Hi All,
> >> >>
> >> >> This patch enables cfq group hierarchical scheduling.
> >> >>
> >> >> With this patch, you can create a cgroup directory deeper than level 1.
> >> >> Now, I/O bandwidth is distributed in a hierarchical way. For example:
> >> >> We create cgroup directories as follows (the number represents the weight):
> >> >>
> >> >>             Root grp
> >> >>            /       \
> >> >>        grp_1(100) grp_2(400)
> >> >>        /    \
> >> >>   grp_3(200) grp_4(300)
> >> >>
> >> >> If grp_2, grp_3 and grp_4 are contending for I/O bandwidth,
> >> >> grp_2 will share 80% of the total bandwidth.
> >> >> For the sub-groups, grp_3 shares 8% (20% * 40%) and grp_4 shares 12% (20% * 60%).
> >> >>
> >> >> Design:
> >> >>   o Each cfq group has its own group service tree.
> >> >>   o Each cfq group contains a "group schedule entity" (gse) that
> >> >>     schedules on the parent cfq group's service tree.
> >> >>   o Each cfq group contains a "queue schedule entity" (qse) that
> >> >>     represents all cfqqs located in this cfq group. It schedules
> >> >>     on this group's service tree. For the time being, the root group
> >> >>     qse's weight is 1000, and a subgroup qse's weight is 500.
> >> >>   o All gses and the qse which belong to the same cfq group schedule
> >> >>     on the same group service tree.
> >> >
> >> > Hi Gui,
> >> >
> >> > Thanks for the patch. I have a few questions.
> >> >
> >> > - So what does the hierarchy look like w.r.t. the root group? Something as
> >> >   follows?
> >> >
> >> >
> >> >                     root
> >> >                    / | \
> >> >                  q1  q2 G1
> >> >
> >> > Assume there are two processes doing IO in the root group; q1 and q2 are
> >> > the cfqq queues for those processes and G1 is the cgroup created by the user.
> >> >
> >> > If yes, then what algorithm do you use to do scheduling between q1, q2
> >> > and G1? IOW, currently we have two algorithms operating in CFQ: one for
> >> > cfqqs and the other for groups. The group algorithm does not use the logic
> >> > of cfq_slice_offset().
> >>
> >> Hi Vivek,
> >>
> >> This patch doesn't break the original scheduling logic, that is cfqg => st => cfqq.
> >> If q1 and q2 are in the root group, I treat the q1 and q2 bundle as a queue sched
> >> entity, and it will schedule on the root group service tree with G1, as follows:
> >>
> >>                          root group
> >>                         /         \
> >>                     qse(q1,q2)    gse(G1)
> >>
> >
> > Ok. That's interesting. That raises another question of how the hierarchy
> > should look. IOW, how queues and groups should be treated in the
> > hierarchy.
> >
> > The CFS cpu scheduler treats queues and groups at the same level. That is as
> > follows.
> >
> >                        root
> >                        / | \
> >                       q1 q2 G1
> >
> > In the past I had raised this question, and Jens and Corrado liked treating
> > queues and groups at the same level.
> >
> > Logically, q1, q2 and G1 are all children of root, so it makes sense to
> > treat them at the same level and not bundle q1 and q2 into a single entity
> > and group.
> >
> > One of the possible way forward could be this.
> >
> > - Treat queue and group at same level (like CFS)
> >
> > - Get rid of the cfq_slice_offset() logic. That means that without idling, there
> >  will be no ioprio difference between cfq queues. I think that, as of
> >  today, that logic helps in so few situations anyway that I would not mind
> >  getting rid of it. Just that Jens should agree to it.
> >
> > - With this new scheme, it will break the existing semantics of the root group
> >  being at the same level as child groups. To avoid that, we can probably
> >  implement two modes (flat and hierarchical), something similar to what the
> >  memory cgroup controller has done. Maybe one tunable in the root cgroup of
> >  blkio, "use_hierarchy". By default everything will be in flat mode, and
> >  if the user wants hierarchical control, he needs to set use_hierarchy in the
> >  root group.
> 
> Vivek, maybe I am reading you wrong here. But you are first
> suggesting adding more complexity to treat queues and groups at the
> same level. Then you are suggesting adding even more complexity to fix
> the problems caused by that approach.
> 
> Why do we need to treat queues and groups at the same level? "CFS does
> it" is not a good argument.

Sure, it is not a very good argument, but at the same time one would need
a very good argument for why we should do things differently.

- If a user has mounted the cpu and blkio controllers together and the two
  controllers are viewing the same hierarchy differently, then it is
  odd. We need a good reason why a different arrangement makes sense.

- To me, both groups and cfq queues are children of the root group, and it
  makes sense to treat them as independent children instead of putting
  all the queues into one logical group which inherits the weight of the
  parent.

- With this new scheme, I am finding it hard to visualize the hierarchy.
  How do you assign weights to the queue entities of a group? It is more
  like an invisible group within a group. We would have to create a new
  tunable which can specify the weight for this hidden group.
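
To put numbers on that concern: the patch hard-codes the qse weight (1000
for the root group, 500 for subgroups), so the aggregate share of all
root-group tasks is pinned by that constant rather than by any user-visible
tunable. A small illustration in C, with an example weight of 500 assumed
for G1's gse:

	#include <stdio.h>

	int main(void)
	{
		unsigned int qse = 1000;   /* root group's queue entity, hard-coded */
		unsigned int gse_g1 = 500; /* G1's group entity, example value */

		printf("all root tasks: %u%%\n", 100 * qse / (qse + gse_g1));
		printf("G1:             %u%%\n", 100 * gse_g1 / (qse + gse_g1));
		return 0;
	}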


So in summary I am liking the "queues at the same level as groups" scheme
for the following reasons.

- It is more intuitive to visualize and implement. It follows the true
  hierarchy as seen by the cgroup file system.

- CFS has already implemented this scheme, so we need a strong argument
  to justify why we should not follow the same thing, especially for
  the case where a user has co-mounted the cpu and blkio controllers.

- It can achieve the same goal as the "hidden group" proposal just by
  creating a cgroup explicitly and moving all threads into that group
  (see the sketch below).
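
  A minimal sketch of that workaround in C; the mount point, group name
  and pid below are made up for illustration:

	#include <stdio.h>
	#include <sys/stat.h>
	#include <sys/types.h>

	int main(void)
	{
		pid_t pid = 1234;	/* example pid of a root-group task */
		FILE *f;

		/* Create the explicit child group; its weight can then be
		 * tuned via blkio.weight like any other group. */
		mkdir("/cgroup/blkio/root_tasks", 0755);

		f = fopen("/cgroup/blkio/root_tasks/tasks", "w");
		if (!f)
			return 1;
		fprintf(f, "%d\n", (int)pid);
		fclose(f);
		return 0;
	}

  Repeating the move for every root task reproduces the effect of the
  implicit "hidden group", but with an explicit, user-controlled weight.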

Why do you think that the "hidden group" proposal is better than
"treating queues at the same level as groups"?

Thanks
Vivek

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-08-31 12:57     ` Vivek Goyal
  2010-08-31 15:40       ` Nauman Rafique
@ 2010-09-01  8:48       ` Gui Jianfeng
  2010-09-01  9:02         ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 19+ messages in thread
From: Gui Jianfeng @ 2010-09-01  8:48 UTC (permalink / raw)
  To: Vivek Goyal, KAMEZAWA Hiroyuki
  Cc: Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo,
	Nauman Rafique, linux kernel mailing list

Add Kamezawa

Vivek Goyal wrote:
> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
>>>> Hi All,
>>>>
>>>> This patch enables cfq group hierarchical scheduling.
>>>>
>>>> With this patch, you can create a cgroup directory deeper than level 1.
>>>> Now, I/O bandwidth is distributed in a hierarchical way. For example:
>>>> We create cgroup directories as follows (the number represents the weight):
>>>>
>>>>             Root grp
>>>>            /       \
>>>>        grp_1(100) grp_2(400)
>>>>        /    \ 
>>>>   grp_3(200) grp_4(300)
>>>>
>>>> If grp_2, grp_3 and grp_4 are contending for I/O bandwidth,
>>>> grp_2 will share 80% of the total bandwidth.
>>>> For the sub-groups, grp_3 shares 8% (20% * 40%) and grp_4 shares 12% (20% * 60%).
>>>>
>>>> Design:
>>>>   o Each cfq group has its own group service tree. 
>>>>   o Each cfq group contains a "group schedule entity" (gse) that
>>>>     schedules on the parent cfq group's service tree.
>>>>   o Each cfq group contains a "queue schedule entity" (qse) that
>>>>     represents all cfqqs located in this cfq group. It schedules
>>>>     on this group's service tree. For the time being, the root group
>>>>     qse's weight is 1000, and a subgroup qse's weight is 500.
>>>>   o All gses and the qse which belong to the same cfq group schedule
>>>>     on the same group service tree.
>>> Hi Gui,
>>>
>>> Thanks for the patch. I have a few questions.
>>>
>>> - So what does the hierarchy look like w.r.t. the root group? Something as
>>>   follows?
>>>
>>>
>>> 			root
>>> 		       / | \
>>> 		     q1  q2 G1
>>>
>>> Assume there are two processes doing IO in the root group; q1 and q2 are
>>> the cfqq queues for those processes and G1 is the cgroup created by the user.
>>>
>>> If yes, then what algorithm do you use to do scheduling between q1, q2
>>> and G1? IOW, currently we have two algorithms operating in CFQ: one for
>>> cfqqs and the other for groups. The group algorithm does not use the logic
>>> of cfq_slice_offset().
>> Hi Vivek,
>>
>> This patch doesn't break the original scheduling logic, that is cfqg => st => cfqq.
>> If q1 and q2 are in the root group, I treat the q1 and q2 bundle as a queue sched
>> entity, and it will schedule on the root group service tree with G1, as follows:
>>
>>                          root group
>>                         /         \
>>                     qse(q1,q2)    gse(G1)
>>
> 
> Ok. That's interesting. That raises another question of how the hierarchy
> should look. IOW, how queues and groups should be treated in the
> hierarchy.
> 
> The CFS cpu scheduler treats queues and groups at the same level. That is as
> follows.
> 
> 			root
> 			/ | \
> 		       q1 q2 G1
> 
> In the past I had raised this question, and Jens and Corrado liked treating
> queues and groups at the same level.
> 
> Logically, q1, q2 and G1 are all children of root, so it makes sense to
> treat them at the same level and not bundle q1 and q2 into a single entity
> and group.

Hi Vivek,

IMO, this new approach keeps the original scheduling logic and keeps things
simple. This approach works for me, so I chose it.

> 
> One of the possible way forward could be this.
> 
> - Treat queue and group at same level (like CFS)
> 
> - Get rid of the cfq_slice_offset() logic. That means that without idling, there
>   will be no ioprio difference between cfq queues. I think that, as of
>   today, that logic helps in so few situations anyway that I would not mind
>   getting rid of it. Just that Jens should agree to it.
> 
> - With this new scheme, it will break the existing semantics of the root group
>   being at the same level as child groups. To avoid that, we can probably
>   implement two modes (flat and hierarchical), something similar to what the
>   memory cgroup controller has done. Maybe one tunable in the root cgroup of
>   blkio, "use_hierarchy". By default everything will be in flat mode, and
>   if the user wants hierarchical control, he needs to set use_hierarchy in the
>   root group.
> 
>   I think the memory controller provides the "use_hierarchy" tunable in each
>   cgroup. I am not sure why we need it in each cgroup and not just
>   in the root cgroup.

I think Kamezawa-san should be able to answer this question. :)

Thanks
Gui

> 
> Thanks
> Vivek 
> 
>> Thanks,
>> Gui
>>
>>> The reason being that they are different because CFQ wanted to provide ioprio
>>> service differentiation on SSDs, where slice idle is effectively zero. But
>>> that slice_offset() logic is approximate at best and is not suitable
>>> for group scheduling.
>>>
>>> I have yet to go through the patches in detail, so I thought I would ask
>>> you how you have handled the above.
>>>
>>> Vivek
>>>
>>>>   o cfq groups are allocated in a recursive manner; that means when a cfq
>>>>     group needs to be allocated, the upper level cfq groups are also
>>>>     allocated.
>>>>   o When a cfq group is served, not only is this cfq group charged but
>>>>     also its ancestors.
>>>>  block/cfq-iosched.c |  477 +++++++++++++++++++++++++++++++++++++-------------
>>>>  2 files changed, 353 insertions(+), 128 deletions(-)
>>>>
>>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>>> index 2fef1ef..aa06cb3 100644
>>>> --- a/block/blk-cgroup.c
>>>> +++ b/block/blk-cgroup.c
>>>> @@ -964,10 +964,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>>>>  		goto done;
>>>>  	}
>>>>  
>>>> -	/* Currently we do not support hierarchy deeper than two level (0,1) */
>>>> -	if (parent != cgroup->top_cgroup)
>>>> -		return ERR_PTR(-EPERM);
>>>> -
>>>>  	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
>>>>  	if (!blkcg)
>>>>  		return ERR_PTR(-ENOMEM);
>>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>>>> index f65c6f0..4660d89 100644
>>>> --- a/block/cfq-iosched.c
>>>> +++ b/block/cfq-iosched.c
>>>> @@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida);
>>>>  #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
>>>>  
>>>>  #define sample_valid(samples)	((samples) > 80)
>>>> -#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
>>>> +#define rb_entry_se(node)	\
>>>> +	rb_entry((node), struct io_sched_entity, rb_node)
>>>>  
>>>>  /*
>>>>   * Most of our rbtree usage is for sorting with min extraction, so
>>>> @@ -171,19 +172,40 @@ enum wl_type_t {
>>>>  	SYNC_WORKLOAD = 2
>>>>  };
>>>>  
>>>> -/* This is per cgroup per device grouping structure */
>>>> -struct cfq_group {
>>>> +/* 
>>>> + * This is the schedule entity which is scheduled on a group service tree.
>>>> + * It represents the schedule structure of a cfq group, or a bundle of cfq
>>>> + * queues located in a cfq group.
>>>> + */
>>>> +struct io_sched_entity {
>>>> +	struct cfq_rb_root *st;
>>>> +
>>>>  	/* group service_tree member */
>>>>  	struct rb_node rb_node;
>>>> -
>>>> -	/* group service_tree key */
>>>>  	u64 vdisktime;
>>>>  	unsigned int weight;
>>>>  	bool on_st;
>>>> +	bool is_group_se;
>>>> +	struct io_sched_entity *parent;
>>>> +};
>>>> +
>>>> +/* This is per cgroup per device grouping structure */
>>>> +struct cfq_group {
>>>> +	/* cfq group sched entity */
>>>> +	struct io_sched_entity group_se;
>>>> +
>>>> +	/* sched entity for cfqqs */
>>>> +	struct io_sched_entity queue_se;
>>>> +
>>>> +	/* Service tree for the cfq_groups and cfqqs set */
>>>> +	struct cfq_rb_root grp_service_tree;
>>>>  
>>>>  	/* number of cfqq currently on this group */
>>>>  	int nr_cfqq;
>>>>  
>>>> +	/* number of sub cfq groups */
>>>> +	int nr_subgp;
>>>> +
>>>>  	/* Per group busy queues average. Useful for workload slice calc. */
>>>>  	unsigned int busy_queues_avg[2];
>>>>  	/*
>>>> @@ -210,8 +232,6 @@ struct cfq_group {
>>>>   */
>>>>  struct cfq_data {
>>>>  	struct request_queue *queue;
>>>> -	/* Root service tree for cfq_groups */
>>>> -	struct cfq_rb_root grp_service_tree;
>>>>  	struct cfq_group root_group;
>>>>  
>>>>  	/*
>>>> @@ -399,6 +419,22 @@ static inline bool iops_mode(struct cfq_data *cfqd)
>>>>  		return false;
>>>>  }
>>>>  
>>>> +static inline struct cfq_group *cfqg_of_gse(struct io_sched_entity *se)
>>>> +{
>>>> +	if (se->is_group_se)
>>>> +		return container_of(se, struct cfq_group, group_se);
>>>> +	else
>>>> +		return NULL;
>>>> +}
>>>> +
>>>> +static inline struct cfq_group *cfqg_of_qse(struct io_sched_entity *se)
>>>> +{
>>>> +	if (!se->is_group_se)
>>>> +		return container_of(se, struct cfq_group, queue_se);
>>>> +	else
>>>> +		return NULL;
>>>> +}
>>>> +
>>>>  static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
>>>>  {
>>>>  	if (cfq_class_idle(cfqq))
>>>> @@ -522,12 +558,13 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>>>>  	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
>>>>  }
>>>>  
>>>> -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
>>>> +static inline u64 cfq_scale_slice(unsigned long delta,
>>>> +				  struct io_sched_entity *se)
>>>>  {
>>>>  	u64 d = delta << CFQ_SERVICE_SHIFT;
>>>>  
>>>>  	d = d * BLKIO_WEIGHT_DEFAULT;
>>>> -	do_div(d, cfqg->weight);
>>>> +	do_div(d, se->weight);
>>>>  	return d;
>>>>  }
>>>>  
>>>> @@ -552,16 +589,16 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
>>>>  static void update_min_vdisktime(struct cfq_rb_root *st)
>>>>  {
>>>>  	u64 vdisktime = st->min_vdisktime;
>>>> -	struct cfq_group *cfqg;
>>>> +	struct io_sched_entity *se;
>>>>  
>>>>  	if (st->active) {
>>>> -		cfqg = rb_entry_cfqg(st->active);
>>>> -		vdisktime = cfqg->vdisktime;
>>>> +		se = rb_entry_se(st->active);
>>>> +		vdisktime = se->vdisktime;
>>>>  	}
>>>>  
>>>>  	if (st->left) {
>>>> -		cfqg = rb_entry_cfqg(st->left);
>>>> -		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
>>>> +		se = rb_entry_se(st->left);
>>>> +		vdisktime = min_vdisktime(vdisktime, se->vdisktime);
>>>>  	}
>>>>  
>>>>  	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
>>>> @@ -591,9 +628,10 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>>>>  static inline unsigned
>>>>  cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>>  {
>>>> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>>> +	struct io_sched_entity *gse = &cfqg->queue_se;
>>>> +	struct cfq_rb_root *st = gse->st;
>>>>  
>>>> -	return cfq_target_latency * cfqg->weight / st->total_weight;
>>>> +	return cfq_target_latency * gse->weight / st->total_weight;
>>>>  }
>>>>  
>>>>  static inline void
>>>> @@ -756,13 +794,13 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
>>>>  	return NULL;
>>>>  }
>>>>  
>>>> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
>>>> +static struct io_sched_entity *cfq_rb_first_se(struct cfq_rb_root *root)
>>>>  {
>>>>  	if (!root->left)
>>>>  		root->left = rb_first(&root->rb);
>>>>  
>>>>  	if (root->left)
>>>> -		return rb_entry_cfqg(root->left);
>>>> +		return rb_entry_se(root->left);
>>>>  
>>>>  	return NULL;
>>>>  }
>>>> @@ -819,25 +857,25 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
>>>>  }
>>>>  
>>>>  static inline s64
>>>> -cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
>>>> +se_key(struct cfq_rb_root *st, struct io_sched_entity *se)
>>>>  {
>>>> -	return cfqg->vdisktime - st->min_vdisktime;
>>>> +	return se->vdisktime - st->min_vdisktime;
>>>>  }
>>>>  
>>>>  static void
>>>> -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
>>>> +__io_sched_entity_add(struct cfq_rb_root *st, struct io_sched_entity *se)
>>>>  {
>>>>  	struct rb_node **node = &st->rb.rb_node;
>>>>  	struct rb_node *parent = NULL;
>>>> -	struct cfq_group *__cfqg;
>>>> -	s64 key = cfqg_key(st, cfqg);
>>>> +	struct io_sched_entity *__se;
>>>> +	s64 key = se_key(st, se);
>>>>  	int left = 1;
>>>>  
>>>>  	while (*node != NULL) {
>>>>  		parent = *node;
>>>> -		__cfqg = rb_entry_cfqg(parent);
>>>> +		__se = rb_entry_se(parent);
>>>>  
>>>> -		if (key < cfqg_key(st, __cfqg))
>>>> +		if (key < se_key(st, __se))
>>>>  			node = &parent->rb_left;
>>>>  		else {
>>>>  			node = &parent->rb_right;
>>>> @@ -846,47 +884,80 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
>>>>  	}
>>>>  
>>>>  	if (left)
>>>> -		st->left = &cfqg->rb_node;
>>>> +		st->left = &se->rb_node;
>>>>  
>>>> -	rb_link_node(&cfqg->rb_node, parent, node);
>>>> -	rb_insert_color(&cfqg->rb_node, &st->rb);
>>>> +	rb_link_node(&se->rb_node, parent, node);
>>>> +	rb_insert_color(&se->rb_node, &st->rb);
>>>>  }
>>>>  
>>>> -static void
>>>> -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>> +static void io_sched_entity_add(struct cfq_rb_root *st,
>>>> +				struct io_sched_entity *se)
>>>>  {
>>>> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>>> -	struct cfq_group *__cfqg;
>>>>  	struct rb_node *n;
>>>> +	struct io_sched_entity *__se;
>>>>  
>>>> -	cfqg->nr_cfqq++;
>>>> -	if (cfqg->on_st)
>>>> -		return;
>>>>  
>>>> +	if (se->on_st)
>>>> +		return;
>>>>  	/*
>>>>  	 * Currently put the group at the end. Later implement something
>>>>  	 * so that groups get lesser vtime based on their weights, so that
>>>>  	 * a group does not lose everything if it was not continuously backlogged.
>>>>  	 */
>>>> -	n = rb_last(&st->rb);
>>>> +	n = rb_last(&se->st->rb);
>>>>  	if (n) {
>>>> -		__cfqg = rb_entry_cfqg(n);
>>>> -		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
>>>> +		__se = rb_entry_se(n);
>>>> +		se->vdisktime = __se->vdisktime + CFQ_IDLE_DELAY;
>>>>  	} else
>>>> -		cfqg->vdisktime = st->min_vdisktime;
>>>> +		se->vdisktime = st->min_vdisktime;
>>>>  
>>>> -	__cfq_group_service_tree_add(st, cfqg);
>>>> -	cfqg->on_st = true;
>>>> -	st->total_weight += cfqg->weight;
>>>> +	__io_sched_entity_add(se->st, se);
>>>> +	se->on_st = true;
>>>> +	st->total_weight += se->weight;
>>>> +}
>>>> +
>>>> +static void
>>>> +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>> +{
>>>> +	struct io_sched_entity *gse = &cfqg->group_se;
>>>> +	struct io_sched_entity *qse = &cfqg->queue_se;
>>>> +	struct cfq_group *__cfqg;
>>>> +
>>>> +	cfqg->nr_cfqq++;
>>>> +
>>>> +	io_sched_entity_add(qse->st, qse);
>>>> +
>>>> +	while (gse && gse->parent) {
>>>> +		if (gse->on_st)
>>>> +			return;
>>>> +		io_sched_entity_add(gse->st, gse);
>>>> +		gse = gse->parent;
>>>> +		__cfqg = cfqg_of_gse(gse);
>>>> +		__cfqg->nr_subgp++;
>>>> +	}
>>>> +}
>>>> +
>>>> +static void io_sched_entity_del(struct io_sched_entity *se)
>>>> +{
>>>> +	if (!RB_EMPTY_NODE(&se->rb_node))
>>>> +		cfq_rb_erase(&se->rb_node, se->st);
>>>> +
>>>> +	se->on_st = false;
>>>> +	se->st->total_weight -= se->weight;
>>>>  }
>>>>  
>>>>  static void
>>>>  cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>>  {
>>>> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>>> +	struct io_sched_entity *gse = &cfqg->group_se;
>>>> +	struct io_sched_entity *qse = &cfqg->queue_se;
>>>> +	struct cfq_group *__cfqg, *p_cfqg;
>>>> +
>>>> +	if (gse->st && gse->st->active == &gse->rb_node)
>>>> +		gse->st->active = NULL;
>>>>  
>>>> -	if (st->active == &cfqg->rb_node)
>>>> -		st->active = NULL;
>>>> +	if (qse->st && qse->st->active == &qse->rb_node)
>>>> +		qse->st->active = NULL;
>>>>  
>>>>  	BUG_ON(cfqg->nr_cfqq < 1);
>>>>  	cfqg->nr_cfqq--;
>>>> @@ -895,13 +966,25 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>>  	if (cfqg->nr_cfqq)
>>>>  		return;
>>>>  
>>>> -	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>>>> -	cfqg->on_st = false;
>>>> -	st->total_weight -= cfqg->weight;
>>>> -	if (!RB_EMPTY_NODE(&cfqg->rb_node))
>>>> -		cfq_rb_erase(&cfqg->rb_node, st);
>>>> -	cfqg->saved_workload_slice = 0;
>>>> -	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>>>> +	/* dequeue queue se from group */
>>>> +	io_sched_entity_del(qse);
>>>> +
>>>> +	if (cfqg->nr_subgp)
>>>> +		return;
>>>> +
>>>> +	/* never dequeue the root group */
>>>> +	while (gse && gse->parent) {
>>>> +		__cfqg = cfqg_of_gse(gse);
>>>> +		p_cfqg = cfqg_of_gse(gse->parent);
>>>> +		io_sched_entity_del(gse);
>>>> +		cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>>>> +		cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>>>> +		__cfqg->saved_workload_slice = 0;
>>>> +		gse = gse->parent;
>>>> +		p_cfqg->nr_subgp--;
>>>> +		if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>>>> +			return;
>>>> +	}
>>>>  }
>>>>  
>>>>  static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>>>> @@ -933,7 +1016,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>>>>  static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>>>  				struct cfq_queue *cfqq)
>>>>  {
>>>> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>>> +	struct io_sched_entity *gse = &cfqg->group_se;
>>>> +	struct io_sched_entity *qse = &cfqg->queue_se;
>>>>  	unsigned int used_sl, charge;
>>>>  	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
>>>>  			- cfqg->service_tree_idle.count;
>>>> @@ -946,10 +1030,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>>>  	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
>>>>  		charge = cfqq->allocated_slice;
>>>>  
>>>> -	/* Can't update vdisktime while group is on service tree */
>>>> -	cfq_rb_erase(&cfqg->rb_node, st);
>>>> -	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
>>>> -	__cfq_group_service_tree_add(st, cfqg);
>>>> +	/*
>>>> +	 *  update queue se's vdisktime.
>>>> +	 *  Can't update vdisktime while group is on service tree.
>>>> +	 */
>>>> +
>>>> +	cfq_rb_erase(&qse->rb_node, qse->st);
>>>> +	qse->vdisktime += cfq_scale_slice(charge, qse);
>>>> +	__io_sched_entity_add(qse->st, qse);
>>>> +	if (&qse->rb_node == qse->st->active)
>>>> +		qse->st->active = NULL;
>>>> +
>>>> +	while (gse && gse->parent) {
>>>> +		cfq_rb_erase(&gse->rb_node, gse->st);
>>>> +		gse->vdisktime += cfq_scale_slice(charge, gse);
>>>> +		__io_sched_entity_add(gse->st, gse);
>>>> +		if (&gse->rb_node == gse->st->active)
>>>> +			gse->st->active = NULL;
>>>> +		gse = gse->parent;
>>>> +	}
>>>>  
>>>>  	/* This group is being expired. Save the context */
>>>>  	if (time_after(cfqd->workload_expires, jiffies)) {
>>>> @@ -960,8 +1059,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>>>  	} else
>>>>  		cfqg->saved_workload_slice = 0;
>>>>  
>>>> -	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
>>>> -					st->min_vdisktime);
>>>> +	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", gse->vdisktime,
>>>> +		     gse->st->min_vdisktime);
>>>> +
>>>>  	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
>>>>  			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
>>>>  			iops_mode(cfqd), cfqq->nr_sectors);
>>>> @@ -977,39 +1077,84 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
>>>>  	return NULL;
>>>>  }
>>>>  
>>>> +static void cfq_put_cfqg(struct cfq_group *cfqg)
>>>> +{
>>>> +	struct cfq_rb_root *st;
>>>> +	int i, j;
>>>> +	struct io_sched_entity *gse;
>>>> +	struct cfq_group *p_cfqg;
>>>> +
>>>> +	BUG_ON(atomic_read(&cfqg->ref) <= 0);
>>>> +	if (!atomic_dec_and_test(&cfqg->ref))
>>>> +		return;
>>>> +	for_each_cfqg_st(cfqg, i, j, st)
>>>> +		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
>>>> +
>>>> +	gse = &cfqg->group_se;
>>>> +	if (gse->parent) {
>>>> +		p_cfqg = cfqg_of_gse(gse->parent);
>>>> +		/* Drop the reference this child took on its parent */
>>>> +		atomic_dec(&p_cfqg->ref);
>>>> +	}
>>>> +
>>>> +	kfree(cfqg);
>>>> +}
>>>> +
>>>> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>> +{
>>>> +	/* Something wrong if we are trying to remove same group twice */
>>>> +	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
>>>> +
>>>> +	hlist_del_init(&cfqg->cfqd_node);
>>>> +
>>>> +	/*
>>>> +	 * Put the reference taken at the time of creation so that when all
>>>> +	 * queues are gone, group can be destroyed.
>>>> +	 */
>>>> +	cfq_put_cfqg(cfqg);
>>>> +}
>>>> +
>>>>  void
>>>>  cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
>>>>  {
>>>> -	cfqg_of_blkg(blkg)->weight = weight;
>>>> +	struct cfq_group *cfqg;
>>>> +	struct io_sched_entity *gse;
>>>> +
>>>> +	cfqg = cfqg_of_blkg(blkg);
>>>> +	gse = &cfqg->group_se;
>>>> +	gse->weight = weight;
>>>>  }
>>>>  
>>>> -static struct cfq_group *
>>>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>>> +static void init_group_queue_entity(struct blkio_cgroup *blkcg,
>>>> +				   struct cfq_group *cfqg)
>>>> +{
>>>> +	struct io_sched_entity *gse = &cfqg->group_se;
>>>> +	struct io_sched_entity *qse = &cfqg->queue_se;
>>>> +
>>>> +	gse->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>>>> +	RB_CLEAR_NODE(&gse->rb_node);
>>>> +	gse->is_group_se = true;
>>>> +	gse->on_st = false;
>>>> +
>>>> +	/* Set to 500 for the time being */
>>>> +	qse->weight = 500;
>>>> +	RB_CLEAR_NODE(&qse->rb_node);
>>>> +	qse->is_group_se = false;
>>>> +	qse->on_st = false;
>>>> +}
>>>> +
>>>> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
>>>> +		      struct cfq_group *cfqg)
>>>>  {
>>>> -	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>>> -	struct cfq_group *cfqg = NULL;
>>>> -	void *key = cfqd;
>>>>  	int i, j;
>>>>  	struct cfq_rb_root *st;
>>>> -	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>>  	unsigned int major, minor;
>>>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>>  
>>>> -	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>>> -	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>>> -		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>>> -		cfqg->blkg.dev = MKDEV(major, minor);
>>>> -		goto done;
>>>> -	}
>>>> -	if (cfqg || !create)
>>>> -		goto done;
>>>> -
>>>> -	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>>>> -	if (!cfqg)
>>>> -		goto done;
>>>> +	cfqg->grp_service_tree = CFQ_RB_ROOT;
>>>>  
>>>>  	for_each_cfqg_st(cfqg, i, j, st)
>>>>  		*st = CFQ_RB_ROOT;
>>>> -	RB_CLEAR_NODE(&cfqg->rb_node);
>>>>  
>>>>  	/*
>>>>  	 * Take the initial reference that will be released on destroy
>>>> @@ -1023,11 +1168,102 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>>>  	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>>>  	cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>>>>  					MKDEV(major, minor));
>>>> -	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>>>> -
>>>> +	init_group_queue_entity(blkcg, cfqg);
>>>>  	/* Add group on cfqd list */
>>>>  	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>>>> +}
>>>> +
>>>> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>> +{
>>>> +	if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
>>>> +		cfq_destroy_cfqg(cfqd, cfqg);
>>>> +}
>>>> +
>>>> +static void cfqg_set_parent(struct cfq_group *cfqg, struct cfq_group *p_cfqg)
>>>> +{
>>>> +	struct io_sched_entity *gse = &cfqg->group_se;
>>>> +	struct io_sched_entity *qse = &cfqg->queue_se;
>>>> +	struct io_sched_entity *p_gse = &p_cfqg->group_se;
>>>> +
>>>> +	gse->parent = p_gse;
>>>> +	gse->st = &p_cfqg->grp_service_tree;
>>>> +
>>>> +	qse->parent = gse;
>>>> +	qse->st = &cfqg->grp_service_tree;
>>>> +
>>>> +	/* child reference */
>>>> +	atomic_inc(&p_cfqg->ref);
>>>> +}
>>>> +
>>>> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>>>> +{
>>>> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>>> +	struct blkio_cgroup *p_blkcg;
>>>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>> +	unsigned int major, minor;
>>>> +	struct cfq_group *cfqg, *p_cfqg;
>>>> +	void *key = cfqd;
>>>> +	int ret;
>>>> +
>>>> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>>> +	if (cfqg) {
>>>> +		if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>>> +			sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>>> +			cfqg->blkg.dev = MKDEV(major, minor);
>>>> +		}
>>>> +		/* chain has already been built */
>>>> +		return 0;
>>>> +	}
>>>> +
>>>> +	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>>>> +	if (!cfqg)
>>>> +		return -1;
>>>> +
>>>> +	init_cfqg(cfqd, blkcg, cfqg);
>>>> +
>>>> +	/* Already at the top group */
>>>> +	if (!cgroup->parent)
>>>> +		return 0;
>>>> +
>>>> +	ret = cfqg_chain_alloc(cfqd, cgroup->parent);
>>>> +	if (ret == -1) {
>>>> +		uninit_cfqg(cfqd, cfqg);
>>>> +		return -1;
>>>> +	}
>>>> +
>>>> +	p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
>>>> +	p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
>>>> +	BUG_ON(p_cfqg == NULL);
>>>> +
>>>> +	cfqg_set_parent(cfqg, p_cfqg);
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static struct cfq_group *
>>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>>> +{
>>>> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>>> +	struct cfq_group *cfqg = NULL;
>>>> +	void *key = cfqd;
>>>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>> +	unsigned int major, minor;
>>>> +	int ret;
>>>>  
>>>> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>>> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>>> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>>> +		cfqg->blkg.dev = MKDEV(major, minor);
>>>> +		goto done;
>>>> +	}
>>>> +	if (cfqg || !create)
>>>> +		goto done;
>>>> +
>>>> +	ret = cfqg_chain_alloc(cfqd, cgroup);
>>>> +	if (!ret) {
>>>> +		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>>> +		BUG_ON(cfqg == NULL);
>>>> +		goto done;
>>>> +	}
>>>>  done:
>>>>  	return cfqg;
>>>>  }
>>>> @@ -1067,33 +1303,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
>>>>  	atomic_inc(&cfqq->cfqg->ref);
>>>>  }
>>>>  
>>>> -static void cfq_put_cfqg(struct cfq_group *cfqg)
>>>> -{
>>>> -	struct cfq_rb_root *st;
>>>> -	int i, j;
>>>> -
>>>> -	BUG_ON(atomic_read(&cfqg->ref) <= 0);
>>>> -	if (!atomic_dec_and_test(&cfqg->ref))
>>>> -		return;
>>>> -	for_each_cfqg_st(cfqg, i, j, st)
>>>> -		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
>>>> -	kfree(cfqg);
>>>> -}
>>>> -
>>>> -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>> -{
>>>> -	/* Something wrong if we are trying to remove same group twice */
>>>> -	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
>>>> -
>>>> -	hlist_del_init(&cfqg->cfqd_node);
>>>> -
>>>> -	/*
>>>> -	 * Put the reference taken at the time of creation so that when all
>>>> -	 * queues are gone, group can be destroyed.
>>>> -	 */
>>>> -	cfq_put_cfqg(cfqg);
>>>> -}
>>>> -
>>>>  static void cfq_release_cfq_groups(struct cfq_data *cfqd)
>>>>  {
>>>>  	struct hlist_node *pos, *n;
>>>> @@ -1668,9 +1877,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>>>  	if (cfqq == cfqd->active_queue)
>>>>  		cfqd->active_queue = NULL;
>>>>  
>>>> -	if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
>>>> -		cfqd->grp_service_tree.active = NULL;
>>>> -
>>>>  	if (cfqd->active_cic) {
>>>>  		put_io_context(cfqd->active_cic->ioc);
>>>>  		cfqd->active_cic = NULL;
>>>> @@ -2173,17 +2379,26 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>>  	cfqd->noidle_tree_requires_idle = false;
>>>>  }
>>>>  
>>>> +
>>>>  static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
>>>>  {
>>>> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>>> +	struct cfq_group *root_group = &cfqd->root_group;
>>>> +	struct cfq_rb_root *st = &root_group->grp_service_tree;
>>>>  	struct cfq_group *cfqg;
>>>> +	struct io_sched_entity *se;
>>>>  
>>>> -	if (RB_EMPTY_ROOT(&st->rb))
>>>> -		return NULL;
>>>> -	cfqg = cfq_rb_first_group(st);
>>>> -	st->active = &cfqg->rb_node;
>>>> -	update_min_vdisktime(st);
>>>> -	return cfqg;
>>>> +	do {
>>>> +		se = cfq_rb_first_se(st);
>>>> +		if (!se)
>>>> +			return NULL;
>>>> +		st->active = &se->rb_node;
>>>> +		update_min_vdisktime(st);
>>>> +		cfqg = cfqg_of_qse(se);
>>>> +		if (cfqg)
>>>> +			return cfqg;
>>>> +		cfqg = cfqg_of_gse(se);
>>>> +		st = &cfqg->grp_service_tree;
>>>> +	} while (1);
>>>>  }
>>>>  
>>>>  static void cfq_choose_cfqg(struct cfq_data *cfqd)
>>>> @@ -2215,15 +2430,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>>>>  	if (!cfqq)
>>>>  		goto new_queue;
>>>>  
>>>> +
>>>>  	if (!cfqd->rq_queued)
>>>>  		return NULL;
>>>>  
>>>> +
>>>>  	/*
>>>>  	 * We were waiting for group to get backlogged. Expire the queue
>>>>  	 */
>>>>  	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
>>>>  		goto expire;
>>>>  
>>>> +
>>>>  	/*
>>>>  	 * The active queue has run out of time, expire it and select new.
>>>>  	 */
>>>> @@ -2245,6 +2463,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>>>>  			goto check_group_idle;
>>>>  	}
>>>>  
>>>> +
>>>>  	/*
>>>>  	 * The active queue has requests and isn't expired, allow it to
>>>>  	 * dispatch.
>>>> @@ -2252,6 +2471,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>>>>  	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
>>>>  		goto keep_queue;
>>>>  
>>>> +
>>>>  	/*
>>>>  	 * If another queue has a request waiting within our mean seek
>>>>  	 * distance, let it run.  The expire code will check for close
>>>> @@ -2505,6 +2725,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
>>>>  	}
>>>>  
>>>>  	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
>>>> +
>>>>  	return 1;
>>>>  }
>>>>  
>>>> @@ -3854,17 +4075,25 @@ static void *cfq_init_queue(struct request_queue *q)
>>>>  
>>>>  	cfqd->cic_index = i;
>>>>  
>>>> -	/* Init root service tree */
>>>> -	cfqd->grp_service_tree = CFQ_RB_ROOT;
>>>> -
>>>>  	/* Init root group */
>>>>  	cfqg = &cfqd->root_group;
>>>> +	cfqg->grp_service_tree = CFQ_RB_ROOT;
>>>>  	for_each_cfqg_st(cfqg, i, j, st)
>>>>  		*st = CFQ_RB_ROOT;
>>>> -	RB_CLEAR_NODE(&cfqg->rb_node);
>>>> -
>>>> +	cfqg->group_se.is_group_se = true;
>>>> +	RB_CLEAR_NODE(&cfqg->group_se.rb_node);
>>>> +	cfqg->group_se.on_st = false;
>>>>  	/* Give preference to root group over other groups */
>>>> -	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
>>>> +	cfqg->group_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
>>>> +	cfqg->group_se.parent = NULL;
>>>> +	cfqg->group_se.st = NULL;
>>>> +
>>>> +	cfqg->queue_se.is_group_se = false;
>>>> +	RB_CLEAR_NODE(&cfqg->queue_se.rb_node);
>>>> +	cfqg->queue_se.on_st = false;
>>>> +	cfqg->queue_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
>>>> +	cfqg->queue_se.parent = &cfqg->group_se;
>>>> +	cfqg->queue_se.st = &cfqg->grp_service_tree;
>>>>  
>>>>  #ifdef CONFIG_CFQ_GROUP_IOSCHED
>>>>  	/*
>>>> -- 
>>>> 1.6.5.2
>>>>
>>>>
> 
> 

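As an aside, the descent in the new cfq_get_next_cfqg() above can be
modelled in plain C along these lines (a userspace sketch with invented
names, reducing the rbtree to a linear "smallest vdisktime" scan; not
the kernel code itself):

/*
 * Userspace sketch of the descent in cfq_get_next_cfqg(): starting at
 * the root group's service tree, pick the entity with the smallest
 * vdisktime; a queue entity (qse) ends the walk, a group entity (gse)
 * descends into that group's own tree.
 */
#include <stdio.h>

struct group;

struct sched_entity {
	int is_group;                 /* gse or qse */
	unsigned long long vdisktime;
	struct group *my_group;       /* group this entity stands for */
};

struct group {
	const char *name;
	struct sched_entity *tree[8]; /* stand-in for the rbtree */
	int nr;
};

static struct sched_entity *first_entity(struct group *g)
{
	struct sched_entity *best = NULL;

	for (int i = 0; i < g->nr; i++)
		if (!best || g->tree[i]->vdisktime < best->vdisktime)
			best = g->tree[i];
	return best;
}

static struct group *next_group(struct group *root)
{
	struct group *g = root;

	for (;;) {
		struct sched_entity *se = first_entity(g);

		if (!se)
			return NULL;         /* nothing backlogged */
		if (!se->is_group)
			return se->my_group; /* qse: dispatch from here */
		g = se->my_group;            /* gse: descend one level */
	}
}

int main(void)
{
	static struct group child = { "child" };
	static struct group root = { "root" };
	static struct sched_entity child_qse = { 0, 10, &child };
	static struct sched_entity child_gse = { 1, 50, &child };
	static struct sched_entity root_qse = { 0, 100, &root };

	child.tree[child.nr++] = &child_qse;
	root.tree[root.nr++] = &root_qse;
	root.tree[root.nr++] = &child_gse;

	/* gse(50) beats the root qse(100), then child's qse(10) wins */
	printf("dispatch from group: %s\n", next_group(&root)->name);
	return 0;
}
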
^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-08-31 19:25         ` Vivek Goyal
@ 2010-09-01  8:50           ` Gui Jianfeng
  2010-09-01 15:49             ` Nauman Rafique
  2010-09-02  2:20             ` Vivek Goyal
  0 siblings, 2 replies; 19+ messages in thread
From: Gui Jianfeng @ 2010-09-01  8:50 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Nauman Rafique, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki

Vivek Goyal wrote:
> On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote:
>> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
>>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote:
>>>> Vivek Goyal wrote:
>>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
>>>>>> Hi All,
>>>>>>
>>>>>> This patch enables cfq group hierarchical scheduling.
>>>>>>
>>>>>> With this patch, you can create a cgroup directory deeper than level 1.
>>>>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example:
>>>>>> We create cgroup directories as following(the number represents weight):
>>>>>>
>>>>>>             Root grp
>>>>>>            /       \
>>>>>>        grp_1(100) grp_2(400)
>>>>>>        /    \
>>>>>>   grp_3(200) grp_4(300)
>>>>>>
>>>>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth,
>>>>>> grp_2 will share 80% of total bandwidth.
>>>>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%)
>>>>>>
>>>>>> Design:
>>>>>>   o Each cfq group has its own group service tree.
>>>>>>   o Each cfq group contains a "group schedule entity" (gse) that
>>>>>>     schedules on parent cfq group's service tree.
>>>>>>   o Each cfq group contains a "queue schedule entity"(qse), it
>>>>>>     represents all cfqqs located on this cfq group. It schedules
>>>>>>     on this group's service tree. For the time being, root group
>>>>>>     qse's weight is 1000, and subgroup qse's weight is 500.
>>>>>>   o All gses and qse which belones to a same cfq group schedules
>>>>>>     on the same group service tree.
>>>>> Hi Gui,
>>>>>
>>>>> Thanks for the patch. I have few questions.
>>>>>
>>>>> - So how does the hierarchy look like, w.r.t root group. Something as
>>>>>   follows?
>>>>>
>>>>>
>>>>>                     root
>>>>>                    / | \
>>>>>                  q1  q2 G1
>>>>>
>>>>> Assume there are two processes doin IO in root group and q1 and q2 are
>>>>> cfqq queues for those processes and G1 is the cgroup created by user.
>>>>>
>>>>> If yes, then what algorithm do you use to do scheduling between q1, q2
>>>>> and G1? IOW, currently we have two algorithms operating in CFQ. One for
>>>>> cfqq and other for groups. Group algorithm does not use the logic of
>>>>> cfq_slice_offset().
>>>> Hi Vivek,
>>>>
>>>> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq.
>>>> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and
>>>> it will schedule on root group service with G1, as following:
>>>>
>>>>                          root group
>>>>                         /         \
>>>>                     qse(q1,q2)    gse(G1)
>>>>
>>> Ok. That's interesting. That raises another question that how hierarchy
>>> should look like. IOW, how queue and groups should be treated in
>>> hierarchy.
>>>
>>> CFS cpu scheduler treats queues and group at the same level. That is as
>>> follows.
>>>
>>>                        root
>>>                        / | \
>>>                       q1 q2 G1
>>>
>>> In the past I had raised this question and Jens and corrado liked treating
>>> queues and group at same level.
>>>
>>> Logically, q1, q2 and G1 are all children of root, so it makes sense to
>>> treat them at same level and not group q1 and q2 in to a single entity and
>>> group.
>>>
>>> One of the possible way forward could be this.
>>>
>>> - Treat queue and group at same level (like CFS)
>>>
>>> - Get rid of cfq_slice_offset() logic. That means without idling on, there
>>>  will be no ioprio difference between cfq queues. I think anyway as of
>>>  today that logic helps in so little situations that I would not mind
>>>  getting rid of it. Just that Jens should agree to it.
>>>
>>> - With this new scheme, it will break the existing semantics of root group
>>>  being at same level as child groups. To avoid that, we can probably
>>>  implement two modes (flat and hierarchical), something similar to what
>>>  memory cgroup controller has done. May be one tunable in root cgroup of
>>>  blkio "use_hierarchy".  By default everything will be in flat mode and
>>>  if user wants hiearchical control, he needs to set user_hierarchy in
>>>  root group.
>> Vivek, may be I am reading you wrong here. But you are first
>> suggesting to add more complexity to treat queues and group at the
>> same level. Then you are suggesting add even more complexity to fix
>> the problems caused by that approach.
>>
>> Why do we need to treat queues and group at the same level? "CFS does
>> it" is not a good argument.
> 
> Sure it is not a very good argument but at the same time one would need
> a very good argument that why we should do things differently.
> 
> - If a user has mounted cpu and blkio controller together and both the
>   controllers are viewing the same hierarchy differently, then it is
>   odd. We need a good reason that why different arrangement makes sense.

Hi Vivek,

Even if we mount cpu and blkio together, to me, it's ok for cpu and blkio
to have their own logic, since they are totally different cgroup subsystems.

> 
> - To me, both group and cfq queue are children of root group and it
>   makes sense to treat them independent childrens instead of putting
>   all the queues in one logical group which inherits the weight of
>   parent.
> 
> - With this new scheme, I am finding it hard to visualize the hierachy.
>   How do you assign the weights to queue entities of a group. It is more
>   like a invisible group with-in group. We shall have to create new
>   tunable which can speicy the weight for this hidden group.

For the time being, the root "qse" weight is 1000 and the others are 500; they
don't inherit the weight of the parent. I was thinking that maybe we can
determine the qse weight in terms of the number of queues and the weights in
this group and its subgroups.
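
To make the arithmetic concrete, here is a minimal userspace sketch of how
the shares in the example at the top of the thread fall out under this
scheme (illustrative only: plain structs stand in for the kernel's service
trees, and the idle root/grp_1 qse entities are simply omitted):

/*
 * Minimal userspace model of the hierarchical weight arithmetic.
 */
#include <stdio.h>

struct entity {
	const char *name;
	int weight;
	int nchild;
	struct entity *child[4];
};

/* an entity's share = parent's share * weight / sum of sibling weights */
static void walk(struct entity *e, double parent_share, int sibling_sum)
{
	double share = parent_share * e->weight / sibling_sum;
	int i, sum = 0;

	printf("%-6s %5.1f%%\n", e->name, share * 100.0);
	for (i = 0; i < e->nchild; i++)
		sum += e->child[i]->weight;
	for (i = 0; i < e->nchild; i++)
		walk(e->child[i], share, sum);
}

int main(void)
{
	struct entity g3 = { "grp_3", 200, 0, { 0 } };
	struct entity g4 = { "grp_4", 300, 0, { 0 } };
	struct entity g1 = { "grp_1", 100, 2, { &g3, &g4 } };
	struct entity g2 = { "grp_2", 400, 0, { 0 } };

	/* grp_1 and grp_2 compete on the root service tree */
	walk(&g1, 1.0, g1.weight + g2.weight);
	walk(&g2, 1.0, g1.weight + g2.weight);
	return 0;
}

This prints 80% for grp_2, 8% for grp_3 and 12% for grp_4, matching the
figures in the patch description.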

Thanks,
Gui

> 
> 
> So in summary I am liking the "queue at same level as group" scheme for
> two reasons.
> 
> - It is more intutive to visualize and implement. It follows the true
>   hierarchy as seen by cgroup file system.
> 
> - CFS has already implemented this scheme. So we need a strong arguemnt
>   to justify why we should not follow the same thing. Especially for
>   the case where user has co-mounted cpu and blkio controller.
> 
> - It can achieve the same goal as "hidden group" proposal just by 
>   creating a cgroup explicitly and moving all threads in that group.
> 
> Why do you think that "hidden group" proposal is better than "treating
> queue at same level as group" ? 
> 
> Thanks
> Vivek
> 
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-09-01  8:48       ` Gui Jianfeng
@ 2010-09-01  9:02         ` KAMEZAWA Hiroyuki
  2010-09-02  2:29           ` Vivek Goyal
  0 siblings, 1 reply; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-01  9:02 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Vivek Goyal, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, Nauman Rafique, linux kernel mailing list

On Wed, 01 Sep 2010 16:48:25 +0800
Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote:

> Add Kamezawa
> 
> Vivek Goyal wrote:
> > On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote:
> >> Vivek Goyal wrote:
> >>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
> >>>> Hi All,
> >>>>
> >>>> This patch enables cfq group hierarchical scheduling.
> >>>>
> >>>> With this patch, you can create a cgroup directory deeper than level 1.
> >>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example:
> >>>> We create cgroup directories as following(the number represents weight):
> >>>>
> >>>>             Root grp
> >>>>            /       \
> >>>>        grp_1(100) grp_2(400)
> >>>>        /    \ 
> >>>>   grp_3(200) grp_4(300)
> >>>>
> >>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth,
> >>>> grp_2 will share 80% of total bandwidth.
> >>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%)
> >>>>
> >>>> Design:
> >>>>   o Each cfq group has its own group service tree. 
> >>>>   o Each cfq group contains a "group schedule entity" (gse) that 
> >>>>     schedules on parent cfq group's service tree.
> >>>>   o Each cfq group contains a "queue schedule entity"(qse), it
> >>>>     represents all cfqqs located on this cfq group. It schedules
> >>>>     on this group's service tree. For the time being, root group
> >>>>     qse's weight is 1000, and subgroup qse's weight is 500.
> >>>>   o All gses and qse which belones to a same cfq group schedules
> >>>>     on the same group service tree.
> >>> Hi Gui,
> >>>
> >>> Thanks for the patch. I have few questions.
> >>>
> >>> - So how does the hierarchy look like, w.r.t root group. Something as
> >>>   follows?
> >>>
> >>>
> >>> 			root
> >>> 		       / | \
> >>> 		     q1  q2 G1
> >>>
> >>> Assume there are two processes doin IO in root group and q1 and q2 are
> >>> cfqq queues for those processes and G1 is the cgroup created by user.
> >>>
> >>> If yes, then what algorithm do you use to do scheduling between q1, q2
> >>> and G1? IOW, currently we have two algorithms operating in CFQ. One for
> >>> cfqq and other for groups. Group algorithm does not use the logic of
> >>> cfq_slice_offset().
> >> Hi Vivek,
> >>
> >> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq.
> >> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and
> >> it will schedule on root group service with G1, as following:
> >>
> >>                          root group
> >>                         /         \
> >>                     qse(q1,q2)    gse(G1)
> >>
> > 
> > Ok. That's interesting. That raises another question that how hierarchy
> > should look like. IOW, how queue and groups should be treated in
> > hierarchy.
> > 
> > CFS cpu scheduler treats queues and group at the same level. That is as
> > follows.
> > 
> > 			root
> > 			/ | \
> > 		       q1 q2 G1
> > 
> > In the past I had raised this question and Jens and corrado liked treating
> > queues and group at same level.
> > 
> > Logically, q1, q2 and G1 are all children of root, so it makes sense to
> > treat them at same level and not group q1 and q2 in to a single entity and
> > group.
> 
> Hi Vivek,
> 
> IMO, this new approach keeps the original scheduling logic, and keep things
> simple. And to me, this approach works for me. So i choose it.
> 
> > 
> > One of the possible way forward could be this.
> > 
> > - Treat queue and group at same level (like CFS)
> > 
> > - Get rid of cfq_slice_offset() logic. That means without idling on, there
> >   will be no ioprio difference between cfq queues. I think anyway as of 
> >   today that logic helps in so little situations that I would not mind
> >   getting rid of it. Just that Jens should agree to it.
> > 
> > - With this new scheme, it will break the existing semantics of root group
> >   being at same level as child groups. To avoid that, we can probably
> >   implement two modes (flat and hierarchical), something similar to what
> >   memory cgroup controller has done. May be one tunable in root cgroup of
> >   blkio "use_hierarchy".  By default everything will be in flat mode and
> >   if user wants hiearchical control, he needs to set user_hierarchy in
> >   root group.
> > 
> >   I think memory controller provides "use_hierarchy" tunable in each
> >   cgroup. I am not sure why do we need it in each cgroup and not just
> >   in root cgroup.
> 
> I think Kamezawa-san should be able to answer this question. :)
> 

First, please be aware that hierarchical accounting is _very_ slow.
Please measure how slow hierarchical accounting (of 4-6 levels) is ;)

Then, there are 2 use cases.

 1)  root/to/some/directory/A
                           /B
                           /C
                            ....
     A, B, C, ... are all flat cgroups with no relationship; they do not share a limit.
     In this case, hierarchy should not be enabled.

 2)  root/to/some/directory/Gold/A,B,C...
                            Silver/D,E,F

     A, B, C, ... are all limited by "Gold" or "Silver".
     But Gold and Silver have no relationship; each has its own limitation,
     while A, B, C and D, E, F share the limit under Gold or Silver
     respectively. In this case, hierarchy under "root/to/some/directory"
     should be disabled, while Gold/ and Silver/ should have use_hierarchy=1.

(Think of Gold and Silver as containers, where the user of a container
 divides its memory among A, B, C, ...)

For example, libvirt makes a very long "root/to/some/directory" ...
I never want to count up all the counters in the hierarchy, even if
we'd like to use some fantastic hierarchical accounting under a container.

I don't like an "all or nothing" option (such as making use_hierarchy a
mount option or a parameter on the root cgroup, etc.), so a mixture was
allowed.
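
As a rough userspace model of that mixture (invented names; memcg's real
inheritance rules are more involved), a charge here is accounted into every
ancestor inside a use_hierarchy boundary, and each extra level costs one
more counter update, which is where the slowness comes from:

#include <stdio.h>

struct cg {
	const char *name;
	struct cg *parent;
	int use_hierarchy;
	unsigned long usage;
};

static void charge(struct cg *c, unsigned long bytes)
{
	c->usage += bytes;
	while (c->parent && c->parent->use_hierarchy) {
		c = c->parent;
		c->usage += bytes;   /* this walk is the cost */
	}
}

int main(void)
{
	struct cg root = { "root", NULL,  0, 0 };
	struct cg dir  = { "dir",  &root, 0, 0 }; /* long flat path */
	struct cg gold = { "Gold", &dir,  1, 0 }; /* container root */
	struct cg a    = { "A",    &gold, 0, 0 };

	charge(&a, 4096);
	/* A and Gold are charged; nothing above the boundary is */
	printf("A=%lu Gold=%lu dir=%lu root=%lu\n",
	       a.usage, gold.usage, dir.usage, root.usage);
	return 0;
}
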


Thanks,
-Kame

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-09-01  8:50           ` Gui Jianfeng
@ 2010-09-01 15:49             ` Nauman Rafique
  2010-09-01 17:10               ` Vivek Goyal
  2010-09-02  2:20             ` Vivek Goyal
  1 sibling, 1 reply; 19+ messages in thread
From: Nauman Rafique @ 2010-09-01 15:49 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Vivek Goyal, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki

On Wed, Sep 1, 2010 at 1:50 AM, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote:
> Vivek Goyal wrote:
>> On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote:
>>> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
>>>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote:
>>>>> Vivek Goyal wrote:
>>>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
>>>>>>> Hi All,
>>>>>>>
>>>>>>> This patch enables cfq group hierarchical scheduling.
>>>>>>>
>>>>>>> With this patch, you can create a cgroup directory deeper than level 1.
>>>>>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example:
>>>>>>> We create cgroup directories as following(the number represents weight):
>>>>>>>
>>>>>>>             Root grp
>>>>>>>            /       \
>>>>>>>        grp_1(100) grp_2(400)
>>>>>>>        /    \
>>>>>>>   grp_3(200) grp_4(300)
>>>>>>>
>>>>>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth,
>>>>>>> grp_2 will share 80% of total bandwidth.
>>>>>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%)
>>>>>>>
>>>>>>> Design:
>>>>>>>   o Each cfq group has its own group service tree.
>>>>>>>   o Each cfq group contains a "group schedule entity" (gse) that
>>>>>>>     schedules on parent cfq group's service tree.
>>>>>>>   o Each cfq group contains a "queue schedule entity"(qse), it
>>>>>>>     represents all cfqqs located on this cfq group. It schedules
>>>>>>>     on this group's service tree. For the time being, root group
>>>>>>>     qse's weight is 1000, and subgroup qse's weight is 500.
>>>>>>>   o All gses and qse which belones to a same cfq group schedules
>>>>>>>     on the same group service tree.
>>>>>> Hi Gui,
>>>>>>
>>>>>> Thanks for the patch. I have few questions.
>>>>>>
>>>>>> - So how does the hierarchy look like, w.r.t root group. Something as
>>>>>>   follows?
>>>>>>
>>>>>>
>>>>>>                     root
>>>>>>                    / | \
>>>>>>                  q1  q2 G1
>>>>>>
>>>>>> Assume there are two processes doin IO in root group and q1 and q2 are
>>>>>> cfqq queues for those processes and G1 is the cgroup created by user.
>>>>>>
>>>>>> If yes, then what algorithm do you use to do scheduling between q1, q2
>>>>>> and G1? IOW, currently we have two algorithms operating in CFQ. One for
>>>>>> cfqq and other for groups. Group algorithm does not use the logic of
>>>>>> cfq_slice_offset().
>>>>> Hi Vivek,
>>>>>
>>>>> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq.
>>>>> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and
>>>>> it will schedule on root group service with G1, as following:
>>>>>
>>>>>                          root group
>>>>>                         /         \
>>>>>                     qse(q1,q2)    gse(G1)
>>>>>
>>>> Ok. That's interesting. That raises another question that how hierarchy
>>>> should look like. IOW, how queue and groups should be treated in
>>>> hierarchy.
>>>>
>>>> CFS cpu scheduler treats queues and group at the same level. That is as
>>>> follows.
>>>>
>>>>                        root
>>>>                        / | \
>>>>                       q1 q2 G1
>>>>
>>>> In the past I had raised this question and Jens and corrado liked treating
>>>> queues and group at same level.
>>>>
>>>> Logically, q1, q2 and G1 are all children of root, so it makes sense to
>>>> treat them at same level and not group q1 and q2 in to a single entity and
>>>> group.
>>>>
>>>> One of the possible way forward could be this.
>>>>
>>>> - Treat queue and group at same level (like CFS)
>>>>
>>>> - Get rid of cfq_slice_offset() logic. That means without idling on, there
>>>>  will be no ioprio difference between cfq queues. I think anyway as of
>>>>  today that logic helps in so little situations that I would not mind
>>>>  getting rid of it. Just that Jens should agree to it.
>>>>
>>>> - With this new scheme, it will break the existing semantics of root group
>>>>  being at same level as child groups. To avoid that, we can probably
>>>>  implement two modes (flat and hierarchical), something similar to what
>>>>  memory cgroup controller has done. May be one tunable in root cgroup of
>>>>  blkio "use_hierarchy".  By default everything will be in flat mode and
>>>>  if user wants hiearchical control, he needs to set user_hierarchy in
>>>>  root group.
>>> Vivek, may be I am reading you wrong here. But you are first
>>> suggesting to add more complexity to treat queues and group at the
>>> same level. Then you are suggesting add even more complexity to fix
>>> the problems caused by that approach.
>>>
>>> Why do we need to treat queues and group at the same level? "CFS does
>>> it" is not a good argument.
>>
>> Sure it is not a very good argument but at the same time one would need
>> a very good argument that why we should do things differently.
>>
>> - If a user has mounted cpu and blkio controller together and both the
>>   controllers are viewing the same hierarchy differently, then it is
>>   odd. We need a good reason that why different arrangement makes sense.
>
> Hi Vivek,
>
> Even if we mount cpu and blkio together, to me, it's ok for cpu and blkio
> having their own logic, since they are totally different cgroup subsystems.
>
>>
>> - To me, both group and cfq queue are children of root group and it
>>   makes sense to treat them independent childrens instead of putting
>>   all the queues in one logical group which inherits the weight of
>>   parent.
>>
>> - With this new scheme, I am finding it hard to visualize the hierachy.
>>   How do you assign the weights to queue entities of a group. It is more
>>   like a invisible group with-in group. We shall have to create new
>>   tunable which can speicy the weight for this hidden group.
>
> For the time being, the root "qse" weight is 1000 and others is 500, they don't
> inherit the weight of parent. I was thinking that maybe we can determine the qse
> weight in term of the queue number and weight in this group and subgroups.
>
> Thanks,
> Gui
>
>>
>>
>> So in summary I am liking the "queue at same level as group" scheme for
>> two reasons.
>>
>> - It is more intutive to visualize and implement. It follows the true
>>   hierarchy as seen by cgroup file system.
>>
>> - CFS has already implemented this scheme. So we need a strong arguemnt
>>   to justify why we should not follow the same thing. Especially for
>>   the case where user has co-mounted cpu and blkio controller.
>>
>> - It can achieve the same goal as "hidden group" proposal just by
>>   creating a cgroup explicitly and moving all threads in that group.
>>
>> Why do you think that "hidden group" proposal is better than "treating
>> queue at same level as group" ?

There are multiple reasons why the "hidden group" proposal is a better approach.

- A "hidden group" would allow us to keep scheduling queues using the
CFQ queue scheduling logic, and it does not require any major changes in
CFQ. Aren't we already using that approach to deal with queues in the
root group?

- If queues and groups are treated at the same level, queues can end
up in the root cgroup, and we cannot put an upper bound on the number of
those queues. Those queues consume system resources in proportion
to their number, causing the performance of groups to suffer. If we
have a "hidden group", we can configure it with a small weight, and that
would bound the impact the queues in the root group can have (see the
sketch below).
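
To put rough numbers on that last point (a back-of-the-envelope sketch;
all weights here are invented for illustration):

#include <stdio.h>

int main(void)
{
	int group_w = 500;   /* one child cgroup */
	int queue_w = 500;   /* per-queue weight if queues are peers */
	int hidden_w = 500;  /* single hidden group for all root queues */

	for (int n = 1; n <= 64; n *= 4) {
		/* peers: every root queue competes individually */
		double peer = 100.0 * group_w / (group_w + n * queue_w);
		/* hidden group: n queues share one bounded weight */
		double hidden = 100.0 * group_w / (group_w + hidden_w);

		printf("n=%2d: child group gets %5.1f%% (peers) "
		       "vs %4.1f%% (hidden group)\n", n, peer, hidden);
	}
	return 0;
}

With queues as peers, the child group's share drops from 50% to under 2%
as n grows to 64; with a hidden group it stays at 50% regardless of n.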

>>
>> Thanks
>> Vivek
>>
>>
>
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-09-01 15:49             ` Nauman Rafique
@ 2010-09-01 17:10               ` Vivek Goyal
  2010-09-01 17:15                 ` Nauman Rafique
  0 siblings, 1 reply; 19+ messages in thread
From: Vivek Goyal @ 2010-09-01 17:10 UTC (permalink / raw)
  To: Nauman Rafique
  Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki

On Wed, Sep 01, 2010 at 08:49:26AM -0700, Nauman Rafique wrote:
> On Wed, Sep 1, 2010 at 1:50 AM, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote:
> > Vivek Goyal wrote:
> >> On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote:
> >>> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
> >>>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote:
> >>>>> Vivek Goyal wrote:
> >>>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
> >>>>>>> Hi All,
> >>>>>>>
> >>>>>>> This patch enables cfq group hierarchical scheduling.
> >>>>>>>
> >>>>>>> With this patch, you can create a cgroup directory deeper than level 1.
> >>>>>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example:
> >>>>>>> We create cgroup directories as following(the number represents weight):
> >>>>>>>
> >>>>>>>             Root grp
> >>>>>>>            /       \
> >>>>>>>        grp_1(100) grp_2(400)
> >>>>>>>        /    \
> >>>>>>>   grp_3(200) grp_4(300)
> >>>>>>>
> >>>>>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth,
> >>>>>>> grp_2 will share 80% of total bandwidth.
> >>>>>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%)
> >>>>>>>
> >>>>>>> Design:
> >>>>>>>   o Each cfq group has its own group service tree.
> >>>>>>>   o Each cfq group contains a "group schedule entity" (gse) that
> >>>>>>>     schedules on parent cfq group's service tree.
> >>>>>>>   o Each cfq group contains a "queue schedule entity"(qse), it
> >>>>>>>     represents all cfqqs located on this cfq group. It schedules
> >>>>>>>     on this group's service tree. For the time being, root group
> >>>>>>>     qse's weight is 1000, and subgroup qse's weight is 500.
> >>>>>>>   o All gses and qse which belones to a same cfq group schedules
> >>>>>>>     on the same group service tree.
> >>>>>> Hi Gui,
> >>>>>>
> >>>>>> Thanks for the patch. I have few questions.
> >>>>>>
> >>>>>> - So how does the hierarchy look like, w.r.t root group. Something as
> >>>>>>   follows?
> >>>>>>
> >>>>>>
> >>>>>>                     root
> >>>>>>                    / | \
> >>>>>>                  q1  q2 G1
> >>>>>>
> >>>>>> Assume there are two processes doin IO in root group and q1 and q2 are
> >>>>>> cfqq queues for those processes and G1 is the cgroup created by user.
> >>>>>>
> >>>>>> If yes, then what algorithm do you use to do scheduling between q1, q2
> >>>>>> and G1? IOW, currently we have two algorithms operating in CFQ. One for
> >>>>>> cfqq and other for groups. Group algorithm does not use the logic of
> >>>>>> cfq_slice_offset().
> >>>>> Hi Vivek,
> >>>>>
> >>>>> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq.
> >>>>> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and
> >>>>> it will schedule on root group service with G1, as following:
> >>>>>
> >>>>>                          root group
> >>>>>                         /         \
> >>>>>                     qse(q1,q2)    gse(G1)
> >>>>>
> >>>> Ok. That's interesting. That raises another question that how hierarchy
> >>>> should look like. IOW, how queue and groups should be treated in
> >>>> hierarchy.
> >>>>
> >>>> CFS cpu scheduler treats queues and group at the same level. That is as
> >>>> follows.
> >>>>
> >>>>                        root
> >>>>                        / | \
> >>>>                       q1 q2 G1
> >>>>
> >>>> In the past I had raised this question and Jens and corrado liked treating
> >>>> queues and group at same level.
> >>>>
> >>>> Logically, q1, q2 and G1 are all children of root, so it makes sense to
> >>>> treat them at same level and not group q1 and q2 in to a single entity and
> >>>> group.
> >>>>
> >>>> One of the possible way forward could be this.
> >>>>
> >>>> - Treat queue and group at same level (like CFS)
> >>>>
> >>>> - Get rid of cfq_slice_offset() logic. That means without idling on, there
> >>>>  will be no ioprio difference between cfq queues. I think anyway as of
> >>>>  today that logic helps in so little situations that I would not mind
> >>>>  getting rid of it. Just that Jens should agree to it.
> >>>>
> >>>> - With this new scheme, it will break the existing semantics of root group
> >>>>  being at same level as child groups. To avoid that, we can probably
> >>>>  implement two modes (flat and hierarchical), something similar to what
> >>>>  memory cgroup controller has done. May be one tunable in root cgroup of
> >>>>  blkio "use_hierarchy".  By default everything will be in flat mode and
> >>>>  if user wants hiearchical control, he needs to set user_hierarchy in
> >>>>  root group.
> >>> Vivek, may be I am reading you wrong here. But you are first
> >>> suggesting to add more complexity to treat queues and group at the
> >>> same level. Then you are suggesting add even more complexity to fix
> >>> the problems caused by that approach.
> >>>
> >>> Why do we need to treat queues and group at the same level? "CFS does
> >>> it" is not a good argument.
> >>
> >> Sure it is not a very good argument but at the same time one would need
> >> a very good argument that why we should do things differently.
> >>
> >> - If a user has mounted cpu and blkio controller together and both the
> >>   controllers are viewing the same hierarchy differently, then it is
> >>   odd. We need a good reason that why different arrangement makes sense.
> >
> > Hi Vivek,
> >
> > Even if we mount cpu and blkio together, to me, it's ok for cpu and blkio
> > having their own logic, since they are totally different cgroup subsystems.
> >
> >>
> >> - To me, both group and cfq queue are children of root group and it
> >>   makes sense to treat them independent childrens instead of putting
> >>   all the queues in one logical group which inherits the weight of
> >>   parent.
> >>
> >> - With this new scheme, I am finding it hard to visualize the hierachy.
> >>   How do you assign the weights to queue entities of a group. It is more
> >>   like a invisible group with-in group. We shall have to create new
> >>   tunable which can speicy the weight for this hidden group.
> >
> > For the time being, the root "qse" weight is 1000 and others is 500, they don't
> > inherit the weight of parent. I was thinking that maybe we can determine the qse
> > weight in term of the queue number and weight in this group and subgroups.
> >
> > Thanks,
> > Gui
> >
> >>
> >>
> >> So in summary I am liking the "queue at same level as group" scheme for
> >> two reasons.
> >>
> >> - It is more intutive to visualize and implement. It follows the true
> >>   hierarchy as seen by cgroup file system.
> >>
> >> - CFS has already implemented this scheme. So we need a strong arguemnt
> >>   to justify why we should not follow the same thing. Especially for
> >>   the case where user has co-mounted cpu and blkio controller.
> >>
> >> - It can achieve the same goal as "hidden group" proposal just by
> >>   creating a cgroup explicitly and moving all threads in that group.
> >>
> >> Why do you think that "hidden group" proposal is better than "treating
> >> queue at same level as group" ?
> 
> There are multiple reasons for "hidden group" proposal being a better approach.
> 
> - "Hidden group" would allow us to keep scheduling queues using the
> CFQ queue scheduling logic. And does not require any major changes in
> CFQ. Aren't we already using that approach to deal with queues at the
> root group?

Currently we are operating in flat mode, where all the groups are at the
same level (irrespective of their position in the cgroup hierarchy).

> 
> - If queues and groups are treated at the same level, queues can end
> up in root cgroup. And we cannot put an upper bound on the number of
> those queues. Those queues can consume system resources in proportion
> to their number, causing the performance of groups to suffer. If we
> have "hidden group", we can configure it to a small weight, and that
> would limit the impact these queues in root group can have.

To limit the impact of other queues in a cgroup, one can use libcgroup to
automatically place new threads or tasks into a subgroup.

I understand that the kernel doing it by default would help, though; it is
less work in terms of configuration. But I am not sure that's a good
argument for designing kernel functionality, which should be pretty
generic.

Anyway, how would you assign the weight to the hidden group? What's the
interface for that? A new cgroup file inside each cgroup? Personally
I think that's a slightly odd interface: every group has one hidden group
where all the queues in that group go, and the weight of that hidden
group can be specified by a cgroup file.

But anyway, I am not tied to any particular approach. I am just trying to
make sure that we have put enough thought into it, as changing it later
will be hard.

Vivek

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-09-01 17:10               ` Vivek Goyal
@ 2010-09-01 17:15                 ` Nauman Rafique
  2010-09-01 17:21                   ` Vivek Goyal
  2010-09-02  0:30                   ` Gui Jianfeng
  0 siblings, 2 replies; 19+ messages in thread
From: Nauman Rafique @ 2010-09-01 17:15 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki

On Wed, Sep 1, 2010 at 10:10 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
> On Wed, Sep 01, 2010 at 08:49:26AM -0700, Nauman Rafique wrote:
>> On Wed, Sep 1, 2010 at 1:50 AM, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote:
>> > Vivek Goyal wrote:
>> >> On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote:
>> >>> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
>> >>>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote:
>> >>>>> Vivek Goyal wrote:
>> >>>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
>> >>>>>>> Hi All,
>> >>>>>>>
>> >>>>>>> This patch enables cfq group hierarchical scheduling.
>> >>>>>>>
>> >>>>>>> With this patch, you can create a cgroup directory deeper than level 1.
>> >>>>>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example:
>> >>>>>>> We create cgroup directories as following(the number represents weight):
>> >>>>>>>
>> >>>>>>>             Root grp
>> >>>>>>>            /       \
>> >>>>>>>        grp_1(100) grp_2(400)
>> >>>>>>>        /    \
>> >>>>>>>   grp_3(200) grp_4(300)
>> >>>>>>>
>> >>>>>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth,
>> >>>>>>> grp_2 will share 80% of total bandwidth.
>> >>>>>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%)
>> >>>>>>>
>> >>>>>>> Design:
>> >>>>>>>   o Each cfq group has its own group service tree.
>> >>>>>>>   o Each cfq group contains a "group schedule entity" (gse) that
>> >>>>>>>     schedules on parent cfq group's service tree.
>> >>>>>>>   o Each cfq group contains a "queue schedule entity"(qse), it
>> >>>>>>>     represents all cfqqs located on this cfq group. It schedules
>> >>>>>>>     on this group's service tree. For the time being, root group
>> >>>>>>>     qse's weight is 1000, and subgroup qse's weight is 500.
>> >>>>>>>   o All gses and qse which belones to a same cfq group schedules
>> >>>>>>>     on the same group service tree.
>> >>>>>> Hi Gui,
>> >>>>>>
>> >>>>>> Thanks for the patch. I have few questions.
>> >>>>>>
>> >>>>>> - So how does the hierarchy look like, w.r.t root group. Something as
>> >>>>>>   follows?
>> >>>>>>
>> >>>>>>
>> >>>>>>                     root
>> >>>>>>                    / | \
>> >>>>>>                  q1  q2 G1
>> >>>>>>
>> >>>>>> Assume there are two processes doin IO in root group and q1 and q2 are
>> >>>>>> cfqq queues for those processes and G1 is the cgroup created by user.
>> >>>>>>
>> >>>>>> If yes, then what algorithm do you use to do scheduling between q1, q2
>> >>>>>> and G1? IOW, currently we have two algorithms operating in CFQ. One for
>> >>>>>> cfqq and other for groups. Group algorithm does not use the logic of
>> >>>>>> cfq_slice_offset().
>> >>>>> Hi Vivek,
>> >>>>>
>> >>>>> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq.
>> >>>>> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and
>> >>>>> it will schedule on root group service with G1, as following:
>> >>>>>
>> >>>>>                          root group
>> >>>>>                         /         \
>> >>>>>                     qse(q1,q2)    gse(G1)
>> >>>>>
>> >>>> Ok. That's interesting. That raises another question that how hierarchy
>> >>>> should look like. IOW, how queue and groups should be treated in
>> >>>> hierarchy.
>> >>>>
>> >>>> CFS cpu scheduler treats queues and group at the same level. That is as
>> >>>> follows.
>> >>>>
>> >>>>                        root
>> >>>>                        / | \
>> >>>>                       q1 q2 G1
>> >>>>
>> >>>> In the past I had raised this question and Jens and corrado liked treating
>> >>>> queues and group at same level.
>> >>>>
>> >>>> Logically, q1, q2 and G1 are all children of root, so it makes sense to
>> >>>> treat them at same level and not group q1 and q2 in to a single entity and
>> >>>> group.
>> >>>>
>> >>>> One of the possible way forward could be this.
>> >>>>
>> >>>> - Treat queue and group at same level (like CFS)
>> >>>>
>> >>>> - Get rid of cfq_slice_offset() logic. That means without idling on, there
>> >>>>  will be no ioprio difference between cfq queues. I think anyway as of
>> >>>>  today that logic helps in so little situations that I would not mind
>> >>>>  getting rid of it. Just that Jens should agree to it.
>> >>>>
>> >>>> - With this new scheme, it will break the existing semantics of root group
>> >>>>  being at same level as child groups. To avoid that, we can probably
>> >>>>  implement two modes (flat and hierarchical), something similar to what
>> >>>>  memory cgroup controller has done. May be one tunable in root cgroup of
>> >>>>  blkio "use_hierarchy".  By default everything will be in flat mode and
>> >>>>  if user wants hiearchical control, he needs to set user_hierarchy in
>> >>>>  root group.
>> >>> Vivek, may be I am reading you wrong here. But you are first
>> >>> suggesting to add more complexity to treat queues and group at the
>> >>> same level. Then you are suggesting add even more complexity to fix
>> >>> the problems caused by that approach.
>> >>>
>> >>> Why do we need to treat queues and group at the same level? "CFS does
>> >>> it" is not a good argument.
>> >>
>> >> Sure it is not a very good argument but at the same time one would need
>> >> a very good argument that why we should do things differently.
>> >>
>> >> - If a user has mounted cpu and blkio controller together and both the
>> >>   controllers are viewing the same hierarchy differently, then it is
>> >>   odd. We need a good reason that why different arrangement makes sense.
>> >
>> > Hi Vivek,
>> >
>> > Even if we mount cpu and blkio together, to me, it's ok for cpu and blkio
>> > having their own logic, since they are totally different cgroup subsystems.
>> >
>> >>
>> >> - To me, both group and cfq queue are children of root group and it
>> >>   makes sense to treat them independent childrens instead of putting
>> >>   all the queues in one logical group which inherits the weight of
>> >>   parent.
>> >>
>> >> - With this new scheme, I am finding it hard to visualize the hierachy.
>> >>   How do you assign the weights to queue entities of a group. It is more
>> >>   like a invisible group with-in group. We shall have to create new
>> >>   tunable which can speicy the weight for this hidden group.
>> >
>> > For the time being, the root "qse" weight is 1000 and others is 500, they don't
>> > inherit the weight of parent. I was thinking that maybe we can determine the qse
>> > weight in term of the queue number and weight in this group and subgroups.
>> >
>> > Thanks,
>> > Gui
>> >
>> >>
>> >>
>> >> So in summary I am liking the "queue at same level as group" scheme for
>> >> two reasons.
>> >>
>> >> - It is more intutive to visualize and implement. It follows the true
>> >>   hierarchy as seen by cgroup file system.
>> >>
>> >> - CFS has already implemented this scheme. So we need a strong arguemnt
>> >>   to justify why we should not follow the same thing. Especially for
>> >>   the case where user has co-mounted cpu and blkio controller.
>> >>
>> >> - It can achieve the same goal as "hidden group" proposal just by
>> >>   creating a cgroup explicitly and moving all threads in that group.
>> >>
>> >> Why do you think that "hidden group" proposal is better than "treating
>> >> queue at same level as group" ?
>>
>> There are multiple reasons for "hidden group" proposal being a better approach.
>>
>> - "Hidden group" would allow us to keep scheduling queues using the
>> CFQ queue scheduling logic. And does not require any major changes in
>> CFQ. Aren't we already using that approach to deal with queues at the
>> root group?
>
> Currently we are operating in flat mode where all the groups are at
> same level (irrespective their position in cgroup hiearchy).
>
>>
>> - If queues and groups are treated at the same level, queues can end
>> up in root cgroup. And we cannot put an upper bound on the number of
>> those queues. Those queues can consume system resources in proportion
>> to their number, causing the performance of groups to suffer. If we
>> have "hidden group", we can configure it to a small weight, and that
>> would limit the impact these queues in root group can have.
>
> To limit the impact of other queues in cgroup, one can use libcgroup to
> automatically place new threads or tasks into a subgroup.
>
> I understand that kernel doing it by default should help though. It is
> less work in terms of configuration. But I am not sure that's a good
> argument to design kernel functionality. Kernel functionality should be
> pretty generic.
>
> Anyway, how would you assign the weight to the hidden group. What's the
> interface for that? A new cgroup file inside each cgroup? Personally
> I think that's little odd interface. Every group has one hidden group
> where all the queues in that group go and weight of that group can be
> specified by a cgroup file.

I think picking a reasonable default weight at compile time is not
that bad an option, given that threads showing up in the "hidden
group" is an uncommon case.
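
For what it's worth, that could be as small as a compile-time constant
along these lines, replacing the hard-coded 500 in
init_group_queue_entity() (hypothetical name, not something in the
posted patch):

/* compile-time default for the implicit per-group queue entity,
 * replacing the hard-coded 500 in init_group_queue_entity() */
#define CFQ_QUEUE_SE_WEIGHT_DFL		500

	qse->weight = CFQ_QUEUE_SE_WEIGHT_DFL;
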

>
> But anyway, I am not tied to any of the approach. I am just trying to
> make sure that we have put enough thought into it as changing it later
> will be hard.
>
> Vivek
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-09-01 17:15                 ` Nauman Rafique
@ 2010-09-01 17:21                   ` Vivek Goyal
  2010-09-02  0:30                   ` Gui Jianfeng
  1 sibling, 0 replies; 19+ messages in thread
From: Vivek Goyal @ 2010-09-01 17:21 UTC (permalink / raw)
  To: Nauman Rafique
  Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki

On Wed, Sep 01, 2010 at 10:15:31AM -0700, Nauman Rafique wrote:
> On Wed, Sep 1, 2010 at 10:10 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
> > On Wed, Sep 01, 2010 at 08:49:26AM -0700, Nauman Rafique wrote:
> >> On Wed, Sep 1, 2010 at 1:50 AM, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote:
> >> > Vivek Goyal wrote:
> >> >> On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote:
> >> >>> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
> >> >>>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote:
> >> >>>>> Vivek Goyal wrote:
> >> >>>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
> >> >>>>>>> Hi All,
> >> >>>>>>>
> >> >>>>>>> This patch enables cfq group hierarchical scheduling.
> >> >>>>>>>
> >> >>>>>>> With this patch, you can create a cgroup directory deeper than level 1.
> >> >>>>>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example:
> >> >>>>>>> We create cgroup directories as following(the number represents weight):
> >> >>>>>>>
> >> >>>>>>>             Root grp
> >> >>>>>>>            /       \
> >> >>>>>>>        grp_1(100) grp_2(400)
> >> >>>>>>>        /    \
> >> >>>>>>>   grp_3(200) grp_4(300)
> >> >>>>>>>
> >> >>>>>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth,
> >> >>>>>>> grp_2 will share 80% of total bandwidth.
> >> >>>>>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%)
> >> >>>>>>>
> >> >>>>>>> Design:
> >> >>>>>>>   o Each cfq group has its own group service tree.
> >> >>>>>>>   o Each cfq group contains a "group schedule entity" (gse) that
> >> >>>>>>>     schedules on parent cfq group's service tree.
> >> >>>>>>>   o Each cfq group contains a "queue schedule entity"(qse), it
> >> >>>>>>>     represents all cfqqs located on this cfq group. It schedules
> >> >>>>>>>     on this group's service tree. For the time being, root group
> >> >>>>>>>     qse's weight is 1000, and subgroup qse's weight is 500.
> >> >>>>>>>   o All gses and qse which belones to a same cfq group schedules
> >> >>>>>>>     on the same group service tree.
> >> >>>>>> Hi Gui,
> >> >>>>>>
> >> >>>>>> Thanks for the patch. I have few questions.
> >> >>>>>>
> >> >>>>>> - So how does the hierarchy look like, w.r.t root group. Something as
> >> >>>>>>   follows?
> >> >>>>>>
> >> >>>>>>
> >> >>>>>>                     root
> >> >>>>>>                    / | \
> >> >>>>>>                  q1  q2 G1
> >> >>>>>>
> >> >>>>>> Assume there are two processes doin IO in root group and q1 and q2 are
> >> >>>>>> cfqq queues for those processes and G1 is the cgroup created by user.
> >> >>>>>>
> >> >>>>>> If yes, then what algorithm do you use to do scheduling between q1, q2
> >> >>>>>> and G1? IOW, currently we have two algorithms operating in CFQ. One for
> >> >>>>>> cfqq and other for groups. Group algorithm does not use the logic of
> >> >>>>>> cfq_slice_offset().
> >> >>>>> Hi Vivek,
> >> >>>>>
> >> >>>>> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq.
> >> >>>>> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and
> >> >>>>> it will schedule on root group service with G1, as following:
> >> >>>>>
> >> >>>>>                          root group
> >> >>>>>                         /         \
> >> >>>>>                     qse(q1,q2)    gse(G1)
> >> >>>>>
> >> >>>> Ok, that's interesting. That raises another question of how the
> >> >>>> hierarchy should look. IOW, how queues and groups should be treated
> >> >>>> in the hierarchy.
> >> >>>>
> >> >>>> The CFS cpu scheduler treats queues and groups at the same level, as
> >> >>>> follows.
> >> >>>>
> >> >>>>                        root
> >> >>>>                        / | \
> >> >>>>                       q1 q2 G1
> >> >>>>
> >> >>>> In the past I had raised this question, and Jens and Corrado liked
> >> >>>> treating queues and groups at the same level.
> >> >>>>
> >> >>>> Logically, q1, q2 and G1 are all children of root, so it makes sense to
> >> >>>> treat them at the same level and not lump q1 and q2 into a single
> >> >>>> entity and group.
> >> >>>>
> >> >>>> One possible way forward could be this.
> >> >>>>
> >> >>>> - Treat queues and groups at the same level (like CFS)
> >> >>>>
> >> >>>> - Get rid of the cfq_slice_offset() logic. That means that without
> >> >>>>  idling, there will be no ioprio difference between cfq queues. I think
> >> >>>>  that logic helps in so few situations today that I would not mind
> >> >>>>  getting rid of it. Just that Jens should agree to it.
> >> >>>>
> >> >>>> - This new scheme will break the existing semantics of the root group
> >> >>>>  being at the same level as child groups. To avoid that, we can probably
> >> >>>>  implement two modes (flat and hierarchical), something similar to what
> >> >>>>  the memory cgroup controller has done. Maybe one tunable in the root
> >> >>>>  cgroup of blkio, "use_hierarchy". By default everything will be in flat
> >> >>>>  mode, and if the user wants hierarchical control, he needs to set
> >> >>>>  use_hierarchy in the root group.
> >> >>> Vivek, maybe I am reading you wrong here. But you are first
> >> >>> suggesting adding more complexity to treat queues and groups at the
> >> >>> same level. Then you are suggesting adding even more complexity to fix
> >> >>> the problems caused by that approach.
> >> >>>
> >> >>> Why do we need to treat queues and groups at the same level? "CFS does
> >> >>> it" is not a good argument.
> >> >>
> >> >> Sure, it is not a very good argument, but at the same time one would
> >> >> need a very good argument for why we should do things differently.
> >> >>
> >> >> - If a user has mounted the cpu and blkio controllers together and the
> >> >>   two controllers are viewing the same hierarchy differently, that is
> >> >>   odd. We need a good reason why a different arrangement makes sense.
> >> >
> >> > Hi Vivek,
> >> >
> >> > Even if we mount cpu and blkio together, to me, it's OK for cpu and blkio
> >> > to have their own logic, since they are totally different cgroup subsystems.
> >> >
> >> >>
> >> >> - To me, both groups and cfq queues are children of the root group, and
> >> >>   it makes sense to treat them as independent children instead of putting
> >> >>   all the queues in one logical group which inherits the weight of the
> >> >>   parent.
> >> >>
> >> >> - With this new scheme, I am finding it hard to visualize the hierarchy.
> >> >>   How do you assign the weights to the queue entities of a group? It is
> >> >>   more like an invisible group within a group. We shall have to create a
> >> >>   new tunable which can specify the weight of this hidden group.
> >> >
> >> > For the time being, the root "qse" weight is 1000 and the others are 500;
> >> > they don't inherit the weight of the parent. I was thinking that maybe we
> >> > can determine the qse weight in terms of the number and weight of the
> >> > queues in this group and its subgroups.
> >> >
> >> > Thanks,
> >> > Gui
> >> >
> >> >>
> >> >>
> >> >> So in summary I am liking the "queue at same level as group" scheme for
> >> >> these reasons.
> >> >>
> >> >> - It is more intuitive to visualize and implement. It follows the true
> >> >>   hierarchy as seen by the cgroup file system.
> >> >>
> >> >> - CFS has already implemented this scheme. So we need a strong argument
> >> >>   to justify why we should not follow the same thing. Especially for
> >> >>   the case where the user has co-mounted the cpu and blkio controllers.
> >> >>
> >> >> - It can achieve the same goal as the "hidden group" proposal just by
> >> >>   creating a cgroup explicitly and moving all threads into that group.
> >> >>
> >> >> Why do you think the "hidden group" proposal is better than "treating
> >> >> queues at the same level as groups"?
> >>
> >> There are multiple reasons why the "hidden group" proposal is a better
> >> approach.
> >>
> >> - "Hidden group" would allow us to keep scheduling queues using the
> >> CFQ queue scheduling logic, and it does not require any major changes in
> >> CFQ. Aren't we already using that approach to deal with queues in the
> >> root group?
> >
> > Currently we are operating in flat mode, where all the groups are at the
> > same level (irrespective of their position in the cgroup hierarchy).
> >
> >>
> >> - If queues and groups are treated at the same level, queues can end
> >> up in the root cgroup. And we cannot put an upper bound on the number of
> >> those queues. Those queues can consume system resources in proportion
> >> to their number, causing the performance of groups to suffer. If we
> >> have a "hidden group", we can configure it with a small weight, and that
> >> would limit the impact these queues in the root group can have.
> >
> > To limit the impact of other queues in a cgroup, one can use libcgroup to
> > automatically place new threads or tasks into a subgroup.
> >
> > I understand that the kernel doing it by default would help, though. It is
> > less work in terms of configuration. But I am not sure that's a good
> > argument for designing kernel functionality. Kernel functionality should
> > be pretty generic.
> >
> > Anyway, how would you assign the weight to the hidden group? What's the
> > interface for that? A new cgroup file inside each cgroup? Personally
> > I think that's a little odd as an interface: every group has one hidden
> > group where all the queues in that group go, and the weight of that
> > hidden group can be specified by a cgroup file.
> 
> I think picking a reasonable default weight at compile time is not
> that bad an option, given that threads showing up in the "hidden
> group" is an uncommon case.

I don't think that's a good idea. A user should be able to determine
the share of the queues in a group without looking at the kernel
config file.
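
To make this concrete, here is the arithmetic a user would have to
reproduce, as a stand-alone sketch. The QSE_WEIGHT_* constants and the
helper are stand-ins for whatever compile-time defaults get picked;
none of this is in the posted patch.

/*
 * Sketch only: share of the implicit queue entity on one group's
 * service tree under the weights proposed in this thread.  If the
 * defaults live only in the kernel config, user space cannot
 * reproduce this arithmetic reliably.
 */
#include <stdio.h>

#define QSE_WEIGHT_ROOT		1000
#define QSE_WEIGHT_SUBGROUP	500

static double qse_share(int is_root, const int *child_gse_weights, int n)
{
	int qse = is_root ? QSE_WEIGHT_ROOT : QSE_WEIGHT_SUBGROUP;
	int total = qse;
	int i;

	for (i = 0; i < n; i++)
		total += child_gse_weights[i];	/* sibling group weights */
	return (double)qse / total;
}

int main(void)
{
	int children[] = { 100, 400 };	/* grp_1 and grp_2 from the example */

	/* Queues in root get 1000 / (1000 + 100 + 400), roughly 66.7% */
	printf("root queue share: %.1f%%\n",
	       100.0 * qse_share(1, children, 2));
	return 0;
}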

Vivek


* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-09-01 17:15                 ` Nauman Rafique
  2010-09-01 17:21                   ` Vivek Goyal
@ 2010-09-02  0:30                   ` Gui Jianfeng
  1 sibling, 0 replies; 19+ messages in thread
From: Gui Jianfeng @ 2010-09-02  0:30 UTC (permalink / raw)
  To: Nauman Rafique
  Cc: Vivek Goyal, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki

Nauman Rafique wrote:
> On Wed, Sep 1, 2010 at 10:10 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
>> On Wed, Sep 01, 2010 at 08:49:26AM -0700, Nauman Rafique wrote:
>>> On Wed, Sep 1, 2010 at 1:50 AM, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote:
>>>> Vivek Goyal wrote:
>>>>> On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote:
>>>>>> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
>>>>>>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote:
>>>>>>>> Vivek Goyal wrote:
>>>>>>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote:
>>>>>>>>>> Hi All,
>>>>>>>>>>
>>>>>>>>>> This patch enables cfq group hierarchical scheduling.
>>>>>>>>>>
>>>>>>>>>> With this patch, you can create a cgroup directory deeper than level 1.
>>>>>>>>>> Now, I/O bandwidth is distributed in a hierarchical way. For example:
>>>>>>>>>> We create cgroup directories as follows (the number represents weight):
>>>>>>>>>>
>>>>>>>>>>             Root grp
>>>>>>>>>>            /       \
>>>>>>>>>>        grp_1(100) grp_2(400)
>>>>>>>>>>        /    \
>>>>>>>>>>   grp_3(200) grp_4(300)
>>>>>>>>>>
>>>>>>>>>> If grp_2, grp_3 and grp_4 are contending for I/O bandwidth,
>>>>>>>>>> grp_2 will share 80% of the total bandwidth.
>>>>>>>>>> For the sub-groups, grp_3 shares 8% (20% * 40%) and grp_4 shares 12% (20% * 60%).
>>>>>>>>>>
>>>>>>>>>> Design:
>>>>>>>>>>   o Each cfq group has its own group service tree.
>>>>>>>>>>   o Each cfq group contains a "group schedule entity" (gse) that
>>>>>>>>>>     schedules on parent cfq group's service tree.
>>>>>>>>>>   o Each cfq group contains a "queue schedule entity"(qse), it
>>>>>>>>>>     represents all cfqqs located on this cfq group. It schedules
>>>>>>>>>>     on this group's service tree. For the time being, root group
>>>>>>>>>>     qse's weight is 1000, and subgroup qse's weight is 500.
>>>>>>>>>>   o All gses and qse which belones to a same cfq group schedules
>>>>>>>>>>     on the same group service tree.
>>>>>>>>> Hi Gui,
>>>>>>>>>
>>>>>>>>> Thanks for the patch. I have a few questions.
>>>>>>>>>
>>>>>>>>> - So how does the hierarchy look, w.r.t. the root group? Something as
>>>>>>>>>   follows?
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>                     root
>>>>>>>>>                    / | \
>>>>>>>>>                  q1  q2 G1
>>>>>>>>>
>>>>>>>>> Assume there are two processes doing IO in the root group, q1 and q2 are
>>>>>>>>> the cfqq queues for those processes, and G1 is the cgroup created by the user.
>>>>>>>>>
>>>>>>>>> If yes, then what algorithm do you use to do scheduling between q1, q2
>>>>>>>>> and G1? IOW, currently we have two algorithms operating in CFQ: one for
>>>>>>>>> cfqqs and the other for groups. The group algorithm does not use the
>>>>>>>>> logic of cfq_slice_offset().
>>>>>>>> Hi Vivek,
>>>>>>>>
>>>>>>>> This patch doesn't break the original scheduling logic. That is cfqg => st => cfqq.
>>>>>>>> If q1 and q2 are in the root group, I treat the q1 and q2 bundle as a queue
>>>>>>>> sched entity, and it will schedule on the root group service tree with G1, as follows:
>>>>>>>>
>>>>>>>>                          root group
>>>>>>>>                         /         \
>>>>>>>>                     qse(q1,q2)    gse(G1)
>>>>>>>>
>>>>>>> Ok, that's interesting. That raises another question of how the
>>>>>>> hierarchy should look. IOW, how queues and groups should be treated
>>>>>>> in the hierarchy.
>>>>>>>
>>>>>>> The CFS cpu scheduler treats queues and groups at the same level, as
>>>>>>> follows.
>>>>>>>
>>>>>>>                        root
>>>>>>>                        / | \
>>>>>>>                       q1 q2 G1
>>>>>>>
>>>>>>> In the past I had raised this question, and Jens and Corrado liked
>>>>>>> treating queues and groups at the same level.
>>>>>>>
>>>>>>> Logically, q1, q2 and G1 are all children of root, so it makes sense to
>>>>>>> treat them at the same level and not lump q1 and q2 into a single
>>>>>>> entity and group.
>>>>>>>
>>>>>>> One possible way forward could be this.
>>>>>>>
>>>>>>> - Treat queues and groups at the same level (like CFS)
>>>>>>>
>>>>>>> - Get rid of the cfq_slice_offset() logic. That means that without
>>>>>>>  idling, there will be no ioprio difference between cfq queues. I think
>>>>>>>  that logic helps in so few situations today that I would not mind
>>>>>>>  getting rid of it. Just that Jens should agree to it.
>>>>>>>
>>>>>>> - This new scheme will break the existing semantics of the root group
>>>>>>>  being at the same level as child groups. To avoid that, we can probably
>>>>>>>  implement two modes (flat and hierarchical), something similar to what
>>>>>>>  the memory cgroup controller has done. Maybe one tunable in the root
>>>>>>>  cgroup of blkio, "use_hierarchy". By default everything will be in flat
>>>>>>>  mode, and if the user wants hierarchical control, he needs to set
>>>>>>>  use_hierarchy in the root group.
>>>>>> Vivek, maybe I am reading you wrong here. But you are first
>>>>>> suggesting adding more complexity to treat queues and groups at the
>>>>>> same level. Then you are suggesting adding even more complexity to fix
>>>>>> the problems caused by that approach.
>>>>>>
>>>>>> Why do we need to treat queues and groups at the same level? "CFS does
>>>>>> it" is not a good argument.
>>>>> Sure, it is not a very good argument, but at the same time one would
>>>>> need a very good argument for why we should do things differently.
>>>>>
>>>>> - If a user has mounted the cpu and blkio controllers together and the
>>>>>   two controllers are viewing the same hierarchy differently, that is
>>>>>   odd. We need a good reason why a different arrangement makes sense.
>>>> Hi Vivek,
>>>>
>>>> Even if we mount cpu and blkio together, to me, it's OK for cpu and blkio
>>>> to have their own logic, since they are totally different cgroup subsystems.
>>>>
>>>>> - To me, both groups and cfq queues are children of the root group, and
>>>>>   it makes sense to treat them as independent children instead of putting
>>>>>   all the queues in one logical group which inherits the weight of the
>>>>>   parent.
>>>>>
>>>>> - With this new scheme, I am finding it hard to visualize the hierarchy.
>>>>>   How do you assign the weights to the queue entities of a group? It is
>>>>>   more like an invisible group within a group. We shall have to create a
>>>>>   new tunable which can specify the weight of this hidden group.
>>>> For the time being, the root "qse" weight is 1000 and the others are 500;
>>>> they don't inherit the weight of the parent. I was thinking that maybe we
>>>> can determine the qse weight in terms of the number and weight of the
>>>> queues in this group and its subgroups.
>>>>
>>>> Thanks,
>>>> Gui
>>>>
>>>>>
>>>>> So in summary I am liking the "queue at same level as group" scheme for
>>>>> these reasons.
>>>>>
>>>>> - It is more intuitive to visualize and implement. It follows the true
>>>>>   hierarchy as seen by the cgroup file system.
>>>>>
>>>>> - CFS has already implemented this scheme. So we need a strong argument
>>>>>   to justify why we should not follow the same thing. Especially for
>>>>>   the case where the user has co-mounted the cpu and blkio controllers.
>>>>>
>>>>> - It can achieve the same goal as the "hidden group" proposal just by
>>>>>   creating a cgroup explicitly and moving all threads into that group.
>>>>>
>>>>> Why do you think the "hidden group" proposal is better than "treating
>>>>> queues at the same level as groups"?
>>> There are multiple reasons why the "hidden group" proposal is a better
>>> approach.
>>>
>>> - "Hidden group" would allow us to keep scheduling queues using the
>>> CFQ queue scheduling logic, and it does not require any major changes in
>>> CFQ. Aren't we already using that approach to deal with queues in the
>>> root group?
>> Currently we are operating in flat mode, where all the groups are at the
>> same level (irrespective of their position in the cgroup hierarchy).
>>
>>> - If queues and groups are treated at the same level, queues can end
>>> up in the root cgroup. And we cannot put an upper bound on the number of
>>> those queues. Those queues can consume system resources in proportion
>>> to their number, causing the performance of groups to suffer. If we
>>> have a "hidden group", we can configure it with a small weight, and that
>>> would limit the impact these queues in the root group can have.
>> To limit the impact of other queues in a cgroup, one can use libcgroup to
>> automatically place new threads or tasks into a subgroup.
>>
>> I understand that the kernel doing it by default would help, though. It is
>> less work in terms of configuration. But I am not sure that's a good
>> argument for designing kernel functionality. Kernel functionality should
>> be pretty generic.
>>
>> Anyway, how would you assign the weight to the hidden group? What's the
>> interface for that? A new cgroup file inside each cgroup? Personally
>> I think that's a little odd as an interface: every group has one hidden
>> group where all the queues in that group go, and the weight of that
>> hidden group can be specified by a cgroup file.
> 
> I think picking a reasonable default weight at compile time is not
> that bad an option, given that threads showing up in the "hidden
> group" is an uncommon case.

Hi Nauman,

Later, I think we might adjust the weight of the "hidden group" automatically
according to the number of queues, the number of subgroups, and their weights.
But for the time being, I'd choose a fixed value for the sake of simplicity.
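
Just to illustrate the direction, a rough sketch of what such an
auto-adjusted weight could look like. This is purely hypothetical,
nothing in the posted patch does this, and all names are made up:

/*
 * Hypothetical sketch: let the implicit queue entity's weight scale
 * with the number of busy cfqqs it bundles, so one plain queue counts
 * like one group of average weight.
 */
static unsigned int auto_qse_weight(unsigned int nr_busy_queues,
				    unsigned int avg_queue_weight)
{
	return nr_busy_queues * avg_queue_weight;
}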

Gui

> 
>> But anyway, I am not tied to any of the approach. I am just trying to
>> make sure that we have put enough thought into it as changing it later
>> will be hard.
>>
>> Vivek
>>
> 
> 


* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-09-01  8:50           ` Gui Jianfeng
  2010-09-01 15:49             ` Nauman Rafique
@ 2010-09-02  2:20             ` Vivek Goyal
  1 sibling, 0 replies; 19+ messages in thread
From: Vivek Goyal @ 2010-09-02  2:20 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Nauman Rafique, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki

On Wed, Sep 01, 2010 at 04:50:22PM +0800, Gui Jianfeng wrote:

[..]

> Hi Vivek,
> 
> Even if we mount cpu and blkio together, to me, it's OK for cpu and blkio
> to have their own logic, since they are totally different cgroup subsystems.
> 
> > 
> > - To me, both groups and cfq queues are children of the root group, and
> >   it makes sense to treat them as independent children instead of putting
> >   all the queues in one logical group which inherits the weight of the
> >   parent.
> > 
> > - With this new scheme, I am finding it hard to visualize the hierarchy.
> >   How do you assign the weights to the queue entities of a group? It is
> >   more like an invisible group within a group. We shall have to create a
> >   new tunable which can specify the weight of this hidden group.
> 
> For the time being, the root "qse" weight is 1000 and the others are 500;
> they don't inherit the weight of the parent. I was thinking that maybe we
> can determine the qse weight in terms of the number and weight of the
> queues in this group and its subgroups.

If you decide the qse weight in terms of queue number, then you are back
to the same problem Nauman was mentioning: the share of a group is not
fixed and depends on the number of queues in the group.
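
A quick stand-alone illustration. The weight-per-queue rule and all
names here are assumptions for the sake of the example, not something
from the patch:

/*
 * If the queue entity's weight tracks the queue count, a sibling
 * group's share moves whenever tasks come and go next to it.
 */
#include <stdio.h>

#define WEIGHT_PER_QUEUE	100	/* assumed auto-weight rule */

static double group_share(unsigned int group_weight, unsigned int nr_queues)
{
	unsigned int qse = nr_queues * WEIGHT_PER_QUEUE;

	return (double)group_weight / (group_weight + qse);
}

int main(void)
{
	/* G1 has weight 500; only the queue count next to it changes. */
	printf("1 queue  : G1 gets %.0f%%\n", 100 * group_share(500, 1));
	printf("10 queues: G1 gets %.0f%%\n", 100 * group_share(500, 10));
	return 0;
}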

Vivek


* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-09-01  9:02         ` KAMEZAWA Hiroyuki
@ 2010-09-02  2:29           ` Vivek Goyal
  2010-09-02  2:42             ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 19+ messages in thread
From: Vivek Goyal @ 2010-09-02  2:29 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, Nauman Rafique, linux kernel mailing list

On Wed, Sep 01, 2010 at 06:02:43PM +0900, KAMEZAWA Hiroyuki wrote:

[..]
> > > One possible way forward could be this.
> > > 
> > > - Treat queues and groups at the same level (like CFS)
> > > 
> > > - Get rid of the cfq_slice_offset() logic. That means that without
> > >   idling, there will be no ioprio difference between cfq queues. I think
> > >   that logic helps in so few situations today that I would not mind
> > >   getting rid of it. Just that Jens should agree to it.
> > > 
> > > - This new scheme will break the existing semantics of the root group
> > >   being at the same level as child groups. To avoid that, we can probably
> > >   implement two modes (flat and hierarchical), something similar to what
> > >   the memory cgroup controller has done. Maybe one tunable in the root
> > >   cgroup of blkio, "use_hierarchy". By default everything will be in flat
> > >   mode, and if the user wants hierarchical control, he needs to set
> > >   use_hierarchy in the root group.
> > > 
> > >   I think the memory controller provides the "use_hierarchy" tunable in
> > >   each cgroup. I am not sure why we need it in each cgroup and not just
> > >   in the root cgroup.
> > 
> > I think Kamezawa-san should be able to answer this question. :)
> > 
> 
> At first, please be aware that hierarchical accounting is _very_ slow.
> Please measure how slow hierarchical accounting (of 4-6 levels) is ;)
> 
> Then, there are 2 use cases.
> 
>  1)  root/to/some/directory/A
>                            /B
>                            /C
>                             ....
>      All of A, B, C, ... are flat cgroups with no relationship, not sharing
>      a limit. In this case, hierarchy should not be enabled.
> 
>  2)  root/to/some/directory/Gold/A,B,C...
>                             Silver/D,E,F
> 
>      All of A, B, C, ... are limited by "Gold" or "Silver".
>      But Gold and Silver have no relationship; they have their own limitations.
>      And A, B, C, D, E, F share the limit under Gold or Silver.
>      In this case, hierarchy at "root/to/some/directory" should be disabled.
>      Gold/ and Silver/ should have use_hierarchy=1.
> 
> (Think of Gold and Silver as containers, where the user of a container
>  divides memory into A, B, C, ...)
> 
> For example, libvirt makes a very long "root/to/some/directory" ...
> I never want to count up all the counters in the hierarchy even if
> we'd like to use some fantastic hierarchical accounting under a container.
> 
> I don't like an "all or nothing" option (such as making use_hierarchy a
> mount option or a parameter on the root cgroup, etc.). So, a mixture is
> allowed.

Hi Kame San,

If you don't want any relationship between Gold and Silver, then one can
make root an unlimited group (limit_in_bytes = -1), and practically Gold
and Silver have no dependency. Is there then any need to set use_hierarchy
differently at the root level and inside the Gold/ and Silver/ groups?

It sounds like you did it for two reasons.

- It can potentially provide more flexibility.
- Performance, so that you can stop hierarchical accounting before it
  goes all the way to root (the libvirt example).

I think for the blkio controller we can probably begin with either a
mount-time option or a use_hierarchy file in the root group, and then
later make it per group if there are use cases.
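
Something like the following sketch. The structure layout and names
are only for illustration and do not match the posted patch:

/*
 * A root-only "use_hierarchy" knob gating flat vs. hierarchical mode
 * when picking which service tree a group schedules on.
 */
struct cfqg_sketch {
	struct cfqg_sketch *parent;
};

struct cfqd_sketch {
	struct cfqg_sketch *root_group;
	int use_hierarchy;		/* set via a file in the root cgroup */
};

static struct cfqg_sketch *
sched_parent(struct cfqd_sketch *cfqd, struct cfqg_sketch *cfqg)
{
	if (!cfqd->use_hierarchy)
		return cfqd->root_group;	/* flat: everyone under root */
	return cfqg->parent ? cfqg->parent : cfqd->root_group;
}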

Vivek


* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
  2010-09-02  2:29           ` Vivek Goyal
@ 2010-09-02  2:42             ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-02  2:42 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah,
	Corrado Zoccolo, Nauman Rafique, linux kernel mailing list

On Wed, 1 Sep 2010 22:29:36 -0400
Vivek Goyal <vgoyal@redhat.com> wrote:

> On Wed, Sep 01, 2010 at 06:02:43PM +0900, KAMEZAWA Hiroyuki wrote:
> 
> [..]
> > > > One possible way forward could be this.
> > > > 
> > > > - Treat queues and groups at the same level (like CFS)
> > > > 
> > > > - Get rid of the cfq_slice_offset() logic. That means that without
> > > >   idling, there will be no ioprio difference between cfq queues. I think
> > > >   that logic helps in so few situations today that I would not mind
> > > >   getting rid of it. Just that Jens should agree to it.
> > > > 
> > > > - This new scheme will break the existing semantics of the root group
> > > >   being at the same level as child groups. To avoid that, we can probably
> > > >   implement two modes (flat and hierarchical), something similar to what
> > > >   the memory cgroup controller has done. Maybe one tunable in the root
> > > >   cgroup of blkio, "use_hierarchy". By default everything will be in flat
> > > >   mode, and if the user wants hierarchical control, he needs to set
> > > >   use_hierarchy in the root group.
> > > > 
> > > >   I think the memory controller provides the "use_hierarchy" tunable in
> > > >   each cgroup. I am not sure why we need it in each cgroup and not just
> > > >   in the root cgroup.
> > > 
> > > I think Kamezawa-san should be able to answer this question. :)
> > > 
> > 
> > At first, please be aware that hierarchical accounting is _very_ slow.
> > Please measure how slow hierarchical accounting (of 4-6 levels) is ;)
> > 
> > Then, there are 2 use cases.
> > 
> >  1)  root/to/some/directory/A
> >                            /B
> >                            /C
> >                             ....
> >      All of A, B, C, ... are flat cgroups with no relationship, not sharing
> >      a limit. In this case, hierarchy should not be enabled.
> > 
> >  2)  root/to/some/directory/Gold/A,B,C...
> >                             Silver/D,E,F
> > 
> >      All of A, B, C, ... are limited by "Gold" or "Silver".
> >      But Gold and Silver have no relationship; they have their own limitations.
> >      And A, B, C, D, E, F share the limit under Gold or Silver.
> >      In this case, hierarchy at "root/to/some/directory" should be disabled.
> >      Gold/ and Silver/ should have use_hierarchy=1.
> > 
> > (Think of Gold and Silver as containers, where the user of a container
> >  divides memory into A, B, C, ...)
> > 
> > For example, libvirt makes a very long "root/to/some/directory" ...
> > I never want to count up all the counters in the hierarchy even if
> > we'd like to use some fantastic hierarchical accounting under a container.
> > 
> > I don't like an "all or nothing" option (such as making use_hierarchy a
> > mount option or a parameter on the root cgroup, etc.). So, a mixture is
> > allowed.
> 
> Hi Kame San,
> 
> If you don't want any relationship between Gold and Silver, then one can
> make root an unlimited group (limit_in_bytes = -1), and practically Gold
> and Silver have no dependency. Is there then any need to set use_hierarchy
> differently at the root level and inside the Gold/ and Silver/ groups?
> 
And count up accounting across 4 levels? ;)

We allow a mixture of 
	/root/to/some/directory/Gold/
			       /Silver
	     /Extraone

Gold and Silver can be under some limitation, of course.

(For example, Extraone is for the system admin and not for users.
 The system admin is in a different container than the users.)



> It sounds like you did it for two reasons.
> 
> - It can potentially provide more flexibility. 
Right.

> - Performance, so that you can stop hierarchical accounting before it
>   goes all the way to root (the libvirt example).
Yes.


> 
> I think for the blkio controller we can probably begin with either a
> mount-time option or a use_hierarchy file in the root group, and then
> later make it per group if there are use cases.
> 

I hope for something flexible. The complexity it adds to the code is not
very big.
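
For reference, the cost I worry about is the per-charge walk, roughly
like this sketch (types and names are illustrative only):

/*
 * With "charge its ancestors" semantics, every service charge walks
 * to the root, so the bookkeeping cost per served slice is O(depth).
 * With libvirt-style paths that is easily 4-6 levels.
 */
struct grp_sketch {
	struct grp_sketch *parent;
	unsigned long long vdisktime;
};

static void charge_hierarchy(struct grp_sketch *g, unsigned long long served)
{
	for (; g; g = g->parent)
		g->vdisktime += served;
}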

Thanks,
-Kame




Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
2010-08-30  6:50 [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support Gui Jianfeng
2010-08-30 18:20 ` Chad Talbott
2010-08-31  0:35   ` Gui Jianfeng
2010-08-30 20:36 ` Vivek Goyal
2010-08-31  0:29   ` Gui Jianfeng
2010-08-31 12:57     ` Vivek Goyal
2010-08-31 15:40       ` Nauman Rafique
2010-08-31 19:25         ` Vivek Goyal
2010-09-01  8:50           ` Gui Jianfeng
2010-09-01 15:49             ` Nauman Rafique
2010-09-01 17:10               ` Vivek Goyal
2010-09-01 17:15                 ` Nauman Rafique
2010-09-01 17:21                   ` Vivek Goyal
2010-09-02  0:30                   ` Gui Jianfeng
2010-09-02  2:20             ` Vivek Goyal
2010-09-01  8:48       ` Gui Jianfeng
2010-09-01  9:02         ` KAMEZAWA Hiroyuki
2010-09-02  2:29           ` Vivek Goyal
2010-09-02  2:42             ` KAMEZAWA Hiroyuki
