* [PATCH 1/6 v3] cfq-iosched: Introduce cfq_entity for CFQ queue
From: Gui Jianfeng @ 2010-12-27  8:50 UTC
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

Introduce cfq_entity for CFQ queue.
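
The patch embeds the new entity in struct cfq_queue and recovers the queue
from an entity pointer via container_of() (cfqq_of_entity() below). A
minimal stand-alone sketch of that pattern, using hypothetical names rather
than the kernel types:

#include <stddef.h>
#include <stdio.h>

/* simplified form of the kernel's container_of() */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct entity {
	unsigned long rb_key;		/* sort key on the service tree */
};

struct queue {
	struct entity e;		/* embedded schedule entity */
	int pid;
};

static struct queue *queue_of_entity(struct entity *e)
{
	return e ? container_of(e, struct queue, e) : NULL;
}

int main(void)
{
	struct queue q = { .e = { .rb_key = 42 }, .pid = 7 };
	struct entity *e = &q.e;	/* what the service tree stores */

	printf("pid=%d key=%lu\n", queue_of_entity(e)->pid, e->rb_key);
	return 0;
}

The service tree then stores only entity nodes, and callers convert back to
the queue at the boundary; this is what later lets queues and groups share
a service tree.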

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/cfq-iosched.c |  119 ++++++++++++++++++++++++++++++++------------------
 1 files changed, 76 insertions(+), 43 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c19d015..6b74302 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -91,20 +91,31 @@ struct cfq_rb_root {
 #define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
 			.count = 0, .min_vdisktime = 0, }
 
+
+/*
+ * This is the CFQ queue scheduling entity, which is scheduled on a service tree.
+ */
+struct cfq_entity {
+	/* service tree */
+	struct cfq_rb_root *service_tree;
+	/* service_tree member */
+	struct rb_node rb_node;
+	/* service_tree key, represents the position on the tree */
+	unsigned long rb_key;
+};
+
 /*
  * Per process-grouping structure
  */
 struct cfq_queue {
+	/* The schedule entity */
+	struct cfq_entity cfqe;
 	/* reference count */
 	atomic_t ref;
 	/* various state flags, see below */
 	unsigned int flags;
 	/* parent cfq_data */
 	struct cfq_data *cfqd;
-	/* service_tree member */
-	struct rb_node rb_node;
-	/* service_tree key */
-	unsigned long rb_key;
 	/* prio tree member */
 	struct rb_node p_node;
 	/* prio tree root we belong to, if any */
@@ -143,7 +154,6 @@ struct cfq_queue {
 	u32 seek_history;
 	sector_t last_request_pos;
 
-	struct cfq_rb_root *service_tree;
 	struct cfq_queue *new_cfqq;
 	struct cfq_group *cfqg;
 	struct cfq_group *orig_cfqg;
@@ -302,6 +312,15 @@ struct cfq_data {
 	struct rcu_head rcu;
 };
 
+static inline struct cfq_queue *
+cfqq_of_entity(struct cfq_entity *cfqe)
+{
+	if (cfqe)
+		return container_of(cfqe, struct cfq_queue, cfqe);
+
+	return NULL;
+}
+
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
@@ -743,7 +762,7 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
 /*
  * The below is leftmost cache rbtree addon
  */
-static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
+static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
 {
 	/* Service tree is empty */
 	if (!root->count)
@@ -753,7 +772,7 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
 		root->left = rb_first(&root->rb);
 
 	if (root->left)
-		return rb_entry(root->left, struct cfq_queue, rb_node);
+		return rb_entry(root->left, struct cfq_entity, rb_node);
 
 	return NULL;
 }
@@ -1170,21 +1189,24 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 				 bool add_front)
 {
+	struct cfq_entity *cfqe;
 	struct rb_node **p, *parent;
-	struct cfq_queue *__cfqq;
+	struct cfq_entity *__cfqe;
 	unsigned long rb_key;
 	struct cfq_rb_root *service_tree;
 	int left;
 	int new_cfqq = 1;
 	int group_changed = 0;
 
+	cfqe = &cfqq->cfqe;
+
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	if (!cfqd->cfq_group_isolation
 	    && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
 	    && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
 		/* Move this cfq to root group */
 		cfq_log_cfqq(cfqd, cfqq, "moving to root group");
-		if (!RB_EMPTY_NODE(&cfqq->rb_node))
+		if (!RB_EMPTY_NODE(&cfqe->rb_node))
 			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
 		cfqq->orig_cfqg = cfqq->cfqg;
 		cfqq->cfqg = &cfqd->root_group;
@@ -1194,7 +1216,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
 		/* cfqq is sequential now needs to go to its original group */
 		BUG_ON(cfqq->cfqg != &cfqd->root_group);
-		if (!RB_EMPTY_NODE(&cfqq->rb_node))
+		if (!RB_EMPTY_NODE(&cfqe->rb_node))
 			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
 		cfq_put_cfqg(cfqq->cfqg);
 		cfqq->cfqg = cfqq->orig_cfqg;
@@ -1209,9 +1231,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (cfq_class_idle(cfqq)) {
 		rb_key = CFQ_IDLE_DELAY;
 		parent = rb_last(&service_tree->rb);
-		if (parent && parent != &cfqq->rb_node) {
-			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
-			rb_key += __cfqq->rb_key;
+		if (parent && parent != &cfqe->rb_node) {
+			__cfqe = rb_entry(parent, struct cfq_entity, rb_node);
+			rb_key += __cfqe->rb_key;
 		} else
 			rb_key += jiffies;
 	} else if (!add_front) {
@@ -1226,37 +1248,37 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfqq->slice_resid = 0;
 	} else {
 		rb_key = -HZ;
-		__cfqq = cfq_rb_first(service_tree);
-		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
+		__cfqe = cfq_rb_first(service_tree);
+		rb_key += __cfqe ? __cfqe->rb_key : jiffies;
 	}
 
-	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
+	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 		new_cfqq = 0;
 		/*
 		 * same position, nothing more to do
 		 */
-		if (rb_key == cfqq->rb_key &&
-		    cfqq->service_tree == service_tree)
+		if (rb_key == cfqe->rb_key &&
+		    cfqe->service_tree == service_tree)
 			return;
 
-		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
-		cfqq->service_tree = NULL;
+		cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+		cfqe->service_tree = NULL;
 	}
 
 	left = 1;
 	parent = NULL;
-	cfqq->service_tree = service_tree;
+	cfqe->service_tree = service_tree;
 	p = &service_tree->rb.rb_node;
 	while (*p) {
 		struct rb_node **n;
 
 		parent = *p;
-		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
+		__cfqe = rb_entry(parent, struct cfq_entity, rb_node);
 
 		/*
 		 * sort by key, that represents service time.
 		 */
-		if (time_before(rb_key, __cfqq->rb_key))
+		if (time_before(rb_key, __cfqe->rb_key))
 			n = &(*p)->rb_left;
 		else {
 			n = &(*p)->rb_right;
@@ -1267,11 +1289,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	}
 
 	if (left)
-		service_tree->left = &cfqq->rb_node;
+		service_tree->left = &cfqe->rb_node;
 
-	cfqq->rb_key = rb_key;
-	rb_link_node(&cfqq->rb_node, parent, p);
-	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
+	cfqe->rb_key = rb_key;
+	rb_link_node(&cfqe->rb_node, parent, p);
+	rb_insert_color(&cfqe->rb_node, &service_tree->rb);
 	service_tree->count++;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
@@ -1373,13 +1395,16 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  */
 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+	struct cfq_entity *cfqe;
 	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_clear_cfqq_on_rr(cfqq);
 
-	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
-		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
-		cfqq->service_tree = NULL;
+	cfqe = &cfqq->cfqe;
+
+	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+		cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+		cfqe->service_tree = NULL;
 	}
 	if (cfqq->p_root) {
 		rb_erase(&cfqq->p_node, cfqq->p_root);
@@ -1707,13 +1732,13 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 		return NULL;
 	if (RB_EMPTY_ROOT(&service_tree->rb))
 		return NULL;
-	return cfq_rb_first(service_tree);
+	return cfqq_of_entity(cfq_rb_first(service_tree));
 }
 
 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
 {
 	struct cfq_group *cfqg;
-	struct cfq_queue *cfqq;
+	struct cfq_entity *cfqe;
 	int i, j;
 	struct cfq_rb_root *st;
 
@@ -1724,9 +1749,11 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
 	if (!cfqg)
 		return NULL;
 
-	for_each_cfqg_st(cfqg, i, j, st)
-		if ((cfqq = cfq_rb_first(st)) != NULL)
-			return cfqq;
+	for_each_cfqg_st(cfqg, i, j, st) {
+		cfqe = cfq_rb_first(st);
+		if (cfqe != NULL)
+			return cfqq_of_entity(cfqe);
+	}
 	return NULL;
 }
 
@@ -1863,9 +1890,12 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
 
 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+	struct cfq_entity *cfqe;
 	enum wl_prio_t prio = cfqq_prio(cfqq);
-	struct cfq_rb_root *service_tree = cfqq->service_tree;
+	struct cfq_rb_root *service_tree;
 
+	cfqe = &cfqq->cfqe;
+	service_tree = cfqe->service_tree;
 	BUG_ON(!service_tree);
 	BUG_ON(!service_tree->count);
 
@@ -2075,7 +2105,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 				struct cfq_group *cfqg, enum wl_prio_t prio)
 {
-	struct cfq_queue *queue;
+	struct cfq_entity *cfqe;
 	int i;
 	bool key_valid = false;
 	unsigned long lowest_key = 0;
@@ -2083,10 +2113,10 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 
 	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
 		/* select the one with lowest rb_key */
-		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
-		if (queue &&
-		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
-			lowest_key = queue->rb_key;
+		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
+		if (cfqe &&
+		    (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
+			lowest_key = cfqe->rb_key;
 			cur_best = i;
 			key_valid = true;
 		}
@@ -2833,7 +2863,10 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc)
 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			  pid_t pid, bool is_sync)
 {
-	RB_CLEAR_NODE(&cfqq->rb_node);
+	struct cfq_entity *cfqe;
+
+	cfqe = &cfqq->cfqe;
+	RB_CLEAR_NODE(&cfqe->rb_node);
 	RB_CLEAR_NODE(&cfqq->p_node);
 	INIT_LIST_HEAD(&cfqq->fifo);
 
@@ -3242,7 +3275,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	/* Allow preemption only if we are idling on sync-noidle tree */
 	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
 	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
-	    new_cfqq->service_tree->count == 2 &&
+	    new_cfqq->cfqe.service_tree->count == 2 &&
 	    RB_EMPTY_ROOT(&cfqq->sort_list))
 		return true;
 
-- 
1.6.5.2

* [PATCH 2/6 v3] cfq-iosched: Introduce cfq_entity for CFQ group
From: Gui Jianfeng @ 2010-12-27  8:50 UTC
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

Introduce cfq_entity for CFQ group.
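
With queues and groups sharing one entity type, vdisktime charging goes
through cfq_scale_slice() on the entity's weight (see the hunk below). A
hedged sketch of that scaling, with plain 64-bit division standing in for
do_div() and the CFQ_SERVICE_SHIFT/BLKIO_WEIGHT_DEFAULT values assumed
rather than taken from this series:

#include <stdio.h>
#include <stdint.h>

#define CFQ_SERVICE_SHIFT	12	/* assumed */
#define BLKIO_WEIGHT_DEFAULT	500	/* assumed */

/* heavier entities accrue vdisktime more slowly */
static uint64_t scale_slice(unsigned long delta, unsigned int weight)
{
	uint64_t d = (uint64_t)delta << CFQ_SERVICE_SHIFT;

	return d * BLKIO_WEIGHT_DEFAULT / weight;
}

int main(void)
{
	/* 100 jiffies of service: doubling the weight halves the charge */
	printf("w=250:  %llu\n", (unsigned long long)scale_slice(100, 250));
	printf("w=500:  %llu\n", (unsigned long long)scale_slice(100, 500));
	printf("w=1000: %llu\n", (unsigned long long)scale_slice(100, 1000));
	return 0;
}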

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/cfq-iosched.c |  111 +++++++++++++++++++++++++++++++--------------------
 1 files changed, 67 insertions(+), 44 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6b74302..a2553c0 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -73,7 +73,7 @@ static DEFINE_IDA(cic_index_ida);
 #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
 
 #define sample_valid(samples)	((samples) > 80)
-#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
+#define rb_entry_entity(node)	rb_entry((node), struct cfq_entity, rb_node)
 
 /*
  * Most of our rbtree usage is for sorting with min extraction, so
@@ -102,6 +102,11 @@ struct cfq_entity {
 	struct rb_node rb_node;
 	/* service_tree key, represents the position on the tree */
 	unsigned long rb_key;
+
+	/* group service_tree key */
+	u64 vdisktime;
+	bool is_group_entity;
+	unsigned int weight;
 };
 
 /*
@@ -183,12 +188,8 @@ enum wl_type_t {
 
 /* This is per cgroup per device grouping structure */
 struct cfq_group {
-	/* group service_tree member */
-	struct rb_node rb_node;
-
-	/* group service_tree key */
-	u64 vdisktime;
-	unsigned int weight;
+	/* cfq group sched entity */
+	struct cfq_entity cfqe;
 
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
@@ -315,12 +316,22 @@ struct cfq_data {
 static inline struct cfq_queue *
 cfqq_of_entity(struct cfq_entity *cfqe)
 {
-	if (cfqe)
+	if (cfqe && !cfqe->is_group_entity)
 		return container_of(cfqe, struct cfq_queue, cfqe);
 
 	return NULL;
 }
 
+static inline struct cfq_group *
+cfqg_of_entity(struct cfq_entity *cfqe)
+{
+	if (cfqe && cfqe->is_group_entity)
+		return container_of(cfqe, struct cfq_group, cfqe);
+
+	return NULL;
+}
+
+
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
@@ -548,12 +559,12 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 
-static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_entity *cfqe)
 {
 	u64 d = delta << CFQ_SERVICE_SHIFT;
 
 	d = d * BLKIO_WEIGHT_DEFAULT;
-	do_div(d, cfqg->weight);
+	do_div(d, cfqe->weight);
 	return d;
 }
 
@@ -578,11 +589,11 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
 static void update_min_vdisktime(struct cfq_rb_root *st)
 {
 	u64 vdisktime = st->min_vdisktime;
-	struct cfq_group *cfqg;
+	struct cfq_entity *cfqe;
 
 	if (st->left) {
-		cfqg = rb_entry_cfqg(st->left);
-		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
+		cfqe = rb_entry_entity(st->left);
+		vdisktime = min_vdisktime(vdisktime, cfqe->vdisktime);
 	}
 
 	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
@@ -613,8 +624,9 @@ static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_entity *cfqe = &cfqg->cfqe;
 
-	return cfq_target_latency * cfqg->weight / st->total_weight;
+	return cfq_target_latency * cfqe->weight / st->total_weight;
 }
 
 static inline void
@@ -777,13 +789,13 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
 	return NULL;
 }
 
-static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
+static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
 {
 	if (!root->left)
 		root->left = rb_first(&root->rb);
 
 	if (root->left)
-		return rb_entry_cfqg(root->left);
+		return rb_entry_entity(root->left);
 
 	return NULL;
 }
@@ -840,9 +852,9 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
 }
 
 static inline s64
-cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
+entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
 {
-	return cfqg->vdisktime - st->min_vdisktime;
+	return entity->vdisktime - st->min_vdisktime;
 }
 
 static void
@@ -850,15 +862,16 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 {
 	struct rb_node **node = &st->rb.rb_node;
 	struct rb_node *parent = NULL;
-	struct cfq_group *__cfqg;
-	s64 key = cfqg_key(st, cfqg);
+	struct cfq_entity *__cfqe;
+	struct cfq_entity *cfqe = &cfqg->cfqe;
+	s64 key = entity_key(st, cfqe);
 	int left = 1;
 
 	while (*node != NULL) {
 		parent = *node;
-		__cfqg = rb_entry_cfqg(parent);
+		__cfqe = rb_entry_entity(parent);
 
-		if (key < cfqg_key(st, __cfqg))
+		if (key < entity_key(st, __cfqe))
 			node = &parent->rb_left;
 		else {
 			node = &parent->rb_right;
@@ -867,21 +880,22 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	}
 
 	if (left)
-		st->left = &cfqg->rb_node;
+		st->left = &cfqe->rb_node;
 
-	rb_link_node(&cfqg->rb_node, parent, node);
-	rb_insert_color(&cfqg->rb_node, &st->rb);
+	rb_link_node(&cfqe->rb_node, parent, node);
+	rb_insert_color(&cfqe->rb_node, &st->rb);
 }
 
 static void
 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-	struct cfq_group *__cfqg;
+	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_entity *__cfqe;
 	struct rb_node *n;
 
 	cfqg->nr_cfqq++;
-	if (!RB_EMPTY_NODE(&cfqg->rb_node))
+	if (!RB_EMPTY_NODE(&cfqe->rb_node))
 		return;
 
 	/*
@@ -891,19 +905,20 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	 */
 	n = rb_last(&st->rb);
 	if (n) {
-		__cfqg = rb_entry_cfqg(n);
-		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
+		__cfqe = rb_entry_entity(n);
+		cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
 	} else
-		cfqg->vdisktime = st->min_vdisktime;
+		cfqe->vdisktime = st->min_vdisktime;
 
 	__cfq_group_service_tree_add(st, cfqg);
-	st->total_weight += cfqg->weight;
+	st->total_weight += cfqe->weight;
 }
 
 static void
 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_entity *cfqe = &cfqg->cfqe;
 
 	BUG_ON(cfqg->nr_cfqq < 1);
 	cfqg->nr_cfqq--;
@@ -913,9 +928,9 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		return;
 
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	st->total_weight -= cfqg->weight;
-	if (!RB_EMPTY_NODE(&cfqg->rb_node))
-		cfq_rb_erase(&cfqg->rb_node, st);
+	st->total_weight -= cfqe->weight;
+	if (!RB_EMPTY_NODE(&cfqe->rb_node))
+		cfq_rb_erase(&cfqe->rb_node, st);
 	cfqg->saved_workload_slice = 0;
 	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
 }
@@ -953,6 +968,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	unsigned int used_sl, charge;
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
+	struct cfq_entity *cfqe = &cfqg->cfqe;
 
 	BUG_ON(nr_sync < 0);
 	used_sl = charge = cfq_cfqq_slice_usage(cfqq);
@@ -963,8 +979,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		charge = cfqq->allocated_slice;
 
 	/* Can't update vdisktime while group is on service tree */
-	cfq_rb_erase(&cfqg->rb_node, st);
-	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
+	cfq_rb_erase(&cfqe->rb_node, st);
+	cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
 	__cfq_group_service_tree_add(st, cfqg);
 
 	/* This group is being expired. Save the context */
@@ -976,8 +992,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	} else
 		cfqg->saved_workload_slice = 0;
 
-	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
-					st->min_vdisktime);
+	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
+		     cfqe->vdisktime, st->min_vdisktime);
 	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
 			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
 			iops_mode(cfqd), cfqq->nr_sectors);
@@ -996,7 +1012,7 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
 void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
 					unsigned int weight)
 {
-	cfqg_of_blkg(blkg)->weight = weight;
+	cfqg_of_blkg(blkg)->cfqe.weight = weight;
 }
 
 static struct cfq_group *
@@ -1025,7 +1041,9 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->rb_node);
+	RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
+
+	cfqg->cfqe.is_group_entity = true;
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1049,7 +1067,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
 					0);
 
-	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
 
 	/* Add group on cfqd list */
 	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
@@ -2209,10 +2227,13 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_group *cfqg;
+	struct cfq_entity *cfqe;
 
 	if (RB_EMPTY_ROOT(&st->rb))
 		return NULL;
-	cfqg = cfq_rb_first_group(st);
+	cfqe = cfq_rb_first_entity(st);
+	cfqg = cfqg_of_entity(cfqe);
+	BUG_ON(!cfqg);
 	update_min_vdisktime(st);
 	return cfqg;
 }
@@ -2870,6 +2891,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	RB_CLEAR_NODE(&cfqq->p_node);
 	INIT_LIST_HEAD(&cfqq->fifo);
 
+	cfqe->is_group_entity = false;
 	atomic_set(&cfqq->ref, 0);
 	cfqq->cfqd = cfqd;
 
@@ -3902,10 +3924,11 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqg = &cfqd->root_group;
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->rb_node);
+	RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
 
 	/* Give preference to root group over other groups */
-	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
+	cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
+	cfqg->cfqe.is_group_entity = true;
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	/*
-- 
1.6.5.2

* [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
From: Gui Jianfeng @ 2010-12-27  8:51 UTC
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

Introduce vdisktime and io weight for CFQ queue scheduling. Io priority now
maps to an io weight in the range [100,1000]. This also gets rid of the
cfq_slice_offset() logic and uses the same scheduling algorithm as CFQ
groups do, which helps when CFQ queues and groups are scheduled on the same
service tree.
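
The mapping itself is added below as cfq_prio_to_weight(). A self-contained
check of the stated table, assuming IOPRIO_BE_NR = 8 and the [100,1000]
weight range from the description:

#include <stdio.h>

#define IOPRIO_BE_NR		8	/* assumed */
#define BLKIO_WEIGHT_MIN	100
#define BLKIO_WEIGHT_MAX	1000

static unsigned int prio_to_weight(unsigned short ioprio)
{
	unsigned int step =
		(BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);

	if (ioprio == 0)
		return BLKIO_WEIGHT_MAX;
	return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
}

int main(void)
{
	unsigned short p;

	/* prints 1000 868 740 612 484 356 228 100, matching the table */
	for (p = 0; p < IOPRIO_BE_NR; p++)
		printf("prio %u -> weight %u\n", p, prio_to_weight(p));
	return 0;
}

The new cfq_get_boost() gives a newly (re)queued entity a vdisktime head
start that grows with its weight, since the divisor is
(BLKIO_WEIGHT_MAX - weight + BLKIO_WEIGHT_MIN). A numeric check, assuming a
sync slice (cfq_slice[1]) of 100 jiffies and the constants above:

#include <stdio.h>
#include <stdint.h>

#define CFQ_SERVICE_SHIFT	12	/* assumed */
#define BLKIO_WEIGHT_DEFAULT	500	/* assumed */
#define BLKIO_WEIGHT_MIN	100
#define BLKIO_WEIGHT_MAX	1000

static uint64_t get_boost(unsigned long sync_slice, unsigned int weight)
{
	uint64_t d = (uint64_t)sync_slice << CFQ_SERVICE_SHIFT;

	return d * BLKIO_WEIGHT_DEFAULT /
	       (BLKIO_WEIGHT_MAX - weight + BLKIO_WEIGHT_MIN);
}

int main(void)
{
	/* weight 100 -> 204800, weight 1000 -> 2048000: a 10x head start */
	printf("%llu\n", (unsigned long long)get_boost(100, 100));
	printf("%llu\n", (unsigned long long)get_boost(100, 1000));
	return 0;
}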

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/cfq-iosched.c |  211 ++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 158 insertions(+), 53 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a2553c0..b598798 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -100,10 +100,7 @@ struct cfq_entity {
 	struct cfq_rb_root *service_tree;
 	/* service_tree member */
 	struct rb_node rb_node;
-	/* service_tree key, represents the position on the tree */
-	unsigned long rb_key;
-
-	/* group service_tree key */
+	/* service_tree key */
 	u64 vdisktime;
 	bool is_group_entity;
 	unsigned int weight;
@@ -115,6 +112,8 @@ struct cfq_entity {
 struct cfq_queue {
 	/* The schedule entity */
 	struct cfq_entity cfqe;
+	/* Reposition time */
+	unsigned long reposition_time;
 	/* reference count */
 	atomic_t ref;
 	/* various state flags, see below */
@@ -313,6 +312,24 @@ struct cfq_data {
 	struct rcu_head rcu;
 };
 
+/*
+ * Map io priority (7 ~ 0) to io weight (100 ~ 1000) as follows
+ *     prio       0    1     2    3    4    5    6     7
+ *     weight  1000  868   740  612  484  356  228   100
+ */
+static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
+{
+	unsigned int step;
+
+	BUG_ON(ioprio >= IOPRIO_BE_NR);
+
+	step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
+	if (ioprio == 0)
+		return BLKIO_WEIGHT_MAX;
+
+	return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
+}
+
 static inline struct cfq_queue *
 cfqq_of_entity(struct cfq_entity *cfqe)
 {
@@ -841,16 +858,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
 }
 
-static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
-				      struct cfq_queue *cfqq)
-{
-	/*
-	 * just an approximation, should be ok.
-	 */
-	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
-		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
-}
-
 static inline s64
 entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
 {
@@ -1199,6 +1206,16 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
 
 #endif /* GROUP_IOSCHED */
 
+static inline u64 cfq_get_boost(struct cfq_data *cfqd,
+				 struct cfq_entity *cfqe)
+{
+	u64 d = cfqd->cfq_slice[1] << CFQ_SERVICE_SHIFT;
+
+	d = d * BLKIO_WEIGHT_DEFAULT;
+	do_div(d, BLKIO_WEIGHT_MAX - cfqe->weight + BLKIO_WEIGHT_MIN);
+	return d;
+}
+
 /*
  * The cfqd->service_trees holds all pending cfq_queue's that have
  * requests waiting to be processed. It is sorted in the order that
@@ -1210,13 +1227,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct cfq_entity *cfqe;
 	struct rb_node **p, *parent;
 	struct cfq_entity *__cfqe;
-	unsigned long rb_key;
-	struct cfq_rb_root *service_tree;
+	struct cfq_rb_root *service_tree, *orig_st;
 	int left;
 	int new_cfqq = 1;
 	int group_changed = 0;
+	s64 key;
 
 	cfqe = &cfqq->cfqe;
+	orig_st = cfqe->service_tree;
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	if (!cfqd->cfq_group_isolation
@@ -1224,8 +1242,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	    && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
 		/* Move this cfq to root group */
 		cfq_log_cfqq(cfqd, cfqq, "moving to root group");
-		if (!RB_EMPTY_NODE(&cfqe->rb_node))
+		if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+			/*
+			 * Group changed, dequeue this CFQ queue from the
+			 * original service tree.
+			 */
+			cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+			orig_st->total_weight -= cfqe->weight;
+		}
 		cfqq->orig_cfqg = cfqq->cfqg;
 		cfqq->cfqg = &cfqd->root_group;
 		atomic_inc(&cfqd->root_group.ref);
@@ -1234,8 +1259,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
 		/* cfqq is sequential now needs to go to its original group */
 		BUG_ON(cfqq->cfqg != &cfqd->root_group);
-		if (!RB_EMPTY_NODE(&cfqe->rb_node))
+		if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+			/*
+			 * Group changed, dequeue this CFQ queue from the
+			 * original service tree.
+			 */
+			cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+			orig_st->total_weight -= cfqe->weight;
+		}
 		cfq_put_cfqg(cfqq->cfqg);
 		cfqq->cfqg = cfqq->orig_cfqg;
 		cfqq->orig_cfqg = NULL;
@@ -1246,47 +1278,71 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
 						cfqq_type(cfqq));
+	/*
+	 * For the time being, put the newly added CFQ queue at the end of the
+	 * service tree.
+	 */
+	if (RB_EMPTY_NODE(&cfqe->rb_node)) {
+		/*
+		 * If this CFQ queue moves to another group, the original
+		 * vdisktime no longer makes sense; reset the vdisktime
+		 * here.
+		 */
+		parent = rb_last(&service_tree->rb);
+		if (parent) {
+			u64 boost;
+			s64 __vdisktime;
+
+			__cfqe = rb_entry_entity(parent);
+			cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
+
+			/* Give some vdisktime boost according to its weight */
+			boost = cfq_get_boost(cfqd, cfqe);
+			__vdisktime = cfqe->vdisktime - boost;
+			if (__vdisktime > service_tree->min_vdisktime)
+				cfqe->vdisktime = __vdisktime;
+			else
+				cfqe->vdisktime = service_tree->min_vdisktime;
+		} else
+			cfqe->vdisktime = service_tree->min_vdisktime;
+
+		goto insert;
+	}
+	/*
+	 * If we get here, this CFQ queue is already on the service tree;
+	 * dequeue it first.
+	 */
+	cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+	orig_st->total_weight -= cfqe->weight;
+
+	new_cfqq = 0;
+
 	if (cfq_class_idle(cfqq)) {
-		rb_key = CFQ_IDLE_DELAY;
 		parent = rb_last(&service_tree->rb);
 		if (parent && parent != &cfqe->rb_node) {
 			__cfqe = rb_entry(parent, struct cfq_entity, rb_node);
-			rb_key += __cfqe->rb_key;
+			cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
 		} else
-			rb_key += jiffies;
+			cfqe->vdisktime = service_tree->min_vdisktime;
 	} else if (!add_front) {
 		/*
-		 * Get our rb key offset. Subtract any residual slice
-		 * value carried from last service. A negative resid
-		 * count indicates slice overrun, and this should position
-		 * the next service time further away in the tree.
-		 */
-		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
-		rb_key -= cfqq->slice_resid;
-		cfqq->slice_resid = 0;
-	} else {
-		rb_key = -HZ;
-		__cfqe = cfq_rb_first(service_tree);
-		rb_key += __cfqe ? __cfqe->rb_key : jiffies;
-	}
-
-	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
-		new_cfqq = 0;
-		/*
-		 * same position, nothing more to do
+		 * We charge the CFQ queue by the time this queue runs, and
+		 * reposition it on the service tree.
 		 */
-		if (rb_key == cfqe->rb_key &&
-		    cfqe->service_tree == service_tree)
-			return;
+		unsigned int used_sl;
 
-		cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
-		cfqe->service_tree = NULL;
+		used_sl = cfq_cfqq_slice_usage(cfqq);
+		cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
+	} else {
+		cfqe->vdisktime = service_tree->min_vdisktime;
 	}
 
+insert:
 	left = 1;
 	parent = NULL;
 	cfqe->service_tree = service_tree;
 	p = &service_tree->rb.rb_node;
+	key = entity_key(service_tree, cfqe);
 	while (*p) {
 		struct rb_node **n;
 
@@ -1296,7 +1352,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		/*
 		 * sort by key, that represents service time.
 		 */
-		if (time_before(rb_key, __cfqe->rb_key))
+		if (key < entity_key(service_tree, __cfqe))
 			n = &(*p)->rb_left;
 		else {
 			n = &(*p)->rb_right;
@@ -1309,10 +1365,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (left)
 		service_tree->left = &cfqe->rb_node;
 
-	cfqe->rb_key = rb_key;
 	rb_link_node(&cfqe->rb_node, parent, p);
 	rb_insert_color(&cfqe->rb_node, &service_tree->rb);
+	update_min_vdisktime(service_tree);
 	service_tree->count++;
+	service_tree->total_weight += cfqe->weight;
+	cfqq->reposition_time = jiffies;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
 	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1414,14 +1472,18 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	struct cfq_entity *cfqe;
+	struct cfq_rb_root *service_tree;
+
 	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_clear_cfqq_on_rr(cfqq);
 
 	cfqe = &cfqq->cfqe;
+	service_tree = cfqe->service_tree;
 
 	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 		cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+		service_tree->total_weight -= cfqe->weight;
 		cfqe->service_tree = NULL;
 	}
 	if (cfqq->p_root) {
@@ -2120,23 +2182,35 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
 	}
 }
 
+/*
+ * The time when a CFQ queue is put onto a service tree is recorded in
+ * cfqq->reposition_time. Currently, we check the first CFQ queue on each
+ * service tree, and select the workload type whose first queue has the
+ * lowest reposition_time.
+ */
 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 				struct cfq_group *cfqg, enum wl_prio_t prio)
 {
 	struct cfq_entity *cfqe;
+	struct cfq_queue *cfqq;
+	unsigned long lowest_start_time;
 	int i;
-	bool key_valid = false;
-	unsigned long lowest_key = 0;
+	bool time_valid = false;
 	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
 
+	/*
+	 * TODO: We may take io priority into account when choosing a workload
+	 * type. But for the time being, use reposition_time only.
+	 */
 	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
-		/* select the one with lowest rb_key */
 		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
-		if (cfqe &&
-		    (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
-			lowest_key = cfqe->rb_key;
+		cfqq = cfqq_of_entity(cfqe);
+		if (cfqe && (!time_valid ||
+			     time_before(cfqq->reposition_time,
+					 lowest_start_time))) {
+			lowest_start_time = cfqq->reposition_time;
 			cur_best = i;
-			key_valid = true;
+			time_valid = true;
 		}
 	}
 
@@ -2807,10 +2881,13 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 {
 	struct task_struct *tsk = current;
 	int ioprio_class;
+	struct cfq_entity *cfqe;
 
 	if (!cfq_cfqq_prio_changed(cfqq))
 		return;
 
+	cfqe = &cfqq->cfqe;
+
 	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
 	switch (ioprio_class) {
 	default:
@@ -2837,6 +2914,17 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 		break;
 	}
 
+	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+		/*
+		 * If this CFQ entity is already on service tree, we need to
+		 * adjust service tree's total weight accordingly.
+		 */
+		cfqe->service_tree->total_weight -= cfqe->weight;
+		cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
+		cfqe->service_tree->total_weight += cfqe->weight;
+	} else
+		cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
+
 	/*
 	 * keep track of original prio settings in case we have to temporarily
 	 * elevate the priority of this queue
@@ -3571,6 +3659,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
  */
 static void cfq_prio_boost(struct cfq_queue *cfqq)
 {
+	struct cfq_entity *cfqe;
+
+	cfqe = &cfqq->cfqe;
 	if (has_fs_excl()) {
 		/*
 		 * boost idle prio on transactions that would lock out other
@@ -3587,6 +3678,20 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
 		cfqq->ioprio_class = cfqq->org_ioprio_class;
 		cfqq->ioprio = cfqq->org_ioprio;
 	}
+
+	/*
+	 * update the io weight if io priority gets changed.
+	 */
+	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+		/*
+		 * If this CFQ entity is already on service tree, we need to
+		 * adjust service tree's total weight accordingly.
+		 */
+		cfqe->service_tree->total_weight -= cfqe->weight;
+		cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
+		cfqe->service_tree->total_weight += cfqe->weight;
+	} else
+		cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
 }
 
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
-- 
1.6.5.2

* [PATCH 4/6 v3] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group
From: Gui Jianfeng @ 2010-12-27  8:51 UTC
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

Extract common service tree handling code shared by CFQ queues and CFQ
groups. This helps when CFQ queues and CFQ groups are scheduled
together.
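
The refactoring below pairs a bare tree operation
(__cfq_entity_service_tree_add/del) with a wrapper that also maintains the
tree's count and total_weight, so the served path can reposition an entity
without disturbing the aggregates. A minimal compilable sketch of that
split, with hypothetical names and stubs standing in for the rbtree ops:

#include <stdio.h>

struct st { int count; unsigned int total_weight; };
struct ent { unsigned int weight; unsigned long vdisktime; };

static void __st_add(struct st *st, struct ent *e)
{
	(void)st; (void)e;	/* rb_link_node()/rb_insert_color() only */
}

static void __st_del(struct st *st, struct ent *e)
{
	(void)st; (void)e;	/* rb_erase() only */
}

static void st_add(struct st *st, struct ent *e)
{
	__st_add(st, e);
	st->count++;
	st->total_weight += e->weight;
}

static void st_del(struct st *st, struct ent *e)
{
	__st_del(st, e);
	st->count--;
	st->total_weight -= e->weight;
}

/* served path: update the sort key without touching the aggregates */
static void st_charge(struct st *st, struct ent *e, unsigned long charge)
{
	__st_del(st, e);	/* the key can't change while enqueued */
	e->vdisktime += charge;	/* stand-in for cfq_scale_slice() */
	__st_add(st, e);
}

int main(void)
{
	struct st st = { 0, 0 };
	struct ent e = { 500, 0 };

	st_add(&st, &e);
	st_charge(&st, &e, 100);
	printf("count=%d weight=%u vt=%lu\n",
	       st.count, st.total_weight, e.vdisktime);
	st_del(&st, &e);
	return 0;
}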

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/cfq-iosched.c |   86 +++++++++++++++++++++-----------------------------
 1 files changed, 36 insertions(+), 50 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index b598798..1f21418 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -865,12 +865,11 @@ entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
 }
 
 static void
-__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
+__cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	struct rb_node **node = &st->rb.rb_node;
 	struct rb_node *parent = NULL;
 	struct cfq_entity *__cfqe;
-	struct cfq_entity *cfqe = &cfqg->cfqe;
 	s64 key = entity_key(st, cfqe);
 	int left = 1;
 
@@ -894,6 +893,14 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 }
 
 static void
+cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
+{
+	__cfq_entity_service_tree_add(st, cfqe);
+	st->count++;
+	st->total_weight += cfqe->weight;
+}
+
+static void
 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
@@ -917,8 +924,23 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	} else
 		cfqe->vdisktime = st->min_vdisktime;
 
-	__cfq_group_service_tree_add(st, cfqg);
-	st->total_weight += cfqe->weight;
+	cfq_entity_service_tree_add(st, cfqe);
+}
+
+static void
+__cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
+{
+	cfq_rb_erase(&cfqe->rb_node, st);
+}
+
+static void
+cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
+{
+	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+		__cfq_entity_service_tree_del(st, cfqe);
+		st->total_weight -= cfqe->weight;
+		cfqe->service_tree = NULL;
+	}
 }
 
 static void
@@ -935,9 +957,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		return;
 
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	st->total_weight -= cfqe->weight;
-	if (!RB_EMPTY_NODE(&cfqe->rb_node))
-		cfq_rb_erase(&cfqe->rb_node, st);
+	cfq_entity_service_tree_del(st, cfqe);
 	cfqg->saved_workload_slice = 0;
 	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
 }
@@ -986,9 +1006,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		charge = cfqq->allocated_slice;
 
 	/* Can't update vdisktime while group is on service tree */
-	cfq_rb_erase(&cfqe->rb_node, st);
+	__cfq_entity_service_tree_del(st, cfqe);
 	cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
-	__cfq_group_service_tree_add(st, cfqg);
+	__cfq_entity_service_tree_add(st, cfqe);
 
 	/* This group is being expired. Save the context */
 	if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1225,13 +1245,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 				 bool add_front)
 {
 	struct cfq_entity *cfqe;
-	struct rb_node **p, *parent;
+	struct rb_node *parent;
 	struct cfq_entity *__cfqe;
 	struct cfq_rb_root *service_tree, *orig_st;
-	int left;
 	int new_cfqq = 1;
 	int group_changed = 0;
-	s64 key;
 
 	cfqe = &cfqq->cfqe;
 	orig_st = cfqe->service_tree;
@@ -1248,8 +1266,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			 * Group changed, dequeue this CFQ queue from the
 			 * original service tree.
 			 */
-			cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
-			orig_st->total_weight -= cfqe->weight;
+			cfq_entity_service_tree_del(orig_st, cfqe);
 		}
 		cfqq->orig_cfqg = cfqq->cfqg;
 		cfqq->cfqg = &cfqd->root_group;
@@ -1265,8 +1282,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			 * Group changed, dequeue this CFQ queue from the
 			 * original service tree.
 			 */
-			cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
-			orig_st->total_weight -= cfqe->weight;
+			cfq_entity_service_tree_del(orig_st, cfqe);
 		}
 		cfq_put_cfqg(cfqq->cfqg);
 		cfqq->cfqg = cfqq->orig_cfqg;
@@ -1312,8 +1328,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	 * If we get here, this CFQ queue is already on the service tree;
 	 * dequeue it first.
 	 */
-	cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
-	orig_st->total_weight -= cfqe->weight;
+	cfq_entity_service_tree_del(orig_st, cfqe);
 
 	new_cfqq = 0;
 
@@ -1338,38 +1353,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	}
 
 insert:
-	left = 1;
-	parent = NULL;
 	cfqe->service_tree = service_tree;
-	p = &service_tree->rb.rb_node;
-	key = entity_key(service_tree, cfqe);
-	while (*p) {
-		struct rb_node **n;
-
-		parent = *p;
-		__cfqe = rb_entry(parent, struct cfq_entity, rb_node);
-
-		/*
-		 * sort by key, that represents service time.
-		 */
-		if (key < entity_key(service_tree, __cfqe))
-			n = &(*p)->rb_left;
-		else {
-			n = &(*p)->rb_right;
-			left = 0;
-		}
-
-		p = n;
-	}
 
-	if (left)
-		service_tree->left = &cfqe->rb_node;
-
-	rb_link_node(&cfqe->rb_node, parent, p);
-	rb_insert_color(&cfqe->rb_node, &service_tree->rb);
+	/* Add cfqq onto service tree. */
+	cfq_entity_service_tree_add(service_tree, cfqe);
 	update_min_vdisktime(service_tree);
-	service_tree->count++;
-	service_tree->total_weight += cfqe->weight;
 	cfqq->reposition_time = jiffies;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
@@ -1482,9 +1470,7 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	service_tree = cfqe->service_tree;
 
 	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
-		cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
-		service_tree->total_weight -= cfqe->weight;
-		cfqe->service_tree = NULL;
+		cfq_entity_service_tree_del(service_tree, cfqe);
 	}
 	if (cfqq->p_root) {
 		rb_erase(&cfqq->p_node, cfqq->p_root);
-- 
1.6.5.2

* [PATCH 5/6 v3] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
From: Gui Jianfeng @ 2010-12-27  8:51 UTC
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

Implement CFQ group hierarchical scheduling and add the blkio.use_hierarchy
interface to control it.
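
Under use_hierarchy, a group's share of cfq_target_latency becomes the
product of its weight fractions along the parent chain (see
cfq_group_slice() below). A worked sketch with a hypothetical two-level
tree and a target latency of 300 jiffies:

#include <stdio.h>

struct entity {
	unsigned int weight;
	unsigned int tree_total_weight;	/* total weight on its tree */
	struct entity *parent;
};

/* walk toward the root, scaling by each level's weight fraction */
static unsigned int group_slice(unsigned int target_latency,
				struct entity *e)
{
	unsigned int slice = target_latency;

	do {
		slice = slice * e->weight / e->tree_total_weight;
		e = e->parent;
	} while (e);
	return slice;
}

int main(void)
{
	/* the root tree holds 1000 total weight; the child owns 250
	 * of its parent's 500-weight subtree */
	struct entity parent = { 500, 1000, NULL };
	struct entity child  = { 250,  500, &parent };

	/* 300 * 250/500 * 500/1000 = 75 jiffies */
	printf("child slice = %u\n", group_slice(300, &child));
	return 0;
}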

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/blk-cgroup.c  |   61 ++++++-
 block/blk-cgroup.h  |    3 +
 block/cfq-iosched.c |  570 ++++++++++++++++++++++++++++++++++++---------------
 3 files changed, 469 insertions(+), 165 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a..c55fecd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -25,7 +25,10 @@
 static DEFINE_SPINLOCK(blkio_list_lock);
 static LIST_HEAD(blkio_list);
 
-struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+struct blkio_cgroup blkio_root_cgroup = {
+	.weight = 2*BLKIO_WEIGHT_DEFAULT,
+	.use_hierarchy = 0
+};
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
@@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
 	blkg->blkcg_id = 0;
 }
 
+
 /*
  * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
  * indicating that blk_group was unhashed by the time we got to it.
@@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 }
 EXPORT_SYMBOL_GPL(blkcg_get_weight);
 
+unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
+{
+	return blkcg->use_hierarchy;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
+
 uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
@@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
 		switch(name) {
 		case BLKIO_PROP_weight:
 			return (u64)blkcg->weight;
+		case BLKIO_PROP_use_hierarchy:
+			return (u64)blkcg->use_hierarchy;
 		}
 		break;
 	default:
@@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
 	return 0;
 }
 
+static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
+{
+	struct cgroup *parent = cgrp->parent;
+	struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
+	int ret = 0;
+
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	blkcg = cgroup_to_blkio_cgroup(cgrp);
+	if (parent)
+		parent_blkcg = cgroup_to_blkio_cgroup(parent);
+
+	cgroup_lock();
+	/*
+	 * If parent's use_hierarchy is set, we can't make any modifications
+	 * in the child subtrees. If it is unset, then the change can occur,
+	 * provided the current cgroup has no children.
+	 */
+	if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
+		if (list_empty(&cgrp->children))
+			blkcg->use_hierarchy = val;
+		else
+			ret = -EBUSY;
+	} else
+		ret = -EINVAL;
+	cgroup_unlock();
+	return ret;
+}
+
 static int
 blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
@@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 		switch(name) {
 		case BLKIO_PROP_weight:
 			return blkio_weight_write(blkcg, val);
+		case BLKIO_PROP_use_hierarchy:
+			return blkio_use_hierarchy_write(cgrp, val);
 		}
 		break;
 	default:
@@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
 		.name = "reset_stats",
 		.write_u64 = blkiocg_reset_stats,
 	},
+	{
+		.name = "use_hierarchy",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+					     BLKIO_PROP_use_hierarchy),
+		.read_u64 = blkiocg_file_read_u64,
+		.write_u64 = blkiocg_file_write_u64,
+	},
 #ifdef CONFIG_BLK_DEV_THROTTLING
 	{
 		.name = "throttle.read_bps_device",
@@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 static struct cgroup_subsys_state *
 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 {
-	struct blkio_cgroup *blkcg;
+	struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
 	struct cgroup *parent = cgroup->parent;
 
 	if (!parent) {
@@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		goto done;
 	}
 
+	parent_blkcg = cgroup_to_blkio_cgroup(parent);
 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 	if (!blkcg)
 		return ERR_PTR(-ENOMEM);
@@ -1462,6 +1514,11 @@ done:
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 
 	INIT_LIST_HEAD(&blkcg->policy_list);
+	if (parent)
+		blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
+	else
+		blkcg->use_hierarchy = 0;
+
 	return &blkcg->css;
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861b..5b4b351 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
 	BLKIO_PROP_idle_time,
 	BLKIO_PROP_empty_time,
 	BLKIO_PROP_dequeue,
+	BLKIO_PROP_use_hierarchy,
 };
 
 /* cgroup files owned by throttle policy */
@@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
 	unsigned int weight;
+	bool use_hierarchy;
 	spinlock_t lock;
 	struct hlist_head blkg_list;
 	struct list_head policy_list; /* list of blkio_policy_node */
@@ -179,6 +181,7 @@ struct blkio_policy_node {
 
 extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 				     dev_t dev);
+extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
 extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
 				     dev_t dev);
 extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1f21418..727d6e6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -104,6 +104,9 @@ struct cfq_entity {
 	u64 vdisktime;
 	bool is_group_entity;
 	unsigned int weight;
+	struct cfq_entity *parent;
+	/* Reposition time */
+	unsigned long reposition_time;
 };
 
 /*
@@ -112,8 +115,6 @@ struct cfq_entity {
 struct cfq_queue {
 	/* The schedule entity */
 	struct cfq_entity cfqe;
-	/* Reposition time */
-	unsigned long reposition_time;
 	/* reference count */
 	atomic_t ref;
 	/* various state flags, see below */
@@ -193,6 +194,9 @@ struct cfq_group {
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
 
+	/* number of sub cfq groups */
+	int nr_subgp;
+
 	/*
 	 * Per group busy queus average. Useful for workload slice calc. We
 	 * create the array for each prio class but at run time it is used
@@ -228,10 +232,11 @@ struct cfq_group {
  */
 struct cfq_data {
 	struct request_queue *queue;
-	/* Root service tree for cfq_groups */
-	struct cfq_rb_root grp_service_tree;
 	struct cfq_group root_group;
 
+	/* cfq group schedule in flat or hierarchy manner. */
+	bool use_hierarchy;
+
 	/*
 	 * The priority currently being served
 	 */
@@ -240,6 +245,9 @@ struct cfq_data {
 	unsigned long workload_expires;
 	struct cfq_group *serving_group;
 
+	/* Service tree for cfq group flat scheduling mode. */
+	struct cfq_rb_root grp_service_tree;
+
 	/*
 	 * Each priority tree is sorted by next_request position.  These
 	 * trees are used when determining if two or more queues are
@@ -349,8 +357,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
 }
 
 
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
-
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
 					    enum wl_prio_t prio,
 					    enum wl_type_t type)
@@ -640,10 +646,19 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
 static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_rb_root *st = cfqe->service_tree;
+	int group_slice = cfq_target_latency;
+
+	/* Calculate group slice in a hierarchical way */
+	do {
+		group_slice = group_slice * cfqe->weight / st->total_weight;
+		cfqe = cfqe->parent;
+		if (cfqe)
+			st = cfqe->service_tree;
+	} while (cfqe);
 
-	return cfq_target_latency * cfqe->weight / st->total_weight;
+	return group_slice;
 }
 
 static inline void
@@ -666,7 +681,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 			/* scale low_slice according to IO priority
 			 * and sync vs async */
 			unsigned low_slice =
-				min(slice, base_low_slice * slice / sync_slice);
+				min(slice, base_low_slice * slice /
+				    sync_slice);
 			/* the adapted slice value is scaled to fit all iqs
 			 * into the target latency */
 			slice = max(slice * group_slice / expect_latency,
@@ -806,17 +822,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
 	return NULL;
 }
 
-static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
-{
-	if (!root->left)
-		root->left = rb_first(&root->rb);
-
-	if (root->left)
-		return rb_entry_entity(root->left);
-
-	return NULL;
-}
-
 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 {
 	rb_erase(n, root);
@@ -890,12 +895,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 
 	rb_link_node(&cfqe->rb_node, parent, node);
 	rb_insert_color(&cfqe->rb_node, &st->rb);
+
+	update_min_vdisktime(st);
 }
 
 static void
 cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	__cfq_entity_service_tree_add(st, cfqe);
+	cfqe->reposition_time = jiffies;
 	st->count++;
 	st->total_weight += cfqe->weight;
 }
@@ -903,34 +911,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 static void
 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
-	struct cfq_entity *__cfqe;
 	struct rb_node *n;
+	struct cfq_entity *entity;
+	struct cfq_rb_root *st;
+	struct cfq_group *__cfqg;
 
 	cfqg->nr_cfqq++;
+
 	if (!RB_EMPTY_NODE(&cfqe->rb_node))
 		return;
 
 	/*
-	 * Currently put the group at the end. Later implement something
-	 * so that groups get lesser vtime based on their weights, so that
-	 * if group does not loose all if it was not continously backlogged.
+	 * Enqueue this group and its ancestors onto their service tree.
 	 */
-	n = rb_last(&st->rb);
-	if (n) {
-		__cfqe = rb_entry_entity(n);
-		cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
-	} else
-		cfqe->vdisktime = st->min_vdisktime;
+	while (cfqe) {
+		if (!RB_EMPTY_NODE(&cfqe->rb_node))
+			return;
+
+		/*
+		 * Currently put the group at the end. Later implement
+		 * something so that groups get lesser vtime based on
+		 * their weights, so that a group does not lose everything
+		 * if it was not continuously backlogged.
+		 */
+		st = cfqe->service_tree;
+		n = rb_last(&st->rb);
+		if (n) {
+			entity = rb_entry_entity(n);
+			cfqe->vdisktime = entity->vdisktime +
+				CFQ_IDLE_DELAY;
+		} else
+			cfqe->vdisktime = st->min_vdisktime;
 
-	cfq_entity_service_tree_add(st, cfqe);
+		cfq_entity_service_tree_add(st, cfqe);
+		cfqe = cfqe->parent;
+		__cfqg = cfqg_of_entity(cfqe);
+		if (__cfqg)
+			__cfqg->nr_subgp++;
+	}
 }
 
 static void
 __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	cfq_rb_erase(&cfqe->rb_node, st);
+	update_min_vdisktime(st);
 }
 
 static void
@@ -939,27 +965,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 		__cfq_entity_service_tree_del(st, cfqe);
 		st->total_weight -= cfqe->weight;
-		cfqe->service_tree = NULL;
 	}
 }
 
 static void
 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_group *__cfqg, *p_cfqg;
 
 	BUG_ON(cfqg->nr_cfqq < 1);
 	cfqg->nr_cfqq--;
 
-	/* If there are other cfq queues under this group, don't delete it */
-	if (cfqg->nr_cfqq)
+	/*
+	 * If there are other cfq queues under this group, or there are other
+	 * cfq groups under this group, don't delete it.
+	 */
+	if (cfqg->nr_cfqq || cfqg->nr_subgp)
 		return;
 
-	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	cfq_entity_service_tree_del(st, cfqe);
-	cfqg->saved_workload_slice = 0;
-	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+	/*
+	 * Dequeue this group and its ancestors from their service
+	 * tree.
+	 */
+	while (cfqe) {
+		__cfqg = cfqg_of_entity(cfqe);
+		p_cfqg = cfqg_of_entity(cfqe->parent);
+		cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
+		cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+		cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+		__cfqg->saved_workload_slice = 0;
+		cfqe = cfqe->parent;
+		if (p_cfqg) {
+			p_cfqg->nr_subgp--;
+			if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+				return;
+		}
+	}
 }
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -991,7 +1033,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 				struct cfq_queue *cfqq)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	unsigned int used_sl, charge;
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
@@ -1005,10 +1046,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
 		charge = cfqq->allocated_slice;
 
-	/* Can't update vdisktime while group is on service tree */
-	__cfq_entity_service_tree_del(st, cfqe);
-	cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
-	__cfq_entity_service_tree_add(st, cfqe);
+	/*
+	 * Update the vdisktime on the whole chain.
+	 */
+	while (cfqe) {
+		struct cfq_rb_root *st = cfqe->service_tree;
+
+		/*
+		 * Can't update vdisktime while group is on service
+		 * tree.
+		 */
+		__cfq_entity_service_tree_del(st, cfqe);
+		cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
+		__cfq_entity_service_tree_add(st, cfqe);
+		st->count++;
+		cfqe->reposition_time = jiffies;
+		cfqe = cfqe->parent;
+	}
 
 	/* This group is being expired. Save the context */
 	if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1020,7 +1074,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		cfqg->saved_workload_slice = 0;
 
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
-		     cfqe->vdisktime, st->min_vdisktime);
+		     cfqg->cfqe.vdisktime,
+		     cfqg->cfqe.service_tree->min_vdisktime);
 	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
 			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
 			iops_mode(cfqd), cfqq->nr_sectors);
@@ -1042,35 +1097,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
 	cfqg_of_blkg(blkg)->cfqe.weight = weight;
 }
 
-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void init_cfqe(struct blkio_cgroup *blkcg,
+				    struct cfq_group *cfqg)
+{
+	struct cfq_entity *cfqe = &cfqg->cfqe;
+
+	cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	RB_CLEAR_NODE(&cfqe->rb_node);
+	cfqe->is_group_entity = true;
+	cfqe->parent = NULL;
+}
+
+static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
+		      struct cfq_group *cfqg)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-	struct cfq_group *cfqg = NULL;
-	void *key = cfqd;
 	int i, j;
 	struct cfq_rb_root *st;
-	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 	unsigned int major, minor;
-
-	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
-	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfqg->blkg.dev = MKDEV(major, minor);
-		goto done;
-	}
-	if (cfqg || !create)
-		goto done;
-
-	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
-	if (!cfqg)
-		goto done;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
-
-	cfqg->cfqe.is_group_entity = true;
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1080,24 +1127,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	 */
 	atomic_set(&cfqg->ref, 1);
 
+	/* Add group onto cgroup list */
+	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+	cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+				    MKDEV(major, minor));
+	/* Initialize the group entity */
+	init_cfqe(blkcg, cfqg);
+	/* Add group on cfqd list */
+	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
+
+static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+	if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+		cfq_destroy_cfqg(cfqd, cfqg);
+}
+
+static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
+			    struct cfq_group *p_cfqg)
+{
+	struct cfq_entity *cfqe, *p_cfqe;
+
+	cfqe = &cfqg->cfqe;
+
 	/*
-	 * Add group onto cgroup list. It might happen that bdi->dev is
-	 * not initiliazed yet. Initialize this new group without major
-	 * and minor info and this info will be filled in once a new thread
-	 * comes for IO. See code above.
+	 * 1. If use_hierarchy of the CGroup containing cfqg's parent is
+	 *    not set, put this cfqg onto the global service tree.
+	 * 2. If cfqg is the root cfqg, put it onto the global service tree.
 	 */
-	if (bdi->dev) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-					MKDEV(major, minor));
-	} else
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-					0);
+	if (!p_cfqg) {
+		cfqe->service_tree = &cfqd->grp_service_tree;
+		cfqe->parent = NULL;
+		return;
+	}
 
-	cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	p_cfqe = &p_cfqg->cfqe;
 
-	/* Add group on cfqd list */
-	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+	cfqe->parent = p_cfqe;
+
+	/*
+	 * Currently, just put the cfq group entity on the "BE:SYNC"
+	 * workload service tree.
+	 */
+	cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
+						      SYNC_WORKLOAD);
+	/* child reference */
+	atomic_inc(&p_cfqg->ref);
+}
+
+static struct cfq_group *cfqg_get_parent(struct cfq_group *cfqg)
+{
+	struct cfq_entity *cfqe, *p_cfqe;
+
+	if (!cfqg)
+		return NULL;
+
+	cfqe = &cfqg->cfqe;
+	p_cfqe = cfqe->parent;
+	if (!p_cfqe)
+		return NULL;
+
+	return cfqg_of_entity(p_cfqe);
+}
+
+static struct cfq_group *
+cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+	struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
+	void *key = cfqd;
+
+	/*
+	 * If the CGroup's use_hierarchy is unset, we only need to allocate
+	 * one CFQ group, and this group will be put onto the
+	 * "grp_service_tree". We don't need to check whether the cfqg
+	 * exists; the caller has already checked it.
+	 */
+	blkcg = cgroup_to_blkio_cgroup(cgroup);
+	if (!blkcg_get_use_hierarchy(blkcg)) {
+		cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+				    cfqd->queue->node);
+		if (!cfqg)
+			return NULL;
+
+		init_cfqg(cfqd, blkcg, cfqg);
+		cfqg_set_parent(cfqd, cfqg, NULL);
+		return cfqg;
+	}
+
+	/*
+	 * Allocate the CFQ group chain until we meet a group we've already
+	 * allocated, or reach a CGroup whose use_hierarchy is not set.
+	 */
+	leaf_cfqg = NULL;
+	child_cfqg = NULL;
+	for (; cgroup != NULL; cgroup = cgroup->parent) {
+		blkcg = cgroup_to_blkio_cgroup(cgroup);
+		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+		if (cfqg) {
+			if (!cfqg->blkg.dev && bdi->dev &&
+			    dev_name(bdi->dev)) {
+				sscanf(dev_name(bdi->dev), "%u:%u",
+				       &major, &minor);
+				cfqg->blkg.dev = MKDEV(major, minor);
+			}
+
+			/*
+			 * The parent's initialization isn't finished yet;
+			 * finish it here.
+			 */
+			if (child_cfqg) {
+				if (blkcg_get_use_hierarchy(blkcg))
+					cfqg_set_parent(cfqd, child_cfqg,
+							cfqg);
+				else
+					cfqg_set_parent(cfqd, child_cfqg,
+							NULL);
+			}
+
+			/* chain has already been built */
+			break;
+		}
+
+		/*
+		 * We only allocate a cfqg if the corresponding cgroup's
+		 * use_hierarchy is set.
+		 */
+		if (blkcg_get_use_hierarchy(blkcg)) {
+			cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+					    cfqd->queue->node);
+			if (!cfqg)
+				goto clean_up;
+
+			if (!leaf_cfqg)
+				leaf_cfqg = cfqg;
+
+			init_cfqg(cfqd, blkcg, cfqg);
+		} else {
+			cfqg = NULL;
+		}
+
+		if (child_cfqg)
+			cfqg_set_parent(cfqd, child_cfqg, cfqg);
+
+		/*
+		 * This CGroup's use_hierarchy isn't set, which means the CFQ
+		 * group chain is fully built.
+		 */
+		if (!blkcg_get_use_hierarchy(blkcg))
+			break;
+
+		child_cfqg = cfqg;
+	}
+
+	return leaf_cfqg;
+
+clean_up:
+	/* clean up the allocated cfq groups. */
+	while (leaf_cfqg) {
+		tmp_cfqg = leaf_cfqg;
+		leaf_cfqg = cfqg_get_parent(leaf_cfqg);
+		uninit_cfqg(cfqd, tmp_cfqg);
+	}
+
+	return NULL;
+}
+
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct cfq_group *cfqg = NULL;
+	void *key = cfqd;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+
+	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+		cfqg->blkg.dev = MKDEV(major, minor);
+		goto done;
+	}
+	if (cfqg || !create)
+		goto done;
+
+	/*
+	 * Allocate the CFQ group chain up to the root group, or until we
+	 * meet a CGroup with use_hierarchy disabled.
+	 */
+	cfqg = cfqg_chain_alloc(cfqd, cgroup);
 
 done:
 	return cfqg;
@@ -1142,13 +1364,26 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st;
 	int i, j;
+	struct cfq_group *p_cfqg;
 
 	BUG_ON(atomic_read(&cfqg->ref) <= 0);
 	if (!atomic_dec_and_test(&cfqg->ref))
 		return;
+
 	for_each_cfqg_st(cfqg, i, j, st)
 		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
-	kfree(cfqg);
+
+	do {
+		p_cfqg = cfqg_get_parent(cfqg);
+		kfree(cfqg);
+		cfqg = NULL;
+		/*
+		 * Drop the reference taken by the child; if nobody else
+		 * references the parent group, delete the parent as well.
+		 */
+		if (p_cfqg && atomic_dec_and_test(&p_cfqg->ref))
+			cfqg = p_cfqg;
+	} while (cfqg);
 }
 
 static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
@@ -1356,9 +1591,8 @@ insert:
 	cfqe->service_tree = service_tree;
 
 	/* Add cfqq onto service tree. */
+
 	cfq_entity_service_tree_add(service_tree, cfqe);
-	update_min_vdisktime(service_tree);
-	cfqq->reposition_time = jiffies;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
 	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1801,28 +2035,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 	return cfqq_of_entity(cfq_rb_first(service_tree));
 }
 
-static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
+static struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
 {
-	struct cfq_group *cfqg;
-	struct cfq_entity *cfqe;
 	int i, j;
 	struct cfq_rb_root *st;
 
-	if (!cfqd->rq_queued)
-		return NULL;
+	for_each_cfqg_st(cfqg, i, j, st) {
+		if (st->count != 0)
+			return st;
+	}
 
-	cfqg = cfq_get_next_cfqg(cfqd);
-	if (!cfqg)
+	return NULL;
+}
+
+static struct cfq_entity *
+cfq_get_next_entity_forced(struct cfq_data *cfqd)
+{
+	struct cfq_entity *cfqe;
+	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_group *cfqg;
+
+	if (!cfqd->rq_queued)
 		return NULL;
 
-	for_each_cfqg_st(cfqg, i, j, st) {
+	do {
 		cfqe = cfq_rb_first(st);
-		if (cfqe != NULL)
-			return cfqq_of_entity(cfqe);
-	}
+		if (cfqe && !cfqe->is_group_entity)
+			return cfqe;
+		else if (cfqe && cfqe->is_group_entity)
+			cfqg = cfqg_of_entity(cfqe);
+
+		st = choose_service_tree_forced(cfqg);
+	} while (st);
+
 	return NULL;
 }
 
+
 /*
  * Get and set a new active queue for service.
  */
@@ -2178,7 +2427,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 				struct cfq_group *cfqg, enum wl_prio_t prio)
 {
 	struct cfq_entity *cfqe;
-	struct cfq_queue *cfqq;
 	unsigned long lowest_start_time;
 	int i;
 	bool time_valid = false;
@@ -2190,11 +2438,10 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 	 */
 	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
 		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
-		cfqq = cfqq_of_entity(cfqe);
 		if (cfqe && (!time_valid ||
-			     time_before(cfqq->reposition_time,
+			     time_before(cfqe->reposition_time,
 					 lowest_start_time))) {
-			lowest_start_time = cfqq->reposition_time;
+			lowest_start_time = cfqe->reposition_time;
 			cur_best = i;
 			time_valid = true;
 		}
@@ -2203,46 +2450,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 	return cur_best;
 }
 
-static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	unsigned slice;
 	unsigned count;
 	struct cfq_rb_root *st;
 	unsigned group_slice;
-	enum wl_prio_t original_prio = cfqd->serving_prio;
-
-	/* Choose next priority. RT > BE > IDLE */
-	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
-		cfqd->serving_prio = RT_WORKLOAD;
-	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
-		cfqd->serving_prio = BE_WORKLOAD;
-	else {
-		cfqd->serving_prio = IDLE_WORKLOAD;
-		cfqd->workload_expires = jiffies + 1;
-		return;
-	}
-
-	if (original_prio != cfqd->serving_prio)
-		goto new_workload;
-
-	/*
-	 * For RT and BE, we have to choose also the type
-	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
-	 * expiration time
-	 */
-	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
-	count = st->count;
-
-	/*
-	 * check workload expiration, and that we still have other queues ready
-	 */
-	if (count && !time_after(jiffies, cfqd->workload_expires))
-		return;
 
-new_workload:
-	/* otherwise select new workload type */
-	cfqd->serving_type =
-		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
 	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
 	count = st->count;
 
@@ -2281,38 +2495,63 @@ new_workload:
 	slice = max_t(unsigned, slice, CFQ_MIN_TT);
 	cfq_log(cfqd, "workload slice:%d", slice);
 	cfqd->workload_expires = jiffies + slice;
+	/* Restore the previous saved slice. */
+	if (cfqg->saved_workload_slice)
+		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
 }
 
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
+static struct cfq_rb_root *choose_service_tree(struct cfq_data *cfqd,
+					       struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-	struct cfq_group *cfqg;
-	struct cfq_entity *cfqe;
-
-	if (RB_EMPTY_ROOT(&st->rb))
+	if (!cfqg) {
+		cfqd->serving_prio = IDLE_WORKLOAD;
+		cfqd->workload_expires = jiffies + 1;
 		return NULL;
-	cfqe = cfq_rb_first_entity(st);
-	cfqg = cfqg_of_entity(cfqe);
-	BUG_ON(!cfqg);
-	update_min_vdisktime(st);
-	return cfqg;
+	}
+
+	/* Choose next priority. RT > BE > IDLE */
+	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
+		cfqd->serving_prio = RT_WORKLOAD;
+	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
+		cfqd->serving_prio = BE_WORKLOAD;
+	else {
+		cfqd->serving_prio = IDLE_WORKLOAD;
+		cfqd->workload_expires = jiffies + 1;
+		return service_tree_for(cfqg, cfqd->serving_prio,
+					cfqd->serving_type);
+	}
+
+	/* otherwise select new workload type */
+	cfqd->serving_type =
+		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
+
+	return service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
 }
 
-static void cfq_choose_cfqg(struct cfq_data *cfqd)
+static struct cfq_entity *cfq_select_entity(struct cfq_data *cfqd)
 {
-	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
+	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_entity *cfqe;
+	struct cfq_group *cfqg = NULL;
 
-	cfqd->serving_group = cfqg;
+	if (!cfqd->rq_queued)
+		return NULL;
 
-	/* Restore the workload type data */
-	if (cfqg->saved_workload_slice) {
-		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
-		cfqd->serving_type = cfqg->saved_workload;
-		cfqd->serving_prio = cfqg->saved_serving_prio;
-	} else
-		cfqd->workload_expires = jiffies - 1;
+	do {
+		cfqe = cfq_rb_first(st);
+		if (!cfqe->is_group_entity) {
+			set_workload_expire(cfqd, cfqg);
+			cfqd->serving_group = cfqg;
+			return cfqe;
+		} else {
+			cfqg = cfqg_of_entity(cfqe);
+			st = choose_service_tree(cfqd, cfqg);
+			if (!st || RB_EMPTY_ROOT(&st->rb))
+				return NULL;
+		}
+	} while (st);
 
-	choose_service_tree(cfqd, cfqg);
+	return NULL;
 }
 
 /*
@@ -2322,6 +2561,7 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
+	struct cfq_entity *entity;
 
 	cfqq = cfqd->active_queue;
 	if (!cfqq)
@@ -2421,9 +2661,9 @@ new_queue:
 	 * Current queue expired. Check if we have to switch to a new
 	 * service tree
 	 */
-	if (!new_cfqq)
-		cfq_choose_cfqg(cfqd);
-
+	entity = cfq_select_entity(cfqd);
+	BUG_ON(entity->is_group_entity);
+	new_cfqq = cfqq_of_entity(entity);
 	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
 keep_queue:
 	return cfqq;
@@ -2453,10 +2693,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq;
 	int dispatched = 0;
+	struct cfq_entity *cfqe;
 
 	/* Expire the timeslice of the current active queue first */
 	cfq_slice_expired(cfqd, 0);
-	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
+
+	while ((cfqe = cfq_get_next_entity_forced(cfqd)) != NULL) {
+		BUG_ON(cfqe->is_group_entity);
+		cfqq = cfqq_of_entity(cfqe);
 		__cfq_set_active_queue(cfqd, cfqq);
 		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
 	}
@@ -2464,6 +2708,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 	BUG_ON(cfqd->busy_queues);
 
 	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
+
 	return dispatched;
 }
 
@@ -4008,9 +4253,6 @@ static void *cfq_init_queue(struct request_queue *q)
 
 	cfqd->cic_index = i;
 
-	/* Init root service tree */
-	cfqd->grp_service_tree = CFQ_RB_ROOT;
-
 	/* Init root group */
 	cfqg = &cfqd->root_group;
 	for_each_cfqg_st(cfqg, i, j, st)
@@ -4020,6 +4262,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	/* Give preference to root group over other groups */
 	cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
 	cfqg->cfqe.is_group_entity = true;
+	cfqg_set_parent(cfqd, cfqg, NULL);
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	/*
@@ -4072,6 +4315,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->cfq_latency = 1;
 	cfqd->cfq_group_isolation = 0;
 	cfqd->hw_tag = -1;
+
 	/*
 	 * we optimistically start assuming sync ops weren't delayed in last
 	 * second, in order to have larger depth for async operations.
-- 
1.6.5.2
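
As an aside for readers of the hunks above: the ancestor walk that
cfq_group_served() now performs boils down to the pattern below. This is
a minimal sketch with simplified, hypothetical types and names (entity,
charge_chain); the real code also dequeues and re-adds each entity on its
rbtree around the key update, since vdisktime cannot change while the
entity is on a service tree.

	struct entity {
		struct entity *parent;	/* NULL at the top of the chain */
		u64 vdisktime;		/* service tree key */
		unsigned int weight;	/* 100..1000, cf. BLKIO_WEIGHT_* */
	};

	/* roughly what cfq_scale_slice() does: a heavier weight advances
	 * vdisktime more slowly, so the entity is served again sooner */
	static u64 scale_slice(u64 charge, unsigned int weight)
	{
		return charge * 500 / weight;	/* 500 == BLKIO_WEIGHT_DEFAULT */
	}

	/* charge the used slice to an entity and all of its ancestors */
	static void charge_chain(struct entity *e, u64 used)
	{
		for (; e; e = e->parent)
			e->vdisktime += scale_slice(used, e->weight);
	}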

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 6/6 v3] blkio-cgroup: Document for blkio.use_hierarchy interface
       [not found] <4D180ECE.4000305@cn.fujitsu.com>
                   ` (4 preceding siblings ...)
  2010-12-27  8:51 ` [PATCH 5/6 v3] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface Gui Jianfeng
@ 2010-12-27  8:51 ` Gui Jianfeng
  2011-01-18 20:27   ` Vivek Goyal
  5 siblings, 1 reply; 21+ messages in thread
From: Gui Jianfeng @ 2010-12-27  8:51 UTC (permalink / raw)
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

Document for blkio.use_hierarchy interface

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 Documentation/cgroups/blkio-controller.txt |   70 ++++++++++++++++++++--------
 1 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 4ed7b5c..bd01d6c 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -91,30 +91,51 @@ Throttling/Upper Limit policy
 
 Hierarchical Cgroups
 ====================
-- Currently none of the IO control policy supports hierarhical groups. But
-  cgroup interface does allow creation of hierarhical cgroups and internally
-  IO policies treat them as flat hierarchy.
+- Cgroup interface allows creation of hierarchical cgroups. Internally,
+  the IO policies can treat them either as a flat hierarchy or as a real
+  hierarchy; both hierarchical bandwidth division and flat bandwidth
+  division are supported. "blkio.use_hierarchy" can be used to switch
+  between flat mode and hierarchical mode.
 
-  So this patch will allow creation of cgroup hierarhcy but at the backend
-  everything will be treated as flat. So if somebody created a hierarchy like
-  as follows.
+  Consider the following CGroup hierarchy:
 
-			root
-			/  \
-		     test1 test2
-			|
-		     test3
+			  Root
+			/  |   \
+		     Grp1  Grp2 tsk1
+	            /  \
+		 Grp3 tsk2
 
-  CFQ and throttling will practically treat all groups at same level.
+  If blkio.use_hierarchy is disabled in all CGroups, CFQ will practically
+  treat all groups at the same level.
 
-				pivot
-			     /  |   \  \
-			root  test1 test2  test3
+			     Pivot tree
+			    /  |   |   \
+			Root Grp1 Grp2 Grp3
+			 /     |
+			tsk1  tsk2
 
-  Down the line we can implement hierarchical accounting/control support
-  and also introduce a new cgroup file "use_hierarchy" which will control
-  whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
-  This is how memory controller also has implemented the things.
+  If blkio.use_hierarchy is enabled in Grp1 and Grp3, CFQ will schedule the
+  groups and tasks just as they appear in the CGroup hierarchy, as follows.
+
+
+			     Pivot tree
+			    /    |    \
+			  Root  Grp1  Grp2
+			  /     /  \
+		       tsk1   Grp3 tsk2
+
+  Root, Grp1 and Grp2 are treated at the same level under the Pivot tree,
+  and tsk1 stays under Root. Grp3 and tsk2 are treated at the same level
+  under Grp1. Below is the mapping between task io priority and io weight:
+
+	    prio       0    1     2    3    4    5    6     7
+	    weight  1000  868   740  612  484  356  228   100
+
+  Note1: Regardless of the use_hierarchy setting in the Root group, the
+  Root group is always put onto the Pivot tree.
+
+  Note2: Currently, "blkio.use_hierarchy" only affects proportional
+  bandwidth division. The throttling logic still treats everything as flat.
 
 Various user visible config options
 ===================================
@@ -169,6 +190,17 @@ Proportional weight policy files
 	  dev     weight
 	  8:16    300
 
+- blkio.use_hierarchy
+	- Switch between hierarchical mode and flat mode as stated above.
+	  blkio.use_hierarchy == 1 means hierarchical mode is enabled.
+	  blkio.use_hierarchy == 0 means flat mode is enabled.
+	  This interface can only be changed while there is no child CGroup
+	  under this CGroup. Once a CGroup's blkio.use_hierarchy is set, newly
+	  created children inherit it, and it is not allowed to unset it in
+	  the children. The default mode in the Root CGroup is flat.
+	  blkio.use_hierarchy only works for proportional bandwidth division
+	  as of today and has no effect on the throttling logic.
+
 - blkio.time
 	- disk time allocated to cgroup per device in milliseconds. First
 	  two fields specify the major and minor number of the device and
-- 
1.6.5.2
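
As a usage illustration of the interface documented above, assuming the
blkio controller is mounted at /cgroup/blkio (the mount point and group
names here are hypothetical):

	mkdir /cgroup/blkio/Grp1
	# must be done while Grp1 has no children
	echo 1 > /cgroup/blkio/Grp1/blkio.use_hierarchy
	# Grp3 inherits use_hierarchy=1 and may not unset it
	mkdir /cgroup/blkio/Grp1/Grp3
	echo 500 > /cgroup/blkio/Grp1/Grp3/blkio.weight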

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2010-12-27  8:51 ` [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
@ 2010-12-28  2:50   ` Shaohua Li
  2010-12-28  3:59     ` Gui Jianfeng
  2011-01-19 22:58   ` Vivek Goyal
  1 sibling, 1 reply; 21+ messages in thread
From: Shaohua Li @ 2010-12-28  2:50 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Vivek Goyal, Jens Axboe, linux kernel mailing list,
	Corrado Zoccolo, Chad Talbott, Nauman Rafique, Divyesh Shah,
	jmoyer

On Mon, 2010-12-27 at 16:51 +0800, Gui Jianfeng wrote:
> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> group scheduling on the same service tree.
Haven't read the whole series yet. It looks like this changes a lot of
logic even without cgroup support. Did you test whether this changes
performance and latency in some mixed workloads?

> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
>  block/cfq-iosched.c |  211 ++++++++++++++++++++++++++++++++++++++-------------
>  1 files changed, 158 insertions(+), 53 deletions(-)
> 
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index a2553c0..b598798 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -100,10 +100,7 @@ struct cfq_entity {
>         struct cfq_rb_root *service_tree;
>         /* service_tree member */
>         struct rb_node rb_node;
> -       /* service_tree key, represent the position on the tree */
> -       unsigned long rb_key;
> -
> -       /* group service_tree key */
> +       /* service_tree key */
>         u64 vdisktime;
>         bool is_group_entity;
>         unsigned int weight;
> @@ -115,6 +112,8 @@ struct cfq_entity {
>  struct cfq_queue {
>         /* The schedule entity */
>         struct cfq_entity cfqe;
> +       /* Reposition time */
> +       unsigned long reposition_time;
>         /* reference count */
>         atomic_t ref;
>         /* various state flags, see below */
> @@ -313,6 +312,24 @@ struct cfq_data {
>         struct rcu_head rcu;
>  };
> 
> +/*
> + * Map io priority(7 ~ 0) to io weight(100 ~ 1000) as follows
> + *     prio       0    1     2    3    4    5    6     7
> + *     weight  1000  868   740  612  484  356  228   100
> + */
> +static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
> +{
> +       unsigned int step;
> +
> +       BUG_ON(ioprio >= IOPRIO_BE_NR);
> +
> +       step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
> +       if (ioprio == 0)
> +               return BLKIO_WEIGHT_MAX;
> +
> +       return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
> +}
> +
>  static inline struct cfq_queue *
>  cfqq_of_entity(struct cfq_entity *cfqe)
>  {
> @@ -841,16 +858,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>         return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
>  }
> 
> -static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
> -                                     struct cfq_queue *cfqq)
> -{
> -       /*
> -        * just an approximation, should be ok.
> -        */
> -       return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
> -                      cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
> -}
> -
>  static inline s64
>  entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
>  {
> @@ -1199,6 +1206,16 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
> 
>  #endif /* GROUP_IOSCHED */
> 
> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
> +                                struct cfq_entity *cfqe)
> +{
> +       u64 d = cfqd->cfq_slice[1] << CFQ_SERVICE_SHIFT;
> +
> +       d = d * BLKIO_WEIGHT_DEFAULT;
> +       do_div(d, BLKIO_WEIGHT_MAX - cfqe->weight + BLKIO_WEIGHT_MIN);
> +       return d;
> +}
Don't you consider sync/async here? For groups we don't have such an
issue, but otherwise we need to boost sync queues.
> +
>  /*
>   * The cfqd->service_trees holds all pending cfq_queue's that have
>   * requests waiting to be processed. It is sorted in the order that
> @@ -1210,13 +1227,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>         struct cfq_entity *cfqe;
>         struct rb_node **p, *parent;
>         struct cfq_entity *__cfqe;
> -       unsigned long rb_key;
> -       struct cfq_rb_root *service_tree;
> +       struct cfq_rb_root *service_tree, *orig_st;
>         int left;
>         int new_cfqq = 1;
>         int group_changed = 0;
> +       s64 key;
> 
>         cfqe = &cfqq->cfqe;
> +       orig_st = cfqe->service_tree;
> 
>  #ifdef CONFIG_CFQ_GROUP_IOSCHED
>         if (!cfqd->cfq_group_isolation
> @@ -1224,8 +1242,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>             && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
>                 /* Move this cfq to root group */
>                 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
> -               if (!RB_EMPTY_NODE(&cfqe->rb_node))
> +               if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> +                       /*
> +                        * Group changed, dequeue this CFQ queue from the
> +                        * original service tree.
> +                        */
> +                       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> +                       orig_st->total_weight -= cfqe->weight;
> +               }
>                 cfqq->orig_cfqg = cfqq->cfqg;
>                 cfqq->cfqg = &cfqd->root_group;
>                 atomic_inc(&cfqd->root_group.ref);
> @@ -1234,8 +1259,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>                    && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
>                 /* cfqq is sequential now needs to go to its original group */
>                 BUG_ON(cfqq->cfqg != &cfqd->root_group);
> -               if (!RB_EMPTY_NODE(&cfqe->rb_node))
> +               if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> +                       /*
> +                        * Group changed, dequeue this CFQ queue from the
> +                        * original service tree.
> +                        */
> +                       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> +                       orig_st->total_weight -= cfqe->weight;
> +               }
>                 cfq_put_cfqg(cfqq->cfqg);
>                 cfqq->cfqg = cfqq->orig_cfqg;
>                 cfqq->orig_cfqg = NULL;
> @@ -1246,47 +1278,71 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> 
>         service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
>                                                 cfqq_type(cfqq));
> +       /*
> +        * For the time being, put the newly added CFQ queue at the end of the
> +        * service tree.
> +        */
So this should be fixed later? Please mark it as a TODO.
> +       if (RB_EMPTY_NODE(&cfqe->rb_node)) {
> +               /*
> +                * If this CFQ queue moves to another group, the original
> +                * vdisktime makes no sense any more, reset the vdisktime
> +                * here.
> +                */
> +               parent = rb_last(&service_tree->rb);
> +               if (parent) {
> +                       u64 boost;
> +                       s64 __vdisktime;
> +
> +                       __cfqe = rb_entry_entity(parent);
> +                       cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> +
> +                       /* Give some vdisktime boost according to its weight */
> +                       boost = cfq_get_boost(cfqd, cfqe);
> +                       __vdisktime = cfqe->vdisktime - boost;
> +                       if (__vdisktime > service_tree->min_vdisktime)
> +                               cfqe->vdisktime = __vdisktime;
> +                       else
> +                               cfqe->vdisktime = service_tree->min_vdisktime;
> +               } else
> +                       cfqe->vdisktime = service_tree->min_vdisktime;
> +
> +               goto insert;
> +       }
> +       /*
> +        * Ok, we get here, this CFQ queue is on the service tree, dequeue it
> +        * firstly.
> +        */
> +       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> +       orig_st->total_weight -= cfqe->weight;
> +
> +       new_cfqq = 0;
> +
>         if (cfq_class_idle(cfqq)) {
> -               rb_key = CFQ_IDLE_DELAY;
>                 parent = rb_last(&service_tree->rb);
>                 if (parent && parent != &cfqe->rb_node) {
>                         __cfqe = rb_entry(parent, struct cfq_entity, rb_node);
> -                       rb_key += __cfqe->rb_key;
> +                       cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>                 } else
> -                       rb_key += jiffies;
> +                       cfqe->vdisktime = service_tree->min_vdisktime;
>         } else if (!add_front) {
>                 /*
> -                * Get our rb key offset. Subtract any residual slice
> -                * value carried from last service. A negative resid
> -                * count indicates slice overrun, and this should position
> -                * the next service time further away in the tree.
> -                */
> -               rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
> -               rb_key -= cfqq->slice_resid;
> -               cfqq->slice_resid = 0;
> -       } else {
> -               rb_key = -HZ;
> -               __cfqe = cfq_rb_first(service_tree);
> -               rb_key += __cfqe ? __cfqe->rb_key : jiffies;
> -       }
> -
> -       if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> -               new_cfqq = 0;
> -               /*
> -                * same position, nothing more to do
> +                * We charge the CFQ queue by the time this queue runs, and
> +                * reposition it on the service tree.
>                  */
> -               if (rb_key == cfqe->rb_key &&
> -                   cfqe->service_tree == service_tree)
> -                       return;
> +               unsigned int used_sl;
> 
> -               cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> -               cfqe->service_tree = NULL;
> +               used_sl = cfq_cfqq_slice_usage(cfqq);
> +               cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
cfqq->slice_resid could be > 0 or < 0, and the queue can be boosted or
penalized accordingly. It looks like that logic is completely removed.
The same is true in the current cfq group code; I wonder why?

> +       } else {
> +               cfqe->vdisktime = service_tree->min_vdisktime;
>         }
> 
> +insert:
>         left = 1;
>         parent = NULL;
>         cfqe->service_tree = service_tree;
>         p = &service_tree->rb.rb_node;
> +       key = entity_key(service_tree, cfqe);
>         while (*p) {
>                 struct rb_node **n;
> 
> @@ -1296,7 +1352,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>                 /*
>                  * sort by key, that represents service time.
>                  */
> -               if (time_before(rb_key, __cfqe->rb_key))
> +               if (key < entity_key(service_tree, __cfqe))
>                         n = &(*p)->rb_left;
>                 else {
>                         n = &(*p)->rb_right;
> @@ -1309,10 +1365,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>         if (left)
>                 service_tree->left = &cfqe->rb_node;
> 
> -       cfqe->rb_key = rb_key;
>         rb_link_node(&cfqe->rb_node, parent, p);
>         rb_insert_color(&cfqe->rb_node, &service_tree->rb);
> +       update_min_vdisktime(service_tree);
>         service_tree->count++;
> +       service_tree->total_weight += cfqe->weight;
> +       cfqq->reposition_time = jiffies;
>         if ((add_front || !new_cfqq) && !group_changed)
>                 return;
>         cfq_group_service_tree_add(cfqd, cfqq->cfqg);
> @@ -1414,14 +1472,18 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>  static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>  {
>         struct cfq_entity *cfqe;
> +       struct cfq_rb_root *service_tree;
> +
>         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
>         BUG_ON(!cfq_cfqq_on_rr(cfqq));
>         cfq_clear_cfqq_on_rr(cfqq);
> 
>         cfqe = &cfqq->cfqe;
> +       service_tree = cfqe->service_tree;
> 
>         if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>                 cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> +               service_tree->total_weight -= cfqe->weight;
>                 cfqe->service_tree = NULL;
>         }
>         if (cfqq->p_root) {
> @@ -2120,23 +2182,35 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
>         }
>  }
> 
> +/*
> + * The time when a CFQ queue is put onto a service tree is recorded in
> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
> + * on each service tree, and select the workload type that contains the lowest
> + * reposition_time CFQ queue among them.
> + */
>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>                                 struct cfq_group *cfqg, enum wl_prio_t prio)
>  {
>         struct cfq_entity *cfqe;
> +       struct cfq_queue *cfqq;
> +       unsigned long lowest_start_time;
>         int i;
> -       bool key_valid = false;
> -       unsigned long lowest_key = 0;
> +       bool time_valid = false;
>         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
> 
> +       /*
> +        * TODO: We may take io priority into account when choosing a workload
> +        * type. But for the time being just make use of reposition_time only.
> +        */
Not just priority; the original rb_key considers sync/async too.
>         for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> -               /* select the one with lowest rb_key */
>                 cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> -               if (cfqe &&
> -                   (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
> -                       lowest_key = cfqe->rb_key;
> +               cfqq = cfqq_of_entity(cfqe);
> +               if (cfqe && (!time_valid ||
> +                            time_before(cfqq->reposition_time,
> +                                        lowest_start_time))) {
> +                       lowest_start_time = cfqq->reposition_time;
>                         cur_best = i;
> -                       key_valid = true;
> +                       time_valid = true;
>                 }
>         }

Thanks,
Shaohua
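
As a concrete check of the cfq_get_boost() quoted above, assuming the
usual blkio constants (BLKIO_WEIGHT_MIN = 100, BLKIO_WEIGHT_MAX = 1000,
BLKIO_WEIGHT_DEFAULT = 500) and writing base for
cfqd->cfq_slice[1] << CFQ_SERVICE_SHIFT: a weight-1000 queue (prio 0)
gets a boost of base * 500 / (1000 - 1000 + 100) = 5 * base, while a
weight-100 queue (prio 7) gets base * 500 / (1000 - 100 + 100) = base / 2,
i.e. a 10:1 spread in how far a newly added queue jumps ahead of
min_vdisktime.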


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2010-12-28  2:50   ` Shaohua Li
@ 2010-12-28  3:59     ` Gui Jianfeng
  2010-12-28  6:03       ` Shaohua Li
  0 siblings, 1 reply; 21+ messages in thread
From: Gui Jianfeng @ 2010-12-28  3:59 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Vivek Goyal, Jens Axboe, linux kernel mailing list,
	Corrado Zoccolo, Chad Talbott, Nauman Rafique, Divyesh Shah,
	jmoyer

Shaohua Li wrote:
> On Mon, 2010-12-27 at 16:51 +0800, Gui Jianfeng wrote:
>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
>> group scheduling on the same service tree.
> Haven't read the whole series yet. It looks like this changes a lot of
> logic even without cgroup support. Did you test whether this changes
> performance and latency in some mixed workloads?

Hi Shaohua,

I did some tests with different workloads, with idling both on and off.
I don't see a performance drop on my box.

> 
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>>  block/cfq-iosched.c |  211 ++++++++++++++++++++++++++++++++++++++-------------
>>  1 files changed, 158 insertions(+), 53 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index a2553c0..b598798 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -100,10 +100,7 @@ struct cfq_entity {
>>         struct cfq_rb_root *service_tree;
>>         /* service_tree member */
>>         struct rb_node rb_node;
>> -       /* service_tree key, represent the position on the tree */
>> -       unsigned long rb_key;
>> -
>> -       /* group service_tree key */
>> +       /* service_tree key */
>>         u64 vdisktime;
>>         bool is_group_entity;
>>         unsigned int weight;
>> @@ -115,6 +112,8 @@ struct cfq_entity {
>>  struct cfq_queue {
>>         /* The schedule entity */
>>         struct cfq_entity cfqe;
>> +       /* Reposition time */
>> +       unsigned long reposition_time;
>>         /* reference count */
>>         atomic_t ref;
>>         /* various state flags, see below */
>> @@ -313,6 +312,24 @@ struct cfq_data {
>>         struct rcu_head rcu;
>>  };
>>
>> +/*
>> + * Map io priority(7 ~ 0) to io weight(100 ~ 1000) as follows
>> + *     prio       0    1     2    3    4    5    6     7
>> + *     weight  1000  868   740  612  484  356  228   100
>> + */
>> +static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
>> +{
>> +       unsigned int step;
>> +
>> +       BUG_ON(ioprio >= IOPRIO_BE_NR);
>> +
>> +       step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
>> +       if (ioprio == 0)
>> +               return BLKIO_WEIGHT_MAX;
>> +
>> +       return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
>> +}
>> +
>>  static inline struct cfq_queue *
>>  cfqq_of_entity(struct cfq_entity *cfqe)
>>  {
>> @@ -841,16 +858,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>         return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
>>  }
>>
>> -static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
>> -                                     struct cfq_queue *cfqq)
>> -{
>> -       /*
>> -        * just an approximation, should be ok.
>> -        */
>> -       return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
>> -                      cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
>> -}
>> -
>>  static inline s64
>>  entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
>>  {
>> @@ -1199,6 +1206,16 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
>>
>>  #endif /* GROUP_IOSCHED */
>>
>> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
>> +                                struct cfq_entity *cfqe)
>> +{
>> +       u64 d = cfqd->cfq_slice[1] << CFQ_SERVICE_SHIFT;
>> +
>> +       d = d * BLKIO_WEIGHT_DEFAULT;
>> +       do_div(d, BLKIO_WEIGHT_MAX - cfqe->weight + BLKIO_WEIGHT_MIN);
>> +       return d;
>> +}
> Don't you consider sync/async here? For groups we don't have such an
> issue, but otherwise we need to boost sync queues.

Ok, we should.

>> +
>>  /*
>>   * The cfqd->service_trees holds all pending cfq_queue's that have
>>   * requests waiting to be processed. It is sorted in the order that
>> @@ -1210,13 +1227,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>         struct cfq_entity *cfqe;
>>         struct rb_node **p, *parent;
>>         struct cfq_entity *__cfqe;
>> -       unsigned long rb_key;
>> -       struct cfq_rb_root *service_tree;
>> +       struct cfq_rb_root *service_tree, *orig_st;
>>         int left;
>>         int new_cfqq = 1;
>>         int group_changed = 0;
>> +       s64 key;
>>
>>         cfqe = &cfqq->cfqe;
>> +       orig_st = cfqe->service_tree;
>>
>>  #ifdef CONFIG_CFQ_GROUP_IOSCHED
>>         if (!cfqd->cfq_group_isolation
>> @@ -1224,8 +1242,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>             && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
>>                 /* Move this cfq to root group */
>>                 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
>> -               if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> +               if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>>                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
>> +                       /*
>> +                        * Group changed, dequeue this CFQ queue from the
>> +                        * original service tree.
>> +                        */
>> +                       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>> +                       orig_st->total_weight -= cfqe->weight;
>> +               }
>>                 cfqq->orig_cfqg = cfqq->cfqg;
>>                 cfqq->cfqg = &cfqd->root_group;
>>                 atomic_inc(&cfqd->root_group.ref);
>> @@ -1234,8 +1259,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>                    && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
>>                 /* cfqq is sequential now needs to go to its original group */
>>                 BUG_ON(cfqq->cfqg != &cfqd->root_group);
>> -               if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> +               if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>>                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
>> +                       /*
>> +                        * Group changed, dequeue this CFQ queue from the
>> +                        * original service tree.
>> +                        */
>> +                       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>> +                       orig_st->total_weight -= cfqe->weight;
>> +               }
>>                 cfq_put_cfqg(cfqq->cfqg);
>>                 cfqq->cfqg = cfqq->orig_cfqg;
>>                 cfqq->orig_cfqg = NULL;
>> @@ -1246,47 +1278,71 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>
>>         service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
>>                                                 cfqq_type(cfqq));
>> +       /*
>> +        * For the time being, put the newly added CFQ queue at the end of the
>> +        * service tree.
>> +        */
> So this should be fixed later? Please mark it as a TODO.

Ahh, I forgot to remove this annotation; I already boost vdisktime according to its ioprio.

>> +       if (RB_EMPTY_NODE(&cfqe->rb_node)) {
>> +               /*
>> +                * If this CFQ queue moves to another group, the original
>> +                * vdisktime makes no sense any more, reset the vdisktime
>> +                * here.
>> +                */
>> +               parent = rb_last(&service_tree->rb);
>> +               if (parent) {
>> +                       u64 boost;
>> +                       s64 __vdisktime;
>> +
>> +                       __cfqe = rb_entry_entity(parent);
>> +                       cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> +
>> +                       /* Give some vdisktime boost according to its weight */
>> +                       boost = cfq_get_boost(cfqd, cfqe);
>> +                       __vdisktime = cfqe->vdisktime - boost;
>> +                       if (__vdisktime > service_tree->min_vdisktime)
>> +                               cfqe->vdisktime = __vdisktime;
>> +                       else
>> +                               cfqe->vdisktime = service_tree->min_vdisktime;
>> +               } else
>> +                       cfqe->vdisktime = service_tree->min_vdisktime;
>> +
>> +               goto insert;
>> +       }
>> +       /*
>> +        * Ok, we get here, this CFQ queue is on the service tree, dequeue it
>> +        * firstly.
>> +        */
>> +       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>> +       orig_st->total_weight -= cfqe->weight;
>> +
>> +       new_cfqq = 0;
>> +
>>         if (cfq_class_idle(cfqq)) {
>> -               rb_key = CFQ_IDLE_DELAY;
>>                 parent = rb_last(&service_tree->rb);
>>                 if (parent && parent != &cfqe->rb_node) {
>>                         __cfqe = rb_entry(parent, struct cfq_entity, rb_node);
>> -                       rb_key += __cfqe->rb_key;
>> +                       cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>>                 } else
>> -                       rb_key += jiffies;
>> +                       cfqe->vdisktime = service_tree->min_vdisktime;
>>         } else if (!add_front) {
>>                 /*
>> -                * Get our rb key offset. Subtract any residual slice
>> -                * value carried from last service. A negative resid
>> -                * count indicates slice overrun, and this should position
>> -                * the next service time further away in the tree.
>> -                */
>> -               rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
>> -               rb_key -= cfqq->slice_resid;
>> -               cfqq->slice_resid = 0;
>> -       } else {
>> -               rb_key = -HZ;
>> -               __cfqe = cfq_rb_first(service_tree);
>> -               rb_key += __cfqe ? __cfqe->rb_key : jiffies;
>> -       }
>> -
>> -       if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>> -               new_cfqq = 0;
>> -               /*
>> -                * same position, nothing more to do
>> +                * We charge the CFQ queue by the time this queue runs, and
>> +                * reposition it on the service tree.
>>                  */
>> -               if (rb_key == cfqe->rb_key &&
>> -                   cfqe->service_tree == service_tree)
>> -                       return;
>> +               unsigned int used_sl;
>>
>> -               cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>> -               cfqe->service_tree = NULL;
>> +               used_sl = cfq_cfqq_slice_usage(cfqq);
>> +               cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
> cfqq->slice_resid could be > 0 or < 0, and the queue can be boosted or
> penalized accordingly. It looks like that logic is completely removed.
> The same is true in the current cfq group code; I wonder why?

With vdisktime introduced, vdisktime represents the cfqq's position on a
service tree, and it is determined by the actual running time of the cfqq.
So we don't need slice_resid any more here.
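
In other words, paraphrasing the hunk quoted above (no new code, just the
two lines with their effect spelled out):

	used_sl = cfq_cfqq_slice_usage(cfqq);	/* time the queue actually ran */
	cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
	/* overrun   => large used_sl => vdisktime jumps ahead => served later */
	/* preempted => small used_sl => vdisktime barely moves => served sooner */

which provides the same boost/penalty that slice_resid used to encode.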

> 
>> +       } else {
>> +               cfqe->vdisktime = service_tree->min_vdisktime;
>>         }
>>
>> +insert:
>>         left = 1;
>>         parent = NULL;
>>         cfqe->service_tree = service_tree;
>>         p = &service_tree->rb.rb_node;
>> +       key = entity_key(service_tree, cfqe);
>>         while (*p) {
>>                 struct rb_node **n;
>>
>> @@ -1296,7 +1352,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>                 /*
>>                  * sort by key, that represents service time.
>>                  */
>> -               if (time_before(rb_key, __cfqe->rb_key))
>> +               if (key < entity_key(service_tree, __cfqe))
>>                         n = &(*p)->rb_left;
>>                 else {
>>                         n = &(*p)->rb_right;
>> @@ -1309,10 +1365,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>         if (left)
>>                 service_tree->left = &cfqe->rb_node;
>>
>> -       cfqe->rb_key = rb_key;
>>         rb_link_node(&cfqe->rb_node, parent, p);
>>         rb_insert_color(&cfqe->rb_node, &service_tree->rb);
>> +       update_min_vdisktime(service_tree);
>>         service_tree->count++;
>> +       service_tree->total_weight += cfqe->weight;
>> +       cfqq->reposition_time = jiffies;
>>         if ((add_front || !new_cfqq) && !group_changed)
>>                 return;
>>         cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>> @@ -1414,14 +1472,18 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>>  static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>>  {
>>         struct cfq_entity *cfqe;
>> +       struct cfq_rb_root *service_tree;
>> +
>>         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
>>         BUG_ON(!cfq_cfqq_on_rr(cfqq));
>>         cfq_clear_cfqq_on_rr(cfqq);
>>
>>         cfqe = &cfqq->cfqe;
>> +       service_tree = cfqe->service_tree;
>>
>>         if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>>                 cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>> +               service_tree->total_weight -= cfqe->weight;
>>                 cfqe->service_tree = NULL;
>>         }
>>         if (cfqq->p_root) {
>> @@ -2120,23 +2182,35 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
>>         }
>>  }
>>
>> +/*
>> + * The time when a CFQ queue is put onto a service tree is recorded in
>> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
>> + * on each service tree, and select the workload type that contains the lowest
>> + * reposition_time CFQ queue among them.
>> + */
>>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>                                 struct cfq_group *cfqg, enum wl_prio_t prio)
>>  {
>>         struct cfq_entity *cfqe;
>> +       struct cfq_queue *cfqq;
>> +       unsigned long lowest_start_time;
>>         int i;
>> -       bool key_valid = false;
>> -       unsigned long lowest_key = 0;
>> +       bool time_valid = false;
>>         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>>
>> +       /*
>> +        * TODO: We may take io priority into account when choosing a workload
>> +        * type. But for the time being just make use of reposition_time only.
>> +        */
> Not just priority; the original rb_key considers sync/async too.

IIUC, in the original CFQ, rb_key is comparable across trees, so we need to
take sync/async into account. But with vdisktime introduced, each service
tree has its own vdisktime space, so there's no need to consider sync/async
here.
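
For reference, the per-tree key helper the patch adds; the body below is
reconstructed from context (only its signature appears in the quoted
hunks), so treat it as an assumption:

	static inline s64
	entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
	{
		return entity->vdisktime - st->min_vdisktime;
	}

Because the key is an offset from the tree's own min_vdisktime, keys from
different service trees are never compared with each other.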

>>         for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>> -               /* select the one with lowest rb_key */
>>                 cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> -               if (cfqe &&
>> -                   (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
>> -                       lowest_key = cfqe->rb_key;
>> +               cfqq = cfqq_of_entity(cfqe);
>> +               if (cfqe && (!time_valid ||
>> +                            time_before(cfqq->reposition_time,
>> +                                        lowest_start_time))) {
>> +                       lowest_start_time = cfqq->reposition_time;
>>                         cur_best = i;
>> -                       key_valid = true;
>> +                       time_valid = true;
>>                 }
>>         }
> 
> Thanks,
> Shaohua
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

-- 
Regards
Gui Jianfeng

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2010-12-28  3:59     ` Gui Jianfeng
@ 2010-12-28  6:03       ` Shaohua Li
  2010-12-28  6:59         ` Gui Jianfeng
  0 siblings, 1 reply; 21+ messages in thread
From: Shaohua Li @ 2010-12-28  6:03 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Vivek Goyal, Jens Axboe, linux kernel mailing list,
	Corrado Zoccolo, Chad Talbott, Nauman Rafique, Divyesh Shah,
	jmoyer

On Tue, 2010-12-28 at 11:59 +0800, Gui Jianfeng wrote:
> Shaohua Li wrote:
> > On Mon, 2010-12-27 at 16:51 +0800, Gui Jianfeng wrote:
> >> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> >> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> >> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> >> group scheduling on the same service tree.
> > Haven't read the whole series yet. It looks like this changes a lot of
> > logic even without cgroup support. Did you test whether this changes
> > performance and latency in some mixed workloads?
> 
> Hi Shaohua,
> 
> I did some tests with different workloads, with idling both on and off.
> I don't see a performance drop on my box.
Please check latency too.

> >> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> >> ---
> >>  block/cfq-iosched.c |  211 ++++++++++++++++++++++++++++++++++++++-------------
> >>  1 files changed, 158 insertions(+), 53 deletions(-)
> >>
> >> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> >> index a2553c0..b598798 100644
> >> --- a/block/cfq-iosched.c
> >> +++ b/block/cfq-iosched.c
> >> @@ -100,10 +100,7 @@ struct cfq_entity {
> >>         struct cfq_rb_root *service_tree;
> >>         /* service_tree member */
> >>         struct rb_node rb_node;
> >> -       /* service_tree key, represent the position on the tree */
> >> -       unsigned long rb_key;
> >> -
> >> -       /* group service_tree key */
> >> +       /* service_tree key */
> >>         u64 vdisktime;
> >>         bool is_group_entity;
> >>         unsigned int weight;
> >> @@ -115,6 +112,8 @@ struct cfq_entity {
> >>  struct cfq_queue {
> >>         /* The schedule entity */
> >>         struct cfq_entity cfqe;
> >> +       /* Reposition time */
> >> +       unsigned long reposition_time;
> >>         /* reference count */
> >>         atomic_t ref;
> >>         /* various state flags, see below */
> >> @@ -313,6 +312,24 @@ struct cfq_data {
> >>         struct rcu_head rcu;
> >>  };
> >>
> >> +/*
> >> + * Map io priority(7 ~ 0) to io weight(100 ~ 1000) as follows
> >> + *     prio       0    1     2    3    4    5    6     7
> >> + *     weight  1000  868   740  612  484  356  228   100
> >> + */
> >> +static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
> >> +{
> >> +       unsigned int step;
> >> +
> >> +       BUG_ON(ioprio >= IOPRIO_BE_NR);
> >> +
> >> +       step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
> >> +       if (ioprio == 0)
> >> +               return BLKIO_WEIGHT_MAX;
> >> +
> >> +       return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
> >> +}
> >> +
> >>  static inline struct cfq_queue *
> >>  cfqq_of_entity(struct cfq_entity *cfqe)
> >>  {
> >> @@ -841,16 +858,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> >>         return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
> >>  }
> >>
> >> -static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
> >> -                                     struct cfq_queue *cfqq)
> >> -{
> >> -       /*
> >> -        * just an approximation, should be ok.
> >> -        */
> >> -       return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
> >> -                      cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
> >> -}
> >> -
> >>  static inline s64
> >>  entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
> >>  {
> >> @@ -1199,6 +1206,16 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
> >>
> >>  #endif /* GROUP_IOSCHED */
> >>
> >> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
> >> +                                struct cfq_entity *cfqe)
> >> +{
> >> +       u64 d = cfqd->cfq_slice[1] << CFQ_SERVICE_SHIFT;
> >> +
> >> +       d = d * BLKIO_WEIGHT_DEFAULT;
> >> +       do_div(d, BLKIO_WEIGHT_MAX - cfqe->weight + BLKIO_WEIGHT_MIN);
> >> +       return d;
> >> +}
> > Don't we need to consider sync/async here? For groups we don't have such
> > an issue, but otherwise we need to boost sync.
> 
> Ok, we should.
> 
> >> +
> >>  /*
> >>   * The cfqd->service_trees holds all pending cfq_queue's that have
> >>   * requests waiting to be processed. It is sorted in the order that
> >> @@ -1210,13 +1227,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> >>         struct cfq_entity *cfqe;
> >>         struct rb_node **p, *parent;
> >>         struct cfq_entity *__cfqe;
> >> -       unsigned long rb_key;
> >> -       struct cfq_rb_root *service_tree;
> >> +       struct cfq_rb_root *service_tree, *orig_st;
> >>         int left;
> >>         int new_cfqq = 1;
> >>         int group_changed = 0;
> >> +       s64 key;
> >>
> >>         cfqe = &cfqq->cfqe;
> >> +       orig_st = cfqe->service_tree;
> >>
> >>  #ifdef CONFIG_CFQ_GROUP_IOSCHED
> >>         if (!cfqd->cfq_group_isolation
> >> @@ -1224,8 +1242,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> >>             && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
> >>                 /* Move this cfq to root group */
> >>                 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
> >> -               if (!RB_EMPTY_NODE(&cfqe->rb_node))
> >> +               if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> >>                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> >> +                       /*
> >> +                        * Group changed, dequeue this CFQ queue from the
> >> +                        * original service tree.
> >> +                        */
> >> +                       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> >> +                       orig_st->total_weight -= cfqe->weight;
> >> +               }
> >>                 cfqq->orig_cfqg = cfqq->cfqg;
> >>                 cfqq->cfqg = &cfqd->root_group;
> >>                 atomic_inc(&cfqd->root_group.ref);
> >> @@ -1234,8 +1259,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> >>                    && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
> >>                 /* cfqq is sequential now needs to go to its original group */
> >>                 BUG_ON(cfqq->cfqg != &cfqd->root_group);
> >> -               if (!RB_EMPTY_NODE(&cfqe->rb_node))
> >> +               if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> >>                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> >> +                       /*
> >> +                        * Group changed, dequeue this CFQ queue from the
> >> +                        * original service tree.
> >> +                        */
> >> +                       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> >> +                       orig_st->total_weight -= cfqe->weight;
> >> +               }
> >>                 cfq_put_cfqg(cfqq->cfqg);
> >>                 cfqq->cfqg = cfqq->orig_cfqg;
> >>                 cfqq->orig_cfqg = NULL;
> >> @@ -1246,47 +1278,71 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> >>
> >>         service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
> >>                                                 cfqq_type(cfqq));
> >> +       /*
> >> +        * For the time being, put the newly added CFQ queue at the end of the
> >> +        * service tree.
> >> +        */
> > so this should be fixed later? please mark it as a todo.
> 
> Ah, I forgot to remove this comment; the vdisktime is already boosted according to its ioprio.
ok, in the code below it looks like cfq_get_boost() returns a value that is
always greater than CFQ_IDLE_DELAY, so the new queue is inserted before
__cfqe. Is this intended?
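For scale, a quick userspace check of the magnitudes involved; a sketch,
assuming HZ=1000 and the in-tree defaults CFQ_SERVICE_SHIFT=12,
CFQ_IDLE_DELAY=HZ/5, cfq_slice[1]=HZ/10 and BLKIO_WEIGHT_DEFAULT=500:

/* Rough magnitude check for cfq_get_boost() vs CFQ_IDLE_DELAY, for a
 * default-weight (500) queue, under the assumptions named above. */
#include <stdio.h>

int main(void)
{
	unsigned long long d = 100ULL << 12; /* cfq_slice[1] << CFQ_SERVICE_SHIFT */

	d *= 500;		/* * BLKIO_WEIGHT_DEFAULT */
	d /= 1000 - 500 + 100;	/* / (WEIGHT_MAX - weight + WEIGHT_MIN) */

	/* prints: boost = 341333, CFQ_IDLE_DELAY = 200 */
	printf("boost = %llu, CFQ_IDLE_DELAY = %d\n", d, 1000 / 5);
	return 0;
}

With those defaults the boost (~341333) dwarfs CFQ_IDLE_DELAY (200), which
matches the reading above.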
> >> +       if (RB_EMPTY_NODE(&cfqe->rb_node)) {
> >> +               /*
> >> +                * If this CFQ queue moves to another group, the original
> >> +                * vdisktime makes no sense any more, reset the vdisktime
> >> +                * here.
> >> +                */
> >> +               parent = rb_last(&service_tree->rb);
> >> +               if (parent) {
> >> +                       u64 boost;
> >> +                       s64 __vdisktime;
> >> +
> >> +                       __cfqe = rb_entry_entity(parent);
> >> +                       cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> >> +
> >> +                       /* Give some vdisktime boost according to its weight */
> >> +                       boost = cfq_get_boost(cfqd, cfqe);
> >> +                       __vdisktime = cfqe->vdisktime - boost;
> >> +                       if (__vdisktime > service_tree->min_vdisktime)
> >> +                               cfqe->vdisktime = __vdisktime;
> >> +                       else
> >> +                               cfqe->vdisktime = service_tree->min_vdisktime;
> >> +               } else
> >> +                       cfqe->vdisktime = service_tree->min_vdisktime;
> >> +
> >> +               goto insert;
> >> +       }
> >> +       /*
> >> +        * If we get here, this CFQ queue is already on the service tree;
> >> +        * dequeue it first.
> >> +        */
> >> +       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> >> +       orig_st->total_weight -= cfqe->weight;
> >> +
> >> +       new_cfqq = 0;
> >> +
> >>         if (cfq_class_idle(cfqq)) {
> >> -               rb_key = CFQ_IDLE_DELAY;
> >>                 parent = rb_last(&service_tree->rb);
> >>                 if (parent && parent != &cfqe->rb_node) {
> >>                         __cfqe = rb_entry(parent, struct cfq_entity, rb_node);
> >> -                       rb_key += __cfqe->rb_key;
> >> +                       cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> >>                 } else
> >> -                       rb_key += jiffies;
> >> +                       cfqe->vdisktime = service_tree->min_vdisktime;
> >>         } else if (!add_front) {
> >>                 /*
> >> -                * Get our rb key offset. Subtract any residual slice
> >> -                * value carried from last service. A negative resid
> >> -                * count indicates slice overrun, and this should position
> >> -                * the next service time further away in the tree.
> >> -                */
> >> -               rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
> >> -               rb_key -= cfqq->slice_resid;
> >> -               cfqq->slice_resid = 0;
> >> -       } else {
> >> -               rb_key = -HZ;
> >> -               __cfqe = cfq_rb_first(service_tree);
> >> -               rb_key += __cfqe ? __cfqe->rb_key : jiffies;
> >> -       }
> >> -
> >> -       if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> >> -               new_cfqq = 0;
> >> -               /*
> >> -                * same position, nothing more to do
> >> +                * We charge the CFQ queue by the time this queue runs, and
> >> +                * reposition it on the service tree.
> >>                  */
> >> -               if (rb_key == cfqe->rb_key &&
> >> -                   cfqe->service_tree == service_tree)
> >> -                       return;
> >> +               unsigned int used_sl;
> >>
> >> -               cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> >> -               cfqe->service_tree = NULL;
> >> +               used_sl = cfq_cfqq_slice_usage(cfqq);
> >> +               cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
> > cfqq->slice_resid could be > 0 or < 0, and the queue can be boosted or
> > penalized. It looks like this logic is completely removed. The same exists
> > in the current cfq group code too, I wonder why?
> 
> With vdisktime introduced, vdisktime represents the cfqq's position on a
> service tree, and it is decided by the actual running time of the cfqq. So
> we don't need slice_resid any more here.
I mean the logic. cfq_cfqq_slice_usage() does:
		slice_used = jiffies - cfqq->slice_start;
		if (slice_used > cfqq->allocated_slice)
			slice_used = cfqq->allocated_slice;
so the queue doesn't get a penalty if it uses more slice than allocated;
looks like a bug here.
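For illustration, a userspace model of the two charging policies; "capped"
mirrors the cfq_cfqq_slice_usage() behaviour quoted above, while "uncapped"
charges the overrun as well (a possible fix, not part of this patch):

/* Models the charging difference Shaohua points out: with the cap, a
 * queue that overruns its allocated slice is charged only the
 * allocation, so the overrun is never penalized. */
#include <stdio.h>

static unsigned int charge(unsigned int slice_used,
			   unsigned int allocated_slice, int capped)
{
	if (capped && slice_used > allocated_slice)
		return allocated_slice;	/* overrun forgiven */
	return slice_used;		/* overrun charged */
}

int main(void)
{
	unsigned int allocated = 100, used = 150;	/* 50-jiffy overrun */

	printf("capped charge:   %u\n", charge(used, allocated, 1)); /* 100 */
	printf("uncapped charge: %u\n", charge(used, allocated, 0)); /* 150 */
	return 0;
}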
> >
> >> +       } else {
> >> +               cfqe->vdisktime = service_tree->min_vdisktime;
> >>         }
> >>
> >> +insert:
> >>         left = 1;
> >>         parent = NULL;
> >>         cfqe->service_tree = service_tree;
> >>         p = &service_tree->rb.rb_node;
> >> +       key = entity_key(service_tree, cfqe);
> >>         while (*p) {
> >>                 struct rb_node **n;
> >>
> >> @@ -1296,7 +1352,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> >>                 /*
> >>                  * sort by key, that represents service time.
> >>                  */
> >> -               if (time_before(rb_key, __cfqe->rb_key))
> >> +               if (key < entity_key(service_tree, __cfqe))
> >>                         n = &(*p)->rb_left;
> >>                 else {
> >>                         n = &(*p)->rb_right;
> >> @@ -1309,10 +1365,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> >>         if (left)
> >>                 service_tree->left = &cfqe->rb_node;
> >>
> >> -       cfqe->rb_key = rb_key;
> >>         rb_link_node(&cfqe->rb_node, parent, p);
> >>         rb_insert_color(&cfqe->rb_node, &service_tree->rb);
> >> +       update_min_vdisktime(service_tree);
> >>         service_tree->count++;
> >> +       service_tree->total_weight += cfqe->weight;
> >> +       cfqq->reposition_time = jiffies;
> >>         if ((add_front || !new_cfqq) && !group_changed)
> >>                 return;
> >>         cfq_group_service_tree_add(cfqd, cfqq->cfqg);
> >> @@ -1414,14 +1472,18 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> >>  static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> >>  {
> >>         struct cfq_entity *cfqe;
> >> +       struct cfq_rb_root *service_tree;
> >> +
> >>         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
> >>         BUG_ON(!cfq_cfqq_on_rr(cfqq));
> >>         cfq_clear_cfqq_on_rr(cfqq);
> >>
> >>         cfqe = &cfqq->cfqe;
> >> +       service_tree = cfqe->service_tree;
> >>
> >>         if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> >>                 cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> >> +               service_tree->total_weight -= cfqe->weight;
> >>                 cfqe->service_tree = NULL;
> >>         }
> >>         if (cfqq->p_root) {
> >> @@ -2120,23 +2182,35 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
> >>         }
> >>  }
> >>
> >> +/*
> >> + * The time when a CFQ queue is put onto a service tree is recorded in
> >> + * cfqq->reposition_time. Currently, we check the first-priority CFQ
> >> + * queues on each service tree, and select the workload type that
> >> + * contains the lowest reposition_time CFQ queue among them.
> >> + */
> >>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> >>                                 struct cfq_group *cfqg, enum wl_prio_t prio)
> >>  {
> >>         struct cfq_entity *cfqe;
> >> +       struct cfq_queue *cfqq;
> >> +       unsigned long lowest_start_time;
> >>         int i;
> >> -       bool key_valid = false;
> >> -       unsigned long lowest_key = 0;
> >> +       bool time_valid = false;
> >>         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
> >>
> >> +       /*
> >> +        * TODO: We may take io priority into account when choosing a workload
> >> +        * type. But for the time being just make use of reposition_time only.
> >> +        */
> > Not just priority; the original rb_key considers sync/async too.
> 
> IIUC, in the original CFQ, rb_key is cross-tree, so we need to take
> sync/async into account. But with vdisktime introduced, each service tree
> has its own vdisktime space, so there's no need to consider sync/async here.
Previously rb_key was jiffies + offset, where the offset is scaled by
priority and sync/async. So if you add two queues of different type and
the same priority, the sync queue will be dispatched first; currently
only the insertion time is checked here, so the sync queue isn't boosted.
That's why I said the original code considers not just priority but also
sync/async.
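For illustration, a userspace model of the removed cfq_slice_offset()
arithmetic, assuming the HZ=1000 defaults cfq_slice[1]=100, cfq_slice[0]=40
and CFQ_SLICE_SCALE=5:

/* Models the old rb_key offset to show the point above: for the same
 * ioprio, a sync queue used to get a smaller offset (earlier rb_key)
 * than an async one. */
#include <stdio.h>

static int prio_slice(int sync, int prio)
{
	int base = sync ? 100 : 40;	/* cfqd->cfq_slice[sync] */

	return base + base / 5 * (4 - prio);
}

static int slice_offset(int nr_cfqq, int sync, int prio)
{
	/* (nr_cfqq - 1) * (cfq_prio_slice(1, 0) - cfq_prio_slice(sync, prio)) */
	return (nr_cfqq - 1) * (prio_slice(1, 0) - prio_slice(sync, prio));
}

int main(void)
{
	/* two queues, both ioprio 4 */
	printf("sync  prio4 offset: %d\n", slice_offset(2, 1, 4)); /* 80 */
	printf("async prio4 offset: %d\n", slice_offset(2, 0, 4)); /* 140 */
	return 0;
}

With two queues at ioprio 4, the sync queue gets offset 80 and the async one
140, so the sync queue used to land earlier on the tree.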

Thanks,
Shaohua


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2010-12-28  6:03       ` Shaohua Li
@ 2010-12-28  6:59         ` Gui Jianfeng
  0 siblings, 0 replies; 21+ messages in thread
From: Gui Jianfeng @ 2010-12-28  6:59 UTC (permalink / raw)
  To: Shaohua Li, Vivek Goyal
  Cc: Jens Axboe, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer

Shaohua Li wrote:
> On Tue, 2010-12-28 at 11:59 +0800, Gui Jianfeng wrote:
>> Shaohua Li wrote:
>>> On Mon, 2010-12-27 at 16:51 +0800, Gui Jianfeng wrote:
>>>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>>>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
>>>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
>>>> group scheduling on the same service tree.
>>> Haven't read the whole series yet. It looks like this changes a lot of
>>> logic even without cgroup support. Have you tested whether this changes
>>> performance or latency in some mixed workloads?
>> Hi Shaohua,
>>
>> I did some tests with different workloads, with idling on and off.
>> I don't see a performance drop on my box.
> please check latency too.
> 
>>>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>>>> ---
>>>>  block/cfq-iosched.c |  211 ++++++++++++++++++++++++++++++++++++++-------------
>>>>  1 files changed, 158 insertions(+), 53 deletions(-)
>>>>
>>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>>>> index a2553c0..b598798 100644
>>>> --- a/block/cfq-iosched.c
>>>> +++ b/block/cfq-iosched.c
>>>> @@ -100,10 +100,7 @@ struct cfq_entity {
>>>>         struct cfq_rb_root *service_tree;
>>>>         /* service_tree member */
>>>>         struct rb_node rb_node;
>>>> -       /* service_tree key, represent the position on the tree */
>>>> -       unsigned long rb_key;
>>>> -
>>>> -       /* group service_tree key */
>>>> +       /* service_tree key */
>>>>         u64 vdisktime;
>>>>         bool is_group_entity;
>>>>         unsigned int weight;
>>>> @@ -115,6 +112,8 @@ struct cfq_entity {
>>>>  struct cfq_queue {
>>>>         /* The schedule entity */
>>>>         struct cfq_entity cfqe;
>>>> +       /* Reposition time */
>>>> +       unsigned long reposition_time;
>>>>         /* reference count */
>>>>         atomic_t ref;
>>>>         /* various state flags, see below */
>>>> @@ -313,6 +312,24 @@ struct cfq_data {
>>>>         struct rcu_head rcu;
>>>>  };
>>>>
>>>> +/*
>>>> + * Map io priority(7 ~ 0) to io weight(100 ~ 1000) as follows
>>>> + *     prio       0    1     2    3    4    5    6     7
>>>> + *     weight  1000  868   740  612  484  356  228   100
>>>> + */
>>>> +static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
>>>> +{
>>>> +       unsigned int step;
>>>> +
>>>> +       BUG_ON(ioprio >= IOPRIO_BE_NR);
>>>> +
>>>> +       step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
>>>> +       if (ioprio == 0)
>>>> +               return BLKIO_WEIGHT_MAX;
>>>> +
>>>> +       return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
>>>> +}
>>>> +
>>>>  static inline struct cfq_queue *
>>>>  cfqq_of_entity(struct cfq_entity *cfqe)
>>>>  {
>>>> @@ -841,16 +858,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>>>         return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
>>>>  }
>>>>
>>>> -static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
>>>> -                                     struct cfq_queue *cfqq)
>>>> -{
>>>> -       /*
>>>> -        * just an approximation, should be ok.
>>>> -        */
>>>> -       return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
>>>> -                      cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
>>>> -}
>>>> -
>>>>  static inline s64
>>>>  entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
>>>>  {
>>>> @@ -1199,6 +1206,16 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
>>>>
>>>>  #endif /* GROUP_IOSCHED */
>>>>
>>>> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
>>>> +                                struct cfq_entity *cfqe)
>>>> +{
>>>> +       u64 d = cfqd->cfq_slice[1] << CFQ_SERVICE_SHIFT;
>>>> +
>>>> +       d = d * BLKIO_WEIGHT_DEFAULT;
>>>> +       do_div(d, BLKIO_WEIGHT_MAX - cfqe->weight + BLKIO_WEIGHT_MIN);
>>>> +       return d;
>>>> +}
>>> Don't we need to consider sync/async here? For groups we don't have such
>>> an issue, but otherwise we need to boost sync.
>> Ok, we should.
>>
>>>> +
>>>>  /*
>>>>   * The cfqd->service_trees holds all pending cfq_queue's that have
>>>>   * requests waiting to be processed. It is sorted in the order that
>>>> @@ -1210,13 +1227,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>>>         struct cfq_entity *cfqe;
>>>>         struct rb_node **p, *parent;
>>>>         struct cfq_entity *__cfqe;
>>>> -       unsigned long rb_key;
>>>> -       struct cfq_rb_root *service_tree;
>>>> +       struct cfq_rb_root *service_tree, *orig_st;
>>>>         int left;
>>>>         int new_cfqq = 1;
>>>>         int group_changed = 0;
>>>> +       s64 key;
>>>>
>>>>         cfqe = &cfqq->cfqe;
>>>> +       orig_st = cfqe->service_tree;
>>>>
>>>>  #ifdef CONFIG_CFQ_GROUP_IOSCHED
>>>>         if (!cfqd->cfq_group_isolation
>>>> @@ -1224,8 +1242,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>>>             && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
>>>>                 /* Move this cfq to root group */
>>>>                 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
>>>> -               if (!RB_EMPTY_NODE(&cfqe->rb_node))
>>>> +               if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>>>>                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
>>>> +                       /*
>>>> +                        * Group changed, dequeue this CFQ queue from the
>>>> +                        * original service tree.
>>>> +                        */
>>>> +                       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>>>> +                       orig_st->total_weight -= cfqe->weight;
>>>> +               }
>>>>                 cfqq->orig_cfqg = cfqq->cfqg;
>>>>                 cfqq->cfqg = &cfqd->root_group;
>>>>                 atomic_inc(&cfqd->root_group.ref);
>>>> @@ -1234,8 +1259,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>>>                    && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
>>>>                 /* cfqq is sequential now needs to go to its original group */
>>>>                 BUG_ON(cfqq->cfqg != &cfqd->root_group);
>>>> -               if (!RB_EMPTY_NODE(&cfqe->rb_node))
>>>> +               if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>>>>                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
>>>> +                       /*
>>>> +                        * Group changed, dequeue this CFQ queue from the
>>>> +                        * original service tree.
>>>> +                        */
>>>> +                       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>>>> +                       orig_st->total_weight -= cfqe->weight;
>>>> +               }
>>>>                 cfq_put_cfqg(cfqq->cfqg);
>>>>                 cfqq->cfqg = cfqq->orig_cfqg;
>>>>                 cfqq->orig_cfqg = NULL;
>>>> @@ -1246,47 +1278,71 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>>>
>>>>         service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
>>>>                                                 cfqq_type(cfqq));
>>>> +       /*
>>>> +        * For the time being, put the newly added CFQ queue at the end of the
>>>> +        * service tree.
>>>> +        */
>>> so this should be fixed later? please mark it as a todo.
>> Ah, I forgot to remove this comment; the vdisktime is already boosted according to its ioprio.
> ok, in the code below it looks like cfq_get_boost() returns a value that is
> always greater than CFQ_IDLE_DELAY, so the new queue is inserted before
> __cfqe. Is this intended?

Currently, yes.
I may improve the cfq_get_boost() logic later.

>>>> +       if (RB_EMPTY_NODE(&cfqe->rb_node)) {
>>>> +               /*
>>>> +                * If this CFQ queue moves to another group, the original
>>>> +                * vdisktime makes no sense any more, reset the vdisktime
>>>> +                * here.
>>>> +                */
>>>> +               parent = rb_last(&service_tree->rb);
>>>> +               if (parent) {
>>>> +                       u64 boost;
>>>> +                       s64 __vdisktime;
>>>> +
>>>> +                       __cfqe = rb_entry_entity(parent);
>>>> +                       cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>>>> +
>>>> +                       /* Give some vdisktime boost according to its weight */
>>>> +                       boost = cfq_get_boost(cfqd, cfqe);
>>>> +                       __vdisktime = cfqe->vdisktime - boost;
>>>> +                       if (__vdisktime > service_tree->min_vdisktime)
>>>> +                               cfqe->vdisktime = __vdisktime;
>>>> +                       else
>>>> +                               cfqe->vdisktime = service_tree->min_vdisktime;
>>>> +               } else
>>>> +                       cfqe->vdisktime = service_tree->min_vdisktime;
>>>> +
>>>> +               goto insert;
>>>> +       }
>>>> +       /*
>>>> +        * If we get here, this CFQ queue is already on the service tree;
>>>> +        * dequeue it first.
>>>> +        */
>>>> +       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>>>> +       orig_st->total_weight -= cfqe->weight;
>>>> +
>>>> +       new_cfqq = 0;
>>>> +
>>>>         if (cfq_class_idle(cfqq)) {
>>>> -               rb_key = CFQ_IDLE_DELAY;
>>>>                 parent = rb_last(&service_tree->rb);
>>>>                 if (parent && parent != &cfqe->rb_node) {
>>>>                         __cfqe = rb_entry(parent, struct cfq_entity, rb_node);
>>>> -                       rb_key += __cfqe->rb_key;
>>>> +                       cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>>>>                 } else
>>>> -                       rb_key += jiffies;
>>>> +                       cfqe->vdisktime = service_tree->min_vdisktime;
>>>>         } else if (!add_front) {
>>>>                 /*
>>>> -                * Get our rb key offset. Subtract any residual slice
>>>> -                * value carried from last service. A negative resid
>>>> -                * count indicates slice overrun, and this should position
>>>> -                * the next service time further away in the tree.
>>>> -                */
>>>> -               rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
>>>> -               rb_key -= cfqq->slice_resid;
>>>> -               cfqq->slice_resid = 0;
>>>> -       } else {
>>>> -               rb_key = -HZ;
>>>> -               __cfqe = cfq_rb_first(service_tree);
>>>> -               rb_key += __cfqe ? __cfqe->rb_key : jiffies;
>>>> -       }
>>>> -
>>>> -       if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>>>> -               new_cfqq = 0;
>>>> -               /*
>>>> -                * same position, nothing more to do
>>>> +                * We charge the CFQ queue by the time this queue runs, and
>>>> +                * reposition it on the service tree.
>>>>                  */
>>>> -               if (rb_key == cfqe->rb_key &&
>>>> -                   cfqe->service_tree == service_tree)
>>>> -                       return;
>>>> +               unsigned int used_sl;
>>>>
>>>> -               cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>>>> -               cfqe->service_tree = NULL;
>>>> +               used_sl = cfq_cfqq_slice_usage(cfqq);
>>>> +               cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
>>> cfqq->slice_resid could be > 0 or < 0, and the queue can be boosted or
>>> penalized. It looks like this logic is completely removed. The same exists
>>> in the current cfq group code too, I wonder why?
>> With vdisktime introduced, vdisktime represents the cfqq's position on a
>> service tree, and it is decided by the actual running time of the cfqq. So
>> we don't need slice_resid any more here.
> I mean the logic. cfq_cfqq_slice_usage() does:
> 		slice_used = jiffies - cfqq->slice_start;
> 		if (slice_used > cfqq->allocated_slice)
> 			slice_used = cfqq->allocated_slice;
> so the queue doesn't get a penalty if it uses more slice than allocated;
> looks like a bug here.

Ok, since cfqq and cfqg are scheduled at the same level, I'd like to make
them conform to the same scheduling logic. I'm not sure why cfqg charges the
allocated slice rather than the used slice when overshooting.
Vivek may answer it.

>>>> +       } else {
>>>> +               cfqe->vdisktime = service_tree->min_vdisktime;
>>>>         }
>>>>
>>>> +insert:
>>>>         left = 1;
>>>>         parent = NULL;
>>>>         cfqe->service_tree = service_tree;
>>>>         p = &service_tree->rb.rb_node;
>>>> +       key = entity_key(service_tree, cfqe);
>>>>         while (*p) {
>>>>                 struct rb_node **n;
>>>>
>>>> @@ -1296,7 +1352,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>>>                 /*
>>>>                  * sort by key, that represents service time.
>>>>                  */
>>>> -               if (time_before(rb_key, __cfqe->rb_key))
>>>> +               if (key < entity_key(service_tree, __cfqe))
>>>>                         n = &(*p)->rb_left;
>>>>                 else {
>>>>                         n = &(*p)->rb_right;
>>>> @@ -1309,10 +1365,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>>>         if (left)
>>>>                 service_tree->left = &cfqe->rb_node;
>>>>
>>>> -       cfqe->rb_key = rb_key;
>>>>         rb_link_node(&cfqe->rb_node, parent, p);
>>>>         rb_insert_color(&cfqe->rb_node, &service_tree->rb);
>>>> +       update_min_vdisktime(service_tree);
>>>>         service_tree->count++;
>>>> +       service_tree->total_weight += cfqe->weight;
>>>> +       cfqq->reposition_time = jiffies;
>>>>         if ((add_front || !new_cfqq) && !group_changed)
>>>>                 return;
>>>>         cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>>>> @@ -1414,14 +1472,18 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>>>>  static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>>>>  {
>>>>         struct cfq_entity *cfqe;
>>>> +       struct cfq_rb_root *service_tree;
>>>> +
>>>>         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
>>>>         BUG_ON(!cfq_cfqq_on_rr(cfqq));
>>>>         cfq_clear_cfqq_on_rr(cfqq);
>>>>
>>>>         cfqe = &cfqq->cfqe;
>>>> +       service_tree = cfqe->service_tree;
>>>>
>>>>         if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>>>>                 cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>>>> +               service_tree->total_weight -= cfqe->weight;
>>>>                 cfqe->service_tree = NULL;
>>>>         }
>>>>         if (cfqq->p_root) {
>>>> @@ -2120,23 +2182,35 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
>>>>         }
>>>>  }
>>>>
>>>> +/*
>>>> + * The time when a CFQ queue is put onto a service tree is recorded in
>>>> + * cfqq->reposition_time. Currently, we check the first-priority CFQ
>>>> + * queues on each service tree, and select the workload type that
>>>> + * contains the lowest reposition_time CFQ queue among them.
>>>> + */
>>>>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>>>                                 struct cfq_group *cfqg, enum wl_prio_t prio)
>>>>  {
>>>>         struct cfq_entity *cfqe;
>>>> +       struct cfq_queue *cfqq;
>>>> +       unsigned long lowest_start_time;
>>>>         int i;
>>>> -       bool key_valid = false;
>>>> -       unsigned long lowest_key = 0;
>>>> +       bool time_valid = false;
>>>>         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>>>>
>>>> +       /*
>>>> +        * TODO: We may take io priority into account when choosing a workload
>>>> +        * type. But for the time being just make use of reposition_time only.
>>>> +        */
>>> Not just priority; the original rb_key considers sync/async too.
>> IIUC, in the original CFQ, rb_key is cross-tree, so we need to take
>> sync/async into account. But with vdisktime introduced, each service tree
>> has its own vdisktime space, so there's no need to consider sync/async here.
> Previously rb_key was jiffies + offset, where the offset is scaled by
> priority and sync/async. So if you add two queues of different type and
> the same priority, the sync queue will be dispatched first; currently
> only the insertion time is checked here, so the sync queue isn't boosted.
> That's why I said the original code considers not just priority but also
> sync/async.

Ok, got it. I will update the annotation.

Thanks for the comments Shaohua.
Gui

> 
> Thanks,
> Shaohua
> 
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 6/6 v3] blkio-cgroup: Document for blkio.use_hierarchy interface
  2010-12-27  8:51 ` [PATCH 6/6 v3] blkio-cgroup: Document for blkio.use_hierarchy interface Gui Jianfeng
@ 2011-01-18 20:27   ` Vivek Goyal
  2011-01-19  1:20     ` Gui Jianfeng
  0 siblings, 1 reply; 21+ messages in thread
From: Vivek Goyal @ 2011-01-18 20:27 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Jens Axboe, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

On Mon, Dec 27, 2010 at 04:51:20PM +0800, Gui Jianfeng wrote:
> Document for blkio.use_hierarchy interface
> 
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
>  Documentation/cgroups/blkio-controller.txt |   70 ++++++++++++++++++++--------
>  1 files changed, 51 insertions(+), 19 deletions(-)
> 
> diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
> index 4ed7b5c..bd01d6c 100644
> --- a/Documentation/cgroups/blkio-controller.txt
> +++ b/Documentation/cgroups/blkio-controller.txt
> @@ -91,30 +91,51 @@ Throttling/Upper Limit policy
>  
>  Hierarchical Cgroups
>  ====================
> -- Currently none of the IO control policy supports hierarhical groups. But
> -  cgroup interface does allow creation of hierarhical cgroups and internally
> -  IO policies treat them as flat hierarchy.
> +- The cgroup interface allows creation of hierarchical cgroups. Internally,
> +  the IO policies are able to treat them either as a flat hierarchy or as a
> +  real hierarchy.

Gui, now we have 2 IO policies. The throttling policy still supports only flat
mode. Can you please make it clear here that CFQ will support both flat and
hierarchical mode, while throttling supports only flat mode as of today?

> Both hierarchical bandwidth division and flat
> +  bandwidth division are supported. "blkio.use_hierarchy" can be used to
> +  switch between flat mode and hierarchical mode.
>  
> -  So this patch will allow creation of cgroup hierarhcy but at the backend
> -  everything will be treated as flat. So if somebody created a hierarchy like
> -  as follows.
> +  Consider the following CGroup hierarchy:
>  
> -			root
> -			/  \
> -		     test1 test2
> -			|
> -		     test3
> +			  Root
> +			/  |   \
> +		     Grp1  Grp2 tsk1
> +	            /  \
> +		 Grp3 tsk2
>  
> -  CFQ and throttling will practically treat all groups at same level.
> +  If blkio.use_hierarchy is disabled in all CGroups, CFQ will practically treat all groups
> +  at the same level.
>  
> -				pivot
> -			     /  |   \  \
> -			root  test1 test2  test3
> +			     Pivot tree
> +			    /  |   |   \
> +			Root Grp1 Grp2 Grp3
> +			 /     |
> +			tsk1  tsk2
>  
> -  Down the line we can implement hierarchical accounting/control support
> -  and also introduce a new cgroup file "use_hierarchy" which will control
> -  whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
> -  This is how memory controller also has implemented the things.
> +  If blkio.use_hierarchy is enabled in Grp1 and Grp3, CFQ will treat groups and tasks with the

I would think that before this example we can give one simpler example,
where use_hierarchy=1 is set in the root cgroup. That would/should lead to
all children groups under root having use_hierarchy=1 set automatically,
and it will look as follows.

		     	Pivot tree
			    |
		           root
			 /  |  | 
	             tsk1   G1 G2 
			   / \
			 grp3 tsk2

After this you can give the more complicated example below, where
use_hierarchy=1 is set only for a sub-branch of the tree and not the
whole tree.

> +  same view as the CGroup hierarchy; it looks as follows.
> +
> +
> +			     Pivot tree
> +			    /    |    \
> +			  Root  Grp1  Grp2
> +			  /     /  \
> +		       tsk1   Grp3 tsk2
> +
> +  Root, Grp1 and Grp2 are treated at the same level under Pivot tree. tsk1 stays under Root.
> +  Grp3 and tsk2 are treated at the same level under Grp1. Below is the mapping between
> +  task io priority and io weight:
> +
> +	    prio       0    1     2    3    4    5    6     7
> +	    weight  1000  868   740  612  484  356  228   100
> +
> +  Note1: Regardless of the use_hierarchy setting in the Root group, the Root group is always
> +  put onto the Pivot tree.
> +
> +  Note2: Currently, "blkio.use_hierarchy" only affects proportional bandwidth division. The
> +  throttling logic still continues to treat everything as flat.
>  

Ok, I see you mentioned throttling flat mode here. I guess it is better
to clarify it in the beginning itself.

>  Various user visible config options
>  ===================================
> @@ -169,6 +190,17 @@ Proportional weight policy files
>  	  dev     weight
>  	  8:16    300
>  
> +- blkio.use_hierarchy
> +	- Switch between hierarchical mode and flat mode as stated above.
> +	  blkio.use_hierarchy == 1 means hierarchical mode is enabled.
> +	  blkio.use_hierarchy == 0 means flat mode is enabled.
> +	  You can set this interface only if there isn't any child CGroup under
> +	  this CGroup. If one CGroup's blkio.use_hierarchy is set, the created
> +	  children will inherit it. It is not allowed to unset it in children.
> +	  The default mode in Root CGroup is flat.
> +	  blkio.use_hierarchy only works for proportional bandwidth division
> +	  as of today and doesn't have any effect on throttling logic.
> +

This is good. Makes use_hierarchy behavior for children and subtree very
clear.

>  - blkio.time
>  	- disk time allocated to cgroup per device in milliseconds. First
>  	  two fields specify the major and minor number of the device and
> -- 
> 1.6.5.2
> 
> 
> 
> 
> 
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/6 v3] cfq-iosched: Introduce cfq_entity for CFQ group
  2010-12-27  8:50 ` [PATCH 2/6 v3] cfq-iosched: Introduce cfq_entity for CFQ group Gui Jianfeng
@ 2011-01-18 21:20   ` Vivek Goyal
  2011-01-19  1:25     ` Gui Jianfeng
  2011-01-23  2:15     ` Gui Jianfeng
  0 siblings, 2 replies; 21+ messages in thread
From: Vivek Goyal @ 2011-01-18 21:20 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Jens Axboe, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

On Mon, Dec 27, 2010 at 04:50:51PM +0800, Gui Jianfeng wrote:

[..]
> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
> +static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>  {
>  	if (!root->left)
>  		root->left = rb_first(&root->rb);
>  
>  	if (root->left)
> -		return rb_entry_cfqg(root->left);
> +		return rb_entry_entity(root->left);
>  

Why are we defining cfq_rb_first_entity()? Can't we simply use cfq_rb_first()
as introduced in the first patch?

Thanks
Vivek

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 6/6 v3] blkio-cgroup: Document for blkio.use_hierarchy interface
  2011-01-18 20:27   ` Vivek Goyal
@ 2011-01-19  1:20     ` Gui Jianfeng
  0 siblings, 0 replies; 21+ messages in thread
From: Gui Jianfeng @ 2011-01-19  1:20 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Jens Axboe, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

Vivek Goyal wrote:
> On Mon, Dec 27, 2010 at 04:51:20PM +0800, Gui Jianfeng wrote:
>> Document for blkio.use_hierarchy interface
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>>  Documentation/cgroups/blkio-controller.txt |   70 ++++++++++++++++++++--------
>>  1 files changed, 51 insertions(+), 19 deletions(-)
>>
>> diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
>> index 4ed7b5c..bd01d6c 100644
>> --- a/Documentation/cgroups/blkio-controller.txt
>> +++ b/Documentation/cgroups/blkio-controller.txt
>> @@ -91,30 +91,51 @@ Throttling/Upper Limit policy
>>  
>>  Hierarchical Cgroups
>>  ====================
>> -- Currently none of the IO control policy supports hierarhical groups. But
>> -  cgroup interface does allow creation of hierarhical cgroups and internally
>> -  IO policies treat them as flat hierarchy.
>> +- The cgroup interface allows creation of hierarchical cgroups. Internally,
>> +  the IO policies are able to treat them either as a flat hierarchy or as a
>> +  real hierarchy.
> 
> Gui, now we have 2 IO policies. The throttling policy still supports only flat
> mode. Can you please make it clear here that CFQ will support both flat and
> hierarchical mode, while throttling supports only flat mode as of today?
> 
>> Both hierarchical bandwidth division and flat
>> +  bandwidth division are supported. "blkio.use_hierarchy" can be used to
>> +  switch between flat mode and hierarchical mode.
>>  
>> -  So this patch will allow creation of cgroup hierarhcy but at the backend
>> -  everything will be treated as flat. So if somebody created a hierarchy like
>> -  as follows.
>> +  Consider the following CGroup hierarchy:
>>  
>> -			root
>> -			/  \
>> -		     test1 test2
>> -			|
>> -		     test3
>> +			  Root
>> +			/  |   \
>> +		     Grp1  Grp2 tsk1
>> +	            /  \
>> +		 Grp3 tsk2
>>  
>> -  CFQ and throttling will practically treat all groups at same level.
>> +  If blkio.use_hierarchy is disabled in all CGroups, CFQ will practically treat all groups
>> +  at the same level.
>>  
>> -				pivot
>> -			     /  |   \  \
>> -			root  test1 test2  test3
>> +			     Pivot tree
>> +			    /  |   |   \
>> +			Root Grp1 Grp2 Grp3
>> +			 /     |
>> +			tsk1  tsk2
>>  
>> -  Down the line we can implement hierarchical accounting/control support
>> -  and also introduce a new cgroup file "use_hierarchy" which will control
>> -  whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
>> -  This is how memory controller also has implemented the things.
>> +  If blkio.use_hierarchy is enabled in Grp1 and Grp3, CFQ will treat groups and tasks with the
> 
> I would think that before this example we can give one simpler example,
> where use_hierarchy=1 is set in the root cgroup. That would/should lead to
> all children groups under root having use_hierarchy=1 set automatically,
> and it will look as follows.
> 
> 		     	Pivot tree
> 			    |
> 		           root
> 			 /  |  | 
> 	             tsk1   G1 G2 
> 			   / \
> 			 grp3 tsk2
> 
> After this you can give the more complicated example below, where
> use_hierarchy=1 is set only for a sub-branch of the tree and not the
> whole tree.

Sure.

> 
>> +  same view as the CGroup hierarchy; it looks as follows.
>> +
>> +
>> +			     Pivot tree
>> +			    /    |    \
>> +			  Root  Grp1  Grp2
>> +			  /     /  \
>> +		       tsk1   Grp3 tsk2
>> +
>> +  Root, Grp1 and Grp2 are treated at the same level under Pivot tree. tsk1 stays under Root.
>> +  Grp3 and tsk2 are treated at the same level under Grp1. Below is the mapping between
>> +  task io priority and io weight:
>> +
>> +	    prio       0    1     2    3    4    5    6     7
>> +	    weight  1000  868   740  612  484  356  228   100
>> +
>> +  Note1: Regardless of the use_hierarchy setting in the Root group, the Root group is always
>> +  put onto the Pivot tree.
>> +
>> +  Note2: Currently, "blkio.use_hierarchy" only affects proportional bandwidth division. The
>> +  throttling logic still continues to treat everything as flat.
>>  
> 
> Ok, I see you mentioned throttling flat mode here. I guess it is better
> to clarify it in the beginning itself.

Ok

Thanks,
Gui

> 
>>  Various user visible config options
>>  ===================================
>> @@ -169,6 +190,17 @@ Proportional weight policy files
>>  	  dev     weight
>>  	  8:16    300
>>  
>> +- blkio.use_hierarchy
>> +	- Switch between hierarchical mode and flat mode as stated above.
>> +	  blkio.use_hierarchy == 1 means hierarchical mode is enabled.
>> +	  blkio.use_hierarchy == 0 means flat mode is enabled.
>> +	  You can set this interface only if there isn't any child CGroup under
>> +	  this CGroup. If one CGroup's blkio.use_hierarchy is set, the created
>> +	  children will inherit it. It is not allowed to unset it in children.
>> +	  The default mode in Root CGroup is flat.
>> +	  blkio.use_hierarchy only works for proportional bandwidth division
>> +	  as of today and doesn't have any effect on throttling logic.
>> +
> 
> This is good. Makes use_hierarchy behavior for children and subtree very
> clear.
> 
>>  - blkio.time
>>  	- disk time allocated to cgroup per device in milliseconds. First
>>  	  two fields specify the major and minor number of the device and
>> -- 
>> 1.6.5.2
>>
>>
>>
>>
>>
>>
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/6 v3] cfq-iosched: Introduce cfq_entity for CFQ group
  2011-01-18 21:20   ` Vivek Goyal
@ 2011-01-19  1:25     ` Gui Jianfeng
  2011-01-23  2:15     ` Gui Jianfeng
  1 sibling, 0 replies; 21+ messages in thread
From: Gui Jianfeng @ 2011-01-19  1:25 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Jens Axboe, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

Vivek Goyal wrote:
> On Mon, Dec 27, 2010 at 04:50:51PM +0800, Gui Jianfeng wrote:
> 
> [..]
>> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
>> +static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>>  {
>>  	if (!root->left)
>>  		root->left = rb_first(&root->rb);
>>  
>>  	if (root->left)
>> -		return rb_entry_cfqg(root->left);
>> +		return rb_entry_entity(root->left);
>>  
> 
> Why are we defining cfq_rb_first_entity()? Can't we simply use cfq_rb_first()
> as introduced in the first patch?

Ok, will rename it.

Thanks,
Gui

> 
> Thanks
> Vivek
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2010-12-27  8:51 ` [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
  2010-12-28  2:50   ` Shaohua Li
@ 2011-01-19 22:58   ` Vivek Goyal
  2011-01-20  3:58     ` Gui Jianfeng
  1 sibling, 1 reply; 21+ messages in thread
From: Vivek Goyal @ 2011-01-19 22:58 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Jens Axboe, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

On Mon, Dec 27, 2010 at 04:51:00PM +0800, Gui Jianfeng wrote:
> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> group scheduling on the same service tree.
> 
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>

[..]
> @@ -1246,47 +1278,71 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>  
>  	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
>  						cfqq_type(cfqq));
> +	/*
> +	 * For the time being, put the newly added CFQ queue at the end of the
> +	 * service tree.
> +	 */
> +	if (RB_EMPTY_NODE(&cfqe->rb_node)) {
> +		/*
> +		 * If this CFQ queue moves to another group, the original
> +		 * vdisktime makes no sense any more, reset the vdisktime
> +		 * here.
> +		 */
> +		parent = rb_last(&service_tree->rb);
> +		if (parent) {
> +			u64 boost;
> +			s64 __vdisktime;
> +
> +			__cfqe = rb_entry_entity(parent);
> +			cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> +
> +			/* Give some vdisktime boost according to its weight */
> +			boost = cfq_get_boost(cfqd, cfqe);
> +			__vdisktime = cfqe->vdisktime - boost;
> +			if (__vdisktime > service_tree->min_vdisktime)
> +				cfqe->vdisktime = __vdisktime;
> +			else
> +				cfqe->vdisktime = service_tree->min_vdisktime;
> +		} else
> +			cfqe->vdisktime = service_tree->min_vdisktime;

Hi Gui,

Is the above logic actually working? I suspect that most of the time all the
new queues will end up getting the same vdisktime, namely st->min_vdisktime,
and there will be no service differentiation across the various prios.

The reason is that on an SSD, idling is disabled, so very few (or no) queues
will consume their slice and follow the requeue path. So every queue will be
new. Now you are doing the following:

	cfqe->vdisktime = vdisktime_of_parent + IDLE_DELAY - boost

Assume vdisktime_of_parent = st->min_vdisktime; then (IDLE_DELAY - boost)
is always going to be a negative number, and hence cfqe->vdisktime will
default to st->min_vdisktime (IDLE_DELAY is 200 while boost will be a huge
value due to the SERVICE_SHIFT scaling).

I think this logic needs refining. Maybe instead of subtracting the boost
we can instead place entities further away from st->min_vdisktime, with the
offset higher for lower-ioprio queues.

	cfqe->vdisktime = st->min_vdisktime + offset

Here offset is something similar to boost but reversed in nature, in the
sense that a lower weight gets a lower offset and vice-versa.

The important test here will be to run a bunch of cfq queues of different
ioprio on an SSD with queue depth 1 and see if you get service
differentiation. If yes, then you can increase the queue depth a bit, and
also the number of competing queues, and see what the result is. Also
monitor blktrace and the vdisktimes, and make sure higher-prio queues
get to run more than lower-prio queues.
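A minimal sketch of such a test: fork one O_DIRECT random reader per BE
priority level and count completed reads over a fixed window. This is a
hypothetical harness; the IOPRIO_* values mirror include/linux/ioprio.h
(no userspace header is exported), and /dev/sdb is a placeholder:

/* One O_DIRECT random reader per BE priority level, each counting
 * completed 4k reads over a 30s window. ioprio_set() has no glibc
 * wrapper, so it is invoked via syscall(). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_BE		2
#define IOPRIO_WHO_PROCESS	1
#define IOPRIO_PRIO_VALUE(c, d)	(((c) << IOPRIO_CLASS_SHIFT) | (d))

static void reader(int prio)
{
	void *buf;
	unsigned long done = 0;
	time_t end = time(NULL) + 30;
	int fd = open("/dev/sdb", O_RDONLY | O_DIRECT);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		exit(1);
	syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0,
		IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, prio));
	srand(getpid());
	while (time(NULL) < end) {
		off_t off = ((off_t)rand() % 100000) * 4096;

		if (pread(fd, buf, 4096, off) > 0)
			done++;
	}
	printf("prio %d: %lu reads\n", prio, done);
	exit(0);
}

int main(void)
{
	int prio;

	for (prio = 0; prio < 8; prio++)
		if (fork() == 0)
			reader(prio);
	while (wait(NULL) > 0)
		;
	return 0;
}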

This is the most critical piece of converting the cfqq scheduling logic,
so let's make sure that we get it right.


Thanks
Vivek

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-01-19 22:58   ` Vivek Goyal
@ 2011-01-20  3:58     ` Gui Jianfeng
  2011-01-20 11:09       ` Vivek Goyal
  0 siblings, 1 reply; 21+ messages in thread
From: Gui Jianfeng @ 2011-01-20  3:58 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Jens Axboe, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

Vivek Goyal wrote:
> On Mon, Dec 27, 2010 at 04:51:00PM +0800, Gui Jianfeng wrote:
>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
>> group scheduling on the same service tree.
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> 
> [..]
>> @@ -1246,47 +1278,71 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>  
>>  	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
>>  						cfqq_type(cfqq));
>> +	/*
>> +	 * For the time being, put the newly added CFQ queue at the end of the
>> +	 * service tree.
>> +	 */
>> +	if (RB_EMPTY_NODE(&cfqe->rb_node)) {
>> +		/*
>> +		 * If this CFQ queue moves to another group, the original
>> +		 * vdisktime makes no sense any more, reset the vdisktime
>> +		 * here.
>> +		 */
>> +		parent = rb_last(&service_tree->rb);
>> +		if (parent) {
>> +			u64 boost;
>> +			s64 __vdisktime;
>> +
>> +			__cfqe = rb_entry_entity(parent);
>> +			cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> +
>> +			/* Give some vdisktime boost according to its weight */
>> +			boost = cfq_get_boost(cfqd, cfqe);
>> +			__vdisktime = cfqe->vdisktime - boost;
>> +			if (__vdisktime > service_tree->min_vdisktime)
>> +				cfqe->vdisktime = __vdisktime;
>> +			else
>> +				cfqe->vdisktime = service_tree->min_vdisktime;
>> +		} else
>> +			cfqe->vdisktime = service_tree->min_vdisktime;
> 
> Hi Gui,
> 
> Is the above logic actually working? I suspect that most of the time all the
> new queues will end up getting the same vdisktime, namely st->min_vdisktime,
> and there will be no service differentiation across the various prios.
> 
> The reason is that on an SSD, idling is disabled, so very few (or no) queues
> will consume their slice and follow the requeue path. So every queue will be
> new. Now you are doing the following:
> 
> 	cfqe->vdisktime = vdisktime_of_parent + IDLE_DELAY - boost
> 
> Assume vdisktime_of_parent = st->min_vdisktime; then (IDLE_DELAY - boost)
> is always going to be a negative number, and hence cfqe->vdisktime will
> default to st->min_vdisktime (IDLE_DELAY is 200 while boost will be a huge
> value due to the SERVICE_SHIFT scaling).

Vivek,

Actually, I tested on a rotational disk with idling disabled, and I saw
service differentiation between two tasks with different ioprios.
I don't have an SSD on hand, but I'll get one and do more tests.

> 
> I think this logic needs refining. Maybe instead of subtracting the boost
> we can instead place entities further away from st->min_vdisktime, with the
> offset higher for lower-ioprio queues.
> 
> 	cfqe->vdisktime = st->min_vdisktime + offset
> 
> Here offset is something similar to boost but reversed in nature, in the
> sense that a lower weight gets a lower offset and vice-versa.

I'll consider this idea and try it.

> 
> The important test here will be to run a bunch of cfq queues of different
> ioprio on an SSD with queue depth 1 and see if you get service
> differentiation. If yes, then you can increase the queue depth a bit, and
> also the number of competing queues, and see what the result is. Also
> monitor blktrace and the vdisktimes, and make sure higher-prio queues
> get to run more than lower-prio queues.
> 
> This is the most critical piece of converting the cfqq scheduling logic,
> so let's make sure that we get it right.

Yes, of course. :)

Thanks,
Gui

> 
> 
> Thanks
> Vivek
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-01-20  3:58     ` Gui Jianfeng
@ 2011-01-20 11:09       ` Vivek Goyal
  2011-01-24  4:45         ` Gui Jianfeng
  0 siblings, 1 reply; 21+ messages in thread
From: Vivek Goyal @ 2011-01-20 11:09 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Jens Axboe, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

On Thu, Jan 20, 2011 at 11:58:18AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Mon, Dec 27, 2010 at 04:51:00PM +0800, Gui Jianfeng wrote:
> >> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> >> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> >> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> >> group scheduling on the same service tree.
> >>
> >> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> > 
> > [..]
> >> @@ -1246,47 +1278,71 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> >>  
> >>  	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
> >>  						cfqq_type(cfqq));
> >> +	/*
> >> +	 * For the time being, put the newly added CFQ queue at the end of the
> >> +	 * service tree.
> >> +	 */
> >> +	if (RB_EMPTY_NODE(&cfqe->rb_node)) {
> >> +		/*
> >> +		 * If this CFQ queue moves to another group, the original
> >> +		 * vdisktime makes no sense any more, reset the vdisktime
> >> +		 * here.
> >> +		 */
> >> +		parent = rb_last(&service_tree->rb);
> >> +		if (parent) {
> >> +			u64 boost;
> >> +			s64 __vdisktime;
> >> +
> >> +			__cfqe = rb_entry_entity(parent);
> >> +			cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> >> +
> >> +			/* Give some vdisktime boost according to its weight */
> >> +			boost = cfq_get_boost(cfqd, cfqe);
> >> +			__vdisktime = cfqe->vdisktime - boost;
> >> +			if (__vdisktime > service_tree->min_vdisktime)
> >> +				cfqe->vdisktime = __vdisktime;
> >> +			else
> >> +				cfqe->vdisktime = service_tree->min_vdisktime;
> >> +		} else
> >> +			cfqe->vdisktime = service_tree->min_vdisktime;
> > 
> > Hi Gui,
> > 
> > Is the above logic actually working? I suspect that most of the time all the
> > new queues will end up getting the same vdisktime, namely st->min_vdisktime,
> > and there will be no service differentiation across the various prios.
> > 
> > The reason is that on an SSD, idling is disabled, so very few (or no) queues
> > will consume their slice and follow the requeue path. So every queue will be
> > new. Now you are doing the following:
> > 
> > 	cfqe->vdisktime = vdisktime_of_parent + IDLE_DELAY - boost
> > 
> > Assume vdisktime_of_parent = st->min_vdisktime; then (IDLE_DELAY - boost)
> > is always going to be a negative number, and hence cfqe->vdisktime will
> > default to st->min_vdisktime (IDLE_DELAY is 200 while boost will be a huge
> > value due to the SERVICE_SHIFT scaling).
> 
> Vivek,
> 
> Actually, I tested on a rotational disk with idling disabled, and I saw
> service differentiation between two tasks with different ioprios.
> I don't have an SSD on hand, but I'll get one and do more tests.

Ok, I am really not sure how it worked for you, because boost will
turn out to be a huge number in comparison to IDLE_DELAY or the vdisktime
of the parent. Some blktrace data, or an explanation of how it is
working, would help in understanding it better.

> 
> > 
> > I think this logic needs refining. Maybe instead of subtracting the boost
> > we can instead place entities further away from st->min_vdisktime, with the
> > offset higher for lower-ioprio queues.
> > 
> > 	cfqe->vdisktime = st->min_vdisktime + offset
> > 
> > Here offset is something similar to boost but reversed in nature, in the
> > sense that a lower weight gets a lower offset and vice-versa.
> 
> I'll consider this idea and try it.

I think I wrote it reversed: the offset should be higher for lower weights
and lower for higher weights, so that higher-weight queues/groups can
dispatch IO earlier.
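For illustration, a userspace model of that placement rule; the scaling
below is only a sketch of the idea, not a proposed final formula:

/* A newly backlogged entity is placed at st->min_vdisktime plus an
 * offset that grows as its weight falls, so heavier (higher-prio)
 * queues dispatch sooner. */
#include <stdio.h>

#define BLKIO_WEIGHT_MIN	100
#define BLKIO_WEIGHT_MAX	1000
#define CFQ_SERVICE_SHIFT	12

static unsigned long long place(unsigned long long min_vdisktime,
				unsigned int weight, unsigned int slice)
{
	unsigned long long offset;

	/* weight 1000 -> small offset, weight 100 -> large offset */
	offset = ((unsigned long long)slice << CFQ_SERVICE_SHIFT) *
		 (BLKIO_WEIGHT_MAX - weight + BLKIO_WEIGHT_MIN) /
		 BLKIO_WEIGHT_MAX;
	return min_vdisktime + offset;
}

int main(void)
{
	int w;

	/* vdisktime rises as weight falls, so insertion order follows prio */
	for (w = BLKIO_WEIGHT_MAX; w >= BLKIO_WEIGHT_MIN; w -= 128)
		printf("weight %4d -> vdisktime %llu\n", w, place(0, w, 100));
	return 0;
}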

If this works fine, it will help with the group logic also. Currently we
place all newly backlogged groups at the end of the service tree, and this
results in no service differentiation if idling is disabled (slice_idle
and group_idle). Keeping idling on can hurt very badly on faster storage
like storage arrays.

So I wanted to make sure that this logic works, so that even for groups
one can disable idling on faster storage and still get some service
differentiation among groups.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/6 v3] cfq-iosched: Introduce cfq_entity for CFQ group
  2011-01-18 21:20   ` Vivek Goyal
  2011-01-19  1:25     ` Gui Jianfeng
@ 2011-01-23  2:15     ` Gui Jianfeng
  1 sibling, 0 replies; 21+ messages in thread
From: Gui Jianfeng @ 2011-01-23  2:15 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Jens Axboe, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

Vivek Goyal wrote:
> On Mon, Dec 27, 2010 at 04:50:51PM +0800, Gui Jianfeng wrote:
> 
> [..]
>> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
>> +static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>>  {
>>  	if (!root->left)
>>  		root->left = rb_first(&root->rb);
>>  
>>  	if (root->left)
>> -		return rb_entry_cfqg(root->left);
>> +		return rb_entry_entity(root->left);
>>  
> 
> Why are we defining cfq_rb_first_entity()? Can't we simply use cfq_rb_first()
> as introduced in the first patch?

Vivek,

At this time, we can't remove cfq_rb_first_entity(), because we don't
use st->count for cfqg.
In [PATCH 5/6], I remove cfq_rb_first_entity() and just use cfq_rb_first()
for cfqq and cfqg.

Thanks,
Gui

> 
> Thanks
> Vivek
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-01-20 11:09       ` Vivek Goyal
@ 2011-01-24  4:45         ` Gui Jianfeng
  2011-01-24 18:51           ` Vivek Goyal
  0 siblings, 1 reply; 21+ messages in thread
From: Gui Jianfeng @ 2011-01-24  4:45 UTC (permalink / raw)
  To: Vivek Goyal, Jens Axboe
  Cc: linux kernel mailing list, Corrado Zoccolo, Chad Talbott,
	Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

Vivek Goyal wrote:
> On Thu, Jan 20, 2011 at 11:58:18AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Mon, Dec 27, 2010 at 04:51:00PM +0800, Gui Jianfeng wrote:
>>>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>>>> maps to a range [100,1000]. It also gets rid of the cfq_slice_offset() logic and
>>>> makes use of the same scheduling algorithm as CFQ group does. This helps with CFQ
>>>> queue and group scheduling on the same service tree.
>>>>
>>>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>>> [..]
>>>> @@ -1246,47 +1278,71 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>>>  
>>>>  	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
>>>>  						cfqq_type(cfqq));
>>>> +	/*
>>>> +	 * For the time being, put the newly added CFQ queue at the end of the
>>>> +	 * service tree.
>>>> +	 */
>>>> +	if (RB_EMPTY_NODE(&cfqe->rb_node)) {
>>>> +		/*
>>>> +		 * If this CFQ queue moves to another group, the original
>>>> +		 * vdisktime makes no sense any more, reset the vdisktime
>>>> +		 * here.
>>>> +		 */
>>>> +		parent = rb_last(&service_tree->rb);
>>>> +		if (parent) {
>>>> +			u64 boost;
>>>> +			s64 __vdisktime;
>>>> +
>>>> +			__cfqe = rb_entry_entity(parent);
>>>> +			cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>>>> +
>>>> +			/* Give some vdisktime boost according to its weight */
>>>> +			boost = cfq_get_boost(cfqd, cfqe);
>>>> +			__vdisktime = cfqe->vdisktime - boost;
>>>> +			if (__vdisktime > service_tree->min_vdisktime)
>>>> +				cfqe->vdisktime = __vdisktime;
>>>> +			else
>>>> +				cfqe->vdisktime = service_tree->min_vdisktime;
>>>> +		} else
>>>> +			cfqe->vdisktime = service_tree->min_vdisktime;
>>> Hi Gui,
>>>
>>> Is the above logic actually working? I suspect that most of the time all
>>> the new queues will end up getting the same vdisktime, namely
>>> st->min_vdisktime, and there will be no service differentiation across the
>>> various prio levels.
>>>
>>> The reason is that on SSDs idling is disabled, so very few (or no) queues
>>> will consume their slice and follow the requeue path. So every queue will
>>> be new. Now you are doing the following.
>>>
>>> 	cfqe->vdisktime = vdisktime_of_parent + IDLE_DELAY - boost
>>>
>>> Assume vdisktime_of_parent = st->min_vdisktime; then (IDLE_DELAY - boost)
>>> is always going to be a negative number, and hence cfqe->vdisktime will
>>> default to st->min_vdisktime. (IDLE_DELAY=200, while boost should be a huge
>>> value due to the SERVICE_SHIFT scaling.)
>> Vivek,
>>
>> Actually, I tested on a rotational disk with idling disabled, and I saw
>> service differentiation between two tasks with different ioprio.
>> I don't have an SSD on hand, but I'll get one and do more tests.
> 
> Ok, I am really not sure how it worked for you, because boost will
> turn out to be a huge number in comparison to IDLE_DELAY or the vdisktime
> of the parent. If you have some blktrace data, or if you can explain how
> it is working, that will help in understanding it better.

Vivek,

I think something was misconfigured in my previous tests.
I tested again with the patch applied, and I don't see service differentiation
this time.
But when I run the same test on a vanilla kernel, I don't see service
differentiation either. I dug into it. It seems that when I set slice_idle
and group_idle to zero, I am closing the idle window manually. In this case,
the IO tasks become SYNC NOIDLE workloads and they preempt each other, so the
cfqq will always be put at the front of the service tree. That is why we don't
see service differentiation.

Actually, we don't want preemption happening in such a case, right? So I think
we should refine the preemption logic.
Thoughts?
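
For reference, a minimal sketch of the mechanism being described, based on
how the mainline preemption path of that era looks (the exact code may
differ, so treat this as an illustration rather than a quote):

	/* in cfq_should_preempt(): a new sync-noidle queue preempts the
	 * currently serving sync-noidle queue */
	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD)
		return true;

	/* and cfq_preempt_queue() then requeues the preempting queue at
	 * the front of its service tree (add_front == 1), which is what
	 * erases the vdisktime-based differentiation */
	cfq_service_tree_add(cfqd, cfqq, 1);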

Below is my fio job file:

[global]
directory=/hdb3
ioengine=sync
runtime=30
time_based=1
direct=1
bs=128K
size=64M
numjobs=1
exec_prerun='echo 3 > /proc/sys/vm/drop_caches'

[task-prio-0]
description=task-prio-0
rw=read
prio=0
filename=file1

[task-prio-7]
description=task-prio-7
rw=read
prio=7
filename=file2

Thanks,
Gui

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-01-24  4:45         ` Gui Jianfeng
@ 2011-01-24 18:51           ` Vivek Goyal
  0 siblings, 0 replies; 21+ messages in thread
From: Vivek Goyal @ 2011-01-24 18:51 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Jens Axboe, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

On Mon, Jan 24, 2011 at 12:45:18PM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Thu, Jan 20, 2011 at 11:58:18AM +0800, Gui Jianfeng wrote:
> >> Vivek Goyal wrote:
> >>> On Mon, Dec 27, 2010 at 04:51:00PM +0800, Gui Jianfeng wrote:
> >>>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> >>>> maps to a range [100,1000]. It also gets rid of the cfq_slice_offset() logic and
> >>>> makes use of the same scheduling algorithm as CFQ group does. This helps with CFQ
> >>>> queue and group scheduling on the same service tree.
> >>>>
> >>>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> >>> [..]
> >>>> @@ -1246,47 +1278,71 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> >>>>  
> >>>>  	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
> >>>>  						cfqq_type(cfqq));
> >>>> +	/*
> >>>> +	 * For the time being, put the newly added CFQ queue at the end of the
> >>>> +	 * service tree.
> >>>> +	 */
> >>>> +	if (RB_EMPTY_NODE(&cfqe->rb_node)) {
> >>>> +		/*
> >>>> +		 * If this CFQ queue moves to another group, the original
> >>>> +		 * vdisktime makes no sense any more, reset the vdisktime
> >>>> +		 * here.
> >>>> +		 */
> >>>> +		parent = rb_last(&service_tree->rb);
> >>>> +		if (parent) {
> >>>> +			u64 boost;
> >>>> +			s64 __vdisktime;
> >>>> +
> >>>> +			__cfqe = rb_entry_entity(parent);
> >>>> +			cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> >>>> +
> >>>> +			/* Give some vdisktime boost according to its weight */
> >>>> +			boost = cfq_get_boost(cfqd, cfqe);
> >>>> +			__vdisktime = cfqe->vdisktime - boost;
> >>>> +			if (__vdisktime > service_tree->min_vdisktime)
> >>>> +				cfqe->vdisktime = __vdisktime;
> >>>> +			else
> >>>> +				cfqe->vdisktime = service_tree->min_vdisktime;
> >>>> +		} else
> >>>> +			cfqe->vdisktime = service_tree->min_vdisktime;
> >>> Hi Gui,
> >>>
> >>> Is the above logic actually working? I suspect that most of the time all
> >>> the new queues will end up getting the same vdisktime, namely
> >>> st->min_vdisktime, and there will be no service differentiation across the
> >>> various prio levels.
> >>>
> >>> The reason is that on SSDs idling is disabled, so very few (or no) queues
> >>> will consume their slice and follow the requeue path. So every queue will
> >>> be new. Now you are doing the following.
> >>>
> >>> 	cfqe->vdisktime = vdisktime_of_parent + IDLE_DELAY - boost
> >>>
> >>> Assume vdisktime_of_parent = st->min_vdisktime; then (IDLE_DELAY - boost)
> >>> is always going to be a negative number, and hence cfqe->vdisktime will
> >>> default to st->min_vdisktime. (IDLE_DELAY=200, while boost should be a huge
> >>> value due to the SERVICE_SHIFT scaling.)
> >> Vivek,
> >>
> >> Actually, I tested on a rotational disk with idling disabled, and I saw
> >> service differentiation between two tasks with different ioprio.
> >> I don't have an SSD on hand, but I'll get one and do more tests.
> > 
> > Ok, I am really not sure how it worked for you, because boost will
> > turn out to be a huge number in comparison to IDLE_DELAY or the vdisktime
> > of the parent. If you have some blktrace data, or if you can explain how
> > it is working, that will help in understanding it better.
> 
> Vivek,
> 
> I think something was misconfigured in my previous tests.
> I tested again with the patch applied, and I don't see service differentiation
> this time.
> But when I run the same test on a vanilla kernel, I don't see service
> differentiation either. I dug into it. It seems that when I set slice_idle
> and group_idle to zero, I am closing the idle window manually. In this case,
> the IO tasks become SYNC NOIDLE workloads and they preempt each other, so the
> cfqq will always be put at the front of the service tree. That is why we don't
> see service differentiation.
> 
> Actually, we don't want preemption happening in such a case, right? So I think
> we should refine the preemption logic.
> Thoughts?

Gui,

That preemption only happens when we are idling on an existing sync-noidle
queue and a new queue gets backlogged. It makes sense from an overall
throughput perspective, so I think getting rid of it is not a good idea.

But agreed that it does not play well with the ioprio logic. I would suggest
that instead of two threads, you try 4-5 threads doing IO and keep the queue
depth at 1. That way you should end up in a case where more than one thread
is backlogged on the sync-noidle service tree.
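
A sketch of what that job file might look like, extending Gui's file quoted
below with more prio levels (the specific prios and file names here are
arbitrary; with ioengine=sync each job already has an effective queue depth
of 1):

	[global]
	directory=/hdb3
	ioengine=sync
	runtime=30
	time_based=1
	direct=1
	bs=128K
	size=64M
	numjobs=1

	[task-prio-0]
	rw=read
	prio=0
	filename=file1

	[task-prio-2]
	rw=read
	prio=2
	filename=file2

	[task-prio-4]
	rw=read
	prio=4
	filename=file3

	[task-prio-7]
	rw=read
	prio=7
	filename=file4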

Thanks
Vivek

> 
> Below is my fio job file:
> 
> [global]
> directory=/hdb3
> ioengine=sync
> runtime=30
> time_based=1
> direct=1
> bs=128K
> size=64M
> numjobs=1
> exec_prerun='echo 3 > /proc/sys/vm/drop_caches'
> 
> [task-prio-0]
> description=task-prio-0
> rw=read
> prio=0
> filename=file1
> 
> [task-prio-7]
> description=task-prio-7
> rw=read
> prio=7
> filename=file2
> 
> Thanks,
> Gui

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 5/6 v3] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2010-12-27  8:51 ` [PATCH 5/6 v3] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface Gui Jianfeng
@ 2011-01-24 22:52   ` Vivek Goyal
  0 siblings, 0 replies; 21+ messages in thread
From: Vivek Goyal @ 2011-01-24 22:52 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Jens Axboe, linux kernel mailing list, Corrado Zoccolo,
	Chad Talbott, Nauman Rafique, Divyesh Shah, jmoyer, Shaohua Li

On Mon, Dec 27, 2010 at 04:51:14PM +0800, Gui Jianfeng wrote:

[..]
> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
> -
>  static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>  					    enum wl_prio_t prio,
>  					    enum wl_type_t type)
> @@ -640,10 +646,19 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>  static inline unsigned
>  cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>  {
> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>  	struct cfq_entity *cfqe = &cfqg->cfqe;
> +	struct cfq_rb_root *st = cfqe->service_tree;
> +	int group_slice = cfq_target_latency;
> +
> +	/* Calculate group slice in a hierarchical way */
> +	do {
> +		group_slice = group_slice * cfqe->weight / st->total_weight;
> +		cfqe = cfqe->parent;
> +		if (cfqe)
> +			st = cfqe->service_tree;
> +	} while (cfqe);
>  
> -	return cfq_target_latency * cfqe->weight / st->total_weight;
> +	return group_slice;
>  }

Gui, I think this is still not fully correct. In flat mode there was
only one service tree at the top and all the groups were on that service
tree, so st->total_weight worked fine.

But now, with hierarchical mode, a child group might be on one of the
sync-idle trees while there are other queues on other service trees
in the parent group.

So we will have to have a notion of total group weight (and not just
per-service-tree weight) to calculate this accurately, I think.
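
A rough sketch of that idea (children_weight and the parent_children_weight()
helper are hypothetical; the point is only that the divisor must cover the
whole parent group, not a single service tree):

	static inline unsigned
	cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
	{
		struct cfq_entity *cfqe = &cfqg->cfqe;
		unsigned group_slice = cfq_target_latency;

		do {
			/* share of the whole parent group at this level */
			group_slice = group_slice * cfqe->weight /
					parent_children_weight(cfqe);
			cfqe = cfqe->parent;
		} while (cfqe);

		return group_slice;
	}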

Secondly, this logic does not take the ioprio or the sync/async split into
account when calculating the group share. I think for the time being we can
keep it simple and look into refining it later.

Also, I want some integration/simplification of the workload slice and
cfqq slice calculation logic with the group slice logic. I guess I will
take that up later.

>  
>  static inline void
> @@ -666,7 +681,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>  			/* scale low_slice according to IO priority
>  			 * and sync vs async */
>  			unsigned low_slice =
> -				min(slice, base_low_slice * slice / sync_slice);
> +				min(slice, base_low_slice * slice /
> +				    sync_slice);

Why the extra line break?

Thanks
Vivek

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2011-01-24 22:53 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <4D180ECE.4000305@cn.fujitsu.com>
2010-12-27  8:50 ` [PATCH 1/6 v3] cfq-iosched: Introduce cfq_entity for CFQ queue Gui Jianfeng
2010-12-27  8:50 ` [PATCH 2/6 v3] cfq-iosched: Introduce cfq_entity for CFQ group Gui Jianfeng
2011-01-18 21:20   ` Vivek Goyal
2011-01-19  1:25     ` Gui Jianfeng
2011-01-23  2:15     ` Gui Jianfeng
2010-12-27  8:51 ` [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
2010-12-28  2:50   ` Shaohua Li
2010-12-28  3:59     ` Gui Jianfeng
2010-12-28  6:03       ` Shaohua Li
2010-12-28  6:59         ` Gui Jianfeng
2011-01-19 22:58   ` Vivek Goyal
2011-01-20  3:58     ` Gui Jianfeng
2011-01-20 11:09       ` Vivek Goyal
2011-01-24  4:45         ` Gui Jianfeng
2011-01-24 18:51           ` Vivek Goyal
2010-12-27  8:51 ` [PATCH 4/6 v3] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group Gui Jianfeng
2010-12-27  8:51 ` [PATCH 5/6 v3] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface Gui Jianfeng
2011-01-24 22:52   ` Vivek Goyal
2010-12-27  8:51 ` [PATCH 6/6 v3] blkio-cgroup: Document for blkio.use_hierarchy interface Gui Jianfeng
2011-01-18 20:27   ` Vivek Goyal
2011-01-19  1:20     ` Gui Jianfeng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).